Diffstat (limited to 'contrib/llvm/lib/Target/X86')
82 files changed, 13392 insertions, 7365 deletions
diff --git a/contrib/llvm/lib/Target/X86/AsmParser/X86AsmLexer.cpp b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmLexer.cpp index 1eaccff..2794e60 100644 --- a/contrib/llvm/lib/Target/X86/AsmParser/X86AsmLexer.cpp +++ b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmLexer.cpp @@ -14,7 +14,6 @@ #include "llvm/MC/MCTargetAsmLexer.h" #include "llvm/Support/TargetRegistry.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/StringExtras.h" using namespace llvm; @@ -144,11 +143,7 @@ AsmToken X86AsmLexer::LexTokenIntel() { SetError(Lexer->getErrLoc(), Lexer->getErr()); return lexedToken; case AsmToken::Identifier: { - std::string upperCase = lexedToken.getString().str(); - std::string lowerCase = LowercaseString(upperCase); - StringRef lowerRef(lowerCase); - - unsigned regID = MatchRegisterName(lowerRef); + unsigned regID = MatchRegisterName(lexedToken.getString().lower()); if (regID) return AsmToken(AsmToken::Register, diff --git a/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp index cb4f15f..08c732c 100644 --- a/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp +++ b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp @@ -17,10 +17,8 @@ #include "llvm/MC/MCParser/MCAsmLexer.h" #include "llvm/MC/MCParser/MCAsmParser.h" #include "llvm/MC/MCParser/MCParsedAsmOperand.h" -#include "llvm/ADT/OwningPtr.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/ADT/Twine.h" #include "llvm/Support/SourceMgr.h" @@ -32,33 +30,47 @@ using namespace llvm; namespace { struct X86Operand; -class X86ATTAsmParser : public MCTargetAsmParser { +class X86AsmParser : public MCTargetAsmParser { MCSubtargetInfo &STI; MCAsmParser &Parser; - private: MCAsmParser &getParser() const { return Parser; } MCAsmLexer &getLexer() const { return Parser.getLexer(); } - bool Error(SMLoc L, const Twine &Msg) { return Parser.Error(L, Msg); } + bool Error(SMLoc L, const Twine &Msg, + ArrayRef<SMRange> Ranges = ArrayRef<SMRange>()) { + return Parser.Error(L, Msg, Ranges); + } + + X86Operand *ErrorOperand(SMLoc Loc, StringRef Msg) { + Error(Loc, Msg); + return 0; + } X86Operand *ParseOperand(); + X86Operand *ParseATTOperand(); + X86Operand *ParseIntelOperand(); + X86Operand *ParseIntelMemOperand(); + X86Operand *ParseIntelBracExpression(unsigned SegReg, unsigned Size); X86Operand *ParseMemOperand(unsigned SegReg, SMLoc StartLoc); bool ParseDirectiveWord(unsigned Size, SMLoc L); bool ParseDirectiveCode(StringRef IDVal, SMLoc L); + bool processInstruction(MCInst &Inst, + const SmallVectorImpl<MCParsedAsmOperand*> &Ops); + bool MatchAndEmitInstruction(SMLoc IDLoc, SmallVectorImpl<MCParsedAsmOperand*> &Operands, MCStreamer &Out); /// isSrcOp - Returns true if operand is either (%rsi) or %ds:%(rsi) - /// in 64bit mode or (%edi) or %es:(%edi) in 32bit mode. + /// in 64bit mode or (%esi) or %es:(%esi) in 32bit mode. bool isSrcOp(X86Operand &Op); - /// isDstOp - Returns true if operand is either %es:(%rdi) in 64bit mode - /// or %es:(%edi) in 32bit mode. + /// isDstOp - Returns true if operand is either (%rdi) or %es:(%rdi) + /// in 64bit mode or (%edi) or %es:(%edi) in 32bit mode. 
bool isDstOp(X86Operand &Op); bool is64BitMode() const { @@ -79,7 +91,7 @@ private: /// } public: - X86ATTAsmParser(MCSubtargetInfo &sti, MCAsmParser &parser) + X86AsmParser(MCSubtargetInfo &sti, MCAsmParser &parser) : MCTargetAsmParser(), STI(sti), Parser(parser) { // Initialize the set of available features. @@ -91,6 +103,10 @@ public: SmallVectorImpl<MCParsedAsmOperand*> &Operands); virtual bool ParseDirective(AsmToken DirectiveID); + + bool isParsingIntelSyntax() { + return getParser().getAssemblerDialect(); + } }; } // end anonymous namespace @@ -101,6 +117,31 @@ static unsigned MatchRegisterName(StringRef Name); /// } +static bool isImmSExti16i8Value(uint64_t Value) { + return (( Value <= 0x000000000000007FULL)|| + (0x000000000000FF80ULL <= Value && Value <= 0x000000000000FFFFULL)|| + (0xFFFFFFFFFFFFFF80ULL <= Value && Value <= 0xFFFFFFFFFFFFFFFFULL)); +} + +static bool isImmSExti32i8Value(uint64_t Value) { + return (( Value <= 0x000000000000007FULL)|| + (0x00000000FFFFFF80ULL <= Value && Value <= 0x00000000FFFFFFFFULL)|| + (0xFFFFFFFFFFFFFF80ULL <= Value && Value <= 0xFFFFFFFFFFFFFFFFULL)); +} + +static bool isImmZExtu32u8Value(uint64_t Value) { + return (Value <= 0x00000000000000FFULL); +} + +static bool isImmSExti64i8Value(uint64_t Value) { + return (( Value <= 0x000000000000007FULL)|| + (0xFFFFFFFFFFFFFF80ULL <= Value && Value <= 0xFFFFFFFFFFFFFFFFULL)); +} + +static bool isImmSExti64i32Value(uint64_t Value) { + return (( Value <= 0x000000007FFFFFFFULL)|| + (0xFFFFFFFF80000000ULL <= Value && Value <= 0xFFFFFFFFFFFFFFFFULL)); +} namespace { /// X86Operand - Instances of this class represent a parsed X86 machine @@ -135,6 +176,7 @@ struct X86Operand : public MCParsedAsmOperand { unsigned BaseReg; unsigned IndexReg; unsigned Scale; + unsigned Size; } Mem; }; @@ -145,6 +187,8 @@ struct X86Operand : public MCParsedAsmOperand { SMLoc getStartLoc() const { return StartLoc; } /// getEndLoc - Get the location of the last token of this operand. SMLoc getEndLoc() const { return EndLoc; } + + SMRange getLocRange() const { return SMRange(StartLoc, EndLoc); } virtual void print(raw_ostream &OS) const {} @@ -205,10 +249,7 @@ struct X86Operand : public MCParsedAsmOperand { // Otherwise, check the value is in a range that makes sense for this // extension. - uint64_t Value = CE->getValue(); - return (( Value <= 0x000000000000007FULL)|| - (0x000000000000FF80ULL <= Value && Value <= 0x000000000000FFFFULL)|| - (0xFFFFFFFFFFFFFF80ULL <= Value && Value <= 0xFFFFFFFFFFFFFFFFULL)); + return isImmSExti16i8Value(CE->getValue()); } bool isImmSExti32i8() const { if (!isImm()) @@ -222,10 +263,7 @@ struct X86Operand : public MCParsedAsmOperand { // Otherwise, check the value is in a range that makes sense for this // extension. - uint64_t Value = CE->getValue(); - return (( Value <= 0x000000000000007FULL)|| - (0x00000000FFFFFF80ULL <= Value && Value <= 0x00000000FFFFFFFFULL)|| - (0xFFFFFFFFFFFFFF80ULL <= Value && Value <= 0xFFFFFFFFFFFFFFFFULL)); + return isImmSExti32i8Value(CE->getValue()); } bool isImmZExtu32u8() const { if (!isImm()) @@ -239,8 +277,7 @@ struct X86Operand : public MCParsedAsmOperand { // Otherwise, check the value is in a range that makes sense for this // extension. - uint64_t Value = CE->getValue(); - return (Value <= 0x00000000000000FFULL); + return isImmZExtu32u8Value(CE->getValue()); } bool isImmSExti64i8() const { if (!isImm()) @@ -254,9 +291,7 @@ struct X86Operand : public MCParsedAsmOperand { // Otherwise, check the value is in a range that makes sense for this // extension. 
- uint64_t Value = CE->getValue(); - return (( Value <= 0x000000000000007FULL)|| - (0xFFFFFFFFFFFFFF80ULL <= Value && Value <= 0xFFFFFFFFFFFFFFFFULL)); + return isImmSExti64i8Value(CE->getValue()); } bool isImmSExti64i32() const { if (!isImm()) @@ -270,12 +305,31 @@ struct X86Operand : public MCParsedAsmOperand { // Otherwise, check the value is in a range that makes sense for this // extension. - uint64_t Value = CE->getValue(); - return (( Value <= 0x000000007FFFFFFFULL)|| - (0xFFFFFFFF80000000ULL <= Value && Value <= 0xFFFFFFFFFFFFFFFFULL)); + return isImmSExti64i32Value(CE->getValue()); } bool isMem() const { return Kind == Memory; } + bool isMem8() const { + return Kind == Memory && (!Mem.Size || Mem.Size == 8); + } + bool isMem16() const { + return Kind == Memory && (!Mem.Size || Mem.Size == 16); + } + bool isMem32() const { + return Kind == Memory && (!Mem.Size || Mem.Size == 32); + } + bool isMem64() const { + return Kind == Memory && (!Mem.Size || Mem.Size == 64); + } + bool isMem80() const { + return Kind == Memory && (!Mem.Size || Mem.Size == 80); + } + bool isMem128() const { + return Kind == Memory && (!Mem.Size || Mem.Size == 128); + } + bool isMem256() const { + return Kind == Memory && (!Mem.Size || Mem.Size == 256); + } bool isAbsMem() const { return Kind == Memory && !getMemSegReg() && !getMemBaseReg() && @@ -302,6 +356,28 @@ struct X86Operand : public MCParsedAsmOperand { addExpr(Inst, getImm()); } + void addMem8Operands(MCInst &Inst, unsigned N) const { + addMemOperands(Inst, N); + } + void addMem16Operands(MCInst &Inst, unsigned N) const { + addMemOperands(Inst, N); + } + void addMem32Operands(MCInst &Inst, unsigned N) const { + addMemOperands(Inst, N); + } + void addMem64Operands(MCInst &Inst, unsigned N) const { + addMemOperands(Inst, N); + } + void addMem80Operands(MCInst &Inst, unsigned N) const { + addMemOperands(Inst, N); + } + void addMem128Operands(MCInst &Inst, unsigned N) const { + addMemOperands(Inst, N); + } + void addMem256Operands(MCInst &Inst, unsigned N) const { + addMemOperands(Inst, N); + } + void addMemOperands(MCInst &Inst, unsigned N) const { assert((N == 5) && "Invalid number of operands!"); Inst.addOperand(MCOperand::CreateReg(getMemBaseReg())); @@ -313,11 +389,16 @@ struct X86Operand : public MCParsedAsmOperand { void addAbsMemOperands(MCInst &Inst, unsigned N) const { assert((N == 1) && "Invalid number of operands!"); - Inst.addOperand(MCOperand::CreateExpr(getMemDisp())); + // Add as immediates when possible. + if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getMemDisp())) + Inst.addOperand(MCOperand::CreateImm(CE->getValue())); + else + Inst.addOperand(MCOperand::CreateExpr(getMemDisp())); } static X86Operand *CreateToken(StringRef Str, SMLoc Loc) { - X86Operand *Res = new X86Operand(Token, Loc, Loc); + SMLoc EndLoc = SMLoc::getFromPointer(Loc.getPointer() + Str.size() - 1); + X86Operand *Res = new X86Operand(Token, Loc, EndLoc); Res->Tok.Data = Str.data(); Res->Tok.Length = Str.size(); return Res; @@ -337,20 +418,22 @@ struct X86Operand : public MCParsedAsmOperand { /// Create an absolute memory operand. static X86Operand *CreateMem(const MCExpr *Disp, SMLoc StartLoc, - SMLoc EndLoc) { + SMLoc EndLoc, unsigned Size = 0) { X86Operand *Res = new X86Operand(Memory, StartLoc, EndLoc); Res->Mem.SegReg = 0; Res->Mem.Disp = Disp; Res->Mem.BaseReg = 0; Res->Mem.IndexReg = 0; Res->Mem.Scale = 1; + Res->Mem.Size = Size; return Res; } /// Create a generalized memory operand. 
static X86Operand *CreateMem(unsigned SegReg, const MCExpr *Disp, unsigned BaseReg, unsigned IndexReg, - unsigned Scale, SMLoc StartLoc, SMLoc EndLoc) { + unsigned Scale, SMLoc StartLoc, SMLoc EndLoc, + unsigned Size = 0) { // We should never just have a displacement, that should be parsed as an // absolute memory operand. assert((SegReg || BaseReg || IndexReg) && "Invalid memory operand!"); @@ -364,13 +447,14 @@ struct X86Operand : public MCParsedAsmOperand { Res->Mem.BaseReg = BaseReg; Res->Mem.IndexReg = IndexReg; Res->Mem.Scale = Scale; + Res->Mem.Size = Size; return Res; } }; } // end anonymous namespace. -bool X86ATTAsmParser::isSrcOp(X86Operand &Op) { +bool X86AsmParser::isSrcOp(X86Operand &Op) { unsigned basereg = is64BitMode() ? X86::RSI : X86::ESI; return (Op.isMem() && @@ -380,32 +464,38 @@ bool X86ATTAsmParser::isSrcOp(X86Operand &Op) { Op.Mem.BaseReg == basereg && Op.Mem.IndexReg == 0); } -bool X86ATTAsmParser::isDstOp(X86Operand &Op) { +bool X86AsmParser::isDstOp(X86Operand &Op) { unsigned basereg = is64BitMode() ? X86::RDI : X86::EDI; - return Op.isMem() && Op.Mem.SegReg == X86::ES && + return Op.isMem() && + (Op.Mem.SegReg == 0 || Op.Mem.SegReg == X86::ES) && isa<MCConstantExpr>(Op.Mem.Disp) && cast<MCConstantExpr>(Op.Mem.Disp)->getValue() == 0 && Op.Mem.BaseReg == basereg && Op.Mem.IndexReg == 0; } -bool X86ATTAsmParser::ParseRegister(unsigned &RegNo, - SMLoc &StartLoc, SMLoc &EndLoc) { +bool X86AsmParser::ParseRegister(unsigned &RegNo, + SMLoc &StartLoc, SMLoc &EndLoc) { RegNo = 0; - const AsmToken &TokPercent = Parser.getTok(); - assert(TokPercent.is(AsmToken::Percent) && "Invalid token kind!"); - StartLoc = TokPercent.getLoc(); - Parser.Lex(); // Eat percent token. + if (!isParsingIntelSyntax()) { + const AsmToken &TokPercent = Parser.getTok(); + assert(TokPercent.is(AsmToken::Percent) && "Invalid token kind!"); + StartLoc = TokPercent.getLoc(); + Parser.Lex(); // Eat percent token. + } const AsmToken &Tok = Parser.getTok(); - if (Tok.isNot(AsmToken::Identifier)) - return Error(Tok.getLoc(), "invalid register name"); + if (Tok.isNot(AsmToken::Identifier)) { + if (isParsingIntelSyntax()) return true; + return Error(StartLoc, "invalid register name", + SMRange(StartLoc, Tok.getEndLoc())); + } RegNo = MatchRegisterName(Tok.getString()); // If the match failed, try the register name as lowercase. if (RegNo == 0) - RegNo = MatchRegisterName(LowercaseString(Tok.getString())); + RegNo = MatchRegisterName(Tok.getString().lower()); if (!is64BitMode()) { // FIXME: This should be done using Requires<In32BitMode> and @@ -417,8 +507,9 @@ bool X86ATTAsmParser::ParseRegister(unsigned &RegNo, X86MCRegisterClasses[X86::GR64RegClassID].contains(RegNo) || X86II::isX86_64NonExtLowByteReg(RegNo) || X86II::isX86_64ExtendedReg(RegNo)) - return Error(Tok.getLoc(), "register %" - + Tok.getString() + " is only available in 64-bit mode"); + return Error(StartLoc, "register %" + + Tok.getString() + " is only available in 64-bit mode", + SMRange(StartLoc, Tok.getEndLoc())); } // Parse "%st" as "%st(0)" and "%st(1)", which is multiple tokens. @@ -478,15 +569,182 @@ bool X86ATTAsmParser::ParseRegister(unsigned &RegNo, } } - if (RegNo == 0) - return Error(Tok.getLoc(), "invalid register name"); + if (RegNo == 0) { + if (isParsingIntelSyntax()) return true; + return Error(StartLoc, "invalid register name", + SMRange(StartLoc, Tok.getEndLoc())); + } - EndLoc = Tok.getLoc(); + EndLoc = Tok.getEndLoc(); Parser.Lex(); // Eat identifier token. 
return false; } -X86Operand *X86ATTAsmParser::ParseOperand() { +X86Operand *X86AsmParser::ParseOperand() { + if (isParsingIntelSyntax()) + return ParseIntelOperand(); + return ParseATTOperand(); +} + +/// getIntelMemOperandSize - Return intel memory operand size. +static unsigned getIntelMemOperandSize(StringRef OpStr) { + unsigned Size = 0; + if (OpStr == "BYTE") Size = 8; + if (OpStr == "WORD") Size = 16; + if (OpStr == "DWORD") Size = 32; + if (OpStr == "QWORD") Size = 64; + if (OpStr == "XWORD") Size = 80; + if (OpStr == "XMMWORD") Size = 128; + if (OpStr == "YMMWORD") Size = 256; + return Size; +} + +X86Operand *X86AsmParser::ParseIntelBracExpression(unsigned SegReg, + unsigned Size) { + unsigned BaseReg = 0, IndexReg = 0, Scale = 1; + SMLoc Start = Parser.getTok().getLoc(), End; + + const MCExpr *Disp = MCConstantExpr::Create(0, getParser().getContext()); + // Parse [ BaseReg + Scale*IndexReg + Disp ] or [ symbol ] + + // Eat '[' + if (getLexer().isNot(AsmToken::LBrac)) + return ErrorOperand(Start, "Expected '[' token!"); + Parser.Lex(); + + if (getLexer().is(AsmToken::Identifier)) { + // Parse BaseReg + if (ParseRegister(BaseReg, Start, End)) { + // Handle '[' 'symbol' ']' + if (getParser().ParseExpression(Disp, End)) return 0; + if (getLexer().isNot(AsmToken::RBrac)) + return ErrorOperand(Start, "Expected ']' token!"); + Parser.Lex(); + return X86Operand::CreateMem(Disp, Start, End, Size); + } + } else if (getLexer().is(AsmToken::Integer)) { + int64_t Val = Parser.getTok().getIntVal(); + Parser.Lex(); + SMLoc Loc = Parser.getTok().getLoc(); + if (getLexer().is(AsmToken::RBrac)) { + // Handle '[' number ']' + Parser.Lex(); + const MCExpr *Disp = MCConstantExpr::Create(Val, getContext()); + if (SegReg) + return X86Operand::CreateMem(SegReg, Disp, 0, 0, Scale, + Start, End, Size); + return X86Operand::CreateMem(Disp, Start, End, Size); + } else if (getLexer().is(AsmToken::Star)) { + // Handle '[' Scale*IndexReg ']' + Parser.Lex(); + SMLoc IdxRegLoc = Parser.getTok().getLoc(); + if (ParseRegister(IndexReg, IdxRegLoc, End)) + return ErrorOperand(IdxRegLoc, "Expected register"); + Scale = Val; + } else + return ErrorOperand(Loc, "Unepxeted token"); + } + + if (getLexer().is(AsmToken::Plus) || getLexer().is(AsmToken::Minus)) { + bool isPlus = getLexer().is(AsmToken::Plus); + Parser.Lex(); + SMLoc PlusLoc = Parser.getTok().getLoc(); + if (getLexer().is(AsmToken::Integer)) { + int64_t Val = Parser.getTok().getIntVal(); + Parser.Lex(); + if (getLexer().is(AsmToken::Star)) { + Parser.Lex(); + SMLoc IdxRegLoc = Parser.getTok().getLoc(); + if (ParseRegister(IndexReg, IdxRegLoc, End)) + return ErrorOperand(IdxRegLoc, "Expected register"); + Scale = Val; + } else if (getLexer().is(AsmToken::RBrac)) { + const MCExpr *ValExpr = MCConstantExpr::Create(Val, getContext()); + Disp = isPlus ? ValExpr : MCConstantExpr::Create(0-Val, getContext()); + } else + return ErrorOperand(PlusLoc, "unexpected token after +"); + } else if (getLexer().is(AsmToken::Identifier)) { + // This could be an index register or a displacement expression. 
+ End = Parser.getTok().getLoc(); + if (!IndexReg) + ParseRegister(IndexReg, Start, End); + else if (getParser().ParseExpression(Disp, End)) return 0; + } + } + + if (getLexer().isNot(AsmToken::RBrac)) + if (getParser().ParseExpression(Disp, End)) return 0; + + End = Parser.getTok().getLoc(); + if (getLexer().isNot(AsmToken::RBrac)) + return ErrorOperand(End, "expected ']' token!"); + Parser.Lex(); + End = Parser.getTok().getLoc(); + + // handle [-42] + if (!BaseReg && !IndexReg) + return X86Operand::CreateMem(Disp, Start, End, Size); + + return X86Operand::CreateMem(SegReg, Disp, BaseReg, IndexReg, Scale, + Start, End, Size); +} + +/// ParseIntelMemOperand - Parse intel style memory operand. +X86Operand *X86AsmParser::ParseIntelMemOperand() { + const AsmToken &Tok = Parser.getTok(); + SMLoc Start = Parser.getTok().getLoc(), End; + unsigned SegReg = 0; + + unsigned Size = getIntelMemOperandSize(Tok.getString()); + if (Size) { + Parser.Lex(); + assert (Tok.getString() == "PTR" && "Unexpected token!"); + Parser.Lex(); + } + + if (getLexer().is(AsmToken::LBrac)) + return ParseIntelBracExpression(SegReg, Size); + + if (!ParseRegister(SegReg, Start, End)) { + // Handel SegReg : [ ... ] + if (getLexer().isNot(AsmToken::Colon)) + return ErrorOperand(Start, "Expected ':' token!"); + Parser.Lex(); // Eat : + if (getLexer().isNot(AsmToken::LBrac)) + return ErrorOperand(Start, "Expected '[' token!"); + return ParseIntelBracExpression(SegReg, Size); + } + + const MCExpr *Disp = MCConstantExpr::Create(0, getParser().getContext()); + if (getParser().ParseExpression(Disp, End)) return 0; + return X86Operand::CreateMem(Disp, Start, End, Size); +} + +X86Operand *X86AsmParser::ParseIntelOperand() { + SMLoc Start = Parser.getTok().getLoc(), End; + + // immediate. + if (getLexer().is(AsmToken::Integer) || getLexer().is(AsmToken::Real) || + getLexer().is(AsmToken::Minus)) { + const MCExpr *Val; + if (!getParser().ParseExpression(Val, End)) { + End = Parser.getTok().getLoc(); + return X86Operand::CreateImm(Val, Start, End); + } + } + + // register + unsigned RegNo = 0; + if (!ParseRegister(RegNo, Start, End)) { + End = Parser.getTok().getLoc(); + return X86Operand::CreateReg(RegNo, Start, End); + } + + // mem operand + return ParseIntelMemOperand(); +} + +X86Operand *X86AsmParser::ParseATTOperand() { switch (getLexer().getKind()) { default: // Parse a memory operand with no segment register. @@ -497,7 +755,8 @@ X86Operand *X86ATTAsmParser::ParseOperand() { SMLoc Start, End; if (ParseRegister(RegNo, Start, End)) return 0; if (RegNo == X86::EIZ || RegNo == X86::RIZ) { - Error(Start, "%eiz and %riz can only be used as index registers"); + Error(Start, "%eiz and %riz can only be used as index registers", + SMRange(Start, End)); return 0; } @@ -524,7 +783,7 @@ X86Operand *X86ATTAsmParser::ParseOperand() { /// ParseMemOperand: segment: disp(basereg, indexreg, scale). The '%ds:' prefix /// has already been parsed if present. -X86Operand *X86ATTAsmParser::ParseMemOperand(unsigned SegReg, SMLoc MemStart) { +X86Operand *X86AsmParser::ParseMemOperand(unsigned SegReg, SMLoc MemStart) { // We have to disambiguate a parenthesized expression "(4+5)" from the start // of a memory operand with a missing displacement "(%ebx)" or "(,%eax)". The @@ -579,18 +838,21 @@ X86Operand *X86ATTAsmParser::ParseMemOperand(unsigned SegReg, SMLoc MemStart) { // If we reached here, then we just ate the ( of the memory operand. Process // the rest of the memory operand. 
unsigned BaseReg = 0, IndexReg = 0, Scale = 1; + SMLoc IndexLoc; if (getLexer().is(AsmToken::Percent)) { - SMLoc L; - if (ParseRegister(BaseReg, L, L)) return 0; + SMLoc StartLoc, EndLoc; + if (ParseRegister(BaseReg, StartLoc, EndLoc)) return 0; if (BaseReg == X86::EIZ || BaseReg == X86::RIZ) { - Error(L, "eiz and riz can only be used as index registers"); + Error(StartLoc, "eiz and riz can only be used as index registers", + SMRange(StartLoc, EndLoc)); return 0; } } if (getLexer().is(AsmToken::Comma)) { Parser.Lex(); // Eat the comma. + IndexLoc = Parser.getTok().getLoc(); // Following the comma we should have either an index register, or a scale // value. We don't support the later form, but we want to parse it @@ -616,8 +878,10 @@ X86Operand *X86ATTAsmParser::ParseMemOperand(unsigned SegReg, SMLoc MemStart) { SMLoc Loc = Parser.getTok().getLoc(); int64_t ScaleVal; - if (getParser().ParseAbsoluteExpression(ScaleVal)) + if (getParser().ParseAbsoluteExpression(ScaleVal)){ + Error(Loc, "expected scale expression"); return 0; + } // Validate the scale amount. if (ScaleVal != 1 && ScaleVal != 2 && ScaleVal != 4 && ScaleVal != 8){ @@ -650,11 +914,28 @@ X86Operand *X86ATTAsmParser::ParseMemOperand(unsigned SegReg, SMLoc MemStart) { SMLoc MemEnd = Parser.getTok().getLoc(); Parser.Lex(); // Eat the ')'. + // If we have both a base register and an index register make sure they are + // both 64-bit or 32-bit registers. + if (BaseReg != 0 && IndexReg != 0) { + if (X86MCRegisterClasses[X86::GR64RegClassID].contains(BaseReg) && + !X86MCRegisterClasses[X86::GR64RegClassID].contains(IndexReg) && + IndexReg != X86::RIZ) { + Error(IndexLoc, "index register is 32-bit, but base register is 64-bit"); + return 0; + } + if (X86MCRegisterClasses[X86::GR32RegClassID].contains(BaseReg) && + !X86MCRegisterClasses[X86::GR32RegClassID].contains(IndexReg) && + IndexReg != X86::EIZ){ + Error(IndexLoc, "index register is 64-bit, but base register is 32-bit"); + return 0; + } + } + return X86Operand::CreateMem(SegReg, Disp, BaseReg, IndexReg, Scale, MemStart, MemEnd); } -bool X86ATTAsmParser:: +bool X86AsmParser:: ParseInstruction(StringRef Name, SMLoc NameLoc, SmallVectorImpl<MCParsedAsmOperand*> &Operands) { StringRef PatchedName = Name; @@ -669,20 +950,21 @@ ParseInstruction(StringRef Name, SMLoc NameLoc, if ((PatchedName.startswith("cmp") || PatchedName.startswith("vcmp")) && (PatchedName.endswith("ss") || PatchedName.endswith("sd") || PatchedName.endswith("ps") || PatchedName.endswith("pd"))) { - bool IsVCMP = PatchedName.startswith("vcmp"); + bool IsVCMP = PatchedName[0] == 'v'; unsigned SSECCIdx = IsVCMP ? 
4 : 3; unsigned SSEComparisonCode = StringSwitch<unsigned>( PatchedName.slice(SSECCIdx, PatchedName.size() - 2)) - .Case("eq", 0) - .Case("lt", 1) - .Case("le", 2) - .Case("unord", 3) - .Case("neq", 4) - .Case("nlt", 5) - .Case("nle", 6) - .Case("ord", 7) - .Case("eq_uq", 8) - .Case("nge", 9) + .Case("eq", 0x00) + .Case("lt", 0x01) + .Case("le", 0x02) + .Case("unord", 0x03) + .Case("neq", 0x04) + .Case("nlt", 0x05) + .Case("nle", 0x06) + .Case("ord", 0x07) + /* AVX only from here */ + .Case("eq_uq", 0x08) + .Case("nge", 0x09) .Case("ngt", 0x0A) .Case("false", 0x0B) .Case("neq_oq", 0x0C) @@ -706,7 +988,7 @@ ParseInstruction(StringRef Name, SMLoc NameLoc, .Case("gt_oq", 0x1E) .Case("true_us", 0x1F) .Default(~0U); - if (SSEComparisonCode != ~0U) { + if (SSEComparisonCode != ~0U && (IsVCMP || SSEComparisonCode < 8)) { ExtraImmOp = MCConstantExpr::Create(SSEComparisonCode, getParser().getContext()); if (PatchedName.endswith("ss")) { @@ -724,10 +1006,9 @@ ParseInstruction(StringRef Name, SMLoc NameLoc, Operands.push_back(X86Operand::CreateToken(PatchedName, NameLoc)); - if (ExtraImmOp) + if (ExtraImmOp && !isParsingIntelSyntax()) Operands.push_back(X86Operand::CreateImm(ExtraImmOp, NameLoc, NameLoc)); - // Determine whether this is an instruction prefix. bool isPrefix = Name == "lock" || Name == "rep" || @@ -781,6 +1062,9 @@ ParseInstruction(StringRef Name, SMLoc NameLoc, else if (isPrefix && getLexer().is(AsmToken::Slash)) Parser.Lex(); // Consume the prefix separator Slash + if (ExtraImmOp && isParsingIntelSyntax()) + Operands.push_back(X86Operand::CreateImm(ExtraImmOp, NameLoc, NameLoc)); + // This is a terrible hack to handle "out[bwl]? %al, (%dx)" -> // "outb %al, %dx". Out doesn't take a memory form, but this is a widely // documented form in various unofficial manuals, so a lot of code uses it. 
@@ -916,11 +1200,21 @@ ParseInstruction(StringRef Name, SMLoc NameLoc, Name.startswith("rcl") || Name.startswith("rcr") || Name.startswith("rol") || Name.startswith("ror")) && Operands.size() == 3) { - X86Operand *Op1 = static_cast<X86Operand*>(Operands[1]); - if (Op1->isImm() && isa<MCConstantExpr>(Op1->getImm()) && - cast<MCConstantExpr>(Op1->getImm())->getValue() == 1) { - delete Operands[1]; - Operands.erase(Operands.begin() + 1); + if (isParsingIntelSyntax()) { + // Intel syntax + X86Operand *Op1 = static_cast<X86Operand*>(Operands[2]); + if (Op1->isImm() && isa<MCConstantExpr>(Op1->getImm()) && + cast<MCConstantExpr>(Op1->getImm())->getValue() == 1) { + delete Operands[2]; + Operands.pop_back(); + } + } else { + X86Operand *Op1 = static_cast<X86Operand*>(Operands[1]); + if (Op1->isImm() && isa<MCConstantExpr>(Op1->getImm()) && + cast<MCConstantExpr>(Op1->getImm())->getValue() == 1) { + delete Operands[1]; + Operands.erase(Operands.begin() + 1); + } } } @@ -939,7 +1233,246 @@ ParseInstruction(StringRef Name, SMLoc NameLoc, return false; } -bool X86ATTAsmParser:: +bool X86AsmParser:: +processInstruction(MCInst &Inst, + const SmallVectorImpl<MCParsedAsmOperand*> &Ops) { + switch (Inst.getOpcode()) { + default: return false; + case X86::AND16i16: { + if (!Inst.getOperand(0).isImm() || + !isImmSExti16i8Value(Inst.getOperand(0).getImm())) + return false; + + MCInst TmpInst; + TmpInst.setOpcode(X86::AND16ri8); + TmpInst.addOperand(MCOperand::CreateReg(X86::AX)); + TmpInst.addOperand(MCOperand::CreateReg(X86::AX)); + TmpInst.addOperand(Inst.getOperand(0)); + Inst = TmpInst; + return true; + } + case X86::AND32i32: { + if (!Inst.getOperand(0).isImm() || + !isImmSExti32i8Value(Inst.getOperand(0).getImm())) + return false; + + MCInst TmpInst; + TmpInst.setOpcode(X86::AND32ri8); + TmpInst.addOperand(MCOperand::CreateReg(X86::EAX)); + TmpInst.addOperand(MCOperand::CreateReg(X86::EAX)); + TmpInst.addOperand(Inst.getOperand(0)); + Inst = TmpInst; + return true; + } + case X86::AND64i32: { + if (!Inst.getOperand(0).isImm() || + !isImmSExti64i8Value(Inst.getOperand(0).getImm())) + return false; + + MCInst TmpInst; + TmpInst.setOpcode(X86::AND64ri8); + TmpInst.addOperand(MCOperand::CreateReg(X86::RAX)); + TmpInst.addOperand(MCOperand::CreateReg(X86::RAX)); + TmpInst.addOperand(Inst.getOperand(0)); + Inst = TmpInst; + return true; + } + case X86::XOR16i16: { + if (!Inst.getOperand(0).isImm() || + !isImmSExti16i8Value(Inst.getOperand(0).getImm())) + return false; + + MCInst TmpInst; + TmpInst.setOpcode(X86::XOR16ri8); + TmpInst.addOperand(MCOperand::CreateReg(X86::AX)); + TmpInst.addOperand(MCOperand::CreateReg(X86::AX)); + TmpInst.addOperand(Inst.getOperand(0)); + Inst = TmpInst; + return true; + } + case X86::XOR32i32: { + if (!Inst.getOperand(0).isImm() || + !isImmSExti32i8Value(Inst.getOperand(0).getImm())) + return false; + + MCInst TmpInst; + TmpInst.setOpcode(X86::XOR32ri8); + TmpInst.addOperand(MCOperand::CreateReg(X86::EAX)); + TmpInst.addOperand(MCOperand::CreateReg(X86::EAX)); + TmpInst.addOperand(Inst.getOperand(0)); + Inst = TmpInst; + return true; + } + case X86::XOR64i32: { + if (!Inst.getOperand(0).isImm() || + !isImmSExti64i8Value(Inst.getOperand(0).getImm())) + return false; + + MCInst TmpInst; + TmpInst.setOpcode(X86::XOR64ri8); + TmpInst.addOperand(MCOperand::CreateReg(X86::RAX)); + TmpInst.addOperand(MCOperand::CreateReg(X86::RAX)); + TmpInst.addOperand(Inst.getOperand(0)); + Inst = TmpInst; + return true; + } + case X86::OR16i16: { + if (!Inst.getOperand(0).isImm() || + 
!isImmSExti16i8Value(Inst.getOperand(0).getImm())) + return false; + + MCInst TmpInst; + TmpInst.setOpcode(X86::OR16ri8); + TmpInst.addOperand(MCOperand::CreateReg(X86::AX)); + TmpInst.addOperand(MCOperand::CreateReg(X86::AX)); + TmpInst.addOperand(Inst.getOperand(0)); + Inst = TmpInst; + return true; + } + case X86::OR32i32: { + if (!Inst.getOperand(0).isImm() || + !isImmSExti32i8Value(Inst.getOperand(0).getImm())) + return false; + + MCInst TmpInst; + TmpInst.setOpcode(X86::OR32ri8); + TmpInst.addOperand(MCOperand::CreateReg(X86::EAX)); + TmpInst.addOperand(MCOperand::CreateReg(X86::EAX)); + TmpInst.addOperand(Inst.getOperand(0)); + Inst = TmpInst; + return true; + } + case X86::OR64i32: { + if (!Inst.getOperand(0).isImm() || + !isImmSExti64i8Value(Inst.getOperand(0).getImm())) + return false; + + MCInst TmpInst; + TmpInst.setOpcode(X86::OR64ri8); + TmpInst.addOperand(MCOperand::CreateReg(X86::RAX)); + TmpInst.addOperand(MCOperand::CreateReg(X86::RAX)); + TmpInst.addOperand(Inst.getOperand(0)); + Inst = TmpInst; + return true; + } + case X86::CMP16i16: { + if (!Inst.getOperand(0).isImm() || + !isImmSExti16i8Value(Inst.getOperand(0).getImm())) + return false; + + MCInst TmpInst; + TmpInst.setOpcode(X86::CMP16ri8); + TmpInst.addOperand(MCOperand::CreateReg(X86::AX)); + TmpInst.addOperand(Inst.getOperand(0)); + Inst = TmpInst; + return true; + } + case X86::CMP32i32: { + if (!Inst.getOperand(0).isImm() || + !isImmSExti32i8Value(Inst.getOperand(0).getImm())) + return false; + + MCInst TmpInst; + TmpInst.setOpcode(X86::CMP32ri8); + TmpInst.addOperand(MCOperand::CreateReg(X86::EAX)); + TmpInst.addOperand(Inst.getOperand(0)); + Inst = TmpInst; + return true; + } + case X86::CMP64i32: { + if (!Inst.getOperand(0).isImm() || + !isImmSExti64i8Value(Inst.getOperand(0).getImm())) + return false; + + MCInst TmpInst; + TmpInst.setOpcode(X86::CMP64ri8); + TmpInst.addOperand(MCOperand::CreateReg(X86::RAX)); + TmpInst.addOperand(Inst.getOperand(0)); + Inst = TmpInst; + return true; + } + case X86::ADD16i16: { + if (!Inst.getOperand(0).isImm() || + !isImmSExti16i8Value(Inst.getOperand(0).getImm())) + return false; + + MCInst TmpInst; + TmpInst.setOpcode(X86::ADD16ri8); + TmpInst.addOperand(MCOperand::CreateReg(X86::AX)); + TmpInst.addOperand(MCOperand::CreateReg(X86::AX)); + TmpInst.addOperand(Inst.getOperand(0)); + Inst = TmpInst; + return true; + } + case X86::ADD32i32: { + if (!Inst.getOperand(0).isImm() || + !isImmSExti32i8Value(Inst.getOperand(0).getImm())) + return false; + + MCInst TmpInst; + TmpInst.setOpcode(X86::ADD32ri8); + TmpInst.addOperand(MCOperand::CreateReg(X86::EAX)); + TmpInst.addOperand(MCOperand::CreateReg(X86::EAX)); + TmpInst.addOperand(Inst.getOperand(0)); + Inst = TmpInst; + return true; + } + case X86::ADD64i32: { + if (!Inst.getOperand(0).isImm() || + !isImmSExti64i8Value(Inst.getOperand(0).getImm())) + return false; + + MCInst TmpInst; + TmpInst.setOpcode(X86::ADD64ri8); + TmpInst.addOperand(MCOperand::CreateReg(X86::RAX)); + TmpInst.addOperand(MCOperand::CreateReg(X86::RAX)); + TmpInst.addOperand(Inst.getOperand(0)); + Inst = TmpInst; + return true; + } + case X86::SUB16i16: { + if (!Inst.getOperand(0).isImm() || + !isImmSExti16i8Value(Inst.getOperand(0).getImm())) + return false; + + MCInst TmpInst; + TmpInst.setOpcode(X86::SUB16ri8); + TmpInst.addOperand(MCOperand::CreateReg(X86::AX)); + TmpInst.addOperand(MCOperand::CreateReg(X86::AX)); + TmpInst.addOperand(Inst.getOperand(0)); + Inst = TmpInst; + return true; + } + case X86::SUB32i32: { + if (!Inst.getOperand(0).isImm() || 
+ !isImmSExti32i8Value(Inst.getOperand(0).getImm())) + return false; + + MCInst TmpInst; + TmpInst.setOpcode(X86::SUB32ri8); + TmpInst.addOperand(MCOperand::CreateReg(X86::EAX)); + TmpInst.addOperand(MCOperand::CreateReg(X86::EAX)); + TmpInst.addOperand(Inst.getOperand(0)); + Inst = TmpInst; + return true; + } + case X86::SUB64i32: { + if (!Inst.getOperand(0).isImm() || + !isImmSExti64i8Value(Inst.getOperand(0).getImm())) + return false; + + MCInst TmpInst; + TmpInst.setOpcode(X86::SUB64ri8); + TmpInst.addOperand(MCOperand::CreateReg(X86::RAX)); + TmpInst.addOperand(MCOperand::CreateReg(X86::RAX)); + TmpInst.addOperand(Inst.getOperand(0)); + Inst = TmpInst; + return true; + } + } +} + +bool X86AsmParser:: MatchAndEmitInstruction(SMLoc IDLoc, SmallVectorImpl<MCParsedAsmOperand*> &Operands, MCStreamer &Out) { @@ -957,6 +1490,7 @@ MatchAndEmitInstruction(SMLoc IDLoc, Op->getToken() == "fstenv" || Op->getToken() == "fclex") { MCInst Inst; Inst.setOpcode(X86::WAIT); + Inst.setLoc(IDLoc); Out.EmitInstruction(Inst); const char *Repl = @@ -980,9 +1514,17 @@ MatchAndEmitInstruction(SMLoc IDLoc, MCInst Inst; // First, try a direct match. - switch (MatchInstructionImpl(Operands, Inst, OrigErrorInfo)) { + switch (MatchInstructionImpl(Operands, Inst, OrigErrorInfo, + isParsingIntelSyntax())) { default: break; case Match_Success: + // Some instructions need post-processing to, for example, tweak which + // encoding is selected. Loop on it while changes happen so the + // individual transformations can chain off each other. + while (processInstruction(Inst, Operands)) + ; + + Inst.setLoc(IDLoc); Out.EmitInstruction(Inst); return false; case Match_MissingFeature: @@ -1040,6 +1582,7 @@ MatchAndEmitInstruction(SMLoc IDLoc, (Match1 == Match_Success) + (Match2 == Match_Success) + (Match3 == Match_Success) + (Match4 == Match_Success); if (NumSuccessfulMatches == 1) { + Inst.setLoc(IDLoc); Out.EmitInstruction(Inst); return false; } @@ -1078,21 +1621,24 @@ MatchAndEmitInstruction(SMLoc IDLoc, if ((Match1 == Match_MnemonicFail) && (Match2 == Match_MnemonicFail) && (Match3 == Match_MnemonicFail) && (Match4 == Match_MnemonicFail)) { if (!WasOriginallyInvalidOperand) { - Error(IDLoc, "invalid instruction mnemonic '" + Base + "'"); - return true; + return Error(IDLoc, "invalid instruction mnemonic '" + Base + "'", + Op->getLocRange()); } // Recover location info for the operand if we know which was the problem. - SMLoc ErrorLoc = IDLoc; if (OrigErrorInfo != ~0U) { if (OrigErrorInfo >= Operands.size()) return Error(IDLoc, "too few operands for instruction"); - ErrorLoc = ((X86Operand*)Operands[OrigErrorInfo])->getStartLoc(); - if (ErrorLoc == SMLoc()) ErrorLoc = IDLoc; + X86Operand *Operand = (X86Operand*)Operands[OrigErrorInfo]; + if (Operand->getStartLoc().isValid()) { + SMRange OperandRange = Operand->getLocRange(); + return Error(Operand->getStartLoc(), "invalid operand for instruction", + OperandRange); + } } - return Error(ErrorLoc, "invalid operand for instruction"); + return Error(IDLoc, "invalid operand for instruction"); } // If one instruction matched with a missing feature, report this as a @@ -1112,24 +1658,34 @@ MatchAndEmitInstruction(SMLoc IDLoc, } // If all of these were an outright failure, report it in a useless way. - // FIXME: We should give nicer diagnostics about the exact failure. 
Error(IDLoc, "unknown use of instruction mnemonic without a size suffix"); return true; } -bool X86ATTAsmParser::ParseDirective(AsmToken DirectiveID) { +bool X86AsmParser::ParseDirective(AsmToken DirectiveID) { StringRef IDVal = DirectiveID.getIdentifier(); if (IDVal == ".word") return ParseDirectiveWord(2, DirectiveID.getLoc()); else if (IDVal.startswith(".code")) return ParseDirectiveCode(IDVal, DirectiveID.getLoc()); + else if (IDVal.startswith(".intel_syntax")) { + getParser().setAssemblerDialect(1); + if (getLexer().isNot(AsmToken::EndOfStatement)) { + if(Parser.getTok().getString() == "noprefix") { + // FIXME : Handle noprefix + Parser.Lex(); + } else + return true; + } + return false; + } return true; } /// ParseDirectiveWord /// ::= .word [ expression (, expression)* ] -bool X86ATTAsmParser::ParseDirectiveWord(unsigned Size, SMLoc L) { +bool X86AsmParser::ParseDirectiveWord(unsigned Size, SMLoc L) { if (getLexer().isNot(AsmToken::EndOfStatement)) { for (;;) { const MCExpr *Value; @@ -1154,7 +1710,7 @@ bool X86ATTAsmParser::ParseDirectiveWord(unsigned Size, SMLoc L) { /// ParseDirectiveCode /// ::= .code32 | .code64 -bool X86ATTAsmParser::ParseDirectiveCode(StringRef IDVal, SMLoc L) { +bool X86AsmParser::ParseDirectiveCode(StringRef IDVal, SMLoc L) { if (IDVal == ".code32") { Parser.Lex(); if (is64BitMode()) { @@ -1179,8 +1735,8 @@ extern "C" void LLVMInitializeX86AsmLexer(); // Force static initialization. extern "C" void LLVMInitializeX86AsmParser() { - RegisterMCAsmParser<X86ATTAsmParser> X(TheX86_32Target); - RegisterMCAsmParser<X86ATTAsmParser> Y(TheX86_64Target); + RegisterMCAsmParser<X86AsmParser> X(TheX86_32Target); + RegisterMCAsmParser<X86AsmParser> Y(TheX86_64Target); LLVMInitializeX86AsmLexer(); } diff --git a/contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp b/contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp index 3aacb20..8278bde 100644 --- a/contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp +++ b/contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp @@ -1,4 +1,4 @@ -//===- X86Disassembler.cpp - Disassembler for x86 and x86_64 ----*- C++ -*-===// +//===-- X86Disassembler.cpp - Disassembler for x86 and x86_64 -------------===// // // The LLVM Compiler Infrastructure // @@ -18,9 +18,11 @@ #include "X86DisassemblerDecoder.h" #include "llvm/MC/EDInstInfo.h" -#include "llvm/MC/MCDisassembler.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCContext.h" #include "llvm/MC/MCDisassembler.h" #include "llvm/MC/MCInst.h" +#include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/Support/Debug.h" #include "llvm/Support/MemoryObject.h" @@ -42,6 +44,11 @@ void x86DisassemblerDebug(const char *file, dbgs() << file << ":" << line << ": " << s; } +const char *x86DisassemblerGetInstrName(unsigned Opcode, void *mii) { + const MCInstrInfo *MII = static_cast<const MCInstrInfo *>(mii); + return MII->getName(Opcode); +} + #define debug(s) DEBUG(x86DisassemblerDebug(__FILE__, __LINE__, s)); namespace llvm { @@ -65,17 +72,19 @@ extern Target TheX86_32Target, TheX86_64Target; } static bool translateInstruction(MCInst &target, - InternalInstruction &source); + InternalInstruction &source, + const MCDisassembler *Dis); -X86GenericDisassembler::X86GenericDisassembler(const MCSubtargetInfo &STI, DisassemblerMode mode) : - MCDisassembler(STI), - fMode(mode) { -} +X86GenericDisassembler::X86GenericDisassembler(const MCSubtargetInfo &STI, + DisassemblerMode mode, + const MCInstrInfo *MII) + : MCDisassembler(STI), MII(MII), 
fMode(mode) {} X86GenericDisassembler::~X86GenericDisassembler() { + delete MII; } -EDInstInfo *X86GenericDisassembler::getEDInfo() const { +const EDInstInfo *X86GenericDisassembler::getEDInfo() const { return instInfoX86; } @@ -116,6 +125,8 @@ X86GenericDisassembler::getInstruction(MCInst &instr, uint64_t address, raw_ostream &vStream, raw_ostream &cStream) const { + CommentStream = &cStream; + InternalInstruction internalInstr; dlog_t loggerFn = logger; @@ -127,6 +138,7 @@ X86GenericDisassembler::getInstruction(MCInst &instr, (void*)®ion, loggerFn, (void*)&vStream, + (void*)MII, address, fMode); @@ -136,7 +148,8 @@ X86GenericDisassembler::getInstruction(MCInst &instr, } else { size = internalInstr.length; - return (!translateInstruction(instr, internalInstr)) ? Success : Fail; + return (!translateInstruction(instr, internalInstr, this)) ? + Success : Fail; } } @@ -161,6 +174,140 @@ static void translateRegister(MCInst &mcInst, Reg reg) { mcInst.addOperand(MCOperand::CreateReg(llvmRegnum)); } +/// tryAddingSymbolicOperand - trys to add a symbolic operand in place of the +/// immediate Value in the MCInst. +/// +/// @param Value - The immediate Value, has had any PC adjustment made by +/// the caller. +/// @param isBranch - If the instruction is a branch instruction +/// @param Address - The starting address of the instruction +/// @param Offset - The byte offset to this immediate in the instruction +/// @param Width - The byte width of this immediate in the instruction +/// +/// If the getOpInfo() function was set when setupForSymbolicDisassembly() was +/// called then that function is called to get any symbolic information for the +/// immediate in the instruction using the Address, Offset and Width. If that +/// returns non-zero then the symbolic information it returns is used to create +/// an MCExpr and that is added as an operand to the MCInst. If getOpInfo() +/// returns zero and isBranch is true then a symbol look up for immediate Value +/// is done and if a symbol is found an MCExpr is created with that, else +/// an MCExpr with the immediate Value is created. This function returns true +/// if it adds an operand to the MCInst and false otherwise. +static bool tryAddingSymbolicOperand(int64_t Value, bool isBranch, + uint64_t Address, uint64_t Offset, + uint64_t Width, MCInst &MI, + const MCDisassembler *Dis) { + LLVMOpInfoCallback getOpInfo = Dis->getLLVMOpInfoCallback(); + struct LLVMOpInfo1 SymbolicOp; + memset(&SymbolicOp, '\0', sizeof(struct LLVMOpInfo1)); + SymbolicOp.Value = Value; + void *DisInfo = Dis->getDisInfoBlock(); + + if (!getOpInfo || + !getOpInfo(DisInfo, Address, Offset, Width, 1, &SymbolicOp)) { + // Clear SymbolicOp.Value from above and also all other fields. + memset(&SymbolicOp, '\0', sizeof(struct LLVMOpInfo1)); + LLVMSymbolLookupCallback SymbolLookUp = Dis->getLLVMSymbolLookupCallback(); + if (!SymbolLookUp) + return false; + uint64_t ReferenceType; + if (isBranch) + ReferenceType = LLVMDisassembler_ReferenceType_In_Branch; + else + ReferenceType = LLVMDisassembler_ReferenceType_InOut_None; + const char *ReferenceName; + const char *Name = SymbolLookUp(DisInfo, Value, &ReferenceType, Address, + &ReferenceName); + if (Name) { + SymbolicOp.AddSymbol.Name = Name; + SymbolicOp.AddSymbol.Present = true; + } + // For branches always create an MCExpr so it gets printed as hex address. 
+ else if (isBranch) { + SymbolicOp.Value = Value; + } + if(ReferenceType == LLVMDisassembler_ReferenceType_Out_SymbolStub) + (*Dis->CommentStream) << "symbol stub for: " << ReferenceName; + if (!Name && !isBranch) + return false; + } + + MCContext *Ctx = Dis->getMCContext(); + const MCExpr *Add = NULL; + if (SymbolicOp.AddSymbol.Present) { + if (SymbolicOp.AddSymbol.Name) { + StringRef Name(SymbolicOp.AddSymbol.Name); + MCSymbol *Sym = Ctx->GetOrCreateSymbol(Name); + Add = MCSymbolRefExpr::Create(Sym, *Ctx); + } else { + Add = MCConstantExpr::Create((int)SymbolicOp.AddSymbol.Value, *Ctx); + } + } + + const MCExpr *Sub = NULL; + if (SymbolicOp.SubtractSymbol.Present) { + if (SymbolicOp.SubtractSymbol.Name) { + StringRef Name(SymbolicOp.SubtractSymbol.Name); + MCSymbol *Sym = Ctx->GetOrCreateSymbol(Name); + Sub = MCSymbolRefExpr::Create(Sym, *Ctx); + } else { + Sub = MCConstantExpr::Create((int)SymbolicOp.SubtractSymbol.Value, *Ctx); + } + } + + const MCExpr *Off = NULL; + if (SymbolicOp.Value != 0) + Off = MCConstantExpr::Create(SymbolicOp.Value, *Ctx); + + const MCExpr *Expr; + if (Sub) { + const MCExpr *LHS; + if (Add) + LHS = MCBinaryExpr::CreateSub(Add, Sub, *Ctx); + else + LHS = MCUnaryExpr::CreateMinus(Sub, *Ctx); + if (Off != 0) + Expr = MCBinaryExpr::CreateAdd(LHS, Off, *Ctx); + else + Expr = LHS; + } else if (Add) { + if (Off != 0) + Expr = MCBinaryExpr::CreateAdd(Add, Off, *Ctx); + else + Expr = Add; + } else { + if (Off != 0) + Expr = Off; + else + Expr = MCConstantExpr::Create(0, *Ctx); + } + + MI.addOperand(MCOperand::CreateExpr(Expr)); + + return true; +} + +/// tryAddingPcLoadReferenceComment - trys to add a comment as to what is being +/// referenced by a load instruction with the base register that is the rip. +/// These can often be addresses in a literal pool. The Address of the +/// instruction and its immediate Value are used to determine the address +/// being referenced in the literal pool entry. The SymbolLookUp call back will +/// return a pointer to a literal 'C' string if the referenced address is an +/// address into a section with 'C' string literals. +static void tryAddingPcLoadReferenceComment(uint64_t Address, uint64_t Value, + const void *Decoder) { + const MCDisassembler *Dis = static_cast<const MCDisassembler*>(Decoder); + LLVMSymbolLookupCallback SymbolLookUp = Dis->getLLVMSymbolLookupCallback(); + if (SymbolLookUp) { + void *DisInfo = Dis->getDisInfoBlock(); + uint64_t ReferenceType = LLVMDisassembler_ReferenceType_In_PCrel_Load; + const char *ReferenceName; + (void)SymbolLookUp(DisInfo, Value, &ReferenceType, Address, &ReferenceName); + if(ReferenceType == LLVMDisassembler_ReferenceType_Out_LitPool_CstrAddr) + (*Dis->CommentStream) << "literal pool for: " << ReferenceName; + } +} + /// translateImmediate - Appends an immediate operand to an MCInst. /// /// @param mcInst - The MCInst to append to. @@ -169,10 +316,11 @@ static void translateRegister(MCInst &mcInst, Reg reg) { /// @param insn - The internal instruction. static void translateImmediate(MCInst &mcInst, uint64_t immediate, const OperandSpecifier &operand, - InternalInstruction &insn) { + InternalInstruction &insn, + const MCDisassembler *Dis) { // Sign-extend the immediate if necessary. 
- OperandType type = operand.type; + OperandType type = (OperandType)operand.type; if (type == TYPE_RELv) { switch (insn.displacementSize) { @@ -225,6 +373,8 @@ static void translateImmediate(MCInst &mcInst, uint64_t immediate, } } + bool isBranch = false; + uint64_t pcrel = 0; switch (type) { case TYPE_XMM128: mcInst.addOperand(MCOperand::CreateReg(X86::XMM0 + (immediate >> 4))); @@ -232,8 +382,11 @@ static void translateImmediate(MCInst &mcInst, uint64_t immediate, case TYPE_XMM256: mcInst.addOperand(MCOperand::CreateReg(X86::YMM0 + (immediate >> 4))); return; - case TYPE_MOFFS8: case TYPE_REL8: + isBranch = true; + pcrel = insn.startLocation + insn.immediateOffset + insn.immediateSize; + // fall through to sign extend the immediate if needed. + case TYPE_MOFFS8: if(immediate & 0x80) immediate |= ~(0xffull); break; @@ -241,9 +394,12 @@ static void translateImmediate(MCInst &mcInst, uint64_t immediate, if(immediate & 0x8000) immediate |= ~(0xffffull); break; - case TYPE_MOFFS32: case TYPE_REL32: case TYPE_REL64: + isBranch = true; + pcrel = insn.startLocation + insn.immediateOffset + insn.immediateSize; + // fall through to sign extend the immediate if needed. + case TYPE_MOFFS32: if(immediate & 0x80000000) immediate |= ~(0xffffffffull); break; @@ -253,7 +409,10 @@ static void translateImmediate(MCInst &mcInst, uint64_t immediate, break; } - mcInst.addOperand(MCOperand::CreateImm(immediate)); + if(!tryAddingSymbolicOperand(immediate + pcrel, isBranch, insn.startLocation, + insn.immediateOffset, insn.immediateSize, + mcInst, Dis)) + mcInst.addOperand(MCOperand::CreateImm(immediate)); } /// translateRMRegister - Translates a register stored in the R/M field of the @@ -300,7 +459,8 @@ static bool translateRMRegister(MCInst &mcInst, /// @param insn - The instruction to extract Mod, R/M, and SIB fields /// from. /// @return - 0 on success; nonzero otherwise -static bool translateRMMemory(MCInst &mcInst, InternalInstruction &insn) { +static bool translateRMMemory(MCInst &mcInst, InternalInstruction &insn, + const MCDisassembler *Dis) { // Addresses in an MCInst are represented as five operands: // 1. 
basereg (register) The R/M base, or (if there is a SIB) the // SIB base @@ -318,6 +478,7 @@ static bool translateRMMemory(MCInst &mcInst, InternalInstruction &insn) { MCOperand indexReg; MCOperand displacement; MCOperand segmentReg; + uint64_t pcrel = 0; if (insn.eaBase == EA_BASE_sib || insn.eaBase == EA_BASE_sib64) { if (insn.sibBase != SIB_BASE_NONE) { @@ -359,8 +520,14 @@ static bool translateRMMemory(MCInst &mcInst, InternalInstruction &insn) { debug("EA_BASE_NONE and EA_DISP_NONE for ModR/M base"); return true; } - if (insn.mode == MODE_64BIT) + if (insn.mode == MODE_64BIT){ + pcrel = insn.startLocation + + insn.displacementOffset + insn.displacementSize; + tryAddingPcLoadReferenceComment(insn.startLocation + + insn.displacementOffset, + insn.displacement + pcrel, Dis); baseReg = MCOperand::CreateReg(X86::RIP); // Section 2.2.1.6 + } else baseReg = MCOperand::CreateReg(0); @@ -426,7 +593,10 @@ static bool translateRMMemory(MCInst &mcInst, InternalInstruction &insn) { mcInst.addOperand(baseReg); mcInst.addOperand(scaleAmount); mcInst.addOperand(indexReg); - mcInst.addOperand(displacement); + if(!tryAddingSymbolicOperand(insn.displacement + pcrel, false, + insn.startLocation, insn.displacementOffset, + insn.displacementSize, mcInst, Dis)) + mcInst.addOperand(displacement); mcInst.addOperand(segmentReg); return false; } @@ -440,7 +610,7 @@ static bool translateRMMemory(MCInst &mcInst, InternalInstruction &insn) { /// from. /// @return - 0 on success; nonzero otherwise static bool translateRM(MCInst &mcInst, const OperandSpecifier &operand, - InternalInstruction &insn) { + InternalInstruction &insn, const MCDisassembler *Dis) { switch (operand.type) { default: debug("Unexpected type for a R/M operand"); @@ -480,7 +650,7 @@ static bool translateRM(MCInst &mcInst, const OperandSpecifier &operand, case TYPE_M1632: case TYPE_M1664: case TYPE_LEA: - return translateRMMemory(mcInst, insn); + return translateRMMemory(mcInst, insn, Dis); } } @@ -510,7 +680,8 @@ static bool translateFPRegister(MCInst &mcInst, /// @param insn - The internal instruction. /// @return - false on success; true otherwise. static bool translateOperand(MCInst &mcInst, const OperandSpecifier &operand, - InternalInstruction &insn) { + InternalInstruction &insn, + const MCDisassembler *Dis) { switch (operand.encoding) { default: debug("Unhandled operand encoding during translation"); @@ -519,7 +690,7 @@ static bool translateOperand(MCInst &mcInst, const OperandSpecifier &operand, translateRegister(mcInst, insn.reg); return false; case ENCODING_RM: - return translateRM(mcInst, operand, insn); + return translateRM(mcInst, operand, insn, Dis); case ENCODING_CB: case ENCODING_CW: case ENCODING_CD: @@ -537,7 +708,8 @@ static bool translateOperand(MCInst &mcInst, const OperandSpecifier &operand, translateImmediate(mcInst, insn.immediates[insn.numImmediatesTranslated++], operand, - insn); + insn, + Dis); return false; case ENCODING_RB: case ENCODING_RW: @@ -556,7 +728,7 @@ static bool translateOperand(MCInst &mcInst, const OperandSpecifier &operand, case ENCODING_DUP: return translateOperand(mcInst, insn.spec->operands[operand.type - TYPE_DUP0], - insn); + insn, Dis); } } @@ -567,7 +739,8 @@ static bool translateOperand(MCInst &mcInst, const OperandSpecifier &operand, /// @param insn - The internal instruction. /// @return - false on success; true otherwise. 
static bool translateInstruction(MCInst &mcInst, - InternalInstruction &insn) { + InternalInstruction &insn, + const MCDisassembler *Dis) { if (!insn.spec) { debug("Instruction has no specification"); return true; @@ -581,7 +754,7 @@ static bool translateInstruction(MCInst &mcInst, for (index = 0; index < X86_MAX_OPERANDS; ++index) { if (insn.spec->operands[index].encoding != ENCODING_NONE) { - if (translateOperand(mcInst, insn.spec->operands[index], insn)) { + if (translateOperand(mcInst, insn.spec->operands[index], insn, Dis)) { return true; } } @@ -590,12 +763,16 @@ static bool translateInstruction(MCInst &mcInst, return false; } -static MCDisassembler *createX86_32Disassembler(const Target &T, const MCSubtargetInfo &STI) { - return new X86Disassembler::X86_32Disassembler(STI); +static MCDisassembler *createX86_32Disassembler(const Target &T, + const MCSubtargetInfo &STI) { + return new X86Disassembler::X86GenericDisassembler(STI, MODE_32BIT, + T.createMCInstrInfo()); } -static MCDisassembler *createX86_64Disassembler(const Target &T, const MCSubtargetInfo &STI) { - return new X86Disassembler::X86_64Disassembler(STI); +static MCDisassembler *createX86_64Disassembler(const Target &T, + const MCSubtargetInfo &STI) { + return new X86Disassembler::X86GenericDisassembler(STI, MODE_64BIT, + T.createMCInstrInfo()); } extern "C" void LLVMInitializeX86Disassembler() { diff --git a/contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.h b/contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.h index 6ac9a0f..c11f51c 100644 --- a/contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.h +++ b/contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.h @@ -1,4 +1,4 @@ -//===- X86Disassembler.h - Disassembler for x86 and x86_64 ------*- C++ -*-===// +//===-- X86Disassembler.h - Disassembler for x86 and x86_64 -----*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -78,7 +78,7 @@ const char* name; #define INSTRUCTION_IDS \ - const InstrUID *instructionIDs; + unsigned instructionIDs; #include "X86DisassemblerDecoderCommon.h" @@ -87,11 +87,10 @@ #include "llvm/MC/MCDisassembler.h" -struct InternalInstruction; - namespace llvm { class MCInst; +class MCInstrInfo; class MCSubtargetInfo; class MemoryObject; class raw_ostream; @@ -104,13 +103,16 @@ namespace X86Disassembler { /// All each platform class should have to do is subclass the constructor, and /// provide a different disassemblerMode value. class X86GenericDisassembler : public MCDisassembler { -protected: + const MCInstrInfo *MII; +public: /// Constructor - Initializes the disassembler. /// /// @param mode - The X86 architecture mode to decode for. - X86GenericDisassembler(const MCSubtargetInfo &STI, DisassemblerMode mode); -public: + X86GenericDisassembler(const MCSubtargetInfo &STI, DisassemblerMode mode, + const MCInstrInfo *MII); +private: ~X86GenericDisassembler(); +public: /// getInstruction - See MCDisassembler. DecodeStatus getInstruction(MCInst &instr, @@ -121,37 +123,13 @@ public: raw_ostream &cStream) const; /// getEDInfo - See MCDisassembler. - EDInstInfo *getEDInfo() const; + const EDInstInfo *getEDInfo() const; private: DisassemblerMode fMode; }; -/// X86_16Disassembler - 16-bit X86 disassembler. -class X86_16Disassembler : public X86GenericDisassembler { -public: - X86_16Disassembler(const MCSubtargetInfo &STI) : - X86GenericDisassembler(STI, MODE_16BIT) { - } -}; - -/// X86_16Disassembler - 32-bit X86 disassembler. 
-class X86_32Disassembler : public X86GenericDisassembler { -public: - X86_32Disassembler(const MCSubtargetInfo &STI) : - X86GenericDisassembler(STI, MODE_32BIT) { - } -}; - -/// X86_16Disassembler - 64-bit X86 disassembler. -class X86_64Disassembler : public X86GenericDisassembler { -public: - X86_64Disassembler(const MCSubtargetInfo &STI) : - X86GenericDisassembler(STI, MODE_64BIT) { - } -}; - } // namespace X86Disassembler - + } // namespace llvm - + #endif diff --git a/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c b/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c index f9b0fe5..6020877 100644 --- a/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c +++ b/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c @@ -1,4 +1,4 @@ -/*===- X86DisassemblerDecoder.c - Disassembler decoder -------------*- C -*-==* +/*===-- X86DisassemblerDecoder.c - Disassembler decoder ------------*- C -*-===* * * The LLVM Compiler Infrastructure * @@ -82,11 +82,9 @@ static int modRMRequired(OpcodeType type, decision = &THREEBYTEA7_SYM; break; } - + return decision->opcodeDecisions[insnContext].modRMDecisions[opcode]. modrm_type != MODRM_ONEENTRY; - - return 0; } /* @@ -103,12 +101,9 @@ static InstrUID decode(OpcodeType type, InstructionContext insnContext, uint8_t opcode, uint8_t modRM) { - const struct ModRMDecision* dec; + const struct ModRMDecision* dec = 0; switch (type) { - default: - debug("Unknown opcode type"); - return 0; case ONEBYTE: dec = &ONEBYTE_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode]; break; @@ -134,14 +129,17 @@ static InstrUID decode(OpcodeType type, debug("Corrupt table! Unknown modrm_type"); return 0; case MODRM_ONEENTRY: - return dec->instructionIDs[0]; + return modRMTable[dec->instructionIDs]; case MODRM_SPLITRM: if (modFromModRM(modRM) == 0x3) - return dec->instructionIDs[1]; - else - return dec->instructionIDs[0]; + return modRMTable[dec->instructionIDs+1]; + return modRMTable[dec->instructionIDs]; + case MODRM_SPLITREG: + if (modFromModRM(modRM) == 0x3) + return modRMTable[dec->instructionIDs+((modRM & 0x38) >> 3)+8]; + return modRMTable[dec->instructionIDs+((modRM & 0x38) >> 3)]; case MODRM_FULL: - return dec->instructionIDs[modRM]; + return modRMTable[dec->instructionIDs+modRM]; } } @@ -314,6 +312,15 @@ static int readPrefixes(struct InternalInstruction* insn) { if (consumeByte(insn, &byte)) return -1; + + /* + * If the first byte is a LOCK prefix break and let it be disassembled + * as a lock "instruction", by creating an <MCInst #xxxx LOCK_PREFIX>. + * FIXME there is currently no way to get the disassembler to print the + * lock prefix if it is not the first byte. + */ + if (insn->readerCursor - 1 == insn->startLocation && byte == 0xf0) + break; switch (byte) { case 0xf0: /* LOCK */ @@ -712,7 +719,7 @@ static BOOL is16BitEquvalent(const char* orig, const char* equiv) { * @return - 0 if the ModR/M could be read when needed or was not needed; * nonzero otherwise. 
*/ -static int getID(struct InternalInstruction* insn) { +static int getID(struct InternalInstruction* insn, void *miiArg) { uint8_t attrMask; uint16_t instructionID; @@ -765,6 +772,8 @@ static int getID(struct InternalInstruction* insn) { else { if (isPrefixAtLocation(insn, 0x66, insn->necessaryPrefixLocation)) attrMask |= ATTR_OPSIZE; + else if (isPrefixAtLocation(insn, 0x67, insn->necessaryPrefixLocation)) + attrMask |= ATTR_ADSIZE; else if (isPrefixAtLocation(insn, 0xf3, insn->necessaryPrefixLocation)) attrMask |= ATTR_XS; else if (isPrefixAtLocation(insn, 0xf2, insn->necessaryPrefixLocation)) @@ -773,17 +782,20 @@ static int getID(struct InternalInstruction* insn) { if (insn->rexPrefix & 0x08) attrMask |= ATTR_REXW; - + if (getIDWithAttrMask(&instructionID, insn, attrMask)) return -1; - + /* The following clauses compensate for limitations of the tables. */ - - if ((attrMask & ATTR_VEXL) && (attrMask & ATTR_REXW)) { + + if ((attrMask & ATTR_VEXL) && (attrMask & ATTR_REXW) && + !(attrMask & ATTR_OPSIZE)) { /* * Some VEX instructions ignore the L-bit, but use the W-bit. Normally L-bit * has precedence since there are no L-bit with W-bit entries in the tables. * So if the L-bit isn't significant we should use the W-bit instead. + * We only need to do this if the instruction doesn't specify OpSize since + * there is a VEX_L_W_OPSIZE table. */ const struct InstructionSpecifier *spec; @@ -823,7 +835,7 @@ static int getID(struct InternalInstruction* insn) { const struct InstructionSpecifier *spec; uint16_t instructionIDWithOpsize; - const struct InstructionSpecifier *specWithOpsize; + const char *specName, *specWithOpSizeName; spec = specifierForUID(instructionID); @@ -840,11 +852,13 @@ static int getID(struct InternalInstruction* insn) { return 0; } - specWithOpsize = specifierForUID(instructionIDWithOpsize); - - if (is16BitEquvalent(spec->name, specWithOpsize->name)) { + specName = x86DisassemblerGetInstrName(instructionID, miiArg); + specWithOpSizeName = + x86DisassemblerGetInstrName(instructionIDWithOpsize, miiArg); + + if (is16BitEquvalent(specName, specWithOpSizeName)) { insn->instructionID = instructionIDWithOpsize; - insn->spec = specWithOpsize; + insn->spec = specifierForUID(instructionIDWithOpsize); } else { insn->instructionID = instructionID; insn->spec = spec; @@ -1011,6 +1025,7 @@ static int readDisplacement(struct InternalInstruction* insn) { return 0; insn->consumedDisplacement = TRUE; + insn->displacementOffset = insn->readerCursor - insn->startLocation; switch (insn->eaDisplacement) { case EA_DISP_NONE: @@ -1407,6 +1422,7 @@ static int readImmediate(struct InternalInstruction* insn, uint8_t size) { size = insn->immediateSize; else insn->immediateSize = size; + insn->immediateOffset = insn->readerCursor - insn->startLocation; switch (size) { case 1: @@ -1469,6 +1485,7 @@ static int readVVVV(struct InternalInstruction* insn) { static int readOperands(struct InternalInstruction* insn) { int index; int hasVVVV, needVVVV; + int sawRegImm = 0; dbgprintf(insn, "readOperands()"); @@ -1497,11 +1514,25 @@ static int readOperands(struct InternalInstruction* insn) { dbgprintf(insn, "We currently don't hande code-offset encodings"); return -1; case ENCODING_IB: + if (sawRegImm) { + /* Saw a register immediate so don't read again and instead split the + previous immediate. FIXME: This is a hack. 
*/ + insn->immediates[insn->numImmediatesConsumed] = + insn->immediates[insn->numImmediatesConsumed - 1] & 0xf; + ++insn->numImmediatesConsumed; + break; + } if (readImmediate(insn, 1)) return -1; if (insn->spec->operands[index].type == TYPE_IMM3 && insn->immediates[insn->numImmediatesConsumed - 1] > 7) return -1; + if (insn->spec->operands[index].type == TYPE_IMM5 && + insn->immediates[insn->numImmediatesConsumed - 1] > 31) + return -1; + if (insn->spec->operands[index].type == TYPE_XMM128 || + insn->spec->operands[index].type == TYPE_XMM256) + sawRegImm = 1; break; case ENCODING_IW: if (readImmediate(insn, 2)) @@ -1593,6 +1624,7 @@ int decodeInstruction(struct InternalInstruction* insn, void* readerArg, dlog_t logger, void* loggerArg, + void* miiArg, uint64_t startLoc, DisassemblerMode mode) { memset(insn, 0, sizeof(struct InternalInstruction)); @@ -1608,7 +1640,7 @@ int decodeInstruction(struct InternalInstruction* insn, if (readPrefixes(insn) || readOpcode(insn) || - getID(insn) || + getID(insn, miiArg) || insn->instructionID == 0 || readOperands(insn)) return -1; diff --git a/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h b/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h index a9c90f8..fae309b 100644 --- a/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h +++ b/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h @@ -1,4 +1,4 @@ -/*===- X86DisassemblerDecoderInternal.h - Disassembler decoder -----*- C -*-==* +/*===-- X86DisassemblerDecoderInternal.h - Disassembler decoder ---*- C -*-===* * * The LLVM Compiler Infrastructure * @@ -20,11 +20,10 @@ extern "C" { #endif -#define INSTRUCTION_SPECIFIER_FIELDS \ - const char* name; +#define INSTRUCTION_SPECIFIER_FIELDS #define INSTRUCTION_IDS \ - const InstrUID *instructionIDs; + unsigned instructionIDs; #include "X86DisassemblerDecoderCommon.h" @@ -460,6 +459,11 @@ struct InternalInstruction { uint8_t addressSize; uint8_t displacementSize; uint8_t immediateSize; + + /* Offsets from the start of the instruction to the pieces of data, which is + needed to find relocation entries for adding symbolic operands */ + uint8_t displacementOffset; + uint8_t immediateOffset; /* opcode state */ @@ -554,6 +558,7 @@ int decodeInstruction(struct InternalInstruction* insn, void* readerArg, dlog_t logger, void* loggerArg, + void* miiArg, uint64_t startLoc, DisassemblerMode mode); @@ -568,6 +573,8 @@ void x86DisassemblerDebug(const char *file, unsigned line, const char *s); +const char *x86DisassemblerGetInstrName(unsigned Opcode, void *mii); + #ifdef __cplusplus } #endif diff --git a/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h b/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h index 8b79335..13e1136 100644 --- a/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h +++ b/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h @@ -1,4 +1,4 @@ -/*===- X86DisassemblerDecoderCommon.h - Disassembler decoder -------*- C -*-==* +/*===-- X86DisassemblerDecoderCommon.h - Disassembler decoder -----*- C -*-===* * * The LLVM Compiler Infrastructure * @@ -54,8 +54,9 @@ ENUM_ENTRY(ATTR_XD, 0x04) \ ENUM_ENTRY(ATTR_REXW, 0x08) \ ENUM_ENTRY(ATTR_OPSIZE, 0x10) \ - ENUM_ENTRY(ATTR_VEX, 0x20) \ - ENUM_ENTRY(ATTR_VEXL, 0x40) + ENUM_ENTRY(ATTR_ADSIZE, 0x20) \ + ENUM_ENTRY(ATTR_VEX, 0x40) \ + ENUM_ENTRY(ATTR_VEXL, 0x80) #define ENUM_ENTRY(n, v) n = v, enum attributeBits { @@ -77,6 +78,8 @@ enum attributeBits { "64-bit mode but no 
more") \ ENUM_ENTRY(IC_OPSIZE, 3, "requires an OPSIZE prefix, so " \ "operands change width") \ + ENUM_ENTRY(IC_ADSIZE, 3, "requires an ADSIZE prefix, so " \ + "operands change width") \ ENUM_ENTRY(IC_XD, 2, "may say something about the opcode " \ "but not the operands") \ ENUM_ENTRY(IC_XS, 2, "may say something about the opcode " \ @@ -88,6 +91,7 @@ enum attributeBits { ENUM_ENTRY(IC_64BIT_REXW, 4, "requires a REX.W prefix, so operands "\ "change width; overrides IC_OPSIZE") \ ENUM_ENTRY(IC_64BIT_OPSIZE, 3, "Just as meaningful as IC_OPSIZE") \ + ENUM_ENTRY(IC_64BIT_ADSIZE, 3, "Just as meaningful as IC_ADSIZE") \ ENUM_ENTRY(IC_64BIT_XD, 5, "XD instructions are SSE; REX.W is " \ "secondary") \ ENUM_ENTRY(IC_64BIT_XS, 5, "Just as meaningful as IC_64BIT_XD") \ @@ -111,7 +115,8 @@ enum attributeBits { ENUM_ENTRY(IC_VEX_L, 3, "requires VEX and the L prefix") \ ENUM_ENTRY(IC_VEX_L_XS, 4, "requires VEX and the L and XS prefix")\ ENUM_ENTRY(IC_VEX_L_XD, 4, "requires VEX and the L and XD prefix")\ - ENUM_ENTRY(IC_VEX_L_OPSIZE, 4, "requires VEX, L, and OpSize") + ENUM_ENTRY(IC_VEX_L_OPSIZE, 4, "requires VEX, L, and OpSize") \ + ENUM_ENTRY(IC_VEX_L_W_OPSIZE, 5, "requires VEX, L, W and OpSize") #define ENUM_ENTRY(n, r, d) n, @@ -155,6 +160,8 @@ typedef uint16_t InstrUID; * MODRM_SPLITRM - If the ModR/M byte is between 0x00 and 0xbf, the opcode * corresponds to one instruction; otherwise, it corresponds to * a different instruction. + * MODRM_SPLITREG - ModR/M byte divided by 8 is used to select instruction. This + corresponds to instructions that use reg field as opcode * MODRM_FULL - Potentially, each value of the ModR/M byte could correspond * to a different instruction. */ @@ -162,6 +169,7 @@ typedef uint16_t InstrUID; #define MODRMTYPES \ ENUM_ENTRY(MODRM_ONEENTRY) \ ENUM_ENTRY(MODRM_SPLITRM) \ + ENUM_ENTRY(MODRM_SPLITREG) \ ENUM_ENTRY(MODRM_FULL) #define ENUM_ENTRY(n) n, @@ -265,6 +273,7 @@ struct ContextDecision { ENUM_ENTRY(TYPE_IMM32, "4-byte") \ ENUM_ENTRY(TYPE_IMM64, "8-byte") \ ENUM_ENTRY(TYPE_IMM3, "1-byte immediate operand between 0 and 7") \ + ENUM_ENTRY(TYPE_IMM5, "1-byte immediate operand between 0 and 31") \ ENUM_ENTRY(TYPE_RM8, "1-byte register or memory operand") \ ENUM_ENTRY(TYPE_RM16, "2-byte") \ ENUM_ENTRY(TYPE_RM32, "4-byte") \ @@ -335,8 +344,8 @@ typedef enum { * operand. */ struct OperandSpecifier { - OperandEncoding encoding; - OperandType type; + uint8_t encoding; + uint8_t type; }; /* @@ -363,7 +372,7 @@ typedef enum { * its operands. */ struct InstructionSpecifier { - ModifierType modifierType; + uint8_t modifierType; uint8_t modifierBase; struct OperandSpecifier operands[X86_MAX_OPERANDS]; diff --git a/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp b/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp index 029d491..5118e4c 100644 --- a/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp +++ b/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp @@ -19,6 +19,8 @@ #include "llvm/MC/MCInst.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCRegisterInfo.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/Format.h" #include "llvm/Support/FormattedStream.h" @@ -26,14 +28,9 @@ using namespace llvm; // Include the auto-generated portion of the assembly writer. 
-#define GET_INSTRUCTION_NAME #define PRINT_ALIAS_INSTR #include "X86GenAsmWriter.inc" -X86ATTInstPrinter::X86ATTInstPrinter(const MCAsmInfo &MAI) - : MCInstPrinter(MAI) { -} - void X86ATTInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const { OS << '%' << getRegisterName(RegNo); @@ -45,29 +42,50 @@ void X86ATTInstPrinter::printInst(const MCInst *MI, raw_ostream &OS, if (!printAliasInstr(MI, OS)) printInstruction(MI, OS); + // Next always print the annotation. + printAnnotation(OS, Annot); + // If verbose assembly is enabled, we can print some informative comments. - if (CommentStream) { - printAnnotation(OS, Annot); + if (CommentStream) EmitAnyX86InstComments(MI, *CommentStream, getRegisterName); - } -} - -StringRef X86ATTInstPrinter::getOpcodeName(unsigned Opcode) const { - return getInstructionName(Opcode); } void X86ATTInstPrinter::printSSECC(const MCInst *MI, unsigned Op, raw_ostream &O) { switch (MI->getOperand(Op).getImm()) { - default: assert(0 && "Invalid ssecc argument!"); - case 0: O << "eq"; break; - case 1: O << "lt"; break; - case 2: O << "le"; break; - case 3: O << "unord"; break; - case 4: O << "neq"; break; - case 5: O << "nlt"; break; - case 6: O << "nle"; break; - case 7: O << "ord"; break; + default: llvm_unreachable("Invalid ssecc argument!"); + case 0: O << "eq"; break; + case 1: O << "lt"; break; + case 2: O << "le"; break; + case 3: O << "unord"; break; + case 4: O << "neq"; break; + case 5: O << "nlt"; break; + case 6: O << "nle"; break; + case 7: O << "ord"; break; + case 8: O << "eq_uq"; break; + case 9: O << "nge"; break; + case 0xa: O << "ngt"; break; + case 0xb: O << "false"; break; + case 0xc: O << "neq_oq"; break; + case 0xd: O << "ge"; break; + case 0xe: O << "gt"; break; + case 0xf: O << "true"; break; + case 0x10: O << "eq_os"; break; + case 0x11: O << "lt_oq"; break; + case 0x12: O << "le_oq"; break; + case 0x13: O << "unord_s"; break; + case 0x14: O << "neq_us"; break; + case 0x15: O << "nlt_uq"; break; + case 0x16: O << "nle_uq"; break; + case 0x17: O << "ord_s"; break; + case 0x18: O << "eq_us"; break; + case 0x19: O << "nge_uq"; break; + case 0x1a: O << "ngt_uq"; break; + case 0x1b: O << "false_os"; break; + case 0x1c: O << "neq_os"; break; + case 0x1d: O << "ge_oq"; break; + case 0x1e: O << "gt_oq"; break; + case 0x1f: O << "true_us"; break; } } @@ -79,11 +97,21 @@ void X86ATTInstPrinter::print_pcrel_imm(const MCInst *MI, unsigned OpNo, raw_ostream &O) { const MCOperand &Op = MI->getOperand(OpNo); if (Op.isImm()) - // Print this as a signed 32-bit value. - O << (int)Op.getImm(); + O << Op.getImm(); else { assert(Op.isExpr() && "unknown pcrel immediate operand"); - O << *Op.getExpr(); + // If a symbolic branch target was added as a constant expression then print + // that address in hex. + const MCConstantExpr *BranchTarget = dyn_cast<MCConstantExpr>(Op.getExpr()); + int64_t Address; + if (BranchTarget && BranchTarget->EvaluateAsAbsolute(Address)) { + O << "0x"; + O.write_hex(Address); + } + else { + // Otherwise, just print the expression. 
+ O << *Op.getExpr(); + } } } @@ -97,7 +125,7 @@ void X86ATTInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, O << '$' << (int64_t)Op.getImm(); if (CommentStream && (Op.getImm() > 255 || Op.getImm() < -256)) - *CommentStream << format("imm = 0x%llX\n", (long long)Op.getImm()); + *CommentStream << format("imm = 0x%" PRIX64 "\n", (uint64_t)Op.getImm()); } else { assert(Op.isExpr() && "unknown operand kind in printOperand"); diff --git a/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h b/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h index 0293869..2e00bff 100644 --- a/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h +++ b/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h @@ -1,4 +1,4 @@ -//===-- X86ATTInstPrinter.h - Convert X86 MCInst to assembly syntax -------===// +//==- X86ATTInstPrinter.h - Convert X86 MCInst to assembly syntax -*- C++ -*-=// // // The LLVM Compiler Infrastructure // @@ -22,11 +22,12 @@ class MCOperand; class X86ATTInstPrinter : public MCInstPrinter { public: - X86ATTInstPrinter(const MCAsmInfo &MAI); - + X86ATTInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, + const MCRegisterInfo &MRI) + : MCInstPrinter(MAI, MII, MRI) {} + virtual void printRegName(raw_ostream &OS, unsigned RegNo) const; virtual void printInst(const MCInst *MI, raw_ostream &OS, StringRef Annot); - virtual StringRef getOpcodeName(unsigned Opcode) const; // Autogenerated by tblgen, returns true if we successfully printed an // alias. @@ -35,7 +36,6 @@ public: // Autogenerated by tblgen. void printInstruction(const MCInst *MI, raw_ostream &OS); static const char *getRegisterName(unsigned RegNo); - static const char *getInstructionName(unsigned Opcode); void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &OS); void printMemReference(const MCInst *MI, unsigned Op, raw_ostream &OS); diff --git a/contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.cpp b/contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.cpp index 8d85b95..f532019 100644 --- a/contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.cpp +++ b/contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.cpp @@ -29,11 +29,17 @@ using namespace llvm; void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, const char *(*getRegName)(unsigned)) { // If this is a shuffle operation, the switch should fill in this state. 
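The expanded printSSECC switch above (mirrored in the Intel printer later in this patch) maps the AVX comparison immediate 0 through 0x1f to a predicate suffix. The same mapping expressed as a single lookup table, as an illustrative standalone sketch rather than the printer's actual implementation:

#include <cassert>

// Immediate (0..0x1f) -> AVX comparison-predicate suffix, mirroring printSSECC.
static const char *const SSECCNames[32] = {
  "eq",    "lt",     "le",      "unord",   "neq",    "nlt",    "nle",    "ord",
  "eq_uq", "nge",    "ngt",     "false",   "neq_oq", "ge",     "gt",     "true",
  "eq_os", "lt_oq",  "le_oq",   "unord_s", "neq_us", "nlt_uq", "nle_uq", "ord_s",
  "eq_us", "nge_uq", "ngt_uq",  "false_os","neq_os", "ge_oq",  "gt_oq",  "true_us"
};

static const char *sseCCName(unsigned Imm) {
  assert(Imm < 32 && "Invalid ssecc argument!");
  return SSECCNames[Imm];
}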
- SmallVector<unsigned, 8> ShuffleMask; + SmallVector<int, 8> ShuffleMask; const char *DestName = 0, *Src1Name = 0, *Src2Name = 0; switch (MI->getOpcode()) { case X86::INSERTPSrr: + Src1Name = getRegName(MI->getOperand(0).getReg()); + Src2Name = getRegName(MI->getOperand(2).getReg()); + DecodeINSERTPSMask(MI->getOperand(3).getImm(), ShuffleMask); + break; + case X86::VINSERTPSrr: + DestName = getRegName(MI->getOperand(0).getReg()); Src1Name = getRegName(MI->getOperand(1).getReg()); Src2Name = getRegName(MI->getOperand(2).getReg()); DecodeINSERTPSMask(MI->getOperand(3).getImm(), ShuffleMask); @@ -44,34 +50,61 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, Src1Name = getRegName(MI->getOperand(0).getReg()); DecodeMOVLHPSMask(2, ShuffleMask); break; + case X86::VMOVLHPSrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + DecodeMOVLHPSMask(2, ShuffleMask); + break; case X86::MOVHLPSrr: Src2Name = getRegName(MI->getOperand(2).getReg()); Src1Name = getRegName(MI->getOperand(0).getReg()); DecodeMOVHLPSMask(2, ShuffleMask); break; + case X86::VMOVHLPSrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + DecodeMOVHLPSMask(2, ShuffleMask); + break; case X86::PSHUFDri: + case X86::VPSHUFDri: Src1Name = getRegName(MI->getOperand(1).getReg()); // FALL THROUGH. case X86::PSHUFDmi: + case X86::VPSHUFDmi: DestName = getRegName(MI->getOperand(0).getReg()); - DecodePSHUFMask(4, MI->getOperand(MI->getNumOperands()-1).getImm(), + DecodePSHUFMask(MVT::v4i32, MI->getOperand(MI->getNumOperands()-1).getImm(), + ShuffleMask); + break; + case X86::VPSHUFDYri: + Src1Name = getRegName(MI->getOperand(1).getReg()); + // FALL THROUGH. + case X86::VPSHUFDYmi: + DestName = getRegName(MI->getOperand(0).getReg()); + DecodePSHUFMask(MVT::v8i32, MI->getOperand(MI->getNumOperands()-1).getImm(), ShuffleMask); break; + case X86::PSHUFHWri: + case X86::VPSHUFHWri: Src1Name = getRegName(MI->getOperand(1).getReg()); // FALL THROUGH. case X86::PSHUFHWmi: + case X86::VPSHUFHWmi: DestName = getRegName(MI->getOperand(0).getReg()); DecodePSHUFHWMask(MI->getOperand(MI->getNumOperands()-1).getImm(), ShuffleMask); break; case X86::PSHUFLWri: + case X86::VPSHUFLWri: Src1Name = getRegName(MI->getOperand(1).getReg()); // FALL THROUGH. case X86::PSHUFLWmi: + case X86::VPSHUFLWmi: DestName = getRegName(MI->getOperand(0).getReg()); DecodePSHUFLWMask(MI->getOperand(MI->getNumOperands()-1).getImm(), ShuffleMask); @@ -82,28 +115,92 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, // FALL THROUGH. case X86::PUNPCKHBWrm: Src1Name = getRegName(MI->getOperand(0).getReg()); - DecodePUNPCKHMask(16, ShuffleMask); + DecodeUNPCKHMask(MVT::v16i8, ShuffleMask); + break; + case X86::VPUNPCKHBWrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::VPUNPCKHBWrm: + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + DecodeUNPCKHMask(MVT::v16i8, ShuffleMask); + break; + case X86::VPUNPCKHBWYrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. 
+ case X86::VPUNPCKHBWYrm: + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + DecodeUNPCKHMask(MVT::v32i8, ShuffleMask); break; case X86::PUNPCKHWDrr: Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. case X86::PUNPCKHWDrm: Src1Name = getRegName(MI->getOperand(0).getReg()); - DecodePUNPCKHMask(8, ShuffleMask); + DecodeUNPCKHMask(MVT::v8i16, ShuffleMask); + break; + case X86::VPUNPCKHWDrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::VPUNPCKHWDrm: + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + DecodeUNPCKHMask(MVT::v8i16, ShuffleMask); + break; + case X86::VPUNPCKHWDYrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::VPUNPCKHWDYrm: + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + DecodeUNPCKHMask(MVT::v16i16, ShuffleMask); break; case X86::PUNPCKHDQrr: Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. case X86::PUNPCKHDQrm: Src1Name = getRegName(MI->getOperand(0).getReg()); - DecodePUNPCKHMask(4, ShuffleMask); + DecodeUNPCKHMask(MVT::v4i32, ShuffleMask); + break; + case X86::VPUNPCKHDQrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::VPUNPCKHDQrm: + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + DecodeUNPCKHMask(MVT::v4i32, ShuffleMask); + break; + case X86::VPUNPCKHDQYrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::VPUNPCKHDQYrm: + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + DecodeUNPCKHMask(MVT::v8i32, ShuffleMask); break; case X86::PUNPCKHQDQrr: Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. case X86::PUNPCKHQDQrm: Src1Name = getRegName(MI->getOperand(0).getReg()); - DecodePUNPCKHMask(2, ShuffleMask); + DecodeUNPCKHMask(MVT::v2i64, ShuffleMask); + break; + case X86::VPUNPCKHQDQrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::VPUNPCKHQDQrm: + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + DecodeUNPCKHMask(MVT::v2i64, ShuffleMask); + break; + case X86::VPUNPCKHQDQYrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::VPUNPCKHQDQYrm: + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + DecodeUNPCKHMask(MVT::v4i64, ShuffleMask); break; case X86::PUNPCKLBWrr: @@ -111,126 +208,284 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, // FALL THROUGH. case X86::PUNPCKLBWrm: Src1Name = getRegName(MI->getOperand(0).getReg()); - DecodePUNPCKLBWMask(16, ShuffleMask); + DecodeUNPCKLMask(MVT::v16i8, ShuffleMask); + break; + case X86::VPUNPCKLBWrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::VPUNPCKLBWrm: + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + DecodeUNPCKLMask(MVT::v16i8, ShuffleMask); + break; + case X86::VPUNPCKLBWYrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. 
+ case X86::VPUNPCKLBWYrm: + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + DecodeUNPCKLMask(MVT::v32i8, ShuffleMask); break; case X86::PUNPCKLWDrr: Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. case X86::PUNPCKLWDrm: Src1Name = getRegName(MI->getOperand(0).getReg()); - DecodePUNPCKLWDMask(8, ShuffleMask); + DecodeUNPCKLMask(MVT::v8i16, ShuffleMask); + break; + case X86::VPUNPCKLWDrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::VPUNPCKLWDrm: + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + DecodeUNPCKLMask(MVT::v8i16, ShuffleMask); + break; + case X86::VPUNPCKLWDYrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::VPUNPCKLWDYrm: + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + DecodeUNPCKLMask(MVT::v16i16, ShuffleMask); break; case X86::PUNPCKLDQrr: Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. case X86::PUNPCKLDQrm: Src1Name = getRegName(MI->getOperand(0).getReg()); - DecodePUNPCKLDQMask(4, ShuffleMask); + DecodeUNPCKLMask(MVT::v4i32, ShuffleMask); + break; + case X86::VPUNPCKLDQrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::VPUNPCKLDQrm: + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + DecodeUNPCKLMask(MVT::v4i32, ShuffleMask); + break; + case X86::VPUNPCKLDQYrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::VPUNPCKLDQYrm: + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + DecodeUNPCKLMask(MVT::v8i32, ShuffleMask); break; case X86::PUNPCKLQDQrr: Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. case X86::PUNPCKLQDQrm: Src1Name = getRegName(MI->getOperand(0).getReg()); - DecodePUNPCKLQDQMask(2, ShuffleMask); + DecodeUNPCKLMask(MVT::v2i64, ShuffleMask); + break; + case X86::VPUNPCKLQDQrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::VPUNPCKLQDQrm: + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + DecodeUNPCKLMask(MVT::v2i64, ShuffleMask); + break; + case X86::VPUNPCKLQDQYrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::VPUNPCKLQDQYrm: + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + DecodeUNPCKLMask(MVT::v4i64, ShuffleMask); break; case X86::SHUFPDrri: Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. case X86::SHUFPDrmi: - DecodeSHUFPSMask(2, MI->getOperand(3).getImm(), ShuffleMask); + DecodeSHUFPMask(MVT::v2f64, MI->getOperand(MI->getNumOperands()-1).getImm(), + ShuffleMask); Src1Name = getRegName(MI->getOperand(0).getReg()); break; + case X86::VSHUFPDrri: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::VSHUFPDrmi: + DecodeSHUFPMask(MVT::v2f64, MI->getOperand(MI->getNumOperands()-1).getImm(), + ShuffleMask); + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + break; + case X86::VSHUFPDYrri: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. 
+ case X86::VSHUFPDYrmi: + DecodeSHUFPMask(MVT::v4f64, MI->getOperand(MI->getNumOperands()-1).getImm(), + ShuffleMask); + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + break; case X86::SHUFPSrri: Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. case X86::SHUFPSrmi: - DecodeSHUFPSMask(4, MI->getOperand(3).getImm(), ShuffleMask); + DecodeSHUFPMask(MVT::v4f32, MI->getOperand(MI->getNumOperands()-1).getImm(), + ShuffleMask); Src1Name = getRegName(MI->getOperand(0).getReg()); break; + case X86::VSHUFPSrri: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::VSHUFPSrmi: + DecodeSHUFPMask(MVT::v4f32, MI->getOperand(MI->getNumOperands()-1).getImm(), + ShuffleMask); + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + break; + case X86::VSHUFPSYrri: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::VSHUFPSYrmi: + DecodeSHUFPMask(MVT::v8f32, MI->getOperand(MI->getNumOperands()-1).getImm(), + ShuffleMask); + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + break; case X86::UNPCKLPDrr: Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. case X86::UNPCKLPDrm: - DecodeUNPCKLPDMask(2, ShuffleMask); + DecodeUNPCKLMask(MVT::v2f64, ShuffleMask); Src1Name = getRegName(MI->getOperand(0).getReg()); break; case X86::VUNPCKLPDrr: Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. case X86::VUNPCKLPDrm: - DecodeUNPCKLPDMask(2, ShuffleMask); + DecodeUNPCKLMask(MVT::v2f64, ShuffleMask); Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); break; case X86::VUNPCKLPDYrr: Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. case X86::VUNPCKLPDYrm: - DecodeUNPCKLPDMask(4, ShuffleMask); + DecodeUNPCKLMask(MVT::v4f64, ShuffleMask); Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); break; case X86::UNPCKLPSrr: Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. case X86::UNPCKLPSrm: - DecodeUNPCKLPSMask(4, ShuffleMask); + DecodeUNPCKLMask(MVT::v4f32, ShuffleMask); Src1Name = getRegName(MI->getOperand(0).getReg()); break; case X86::VUNPCKLPSrr: Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. case X86::VUNPCKLPSrm: - DecodeUNPCKLPSMask(4, ShuffleMask); + DecodeUNPCKLMask(MVT::v4f32, ShuffleMask); Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); break; case X86::VUNPCKLPSYrr: Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. case X86::VUNPCKLPSYrm: - DecodeUNPCKLPSMask(8, ShuffleMask); + DecodeUNPCKLMask(MVT::v8f32, ShuffleMask); Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); break; case X86::UNPCKHPDrr: Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. case X86::UNPCKHPDrm: - DecodeUNPCKHPMask(2, ShuffleMask); + DecodeUNPCKHMask(MVT::v2f64, ShuffleMask); Src1Name = getRegName(MI->getOperand(0).getReg()); break; + case X86::VUNPCKHPDrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. 
+ case X86::VUNPCKHPDrm: + DecodeUNPCKHMask(MVT::v2f64, ShuffleMask); + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + break; + case X86::VUNPCKHPDYrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::VUNPCKHPDYrm: + DecodeUNPCKHMask(MVT::v4f64, ShuffleMask); + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + break; case X86::UNPCKHPSrr: Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. case X86::UNPCKHPSrm: - DecodeUNPCKHPMask(4, ShuffleMask); + DecodeUNPCKHMask(MVT::v4f32, ShuffleMask); Src1Name = getRegName(MI->getOperand(0).getReg()); break; + case X86::VUNPCKHPSrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::VUNPCKHPSrm: + DecodeUNPCKHMask(MVT::v4f32, ShuffleMask); + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + break; + case X86::VUNPCKHPSYrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::VUNPCKHPSYrm: + DecodeUNPCKHMask(MVT::v8f32, ShuffleMask); + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + break; case X86::VPERMILPSri: - DecodeVPERMILPSMask(4, MI->getOperand(2).getImm(), - ShuffleMask); - Src1Name = getRegName(MI->getOperand(0).getReg()); + Src1Name = getRegName(MI->getOperand(1).getReg()); + // FALL THROUGH. + case X86::VPERMILPSmi: + DecodePSHUFMask(MVT::v4f32, MI->getOperand(MI->getNumOperands()-1).getImm(), + ShuffleMask); + DestName = getRegName(MI->getOperand(0).getReg()); break; case X86::VPERMILPSYri: - DecodeVPERMILPSMask(8, MI->getOperand(2).getImm(), - ShuffleMask); - Src1Name = getRegName(MI->getOperand(0).getReg()); + Src1Name = getRegName(MI->getOperand(1).getReg()); + // FALL THROUGH. + case X86::VPERMILPSYmi: + DecodePSHUFMask(MVT::v8f32, MI->getOperand(MI->getNumOperands()-1).getImm(), + ShuffleMask); + DestName = getRegName(MI->getOperand(0).getReg()); break; case X86::VPERMILPDri: - DecodeVPERMILPDMask(2, MI->getOperand(2).getImm(), - ShuffleMask); - Src1Name = getRegName(MI->getOperand(0).getReg()); + Src1Name = getRegName(MI->getOperand(1).getReg()); + // FALL THROUGH. + case X86::VPERMILPDmi: + DecodePSHUFMask(MVT::v2f64, MI->getOperand(MI->getNumOperands()-1).getImm(), + ShuffleMask); + DestName = getRegName(MI->getOperand(0).getReg()); break; case X86::VPERMILPDYri: - DecodeVPERMILPDMask(4, MI->getOperand(2).getImm(), - ShuffleMask); - Src1Name = getRegName(MI->getOperand(0).getReg()); + Src1Name = getRegName(MI->getOperand(1).getReg()); + // FALL THROUGH. + case X86::VPERMILPDYmi: + DecodePSHUFMask(MVT::v4f64, MI->getOperand(MI->getNumOperands()-1).getImm(), + ShuffleMask); + DestName = getRegName(MI->getOperand(0).getReg()); break; case X86::VPERM2F128rr: - DecodeVPERM2F128Mask(MI->getOperand(3).getImm(), ShuffleMask); - Src1Name = getRegName(MI->getOperand(1).getReg()); + case X86::VPERM2I128rr: Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::VPERM2F128rm: + case X86::VPERM2I128rm: + // For instruction comments purpose, assume the 256-bit vector is v4i64. 
+ DecodeVPERM2X128Mask(MVT::v4i64, + MI->getOperand(MI->getNumOperands()-1).getImm(), + ShuffleMask); + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); break; } @@ -245,7 +500,7 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, if (Src1Name == Src2Name) { for (unsigned i = 0, e = ShuffleMask.size(); i != e; ++i) { if ((int)ShuffleMask[i] >= 0 && // Not sentinel. - ShuffleMask[i] >= e) // From second mask. + ShuffleMask[i] >= (int)e) // From second mask. ShuffleMask[i] -= e; } } @@ -263,13 +518,13 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, // Otherwise, it must come from src1 or src2. Print the span of elements // that comes from this src. - bool isSrc1 = ShuffleMask[i] < ShuffleMask.size(); + bool isSrc1 = ShuffleMask[i] < (int)ShuffleMask.size(); const char *SrcName = isSrc1 ? Src1Name : Src2Name; OS << (SrcName ? SrcName : "mem") << '['; bool IsFirst = true; while (i != e && (int)ShuffleMask[i] >= 0 && - (ShuffleMask[i] < ShuffleMask.size()) == isSrc1) { + (ShuffleMask[i] < (int)ShuffleMask.size()) == isSrc1) { if (!IsFirst) OS << ','; else diff --git a/contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.h b/contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.h index 6b86db4..13fdf9a 100644 --- a/contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.h +++ b/contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.h @@ -1,4 +1,4 @@ -//===-- X86InstComments.h - Generate verbose-asm comments for instrs ------===// +//=- X86InstComments.h - Generate verbose-asm comments for instrs -*- C++ -*-=// // // The LLVM Compiler Infrastructure // diff --git a/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp b/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp index f9ab5ae..4ea662c 100644 --- a/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp +++ b/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp @@ -17,15 +17,13 @@ #include "X86InstComments.h" #include "MCTargetDesc/X86MCTargetDesc.h" #include "llvm/MC/MCInst.h" -#include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCInstrInfo.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/FormattedStream.h" #include <cctype> using namespace llvm; -// Include the auto-generated portion of the assembly writer. -#define GET_INSTRUCTION_NAME #include "X86GenAsmWriter1.inc" void X86IntelInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const { @@ -35,29 +33,52 @@ void X86IntelInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const { void X86IntelInstPrinter::printInst(const MCInst *MI, raw_ostream &OS, StringRef Annot) { printInstruction(MI, OS); - + + // Next always print the annotation. + printAnnotation(OS, Annot); + // If verbose assembly is enabled, we can print some informative comments. 
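The generic comment loop above walks the decoded mask and groups consecutive elements by source: indices below the mask size name the first source, larger ones the second, and negative entries are sentinels. A simplified standalone sketch of that grouping, assuming -1 is the only sentinel and rebasing second-source indices to zero for readability; the in-tree printer's exact element formatting and its src1 == src2 folding are only partly visible in this hunk:

#include <string>
#include <vector>

// Render a decoded shuffle mask as source spans, in the spirit of
// EmitAnyX86InstComments: mask {0,1,4,5} with sources "xmm1"/"xmm2" renders
// as "xmm1[0,1],xmm2[0,1]"; negative (sentinel) lanes print as "zero".
static std::string renderShuffleMask(const std::vector<int> &Mask,
                                     const std::string &Src1,
                                     const std::string &Src2) {
  const int E = (int)Mask.size();
  std::string Out;
  for (int i = 0; i < E; ) {
    if (i) Out += ',';
    if (Mask[i] < 0) { Out += "zero"; ++i; continue; }
    const bool FromSrc1 = Mask[i] < E;
    Out += FromSrc1 ? Src1 : Src2;
    Out += '[';
    bool First = true;
    while (i < E && Mask[i] >= 0 && (Mask[i] < E) == FromSrc1) {
      if (!First) Out += ',';
      First = false;
      Out += std::to_string(FromSrc1 ? Mask[i] : Mask[i] - E);
      ++i;
    }
    Out += ']';
  }
  return Out;
}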
- if (CommentStream) { - printAnnotation(OS, Annot); + if (CommentStream) EmitAnyX86InstComments(MI, *CommentStream, getRegisterName); - } -} -StringRef X86IntelInstPrinter::getOpcodeName(unsigned Opcode) const { - return getInstructionName(Opcode); } void X86IntelInstPrinter::printSSECC(const MCInst *MI, unsigned Op, raw_ostream &O) { switch (MI->getOperand(Op).getImm()) { - default: assert(0 && "Invalid ssecc argument!"); - case 0: O << "eq"; break; - case 1: O << "lt"; break; - case 2: O << "le"; break; - case 3: O << "unord"; break; - case 4: O << "neq"; break; - case 5: O << "nlt"; break; - case 6: O << "nle"; break; - case 7: O << "ord"; break; + default: llvm_unreachable("Invalid ssecc argument!"); + case 0: O << "eq"; break; + case 1: O << "lt"; break; + case 2: O << "le"; break; + case 3: O << "unord"; break; + case 4: O << "neq"; break; + case 5: O << "nlt"; break; + case 6: O << "nle"; break; + case 7: O << "ord"; break; + case 8: O << "eq_uq"; break; + case 9: O << "nge"; break; + case 0xa: O << "ngt"; break; + case 0xb: O << "false"; break; + case 0xc: O << "neq_oq"; break; + case 0xd: O << "ge"; break; + case 0xe: O << "gt"; break; + case 0xf: O << "true"; break; + case 0x10: O << "eq_os"; break; + case 0x11: O << "lt_oq"; break; + case 0x12: O << "le_oq"; break; + case 0x13: O << "unord_s"; break; + case 0x14: O << "neq_us"; break; + case 0x15: O << "nlt_uq"; break; + case 0x16: O << "nle_uq"; break; + case 0x17: O << "ord_s"; break; + case 0x18: O << "eq_us"; break; + case 0x19: O << "nge_uq"; break; + case 0x1a: O << "ngt_uq"; break; + case 0x1b: O << "false_os"; break; + case 0x1c: O << "neq_os"; break; + case 0x1d: O << "ge_oq"; break; + case 0x1e: O << "gt_oq"; break; + case 0x1f: O << "true_us"; break; + } } @@ -70,7 +91,18 @@ void X86IntelInstPrinter::print_pcrel_imm(const MCInst *MI, unsigned OpNo, O << Op.getImm(); else { assert(Op.isExpr() && "unknown pcrel immediate operand"); - O << *Op.getExpr(); + // If a symbolic branch target was added as a constant expression then print + // that address in hex. + const MCConstantExpr *BranchTarget = dyn_cast<MCConstantExpr>(Op.getExpr()); + int64_t Address; + if (BranchTarget && BranchTarget->EvaluateAsAbsolute(Address)) { + O << "0x"; + O.write_hex(Address); + } + else { + // Otherwise, just print the expression. + O << *Op.getExpr(); + } } } diff --git a/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h b/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h index 6d5ec62..4f5938d 100644 --- a/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h +++ b/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h @@ -1,4 +1,4 @@ -//===-- X86IntelInstPrinter.h - Convert X86 MCInst to assembly syntax -----===// +//= X86IntelInstPrinter.h - Convert X86 MCInst to assembly syntax -*- C++ -*-=// // // The LLVM Compiler Infrastructure // @@ -23,17 +23,16 @@ class MCOperand; class X86IntelInstPrinter : public MCInstPrinter { public: - X86IntelInstPrinter(const MCAsmInfo &MAI) - : MCInstPrinter(MAI) {} + X86IntelInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, + const MCRegisterInfo &MRI) + : MCInstPrinter(MAI, MII, MRI) {} virtual void printRegName(raw_ostream &OS, unsigned RegNo) const; virtual void printInst(const MCInst *MI, raw_ostream &OS, StringRef Annot); - virtual StringRef getOpcodeName(unsigned Opcode) const; // Autogenerated by tblgen. 
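Both print_pcrel_imm changes above apply the same rule: if the pc-relative operand is a constant expression (as produced when a disassembler client resolves the branch target to an absolute address), print it in hex; otherwise print the expression symbolically. A sketch of that logic factored into one helper; the patch keeps it inline in each printer and the helper name is illustrative:

#include "llvm/MC/MCExpr.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

// Print a pc-relative operand expression: constant branch targets come out
// as hex addresses, anything else as an ordinary symbolic expression.
static void printPCRelTarget(const MCExpr *Expr, raw_ostream &O) {
  int64_t Address;
  if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Expr))
    if (CE->EvaluateAsAbsolute(Address)) {
      O << "0x";
      O.write_hex(Address);
      return;
    }
  O << *Expr;
}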
void printInstruction(const MCInst *MI, raw_ostream &O); static const char *getRegisterName(unsigned RegNo); - static const char *getInstructionName(unsigned Opcode); void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printMemReference(const MCInst *MI, unsigned Op, raw_ostream &O); diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp index 69ad7d7..32e40fe 100644 --- a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp @@ -7,10 +7,9 @@ // //===----------------------------------------------------------------------===// -#include "llvm/MC/MCAsmBackend.h" #include "MCTargetDesc/X86BaseInfo.h" #include "MCTargetDesc/X86FixupKinds.h" -#include "llvm/ADT/Twine.h" +#include "llvm/MC/MCAsmBackend.h" #include "llvm/MC/MCAssembler.h" #include "llvm/MC/MCELFObjectWriter.h" #include "llvm/MC/MCExpr.h" @@ -37,18 +36,22 @@ MCDisableArithRelaxation("mc-x86-disable-arith-relaxation", static unsigned getFixupKindLog2Size(unsigned Kind) { switch (Kind) { - default: assert(0 && "invalid fixup kind!"); + default: llvm_unreachable("invalid fixup kind!"); case FK_PCRel_1: + case FK_SecRel_1: case FK_Data_1: return 0; case FK_PCRel_2: + case FK_SecRel_2: case FK_Data_2: return 1; case FK_PCRel_4: case X86::reloc_riprel_4byte: case X86::reloc_riprel_4byte_movq_load: case X86::reloc_signed_4byte: case X86::reloc_global_offset_table: + case FK_SecRel_4: case FK_Data_4: return 2; case FK_PCRel_8: + case FK_SecRel_8: case FK_Data_8: return 3; } } @@ -57,9 +60,9 @@ namespace { class X86ELFObjectWriter : public MCELFObjectTargetWriter { public: - X86ELFObjectWriter(bool is64Bit, Triple::OSType OSType, uint16_t EMachine, - bool HasRelocationAddend) - : MCELFObjectTargetWriter(is64Bit, OSType, EMachine, HasRelocationAddend) {} + X86ELFObjectWriter(bool is64Bit, uint8_t OSABI, uint16_t EMachine, + bool HasRelocationAddend, bool foobar) + : MCELFObjectTargetWriter(is64Bit, OSABI, EMachine, HasRelocationAddend) {} }; class X86AsmBackend : public MCAsmBackend { @@ -87,7 +90,7 @@ public: return Infos[Kind - FirstTargetFixupKind]; } - void ApplyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, + void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, uint64_t Value) const { unsigned Size = 1 << getFixupKindLog2Size(Fixup.getKind()); @@ -105,11 +108,16 @@ public: Data[Fixup.getOffset() + i] = uint8_t(Value >> (i * 8)); } - bool MayNeedRelaxation(const MCInst &Inst) const; + bool mayNeedRelaxation(const MCInst &Inst) const; + + bool fixupNeedsRelaxation(const MCFixup &Fixup, + uint64_t Value, + const MCInstFragment *DF, + const MCAsmLayout &Layout) const; - void RelaxInstruction(const MCInst &Inst, MCInst &Res) const; + void relaxInstruction(const MCInst &Inst, MCInst &Res) const; - bool WriteNopData(uint64_t Count, MCObjectWriter *OW) const; + bool writeNopData(uint64_t Count, MCObjectWriter *OW) const; }; } // end anonymous namespace @@ -214,7 +222,7 @@ static unsigned getRelaxedOpcode(unsigned Op) { return getRelaxedOpcodeBranch(Op); } -bool X86AsmBackend::MayNeedRelaxation(const MCInst &Inst) const { +bool X86AsmBackend::mayNeedRelaxation(const MCInst &Inst) const { // Branches can always be relaxed. 
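The renamed applyFixup above patches the resolved value into the encoded instruction one byte at a time, least-significant byte first, with the byte count taken from getFixupKindLog2Size; the real method additionally asserts that the value fits in that many bytes. A minimal standalone sketch of the patching loop, with an illustrative function name:

#include <cassert>
#include <cstdint>

// Write a fixup value into the instruction bytes, little-endian, exactly as
// the loop in X86AsmBackend::applyFixup does. Size is 1 << log2size of the
// fixup kind (1, 2, 4 or 8 bytes).
static void patchFixupLE(uint8_t *Data, unsigned DataSize,
                         unsigned Offset, unsigned Size, uint64_t Value) {
  assert(Offset + Size <= DataSize && "Invalid fixup offset!");
  for (unsigned i = 0; i != Size; ++i)
    Data[Offset + i] = uint8_t(Value >> (i * 8));
}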
if (getRelaxedOpcodeBranch(Inst.getOpcode()) != Inst.getOpcode()) return true; @@ -244,9 +252,17 @@ bool X86AsmBackend::MayNeedRelaxation(const MCInst &Inst) const { return hasExp && !hasRIP; } +bool X86AsmBackend::fixupNeedsRelaxation(const MCFixup &Fixup, + uint64_t Value, + const MCInstFragment *DF, + const MCAsmLayout &Layout) const { + // Relax if the value is too big for a (signed) i8. + return int64_t(Value) != int64_t(int8_t(Value)); +} + // FIXME: Can tblgen help at all here to verify there aren't other instructions // we can relax? -void X86AsmBackend::RelaxInstruction(const MCInst &Inst, MCInst &Res) const { +void X86AsmBackend::relaxInstruction(const MCInst &Inst, MCInst &Res) const { // The only relaxations X86 does is from a 1byte pcrel to a 4byte pcrel. unsigned RelaxedOp = getRelaxedOpcode(Inst.getOpcode()); @@ -262,10 +278,10 @@ void X86AsmBackend::RelaxInstruction(const MCInst &Inst, MCInst &Res) const { Res.setOpcode(RelaxedOp); } -/// WriteNopData - Write optimal nops to the output file for the \arg Count +/// writeNopData - Write optimal nops to the output file for the \arg Count /// bytes. This returns the number of bytes written. It may return 0 if /// the \arg Count is more than the maximum optimal nops. -bool X86AsmBackend::WriteNopData(uint64_t Count, MCObjectWriter *OW) const { +bool X86AsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const { static const uint8_t Nops[10][10] = { // nop {0x90}, @@ -310,9 +326,9 @@ bool X86AsmBackend::WriteNopData(uint64_t Count, MCObjectWriter *OW) const { namespace { class ELFX86AsmBackend : public X86AsmBackend { public: - Triple::OSType OSType; - ELFX86AsmBackend(const Target &T, Triple::OSType _OSType) - : X86AsmBackend(T), OSType(_OSType) { + uint8_t OSABI; + ELFX86AsmBackend(const Target &T, uint8_t _OSABI) + : X86AsmBackend(T), OSABI(_OSABI) { HasReliableSymbolDifference = true; } @@ -324,31 +340,21 @@ public: class ELFX86_32AsmBackend : public ELFX86AsmBackend { public: - ELFX86_32AsmBackend(const Target &T, Triple::OSType OSType) - : ELFX86AsmBackend(T, OSType) {} + ELFX86_32AsmBackend(const Target &T, uint8_t OSABI) + : ELFX86AsmBackend(T, OSABI) {} MCObjectWriter *createObjectWriter(raw_ostream &OS) const { - return createELFObjectWriter(createELFObjectTargetWriter(), - OS, /*IsLittleEndian*/ true); - } - - MCELFObjectTargetWriter *createELFObjectTargetWriter() const { - return new X86ELFObjectWriter(false, OSType, ELF::EM_386, false); + return createX86ELFObjectWriter(OS, /*Is64Bit*/ false, OSABI); } }; class ELFX86_64AsmBackend : public ELFX86AsmBackend { public: - ELFX86_64AsmBackend(const Target &T, Triple::OSType OSType) - : ELFX86AsmBackend(T, OSType) {} + ELFX86_64AsmBackend(const Target &T, uint8_t OSABI) + : ELFX86AsmBackend(T, OSABI) {} MCObjectWriter *createObjectWriter(raw_ostream &OS) const { - return createELFObjectWriter(createELFObjectTargetWriter(), - OS, /*IsLittleEndian*/ true); - } - - MCELFObjectTargetWriter *createELFObjectTargetWriter() const { - return new X86ELFObjectWriter(true, OSType, ELF::EM_X86_64, true); + return createX86ELFObjectWriter(OS, /*Is64Bit*/ true, OSABI); } }; @@ -362,7 +368,7 @@ public: } MCObjectWriter *createObjectWriter(raw_ostream &OS) const { - return createWinCOFFObjectWriter(OS, Is64Bit); + return createX86WinCOFFObjectWriter(OS, Is64Bit); } }; @@ -442,7 +448,8 @@ MCAsmBackend *llvm::createX86_32AsmBackend(const Target &T, StringRef TT) { if (TheTriple.isOSWindows()) return new WindowsX86AsmBackend(T, false); - return new ELFX86_32AsmBackend(T, 
TheTriple.getOS()); + uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TheTriple.getOS()); + return new ELFX86_32AsmBackend(T, OSABI); } MCAsmBackend *llvm::createX86_64AsmBackend(const Target &T, StringRef TT) { @@ -454,5 +461,6 @@ MCAsmBackend *llvm::createX86_64AsmBackend(const Target &T, StringRef TT) { if (TheTriple.isOSWindows()) return new WindowsX86AsmBackend(T, true); - return new ELFX86_64AsmBackend(T, TheTriple.getOS()); + uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TheTriple.getOS()); + return new ELFX86_64AsmBackend(T, OSABI); } diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h index e6ba705..a0bb6dc 100644 --- a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h @@ -19,7 +19,7 @@ #include "X86MCTargetDesc.h" #include "llvm/Support/DataTypes.h" -#include <cassert> +#include "llvm/Support/ErrorHandling.h" namespace llvm { @@ -164,7 +164,13 @@ namespace X86II { /// is some TLS offset from the picbase. /// /// This is the 32-bit TLS offset for Darwin TLS in PIC mode. - MO_TLVP_PIC_BASE + MO_TLVP_PIC_BASE, + + /// MO_SECREL - On a symbol operand this indicates that the immediate is + /// the offset from beginning of section. + /// + /// This is the TLS offset for the COFF/Windows TLS mechanism. + MO_SECREL }; enum { @@ -223,19 +229,13 @@ namespace X86II { // destinations are the same register. MRMInitReg = 32, - //// MRM_C1 - A mod/rm byte of exactly 0xC1. - MRM_C1 = 33, - MRM_C2 = 34, - MRM_C3 = 35, - MRM_C4 = 36, - MRM_C8 = 37, - MRM_C9 = 38, - MRM_E8 = 39, - MRM_F0 = 40, - MRM_F8 = 41, - MRM_F9 = 42, - MRM_D0 = 45, - MRM_D1 = 46, + //// MRM_XX - A mod/rm byte of exactly 0xXX. + MRM_C1 = 33, MRM_C2 = 34, MRM_C3 = 35, MRM_C4 = 36, + MRM_C8 = 37, MRM_C9 = 38, MRM_E8 = 39, MRM_F0 = 40, + MRM_F8 = 41, MRM_F9 = 42, MRM_D0 = 45, MRM_D1 = 46, + MRM_D4 = 47, MRM_D8 = 48, MRM_D9 = 49, MRM_DA = 50, + MRM_DB = 51, MRM_DC = 52, MRM_DD = 53, MRM_DE = 54, + MRM_DF = 55, /// RawFrmImm8 - This is used for the ENTER instruction, which has two /// immediates, the first of which is a 16-bit immediate (specified by @@ -295,8 +295,20 @@ namespace X86II { T8 = 13 << Op0Shift, TA = 14 << Op0Shift, A6 = 15 << Op0Shift, A7 = 16 << Op0Shift, - // TF - Prefix before and after 0x0F - TF = 17 << Op0Shift, + // T8XD - Prefix before and after 0x0F. Combination of T8 and XD. + T8XD = 17 << Op0Shift, + + // T8XS - Prefix before and after 0x0F. Combination of T8 and XS. + T8XS = 18 << Op0Shift, + + // TAXD - Prefix before and after 0x0F. Combination of TA and XD. + TAXD = 19 << Op0Shift, + + // XOP8 - Prefix to include use of imm byte. + XOP8 = 20 << Op0Shift, + + // XOP9 - Prefix to exclude use of imm byte. + XOP9 = 21 << Op0Shift, //===------------------------------------------------------------------===// // REX_W - REX prefixes are instruction prefixes used in 64-bit mode. @@ -387,20 +399,24 @@ namespace X86II { /// and the additional register is encoded in VEX_VVVV prefix. VEX_4V = 1U << 2, + /// VEX_4VOp3 - Similar to VEX_4V, but used on instructions that encode + /// operand 3 with VEX.vvvv. + VEX_4VOp3 = 1U << 3, + /// VEX_I8IMM - Specifies that the last register used in a AVX instruction, /// must be encoded in the i8 immediate field. This usually happens in /// instructions with 4 operands. - VEX_I8IMM = 1U << 3, + VEX_I8IMM = 1U << 4, /// VEX_L - Stands for a bit in the VEX opcode prefix meaning the current /// instruction uses 256-bit wide registers. 
This is usually auto detected /// if a VR256 register is used, but some AVX instructions also have this /// field marked when using a f256 memory references. - VEX_L = 1U << 4, + VEX_L = 1U << 5, // VEX_LIG - Specifies that this instruction ignores the L-bit in the VEX // prefix. Usually used for scalar instructions. Needed by disassembler. - VEX_LIG = 1U << 5, + VEX_LIG = 1U << 6, /// Has3DNow0F0FOpcode - This flag indicates that the instruction uses the /// wacky 0x0F 0x0F prefix for 3DNow! instructions. The manual documents @@ -408,7 +424,15 @@ namespace X86II { /// storing a classifier in the imm8 field. To simplify our implementation, /// we handle this by storeing the classifier in the opcode field and using /// this flag to indicate that the encoder should do the wacky 3DNow! thing. - Has3DNow0F0FOpcode = 1U << 6 + Has3DNow0F0FOpcode = 1U << 7, + + /// MemOp4 - Used to indicate swapping of operand 3 and 4 to be encoded in + /// ModRM or I8IMM. This is used for FMA4 and XOP instructions. + MemOp4 = 1U << 8, + + /// XOP - Opcode prefix used by XOP instructions. + XOP = 1U << 9 + }; // getBaseOpcodeFor - This function returns the "base" X86 opcode for the @@ -426,7 +450,7 @@ namespace X86II { /// of the specified instruction. static inline unsigned getSizeOfImm(uint64_t TSFlags) { switch (TSFlags & X86II::ImmMask) { - default: assert(0 && "Unknown immediate size"); + default: llvm_unreachable("Unknown immediate size"); case X86II::Imm8: case X86II::Imm8PCRel: return 1; case X86II::Imm16: @@ -441,7 +465,7 @@ namespace X86II { /// TSFlags indicates that it is pc relative. static inline unsigned isImmPCRel(uint64_t TSFlags) { switch (TSFlags & X86II::ImmMask) { - default: assert(0 && "Unknown immediate size"); + default: llvm_unreachable("Unknown immediate size"); case X86II::Imm8PCRel: case X86II::Imm16PCRel: case X86II::Imm32PCRel: @@ -462,10 +486,10 @@ namespace X86II { /// is duplicated in the MCInst (e.g. "EAX = addl EAX, [mem]") it is only /// counted as one operand. /// - static inline int getMemoryOperandNo(uint64_t TSFlags) { + static inline int getMemoryOperandNo(uint64_t TSFlags, unsigned Opcode) { switch (TSFlags & X86II::FormMask) { - case X86II::MRMInitReg: assert(0 && "FIXME: Remove this form"); - default: assert(0 && "Unknown FormMask value in getMemoryOperandNo!"); + case X86II::MRMInitReg: llvm_unreachable("FIXME: Remove this form"); + default: llvm_unreachable("Unknown FormMask value in getMemoryOperandNo!"); case X86II::Pseudo: case X86II::RawFrm: case X86II::AddRegFrm: @@ -478,9 +502,12 @@ namespace X86II { return 0; case X86II::MRMSrcMem: { bool HasVEX_4V = (TSFlags >> X86II::VEXShift) & X86II::VEX_4V; + bool HasMemOp4 = (TSFlags >> X86II::VEXShift) & X86II::MemOp4; unsigned FirstMemOp = 1; if (HasVEX_4V) ++FirstMemOp;// Skip the register source (which is encoded in VEX_VVVV). + if (HasMemOp4) + ++FirstMemOp;// Skip the register source (which is encoded in I8IMM). // FIXME: Maybe lea should have its own form? This is a horrible hack. 
//if (Opcode == X86::LEA64r || Opcode == X86::LEA64_32r || @@ -495,20 +522,24 @@ namespace X86II { case X86II::MRM0m: case X86II::MRM1m: case X86II::MRM2m: case X86II::MRM3m: case X86II::MRM4m: case X86II::MRM5m: - case X86II::MRM6m: case X86II::MRM7m: - return 0; - case X86II::MRM_C1: - case X86II::MRM_C2: - case X86II::MRM_C3: - case X86II::MRM_C4: - case X86II::MRM_C8: - case X86II::MRM_C9: - case X86II::MRM_E8: - case X86II::MRM_F0: - case X86II::MRM_F8: - case X86II::MRM_F9: - case X86II::MRM_D0: - case X86II::MRM_D1: + case X86II::MRM6m: case X86II::MRM7m: { + bool HasVEX_4V = (TSFlags >> X86II::VEXShift) & X86II::VEX_4V; + unsigned FirstMemOp = 0; + if (HasVEX_4V) + ++FirstMemOp;// Skip the register dest (which is encoded in VEX_VVVV). + return FirstMemOp; + } + case X86II::MRM_C1: case X86II::MRM_C2: + case X86II::MRM_C3: case X86II::MRM_C4: + case X86II::MRM_C8: case X86II::MRM_C9: + case X86II::MRM_E8: case X86II::MRM_F0: + case X86II::MRM_F8: case X86II::MRM_F9: + case X86II::MRM_D0: case X86II::MRM_D1: + case X86II::MRM_D4: case X86II::MRM_D8: + case X86II::MRM_D9: case X86II::MRM_DA: + case X86II::MRM_DB: case X86II::MRM_DC: + case X86II::MRM_DD: case X86II::MRM_DE: + case X86II::MRM_DF: return -1; } } diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp new file mode 100644 index 0000000..5a42a80 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp @@ -0,0 +1,224 @@ +//===-- X86ELFObjectWriter.cpp - X86 ELF Writer ---------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "MCTargetDesc/X86FixupKinds.h" +#include "MCTargetDesc/X86MCTargetDesc.h" +#include "llvm/MC/MCELFObjectWriter.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCValue.h" +#include "llvm/Support/ELF.h" +#include "llvm/Support/ErrorHandling.h" + +using namespace llvm; + +namespace { + class X86ELFObjectWriter : public MCELFObjectTargetWriter { + public: + X86ELFObjectWriter(bool is64Bit, uint8_t OSABI); + + virtual ~X86ELFObjectWriter(); + protected: + virtual unsigned GetRelocType(const MCValue &Target, const MCFixup &Fixup, + bool IsPCRel, bool IsRelocWithSymbol, + int64_t Addend) const; + }; +} + +X86ELFObjectWriter::X86ELFObjectWriter(bool Is64Bit, uint8_t OSABI) + : MCELFObjectTargetWriter(Is64Bit, OSABI, + Is64Bit ? ELF::EM_X86_64 : ELF::EM_386, + /*HasRelocationAddend*/ Is64Bit) {} + +X86ELFObjectWriter::~X86ELFObjectWriter() +{} + +unsigned X86ELFObjectWriter::GetRelocType(const MCValue &Target, + const MCFixup &Fixup, + bool IsPCRel, + bool IsRelocWithSymbol, + int64_t Addend) const { + // determine the type of the relocation + + MCSymbolRefExpr::VariantKind Modifier = Target.isAbsolute() ? 
+ MCSymbolRefExpr::VK_None : Target.getSymA()->getKind(); + unsigned Type; + if (is64Bit()) { + if (IsPCRel) { + switch ((unsigned)Fixup.getKind()) { + default: llvm_unreachable("invalid fixup kind!"); + + case FK_Data_8: Type = ELF::R_X86_64_PC64; break; + case FK_Data_4: Type = ELF::R_X86_64_PC32; break; + case FK_Data_2: Type = ELF::R_X86_64_PC16; break; + + case FK_PCRel_8: + assert(Modifier == MCSymbolRefExpr::VK_None); + Type = ELF::R_X86_64_PC64; + break; + case X86::reloc_signed_4byte: + case X86::reloc_riprel_4byte_movq_load: + case X86::reloc_riprel_4byte: + case FK_PCRel_4: + switch (Modifier) { + default: + llvm_unreachable("Unimplemented"); + case MCSymbolRefExpr::VK_None: + Type = ELF::R_X86_64_PC32; + break; + case MCSymbolRefExpr::VK_PLT: + Type = ELF::R_X86_64_PLT32; + break; + case MCSymbolRefExpr::VK_GOTPCREL: + Type = ELF::R_X86_64_GOTPCREL; + break; + case MCSymbolRefExpr::VK_GOTTPOFF: + Type = ELF::R_X86_64_GOTTPOFF; + break; + case MCSymbolRefExpr::VK_TLSGD: + Type = ELF::R_X86_64_TLSGD; + break; + case MCSymbolRefExpr::VK_TLSLD: + Type = ELF::R_X86_64_TLSLD; + break; + } + break; + case FK_PCRel_2: + assert(Modifier == MCSymbolRefExpr::VK_None); + Type = ELF::R_X86_64_PC16; + break; + case FK_PCRel_1: + assert(Modifier == MCSymbolRefExpr::VK_None); + Type = ELF::R_X86_64_PC8; + break; + } + } else { + switch ((unsigned)Fixup.getKind()) { + default: llvm_unreachable("invalid fixup kind!"); + case FK_Data_8: Type = ELF::R_X86_64_64; break; + case X86::reloc_signed_4byte: + switch (Modifier) { + default: + llvm_unreachable("Unimplemented"); + case MCSymbolRefExpr::VK_None: + Type = ELF::R_X86_64_32S; + break; + case MCSymbolRefExpr::VK_GOT: + Type = ELF::R_X86_64_GOT32; + break; + case MCSymbolRefExpr::VK_GOTPCREL: + Type = ELF::R_X86_64_GOTPCREL; + break; + case MCSymbolRefExpr::VK_TPOFF: + Type = ELF::R_X86_64_TPOFF32; + break; + case MCSymbolRefExpr::VK_DTPOFF: + Type = ELF::R_X86_64_DTPOFF32; + break; + } + break; + case FK_Data_4: + Type = ELF::R_X86_64_32; + break; + case FK_Data_2: Type = ELF::R_X86_64_16; break; + case FK_PCRel_1: + case FK_Data_1: Type = ELF::R_X86_64_8; break; + } + } + } else { + if (IsPCRel) { + switch ((unsigned)Fixup.getKind()) { + default: llvm_unreachable("invalid fixup kind!"); + + case X86::reloc_global_offset_table: + Type = ELF::R_386_GOTPC; + break; + + case X86::reloc_signed_4byte: + case FK_PCRel_4: + case FK_Data_4: + switch (Modifier) { + default: + llvm_unreachable("Unimplemented"); + case MCSymbolRefExpr::VK_None: + Type = ELF::R_386_PC32; + break; + case MCSymbolRefExpr::VK_PLT: + Type = ELF::R_386_PLT32; + break; + } + break; + } + } else { + switch ((unsigned)Fixup.getKind()) { + default: llvm_unreachable("invalid fixup kind!"); + + case X86::reloc_global_offset_table: + Type = ELF::R_386_GOTPC; + break; + + // FIXME: Should we avoid selecting reloc_signed_4byte in 32 bit mode + // instead? 
+ case X86::reloc_signed_4byte: + case FK_PCRel_4: + case FK_Data_4: + switch (Modifier) { + default: + llvm_unreachable("Unimplemented"); + case MCSymbolRefExpr::VK_None: + Type = ELF::R_386_32; + break; + case MCSymbolRefExpr::VK_GOT: + Type = ELF::R_386_GOT32; + break; + case MCSymbolRefExpr::VK_GOTOFF: + Type = ELF::R_386_GOTOFF; + break; + case MCSymbolRefExpr::VK_TLSGD: + Type = ELF::R_386_TLS_GD; + break; + case MCSymbolRefExpr::VK_TPOFF: + Type = ELF::R_386_TLS_LE_32; + break; + case MCSymbolRefExpr::VK_INDNTPOFF: + Type = ELF::R_386_TLS_IE; + break; + case MCSymbolRefExpr::VK_NTPOFF: + Type = ELF::R_386_TLS_LE; + break; + case MCSymbolRefExpr::VK_GOTNTPOFF: + Type = ELF::R_386_TLS_GOTIE; + break; + case MCSymbolRefExpr::VK_TLSLDM: + Type = ELF::R_386_TLS_LDM; + break; + case MCSymbolRefExpr::VK_DTPOFF: + Type = ELF::R_386_TLS_LDO_32; + break; + case MCSymbolRefExpr::VK_GOTTPOFF: + Type = ELF::R_386_TLS_IE_32; + break; + } + break; + case FK_Data_2: Type = ELF::R_386_16; break; + case FK_PCRel_1: + case FK_Data_1: Type = ELF::R_386_8; break; + } + } + } + + return Type; +} + +MCObjectWriter *llvm::createX86ELFObjectWriter(raw_ostream &OS, + bool Is64Bit, + uint8_t OSABI) { + MCELFObjectTargetWriter *MOTW = + new X86ELFObjectWriter(Is64Bit, OSABI); + return createELFObjectWriter(MOTW, OS, /*IsLittleEndian=*/true); +} diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86FixupKinds.h b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86FixupKinds.h index 17d242a..f2e34cb 100644 --- a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86FixupKinds.h +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86FixupKinds.h @@ -1,4 +1,4 @@ -//===-- X86/X86FixupKinds.h - X86 Specific Fixup Entries --------*- C++ -*-===// +//===-- X86FixupKinds.h - X86 Specific Fixup Entries ------------*- C++ -*-===// // // The LLVM Compiler Infrastructure // diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp index 2703100..afa545c 100644 --- a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp @@ -48,6 +48,8 @@ static const char *const x86_asm_table[] = { "{cc}", "cc", 0,0}; +void X86MCAsmInfoDarwin::anchor() { } + X86MCAsmInfoDarwin::X86MCAsmInfoDarwin(const Triple &T) { bool is64Bit = T.getArch() == Triple::x86_64; if (is64Bit) @@ -80,6 +82,8 @@ X86_64MCAsmInfoDarwin::X86_64MCAsmInfoDarwin(const Triple &Triple) : X86MCAsmInfoDarwin(Triple) { } +void X86ELFMCAsmInfo::anchor() { } + X86ELFMCAsmInfo::X86ELFMCAsmInfo(const Triple &T) { if (T.getArch() == Triple::x86_64) PointerSize = 8; @@ -125,7 +129,23 @@ getNonexecutableStackSection(MCContext &Ctx) const { 0, SectionKind::getMetadata()); } -X86MCAsmInfoCOFF::X86MCAsmInfoCOFF(const Triple &Triple) { +void X86MCAsmInfoMicrosoft::anchor() { } + +X86MCAsmInfoMicrosoft::X86MCAsmInfoMicrosoft(const Triple &Triple) { + if (Triple.getArch() == Triple::x86_64) { + GlobalPrefix = ""; + PrivateGlobalPrefix = ".L"; + } + + AsmTransCBE = x86_asm_table; + AssemblerDialect = AsmWriterFlavor; + + TextAlignFillValue = 0x90; +} + +void X86MCAsmInfoGNUCOFF::anchor() { } + +X86MCAsmInfoGNUCOFF::X86MCAsmInfoGNUCOFF(const Triple &Triple) { if (Triple.getArch() == Triple::x86_64) { GlobalPrefix = ""; PrivateGlobalPrefix = ".L"; @@ -135,4 +155,7 @@ X86MCAsmInfoCOFF::X86MCAsmInfoCOFF(const Triple &Triple) { AssemblerDialect = AsmWriterFlavor; TextAlignFillValue = 0x90; + + // Exceptions handling + ExceptionsType = ExceptionHandling::DwarfCFI; } diff 
--git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h index 2cd4c8e..b6b70fd 100644 --- a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h @@ -1,4 +1,4 @@ -//=====-- X86MCAsmInfo.h - X86 asm properties -----------------*- C++ -*--====// +//===-- X86MCAsmInfo.h - X86 asm properties --------------------*- C++ -*--===// // // The LLVM Compiler Infrastructure // @@ -21,7 +21,9 @@ namespace llvm { class Triple; - struct X86MCAsmInfoDarwin : public MCAsmInfoDarwin { + class X86MCAsmInfoDarwin : public MCAsmInfoDarwin { + virtual void anchor(); + public: explicit X86MCAsmInfoDarwin(const Triple &Triple); }; @@ -33,13 +35,23 @@ namespace llvm { MCStreamer &Streamer) const; }; - struct X86ELFMCAsmInfo : public MCAsmInfo { + class X86ELFMCAsmInfo : public MCAsmInfo { + virtual void anchor(); + public: explicit X86ELFMCAsmInfo(const Triple &Triple); virtual const MCSection *getNonexecutableStackSection(MCContext &Ctx) const; }; - struct X86MCAsmInfoCOFF : public MCAsmInfoCOFF { - explicit X86MCAsmInfoCOFF(const Triple &Triple); + class X86MCAsmInfoMicrosoft : public MCAsmInfoMicrosoft { + virtual void anchor(); + public: + explicit X86MCAsmInfoMicrosoft(const Triple &Triple); + }; + + class X86MCAsmInfoGNUCOFF : public MCAsmInfoGNUCOFF { + virtual void anchor(); + public: + explicit X86MCAsmInfoGNUCOFF(const Triple &Triple); }; } // namespace llvm diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp index 2eee112..80990e5 100644 --- a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp @@ -1,4 +1,4 @@ -//===-- X86/X86MCCodeEmitter.cpp - Convert X86 code to machine code -------===// +//===-- X86MCCodeEmitter.cpp - Convert X86 code to machine code -----------===// // // The LLVM Compiler Infrastructure // @@ -46,6 +46,11 @@ public: return (STI.getFeatureBits() & X86::Mode64Bit) != 0; } + bool is32BitMode() const { + // FIXME: Can tablegen auto-generate this? + return (STI.getFeatureBits() & X86::Mode64Bit) == 0; + } + static unsigned GetX86RegNum(const MCOperand &MO) { return X86_MC::getX86RegNum(MO.getReg()); } @@ -63,9 +68,8 @@ public: unsigned OpNum) { unsigned SrcReg = MI.getOperand(OpNum).getReg(); unsigned SrcRegNum = GetX86RegNum(MI.getOperand(OpNum)); - if ((SrcReg >= X86::XMM8 && SrcReg <= X86::XMM15) || - (SrcReg >= X86::YMM8 && SrcReg <= X86::YMM15)) - SrcRegNum += 8; + if (X86II::isX86_64ExtendedReg(SrcReg)) + SrcRegNum |= 8; // The registers represented through VEX_VVVV should // be encoded in 1's complement form. @@ -86,7 +90,7 @@ public: } } - void EmitImmediate(const MCOperand &Disp, + void EmitImmediate(const MCOperand &Disp, SMLoc Loc, unsigned ImmSize, MCFixupKind FixupKind, unsigned &CurByte, raw_ostream &OS, SmallVectorImpl<MCFixup> &Fixups, @@ -155,9 +159,8 @@ static MCFixupKind getImmFixupKind(uint64_t TSFlags) { return MCFixup::getKindForSize(Size, isPCRel); } -/// Is32BitMemOperand - Return true if the specified instruction with a memory -/// operand should emit the 0x67 prefix byte in 64-bit mode due to a 32-bit -/// memory operand. Op specifies the operand # of the memoperand. +/// Is32BitMemOperand - Return true if the specified instruction has +/// a 32-bit memory operand. Op specifies the operand # of the memoperand. 
static bool Is32BitMemOperand(const MCInst &MI, unsigned Op) { const MCOperand &BaseReg = MI.getOperand(Op+X86::AddrBaseReg); const MCOperand &IndexReg = MI.getOperand(Op+X86::AddrIndexReg); @@ -170,28 +173,71 @@ static bool Is32BitMemOperand(const MCInst &MI, unsigned Op) { return false; } -/// StartsWithGlobalOffsetTable - Return true for the simple cases where this -/// expression starts with _GLOBAL_OFFSET_TABLE_. This is a needed to support -/// PIC on ELF i386 as that symbol is magic. We check only simple case that +/// Is64BitMemOperand - Return true if the specified instruction has +/// a 64-bit memory operand. Op specifies the operand # of the memoperand. +#ifndef NDEBUG +static bool Is64BitMemOperand(const MCInst &MI, unsigned Op) { + const MCOperand &BaseReg = MI.getOperand(Op+X86::AddrBaseReg); + const MCOperand &IndexReg = MI.getOperand(Op+X86::AddrIndexReg); + + if ((BaseReg.getReg() != 0 && + X86MCRegisterClasses[X86::GR64RegClassID].contains(BaseReg.getReg())) || + (IndexReg.getReg() != 0 && + X86MCRegisterClasses[X86::GR64RegClassID].contains(IndexReg.getReg()))) + return true; + return false; +} +#endif + +/// Is16BitMemOperand - Return true if the specified instruction has +/// a 16-bit memory operand. Op specifies the operand # of the memoperand. +static bool Is16BitMemOperand(const MCInst &MI, unsigned Op) { + const MCOperand &BaseReg = MI.getOperand(Op+X86::AddrBaseReg); + const MCOperand &IndexReg = MI.getOperand(Op+X86::AddrIndexReg); + + if ((BaseReg.getReg() != 0 && + X86MCRegisterClasses[X86::GR16RegClassID].contains(BaseReg.getReg())) || + (IndexReg.getReg() != 0 && + X86MCRegisterClasses[X86::GR16RegClassID].contains(IndexReg.getReg()))) + return true; + return false; +} + +/// StartsWithGlobalOffsetTable - Check if this expression starts with +/// _GLOBAL_OFFSET_TABLE_ and if it is of the form +/// _GLOBAL_OFFSET_TABLE_-symbol. This is needed to support PIC on ELF +/// i386 as _GLOBAL_OFFSET_TABLE_ is magical. We check only simple case that /// are know to be used: _GLOBAL_OFFSET_TABLE_ by itself or at the start /// of a binary expression. -static bool StartsWithGlobalOffsetTable(const MCExpr *Expr) { +enum GlobalOffsetTableExprKind { + GOT_None, + GOT_Normal, + GOT_SymDiff +}; +static GlobalOffsetTableExprKind +StartsWithGlobalOffsetTable(const MCExpr *Expr) { + const MCExpr *RHS = 0; if (Expr->getKind() == MCExpr::Binary) { const MCBinaryExpr *BE = static_cast<const MCBinaryExpr *>(Expr); Expr = BE->getLHS(); + RHS = BE->getRHS(); } if (Expr->getKind() != MCExpr::SymbolRef) - return false; + return GOT_None; const MCSymbolRefExpr *Ref = static_cast<const MCSymbolRefExpr*>(Expr); const MCSymbol &S = Ref->getSymbol(); - return S.getName() == "_GLOBAL_OFFSET_TABLE_"; + if (S.getName() != "_GLOBAL_OFFSET_TABLE_") + return GOT_None; + if (RHS && RHS->getKind() == MCExpr::SymbolRef) + return GOT_SymDiff; + return GOT_Normal; } void X86MCCodeEmitter:: -EmitImmediate(const MCOperand &DispOp, unsigned Size, MCFixupKind FixupKind, - unsigned &CurByte, raw_ostream &OS, +EmitImmediate(const MCOperand &DispOp, SMLoc Loc, unsigned Size, + MCFixupKind FixupKind, unsigned &CurByte, raw_ostream &OS, SmallVectorImpl<MCFixup> &Fixups, int ImmOffset) const { const MCExpr *Expr = NULL; if (DispOp.isImm()) { @@ -210,12 +256,21 @@ EmitImmediate(const MCOperand &DispOp, unsigned Size, MCFixupKind FixupKind, // If we have an immoffset, add it to the expression. 
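The new GlobalOffsetTableExprKind classification distinguishes a bare _GLOBAL_OFFSET_TABLE_ reference from a _GLOBAL_OFFSET_TABLE_-symbol difference. A toy sketch of the same test, using an illustrative expression type rather than MCExpr (all names here are made up):

    #include <string>

    struct Expr {                        // toy stand-in for MCExpr
      std::string Sym;                   // non-empty => symbol reference
      const Expr *LHS = nullptr;         // set for a binary "LHS - RHS"
      const Expr *RHS = nullptr;
    };

    enum class GOTKind { None, Normal, SymDiff };

    GOTKind classifyGOTExpr(const Expr &E) {
      const Expr *Lead = E.LHS ? E.LHS : &E;          // leftmost operand
      if (Lead->Sym != "_GLOBAL_OFFSET_TABLE_")
        return GOTKind::None;
      // _GLOBAL_OFFSET_TABLE_ - sym needs the symbol-difference flavour.
      return (E.RHS && !E.RHS->Sym.empty()) ? GOTKind::SymDiff
                                            : GOTKind::Normal;
    }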
if ((FixupKind == FK_Data_4 || - FixupKind == MCFixupKind(X86::reloc_signed_4byte)) && - StartsWithGlobalOffsetTable(Expr)) { - assert(ImmOffset == 0); - - FixupKind = MCFixupKind(X86::reloc_global_offset_table); - ImmOffset = CurByte; + FixupKind == FK_Data_8 || + FixupKind == MCFixupKind(X86::reloc_signed_4byte))) { + GlobalOffsetTableExprKind Kind = StartsWithGlobalOffsetTable(Expr); + if (Kind != GOT_None) { + assert(ImmOffset == 0); + + FixupKind = MCFixupKind(X86::reloc_global_offset_table); + if (Kind == GOT_Normal) + ImmOffset = CurByte; + } else if (Expr->getKind() == MCExpr::SymbolRef) { + const MCSymbolRefExpr *Ref = static_cast<const MCSymbolRefExpr*>(Expr); + if (Ref->getKind() == MCSymbolRefExpr::VK_SECREL) { + FixupKind = MCFixupKind(FK_SecRel_4); + } + } } // If the fixup is pc-relative, we need to bias the value to be relative to @@ -234,7 +289,7 @@ EmitImmediate(const MCOperand &DispOp, unsigned Size, MCFixupKind FixupKind, Ctx); // Emit a symbolic constant as a fixup and 4 zeros. - Fixups.push_back(MCFixup::Create(CurByte, Expr, FixupKind)); + Fixups.push_back(MCFixup::Create(CurByte, Expr, FixupKind, Loc)); EmitConstant(0, Size, CurByte, OS); } @@ -270,7 +325,7 @@ void X86MCCodeEmitter::EmitMemModRMByte(const MCInst &MI, unsigned Op, // expression to emit. int ImmSize = X86II::hasImm(TSFlags) ? X86II::getSizeOfImm(TSFlags) : 0; - EmitImmediate(Disp, 4, MCFixupKind(FixupKind), + EmitImmediate(Disp, MI.getLoc(), 4, MCFixupKind(FixupKind), CurByte, OS, Fixups, -ImmSize); return; } @@ -294,7 +349,7 @@ void X86MCCodeEmitter::EmitMemModRMByte(const MCInst &MI, unsigned Op, if (BaseReg == 0) { // [disp32] in X86-32 mode EmitByte(ModRMByte(0, RegOpcodeField, 5), CurByte, OS); - EmitImmediate(Disp, 4, FK_Data_4, CurByte, OS, Fixups); + EmitImmediate(Disp, MI.getLoc(), 4, FK_Data_4, CurByte, OS, Fixups); return; } @@ -310,13 +365,13 @@ void X86MCCodeEmitter::EmitMemModRMByte(const MCInst &MI, unsigned Op, // Otherwise, if the displacement fits in a byte, encode as [REG+disp8]. if (Disp.isImm() && isDisp8(Disp.getImm())) { EmitByte(ModRMByte(1, RegOpcodeField, BaseRegNo), CurByte, OS); - EmitImmediate(Disp, 1, FK_Data_1, CurByte, OS, Fixups); + EmitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, CurByte, OS, Fixups); return; } // Otherwise, emit the most general non-SIB encoding: [REG+disp32] EmitByte(ModRMByte(2, RegOpcodeField, BaseRegNo), CurByte, OS); - EmitImmediate(Disp, 4, MCFixupKind(X86::reloc_signed_4byte), CurByte, OS, + EmitImmediate(Disp, MI.getLoc(), 4, MCFixupKind(X86::reloc_signed_4byte), CurByte, OS, Fixups); return; } @@ -375,10 +430,10 @@ void X86MCCodeEmitter::EmitMemModRMByte(const MCInst &MI, unsigned Op, // Do we need to output a displacement? 
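For reference, the disp8/disp32 decision EmitMemModRMByte is making can be boiled down as follows for the simple cases (no SIB byte, no RIP-relative addressing; base register 4/ESP and 5/EBP have special encodings and are skipped here; helper names are made up):

    #include <cstdint>

    static uint8_t modRM(unsigned Mod, unsigned Reg, unsigned RM) {
      return static_cast<uint8_t>((Mod << 6) | ((Reg & 7) << 3) | (RM & 7));
    }

    static bool isDisp8(int64_t V) { return V == static_cast<int8_t>(V); }

    // Returns the ModRM byte; DispBytes reports how many displacement bytes
    // must follow (0, 1 or 4).
    uint8_t encodeSimpleMem(unsigned RegOpcode, unsigned BaseRegNo,
                            int64_t Disp, unsigned &DispBytes) {
      if (Disp == 0) {                    // [REG], no displacement
        DispBytes = 0;
        return modRM(0, RegOpcode, BaseRegNo);
      }
      if (isDisp8(Disp)) {                // [REG + disp8]
        DispBytes = 1;
        return modRM(1, RegOpcode, BaseRegNo);
      }
      DispBytes = 4;                      // [REG + disp32]
      return modRM(2, RegOpcode, BaseRegNo);
    }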
if (ForceDisp8) - EmitImmediate(Disp, 1, FK_Data_1, CurByte, OS, Fixups); + EmitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, CurByte, OS, Fixups); else if (ForceDisp32 || Disp.getImm() != 0) - EmitImmediate(Disp, 4, MCFixupKind(X86::reloc_signed_4byte), CurByte, OS, - Fixups); + EmitImmediate(Disp, MI.getLoc(), 4, MCFixupKind(X86::reloc_signed_4byte), + CurByte, OS, Fixups); } /// EmitVEXOpcodePrefix - AVX instructions are encoded using a opcode prefix @@ -387,9 +442,8 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, int MemOperand, const MCInst &MI, const MCInstrDesc &Desc, raw_ostream &OS) const { - bool HasVEX_4V = false; - if ((TSFlags >> X86II::VEXShift) & X86II::VEX_4V) - HasVEX_4V = true; + bool HasVEX_4V = (TSFlags >> X86II::VEXShift) & X86II::VEX_4V; + bool HasVEX_4VOp3 = (TSFlags >> X86II::VEXShift) & X86II::VEX_4VOp3; // VEX_R: opcode externsion equivalent to REX.R in // 1's complement (inverted) form @@ -417,6 +471,9 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, // opcode extension, or ignored, depending on the opcode byte) unsigned char VEX_W = 0; + // XOP: Use XOP prefix byte 0x8f instead of VEX. + unsigned char XOP = 0; + // VEX_5M (VEX m-mmmmm field): // // 0b00000: Reserved for future use @@ -424,7 +481,8 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, // 0b00010: implied 0F 38 leading opcode bytes // 0b00011: implied 0F 3A leading opcode bytes // 0b00100-0b11111: Reserved for future use - // + // 0b01000: XOP map select - 08h instructions with imm byte + // 0b10001: XOP map select - 09h instructions with no imm byte unsigned char VEX_5M = 0x1; // VEX_4V (VEX vvvv field): a register specifier @@ -455,27 +513,44 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, if ((TSFlags >> X86II::VEXShift) & X86II::VEX_W) VEX_W = 1; + if ((TSFlags >> X86II::VEXShift) & X86II::XOP) + XOP = 1; + if ((TSFlags >> X86II::VEXShift) & X86II::VEX_L) VEX_L = 1; switch (TSFlags & X86II::Op0Mask) { - default: assert(0 && "Invalid prefix!"); + default: llvm_unreachable("Invalid prefix!"); case X86II::T8: // 0F 38 VEX_5M = 0x2; break; case X86II::TA: // 0F 3A VEX_5M = 0x3; break; - case X86II::TF: // F2 0F 38 + case X86II::T8XS: // F3 0F 38 + VEX_PP = 0x2; + VEX_5M = 0x2; + break; + case X86II::T8XD: // F2 0F 38 VEX_PP = 0x3; VEX_5M = 0x2; break; + case X86II::TAXD: // F2 0F 3A + VEX_PP = 0x3; + VEX_5M = 0x3; + break; case X86II::XS: // F3 0F VEX_PP = 0x2; break; case X86II::XD: // F2 0F VEX_PP = 0x3; break; + case X86II::XOP8: + VEX_5M = 0x8; + break; + case X86II::XOP9: + VEX_5M = 0x9; + break; case X86II::A6: // Bypass: Not used by VEX case X86II::A7: // Bypass: Not used by VEX case X86II::TB: // Bypass: Not used by VEX @@ -483,6 +558,7 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, break; // No prefix! 
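The prefix bytes this routine eventually emits have a fixed layout; a compact sketch of that layout (the function name is made up, inputs are the logical R/X/B/W/vvvv values, which the encoding stores inverted, and XOP selects the 0x8F escape instead of 0xC4):

    #include <cstdint>
    #include <vector>

    std::vector<uint8_t> buildVEX(bool R, bool X, bool B, bool W, bool XOP,
                                  unsigned MMMMM, unsigned VVVV, bool L,
                                  unsigned PP) {
      uint8_t LastByte = static_cast<uint8_t>(((~VVVV & 0xF) << 3) |
                                              ((L ? 1 : 0) << 2) | (PP & 3));
      // The two-byte form only exists for the 0F map with X, B and W clear.
      if (!X && !B && !W && !XOP && MMMMM == 1)
        return {0xC5, static_cast<uint8_t>(((R ? 0 : 1) << 7) | LastByte)};
      return {static_cast<uint8_t>(XOP ? 0x8F : 0xC4),
              static_cast<uint8_t>(((R ? 0 : 1) << 7) | ((X ? 0 : 1) << 6) |
                                   ((B ? 0 : 1) << 5) | (MMMMM & 0x1F)),
              static_cast<uint8_t>(((W ? 1 : 0) << 7) | LastByte)};
    }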
} + // Set the vector length to 256-bit if YMM0-YMM15 is used for (unsigned i = 0; i != MI.getNumOperands(); ++i) { if (!MI.getOperand(i).isReg()) @@ -495,7 +571,7 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, // Classify VEX_B, VEX_4V, VEX_R, VEX_X unsigned CurOp = 0; switch (TSFlags & X86II::FormMask) { - case X86II::MRMInitReg: assert(0 && "FIXME: Remove this!"); + case X86II::MRMInitReg: llvm_unreachable("FIXME: Remove this!"); case X86II::MRMDestMem: { // MRMDestMem instructions forms: // MemAddr, src1(ModR/M) @@ -516,41 +592,50 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, VEX_R = 0x0; break; } - case X86II::MRMSrcMem: { + case X86II::MRMSrcMem: // MRMSrcMem instructions forms: // src1(ModR/M), MemAddr // src1(ModR/M), src2(VEX_4V), MemAddr // src1(ModR/M), MemAddr, imm8 // src1(ModR/M), MemAddr, src2(VEX_I8IMM) // + // FMA4: + // dst(ModR/M.reg), src1(VEX_4V), src2(ModR/M), src3(VEX_I8IMM) + // dst(ModR/M.reg), src1(VEX_4V), src2(VEX_I8IMM), src3(ModR/M), if (X86II::isX86_64ExtendedReg(MI.getOperand(0).getReg())) VEX_R = 0x0; - unsigned MemAddrOffset = 1; - if (HasVEX_4V) { + if (HasVEX_4V) VEX_4V = getVEXRegisterEncoding(MI, 1); - MemAddrOffset++; - } if (X86II::isX86_64ExtendedReg( - MI.getOperand(MemAddrOffset+X86::AddrBaseReg).getReg())) + MI.getOperand(MemOperand+X86::AddrBaseReg).getReg())) VEX_B = 0x0; if (X86II::isX86_64ExtendedReg( - MI.getOperand(MemAddrOffset+X86::AddrIndexReg).getReg())) + MI.getOperand(MemOperand+X86::AddrIndexReg).getReg())) VEX_X = 0x0; + + if (HasVEX_4VOp3) + VEX_4V = getVEXRegisterEncoding(MI, X86::AddrNumOperands+1); break; - } case X86II::MRM0m: case X86II::MRM1m: case X86II::MRM2m: case X86II::MRM3m: case X86II::MRM4m: case X86II::MRM5m: - case X86II::MRM6m: case X86II::MRM7m: + case X86II::MRM6m: case X86II::MRM7m: { // MRM[0-9]m instructions forms: // MemAddr - if (X86II::isX86_64ExtendedReg(MI.getOperand(X86::AddrBaseReg).getReg())) + // src1(VEX_4V), MemAddr + if (HasVEX_4V) + VEX_4V = getVEXRegisterEncoding(MI, 0); + + if (X86II::isX86_64ExtendedReg( + MI.getOperand(MemOperand+X86::AddrBaseReg).getReg())) VEX_B = 0x0; - if (X86II::isX86_64ExtendedReg(MI.getOperand(X86::AddrIndexReg).getReg())) + if (X86II::isX86_64ExtendedReg( + MI.getOperand(MemOperand+X86::AddrIndexReg).getReg())) VEX_X = 0x0; break; + } case X86II::MRMSrcReg: // MRMSrcReg instructions forms: // dst(ModR/M), src1(VEX_4V), src2(ModR/M), src3(VEX_I8IMM) @@ -565,6 +650,9 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, VEX_4V = getVEXRegisterEncoding(MI, CurOp++); if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg())) VEX_B = 0x0; + CurOp++; + if (HasVEX_4VOp3) + VEX_4V = getVEXRegisterEncoding(MI, CurOp); break; case X86II::MRMDestReg: // MRMDestReg instructions forms: @@ -605,14 +693,14 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, // unsigned char LastByte = VEX_PP | (VEX_L << 2) | (VEX_4V << 3); - if (VEX_B && VEX_X && !VEX_W && (VEX_5M == 1)) { // 2 byte VEX prefix + if (VEX_B && VEX_X && !VEX_W && !XOP && (VEX_5M == 1)) { // 2 byte VEX prefix EmitByte(0xC5, CurByte, OS); EmitByte(LastByte | (VEX_R << 7), CurByte, OS); return; } // 3 byte VEX prefix - EmitByte(0xC4, CurByte, OS); + EmitByte(XOP ? 
0x8F : 0xC4, CurByte, OS); EmitByte(VEX_R << 7 | VEX_X << 6 | VEX_B << 5 | VEX_5M, CurByte, OS); EmitByte(LastByte | (VEX_W << 7), CurByte, OS); } @@ -647,7 +735,7 @@ static unsigned DetermineREXPrefix(const MCInst &MI, uint64_t TSFlags, } switch (TSFlags & X86II::FormMask) { - case X86II::MRMInitReg: assert(0 && "FIXME: Remove this!"); + case X86II::MRMInitReg: llvm_unreachable("FIXME: Remove this!"); case X86II::MRMSrcReg: if (MI.getOperand(0).isReg() && X86II::isX86_64ExtendedReg(MI.getOperand(0).getReg())) @@ -717,12 +805,12 @@ void X86MCCodeEmitter::EmitSegmentOverridePrefix(uint64_t TSFlags, const MCInst &MI, raw_ostream &OS) const { switch (TSFlags & X86II::SegOvrMask) { - default: assert(0 && "Invalid segment!"); + default: llvm_unreachable("Invalid segment!"); case 0: // No segment override, check for explicit one on memory operand. if (MemOperand != -1) { // If the instruction has a memory operand. switch (MI.getOperand(MemOperand+X86::AddrSegmentReg).getReg()) { - default: assert(0 && "Unknown segment register!"); + default: llvm_unreachable("Unknown segment register!"); case 0: break; case X86::CS: EmitByte(0x2E, CurByte, OS); break; case X86::SS: EmitByte(0x36, CurByte, OS); break; @@ -763,8 +851,22 @@ void X86MCCodeEmitter::EmitOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, EmitByte(0xF3, CurByte, OS); // Emit the address size opcode prefix as needed. - if ((TSFlags & X86II::AdSize) || - (MemOperand != -1 && is64BitMode() && Is32BitMemOperand(MI, MemOperand))) + bool need_address_override; + if (TSFlags & X86II::AdSize) { + need_address_override = true; + } else if (MemOperand == -1) { + need_address_override = false; + } else if (is64BitMode()) { + assert(!Is16BitMemOperand(MI, MemOperand)); + need_address_override = Is32BitMemOperand(MI, MemOperand); + } else if (is32BitMode()) { + assert(!Is64BitMemOperand(MI, MemOperand)); + need_address_override = Is16BitMemOperand(MI, MemOperand); + } else { + need_address_override = false; + } + + if (need_address_override) EmitByte(0x67, CurByte, OS); // Emit the operand size opcode prefix as needed. @@ -773,7 +875,7 @@ void X86MCCodeEmitter::EmitOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, bool Need0FPrefix = false; switch (TSFlags & X86II::Op0Mask) { - default: assert(0 && "Invalid prefix!"); + default: llvm_unreachable("Invalid prefix!"); case 0: break; // No prefix! case X86II::REP: break; // already handled. case X86II::TB: // Two-byte opcode prefix @@ -783,7 +885,15 @@ void X86MCCodeEmitter::EmitOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, case X86II::A7: // 0F A7 Need0FPrefix = true; break; - case X86II::TF: // F2 0F 38 + case X86II::T8XS: // F3 0F 38 + EmitByte(0xF3, CurByte, OS); + Need0FPrefix = true; + break; + case X86II::T8XD: // F2 0F 38 + EmitByte(0xF2, CurByte, OS); + Need0FPrefix = true; + break; + case X86II::TAXD: // F2 0F 3A EmitByte(0xF2, CurByte, OS); Need0FPrefix = true; break; @@ -818,10 +928,12 @@ void X86MCCodeEmitter::EmitOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, // FIXME: Pull this up into previous switch if REX can be moved earlier. switch (TSFlags & X86II::Op0Mask) { - case X86II::TF: // F2 0F 38 + case X86II::T8XS: // F3 0F 38 + case X86II::T8XD: // F2 0F 38 case X86II::T8: // 0F 38 EmitByte(0x38, CurByte, OS); break; + case X86II::TAXD: // F2 0F 3A case X86II::TA: // 0F 3A EmitByte(0x3A, CurByte, OS); break; @@ -859,18 +971,16 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS, unsigned CurByte = 0; // Is this instruction encoded using the AVX VEX prefix? 
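The reworked address-size logic above boils down to: emit the 0x67 prefix whenever the memory operand's register width differs from the mode's default address width (the hunk only has to handle 32- and 64-bit modes). A compact restatement with illustrative names:

    enum class Mode { Bit16, Bit32, Bit64 };

    // MemWidth is the register width used by the memory operand.
    bool needsAddrSizeOverride(Mode CpuMode, Mode MemWidth) {
      switch (CpuMode) {
      case Mode::Bit64: return MemWidth == Mode::Bit32; // e.g. (%eax) in 64-bit code
      case Mode::Bit32: return MemWidth == Mode::Bit16; // e.g. (%bx) in 32-bit code
      case Mode::Bit16: return MemWidth == Mode::Bit32; // general x86 rule; not reached above
      }
      return false;
    }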
- bool HasVEXPrefix = false; + bool HasVEXPrefix = (TSFlags >> X86II::VEXShift) & X86II::VEX; // It uses the VEX.VVVV field? - bool HasVEX_4V = false; - - if ((TSFlags >> X86II::VEXShift) & X86II::VEX) - HasVEXPrefix = true; - if ((TSFlags >> X86II::VEXShift) & X86II::VEX_4V) - HasVEX_4V = true; + bool HasVEX_4V = (TSFlags >> X86II::VEXShift) & X86II::VEX_4V; + bool HasVEX_4VOp3 = (TSFlags >> X86II::VEXShift) & X86II::VEX_4VOp3; + bool HasMemOp4 = (TSFlags >> X86II::VEXShift) & X86II::MemOp4; + const unsigned MemOp4_I8IMMOperand = 2; // Determine where the memory operand starts, if present. - int MemoryOperand = X86II::getMemoryOperandNo(TSFlags); + int MemoryOperand = X86II::getMemoryOperandNo(TSFlags, Opcode); if (MemoryOperand != -1) MemoryOperand += CurOp; if (!HasVEXPrefix) @@ -886,27 +996,29 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS, unsigned SrcRegNum = 0; switch (TSFlags & X86II::FormMask) { case X86II::MRMInitReg: - assert(0 && "FIXME: Remove this form when the JIT moves to MCCodeEmitter!"); + llvm_unreachable("FIXME: Remove this form when the JIT moves to MCCodeEmitter!"); default: errs() << "FORM: " << (TSFlags & X86II::FormMask) << "\n"; - assert(0 && "Unknown FormMask value in X86MCCodeEmitter!"); + llvm_unreachable("Unknown FormMask value in X86MCCodeEmitter!"); case X86II::Pseudo: - assert(0 && "Pseudo instruction shouldn't be emitted"); + llvm_unreachable("Pseudo instruction shouldn't be emitted"); case X86II::RawFrm: EmitByte(BaseOpcode, CurByte, OS); break; case X86II::RawFrmImm8: EmitByte(BaseOpcode, CurByte, OS); - EmitImmediate(MI.getOperand(CurOp++), + EmitImmediate(MI.getOperand(CurOp++), MI.getLoc(), X86II::getSizeOfImm(TSFlags), getImmFixupKind(TSFlags), CurByte, OS, Fixups); - EmitImmediate(MI.getOperand(CurOp++), 1, FK_Data_1, CurByte, OS, Fixups); + EmitImmediate(MI.getOperand(CurOp++), MI.getLoc(), 1, FK_Data_1, CurByte, + OS, Fixups); break; case X86II::RawFrmImm16: EmitByte(BaseOpcode, CurByte, OS); - EmitImmediate(MI.getOperand(CurOp++), + EmitImmediate(MI.getOperand(CurOp++), MI.getLoc(), X86II::getSizeOfImm(TSFlags), getImmFixupKind(TSFlags), CurByte, OS, Fixups); - EmitImmediate(MI.getOperand(CurOp++), 2, FK_Data_2, CurByte, OS, Fixups); + EmitImmediate(MI.getOperand(CurOp++), MI.getLoc(), 2, FK_Data_2, CurByte, + OS, Fixups); break; case X86II::AddRegFrm: @@ -940,9 +1052,16 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS, if (HasVEX_4V) // Skip 1st src (which is encoded in VEX_VVVV) SrcRegNum++; + if(HasMemOp4) // Skip 2nd src (which is encoded in I8IMM) + SrcRegNum++; + EmitRegModRMByte(MI.getOperand(SrcRegNum), GetX86RegNum(MI.getOperand(CurOp)), CurByte, OS); - CurOp = SrcRegNum + 1; + + // 2 operands skipped with HasMemOp4, comensate accordingly + CurOp = HasMemOp4 ? SrcRegNum : SrcRegNum + 1; + if (HasVEX_4VOp3) + ++CurOp; break; case X86II::MRMSrcMem: { @@ -952,12 +1071,16 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS, ++AddrOperands; ++FirstMemOp; // Skip the register source (which is encoded in VEX_VVVV). 
} + if(HasMemOp4) // Skip second register source (encoded in I8IMM) + ++FirstMemOp; EmitByte(BaseOpcode, CurByte, OS); EmitMemModRMByte(MI, FirstMemOp, GetX86RegNum(MI.getOperand(CurOp)), TSFlags, CurByte, OS, Fixups); CurOp += AddrOperands + 1; + if (HasVEX_4VOp3) + ++CurOp; break; } @@ -976,58 +1099,52 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS, case X86II::MRM2m: case X86II::MRM3m: case X86II::MRM4m: case X86II::MRM5m: case X86II::MRM6m: case X86II::MRM7m: + if (HasVEX_4V) // Skip the register dst (which is encoded in VEX_VVVV). + CurOp++; EmitByte(BaseOpcode, CurByte, OS); EmitMemModRMByte(MI, CurOp, (TSFlags & X86II::FormMask)-X86II::MRM0m, TSFlags, CurByte, OS, Fixups); CurOp += X86::AddrNumOperands; break; - case X86II::MRM_C1: - EmitByte(BaseOpcode, CurByte, OS); - EmitByte(0xC1, CurByte, OS); - break; - case X86II::MRM_C2: - EmitByte(BaseOpcode, CurByte, OS); - EmitByte(0xC2, CurByte, OS); - break; - case X86II::MRM_C3: - EmitByte(BaseOpcode, CurByte, OS); - EmitByte(0xC3, CurByte, OS); - break; - case X86II::MRM_C4: - EmitByte(BaseOpcode, CurByte, OS); - EmitByte(0xC4, CurByte, OS); - break; - case X86II::MRM_C8: - EmitByte(BaseOpcode, CurByte, OS); - EmitByte(0xC8, CurByte, OS); - break; - case X86II::MRM_C9: - EmitByte(BaseOpcode, CurByte, OS); - EmitByte(0xC9, CurByte, OS); - break; - case X86II::MRM_E8: - EmitByte(BaseOpcode, CurByte, OS); - EmitByte(0xE8, CurByte, OS); - break; - case X86II::MRM_F0: - EmitByte(BaseOpcode, CurByte, OS); - EmitByte(0xF0, CurByte, OS); - break; - case X86II::MRM_F8: - EmitByte(BaseOpcode, CurByte, OS); - EmitByte(0xF8, CurByte, OS); - break; + case X86II::MRM_C1: case X86II::MRM_C2: + case X86II::MRM_C3: case X86II::MRM_C4: + case X86II::MRM_C8: case X86II::MRM_C9: + case X86II::MRM_D0: case X86II::MRM_D1: + case X86II::MRM_D4: case X86II::MRM_D8: + case X86II::MRM_D9: case X86II::MRM_DA: + case X86II::MRM_DB: case X86II::MRM_DC: + case X86II::MRM_DD: case X86II::MRM_DE: + case X86II::MRM_DF: case X86II::MRM_E8: + case X86II::MRM_F0: case X86II::MRM_F8: case X86II::MRM_F9: EmitByte(BaseOpcode, CurByte, OS); - EmitByte(0xF9, CurByte, OS); - break; - case X86II::MRM_D0: - EmitByte(BaseOpcode, CurByte, OS); - EmitByte(0xD0, CurByte, OS); - break; - case X86II::MRM_D1: - EmitByte(BaseOpcode, CurByte, OS); - EmitByte(0xD1, CurByte, OS); + + unsigned char MRM; + switch (TSFlags & X86II::FormMask) { + default: llvm_unreachable("Invalid Form"); + case X86II::MRM_C1: MRM = 0xC1; break; + case X86II::MRM_C2: MRM = 0xC2; break; + case X86II::MRM_C3: MRM = 0xC3; break; + case X86II::MRM_C4: MRM = 0xC4; break; + case X86II::MRM_C8: MRM = 0xC8; break; + case X86II::MRM_C9: MRM = 0xC9; break; + case X86II::MRM_D0: MRM = 0xD0; break; + case X86II::MRM_D1: MRM = 0xD1; break; + case X86II::MRM_D4: MRM = 0xD4; break; + case X86II::MRM_D8: MRM = 0xD8; break; + case X86II::MRM_D9: MRM = 0xD9; break; + case X86II::MRM_DA: MRM = 0xDA; break; + case X86II::MRM_DB: MRM = 0xDB; break; + case X86II::MRM_DC: MRM = 0xDC; break; + case X86II::MRM_DD: MRM = 0xDD; break; + case X86II::MRM_DE: MRM = 0xDE; break; + case X86II::MRM_DF: MRM = 0xDF; break; + case X86II::MRM_E8: MRM = 0xE8; break; + case X86II::MRM_F0: MRM = 0xF0; break; + case X86II::MRM_F8: MRM = 0xF8; break; + case X86II::MRM_F9: MRM = 0xF9; break; + } + EmitByte(MRM, CurByte, OS); break; } @@ -1035,14 +1152,26 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS, // according to the right size for the instruction. 
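Just below, the VEX_I8IMM case packs a register number into the top nibble of an immediate byte and an optional small literal into the bottom nibble; as a standalone helper (name made up) that is simply:

    #include <cassert>
    #include <cstdint>

    uint8_t packRegAndImm(unsigned RegNum /*0..15*/, unsigned Imm = 0 /*0..15*/) {
      assert(RegNum < 16 && Imm < 16 && "each field must fit in four bits");
      return static_cast<uint8_t>((RegNum << 4) | Imm);
    }
    // packRegAndImm(13, 2) == 0xD2 : xmm13 in bits [7:4], literal 2 in [3:0].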
if (CurOp != NumOps) { // The last source register of a 4 operand instruction in AVX is encoded - // in bits[7:4] of a immediate byte, and bits[3:0] are ignored. + // in bits[7:4] of a immediate byte. if ((TSFlags >> X86II::VEXShift) & X86II::VEX_I8IMM) { - const MCOperand &MO = MI.getOperand(CurOp++); + const MCOperand &MO = MI.getOperand(HasMemOp4 ? MemOp4_I8IMMOperand + : CurOp); + CurOp++; bool IsExtReg = X86II::isX86_64ExtendedReg(MO.getReg()); unsigned RegNum = (IsExtReg ? (1 << 7) : 0); RegNum |= GetX86RegNum(MO) << 4; - EmitImmediate(MCOperand::CreateImm(RegNum), 1, FK_Data_1, CurByte, OS, - Fixups); + // If there is an additional 5th operand it must be an immediate, which + // is encoded in bits[3:0] + if(CurOp != NumOps) { + const MCOperand &MIMM = MI.getOperand(CurOp++); + if(MIMM.isImm()) { + unsigned Val = MIMM.getImm(); + assert(Val < 16 && "Immediate operand value out of range"); + RegNum |= Val; + } + } + EmitImmediate(MCOperand::CreateImm(RegNum), MI.getLoc(), 1, FK_Data_1, + CurByte, OS, Fixups); } else { unsigned FixupKind; // FIXME: Is there a better way to know that we need a signed relocation? @@ -1053,7 +1182,7 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS, FixupKind = X86::reloc_signed_4byte; else FixupKind = getImmFixupKind(TSFlags); - EmitImmediate(MI.getOperand(CurOp++), + EmitImmediate(MI.getOperand(CurOp++), MI.getLoc(), X86II::getSizeOfImm(TSFlags), MCFixupKind(FixupKind), CurByte, OS, Fixups); } diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp index f98d5e3..3482363 100644 --- a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp @@ -1,4 +1,4 @@ -//===-- X86MCTargetDesc.cpp - X86 Target Descriptions -----------*- C++ -*-===// +//===-- X86MCTargetDesc.cpp - X86 Target Descriptions ---------------------===// // // The LLVM Compiler Infrastructure // @@ -24,6 +24,7 @@ #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/ADT/Triple.h" #include "llvm/Support/Host.h" +#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/TargetRegistry.h" #define GET_REGINFO_MC_DESC @@ -35,6 +36,10 @@ #define GET_SUBTARGETINFO_MC_DESC #include "X86GenSubtargetInfo.inc" +#if _MSC_VER +#include <intrin.h> +#endif + using namespace llvm; @@ -45,10 +50,6 @@ std::string X86_MC::ParseX86Triple(StringRef TT) { FS = "+64bit-mode"; else FS = "-64bit-mode"; - if (TheTriple.getOS() == Triple::NativeClient) - FS += ",+nacl-mode"; - else - FS += ",-nacl-mode"; return FS; } @@ -76,6 +77,8 @@ bool X86_MC::GetCpuIDAndInfo(unsigned value, unsigned *rEAX, *rECX = registers[2]; *rEDX = registers[3]; return false; + #else + return true; #endif #elif defined(i386) || defined(__i386__) || defined(__x86__) || defined(_M_IX86) #if defined(__GNUC__) @@ -102,9 +105,81 @@ bool X86_MC::GetCpuIDAndInfo(unsigned value, unsigned *rEAX, mov dword ptr [esi],edx } return false; + #else + return true; #endif +#else + return true; #endif +} + +/// GetCpuIDAndInfoEx - Execute the specified cpuid with subleaf and return the +/// 4 values in the specified arguments. If we can't run cpuid on the host, +/// return true. +bool X86_MC::GetCpuIDAndInfoEx(unsigned value, unsigned subleaf, unsigned *rEAX, + unsigned *rEBX, unsigned *rECX, unsigned *rEDX) { +#if defined(__x86_64__) || defined(_M_AMD64) || defined (_M_X64) + #if defined(__GNUC__) + // gcc desn't know cpuid would clobber ebx/rbx. Preseve it manually. 
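A standalone version of the subleaf query added here, for GCC/Clang on x86-64 only (the asm follows the same pattern as the hunk: %rbx is saved around cpuid because it can be the PIC register; the function name is made up):

    #include <cstdio>

    static void cpuidCount(unsigned Leaf, unsigned Subleaf, unsigned &A,
                           unsigned &B, unsigned &C, unsigned &D) {
      __asm__("movq\t%%rbx, %%rsi\n\t"
              "cpuid\n\t"
              "xchgq\t%%rbx, %%rsi\n\t"
              : "=a"(A), "=S"(B), "=c"(C), "=d"(D)
              : "a"(Leaf), "c"(Subleaf));
    }

    int main() {
      unsigned A, B, C, D;
      cpuidCount(7, 0, A, B, C, D);   // structured extended feature flags
      std::printf("leaf 7: ebx=%08x ecx=%08x\n", B, C);
    }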
+ asm ("movq\t%%rbx, %%rsi\n\t" + "cpuid\n\t" + "xchgq\t%%rbx, %%rsi\n\t" + : "=a" (*rEAX), + "=S" (*rEBX), + "=c" (*rECX), + "=d" (*rEDX) + : "a" (value), + "c" (subleaf)); + return false; + #elif defined(_MSC_VER) + // __cpuidex was added in MSVC++ 9.0 SP1 + #if (_MSC_VER > 1500) || (_MSC_VER == 1500 && _MSC_FULL_VER >= 150030729) + int registers[4]; + __cpuidex(registers, value, subleaf); + *rEAX = registers[0]; + *rEBX = registers[1]; + *rECX = registers[2]; + *rEDX = registers[3]; + return false; + #else + return true; + #endif + #else + return true; + #endif +#elif defined(i386) || defined(__i386__) || defined(__x86__) || defined(_M_IX86) + #if defined(__GNUC__) + asm ("movl\t%%ebx, %%esi\n\t" + "cpuid\n\t" + "xchgl\t%%ebx, %%esi\n\t" + : "=a" (*rEAX), + "=S" (*rEBX), + "=c" (*rECX), + "=d" (*rEDX) + : "a" (value), + "c" (subleaf)); + return false; + #elif defined(_MSC_VER) + __asm { + mov eax,value + mov ecx,subleaf + cpuid + mov esi,rEAX + mov dword ptr [esi],eax + mov esi,rEBX + mov dword ptr [esi],ebx + mov esi,rECX + mov dword ptr [esi],ecx + mov esi,rEDX + mov dword ptr [esi],edx + } + return false; + #else + return true; + #endif +#else return true; +#endif } void X86_MC::DetectFamilyModel(unsigned EAX, unsigned &Family, @@ -261,7 +336,8 @@ MCSubtargetInfo *X86_MC::createX86MCSubtargetInfo(StringRef TT, StringRef CPU, std::string CPUName = CPU; if (CPUName.empty()) { -#if defined (__x86_64__) || defined(__i386__) +#if defined(i386) || defined(__i386__) || defined(__x86__) || defined(_M_IX86)\ + || defined(__x86_64__) || defined(_M_AMD64) || defined (_M_X64) CPUName = sys::getHostCPUName(); #else CPUName = "generic"; @@ -303,8 +379,10 @@ static MCAsmInfo *createX86MCAsmInfo(const Target &T, StringRef TT) { MAI = new X86_64MCAsmInfoDarwin(TheTriple); else MAI = new X86MCAsmInfoDarwin(TheTriple); - } else if (TheTriple.isOSWindows()) { - MAI = new X86MCAsmInfoCOFF(TheTriple); + } else if (TheTriple.getOS() == Triple::Win32) { + MAI = new X86MCAsmInfoMicrosoft(TheTriple); + } else if (TheTriple.getOS() == Triple::MinGW32 || TheTriple.getOS() == Triple::Cygwin) { + MAI = new X86MCAsmInfoGNUCOFF(TheTriple); } else { MAI = new X86ELFMCAsmInfo(TheTriple); } @@ -327,7 +405,8 @@ static MCAsmInfo *createX86MCAsmInfo(const Target &T, StringRef TT) { } static MCCodeGenInfo *createX86MCCodeGenInfo(StringRef TT, Reloc::Model RM, - CodeModel::Model CM) { + CodeModel::Model CM, + CodeGenOpt::Level OL) { MCCodeGenInfo *X = new MCCodeGenInfo(); Triple T(TT); @@ -371,7 +450,7 @@ static MCCodeGenInfo *createX86MCCodeGenInfo(StringRef TT, Reloc::Model RM, // 64-bit JIT places everything in the same buffer except external funcs. CM = is64Bit ? 
CodeModel::Large : CodeModel::Small; - X->InitMCCodeGenInfo(RM, CM); + X->InitMCCodeGenInfo(RM, CM, OL); return X; } @@ -395,11 +474,13 @@ static MCStreamer *createMCStreamer(const Target &T, StringRef TT, static MCInstPrinter *createX86MCInstPrinter(const Target &T, unsigned SyntaxVariant, const MCAsmInfo &MAI, + const MCInstrInfo &MII, + const MCRegisterInfo &MRI, const MCSubtargetInfo &STI) { if (SyntaxVariant == 0) - return new X86ATTInstPrinter(MAI); + return new X86ATTInstPrinter(MAI, MII, MRI); if (SyntaxVariant == 1) - return new X86IntelInstPrinter(MAI); + return new X86IntelInstPrinter(MAI, MII, MRI); return 0; } diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h index c144c51..9896cbe 100644 --- a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h @@ -54,6 +54,11 @@ namespace X86_MC { /// the specified arguments. If we can't run cpuid on the host, return true. bool GetCpuIDAndInfo(unsigned value, unsigned *rEAX, unsigned *rEBX, unsigned *rECX, unsigned *rEDX); + /// GetCpuIDAndInfoEx - Execute the specified cpuid with subleaf and return + /// the 4 values in the specified arguments. If we can't run cpuid on the + /// host, return true. + bool GetCpuIDAndInfoEx(unsigned value, unsigned subleaf, unsigned *rEAX, + unsigned *rEBX, unsigned *rECX, unsigned *rEDX); void DetectFamilyModel(unsigned EAX, unsigned &Family, unsigned &Model); @@ -83,6 +88,12 @@ MCObjectWriter *createX86MachObjectWriter(raw_ostream &OS, uint32_t CPUType, uint32_t CPUSubtype); +/// createX86ELFObjectWriter - Construct an X86 ELF object writer. +MCObjectWriter *createX86ELFObjectWriter(raw_ostream &OS, + bool Is64Bit, + uint8_t OSABI); +/// createX86WinCOFFObjectWriter - Construct an X86 Win COFF object writer. +MCObjectWriter *createX86WinCOFFObjectWriter(raw_ostream &OS, bool Is64Bit); } // End llvm namespace diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp new file mode 100644 index 0000000..bc272ef --- /dev/null +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp @@ -0,0 +1,65 @@ +//===-- X86WinCOFFObjectWriter.cpp - X86 Win COFF Writer ------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "MCTargetDesc/X86FixupKinds.h" +#include "MCTargetDesc/X86MCTargetDesc.h" +#include "llvm/MC/MCWinCOFFObjectWriter.h" +#include "llvm/Support/COFF.h" +#include "llvm/Support/ErrorHandling.h" + +using namespace llvm; + +namespace llvm { + class MCObjectWriter; +} + +namespace { + class X86WinCOFFObjectWriter : public MCWinCOFFObjectTargetWriter { + const bool Is64Bit; + + public: + X86WinCOFFObjectWriter(bool Is64Bit_); + ~X86WinCOFFObjectWriter(); + + virtual unsigned getRelocType(unsigned FixupKind) const; + }; +} + +X86WinCOFFObjectWriter::X86WinCOFFObjectWriter(bool Is64Bit_) + : MCWinCOFFObjectTargetWriter(Is64Bit_ ? 
COFF::IMAGE_FILE_MACHINE_AMD64 : + COFF::IMAGE_FILE_MACHINE_I386), + Is64Bit(Is64Bit_) {} + +X86WinCOFFObjectWriter::~X86WinCOFFObjectWriter() {} + +unsigned X86WinCOFFObjectWriter::getRelocType(unsigned FixupKind) const { + switch (FixupKind) { + case FK_PCRel_4: + case X86::reloc_riprel_4byte: + case X86::reloc_riprel_4byte_movq_load: + return Is64Bit ? COFF::IMAGE_REL_AMD64_REL32 : COFF::IMAGE_REL_I386_REL32; + case FK_Data_4: + case X86::reloc_signed_4byte: + return Is64Bit ? COFF::IMAGE_REL_AMD64_ADDR32 : COFF::IMAGE_REL_I386_DIR32; + case FK_Data_8: + if (Is64Bit) + return COFF::IMAGE_REL_AMD64_ADDR64; + llvm_unreachable("unsupported relocation type"); + case FK_SecRel_4: + return Is64Bit ? COFF::IMAGE_REL_AMD64_SECREL : COFF::IMAGE_REL_I386_SECREL; + default: + llvm_unreachable("unsupported relocation type"); + } +} + +MCObjectWriter *llvm::createX86WinCOFFObjectWriter(raw_ostream &OS, + bool Is64Bit) { + MCWinCOFFObjectTargetWriter *MOTW = new X86WinCOFFObjectWriter(Is64Bit); + return createWinCOFFObjectWriter(MOTW, OS); +} diff --git a/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.cpp b/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.cpp index aeb3309..32c722a 100644 --- a/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.cpp +++ b/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.cpp @@ -20,7 +20,7 @@ namespace llvm { -void DecodeINSERTPSMask(unsigned Imm, SmallVectorImpl<unsigned> &ShuffleMask) { +void DecodeINSERTPSMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask) { // Defaults the copying the dest value. ShuffleMask.push_back(0); ShuffleMask.push_back(1); @@ -44,8 +44,7 @@ void DecodeINSERTPSMask(unsigned Imm, SmallVectorImpl<unsigned> &ShuffleMask) { } // <3,1> or <6,7,2,3> -void DecodeMOVHLPSMask(unsigned NElts, - SmallVectorImpl<unsigned> &ShuffleMask) { +void DecodeMOVHLPSMask(unsigned NElts, SmallVectorImpl<int> &ShuffleMask) { for (unsigned i = NElts/2; i != NElts; ++i) ShuffleMask.push_back(NElts+i); @@ -54,8 +53,7 @@ void DecodeMOVHLPSMask(unsigned NElts, } // <0,2> or <0,1,4,5> -void DecodeMOVLHPSMask(unsigned NElts, - SmallVectorImpl<unsigned> &ShuffleMask) { +void DecodeMOVLHPSMask(unsigned NElts, SmallVectorImpl<int> &ShuffleMask) { for (unsigned i = 0; i != NElts/2; ++i) ShuffleMask.push_back(i); @@ -63,16 +61,26 @@ void DecodeMOVLHPSMask(unsigned NElts, ShuffleMask.push_back(NElts+i); } -void DecodePSHUFMask(unsigned NElts, unsigned Imm, - SmallVectorImpl<unsigned> &ShuffleMask) { - for (unsigned i = 0; i != NElts; ++i) { - ShuffleMask.push_back(Imm % NElts); - Imm /= NElts; +/// DecodePSHUFMask - This decodes the shuffle masks for pshufd, and vpermilp*. +/// VT indicates the type of the vector allowing it to handle different +/// datatypes and vector widths. 
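For a plain 128-bit PSHUFD (v4i32) the per-lane loop below reduces to reading the immediate two bits at a time; a self-contained sketch (function name made up):

    #include <cstdio>
    #include <vector>

    std::vector<int> decodePSHUFD4(unsigned Imm) {
      std::vector<int> Mask;
      for (unsigned i = 0; i != 4; ++i)
        Mask.push_back(static_cast<int>((Imm >> (2 * i)) & 3));
      return Mask;
    }

    int main() {
      for (int Idx : decodePSHUFD4(0x1B))   // 0x1B reverses: <3,2,1,0>
        std::printf("%d ", Idx);
      std::printf("\n");
    }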
+void DecodePSHUFMask(EVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) { + unsigned NumElts = VT.getVectorNumElements(); + + unsigned NumLanes = VT.getSizeInBits() / 128; + unsigned NumLaneElts = NumElts / NumLanes; + + int NewImm = Imm; + for (unsigned l = 0; l != NumElts; l += NumLaneElts) { + for (unsigned i = 0; i != NumLaneElts; ++i) { + ShuffleMask.push_back(NewImm % NumLaneElts + l); + NewImm /= NumLaneElts; + } + if (NumLaneElts == 4) NewImm = Imm; // reload imm } } -void DecodePSHUFHWMask(unsigned Imm, - SmallVectorImpl<unsigned> &ShuffleMask) { +void DecodePSHUFHWMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask) { ShuffleMask.push_back(0); ShuffleMask.push_back(1); ShuffleMask.push_back(2); @@ -83,8 +91,7 @@ void DecodePSHUFHWMask(unsigned Imm, } } -void DecodePSHUFLWMask(unsigned Imm, - SmallVectorImpl<unsigned> &ShuffleMask) { +void DecodePSHUFLWMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask) { for (unsigned i = 0; i != 4; ++i) { ShuffleMask.push_back((Imm & 3)); Imm >>= 2; @@ -95,76 +102,35 @@ void DecodePSHUFLWMask(unsigned Imm, ShuffleMask.push_back(7); } -void DecodePUNPCKLBWMask(unsigned NElts, - SmallVectorImpl<unsigned> &ShuffleMask) { - DecodeUNPCKLPMask(MVT::getVectorVT(MVT::i8, NElts), ShuffleMask); -} - -void DecodePUNPCKLWDMask(unsigned NElts, - SmallVectorImpl<unsigned> &ShuffleMask) { - DecodeUNPCKLPMask(MVT::getVectorVT(MVT::i16, NElts), ShuffleMask); -} - -void DecodePUNPCKLDQMask(unsigned NElts, - SmallVectorImpl<unsigned> &ShuffleMask) { - DecodeUNPCKLPMask(MVT::getVectorVT(MVT::i32, NElts), ShuffleMask); -} - -void DecodePUNPCKLQDQMask(unsigned NElts, - SmallVectorImpl<unsigned> &ShuffleMask) { - DecodeUNPCKLPMask(MVT::getVectorVT(MVT::i64, NElts), ShuffleMask); -} - -void DecodePUNPCKLMask(EVT VT, - SmallVectorImpl<unsigned> &ShuffleMask) { - DecodeUNPCKLPMask(VT, ShuffleMask); -} - -void DecodePUNPCKHMask(unsigned NElts, - SmallVectorImpl<unsigned> &ShuffleMask) { - for (unsigned i = 0; i != NElts/2; ++i) { - ShuffleMask.push_back(i+NElts/2); - ShuffleMask.push_back(i+NElts+NElts/2); - } -} +/// DecodeSHUFPMask - This decodes the shuffle masks for shufp*. VT indicates +/// the type of the vector allowing it to handle different datatypes and vector +/// widths. +void DecodeSHUFPMask(EVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) { + unsigned NumElts = VT.getVectorNumElements(); -void DecodeSHUFPSMask(unsigned NElts, unsigned Imm, - SmallVectorImpl<unsigned> &ShuffleMask) { - // Part that reads from dest. - for (unsigned i = 0; i != NElts/2; ++i) { - ShuffleMask.push_back(Imm % NElts); - Imm /= NElts; - } - // Part that reads from src. - for (unsigned i = 0; i != NElts/2; ++i) { - ShuffleMask.push_back(Imm % NElts + NElts); - Imm /= NElts; - } -} + unsigned NumLanes = VT.getSizeInBits() / 128; + unsigned NumLaneElts = NumElts / NumLanes; -void DecodeUNPCKHPMask(unsigned NElts, - SmallVectorImpl<unsigned> &ShuffleMask) { - for (unsigned i = 0; i != NElts/2; ++i) { - ShuffleMask.push_back(i+NElts/2); // Reads from dest - ShuffleMask.push_back(i+NElts+NElts/2); // Reads from src + int NewImm = Imm; + for (unsigned l = 0; l != NumElts; l += NumLaneElts) { + // Part that reads from dest. + for (unsigned i = 0; i != NumLaneElts/2; ++i) { + ShuffleMask.push_back(NewImm % NumLaneElts + l); + NewImm /= NumLaneElts; + } + // Part that reads from src. 
+ for (unsigned i = 0; i != NumLaneElts/2; ++i) { + ShuffleMask.push_back(NewImm % NumLaneElts + NumElts + l); + NewImm /= NumLaneElts; + } + if (NumLaneElts == 4) NewImm = Imm; // reload imm } } -void DecodeUNPCKLPSMask(unsigned NElts, - SmallVectorImpl<unsigned> &ShuffleMask) { - DecodeUNPCKLPMask(MVT::getVectorVT(MVT::i32, NElts), ShuffleMask); -} - -void DecodeUNPCKLPDMask(unsigned NElts, - SmallVectorImpl<unsigned> &ShuffleMask) { - DecodeUNPCKLPMask(MVT::getVectorVT(MVT::i64, NElts), ShuffleMask); -} - -/// DecodeUNPCKLPMask - This decodes the shuffle masks for unpcklps/unpcklpd -/// etc. VT indicates the type of the vector allowing it to handle different -/// datatypes and vector widths. -void DecodeUNPCKLPMask(EVT VT, - SmallVectorImpl<unsigned> &ShuffleMask) { +/// DecodeUNPCKHMask - This decodes the shuffle masks for unpckhps/unpckhpd +/// and punpckh*. VT indicates the type of the vector allowing it to handle +/// different datatypes and vector widths. +void DecodeUNPCKHMask(EVT VT, SmallVectorImpl<int> &ShuffleMask) { unsigned NumElts = VT.getVectorNumElements(); // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate @@ -173,55 +139,36 @@ void DecodeUNPCKLPMask(EVT VT, if (NumLanes == 0 ) NumLanes = 1; // Handle MMX unsigned NumLaneElts = NumElts / NumLanes; - unsigned Start = 0; - unsigned End = NumLaneElts / 2; - for (unsigned s = 0; s < NumLanes; ++s) { - for (unsigned i = Start; i != End; ++i) { - ShuffleMask.push_back(i); // Reads from dest/src1 - ShuffleMask.push_back(i+NumLaneElts); // Reads from src/src2 + for (unsigned l = 0; l != NumElts; l += NumLaneElts) { + for (unsigned i = l + NumLaneElts/2, e = l + NumLaneElts; i != e; ++i) { + ShuffleMask.push_back(i); // Reads from dest/src1 + ShuffleMask.push_back(i+NumElts); // Reads from src/src2 } - // Process the next 128 bits. - Start += NumLaneElts; - End += NumLaneElts; } } -// DecodeVPERMILPSMask - Decodes VPERMILPS permutes for any 128-bit 32-bit -// elements. For 256-bit vectors, it's considered as two 128 lanes, the -// referenced elements can't cross lanes and the mask of the first lane must -// be the same of the second. -void DecodeVPERMILPSMask(unsigned NumElts, unsigned Imm, - SmallVectorImpl<unsigned> &ShuffleMask) { - unsigned NumLanes = (NumElts*32)/128; - unsigned LaneSize = NumElts/NumLanes; - - for (unsigned l = 0; l != NumLanes; ++l) { - for (unsigned i = 0; i != LaneSize; ++i) { - unsigned Idx = (Imm >> (i*2)) & 0x3 ; - ShuffleMask.push_back(Idx+(l*LaneSize)); - } - } -} +/// DecodeUNPCKLMask - This decodes the shuffle masks for unpcklps/unpcklpd +/// and punpckl*. VT indicates the type of the vector allowing it to handle +/// different datatypes and vector widths. +void DecodeUNPCKLMask(EVT VT, SmallVectorImpl<int> &ShuffleMask) { + unsigned NumElts = VT.getVectorNumElements(); -// DecodeVPERMILPDMask - Decodes VPERMILPD permutes for any 128-bit 64-bit -// elements. For 256-bit vectors, it's considered as two 128 lanes, the -// referenced elements can't cross lanes but the mask of the first lane can -// be the different of the second (not like VPERMILPS). -void DecodeVPERMILPDMask(unsigned NumElts, unsigned Imm, - SmallVectorImpl<unsigned> &ShuffleMask) { - unsigned NumLanes = (NumElts*64)/128; - unsigned LaneSize = NumElts/NumLanes; - - for (unsigned l = 0; l < NumLanes; ++l) { - for (unsigned i = l*LaneSize; i < LaneSize*(l+1); ++i) { - unsigned Idx = (Imm >> i) & 0x1; - ShuffleMask.push_back(Idx+(l*LaneSize)); + // Handle 128 and 256-bit vector lengths. 
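The unpack-low decode that follows interleaves the low halves of the two sources within each 128-bit lane; for a single v4i32 lane the resulting mask is simply <0,4,1,5>, as in this sketch (function name made up):

    #include <vector>

    // One 128-bit lane of unpcklps on v4i32: elements 0..1 of src1 are
    // interleaved with elements 0..1 of src2 (indices 4..5).
    std::vector<int> decodeUNPCKL4() {
      std::vector<int> Mask;
      for (int i = 0; i != 2; ++i) {
        Mask.push_back(i);        // from src1
        Mask.push_back(i + 4);    // from src2
      }
      return Mask;                // <0,4,1,5>
    }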
AVX defines UNPCK* to operate + // independently on 128-bit lanes. + unsigned NumLanes = VT.getSizeInBits() / 128; + if (NumLanes == 0 ) NumLanes = 1; // Handle MMX + unsigned NumLaneElts = NumElts / NumLanes; + + for (unsigned l = 0; l != NumElts; l += NumLaneElts) { + for (unsigned i = l, e = l + NumLaneElts/2; i != e; ++i) { + ShuffleMask.push_back(i); // Reads from dest/src1 + ShuffleMask.push_back(i+NumElts); // Reads from src/src2 } } } -void DecodeVPERM2F128Mask(EVT VT, unsigned Imm, - SmallVectorImpl<unsigned> &ShuffleMask) { +void DecodeVPERM2X128Mask(EVT VT, unsigned Imm, + SmallVectorImpl<int> &ShuffleMask) { unsigned HalfSize = VT.getVectorNumElements()/2; unsigned FstHalfBegin = (Imm & 0x3) * HalfSize; unsigned SndHalfBegin = ((Imm >> 4) & 0x3) * HalfSize; @@ -232,12 +179,4 @@ void DecodeVPERM2F128Mask(EVT VT, unsigned Imm, ShuffleMask.push_back(i); } -void DecodeVPERM2F128Mask(unsigned Imm, - SmallVectorImpl<unsigned> &ShuffleMask) { - // VPERM2F128 is used by any 256-bit EVT, but X86InstComments only - // has information about the instruction and not the types. So for - // instruction comments purpose, assume the 256-bit vector is v4i64. - return DecodeVPERM2F128Mask(MVT::v4i64, Imm, ShuffleMask); -} - } // llvm namespace diff --git a/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.h b/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.h index 58193e6..5b8c6ef 100644 --- a/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.h +++ b/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.h @@ -24,83 +24,41 @@ namespace llvm { enum { - SM_SentinelZero = ~0U + SM_SentinelZero = -1 }; -void DecodeINSERTPSMask(unsigned Imm, SmallVectorImpl<unsigned> &ShuffleMask); +void DecodeINSERTPSMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask); // <3,1> or <6,7,2,3> -void DecodeMOVHLPSMask(unsigned NElts, - SmallVectorImpl<unsigned> &ShuffleMask); +void DecodeMOVHLPSMask(unsigned NElts, SmallVectorImpl<int> &ShuffleMask); // <0,2> or <0,1,4,5> -void DecodeMOVLHPSMask(unsigned NElts, - SmallVectorImpl<unsigned> &ShuffleMask); +void DecodeMOVLHPSMask(unsigned NElts, SmallVectorImpl<int> &ShuffleMask); -void DecodePSHUFMask(unsigned NElts, unsigned Imm, - SmallVectorImpl<unsigned> &ShuffleMask); +void DecodePSHUFMask(EVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask); -void DecodePSHUFHWMask(unsigned Imm, - SmallVectorImpl<unsigned> &ShuffleMask); +void DecodePSHUFHWMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask); -void DecodePSHUFLWMask(unsigned Imm, - SmallVectorImpl<unsigned> &ShuffleMask); +void DecodePSHUFLWMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask); -void DecodePUNPCKLBWMask(unsigned NElts, - SmallVectorImpl<unsigned> &ShuffleMask); +/// DecodeSHUFPMask - This decodes the shuffle masks for shufp*. VT indicates +/// the type of the vector allowing it to handle different datatypes and vector +/// widths. +void DecodeSHUFPMask(EVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask); -void DecodePUNPCKLWDMask(unsigned NElts, - SmallVectorImpl<unsigned> &ShuffleMask); +/// DecodeUNPCKHMask - This decodes the shuffle masks for unpckhps/unpckhpd +/// and punpckh*. VT indicates the type of the vector allowing it to handle +/// different datatypes and vector widths. +void DecodeUNPCKHMask(EVT VT, SmallVectorImpl<int> &ShuffleMask); -void DecodePUNPCKLDQMask(unsigned NElts, - SmallVectorImpl<unsigned> &ShuffleMask); +/// DecodeUNPCKLMask - This decodes the shuffle masks for unpcklps/unpcklpd +/// and punpckl*. 
VT indicates the type of the vector allowing it to handle +/// different datatypes and vector widths. +void DecodeUNPCKLMask(EVT VT, SmallVectorImpl<int> &ShuffleMask); -void DecodePUNPCKLQDQMask(unsigned NElts, - SmallVectorImpl<unsigned> &ShuffleMask); -void DecodePUNPCKLMask(EVT VT, - SmallVectorImpl<unsigned> &ShuffleMask); - -void DecodePUNPCKHMask(unsigned NElts, - SmallVectorImpl<unsigned> &ShuffleMask); - -void DecodeSHUFPSMask(unsigned NElts, unsigned Imm, - SmallVectorImpl<unsigned> &ShuffleMask); - -void DecodeUNPCKHPMask(unsigned NElts, - SmallVectorImpl<unsigned> &ShuffleMask); - -void DecodeUNPCKLPSMask(unsigned NElts, - SmallVectorImpl<unsigned> &ShuffleMask); - -void DecodeUNPCKLPDMask(unsigned NElts, - SmallVectorImpl<unsigned> &ShuffleMask); - -/// DecodeUNPCKLPMask - This decodes the shuffle masks for unpcklps/unpcklpd -/// etc. VT indicates the type of the vector allowing it to handle different -/// datatypes and vector widths. -void DecodeUNPCKLPMask(EVT VT, - SmallVectorImpl<unsigned> &ShuffleMask); - - -// DecodeVPERMILPSMask - Decodes VPERMILPS permutes for any 128-bit 32-bit -// elements. For 256-bit vectors, it's considered as two 128 lanes, the -// referenced elements can't cross lanes and the mask of the first lane must -// be the same of the second. -void DecodeVPERMILPSMask(unsigned NElts, unsigned Imm, - SmallVectorImpl<unsigned> &ShuffleMask); - -// DecodeVPERMILPDMask - Decodes VPERMILPD permutes for any 128-bit 64-bit -// elements. For 256-bit vectors, it's considered as two 128 lanes, the -// referenced elements can't cross lanes but the mask of the first lane can -// be the different of the second (not like VPERMILPS). -void DecodeVPERMILPDMask(unsigned NElts, unsigned Imm, - SmallVectorImpl<unsigned> &ShuffleMask); - -void DecodeVPERM2F128Mask(unsigned Imm, - SmallVectorImpl<unsigned> &ShuffleMask); -void DecodeVPERM2F128Mask(EVT VT, unsigned Imm, - SmallVectorImpl<unsigned> &ShuffleMask); +void DecodeVPERM2X128Mask(EVT VT, unsigned Imm, + SmallVectorImpl<int> &ShuffleMask); } // llvm namespace diff --git a/contrib/llvm/lib/Target/X86/X86.h b/contrib/llvm/lib/Target/X86/X86.h index 81e9422..ecc7b59 100644 --- a/contrib/llvm/lib/Target/X86/X86.h +++ b/contrib/llvm/lib/Target/X86/X86.h @@ -24,8 +24,6 @@ namespace llvm { class FunctionPass; class JITCodeEmitter; -class MachineCodeEmitter; -class Target; class X86TargetMachine; /// createX86ISelDag - This pass converts a legalized DAG into a diff --git a/contrib/llvm/lib/Target/X86/X86.td b/contrib/llvm/lib/Target/X86/X86.td index 104b91f..b6591d4 100644 --- a/contrib/llvm/lib/Target/X86/X86.td +++ b/contrib/llvm/lib/Target/X86/X86.td @@ -1,4 +1,4 @@ -//===- X86.td - Target definition file for the Intel X86 ---*- tablegen -*-===// +//===-- X86.td - Target definition file for the Intel X86 --*- tablegen -*-===// // // The LLVM Compiler Infrastructure // @@ -23,9 +23,6 @@ include "llvm/Target/Target.td" def Mode64Bit : SubtargetFeature<"64bit-mode", "In64BitMode", "true", "64-bit mode (x86_64)">; -def ModeNaCl : SubtargetFeature<"nacl-mode", "InNaClMode", "true", - "Native Client mode">; - //===----------------------------------------------------------------------===// // X86 Subtarget features. 
//===----------------------------------------------------------------------===// @@ -58,7 +55,7 @@ def FeatureSSE41 : SubtargetFeature<"sse41", "X86SSELevel", "SSE41", [FeatureSSSE3]>; def FeatureSSE42 : SubtargetFeature<"sse42", "X86SSELevel", "SSE42", "Enable SSE 4.2 instructions", - [FeatureSSE41, FeaturePOPCNT]>; + [FeatureSSE41]>; def Feature3DNow : SubtargetFeature<"3dnow", "X863DNowLevel", "ThreeDNow", "Enable 3DNow! instructions", [FeatureMMX]>; @@ -81,16 +78,24 @@ def FeatureFastUAMem : SubtargetFeature<"fast-unaligned-mem", "Fast unaligned memory access">; def FeatureSSE4A : SubtargetFeature<"sse4a", "HasSSE4A", "true", "Support SSE 4a instructions", - [FeaturePOPCNT]>; + [FeatureSSE3]>; -def FeatureAVX : SubtargetFeature<"avx", "HasAVX", "true", - "Enable AVX instructions">; +def FeatureAVX : SubtargetFeature<"avx", "X86SSELevel", "AVX", + "Enable AVX instructions", + [FeatureSSE42]>; +def FeatureAVX2 : SubtargetFeature<"avx2", "X86SSELevel", "AVX2", + "Enable AVX2 instructions", + [FeatureAVX]>; def FeatureCLMUL : SubtargetFeature<"clmul", "HasCLMUL", "true", "Enable carry-less multiplication instructions">; def FeatureFMA3 : SubtargetFeature<"fma3", "HasFMA3", "true", - "Enable three-operand fused multiple-add">; + "Enable three-operand fused multiple-add", + [FeatureAVX]>; def FeatureFMA4 : SubtargetFeature<"fma4", "HasFMA4", "true", - "Enable four-operand fused multiple-add">; + "Enable four-operand fused multiple-add", + [FeatureAVX]>; +def FeatureXOP : SubtargetFeature<"xop", "HasXOP", "true", + "Enable XOP instructions">; def FeatureVectorUAMem : SubtargetFeature<"vector-unaligned-mem", "HasVectorUAMem", "true", "Allow unaligned memory operands on vector/SIMD instructions">; @@ -102,17 +107,31 @@ def FeatureRDRAND : SubtargetFeature<"rdrand", "HasRDRAND", "true", "Support RDRAND instruction">; def FeatureF16C : SubtargetFeature<"f16c", "HasF16C", "true", "Support 16-bit floating point conversion instructions">; +def FeatureFSGSBase : SubtargetFeature<"fsgsbase", "HasFSGSBase", "true", + "Support FS/GS Base instructions">; def FeatureLZCNT : SubtargetFeature<"lzcnt", "HasLZCNT", "true", "Support LZCNT instruction">; def FeatureBMI : SubtargetFeature<"bmi", "HasBMI", "true", "Support BMI instructions">; +def FeatureBMI2 : SubtargetFeature<"bmi2", "HasBMI2", "true", + "Support BMI2 instructions">; +def FeatureLeaForSP : SubtargetFeature<"lea-sp", "UseLeaForSP", "true", + "Use LEA for adjusting the stack pointer">; //===----------------------------------------------------------------------===// // X86 processors supported. 
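Many of the edits in this hunk just adjust the implied-feature lists (for example FeatureAVX2 now pulls in FeatureAVX, which pulls in FeatureSSE42). Conceptually that is a transitive closure over the dependency lists; an illustrative C++ sketch, not TableGen, with made-up names:

    #include <map>
    #include <set>
    #include <string>
    #include <vector>

    using ImpliesMap = std::map<std::string, std::vector<std::string>>;

    void enableFeature(const std::string &F, const ImpliesMap &Implies,
                       std::set<std::string> &Enabled) {
      if (!Enabled.insert(F).second)
        return;                                  // already enabled
      auto It = Implies.find(F);
      if (It != Implies.end())
        for (const std::string &Dep : It->second)
          enableFeature(Dep, Implies, Enabled);  // pull in implied features
    }
    // enableFeature("avx2", {{"avx2", {"avx"}}, {"avx", {"sse42"}}}, S)
    // leaves S == {"avx", "avx2", "sse42"}.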
//===----------------------------------------------------------------------===// +include "X86Schedule.td" + +def ProcIntelAtom : SubtargetFeature<"atom", "X86ProcFamily", "IntelAtom", + "Intel Atom processors">; + class Proc<string Name, list<SubtargetFeature> Features> - : Processor<Name, NoItineraries, Features>; + : Processor<Name, GenericItineraries, Features>; + +class AtomProc<string Name, list<SubtargetFeature> Features> + : Processor<Name, AtomItineraries, Features>; def : Proc<"generic", []>; def : Proc<"i386", []>; @@ -137,34 +156,38 @@ def : Proc<"core2", [FeatureSSSE3, FeatureCMPXCHG16B, FeatureSlowBTMem]>; def : Proc<"penryn", [FeatureSSE41, FeatureCMPXCHG16B, FeatureSlowBTMem]>; -def : Proc<"atom", [FeatureSSE3, FeatureCMPXCHG16B, FeatureMOVBE, - FeatureSlowBTMem]>; +def : AtomProc<"atom", [ProcIntelAtom, FeatureSSE3, FeatureCMPXCHG16B, + FeatureMOVBE, FeatureSlowBTMem, FeatureLeaForSP]>; // "Arrandale" along with corei3 and corei5 def : Proc<"corei7", [FeatureSSE42, FeatureCMPXCHG16B, - FeatureSlowBTMem, FeatureFastUAMem, FeatureAES]>; + FeatureSlowBTMem, FeatureFastUAMem, + FeaturePOPCNT, FeatureAES]>; def : Proc<"nehalem", [FeatureSSE42, FeatureCMPXCHG16B, - FeatureSlowBTMem, FeatureFastUAMem]>; + FeatureSlowBTMem, FeatureFastUAMem, + FeaturePOPCNT]>; // Westmere is a similar machine to nehalem with some additional features. // Westmere is the corei3/i5/i7 path from nehalem to sandybridge def : Proc<"westmere", [FeatureSSE42, FeatureCMPXCHG16B, - FeatureSlowBTMem, FeatureFastUAMem, FeatureAES, - FeatureCLMUL]>; + FeatureSlowBTMem, FeatureFastUAMem, + FeaturePOPCNT, FeatureAES, FeatureCLMUL]>; // Sandy Bridge // SSE is not listed here since llvm treats AVX as a reimplementation of SSE, // rather than a superset. // FIXME: Disabling AVX for now since it's not ready. -def : Proc<"corei7-avx", [FeatureSSE42, FeatureCMPXCHG16B, +def : Proc<"corei7-avx", [FeatureSSE42, FeatureCMPXCHG16B, FeaturePOPCNT, FeatureAES, FeatureCLMUL]>; // Ivy Bridge -def : Proc<"core-avx-i", [FeatureSSE42, FeatureCMPXCHG16B, +def : Proc<"core-avx-i", [FeatureSSE42, FeatureCMPXCHG16B, FeaturePOPCNT, FeatureAES, FeatureCLMUL, - FeatureRDRAND, FeatureF16C]>; + FeatureRDRAND, FeatureF16C, FeatureFSGSBase]>; // Haswell -def : Proc<"core-avx2", [FeatureSSE42, FeatureCMPXCHG16B, FeatureAES, - FeatureCLMUL, FeatureRDRAND, FeatureF16C, - FeatureFMA3, FeatureMOVBE, FeatureLZCNT, - FeatureBMI]>; +// FIXME: Disabling AVX/AVX2/FMA3 for now since it's not ready. 
+def : Proc<"core-avx2", [FeatureSSE42, FeatureCMPXCHG16B, FeaturePOPCNT, + FeatureAES, FeatureCLMUL, FeatureRDRAND, + FeatureF16C, FeatureFSGSBase, + FeatureMOVBE, FeatureLZCNT, FeatureBMI, + FeatureBMI2]>; def : Proc<"k6", [FeatureMMX]>; def : Proc<"k6-2", [Feature3DNow]>; @@ -189,15 +212,21 @@ def : Proc<"opteron-sse3", [FeatureSSE3, Feature3DNowA, FeatureCMPXCHG16B, def : Proc<"athlon64-sse3", [FeatureSSE3, Feature3DNowA, FeatureCMPXCHG16B, FeatureSlowBTMem]>; def : Proc<"amdfam10", [FeatureSSE3, FeatureSSE4A, - Feature3DNowA, FeatureCMPXCHG16B, - FeatureSlowBTMem]>; -def : Proc<"barcelona", [FeatureSSE3, FeatureSSE4A, - Feature3DNowA, FeatureCMPXCHG16B, - FeatureSlowBTMem]>; -def : Proc<"istanbul", [Feature3DNowA, FeatureCMPXCHG16B, - FeatureSSE4A, Feature3DNowA]>; -def : Proc<"shanghai", [Feature3DNowA, FeatureCMPXCHG16B, FeatureSSE4A, - Feature3DNowA]>; + Feature3DNowA, FeatureCMPXCHG16B, FeatureLZCNT, + FeaturePOPCNT, FeatureSlowBTMem]>; +// Bobcat +def : Proc<"btver1", [FeatureSSSE3, FeatureSSE4A, FeatureCMPXCHG16B, + FeatureLZCNT, FeaturePOPCNT]>; +// FIXME: Disabling AVX/FMA4 for now since it's not ready. +// Bulldozer +def : Proc<"bdver1", [FeatureSSE42, FeatureSSE4A, FeatureCMPXCHG16B, + FeatureAES, FeatureCLMUL, + FeatureXOP, FeatureLZCNT, FeaturePOPCNT]>; +// Enhanced Bulldozer +def : Proc<"bdver2", [FeatureSSE42, FeatureSSE4A, FeatureCMPXCHG16B, + FeatureAES, FeatureCLMUL, + FeatureXOP, FeatureF16C, FeatureLZCNT, + FeaturePOPCNT, FeatureBMI]>; def : Proc<"winchip-c6", [FeatureMMX]>; def : Proc<"winchip2", [Feature3DNow]>; @@ -229,9 +258,11 @@ include "X86CallingConv.td" // Assembly Parser //===----------------------------------------------------------------------===// -// Currently the X86 assembly parser only supports ATT syntax. def ATTAsmParser : AsmParser { - string AsmParserClassName = "ATTAsmParser"; + string AsmParserClassName = "AsmParser"; +} + +def ATTAsmParserVariant : AsmParserVariant { int Variant = 0; // Discard comments in assembly strings. @@ -241,6 +272,16 @@ def ATTAsmParser : AsmParser { string RegisterPrefix = "%"; } +def IntelAsmParserVariant : AsmParserVariant { + int Variant = 1; + + // Discard comments in assembly strings. + string CommentDelimiter = ";"; + + // Recognize hard coded registers. + string RegisterPrefix = ""; +} + //===----------------------------------------------------------------------===// // Assembly Printers //===----------------------------------------------------------------------===// @@ -261,8 +302,7 @@ def IntelAsmWriter : AsmWriter { def X86 : Target { // Information about the instructions... 
let InstructionSet = X86InstrInfo; - let AssemblyParsers = [ATTAsmParser]; - + let AssemblyParserVariants = [ATTAsmParserVariant, IntelAsmParserVariant]; let AssemblyWriters = [ATTAsmWriter, IntelAsmWriter]; } diff --git a/contrib/llvm/lib/Target/X86/X86AsmPrinter.cpp b/contrib/llvm/lib/Target/X86/X86AsmPrinter.cpp index 4c3ff02..7db7ccb 100644 --- a/contrib/llvm/lib/Target/X86/X86AsmPrinter.cpp +++ b/contrib/llvm/lib/Target/X86/X86AsmPrinter.cpp @@ -13,13 +13,12 @@ //===----------------------------------------------------------------------===// #include "X86AsmPrinter.h" -#include "InstPrinter/X86ATTInstPrinter.h" -#include "InstPrinter/X86IntelInstPrinter.h" #include "X86MCInstLower.h" #include "X86.h" #include "X86COFFMachineModuleInfo.h" #include "X86MachineFunctionInfo.h" #include "X86TargetMachine.h" +#include "InstPrinter/X86ATTInstPrinter.h" #include "llvm/CallingConv.h" #include "llvm/DerivedTypes.h" #include "llvm/Module.h" @@ -199,6 +198,7 @@ void X86AsmPrinter::printSymbolOperand(const MachineOperand &MO, case X86II::MO_TLVP_PIC_BASE: O << "@TLVP" << '-' << *MF->getPICBaseSymbol(); break; + case X86II::MO_SECREL: O << "@SECREL"; break; } } @@ -264,16 +264,40 @@ void X86AsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNo, void X86AsmPrinter::printSSECC(const MachineInstr *MI, unsigned Op, raw_ostream &O) { unsigned char value = MI->getOperand(Op).getImm(); - assert(value <= 7 && "Invalid ssecc argument!"); switch (value) { - case 0: O << "eq"; break; - case 1: O << "lt"; break; - case 2: O << "le"; break; - case 3: O << "unord"; break; - case 4: O << "neq"; break; - case 5: O << "nlt"; break; - case 6: O << "nle"; break; - case 7: O << "ord"; break; + default: llvm_unreachable("Invalid ssecc argument!"); + case 0: O << "eq"; break; + case 1: O << "lt"; break; + case 2: O << "le"; break; + case 3: O << "unord"; break; + case 4: O << "neq"; break; + case 5: O << "nlt"; break; + case 6: O << "nle"; break; + case 7: O << "ord"; break; + case 8: O << "eq_uq"; break; + case 9: O << "nge"; break; + case 0xa: O << "ngt"; break; + case 0xb: O << "false"; break; + case 0xc: O << "neq_oq"; break; + case 0xd: O << "ge"; break; + case 0xe: O << "gt"; break; + case 0xf: O << "true"; break; + case 0x10: O << "eq_os"; break; + case 0x11: O << "lt_oq"; break; + case 0x12: O << "le_oq"; break; + case 0x13: O << "unord_s"; break; + case 0x14: O << "neq_us"; break; + case 0x15: O << "nlt_uq"; break; + case 0x16: O << "nle_uq"; break; + case 0x17: O << "ord_s"; break; + case 0x18: O << "eq_us"; break; + case 0x19: O << "nge_uq"; break; + case 0x1a: O << "ngt_uq"; break; + case 0x1b: O << "false_os"; break; + case 0x1c: O << "neq_os"; break; + case 0x1d: O << "ge_oq"; break; + case 0x1e: O << "gt_oq"; break; + case 0x1f: O << "true_us"; break; } } @@ -575,7 +599,7 @@ void X86AsmPrinter::EmitEndOfAsmFile(Module &M) { } if (Subtarget->isTargetWindows() && !Subtarget->isTargetCygMing() && - MMI->callsExternalVAFunctionWithFloatingPointArguments()) { + MMI->usesVAFloatArgument()) { StringRef SymbolName = Subtarget->is64Bit() ? 
"_fltused" : "__fltused"; MCSymbol *S = MMI->getContext().GetOrCreateSymbol(SymbolName); OutStreamer.EmitSymbolAttribute(S, MCSA_Global); diff --git a/contrib/llvm/lib/Target/X86/X86AsmPrinter.h b/contrib/llvm/lib/Target/X86/X86AsmPrinter.h index 3a50435..a6ed9ba 100644 --- a/contrib/llvm/lib/Target/X86/X86AsmPrinter.h +++ b/contrib/llvm/lib/Target/X86/X86AsmPrinter.h @@ -17,7 +17,6 @@ #include "X86.h" #include "X86MachineFunctionInfo.h" #include "X86TargetMachine.h" -#include "llvm/ADT/StringSet.h" #include "llvm/CodeGen/AsmPrinter.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/ValueTypes.h" @@ -25,11 +24,7 @@ namespace llvm { -class MachineJumpTableInfo; -class MCContext; -class MCInst; class MCStreamer; -class MCSymbol; class LLVM_LIBRARY_VISIBILITY X86AsmPrinter : public AsmPrinter { const X86Subtarget *Subtarget; diff --git a/contrib/llvm/lib/Target/X86/X86COFFMachineModuleInfo.cpp b/contrib/llvm/lib/Target/X86/X86COFFMachineModuleInfo.cpp index 4326814..e01ff41 100644 --- a/contrib/llvm/lib/Target/X86/X86COFFMachineModuleInfo.cpp +++ b/contrib/llvm/lib/Target/X86/X86COFFMachineModuleInfo.cpp @@ -1,4 +1,4 @@ -//===-- llvm/CodeGen/X86COFFMachineModuleInfo.cpp -------------------------===// +//===-- X86COFFMachineModuleInfo.cpp - X86 COFF MMI Impl ------------------===// // // The LLVM Compiler Infrastructure // diff --git a/contrib/llvm/lib/Target/X86/X86COFFMachineModuleInfo.h b/contrib/llvm/lib/Target/X86/X86COFFMachineModuleInfo.h index 98ab2a6..0cec95a 100644 --- a/contrib/llvm/lib/Target/X86/X86COFFMachineModuleInfo.h +++ b/contrib/llvm/lib/Target/X86/X86COFFMachineModuleInfo.h @@ -1,4 +1,4 @@ -//===-- llvm/CodeGen/X86COFFMachineModuleInfo.h -----------------*- C++ -*-===// +//===-- X86COFFMachineModuleInfo.h - X86 COFF MMI Impl ----------*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -14,9 +14,9 @@ #ifndef X86COFF_MACHINEMODULEINFO_H #define X86COFF_MACHINEMODULEINFO_H +#include "X86MachineFunctionInfo.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/ADT/DenseSet.h" -#include "X86MachineFunctionInfo.h" namespace llvm { class X86MachineFunctionInfo; diff --git a/contrib/llvm/lib/Target/X86/X86CallingConv.td b/contrib/llvm/lib/Target/X86/X86CallingConv.td index 77b9905..d148989 100644 --- a/contrib/llvm/lib/Target/X86/X86CallingConv.td +++ b/contrib/llvm/lib/Target/X86/X86CallingConv.td @@ -1,10 +1,10 @@ -//===- X86CallingConv.td - Calling Conventions X86 32/64 ---*- tablegen -*-===// -// +//===-- X86CallingConv.td - Calling Conventions X86 32/64 --*- tablegen -*-===// +// // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. -// +// //===----------------------------------------------------------------------===// // // This describes the calling conventions for the X86-32 and X86-64 @@ -61,7 +61,7 @@ def RetCC_X86_32_C : CallingConv<[ // weirdly; this is really the sse-regparm calling convention) in which // case they use XMM0, otherwise it is the same as the common X86 calling // conv. - CCIfInReg<CCIfSubtarget<"hasXMMInt()", + CCIfInReg<CCIfSubtarget<"hasSSE2()", CCIfType<[f32, f64], CCAssignToReg<[XMM0,XMM1,XMM2]>>>>, CCIfType<[f32,f64], CCAssignToReg<[ST0, ST1]>>, CCDelegateTo<RetCC_X86Common> @@ -73,8 +73,8 @@ def RetCC_X86_32_Fast : CallingConv<[ // SSE2. // This can happen when a float, 2 x float, or 3 x float vector is split by // target lowering, and is returned in 1-3 sse regs. 
- CCIfType<[f32], CCIfSubtarget<"hasXMMInt()", CCAssignToReg<[XMM0,XMM1,XMM2]>>>, - CCIfType<[f64], CCIfSubtarget<"hasXMMInt()", CCAssignToReg<[XMM0,XMM1,XMM2]>>>, + CCIfType<[f32], CCIfSubtarget<"hasSSE2()", CCAssignToReg<[XMM0,XMM1,XMM2]>>>, + CCIfType<[f64], CCIfSubtarget<"hasSSE2()", CCAssignToReg<[XMM0,XMM1,XMM2]>>>, // For integers, ECX can be used as an extra return register CCIfType<[i8], CCAssignToReg<[AL, DL, CL]>>, @@ -150,18 +150,23 @@ def CC_X86_64_C : CallingConv<[ // The first 8 MMX vector arguments are passed in XMM registers on Darwin. CCIfType<[x86mmx], CCIfSubtarget<"isTargetDarwin()", - CCIfSubtarget<"hasXMMInt()", + CCIfSubtarget<"hasSSE2()", CCPromoteToType<v2i64>>>>, // The first 8 FP/Vector arguments are passed in XMM registers. CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], - CCIfSubtarget<"hasXMM()", + CCIfSubtarget<"hasSSE1()", CCAssignToReg<[XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7]>>>, - // The first 8 256-bit vector arguments are passed in YMM registers. - CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], - CCIfSubtarget<"hasAVX()", - CCAssignToReg<[YMM0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7]>>>, + // The first 8 256-bit vector arguments are passed in YMM registers, unless + // this is a vararg function. + // FIXME: This isn't precisely correct; the x86-64 ABI document says that + // fixed arguments to vararg functions are supposed to be passed in + // registers. Actually modeling that would be a lot of work, though. + CCIfNotVarArg<CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], + CCIfSubtarget<"hasAVX()", + CCAssignToReg<[YMM0, YMM1, YMM2, YMM3, + YMM4, YMM5, YMM6, YMM7]>>>>, // Integer/FP values get stored in stack slots that are 8 bytes in size and // 8-byte aligned if there are no more registers to hold them. @@ -193,6 +198,10 @@ def CC_X86_Win64_C : CallingConv<[ // 128 bit vectors are passed by pointer CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCPassIndirect<i64>>, + + // 256 bit vectors are passed by pointer + CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], CCPassIndirect<i64>>, + // The first 4 MMX vector arguments are passed in GPRs. CCIfType<[x86mmx], CCBitConvertToType<i64>>, @@ -233,7 +242,7 @@ def CC_X86_64_GHC : CallingConv<[ // Pass in STG registers: F1, F2, F3, F4, D1, D2 CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], - CCIfSubtarget<"hasXMM()", + CCIfSubtarget<"hasSSE1()", CCAssignToReg<[XMM1, XMM2, XMM3, XMM4, XMM5, XMM6]>>> ]>; @@ -251,7 +260,7 @@ def CC_X86_32_Common : CallingConv<[ // The first 3 float or double arguments, if marked 'inreg' and if the call // is not a vararg call and if SSE2 is available, are passed in SSE registers. CCIfNotVarArg<CCIfInReg<CCIfType<[f32,f64], - CCIfSubtarget<"hasXMMInt()", + CCIfSubtarget<"hasSSE2()", CCAssignToReg<[XMM0,XMM1,XMM2]>>>>>, // The first 3 __m64 vector arguments are passed in mmx registers if the @@ -322,8 +331,8 @@ def CC_X86_32_ThisCall : CallingConv<[ // Promote i8/i16 arguments to i32. CCIfType<[i8, i16], CCPromoteToType<i32>>, - // The 'nest' parameter, if any, is passed in EAX. - CCIfNest<CCAssignToReg<[EAX]>>, + // Pass sret arguments indirectly through EAX + CCIfSRet<CCAssignToReg<[EAX]>>, // The first integer argument is passed in ECX CCIfType<[i32], CCAssignToReg<[ECX]>>, @@ -350,7 +359,7 @@ def CC_X86_32_FastCC : CallingConv<[ // The first 3 float or double arguments, if the call is not a vararg // call and if SSE2 is available, are passed in SSE registers. 
CCIfNotVarArg<CCIfType<[f32,f64], - CCIfSubtarget<"hasXMMInt()", + CCIfSubtarget<"hasSSE2()", CCAssignToReg<[XMM0,XMM1,XMM2]>>>>, // Doubles get 8-byte slots that are 8-byte aligned. @@ -399,3 +408,18 @@ def CC_X86 : CallingConv<[ CCIfSubtarget<"is64Bit()", CCDelegateTo<CC_X86_64>>, CCDelegateTo<CC_X86_32> ]>; + +//===----------------------------------------------------------------------===// +// Callee-saved Registers. +//===----------------------------------------------------------------------===// + +def CSR_Ghc : CalleeSavedRegs<(add)>; + +def CSR_32 : CalleeSavedRegs<(add ESI, EDI, EBX, EBP)>; +def CSR_64 : CalleeSavedRegs<(add RBX, R12, R13, R14, R15, RBP)>; + +def CSR_32EHRet : CalleeSavedRegs<(add EAX, EDX, CSR_32)>; +def CSR_64EHRet : CalleeSavedRegs<(add RAX, RDX, CSR_64)>; + +def CSR_Win64 : CalleeSavedRegs<(add RBX, RBP, RDI, RSI, R12, R13, R14, R15, + (sequence "XMM%u", 6, 15))>; diff --git a/contrib/llvm/lib/Target/X86/X86CodeEmitter.cpp b/contrib/llvm/lib/Target/X86/X86CodeEmitter.cpp index f939510..ee3de9a 100644 --- a/contrib/llvm/lib/Target/X86/X86CodeEmitter.cpp +++ b/contrib/llvm/lib/Target/X86/X86CodeEmitter.cpp @@ -1,4 +1,4 @@ -//===-- X86/X86CodeEmitter.cpp - Convert X86 code to machine code ---------===// +//===-- X86CodeEmitter.cpp - Convert X86 code to machine code -------------===// // // The LLVM Compiler Infrastructure // @@ -664,15 +664,14 @@ void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI, case X86II::A7: // 0F A7 Need0FPrefix = true; break; - case X86II::TF: // F2 0F 38 - MCE.emitByte(0xF2); - Need0FPrefix = true; - break; case X86II::REP: break; // already handled. + case X86II::T8XS: // F3 0F 38 case X86II::XS: // F3 0F MCE.emitByte(0xF3); Need0FPrefix = true; break; + case X86II::T8XD: // F2 0F 38 + case X86II::TAXD: // F2 0F 3A case X86II::XD: // F2 0F MCE.emitByte(0xF2); Need0FPrefix = true; @@ -698,10 +697,12 @@ void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI, MCE.emitByte(0x0F); switch (Desc->TSFlags & X86II::Op0Mask) { - case X86II::TF: // F2 0F 38 + case X86II::T8XD: // F2 0F 38 + case X86II::T8XS: // F3 0F 38 case X86II::T8: // 0F 38 MCE.emitByte(0x38); break; + case X86II::TAXD: // F2 0F 38 case X86II::TA: // 0F 3A MCE.emitByte(0x3A); break; @@ -805,8 +806,7 @@ void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI, } assert(MO.isImm() && "Unknown RawFrm operand!"); - if (Opcode == X86::CALLpcrel32 || Opcode == X86::CALL64pcrel32 || - Opcode == X86::WINCALL64pcrel32) { + if (Opcode == X86::CALLpcrel32 || Opcode == X86::CALL64pcrel32) { // Fix up immediate operand for pc relative calls. 
intptr_t Imm = (intptr_t)MO.getImm(); Imm = Imm - MCE.getCurrentPCValue() - 4; @@ -1003,7 +1003,7 @@ void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI, break; } - if (!Desc->isVariadic() && CurOp != NumOps) { + if (!MI.isVariadic() && CurOp != NumOps) { #ifndef NDEBUG dbgs() << "Cannot encode all operands of: " << MI << "\n"; #endif diff --git a/contrib/llvm/lib/Target/X86/X86ELFWriterInfo.cpp b/contrib/llvm/lib/Target/X86/X86ELFWriterInfo.cpp index 4a72d15..c1a49a7 100644 --- a/contrib/llvm/lib/Target/X86/X86ELFWriterInfo.cpp +++ b/contrib/llvm/lib/Target/X86/X86ELFWriterInfo.cpp @@ -60,7 +60,6 @@ unsigned X86ELFWriterInfo::getRelocationType(unsigned MachineRelTy) const { llvm_unreachable("unknown x86 machine relocation type"); } } - return 0; } long int X86ELFWriterInfo::getDefaultAddendForRelTy(unsigned RelTy, @@ -83,7 +82,6 @@ long int X86ELFWriterInfo::getDefaultAddendForRelTy(unsigned RelTy, llvm_unreachable("unknown x86 relocation type"); } } - return 0; } unsigned X86ELFWriterInfo::getRelocationTySize(unsigned RelTy) const { @@ -107,7 +105,6 @@ unsigned X86ELFWriterInfo::getRelocationTySize(unsigned RelTy) const { llvm_unreachable("unknown x86 relocation type"); } } - return 0; } bool X86ELFWriterInfo::isPCRelativeRel(unsigned RelTy) const { @@ -132,7 +129,6 @@ bool X86ELFWriterInfo::isPCRelativeRel(unsigned RelTy) const { llvm_unreachable("unknown x86 relocation type"); } } - return 0; } unsigned X86ELFWriterInfo::getAbsoluteLabelMachineRelTy() const { @@ -146,8 +142,6 @@ long int X86ELFWriterInfo::computeRelocation(unsigned SymOffset, if (RelTy == ELF::R_X86_64_PC32 || RelTy == ELF::R_386_PC32) return SymOffset - (RelOffset + 4); - else - assert(0 && "computeRelocation unknown for this relocation type"); - return 0; + llvm_unreachable("computeRelocation unknown for this relocation type"); } diff --git a/contrib/llvm/lib/Target/X86/X86FastISel.cpp b/contrib/llvm/lib/Target/X86/X86FastISel.cpp index f912b28..69752c5 100644 --- a/contrib/llvm/lib/Target/X86/X86FastISel.cpp +++ b/contrib/llvm/lib/Target/X86/X86FastISel.cpp @@ -60,8 +60,8 @@ public: explicit X86FastISel(FunctionLoweringInfo &funcInfo) : FastISel(funcInfo) { Subtarget = &TM.getSubtarget<X86Subtarget>(); StackPtr = Subtarget->is64Bit() ? X86::RSP : X86::ESP; - X86ScalarSSEf64 = Subtarget->hasSSE2() || Subtarget->hasAVX(); - X86ScalarSSEf32 = Subtarget->hasSSE1() || Subtarget->hasAVX(); + X86ScalarSSEf64 = Subtarget->hasSSE2(); + X86ScalarSSEf32 = Subtarget->hasSSE1(); } virtual bool TargetSelectInstruction(const Instruction *I); @@ -258,6 +258,18 @@ X86FastISel::X86FastEmitStore(EVT VT, unsigned Val, const X86AddressMode &AM) { Opc = X86ScalarSSEf64 ? (Subtarget->hasAVX() ? X86::VMOVSDmr : X86::MOVSDmr) : X86::ST_Fp64m; break; + case MVT::v4f32: + Opc = X86::MOVAPSmr; + break; + case MVT::v2f64: + Opc = X86::MOVAPDmr; + break; + case MVT::v4i32: + case MVT::v2i64: + case MVT::v8i16: + case MVT::v16i8: + Opc = X86::MOVDQAmr; + break; } addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, @@ -671,7 +683,14 @@ bool X86FastISel::X86SelectCallAddress(const Value *V, X86AddressMode &AM) { /// X86SelectStore - Select and emit code to implement store instructions. bool X86FastISel::X86SelectStore(const Instruction *I) { // Atomic stores need special handling. 
- if (cast<StoreInst>(I)->isAtomic()) + const StoreInst *S = cast<StoreInst>(I); + + if (S->isAtomic()) + return false; + + unsigned SABIAlignment = + TD.getABITypeAlignment(S->getValueOperand()->getType()); + if (S->getAlignment() != 0 && S->getAlignment() < SABIAlignment) return false; MVT VT; @@ -709,7 +728,7 @@ bool X86FastISel::X86SelectRet(const Instruction *I) { // fastcc with -tailcallopt is intended to provide a guaranteed // tail call optimization. Fastisel doesn't know how to do that. - if (CC == CallingConv::Fast && GuaranteedTailCallOpt) + if (CC == CallingConv::Fast && TM.Options.GuaranteedTailCallOpt) return false; // Let SDISel handle vararg functions. @@ -818,8 +837,8 @@ bool X86FastISel::X86SelectLoad(const Instruction *I) { static unsigned X86ChooseCmpOpcode(EVT VT, const X86Subtarget *Subtarget) { bool HasAVX = Subtarget->hasAVX(); - bool X86ScalarSSEf32 = HasAVX || Subtarget->hasSSE1(); - bool X86ScalarSSEf64 = HasAVX || Subtarget->hasSSE2(); + bool X86ScalarSSEf32 = Subtarget->hasSSE1(); + bool X86ScalarSSEf64 = Subtarget->hasSSE2(); switch (VT.getSimpleVT().SimpleTy) { default: return 0; @@ -1510,7 +1529,7 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) { // fastcc with -tailcallopt is intended to provide a guaranteed // tail call optimization. Fastisel doesn't know how to do that. - if (CC == CallingConv::Fast && GuaranteedTailCallOpt) + if (CC == CallingConv::Fast && TM.Options.GuaranteedTailCallOpt) return false; PointerType *PT = cast<PointerType>(CS.getCalledValue()->getType()); @@ -1524,7 +1543,7 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) { // Fast-isel doesn't know about callee-pop yet. if (X86::isCalleePop(CC, Subtarget->is64Bit(), isVarArg, - GuaranteedTailCallOpt)) + TM.Options.GuaranteedTailCallOpt)) return false; // Check whether the function can return without sret-demotion. @@ -1557,10 +1576,11 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) { SmallVector<unsigned, 8> Args; SmallVector<MVT, 8> ArgVTs; SmallVector<ISD::ArgFlagsTy, 8> ArgFlags; - Args.reserve(CS.arg_size()); - ArgVals.reserve(CS.arg_size()); - ArgVTs.reserve(CS.arg_size()); - ArgFlags.reserve(CS.arg_size()); + unsigned arg_size = CS.arg_size(); + Args.reserve(arg_size); + ArgVals.reserve(arg_size); + ArgVTs.reserve(arg_size); + ArgFlags.reserve(arg_size); for (ImmutableCallSite::arg_iterator i = CS.arg_begin(), e = CS.arg_end(); i != e; ++i) { // If we're lowering a mem intrinsic instead of a regular call, skip the @@ -1740,9 +1760,11 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) { // If this is a really simple value, emit this with the Value* version // of X86FastEmitStore. If it isn't simple, we don't want to do this, // as it can cause us to reevaluate the argument. - X86FastEmitStore(ArgVT, ArgVal, AM); + if (!X86FastEmitStore(ArgVT, ArgVal, AM)) + return false; } else { - X86FastEmitStore(ArgVT, Arg, AM); + if (!X86FastEmitStore(ArgVT, Arg, AM)) + return false; } } } @@ -1757,7 +1779,7 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) { if (Subtarget->is64Bit() && isVarArg && !Subtarget->isTargetWin64()) { // Count the number of XMM registers allocated. 
- static const unsigned XMMArgRegs[] = { + static const uint16_t XMMArgRegs[] = { X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 }; @@ -1771,9 +1793,7 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) { if (CalleeOp) { // Register-indirect call. unsigned CallOpc; - if (Subtarget->isTargetWin64()) - CallOpc = X86::WINCALL64r; - else if (Subtarget->is64Bit()) + if (Subtarget->is64Bit()) CallOpc = X86::CALL64r; else CallOpc = X86::CALL32r; @@ -1784,9 +1804,7 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) { // Direct call. assert(GV && "Not a direct call"); unsigned CallOpc; - if (Subtarget->isTargetWin64()) - CallOpc = X86::WINCALL64pcrel32; - else if (Subtarget->is64Bit()) + if (Subtarget->is64Bit()) CallOpc = X86::CALL64pcrel32; else CallOpc = X86::CALLpcrel32; @@ -1831,10 +1849,15 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) { for (unsigned i = 0, e = RegArgs.size(); i != e; ++i) MIB.addReg(RegArgs[i]); + // Add a register mask with the call-preserved registers. + // Proper defs for return values will be added by setPhysRegsDeadExcept(). + MIB.addRegMask(TRI.getCallPreservedMask(CS.getCallingConv())); + // Issue CALLSEQ_END unsigned AdjStackUp = TII.getCallFrameDestroyOpcode(); unsigned NumBytesCallee = 0; - if (!Subtarget->is64Bit() && CS.paramHasAttr(1, Attribute::StructRet)) + if (!Subtarget->is64Bit() && !Subtarget->isTargetWindows() && + CS.paramHasAttr(1, Attribute::StructRet)) NumBytesCallee = 4; BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(AdjStackUp)) .addImm(NumBytes).addImm(NumBytesCallee); @@ -2081,7 +2104,7 @@ unsigned X86FastISel::TargetMaterializeAlloca(const AllocaInst *C) { if (!X86SelectAddress(C, AM)) return 0; unsigned Opc = Subtarget->is64Bit() ? X86::LEA64r : X86::LEA32r; - TargetRegisterClass* RC = TLI.getRegClassFor(TLI.getPointerTy()); + const TargetRegisterClass* RC = TLI.getRegClassFor(TLI.getPointerTy()); unsigned ResultReg = createResultReg(RC); addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), ResultReg), AM); @@ -2100,7 +2123,7 @@ unsigned X86FastISel::TargetMaterializeFloatZero(const ConstantFP *CF) { default: return false; case MVT::f32: if (X86ScalarSSEf32) { - Opc = Subtarget->hasAVX() ? X86::VFsFLD0SS : X86::FsFLD0SS; + Opc = X86::FsFLD0SS; RC = X86::FR32RegisterClass; } else { Opc = X86::LD_Fp032; @@ -2109,7 +2132,7 @@ unsigned X86FastISel::TargetMaterializeFloatZero(const ConstantFP *CF) { break; case MVT::f64: if (X86ScalarSSEf64) { - Opc = Subtarget->hasAVX() ? 
X86::VFsFLD0SD : X86::FsFLD0SD; + Opc = X86::FsFLD0SD; RC = X86::FR64RegisterClass; } else { Opc = X86::LD_Fp064; @@ -2156,7 +2179,7 @@ bool X86FastISel::TryToFoldLoad(MachineInstr *MI, unsigned OpNo, namespace llvm { - llvm::FastISel *X86::createFastISel(FunctionLoweringInfo &funcInfo) { + FastISel *X86::createFastISel(FunctionLoweringInfo &funcInfo) { return new X86FastISel(funcInfo); } } diff --git a/contrib/llvm/lib/Target/X86/X86FloatingPoint.cpp b/contrib/llvm/lib/Target/X86/X86FloatingPoint.cpp index e3461c8..ed1707d 100644 --- a/contrib/llvm/lib/Target/X86/X86FloatingPoint.cpp +++ b/contrib/llvm/lib/Target/X86/X86FloatingPoint.cpp @@ -26,8 +26,8 @@ #define DEBUG_TYPE "x86-codegen" #include "X86.h" #include "X86InstrInfo.h" +#include "llvm/InlineAsm.h" #include "llvm/ADT/DepthFirstIterator.h" -#include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" @@ -37,7 +37,6 @@ #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/Passes.h" -#include "llvm/InlineAsm.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" @@ -219,7 +218,7 @@ namespace { /// getSTReg - Return the X86::ST(i) register which contains the specified /// FP<RegNo> register. unsigned getSTReg(unsigned RegNo) const { - return StackTop - 1 - getSlot(RegNo) + llvm::X86::ST0; + return StackTop - 1 - getSlot(RegNo) + X86::ST0; } // pushReg - Push the specified FP<n> register onto the stack. @@ -570,8 +569,8 @@ void FPS::finishBlockStack() { namespace { struct TableEntry { - unsigned from; - unsigned to; + uint16_t from; + uint16_t to; bool operator<(const TableEntry &TE) const { return from < TE.from; } friend bool operator<(const TableEntry &TE, unsigned V) { return TE.from < V; @@ -1644,6 +1643,30 @@ void FPS::handleSpecialFP(MachineBasicBlock::iterator &I) { return; } + case X86::WIN_FTOL_32: + case X86::WIN_FTOL_64: { + // Push the operand into ST0. + MachineOperand &Op = MI->getOperand(0); + assert(Op.isUse() && Op.isReg() && + Op.getReg() >= X86::FP0 && Op.getReg() <= X86::FP6); + unsigned FPReg = getFPReg(Op); + if (Op.isKill()) + moveToTop(FPReg, I); + else + duplicateToTop(FPReg, FPReg, I); + + // Emit the call. This will pop the operand. 
+ BuildMI(*MBB, I, MI->getDebugLoc(), TII->get(X86::CALLpcrel32)) + .addExternalSymbol("_ftol2") + .addReg(X86::ST0, RegState::ImplicitKill) + .addReg(X86::EAX, RegState::Define | RegState::Implicit) + .addReg(X86::EDX, RegState::Define | RegState::Implicit) + .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit); + --StackTop; + + break; + } + case X86::RET: case X86::RETI: // If RET has an FP register use operand, pass the first one in ST(0) and diff --git a/contrib/llvm/lib/Target/X86/X86FrameLowering.cpp b/contrib/llvm/lib/Target/X86/X86FrameLowering.cpp index d54f4ae..000e375 100644 --- a/contrib/llvm/lib/Target/X86/X86FrameLowering.cpp +++ b/contrib/llvm/lib/Target/X86/X86FrameLowering.cpp @@ -1,4 +1,4 @@ -//=======- X86FrameLowering.cpp - X86 Frame Information --------*- C++ -*-====// +//===-- X86FrameLowering.cpp - X86 Frame Information ----------------------===// // // The LLVM Compiler Infrastructure // @@ -47,7 +47,7 @@ bool X86FrameLowering::hasFP(const MachineFunction &MF) const { const MachineModuleInfo &MMI = MF.getMMI(); const TargetRegisterInfo *RI = TM.getRegisterInfo(); - return (DisableFramePointerElim(MF) || + return (MF.getTarget().Options.DisableFramePointerElim(MF) || RI->needsStackRealignment(MF) || MFI->hasVarSizedObjects() || MFI->isFrameAddressTaken() || @@ -79,6 +79,10 @@ static unsigned getADDriOpcode(unsigned is64Bit, int64_t Imm) { } } +static unsigned getLEArOpcode(unsigned is64Bit) { + return is64Bit ? X86::LEA64r : X86::LEA32r; +} + /// findDeadCallerSavedReg - Return a caller-saved register that isn't live /// when it reaches the "return" instruction. We can then pop a stack object /// to this register without worry about clobbering it. @@ -91,11 +95,11 @@ static unsigned findDeadCallerSavedReg(MachineBasicBlock &MBB, if (!F || MF->getMMI().callsEHReturn()) return 0; - static const unsigned CallerSavedRegs32Bit[] = { + static const uint16_t CallerSavedRegs32Bit[] = { X86::EAX, X86::EDX, X86::ECX, 0 }; - static const unsigned CallerSavedRegs64Bit[] = { + static const uint16_t CallerSavedRegs64Bit[] = { X86::RAX, X86::RDX, X86::RCX, X86::RSI, X86::RDI, X86::R8, X86::R9, X86::R10, X86::R11, 0 }; @@ -113,7 +117,7 @@ static unsigned findDeadCallerSavedReg(MachineBasicBlock &MBB, case X86::TCRETURNmi64: case X86::EH_RETURN: case X86::EH_RETURN64: { - SmallSet<unsigned, 8> Uses; + SmallSet<uint16_t, 8> Uses; for (unsigned i = 0, e = MBBI->getNumOperands(); i != e; ++i) { MachineOperand &MO = MBBI->getOperand(i); if (!MO.isReg() || MO.isDef()) @@ -121,11 +125,11 @@ static unsigned findDeadCallerSavedReg(MachineBasicBlock &MBB, unsigned Reg = MO.getReg(); if (!Reg) continue; - for (const unsigned *AsI = TRI.getOverlaps(Reg); *AsI; ++AsI) + for (const uint16_t *AsI = TRI.getOverlaps(Reg); *AsI; ++AsI) Uses.insert(*AsI); } - const unsigned *CS = Is64Bit ? CallerSavedRegs64Bit : CallerSavedRegs32Bit; + const uint16_t *CS = Is64Bit ? CallerSavedRegs64Bit : CallerSavedRegs32Bit; for (; *CS; ++CS) if (!Uses.count(*CS)) return *CS; @@ -141,13 +145,18 @@ static unsigned findDeadCallerSavedReg(MachineBasicBlock &MBB, static void emitSPUpdate(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, unsigned StackPtr, int64_t NumBytes, - bool Is64Bit, const TargetInstrInfo &TII, - const TargetRegisterInfo &TRI) { + bool Is64Bit, bool UseLEA, + const TargetInstrInfo &TII, const TargetRegisterInfo &TRI) { bool isSub = NumBytes < 0; uint64_t Offset = isSub ? -NumBytes : NumBytes; - unsigned Opc = isSub ? 
- getSUBriOpcode(Is64Bit, Offset) : - getADDriOpcode(Is64Bit, Offset); + unsigned Opc; + if (UseLEA) + Opc = getLEArOpcode(Is64Bit); + else + Opc = isSub + ? getSUBriOpcode(Is64Bit, Offset) + : getADDriOpcode(Is64Bit, Offset); + uint64_t Chunk = (1LL << 31) - 1; DebugLoc DL = MBB.findDebugLoc(MBBI); @@ -171,13 +180,21 @@ void emitSPUpdate(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, } } - MachineInstr *MI = - BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr) - .addReg(StackPtr) - .addImm(ThisVal); + MachineInstr *MI = NULL; + + if (UseLEA) { + MI = addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr), + StackPtr, false, isSub ? -ThisVal : ThisVal); + } else { + MI = BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr) + .addReg(StackPtr) + .addImm(ThisVal); + MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead. + } + if (isSub) MI->setFlag(MachineInstr::FrameSetup); - MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead. + Offset -= ThisVal; } } @@ -191,7 +208,8 @@ void mergeSPUpdatesUp(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, MachineBasicBlock::iterator PI = prior(MBBI); unsigned Opc = PI->getOpcode(); if ((Opc == X86::ADD64ri32 || Opc == X86::ADD64ri8 || - Opc == X86::ADD32ri || Opc == X86::ADD32ri8) && + Opc == X86::ADD32ri || Opc == X86::ADD32ri8 || + Opc == X86::LEA32r || Opc == X86::LEA64_32r) && PI->getOperand(0).getReg() == StackPtr) { if (NumBytes) *NumBytes += PI->getOperand(2).getImm(); @@ -210,7 +228,7 @@ static void mergeSPUpdatesDown(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, unsigned StackPtr, uint64_t *NumBytes = NULL) { - // FIXME: THIS ISN'T RUN!!! + // FIXME: THIS ISN'T RUN!!! return; if (MBBI == MBB.end()) return; @@ -237,8 +255,8 @@ void mergeSPUpdatesDown(MachineBasicBlock &MBB, } /// mergeSPUpdates - Checks the instruction before/after the passed -/// instruction. If it is an ADD/SUB instruction it is deleted argument and the -/// stack adjustment is returned as a positive value for ADD and a negative for +/// instruction. If it is an ADD/SUB/LEA instruction it is deleted argument and the +/// stack adjustment is returned as a positive value for ADD/LEA and a negative for /// SUB. static int mergeSPUpdates(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, @@ -254,7 +272,8 @@ static int mergeSPUpdates(MachineBasicBlock &MBB, int Offset = 0; if ((Opc == X86::ADD64ri32 || Opc == X86::ADD64ri8 || - Opc == X86::ADD32ri || Opc == X86::ADD32ri8) && + Opc == X86::ADD32ri || Opc == X86::ADD32ri8 || + Opc == X86::LEA32r || Opc == X86::LEA64_32r) && PI->getOperand(0).getReg() == StackPtr){ Offset += PI->getOperand(2).getImm(); MBB.erase(PI); @@ -351,20 +370,22 @@ void X86FrameLowering::emitCalleeSavedFrameMoves(MachineFunction &MF, /// register. The number corresponds to the enum lists in /// compact_unwind_encoding.h. static int getCompactUnwindRegNum(const unsigned *CURegs, unsigned Reg) { - int Idx = 1; - for (; *CURegs; ++CURegs, ++Idx) + for (int Idx = 1; *CURegs; ++CURegs, ++Idx) if (*CURegs == Reg) return Idx; return -1; } +// Number of registers that can be saved in a compact unwind encoding. +#define CU_NUM_SAVED_REGS 6 + /// encodeCompactUnwindRegistersWithoutFrame - Create the permutation encoding /// used with frameless stacks. It is passed the number of registers to be saved /// and an array of the registers saved. 
-static uint32_t encodeCompactUnwindRegistersWithoutFrame(unsigned SavedRegs[6], - unsigned RegCount, - bool Is64Bit) { +static uint32_t +encodeCompactUnwindRegistersWithoutFrame(unsigned SavedRegs[CU_NUM_SAVED_REGS], + unsigned RegCount, bool Is64Bit) { // The saved registers are numbered from 1 to 6. In order to encode the order // in which they were saved, we re-number them according to their place in the // register order. The re-numbering is relative to the last re-numbered @@ -385,14 +406,21 @@ static uint32_t encodeCompactUnwindRegistersWithoutFrame(unsigned SavedRegs[6], }; const unsigned *CURegs = (Is64Bit ? CU64BitRegs : CU32BitRegs); - uint32_t RenumRegs[6]; - for (unsigned i = 6 - RegCount; i < 6; ++i) { + for (unsigned i = 0; i != CU_NUM_SAVED_REGS; ++i) { int CUReg = getCompactUnwindRegNum(CURegs, SavedRegs[i]); if (CUReg == -1) return ~0U; SavedRegs[i] = CUReg; + } + // Reverse the list. + std::swap(SavedRegs[0], SavedRegs[5]); + std::swap(SavedRegs[1], SavedRegs[4]); + std::swap(SavedRegs[2], SavedRegs[3]); + + uint32_t RenumRegs[CU_NUM_SAVED_REGS]; + for (unsigned i = CU_NUM_SAVED_REGS - RegCount; i < CU_NUM_SAVED_REGS; ++i) { unsigned Countless = 0; - for (unsigned j = 6 - RegCount; j < i; ++j) + for (unsigned j = CU_NUM_SAVED_REGS - RegCount; j < i; ++j) if (SavedRegs[j] < SavedRegs[i]) ++Countless; @@ -435,8 +463,9 @@ static uint32_t encodeCompactUnwindRegistersWithoutFrame(unsigned SavedRegs[6], /// encodeCompactUnwindRegistersWithFrame - Return the registers encoded for a /// compact encoding with a frame pointer. -static uint32_t encodeCompactUnwindRegistersWithFrame(unsigned SavedRegs[6], - bool Is64Bit) { +static uint32_t +encodeCompactUnwindRegistersWithFrame(unsigned SavedRegs[CU_NUM_SAVED_REGS], + bool Is64Bit) { static const unsigned CU32BitRegs[] = { X86::EBX, X86::ECX, X86::EDX, X86::EDI, X86::ESI, X86::EBP, 0 }; @@ -446,18 +475,21 @@ static uint32_t encodeCompactUnwindRegistersWithFrame(unsigned SavedRegs[6], const unsigned *CURegs = (Is64Bit ? CU64BitRegs : CU32BitRegs); // Encode the registers in the order they were saved, 3-bits per register. The - // registers are numbered from 1 to 6. + // registers are numbered from 1 to CU_NUM_SAVED_REGS. uint32_t RegEnc = 0; - for (int I = 5; I >= 0; --I) { + for (int I = CU_NUM_SAVED_REGS - 1, Idx = 0; I != -1; --I) { unsigned Reg = SavedRegs[I]; - if (Reg == 0) break; + if (Reg == 0) continue; + int CURegNum = getCompactUnwindRegNum(CURegs, Reg); - if (CURegNum == -1) - return ~0U; - RegEnc |= (CURegNum & 0x7) << (5 - I); + if (CURegNum == -1) return ~0U; + + // Encode the 3-bit register number in order, skipping over 3-bits for each + // register. + RegEnc |= (CURegNum & 0x7) << (Idx++ * 3); } - assert((RegEnc & 0x7FFF) == RegEnc && "Invalid compact register encoding!"); + assert((RegEnc & 0x3FFFF) == RegEnc && "Invalid compact register encoding!"); return RegEnc; } @@ -466,14 +498,11 @@ uint32_t X86FrameLowering::getCompactUnwindEncoding(MachineFunction &MF) const { unsigned FramePtr = RegInfo->getFrameRegister(MF); unsigned StackPtr = RegInfo->getStackRegister(); - X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); - int TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta(); - bool Is64Bit = STI.is64Bit(); bool HasFP = hasFP(MF); - unsigned SavedRegs[6] = { 0, 0, 0, 0, 0, 0 }; - int SavedRegIdx = 6; + unsigned SavedRegs[CU_NUM_SAVED_REGS] = { 0, 0, 0, 0, 0, 0 }; + unsigned SavedRegIdx = 0; unsigned OffsetSize = (Is64Bit ? 
8 : 4); @@ -481,14 +510,13 @@ uint32_t X86FrameLowering::getCompactUnwindEncoding(MachineFunction &MF) const { unsigned PushInstrSize = 1; unsigned MoveInstr = (Is64Bit ? X86::MOV64rr : X86::MOV32rr); unsigned MoveInstrSize = (Is64Bit ? 3 : 2); - unsigned SubtractInstr = getSUBriOpcode(Is64Bit, -TailCallReturnAddrDelta); unsigned SubtractInstrIdx = (Is64Bit ? 3 : 2); unsigned StackDivide = (Is64Bit ? 8 : 4); unsigned InstrOffset = 0; - unsigned CFAOffset = 0; unsigned StackAdjust = 0; + unsigned StackSize = 0; MachineBasicBlock &MBB = MF.front(); // Prologue is in entry BB. bool ExpectEnd = false; @@ -504,10 +532,10 @@ uint32_t X86FrameLowering::getCompactUnwindEncoding(MachineFunction &MF) const { if (Opc == PushInstr) { // If there are too many saved registers, we cannot use compact encoding. - if (--SavedRegIdx < 0) return 0; + if (SavedRegIdx >= CU_NUM_SAVED_REGS) return 0; - SavedRegs[SavedRegIdx] = MI.getOperand(0).getReg(); - CFAOffset += OffsetSize; + SavedRegs[SavedRegIdx++] = MI.getOperand(0).getReg(); + StackAdjust += OffsetSize; InstrOffset += PushInstrSize; } else if (Opc == MoveInstr) { unsigned SrcReg = MI.getOperand(1).getReg(); @@ -516,12 +544,14 @@ uint32_t X86FrameLowering::getCompactUnwindEncoding(MachineFunction &MF) const { if (DstReg != FramePtr || SrcReg != StackPtr) return 0; - CFAOffset = 0; + StackAdjust = 0; memset(SavedRegs, 0, sizeof(SavedRegs)); + SavedRegIdx = 0; InstrOffset += MoveInstrSize; - } else if (Opc == SubtractInstr) { - if (StackAdjust) - // We all ready have a stack pointer adjustment. + } else if (Opc == X86::SUB64ri32 || Opc == X86::SUB64ri8 || + Opc == X86::SUB32ri || Opc == X86::SUB32ri8) { + if (StackSize) + // We already have a stack size. return 0; if (!MI.getOperand(0).isReg() || @@ -532,7 +562,7 @@ uint32_t X86FrameLowering::getCompactUnwindEncoding(MachineFunction &MF) const { // %RSP<def> = SUB64ri8 %RSP, 48 return 0; - StackAdjust = MI.getOperand(2).getImm() / StackDivide; + StackSize = MI.getOperand(2).getImm() / StackDivide; SubtractInstrIdx += InstrOffset; ExpectEnd = true; } @@ -540,28 +570,30 @@ uint32_t X86FrameLowering::getCompactUnwindEncoding(MachineFunction &MF) const { // Encode that we are using EBP/RBP as the frame pointer. uint32_t CompactUnwindEncoding = 0; - CFAOffset /= StackDivide; + StackAdjust /= StackDivide; if (HasFP) { - if ((CFAOffset & 0xFF) != CFAOffset) + if ((StackAdjust & 0xFF) != StackAdjust) // Offset was too big for compact encoding. return 0; // Get the encoding of the saved registers when we have a frame pointer. uint32_t RegEnc = encodeCompactUnwindRegistersWithFrame(SavedRegs, Is64Bit); - if (RegEnc == ~0U) - return 0; + if (RegEnc == ~0U) return 0; CompactUnwindEncoding |= 0x01000000; - CompactUnwindEncoding |= (CFAOffset & 0xFF) << 16; + CompactUnwindEncoding |= (StackAdjust & 0xFF) << 16; CompactUnwindEncoding |= RegEnc & 0x7FFF; } else { - unsigned FullOffset = CFAOffset + StackAdjust; - if ((FullOffset & 0xFF) == FullOffset) { - // Frameless stack. + ++StackAdjust; + uint32_t TotalStackSize = StackAdjust + StackSize; + if ((TotalStackSize & 0xFF) == TotalStackSize) { + // Frameless stack with a small stack size. CompactUnwindEncoding |= 0x02000000; - CompactUnwindEncoding |= (FullOffset & 0xFF) << 16; + + // Encode the stack size. + CompactUnwindEncoding |= (TotalStackSize & 0xFF) << 16; } else { - if ((CFAOffset & 0x7) != CFAOffset) + if ((StackAdjust & 0x7) != StackAdjust) // The extra stack adjustments are too big for us to handle. 
return 0; @@ -572,16 +604,21 @@ uint32_t X86FrameLowering::getCompactUnwindEncoding(MachineFunction &MF) const { // instruction. CompactUnwindEncoding |= (SubtractInstrIdx & 0xFF) << 16; - // Encode any extra stack stack changes (done via push instructions). - CompactUnwindEncoding |= (CFAOffset & 0x7) << 13; + // Encode any extra stack stack adjustments (done via push instructions). + CompactUnwindEncoding |= (StackAdjust & 0x7) << 13; } + // Encode the number of registers saved. + CompactUnwindEncoding |= (SavedRegIdx & 0x7) << 10; + // Get the encoding of the saved registers when we don't have a frame // pointer. - uint32_t RegEnc = encodeCompactUnwindRegistersWithoutFrame(SavedRegs, - 6 - SavedRegIdx, - Is64Bit); + uint32_t RegEnc = + encodeCompactUnwindRegistersWithoutFrame(SavedRegs, SavedRegIdx, + Is64Bit); if (RegEnc == ~0U) return 0; + + // Encode the register encoding. CompactUnwindEncoding |= RegEnc & 0x3FF; } @@ -608,6 +645,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { bool HasFP = hasFP(MF); bool Is64Bit = STI.is64Bit(); bool IsWin64 = STI.isTargetWin64(); + bool UseLEA = STI.useLeaForSP(); unsigned StackAlign = getStackAlignment(); unsigned SlotSize = RegInfo->getSlotSize(); unsigned FramePtr = RegInfo->getFrameRegister(MF); @@ -637,10 +675,10 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { // stack pointer (we fit in the Red Zone). if (Is64Bit && !Fn->hasFnAttr(Attribute::NoRedZone) && !RegInfo->needsStackRealignment(MF) && - !MFI->hasVarSizedObjects() && // No dynamic alloca. - !MFI->adjustsStack() && // No calls. - !IsWin64 && // Win64 has no Red Zone - !EnableSegmentedStacks) { // Regular stack + !MFI->hasVarSizedObjects() && // No dynamic alloca. + !MFI->adjustsStack() && // No calls. + !IsWin64 && // Win64 has no Red Zone + !MF.getTarget().Options.EnableSegmentedStacks) { // Regular stack uint64_t MinSize = X86FI->getCalleeSavedFrameSize(); if (HasFP) MinSize += SlotSize; StackSize = std::max(MinSize, StackSize > 128 ? StackSize - 128 : 0); @@ -861,7 +899,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { // FIXME: %rax preserves the offset and should be available. if (isSPUpdateNeeded) emitSPUpdate(MBB, MBBI, StackPtr, -(int64_t)NumBytes, Is64Bit, - TII, *RegInfo); + UseLEA, TII, *RegInfo); if (isEAXAlive) { // Restore EAX @@ -873,7 +911,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { } } else if (NumBytes) emitSPUpdate(MBB, MBBI, StackPtr, -(int64_t)NumBytes, Is64Bit, - TII, *RegInfo); + UseLEA, TII, *RegInfo); if (( (!HasFP && NumBytes) || PushedRegs) && needsFrameMoves) { // Mark end of stack pointer adjustment. @@ -917,6 +955,7 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, unsigned RetOpcode = MBBI->getOpcode(); DebugLoc DL = MBBI->getDebugLoc(); bool Is64Bit = STI.is64Bit(); + bool UseLEA = STI.useLeaForSP(); unsigned StackAlign = getStackAlignment(); unsigned SlotSize = RegInfo->getSlotSize(); unsigned FramePtr = RegInfo->getFrameRegister(MF); @@ -977,7 +1016,7 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, unsigned Opc = PI->getOpcode(); if (Opc != X86::POP32r && Opc != X86::POP64r && Opc != X86::DBG_VALUE && - !PI->getDesc().isTerminator()) + !PI->isTerminator()) break; --MBBI; @@ -997,7 +1036,8 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, // We cannot use LEA here, because stack pointer was realigned. We need to // deallocate local frame back. 
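As a reading aid for the frameless branch of getCompactUnwindEncoding above, the sketch below repacks the "small stack" word from the shifts and masks visible in the hunks (mode bits, stack size, saved-register count, register permutation). It is illustrative only; the field names are informal and nothing here is the patch's code.

#include <cstdint>
#include <cstdio>

static uint32_t packFramelessSmall(uint32_t StackSizeInSlots,
                                   uint32_t SavedRegCount,
                                   uint32_t RegPermutation) {
  uint32_t Enc = 0x02000000;               // frameless, immediate stack size
  Enc |= (StackSizeInSlots & 0xFF) << 16;  // stack size, in 4- or 8-byte slots
  Enc |= (SavedRegCount & 0x7) << 10;      // number of pushed registers
  Enc |= RegPermutation & 0x3FF;           // permutation of the saved registers
  return Enc;
}

int main() {
  // Six slots of stack, two saved registers, permutation value 5.
  std::printf("0x%08x\n", packFramelessSmall(6, 2, 5)); // prints 0x02060805
  return 0;
}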
if (CSSize) { - emitSPUpdate(MBB, MBBI, StackPtr, NumBytes, Is64Bit, TII, *RegInfo); + emitSPUpdate(MBB, MBBI, StackPtr, NumBytes, Is64Bit, UseLEA, TII, + *RegInfo); MBBI = prior(LastCSPop); } @@ -1018,7 +1058,7 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, } } else if (NumBytes) { // Adjust stack pointer back: ESP += numbytes. - emitSPUpdate(MBB, MBBI, StackPtr, NumBytes, Is64Bit, TII, *RegInfo); + emitSPUpdate(MBB, MBBI, StackPtr, NumBytes, Is64Bit, UseLEA, TII, *RegInfo); } // We're returning from function via eh_return. @@ -1053,7 +1093,7 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, if (Offset) { // Check for possible merge with preceding ADD instruction. Offset += mergeSPUpdates(MBB, MBBI, StackPtr, true); - emitSPUpdate(MBB, MBBI, StackPtr, Offset, Is64Bit, TII, *RegInfo); + emitSPUpdate(MBB, MBBI, StackPtr, Offset, Is64Bit, UseLEA, TII, *RegInfo); } // Jump to label or value in register. @@ -1097,7 +1137,7 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, // Check for possible merge with preceding ADD instruction. delta += mergeSPUpdates(MBB, MBBI, StackPtr, true); - emitSPUpdate(MBB, MBBI, StackPtr, delta, Is64Bit, TII, *RegInfo); + emitSPUpdate(MBB, MBBI, StackPtr, delta, Is64Bit, UseLEA, TII, *RegInfo); } } @@ -1280,31 +1320,35 @@ HasNestArgument(const MachineFunction *MF) { return false; } + +/// GetScratchRegister - Get a register for performing work in the segmented +/// stack prologue. Depending on platform and the properties of the function +/// either one or two registers will be needed. Set primary to true for +/// the first register, false for the second. static unsigned -GetScratchRegister(bool Is64Bit, const MachineFunction &MF) { - if (Is64Bit) { - return X86::R11; - } else { - CallingConv::ID CallingConvention = MF.getFunction()->getCallingConv(); - bool IsNested = HasNestArgument(&MF); - - if (CallingConvention == CallingConv::X86_FastCall) { - if (IsNested) { - report_fatal_error("Segmented stacks does not support fastcall with " - "nested function."); - return -1; - } else { - return X86::EAX; - } - } else { - if (IsNested) - return X86::EDX; - else - return X86::ECX; - } +GetScratchRegister(bool Is64Bit, const MachineFunction &MF, bool Primary) { + if (Is64Bit) + return Primary ? X86::R11 : X86::R12; + + CallingConv::ID CallingConvention = MF.getFunction()->getCallingConv(); + bool IsNested = HasNestArgument(&MF); + + if (CallingConvention == CallingConv::X86_FastCall || + CallingConvention == CallingConv::Fast) { + if (IsNested) + report_fatal_error("Segmented stacks does not support fastcall with " + "nested function."); + return Primary ? X86::EAX : X86::ECX; } + if (IsNested) + return Primary ? X86::EDX : X86::EAX; + return Primary ? X86::ECX : X86::EAX; } +// The stack limit in the TCB is set to this many bytes above the actual stack +// limit. 
+static const uint64_t kSplitStackAvailable = 256; + void X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const { MachineBasicBlock &prologueMBB = MF.front(); @@ -1316,14 +1360,15 @@ X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const { DebugLoc DL; const X86Subtarget *ST = &MF.getTarget().getSubtarget<X86Subtarget>(); - unsigned ScratchReg = GetScratchRegister(Is64Bit, MF); + unsigned ScratchReg = GetScratchRegister(Is64Bit, MF, true); assert(!MF.getRegInfo().isLiveIn(ScratchReg) && "Scratch register is live-in"); if (MF.getFunction()->isVarArg()) report_fatal_error("Segmented stacks do not support vararg functions."); - if (!ST->isTargetLinux()) - report_fatal_error("Segmented stacks supported only on linux."); + if (!ST->isTargetLinux() && !ST->isTargetDarwin() && + !ST->isTargetWin32() && !ST->isTargetFreeBSD()) + report_fatal_error("Segmented stacks not supported on this platform."); MachineBasicBlock *allocMBB = MF.CreateMachineBasicBlock(); MachineBasicBlock *checkMBB = MF.CreateMachineBasicBlock(); @@ -1336,26 +1381,16 @@ X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const { // The MOV R10, RAX needs to be in a different block, since the RET we emit in // allocMBB needs to be last (terminating) instruction. - MachineBasicBlock *restoreR10MBB = NULL; - if (IsNested) - restoreR10MBB = MF.CreateMachineBasicBlock(); for (MachineBasicBlock::livein_iterator i = prologueMBB.livein_begin(), e = prologueMBB.livein_end(); i != e; i++) { allocMBB->addLiveIn(*i); checkMBB->addLiveIn(*i); - - if (IsNested) - restoreR10MBB->addLiveIn(*i); } - if (IsNested) { + if (IsNested) allocMBB->addLiveIn(X86::R10); - restoreR10MBB->addLiveIn(X86::RAX); - } - if (IsNested) - MF.push_front(restoreR10MBB); MF.push_front(allocMBB); MF.push_front(checkMBB); @@ -1364,28 +1399,99 @@ X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const { // prologue. StackSize = MFI->getStackSize(); + // When the frame size is less than 256 we just compare the stack + // boundary directly to the value of the stack pointer, per gcc. + bool CompareStackPointer = StackSize < kSplitStackAvailable; + // Read the limit off the current stacklet off the stack_guard location. if (Is64Bit) { - TlsReg = X86::FS; - TlsOffset = 0x70; + if (ST->isTargetLinux()) { + TlsReg = X86::FS; + TlsOffset = 0x70; + } else if (ST->isTargetDarwin()) { + TlsReg = X86::GS; + TlsOffset = 0x60 + 90*8; // See pthread_machdep.h. Steal TLS slot 90. 
+ } else if (ST->isTargetFreeBSD()) { + TlsReg = X86::FS; + TlsOffset = 0x18; + } else { + report_fatal_error("Segmented stacks not supported on this platform."); + } + + if (CompareStackPointer) + ScratchReg = X86::RSP; + else + BuildMI(checkMBB, DL, TII.get(X86::LEA64r), ScratchReg).addReg(X86::RSP) + .addImm(1).addReg(0).addImm(-StackSize).addReg(0); - BuildMI(checkMBB, DL, TII.get(X86::LEA64r), ScratchReg).addReg(X86::RSP) - .addImm(0).addReg(0).addImm(-StackSize).addReg(0); BuildMI(checkMBB, DL, TII.get(X86::CMP64rm)).addReg(ScratchReg) - .addReg(0).addImm(0).addReg(0).addImm(TlsOffset).addReg(TlsReg); + .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg); } else { - TlsReg = X86::GS; - TlsOffset = 0x30; + if (ST->isTargetLinux()) { + TlsReg = X86::GS; + TlsOffset = 0x30; + } else if (ST->isTargetDarwin()) { + TlsReg = X86::GS; + TlsOffset = 0x48 + 90*4; + } else if (ST->isTargetWin32()) { + TlsReg = X86::FS; + TlsOffset = 0x14; // pvArbitrary, reserved for application use + } else if (ST->isTargetFreeBSD()) { + report_fatal_error("Segmented stacks not supported on FreeBSD i386."); + } else { + report_fatal_error("Segmented stacks not supported on this platform."); + } + + if (CompareStackPointer) + ScratchReg = X86::ESP; + else + BuildMI(checkMBB, DL, TII.get(X86::LEA32r), ScratchReg).addReg(X86::ESP) + .addImm(1).addReg(0).addImm(-StackSize).addReg(0); + + if (ST->isTargetLinux() || ST->isTargetWin32()) { + BuildMI(checkMBB, DL, TII.get(X86::CMP32rm)).addReg(ScratchReg) + .addReg(0).addImm(0).addReg(0).addImm(TlsOffset).addReg(TlsReg); + } else if (ST->isTargetDarwin()) { + + // TlsOffset doesn't fit into a mod r/m byte so we need an extra register + unsigned ScratchReg2; + bool SaveScratch2; + if (CompareStackPointer) { + // The primary scratch register is available for holding the TLS offset + ScratchReg2 = GetScratchRegister(Is64Bit, MF, true); + SaveScratch2 = false; + } else { + // Need to use a second register to hold the TLS offset + ScratchReg2 = GetScratchRegister(Is64Bit, MF, false); + + // Unfortunately, with fastcc the second scratch register may hold an arg + SaveScratch2 = MF.getRegInfo().isLiveIn(ScratchReg2); + } - BuildMI(checkMBB, DL, TII.get(X86::LEA32r), ScratchReg).addReg(X86::ESP) - .addImm(0).addReg(0).addImm(-StackSize).addReg(0); - BuildMI(checkMBB, DL, TII.get(X86::CMP32rm)).addReg(ScratchReg) - .addReg(0).addImm(0).addReg(0).addImm(TlsOffset).addReg(TlsReg); + // If Scratch2 is live-in then it needs to be saved + assert((!MF.getRegInfo().isLiveIn(ScratchReg2) || SaveScratch2) && + "Scratch register is live-in and not saved"); + + if (SaveScratch2) + BuildMI(checkMBB, DL, TII.get(X86::PUSH32r)) + .addReg(ScratchReg2, RegState::Kill); + + BuildMI(checkMBB, DL, TII.get(X86::MOV32ri), ScratchReg2) + .addImm(TlsOffset); + BuildMI(checkMBB, DL, TII.get(X86::CMP32rm)) + .addReg(ScratchReg) + .addReg(ScratchReg2).addImm(1).addReg(0) + .addImm(0) + .addReg(TlsReg); + + if (SaveScratch2) + BuildMI(checkMBB, DL, TII.get(X86::POP32r), ScratchReg2); + } } // This jump is taken if SP >= (Stacklet Limit + Stack Space required). // It jumps to normal execution of the function body. - BuildMI(checkMBB, DL, TII.get(X86::JG_4)).addMBB(&prologueMBB); + BuildMI(checkMBB, DL, TII.get(X86::JA_4)).addMBB(&prologueMBB); // On 32 bit we first push the arguments size and then the frame size. On 64 // bit, we pass the stack frame size in r10 and the argument size in r11. 
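The segmented-stack check assembled in checkMBB above compares either the stack pointer itself (when the frame is smaller than kSplitStackAvailable) or the stack pointer minus the frame size against the limit stored in the platform's TLS slot, and only falls through to the __morestack path when that probe is not above the limit. The function below is an informal model of that decision, not the emitted machine code.

#include <cstdint>
#include <cstdio>

static const uint64_t kSplitStackAvailable = 256; // mirrors the constant above

// Returns true when the prologue must take the __morestack path.
static bool needMoreStack(uintptr_t StackPtr, uintptr_t StackLimitFromTLS,
                          uint64_t FrameSize) {
  // Small frames fit in the slack the runtime keeps above the real limit,
  // so the stack pointer is compared directly against the stored limit.
  uintptr_t Probe =
      FrameSize < kSplitStackAvailable ? StackPtr : StackPtr - FrameSize;
  // The emitted JA_4 skips the allocation when Probe > limit (unsigned
  // compare), so __morestack runs only when this returns true.
  return Probe <= StackLimitFromTLS;
}

int main() {
  // Stack grows down: limit at 0x1000, SP at 0x1400, 0x800-byte frame needed.
  std::printf("%d\n", needMoreStack(0x1400, 0x1000, 0x800)); // prints 1
  return 0;
}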
@@ -1403,9 +1509,6 @@ X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const { MF.getRegInfo().setPhysRegUsed(X86::R10); MF.getRegInfo().setPhysRegUsed(X86::R11); } else { - // Since we'll call __morestack, stack alignment needs to be preserved. - BuildMI(allocMBB, DL, TII.get(X86::SUB32ri), X86::ESP).addReg(X86::ESP) - .addImm(8); BuildMI(allocMBB, DL, TII.get(X86::PUSHi32)) .addImm(X86FI->getArgumentStackSize()); BuildMI(allocMBB, DL, TII.get(X86::PUSHi32)) @@ -1420,23 +1523,12 @@ X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const { BuildMI(allocMBB, DL, TII.get(X86::CALLpcrel32)) .addExternalSymbol("__morestack"); - // __morestack only seems to remove 8 bytes off the stack. Add back the - // additional 8 bytes we added before pushing the arguments. - if (!Is64Bit) - BuildMI(allocMBB, DL, TII.get(X86::ADD32ri), X86::ESP).addReg(X86::ESP) - .addImm(8); - BuildMI(allocMBB, DL, TII.get(X86::RET)); - if (IsNested) - BuildMI(restoreR10MBB, DL, TII.get(X86::MOV64rr), X86::R10) - .addReg(X86::RAX); + BuildMI(allocMBB, DL, TII.get(X86::MORESTACK_RET_RESTORE_R10)); + else + BuildMI(allocMBB, DL, TII.get(X86::MORESTACK_RET)); - if (IsNested) { - allocMBB->addSuccessor(restoreR10MBB); - restoreR10MBB->addSuccessor(&prologueMBB); - } else { - allocMBB->addSuccessor(&prologueMBB); - } + allocMBB->addSuccessor(&prologueMBB); checkMBB->addSuccessor(allocMBB); checkMBB->addSuccessor(&prologueMBB); diff --git a/contrib/llvm/lib/Target/X86/X86FrameLowering.h b/contrib/llvm/lib/Target/X86/X86FrameLowering.h index 6f49064..d55a497 100644 --- a/contrib/llvm/lib/Target/X86/X86FrameLowering.h +++ b/contrib/llvm/lib/Target/X86/X86FrameLowering.h @@ -1,4 +1,4 @@ -//=-- X86TargetFrameLowering.h - Define frame lowering for X86 ---*- C++ -*-===// +//===-- X86TargetFrameLowering.h - Define frame lowering for X86 -*- C++ -*-==// // // The LLVM Compiler Infrastructure // diff --git a/contrib/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/contrib/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp index 02b0ff2..8e2b1d6 100644 --- a/contrib/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/contrib/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -21,7 +21,6 @@ #include "X86TargetMachine.h" #include "llvm/Instructions.h" #include "llvm/Intrinsics.h" -#include "llvm/Support/CFG.h" #include "llvm/Type.h" #include "llvm/CodeGen/FunctionLoweringInfo.h" #include "llvm/CodeGen/MachineConstantPool.h" @@ -32,11 +31,11 @@ #include "llvm/CodeGen/SelectionDAGISel.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" +#include "llvm/Support/CFG.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" using namespace llvm; @@ -540,7 +539,7 @@ void X86DAGToDAGISel::EmitSpecialCodeForMain(MachineBasicBlock *BB, const TargetInstrInfo *TII = TM.getInstrInfo(); if (Subtarget->isTargetCygMing()) { unsigned CallOp = - Subtarget->is64Bit() ? X86::WINCALL64pcrel32 : X86::CALLpcrel32; + Subtarget->is64Bit() ? X86::CALL64pcrel32 : X86::CALLpcrel32; BuildMI(BB, DebugLoc(), TII->get(CallOp)).addExternalSymbol("__main"); } @@ -621,14 +620,14 @@ bool X86DAGToDAGISel::MatchWrapper(SDValue N, X86ISelAddressMode &AM) { // Handle X86-64 rip-relative addresses. We check this before checking direct // folding because RIP is preferable to non-RIP accesses. 
- if (Subtarget->is64Bit() && + if (Subtarget->is64Bit() && N.getOpcode() == X86ISD::WrapperRIP && // Under X86-64 non-small code model, GV (and friends) are 64-bits, so // they cannot be folded into immediate fields. // FIXME: This can be improved for kernel and other models? - (M == CodeModel::Small || M == CodeModel::Kernel) && - // Base and index reg must be 0 in order to use %rip as base and lowering - // must allow RIP. - !AM.hasBaseOrIndexReg() && N.getOpcode() == X86ISD::WrapperRIP) { + (M == CodeModel::Small || M == CodeModel::Kernel)) { + // Base and index reg must be 0 in order to use %rip as base. + if (AM.hasBaseOrIndexReg()) + return true; if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(N0)) { X86ISelAddressMode Backup = AM; AM.GV = G->getGlobal(); @@ -663,11 +662,12 @@ bool X86DAGToDAGISel::MatchWrapper(SDValue N, X86ISelAddressMode &AM) { } // Handle the case when globals fit in our immediate field: This is true for - // X86-32 always and X86-64 when in -static -mcmodel=small mode. In 64-bit - // mode, this results in a non-RIP-relative computation. + // X86-32 always and X86-64 when in -mcmodel=small mode. In 64-bit + // mode, this only applies to a non-RIP-relative computation. if (!Subtarget->is64Bit() || - ((M == CodeModel::Small || M == CodeModel::Kernel) && - TM.getRelocationModel() == Reloc::Static)) { + M == CodeModel::Small || M == CodeModel::Kernel) { + assert(N.getOpcode() != X86ISD::WrapperRIP && + "RIP-relative addressing already handled"); if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(N0)) { AM.GV = G->getGlobal(); AM.Disp += G->getOffset(); @@ -725,6 +725,213 @@ bool X86DAGToDAGISel::MatchAddress(SDValue N, X86ISelAddressMode &AM) { return false; } +// Insert a node into the DAG at least before the Pos node's position. This +// will reposition the node as needed, and will assign it a node ID that is <= +// the Pos node's ID. Note that this does *not* preserve the uniqueness of node +// IDs! The selection DAG must no longer depend on their uniqueness when this +// is used. +static void InsertDAGNode(SelectionDAG &DAG, SDValue Pos, SDValue N) { + if (N.getNode()->getNodeId() == -1 || + N.getNode()->getNodeId() > Pos.getNode()->getNodeId()) { + DAG.RepositionNode(Pos.getNode(), N.getNode()); + N.getNode()->setNodeId(Pos.getNode()->getNodeId()); + } +} + +// Transform "(X >> (8-C1)) & C2" to "(X >> 8) & 0xff)" if safe. This +// allows us to convert the shift and and into an h-register extract and +// a scaled index. Returns false if the simplification is performed. +static bool FoldMaskAndShiftToExtract(SelectionDAG &DAG, SDValue N, + uint64_t Mask, + SDValue Shift, SDValue X, + X86ISelAddressMode &AM) { + if (Shift.getOpcode() != ISD::SRL || + !isa<ConstantSDNode>(Shift.getOperand(1)) || + !Shift.hasOneUse()) + return true; + + int ScaleLog = 8 - Shift.getConstantOperandVal(1); + if (ScaleLog <= 0 || ScaleLog >= 4 || + Mask != (0xffu << ScaleLog)) + return true; + + EVT VT = N.getValueType(); + DebugLoc DL = N.getDebugLoc(); + SDValue Eight = DAG.getConstant(8, MVT::i8); + SDValue NewMask = DAG.getConstant(0xff, VT); + SDValue Srl = DAG.getNode(ISD::SRL, DL, VT, X, Eight); + SDValue And = DAG.getNode(ISD::AND, DL, VT, Srl, NewMask); + SDValue ShlCount = DAG.getConstant(ScaleLog, MVT::i8); + SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, And, ShlCount); + + // Insert the new nodes into the topological ordering. We must do this in + // a valid topological ordering as nothing is going to go back and re-sort + // these nodes. 
We continually insert before 'N' in sequence as this is + // essentially a pre-flattened and pre-sorted sequence of nodes. There is no + // hierarchy left to express. + InsertDAGNode(DAG, N, Eight); + InsertDAGNode(DAG, N, Srl); + InsertDAGNode(DAG, N, NewMask); + InsertDAGNode(DAG, N, And); + InsertDAGNode(DAG, N, ShlCount); + InsertDAGNode(DAG, N, Shl); + DAG.ReplaceAllUsesWith(N, Shl); + AM.IndexReg = And; + AM.Scale = (1 << ScaleLog); + return false; +} + +// Transforms "(X << C1) & C2" to "(X & (C2>>C1)) << C1" if safe and if this +// allows us to fold the shift into this addressing mode. Returns false if the +// transform succeeded. +static bool FoldMaskedShiftToScaledMask(SelectionDAG &DAG, SDValue N, + uint64_t Mask, + SDValue Shift, SDValue X, + X86ISelAddressMode &AM) { + if (Shift.getOpcode() != ISD::SHL || + !isa<ConstantSDNode>(Shift.getOperand(1))) + return true; + + // Not likely to be profitable if either the AND or SHIFT node has more + // than one use (unless all uses are for address computation). Besides, + // isel mechanism requires their node ids to be reused. + if (!N.hasOneUse() || !Shift.hasOneUse()) + return true; + + // Verify that the shift amount is something we can fold. + unsigned ShiftAmt = Shift.getConstantOperandVal(1); + if (ShiftAmt != 1 && ShiftAmt != 2 && ShiftAmt != 3) + return true; + + EVT VT = N.getValueType(); + DebugLoc DL = N.getDebugLoc(); + SDValue NewMask = DAG.getConstant(Mask >> ShiftAmt, VT); + SDValue NewAnd = DAG.getNode(ISD::AND, DL, VT, X, NewMask); + SDValue NewShift = DAG.getNode(ISD::SHL, DL, VT, NewAnd, Shift.getOperand(1)); + + // Insert the new nodes into the topological ordering. We must do this in + // a valid topological ordering as nothing is going to go back and re-sort + // these nodes. We continually insert before 'N' in sequence as this is + // essentially a pre-flattened and pre-sorted sequence of nodes. There is no + // hierarchy left to express. + InsertDAGNode(DAG, N, NewMask); + InsertDAGNode(DAG, N, NewAnd); + InsertDAGNode(DAG, N, NewShift); + DAG.ReplaceAllUsesWith(N, NewShift); + + AM.Scale = 1 << ShiftAmt; + AM.IndexReg = NewAnd; + return false; +} + +// Implement some heroics to detect shifts of masked values where the mask can +// be replaced by extending the shift and undoing that in the addressing mode +// scale. Patterns such as (shl (srl x, c1), c2) are canonicalized into (and +// (srl x, SHIFT), MASK) by DAGCombines that don't know the shl can be done in +// the addressing mode. This results in code such as: +// +// int f(short *y, int *lookup_table) { +// ... +// return *y + lookup_table[*y >> 11]; +// } +// +// Turning into: +// movzwl (%rdi), %eax +// movl %eax, %ecx +// shrl $11, %ecx +// addl (%rsi,%rcx,4), %eax +// +// Instead of: +// movzwl (%rdi), %eax +// movl %eax, %ecx +// shrl $9, %ecx +// andl $124, %rcx +// addl (%rsi,%rcx), %eax +// +// Note that this function assumes the mask is provided as a mask *after* the +// value is shifted. The input chain may or may not match that, but computing +// such a mask is trivial. 
+static bool FoldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N, + uint64_t Mask, + SDValue Shift, SDValue X, + X86ISelAddressMode &AM) { + if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse() || + !isa<ConstantSDNode>(Shift.getOperand(1))) + return true; + + unsigned ShiftAmt = Shift.getConstantOperandVal(1); + unsigned MaskLZ = CountLeadingZeros_64(Mask); + unsigned MaskTZ = CountTrailingZeros_64(Mask); + + // The amount of shift we're trying to fit into the addressing mode is taken + // from the trailing zeros of the mask. + unsigned AMShiftAmt = MaskTZ; + + // There is nothing we can do here unless the mask is removing some bits. + // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits. + if (AMShiftAmt <= 0 || AMShiftAmt > 3) return true; + + // We also need to ensure that mask is a continuous run of bits. + if (CountTrailingOnes_64(Mask >> MaskTZ) + MaskTZ + MaskLZ != 64) return true; + + // Scale the leading zero count down based on the actual size of the value. + // Also scale it down based on the size of the shift. + MaskLZ -= (64 - X.getValueSizeInBits()) + ShiftAmt; + + // The final check is to ensure that any masked out high bits of X are + // already known to be zero. Otherwise, the mask has a semantic impact + // other than masking out a couple of low bits. Unfortunately, because of + // the mask, zero extensions will be removed from operands in some cases. + // This code works extra hard to look through extensions because we can + // replace them with zero extensions cheaply if necessary. + bool ReplacingAnyExtend = false; + if (X.getOpcode() == ISD::ANY_EXTEND) { + unsigned ExtendBits = + X.getValueSizeInBits() - X.getOperand(0).getValueSizeInBits(); + // Assume that we'll replace the any-extend with a zero-extend, and + // narrow the search to the extended value. + X = X.getOperand(0); + MaskLZ = ExtendBits > MaskLZ ? 0 : MaskLZ - ExtendBits; + ReplacingAnyExtend = true; + } + APInt MaskedHighBits = APInt::getHighBitsSet(X.getValueSizeInBits(), + MaskLZ); + APInt KnownZero, KnownOne; + DAG.ComputeMaskedBits(X, KnownZero, KnownOne); + if (MaskedHighBits != KnownZero) return true; + + // We've identified a pattern that can be transformed into a single shift + // and an addressing mode. Make it so. + EVT VT = N.getValueType(); + if (ReplacingAnyExtend) { + assert(X.getValueType() != VT); + // We looked through an ANY_EXTEND node, insert a ZERO_EXTEND. + SDValue NewX = DAG.getNode(ISD::ZERO_EXTEND, X.getDebugLoc(), VT, X); + InsertDAGNode(DAG, N, NewX); + X = NewX; + } + DebugLoc DL = N.getDebugLoc(); + SDValue NewSRLAmt = DAG.getConstant(ShiftAmt + AMShiftAmt, MVT::i8); + SDValue NewSRL = DAG.getNode(ISD::SRL, DL, VT, X, NewSRLAmt); + SDValue NewSHLAmt = DAG.getConstant(AMShiftAmt, MVT::i8); + SDValue NewSHL = DAG.getNode(ISD::SHL, DL, VT, NewSRL, NewSHLAmt); + + // Insert the new nodes into the topological ordering. We must do this in + // a valid topological ordering as nothing is going to go back and re-sort + // these nodes. We continually insert before 'N' in sequence as this is + // essentially a pre-flattened and pre-sorted sequence of nodes. There is no + // hierarchy left to express. 
+ InsertDAGNode(DAG, N, NewSRLAmt); + InsertDAGNode(DAG, N, NewSRL); + InsertDAGNode(DAG, N, NewSHLAmt); + InsertDAGNode(DAG, N, NewSHL); + DAG.ReplaceAllUsesWith(N, NewSHL); + + AM.Scale = 1 << AMShiftAmt; + AM.IndexReg = NewSRL; + return false; +} + bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM, unsigned Depth) { DebugLoc dl = N.getDebugLoc(); @@ -814,6 +1021,33 @@ bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM, break; } + case ISD::SRL: { + // Scale must not be used already. + if (AM.IndexReg.getNode() != 0 || AM.Scale != 1) break; + + SDValue And = N.getOperand(0); + if (And.getOpcode() != ISD::AND) break; + SDValue X = And.getOperand(0); + + // We only handle up to 64-bit values here as those are what matter for + // addressing mode optimizations. + if (X.getValueSizeInBits() > 64) break; + + // The mask used for the transform is expected to be post-shift, but we + // found the shift first so just apply the shift to the mask before passing + // it down. + if (!isa<ConstantSDNode>(N.getOperand(1)) || + !isa<ConstantSDNode>(And.getOperand(1))) + break; + uint64_t Mask = And.getConstantOperandVal(1) >> N.getConstantOperandVal(1); + + // Try to fold the mask and shift into the scale, and return false if we + // succeed. + if (!FoldMaskAndShiftToScale(*CurDAG, N, Mask, N, X, AM)) + return false; + break; + } + case ISD::SMUL_LOHI: case ISD::UMUL_LOHI: // A mul_lohi where we need the low part can be folded as a plain multiply. @@ -917,16 +1151,8 @@ bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM, AM.Scale = 1; // Insert the new nodes into the topological ordering. - if (Zero.getNode()->getNodeId() == -1 || - Zero.getNode()->getNodeId() > N.getNode()->getNodeId()) { - CurDAG->RepositionNode(N.getNode(), Zero.getNode()); - Zero.getNode()->setNodeId(N.getNode()->getNodeId()); - } - if (Neg.getNode()->getNodeId() == -1 || - Neg.getNode()->getNodeId() > N.getNode()->getNodeId()) { - CurDAG->RepositionNode(N.getNode(), Neg.getNode()); - Neg.getNode()->setNodeId(N.getNode()->getNodeId()); - } + InsertDAGNode(*CurDAG, N, Zero); + InsertDAGNode(*CurDAG, N, Neg); return false; } @@ -981,121 +1207,34 @@ bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM, // Perform some heroic transforms on an and of a constant-count shift // with a constant to enable use of the scaled offset field. - SDValue Shift = N.getOperand(0); - if (Shift.getNumOperands() != 2) break; - // Scale must not be used already. if (AM.IndexReg.getNode() != 0 || AM.Scale != 1) break; + SDValue Shift = N.getOperand(0); + if (Shift.getOpcode() != ISD::SRL && Shift.getOpcode() != ISD::SHL) break; SDValue X = Shift.getOperand(0); - ConstantSDNode *C2 = dyn_cast<ConstantSDNode>(N.getOperand(1)); - ConstantSDNode *C1 = dyn_cast<ConstantSDNode>(Shift.getOperand(1)); - if (!C1 || !C2) break; - - // Handle "(X >> (8-C1)) & C2" as "(X >> 8) & 0xff)" if safe. This - // allows us to convert the shift and and into an h-register extract and - // a scaled index. 
- if (Shift.getOpcode() == ISD::SRL && Shift.hasOneUse()) { - unsigned ScaleLog = 8 - C1->getZExtValue(); - if (ScaleLog > 0 && ScaleLog < 4 && - C2->getZExtValue() == (UINT64_C(0xff) << ScaleLog)) { - SDValue Eight = CurDAG->getConstant(8, MVT::i8); - SDValue Mask = CurDAG->getConstant(0xff, N.getValueType()); - SDValue Srl = CurDAG->getNode(ISD::SRL, dl, N.getValueType(), - X, Eight); - SDValue And = CurDAG->getNode(ISD::AND, dl, N.getValueType(), - Srl, Mask); - SDValue ShlCount = CurDAG->getConstant(ScaleLog, MVT::i8); - SDValue Shl = CurDAG->getNode(ISD::SHL, dl, N.getValueType(), - And, ShlCount); - - // Insert the new nodes into the topological ordering. - if (Eight.getNode()->getNodeId() == -1 || - Eight.getNode()->getNodeId() > X.getNode()->getNodeId()) { - CurDAG->RepositionNode(X.getNode(), Eight.getNode()); - Eight.getNode()->setNodeId(X.getNode()->getNodeId()); - } - if (Mask.getNode()->getNodeId() == -1 || - Mask.getNode()->getNodeId() > X.getNode()->getNodeId()) { - CurDAG->RepositionNode(X.getNode(), Mask.getNode()); - Mask.getNode()->setNodeId(X.getNode()->getNodeId()); - } - if (Srl.getNode()->getNodeId() == -1 || - Srl.getNode()->getNodeId() > Shift.getNode()->getNodeId()) { - CurDAG->RepositionNode(Shift.getNode(), Srl.getNode()); - Srl.getNode()->setNodeId(Shift.getNode()->getNodeId()); - } - if (And.getNode()->getNodeId() == -1 || - And.getNode()->getNodeId() > N.getNode()->getNodeId()) { - CurDAG->RepositionNode(N.getNode(), And.getNode()); - And.getNode()->setNodeId(N.getNode()->getNodeId()); - } - if (ShlCount.getNode()->getNodeId() == -1 || - ShlCount.getNode()->getNodeId() > X.getNode()->getNodeId()) { - CurDAG->RepositionNode(X.getNode(), ShlCount.getNode()); - ShlCount.getNode()->setNodeId(N.getNode()->getNodeId()); - } - if (Shl.getNode()->getNodeId() == -1 || - Shl.getNode()->getNodeId() > N.getNode()->getNodeId()) { - CurDAG->RepositionNode(N.getNode(), Shl.getNode()); - Shl.getNode()->setNodeId(N.getNode()->getNodeId()); - } - CurDAG->ReplaceAllUsesWith(N, Shl); - AM.IndexReg = And; - AM.Scale = (1 << ScaleLog); - return false; - } - } - // Handle "(X << C1) & C2" as "(X & (C2>>C1)) << C1" if safe and if this - // allows us to fold the shift into this addressing mode. - if (Shift.getOpcode() != ISD::SHL) break; + // We only handle up to 64-bit values here as those are what matter for + // addressing mode optimizations. + if (X.getValueSizeInBits() > 64) break; - // Not likely to be profitable if either the AND or SHIFT node has more - // than one use (unless all uses are for address computation). Besides, - // isel mechanism requires their node ids to be reused. - if (!N.hasOneUse() || !Shift.hasOneUse()) - break; - - // Verify that the shift amount is something we can fold. - unsigned ShiftCst = C1->getZExtValue(); - if (ShiftCst != 1 && ShiftCst != 2 && ShiftCst != 3) + if (!isa<ConstantSDNode>(N.getOperand(1))) break; - - // Get the new AND mask, this folds to a constant. - SDValue NewANDMask = CurDAG->getNode(ISD::SRL, dl, N.getValueType(), - SDValue(C2, 0), SDValue(C1, 0)); - SDValue NewAND = CurDAG->getNode(ISD::AND, dl, N.getValueType(), X, - NewANDMask); - SDValue NewSHIFT = CurDAG->getNode(ISD::SHL, dl, N.getValueType(), - NewAND, SDValue(C1, 0)); + uint64_t Mask = N.getConstantOperandVal(1); - // Insert the new nodes into the topological ordering. 
- if (C1->getNodeId() > X.getNode()->getNodeId()) { - CurDAG->RepositionNode(X.getNode(), C1); - C1->setNodeId(X.getNode()->getNodeId()); - } - if (NewANDMask.getNode()->getNodeId() == -1 || - NewANDMask.getNode()->getNodeId() > X.getNode()->getNodeId()) { - CurDAG->RepositionNode(X.getNode(), NewANDMask.getNode()); - NewANDMask.getNode()->setNodeId(X.getNode()->getNodeId()); - } - if (NewAND.getNode()->getNodeId() == -1 || - NewAND.getNode()->getNodeId() > Shift.getNode()->getNodeId()) { - CurDAG->RepositionNode(Shift.getNode(), NewAND.getNode()); - NewAND.getNode()->setNodeId(Shift.getNode()->getNodeId()); - } - if (NewSHIFT.getNode()->getNodeId() == -1 || - NewSHIFT.getNode()->getNodeId() > N.getNode()->getNodeId()) { - CurDAG->RepositionNode(N.getNode(), NewSHIFT.getNode()); - NewSHIFT.getNode()->setNodeId(N.getNode()->getNodeId()); - } + // Try to fold the mask and shift into an extract and scale. + if (!FoldMaskAndShiftToExtract(*CurDAG, N, Mask, Shift, X, AM)) + return false; - CurDAG->ReplaceAllUsesWith(N, NewSHIFT); - - AM.Scale = 1 << ShiftCst; - AM.IndexReg = NewAND; - return false; + // Try to fold the mask and shift directly into the scale. + if (!FoldMaskAndShiftToScale(*CurDAG, N, Mask, Shift, X, AM)) + return false; + + // Try to swap the mask and shift to place shifts which can be done as + // a scale on the outside of the mask. + if (!FoldMaskedShiftToScaledMask(*CurDAG, N, Mask, Shift, X, AM)) + return false; + break; } } @@ -1515,7 +1654,7 @@ enum AtomicSz { AtomicSzEnd }; -static const unsigned int AtomicOpcTbl[AtomicOpcEnd][AtomicSzEnd] = { +static const uint16_t AtomicOpcTbl[AtomicOpcEnd][AtomicSzEnd] = { { X86::LOCK_OR8mi, X86::LOCK_OR8mr, @@ -1709,6 +1848,96 @@ static bool HasNoSignedComparisonUses(SDNode *N) { return true; } +/// isLoadIncOrDecStore - Check whether or not the chain ending in StoreNode +/// is suitable for doing the {load; increment or decrement; store} to modify +/// transformation. +static bool isLoadIncOrDecStore(StoreSDNode *StoreNode, unsigned Opc, + SDValue StoredVal, SelectionDAG *CurDAG, + LoadSDNode* &LoadNode, SDValue &InputChain) { + + // is the value stored the result of a DEC or INC? + if (!(Opc == X86ISD::DEC || Opc == X86ISD::INC)) return false; + + // is the stored value result 0 of the load? + if (StoredVal.getResNo() != 0) return false; + + // are there other uses of the loaded value than the inc or dec? + if (!StoredVal.getNode()->hasNUsesOfValue(1, 0)) return false; + + // is the store non-extending and non-indexed? + if (!ISD::isNormalStore(StoreNode) || StoreNode->isNonTemporal()) + return false; + + SDValue Load = StoredVal->getOperand(0); + // Is the stored value a non-extending and non-indexed load? + if (!ISD::isNormalLoad(Load.getNode())) return false; + + // Return LoadNode by reference. + LoadNode = cast<LoadSDNode>(Load); + // is the size of the value one that we can handle? (i.e. 64, 32, 16, or 8) + EVT LdVT = LoadNode->getMemoryVT(); + if (LdVT != MVT::i64 && LdVT != MVT::i32 && LdVT != MVT::i16 && + LdVT != MVT::i8) + return false; + + // Is store the only read of the loaded value? + if (!Load.hasOneUse()) + return false; + + // Is the address of the store the same as the load? + if (LoadNode->getBasePtr() != StoreNode->getBasePtr() || + LoadNode->getOffset() != StoreNode->getOffset()) + return false; + + // Check if the chain is produced by the load or is a TokenFactor with + // the load output chain as an operand. Return InputChain by reference. 
+ SDValue Chain = StoreNode->getChain(); + + bool ChainCheck = false; + if (Chain == Load.getValue(1)) { + ChainCheck = true; + InputChain = LoadNode->getChain(); + } else if (Chain.getOpcode() == ISD::TokenFactor) { + SmallVector<SDValue, 4> ChainOps; + for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i) { + SDValue Op = Chain.getOperand(i); + if (Op == Load.getValue(1)) { + ChainCheck = true; + continue; + } + ChainOps.push_back(Op); + } + + if (ChainCheck) + // Make a new TokenFactor with all the other input chains except + // for the load. + InputChain = CurDAG->getNode(ISD::TokenFactor, Chain.getDebugLoc(), + MVT::Other, &ChainOps[0], ChainOps.size()); + } + if (!ChainCheck) + return false; + + return true; +} + +/// getFusedLdStOpcode - Get the appropriate X86 opcode for an in memory +/// increment or decrement. Opc should be X86ISD::DEC or X86ISD::INC. +static unsigned getFusedLdStOpcode(EVT &LdVT, unsigned Opc) { + if (Opc == X86ISD::DEC) { + if (LdVT == MVT::i64) return X86::DEC64m; + if (LdVT == MVT::i32) return X86::DEC32m; + if (LdVT == MVT::i16) return X86::DEC16m; + if (LdVT == MVT::i8) return X86::DEC8m; + } else { + assert(Opc == X86ISD::INC && "unrecognized opcode"); + if (LdVT == MVT::i64) return X86::INC64m; + if (LdVT == MVT::i32) return X86::INC32m; + if (LdVT == MVT::i16) return X86::INC16m; + if (LdVT == MVT::i8) return X86::INC8m; + } + llvm_unreachable("unrecognized size for LdVT"); +} + SDNode *X86DAGToDAGISel::Select(SDNode *Node) { EVT NVT = Node->getValueType(0); unsigned Opc, MOpc; @@ -1829,7 +2058,6 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { SDNode *New = CurDAG->getMachineNode(Op, dl, NVT, N0->getOperand(0),NewCst); return CurDAG->SelectNodeTo(Node, ShlOp, NVT, SDValue(New, 0), getI8Imm(ShlVal)); - break; } case X86ISD::UMUL: { SDValue N0 = Node->getOperand(0); @@ -2114,7 +2342,9 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { HasNoSignedComparisonUses(Node)) // Look past the truncate if CMP is the only use of it. N0 = N0.getOperand(0); - if (N0.getNode()->getOpcode() == ISD::AND && N0.getNode()->hasOneUse() && + if ((N0.getNode()->getOpcode() == ISD::AND || + (N0.getResNo() == 0 && N0.getNode()->getOpcode() == X86ISD::AND)) && + N0.getNode()->hasOneUse() && N0.getValueType() != MVT::i8 && X86::isZeroNode(N1)) { ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getNode()->getOperand(1)); @@ -2129,7 +2359,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { // On x86-32, only the ABCD registers have 8-bit subregisters. if (!Subtarget->is64Bit()) { - TargetRegisterClass *TRC = 0; + const TargetRegisterClass *TRC; switch (N0.getValueType().getSimpleVT().SimpleTy) { case MVT::i32: TRC = &X86::GR32_ABCDRegClass; break; case MVT::i16: TRC = &X86::GR16_ABCDRegClass; break; @@ -2158,7 +2388,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { SDValue Reg = N0.getNode()->getOperand(0); // Put the value in an ABCD register. - TargetRegisterClass *TRC = 0; + const TargetRegisterClass *TRC; switch (N0.getValueType().getSimpleVT().SimpleTy) { case MVT::i64: TRC = &X86::GR64_ABCDRegClass; break; case MVT::i32: TRC = &X86::GR32_ABCDRegClass; break; @@ -2214,6 +2444,56 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { } break; } + case ISD::STORE: { + // Change a chain of {load; incr or dec; store} of the same value into + // a simple increment or decrement through memory of that value, if the + // uses of the modified value and its address are suitable. 
+ // The DEC64m tablegen pattern is currently not able to match the case where + // the EFLAGS on the original DEC are used. (This also applies to + // {INC,DEC}X{64,32,16,8}.) + // We'll need to improve tablegen to allow flags to be transferred from a + // node in the pattern to the result node. probably with a new keyword + // for example, we have this + // def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst", + // [(store (add (loadi64 addr:$dst), -1), addr:$dst), + // (implicit EFLAGS)]>; + // but maybe need something like this + // def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst", + // [(store (add (loadi64 addr:$dst), -1), addr:$dst), + // (transferrable EFLAGS)]>; + + StoreSDNode *StoreNode = cast<StoreSDNode>(Node); + SDValue StoredVal = StoreNode->getOperand(1); + unsigned Opc = StoredVal->getOpcode(); + + LoadSDNode *LoadNode = 0; + SDValue InputChain; + if (!isLoadIncOrDecStore(StoreNode, Opc, StoredVal, CurDAG, + LoadNode, InputChain)) + break; + + SDValue Base, Scale, Index, Disp, Segment; + if (!SelectAddr(LoadNode, LoadNode->getBasePtr(), + Base, Scale, Index, Disp, Segment)) + break; + + MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(2); + MemOp[0] = StoreNode->getMemOperand(); + MemOp[1] = LoadNode->getMemOperand(); + const SDValue Ops[] = { Base, Scale, Index, Disp, Segment, InputChain }; + EVT LdVT = LoadNode->getMemoryVT(); + unsigned newOpc = getFusedLdStOpcode(LdVT, Opc); + MachineSDNode *Result = CurDAG->getMachineNode(newOpc, + Node->getDebugLoc(), + MVT::i32, MVT::Other, Ops, + array_lengthof(Ops)); + Result->setMemRefs(MemOp, MemOp + 2); + + ReplaceUses(SDValue(StoreNode, 0), SDValue(Result, 1)); + ReplaceUses(SDValue(StoredVal.getNode(), 1), SDValue(Result, 0)); + + return Result; + } } SDNode *ResNode = SelectCode(Node); @@ -2254,6 +2534,6 @@ SelectInlineAsmMemoryOperand(const SDValue &Op, char ConstraintCode, /// X86-specific DAG, ready for instruction scheduling. 
/// FunctionPass *llvm::createX86ISelDag(X86TargetMachine &TM, - llvm::CodeGenOpt::Level OptLevel) { + CodeGenOpt::Level OptLevel) { return new X86DAGToDAGISel(TM, OptLevel); } diff --git a/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp b/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp index 7c8ce17..9b83aad 100644 --- a/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -13,9 +13,9 @@ //===----------------------------------------------------------------------===// #define DEBUG_TYPE "x86-isel" +#include "X86ISelLowering.h" #include "X86.h" #include "X86InstrBuilder.h" -#include "X86ISelLowering.h" #include "X86TargetMachine.h" #include "X86TargetObjectFile.h" #include "Utils/X86ShuffleDecode.h" @@ -35,25 +35,21 @@ #include "llvm/CodeGen/MachineJumpTableInfo.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/PseudoSourceValue.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCSymbol.h" -#include "llvm/ADT/BitVector.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringExtras.h" -#include "llvm/ADT/VectorExtras.h" +#include "llvm/ADT/VariadicFunction.h" #include "llvm/Support/CallSite.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/Dwarf.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" -#include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetOptions.h" +#include <bitset> using namespace llvm; -using namespace dwarf; STATISTIC(NumTailCalls, "Number of tail calls"); @@ -61,17 +57,6 @@ STATISTIC(NumTailCalls, "Number of tail calls"); static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, SDValue V2); -static SDValue Insert128BitVector(SDValue Result, - SDValue Vec, - SDValue Idx, - SelectionDAG &DAG, - DebugLoc dl); - -static SDValue Extract128BitVector(SDValue Vec, - SDValue Idx, - SelectionDAG &DAG, - DebugLoc dl); - /// Generate a DAG to grab 128-bits from a vector > 128 bits. This /// sets things up to match to an AVX VEXTRACTF128 instruction or a /// simple subregister reference. Idx is an index in the 128 bits we @@ -169,8 +154,8 @@ static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) { X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) : TargetLowering(TM, createTLOF(TM)) { Subtarget = &TM.getSubtarget<X86Subtarget>(); - X86ScalarSSEf64 = Subtarget->hasXMMInt(); - X86ScalarSSEf32 = Subtarget->hasXMM(); + X86ScalarSSEf64 = Subtarget->hasSSE2(); + X86ScalarSSEf32 = Subtarget->hasSSE1(); X86StackPtr = Subtarget->is64Bit() ? X86::RSP : X86::ESP; RegInfo = TM.getRegisterInfo(); @@ -186,8 +171,11 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) // For 64-bit since we have so many registers use the ILP scheduler, for // 32-bit code use the register pressure specific scheduling. + // For 32 bit Atom, use Hybrid (register pressure + latency) scheduling. 
if (Subtarget->is64Bit()) setSchedulingPreference(Sched::ILP); + else if (Subtarget->isAtom()) + setSchedulingPreference(Sched::Hybrid); else setSchedulingPreference(Sched::RegPressure); setStackPointerRegisterToSaveRestore(X86StackPtr); @@ -199,15 +187,18 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setLibcallName(RTLIB::SREM_I64, "_allrem"); setLibcallName(RTLIB::UREM_I64, "_aullrem"); setLibcallName(RTLIB::MUL_I64, "_allmul"); - setLibcallName(RTLIB::FPTOUINT_F64_I64, "_ftol2"); - setLibcallName(RTLIB::FPTOUINT_F32_I64, "_ftol2"); setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall); setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall); setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall); setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall); setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall); - setLibcallCallingConv(RTLIB::FPTOUINT_F64_I64, CallingConv::C); - setLibcallCallingConv(RTLIB::FPTOUINT_F32_I64, CallingConv::C); + + // The _ftol2 runtime function has an unusual calling conv, which + // is modeled by a special pseudo-instruction. + setLibcallName(RTLIB::FPTOUINT_F64_I64, 0); + setLibcallName(RTLIB::FPTOUINT_F32_I64, 0); + setLibcallName(RTLIB::FPTOUINT_F64_I32, 0); + setLibcallName(RTLIB::FPTOUINT_F32_I32, 0); } if (Subtarget->isTargetDarwin()) { @@ -256,8 +247,8 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) if (Subtarget->is64Bit()) { setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Promote); - setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Expand); - } else if (!UseSoftFloat) { + setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom); + } else if (!TM.Options.UseSoftFloat) { // We have an algorithm for SSE2->double, and we turn this into a // 64-bit FILD followed by conditional FADD for other targets. setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom); @@ -271,7 +262,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::SINT_TO_FP , MVT::i1 , Promote); setOperationAction(ISD::SINT_TO_FP , MVT::i8 , Promote); - if (!UseSoftFloat) { + if (!TM.Options.UseSoftFloat) { // SSE has no i16 to fp conversion, only i32 if (X86ScalarSSEf32) { setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote); @@ -314,7 +305,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) if (Subtarget->is64Bit()) { setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Expand); setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Promote); - } else if (!UseSoftFloat) { + } else if (!TM.Options.UseSoftFloat) { // Since AVX is a superset of SSE3, only check for SSE here. if (Subtarget->hasSSE1() && !Subtarget->hasSSE3()) // Expand FP_TO_UINT into a select. @@ -327,6 +318,12 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom); } + if (isTargetFTOL()) { + // Use the _ftol2 runtime function, which has a pseudo-instruction + // to handle its weird calling convention. + setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Custom); + } + // TODO: when we have SSE, these could be more efficient, by using movd/movq. if (!X86ScalarSSEf64) { setOperationAction(ISD::BITCAST , MVT::f32 , Expand); @@ -379,10 +376,18 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::FREM , MVT::f80 , Expand); setOperationAction(ISD::FLT_ROUNDS_ , MVT::i32 , Custom); + // Promote the i8 variants and force them on up to i32 which has a shorter + // encoding. 
+ setOperationAction(ISD::CTTZ , MVT::i8 , Promote); + AddPromotedToType (ISD::CTTZ , MVT::i8 , MVT::i32); + setOperationAction(ISD::CTTZ_ZERO_UNDEF , MVT::i8 , Promote); + AddPromotedToType (ISD::CTTZ_ZERO_UNDEF , MVT::i8 , MVT::i32); if (Subtarget->hasBMI()) { - setOperationAction(ISD::CTTZ , MVT::i8 , Promote); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16 , Expand); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32 , Expand); + if (Subtarget->is64Bit()) + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand); } else { - setOperationAction(ISD::CTTZ , MVT::i8 , Custom); setOperationAction(ISD::CTTZ , MVT::i16 , Custom); setOperationAction(ISD::CTTZ , MVT::i32 , Custom); if (Subtarget->is64Bit()) @@ -390,13 +395,27 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) } if (Subtarget->hasLZCNT()) { + // When promoting the i8 variants, force them to i32 for a shorter + // encoding. setOperationAction(ISD::CTLZ , MVT::i8 , Promote); + AddPromotedToType (ISD::CTLZ , MVT::i8 , MVT::i32); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , Promote); + AddPromotedToType (ISD::CTLZ_ZERO_UNDEF, MVT::i8 , MVT::i32); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16 , Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32 , Expand); + if (Subtarget->is64Bit()) + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand); } else { setOperationAction(ISD::CTLZ , MVT::i8 , Custom); setOperationAction(ISD::CTLZ , MVT::i16 , Custom); setOperationAction(ISD::CTLZ , MVT::i32 , Custom); - if (Subtarget->is64Bit()) + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , Custom); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16 , Custom); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32 , Custom); + if (Subtarget->is64Bit()) { setOperationAction(ISD::CTLZ , MVT::i64 , Custom); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom); + } } if (Subtarget->hasPOPCNT()) { @@ -459,7 +478,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::SRL_PARTS , MVT::i64 , Custom); } - if (Subtarget->hasXMM()) + if (Subtarget->hasSSE1()) setOperationAction(ISD::PREFETCH , MVT::Other, Legal); setOperationAction(ISD::MEMBARRIER , MVT::Other, Custom); @@ -538,14 +557,14 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) if (Subtarget->isTargetCOFF() && !Subtarget->isTargetEnvMacho()) setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ? MVT::i64 : MVT::i32, Custom); - else if (EnableSegmentedStacks) + else if (TM.Options.EnableSegmentedStacks) setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ? MVT::i64 : MVT::i32, Custom); else setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ? MVT::i64 : MVT::i32, Expand); - if (!UseSoftFloat && X86ScalarSSEf64) { + if (!TM.Options.UseSoftFloat && X86ScalarSSEf64) { // f32 and f64 use SSE. // Set up the FP register classes. addRegisterClass(MVT::f32, X86::FR32RegisterClass); @@ -577,7 +596,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) // cases we handle. addLegalFPImmediate(APFloat(+0.0)); // xorpd addLegalFPImmediate(APFloat(+0.0f)); // xorps - } else if (!UseSoftFloat && X86ScalarSSEf32) { + } else if (!TM.Options.UseSoftFloat && X86ScalarSSEf32) { // Use SSE for f32, x87 for f64. // Set up the FP register classes. 
addRegisterClass(MVT::f32, X86::FR32RegisterClass); @@ -606,11 +625,11 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS - if (!UnsafeFPMath) { + if (!TM.Options.UnsafeFPMath) { setOperationAction(ISD::FSIN , MVT::f64 , Expand); setOperationAction(ISD::FCOS , MVT::f64 , Expand); } - } else if (!UseSoftFloat) { + } else if (!TM.Options.UseSoftFloat) { // f32 and f64 in x87. // Set up the FP register classes. addRegisterClass(MVT::f64, X86::RFP64RegisterClass); @@ -621,7 +640,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand); - if (!UnsafeFPMath) { + if (!TM.Options.UnsafeFPMath) { setOperationAction(ISD::FSIN , MVT::f64 , Expand); setOperationAction(ISD::FCOS , MVT::f64 , Expand); } @@ -640,7 +659,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::FMA, MVT::f32, Expand); // Long double always uses X87. - if (!UseSoftFloat) { + if (!TM.Options.UseSoftFloat) { addRegisterClass(MVT::f80, X86::RFP80RegisterClass); setOperationAction(ISD::UNDEF, MVT::f80, Expand); setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand); @@ -659,11 +678,16 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) addLegalFPImmediate(TmpFlt2); // FLD1/FCHS } - if (!UnsafeFPMath) { + if (!TM.Options.UnsafeFPMath) { setOperationAction(ISD::FSIN , MVT::f80 , Expand); setOperationAction(ISD::FCOS , MVT::f80 , Expand); } + setOperationAction(ISD::FFLOOR, MVT::f80, Expand); + setOperationAction(ISD::FCEIL, MVT::f80, Expand); + setOperationAction(ISD::FTRUNC, MVT::f80, Expand); + setOperationAction(ISD::FRINT, MVT::f80, Expand); + setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand); setOperationAction(ISD::FMA, MVT::f80, Expand); } @@ -715,7 +739,9 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::FPOW, (MVT::SimpleValueType)VT, Expand); setOperationAction(ISD::CTPOP, (MVT::SimpleValueType)VT, Expand); setOperationAction(ISD::CTTZ, (MVT::SimpleValueType)VT, Expand); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, (MVT::SimpleValueType)VT, Expand); setOperationAction(ISD::CTLZ, (MVT::SimpleValueType)VT, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, (MVT::SimpleValueType)VT, Expand); setOperationAction(ISD::SHL, (MVT::SimpleValueType)VT, Expand); setOperationAction(ISD::SRA, (MVT::SimpleValueType)VT, Expand); setOperationAction(ISD::SRL, (MVT::SimpleValueType)VT, Expand); @@ -749,7 +775,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) // FIXME: In order to prevent SSE instructions being expanded to MMX ones // with -msoft-float, disable use of MMX as well. - if (!UseSoftFloat && Subtarget->hasMMX()) { + if (!TM.Options.UseSoftFloat && Subtarget->hasMMX()) { addRegisterClass(MVT::x86mmx, X86::VR64RegisterClass); // No operations on x86mmx supported, everything uses intrinsics. 
} @@ -786,7 +812,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::BITCAST, MVT::v2i32, Expand); setOperationAction(ISD::BITCAST, MVT::v1i64, Expand); - if (!UseSoftFloat && Subtarget->hasXMM()) { + if (!TM.Options.UseSoftFloat && Subtarget->hasSSE1()) { addRegisterClass(MVT::v4f32, X86::VR128RegisterClass); setOperationAction(ISD::FADD, MVT::v4f32, Legal); @@ -803,7 +829,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::SETCC, MVT::v4f32, Custom); } - if (!UseSoftFloat && Subtarget->hasXMMInt()) { + if (!TM.Options.UseSoftFloat && Subtarget->hasSSE2()) { addRegisterClass(MVT::v2f64, X86::VR128RegisterClass); // FIXME: Unfortunately -soft-float and -no-implicit-float means XMM @@ -909,7 +935,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal); } - if (Subtarget->hasSSE41() || Subtarget->hasAVX()) { + if (Subtarget->hasSSE41()) { setOperationAction(ISD::FFLOOR, MVT::f32, Legal); setOperationAction(ISD::FCEIL, MVT::f32, Legal); setOperationAction(ISD::FTRUNC, MVT::f32, Legal); @@ -924,10 +950,6 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) // FIXME: Do we need to handle scalar-to-vector here? setOperationAction(ISD::MUL, MVT::v4i32, Legal); - // Can turn SHL into an integer multiply. - setOperationAction(ISD::SHL, MVT::v4i32, Custom); - setOperationAction(ISD::SHL, MVT::v16i8, Custom); - setOperationAction(ISD::VSELECT, MVT::v2f64, Legal); setOperationAction(ISD::VSELECT, MVT::v2i64, Legal); setOperationAction(ISD::VSELECT, MVT::v16i8, Legal); @@ -948,30 +970,47 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom); + // FIXME: these should be Legal but thats only for the case where + // the index is constant. For now custom expand to deal with that. 
if (Subtarget->is64Bit()) { - setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Legal); - setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Legal); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom); } } - if (Subtarget->hasXMMInt()) { - setOperationAction(ISD::SRL, MVT::v2i64, Custom); - setOperationAction(ISD::SRL, MVT::v4i32, Custom); - setOperationAction(ISD::SRL, MVT::v16i8, Custom); + if (Subtarget->hasSSE2()) { setOperationAction(ISD::SRL, MVT::v8i16, Custom); + setOperationAction(ISD::SRL, MVT::v16i8, Custom); - setOperationAction(ISD::SHL, MVT::v2i64, Custom); - setOperationAction(ISD::SHL, MVT::v4i32, Custom); setOperationAction(ISD::SHL, MVT::v8i16, Custom); + setOperationAction(ISD::SHL, MVT::v16i8, Custom); - setOperationAction(ISD::SRA, MVT::v4i32, Custom); setOperationAction(ISD::SRA, MVT::v8i16, Custom); + setOperationAction(ISD::SRA, MVT::v16i8, Custom); + + if (Subtarget->hasAVX2()) { + setOperationAction(ISD::SRL, MVT::v2i64, Legal); + setOperationAction(ISD::SRL, MVT::v4i32, Legal); + + setOperationAction(ISD::SHL, MVT::v2i64, Legal); + setOperationAction(ISD::SHL, MVT::v4i32, Legal); + + setOperationAction(ISD::SRA, MVT::v4i32, Legal); + } else { + setOperationAction(ISD::SRL, MVT::v2i64, Custom); + setOperationAction(ISD::SRL, MVT::v4i32, Custom); + + setOperationAction(ISD::SHL, MVT::v2i64, Custom); + setOperationAction(ISD::SHL, MVT::v4i32, Custom); + + setOperationAction(ISD::SRA, MVT::v4i32, Custom); + } } - if (Subtarget->hasSSE42() || Subtarget->hasAVX()) + if (Subtarget->hasSSE42()) setOperationAction(ISD::SETCC, MVT::v2i64, Custom); - if (!UseSoftFloat && Subtarget->hasAVX()) { + if (!TM.Options.UseSoftFloat && Subtarget->hasAVX()) { addRegisterClass(MVT::v32i8, X86::VR256RegisterClass); addRegisterClass(MVT::v16i16, X86::VR256RegisterClass); addRegisterClass(MVT::v8i32, X86::VR256RegisterClass); @@ -1008,18 +1047,14 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i8, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i16, Custom); - setOperationAction(ISD::SRL, MVT::v4i64, Custom); - setOperationAction(ISD::SRL, MVT::v8i32, Custom); setOperationAction(ISD::SRL, MVT::v16i16, Custom); setOperationAction(ISD::SRL, MVT::v32i8, Custom); - setOperationAction(ISD::SHL, MVT::v4i64, Custom); - setOperationAction(ISD::SHL, MVT::v8i32, Custom); setOperationAction(ISD::SHL, MVT::v16i16, Custom); setOperationAction(ISD::SHL, MVT::v32i8, Custom); - setOperationAction(ISD::SRA, MVT::v8i32, Custom); setOperationAction(ISD::SRA, MVT::v16i16, Custom); + setOperationAction(ISD::SRA, MVT::v32i8, Custom); setOperationAction(ISD::SETCC, MVT::v32i8, Custom); setOperationAction(ISD::SETCC, MVT::v16i16, Custom); @@ -1030,25 +1065,60 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::SELECT, MVT::v4i64, Custom); setOperationAction(ISD::SELECT, MVT::v8f32, Custom); - setOperationAction(ISD::VSELECT, MVT::v4f64, Legal); - setOperationAction(ISD::VSELECT, MVT::v4i64, Legal); - setOperationAction(ISD::VSELECT, MVT::v8i32, Legal); - setOperationAction(ISD::VSELECT, MVT::v8f32, Legal); + setOperationAction(ISD::VSELECT, MVT::v4f64, Legal); + setOperationAction(ISD::VSELECT, MVT::v4i64, Legal); + setOperationAction(ISD::VSELECT, MVT::v8i32, Legal); + setOperationAction(ISD::VSELECT, MVT::v8f32, Legal); + + if (Subtarget->hasAVX2()) { + setOperationAction(ISD::ADD, MVT::v4i64, Legal); + 
setOperationAction(ISD::ADD, MVT::v8i32, Legal); + setOperationAction(ISD::ADD, MVT::v16i16, Legal); + setOperationAction(ISD::ADD, MVT::v32i8, Legal); + + setOperationAction(ISD::SUB, MVT::v4i64, Legal); + setOperationAction(ISD::SUB, MVT::v8i32, Legal); + setOperationAction(ISD::SUB, MVT::v16i16, Legal); + setOperationAction(ISD::SUB, MVT::v32i8, Legal); + + setOperationAction(ISD::MUL, MVT::v4i64, Custom); + setOperationAction(ISD::MUL, MVT::v8i32, Legal); + setOperationAction(ISD::MUL, MVT::v16i16, Legal); + // Don't lower v32i8 because there is no 128-bit byte mul + + setOperationAction(ISD::VSELECT, MVT::v32i8, Legal); + + setOperationAction(ISD::SRL, MVT::v4i64, Legal); + setOperationAction(ISD::SRL, MVT::v8i32, Legal); + + setOperationAction(ISD::SHL, MVT::v4i64, Legal); + setOperationAction(ISD::SHL, MVT::v8i32, Legal); + + setOperationAction(ISD::SRA, MVT::v8i32, Legal); + } else { + setOperationAction(ISD::ADD, MVT::v4i64, Custom); + setOperationAction(ISD::ADD, MVT::v8i32, Custom); + setOperationAction(ISD::ADD, MVT::v16i16, Custom); + setOperationAction(ISD::ADD, MVT::v32i8, Custom); + + setOperationAction(ISD::SUB, MVT::v4i64, Custom); + setOperationAction(ISD::SUB, MVT::v8i32, Custom); + setOperationAction(ISD::SUB, MVT::v16i16, Custom); + setOperationAction(ISD::SUB, MVT::v32i8, Custom); + + setOperationAction(ISD::MUL, MVT::v4i64, Custom); + setOperationAction(ISD::MUL, MVT::v8i32, Custom); + setOperationAction(ISD::MUL, MVT::v16i16, Custom); + // Don't lower v32i8 because there is no 128-bit byte mul - setOperationAction(ISD::ADD, MVT::v4i64, Custom); - setOperationAction(ISD::ADD, MVT::v8i32, Custom); - setOperationAction(ISD::ADD, MVT::v16i16, Custom); - setOperationAction(ISD::ADD, MVT::v32i8, Custom); + setOperationAction(ISD::SRL, MVT::v4i64, Custom); + setOperationAction(ISD::SRL, MVT::v8i32, Custom); - setOperationAction(ISD::SUB, MVT::v4i64, Custom); - setOperationAction(ISD::SUB, MVT::v8i32, Custom); - setOperationAction(ISD::SUB, MVT::v16i16, Custom); - setOperationAction(ISD::SUB, MVT::v32i8, Custom); + setOperationAction(ISD::SHL, MVT::v4i64, Custom); + setOperationAction(ISD::SHL, MVT::v8i32, Custom); - setOperationAction(ISD::MUL, MVT::v4i64, Custom); - setOperationAction(ISD::MUL, MVT::v8i32, Custom); - setOperationAction(ISD::MUL, MVT::v16i16, Custom); - // Don't lower v32i8 because there is no 128-bit byte mul + setOperationAction(ISD::SRA, MVT::v8i32, Custom); + } // Custom lower several nodes for 256-bit types. for (unsigned i = (unsigned)MVT::FIRST_VECTOR_VALUETYPE; @@ -1099,7 +1169,8 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) // of this type with custom code. for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE; VT != (unsigned)MVT::LAST_VECTOR_VALUETYPE; VT++) { - setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT, Custom); + setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT, + Custom); } // We want to custom lower some of our intrinsics. 
@@ -1137,7 +1208,6 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) // We have target-specific dag combine patterns for the following nodes: setTargetDAGCombine(ISD::VECTOR_SHUFFLE); setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); - setTargetDAGCombine(ISD::BUILD_VECTOR); setTargetDAGCombine(ISD::VSELECT); setTargetDAGCombine(ISD::SELECT); setTargetDAGCombine(ISD::SHL); @@ -1152,9 +1222,13 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setTargetDAGCombine(ISD::LOAD); setTargetDAGCombine(ISD::STORE); setTargetDAGCombine(ISD::ZERO_EXTEND); + setTargetDAGCombine(ISD::SIGN_EXTEND); + setTargetDAGCombine(ISD::TRUNCATE); setTargetDAGCombine(ISD::SINT_TO_FP); if (Subtarget->is64Bit()) setTargetDAGCombine(ISD::MUL); + if (Subtarget->hasBMI()) + setTargetDAGCombine(ISD::XOR); computeRegisterProperties(); @@ -1166,10 +1240,10 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) maxStoresPerMemcpyOptSize = Subtarget->isTargetDarwin() ? 8 : 4; maxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores maxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 8 : 4; - setPrefLoopAlignment(16); + setPrefLoopAlignment(4); // 2^4 bytes. benefitFromCodePlacementOpt = true; - setPrefFunctionAlignment(4); + setPrefFunctionAlignment(4); // 2^4 bytes. } @@ -1219,7 +1293,7 @@ unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty) const { } unsigned Align = 4; - if (Subtarget->hasXMM()) + if (Subtarget->hasSSE1()) getMaxByValAlign(Ty, Align); return Align; } @@ -1230,7 +1304,7 @@ unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty) const { /// alignment can satisfy any constraint. Similarly if SrcAlign is zero it /// means there isn't a need to check it against alignment requirement, /// probably because the source does not need to be loaded. If -/// 'NonScalarIntSafe' is true, that means it's safe to return a +/// 'IsZeroVal' is true, that means it's safe to return a /// non-scalar-integer type, e.g. empty string source, constant, or loaded /// from memory. 'MemcpyStrSrc' indicates whether the memcpy source is /// constant so it does not need to be loaded. @@ -1239,31 +1313,34 @@ unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty) const { EVT X86TargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign, - bool NonScalarIntSafe, + bool IsZeroVal, bool MemcpyStrSrc, MachineFunction &MF) const { // FIXME: This turns off use of xmm stores for memset/memcpy on targets like // linux. This is because the stack realignment code can't handle certain // cases like PR2962. This should be removed when PR2962 is fixed. const Function *F = MF.getFunction(); - if (NonScalarIntSafe && + if (IsZeroVal && !F->hasFnAttr(Attribute::NoImplicitFloat)) { if (Size >= 16 && (Subtarget->isUnalignedMemAccessFast() || ((DstAlign == 0 || DstAlign >= 16) && (SrcAlign == 0 || SrcAlign >= 16))) && Subtarget->getStackAlignment() >= 16) { - if (Subtarget->hasAVX() && - Subtarget->getStackAlignment() >= 32) - return MVT::v8f32; - if (Subtarget->hasXMMInt()) + if (Subtarget->getStackAlignment() >= 32) { + if (Subtarget->hasAVX2()) + return MVT::v8i32; + if (Subtarget->hasAVX()) + return MVT::v8f32; + } + if (Subtarget->hasSSE2()) return MVT::v4i32; - if (Subtarget->hasXMM()) + if (Subtarget->hasSSE1()) return MVT::v4f32; } else if (!MemcpyStrSrc && Size >= 8 && !Subtarget->is64Bit() && Subtarget->getStackAlignment() >= 8 && - Subtarget->hasXMMInt()) { + Subtarget->hasSSE2()) { // Do not use f64 to lower memcpy if source is string constant. 
It's // better to use i32 to avoid the loads. return MVT::f64; @@ -1428,14 +1505,14 @@ X86TargetLowering::LowerReturn(SDValue Chain, // or SSE or MMX vectors. if ((ValVT == MVT::f32 || ValVT == MVT::f64 || VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) && - (Subtarget->is64Bit() && !Subtarget->hasXMM())) { + (Subtarget->is64Bit() && !Subtarget->hasSSE1())) { report_fatal_error("SSE register return with SSE disabled"); } // Likewise we can't return F64 values with SSE1 only. gcc does so, but // llvm-gcc has never done it right and no one has noticed, so this // should be OK for now. if (ValVT == MVT::f64 && - (Subtarget->is64Bit() && !Subtarget->hasXMMInt())) + (Subtarget->is64Bit() && !Subtarget->hasSSE2())) report_fatal_error("SSE2 register return with SSE2 disabled"); // Returns in ST0/ST1 are handled specially: these are pushed as operands to @@ -1461,7 +1538,7 @@ X86TargetLowering::LowerReturn(SDValue Chain, ValToCopy); // If we don't have SSE2 available, convert to v4f32 so the generated // register is legal. - if (!Subtarget->hasXMMInt()) + if (!Subtarget->hasSSE2()) ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32,ValToCopy); } } @@ -1501,15 +1578,21 @@ X86TargetLowering::LowerReturn(SDValue Chain, MVT::Other, &RetOps[0], RetOps.size()); } -bool X86TargetLowering::isUsedByReturnOnly(SDNode *N) const { +bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const { if (N->getNumValues() != 1) return false; if (!N->hasNUsesOfValue(1, 0)) return false; + SDValue TCChain = Chain; SDNode *Copy = *N->use_begin(); - if (Copy->getOpcode() != ISD::CopyToReg && - Copy->getOpcode() != ISD::FP_EXTEND) + if (Copy->getOpcode() == ISD::CopyToReg) { + // If the copy has a glue operand, we conservatively assume it isn't safe to + // perform a tail call. + if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue) + return false; + TCChain = Copy->getOperand(0); + } else if (Copy->getOpcode() != ISD::FP_EXTEND) return false; bool HasRet = false; @@ -1520,7 +1603,11 @@ bool X86TargetLowering::isUsedByReturnOnly(SDNode *N) const { HasRet = true; } - return HasRet; + if (!HasRet) + return false; + + Chain = TCChain; + return true; } EVT @@ -1561,7 +1648,7 @@ X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, // If this is x86-64, and we disabled SSE, we can't return FP values if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) && - ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasXMM())) { + ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) { report_fatal_error("SSE register return with SSE disabled"); } @@ -1651,7 +1738,7 @@ static bool IsTailCallConvention(CallingConv::ID CC) { } bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const { - if (!CI->isTailCall()) + if (!CI->isTailCall() || getTargetMachine().Options.DisableTailCalls) return false; CallSite CS(CI); @@ -1664,7 +1751,8 @@ bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const { /// FuncIsMadeTailCallSafe - Return true if the function is being made into /// a tailcall target by changing its ABI. -static bool FuncIsMadeTailCallSafe(CallingConv::ID CC) { +static bool FuncIsMadeTailCallSafe(CallingConv::ID CC, + bool GuaranteedTailCallOpt) { return GuaranteedTailCallOpt && IsTailCallConvention(CC); } @@ -1678,7 +1766,8 @@ X86TargetLowering::LowerMemArgument(SDValue Chain, unsigned i) const { // Create the nodes corresponding to a load from this parameter slot. 
ISD::ArgFlagsTy Flags = Ins[i].Flags; - bool AlwaysUseMutable = FuncIsMadeTailCallSafe(CallConv); + bool AlwaysUseMutable = FuncIsMadeTailCallSafe(CallConv, + getTargetMachine().Options.GuaranteedTailCallOpt); bool isImmutable = !AlwaysUseMutable && !Flags.isByVal(); EVT ValVT; @@ -1704,7 +1793,7 @@ X86TargetLowering::LowerMemArgument(SDValue Chain, SDValue FIN = DAG.getFrameIndex(FI, getPointerTy()); return DAG.getLoad(ValVT, dl, Chain, FIN, MachinePointerInfo::getFixedStack(FI), - false, false, 0); + false, false, false, 0); } } @@ -1728,6 +1817,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, MachineFrameInfo *MFI = MF.getFrameInfo(); bool Is64Bit = Subtarget->is64Bit(); + bool IsWindows = Subtarget->isTargetWindows(); bool IsWin64 = Subtarget->isTargetWin64(); assert(!(isVarArg && IsTailCallConvention(CallConv)) && @@ -1758,7 +1848,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, if (VA.isRegLoc()) { EVT RegVT = VA.getLocVT(); - TargetRegisterClass *RC = NULL; + const TargetRegisterClass *RC; if (RegVT == MVT::i32) RC = X86::GR32RegisterClass; else if (Is64Bit && RegVT == MVT::i64) @@ -1807,7 +1897,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, // If value is passed via pointer - do a load. if (VA.getLocInfo() == CCValAssign::Indirect) ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, - MachinePointerInfo(), false, false, 0); + MachinePointerInfo(), false, false, false, 0); InVals.push_back(ArgValue); } @@ -1828,7 +1918,8 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, unsigned StackSize = CCInfo.getNextStackOffset(); // Align stack specially for tail calls. - if (FuncIsMadeTailCallSafe(CallConv)) + if (FuncIsMadeTailCallSafe(CallConv, + MF.getTarget().Options.GuaranteedTailCallOpt)) StackSize = GetAlignedArgumentStackSize(StackSize, DAG); // If the function takes variable number of arguments, make a frame index for @@ -1842,17 +1933,17 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, unsigned TotalNumIntRegs = 0, TotalNumXMMRegs = 0; // FIXME: We should really autogenerate these arrays - static const unsigned GPR64ArgRegsWin64[] = { + static const uint16_t GPR64ArgRegsWin64[] = { X86::RCX, X86::RDX, X86::R8, X86::R9 }; - static const unsigned GPR64ArgRegs64Bit[] = { + static const uint16_t GPR64ArgRegs64Bit[] = { X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9 }; - static const unsigned XMMArgRegs64Bit[] = { + static const uint16_t XMMArgRegs64Bit[] = { X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 }; - const unsigned *GPR64ArgRegs; + const uint16_t *GPR64ArgRegs; unsigned NumXMMRegs = 0; if (IsWin64) { @@ -1865,17 +1956,20 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, TotalNumIntRegs = 6; TotalNumXMMRegs = 8; GPR64ArgRegs = GPR64ArgRegs64Bit; - NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs64Bit, TotalNumXMMRegs); + NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs64Bit, + TotalNumXMMRegs); } unsigned NumIntRegs = CCInfo.getFirstUnallocated(GPR64ArgRegs, TotalNumIntRegs); bool NoImplicitFloatOps = Fn->hasFnAttr(Attribute::NoImplicitFloat); - assert(!(NumXMMRegs && !Subtarget->hasXMM()) && + assert(!(NumXMMRegs && !Subtarget->hasSSE1()) && "SSE register cannot be used when SSE is disabled!"); - assert(!(NumXMMRegs && UseSoftFloat && NoImplicitFloatOps) && + assert(!(NumXMMRegs && MF.getTarget().Options.UseSoftFloat && + NoImplicitFloatOps) && "SSE register cannot be used when SSE is disabled!"); - if (UseSoftFloat || NoImplicitFloatOps || 
!Subtarget->hasXMM()) + if (MF.getTarget().Options.UseSoftFloat || NoImplicitFloatOps || + !Subtarget->hasSSE1()) // Kernel mode asks for SSE to be disabled, so don't push them // on the stack. TotalNumXMMRegs = 0; @@ -1892,8 +1986,8 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex()); } else { // For X86-64, if there are vararg parameters that are passed via - // registers, then we must store them to their spots on the stack so they - // may be loaded by deferencing the result of va_next. + // registers, then we must store them to their spots on the stack so + // they may be loaded by deferencing the result of va_next. FuncInfo->setVarArgsGPOffset(NumIntRegs * 8); FuncInfo->setVarArgsFPOffset(TotalNumIntRegs * 8 + NumXMMRegs * 16); FuncInfo->setRegSaveFrameIndex( @@ -1953,12 +2047,14 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, } // Some CCs need callee pop. - if (X86::isCalleePop(CallConv, Is64Bit, isVarArg, GuaranteedTailCallOpt)) { + if (X86::isCalleePop(CallConv, Is64Bit, isVarArg, + MF.getTarget().Options.GuaranteedTailCallOpt)) { FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything. } else { FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing. // If this is an sret function, the return should pop the hidden pointer. - if (!Is64Bit && !IsTailCallConvention(CallConv) && ArgsAreStructReturn(Ins)) + if (!Is64Bit && !IsTailCallConvention(CallConv) && !IsWindows && + ArgsAreStructReturn(Ins)) FuncInfo->setBytesToPopOnReturn(4); } @@ -2006,7 +2102,7 @@ X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG, // Load the "old" Return address. OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo(), - false, false, 0); + false, false, false, 0); return SDValue(OutRetAddr.getNode(), 1); } @@ -2033,7 +2129,7 @@ EmitTailCallStoreRetAddr(SelectionDAG & DAG, MachineFunction &MF, SDValue X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, CallingConv::ID CallConv, bool isVarArg, - bool &isTailCall, + bool doesNotRet, bool &isTailCall, const SmallVectorImpl<ISD::OutputArg> &Outs, const SmallVectorImpl<SDValue> &OutVals, const SmallVectorImpl<ISD::InputArg> &Ins, @@ -2042,9 +2138,13 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, MachineFunction &MF = DAG.getMachineFunction(); bool Is64Bit = Subtarget->is64Bit(); bool IsWin64 = Subtarget->isTargetWin64(); + bool IsWindows = Subtarget->isTargetWindows(); bool IsStructRet = CallIsStructReturn(Outs); bool IsSibcall = false; + if (MF.getTarget().Options.DisableTailCalls) + isTailCall = false; + if (isTailCall) { // Check if it's really possible to do a tail call. isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, @@ -2053,7 +2153,7 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, // Sibcalls are automatically detected tailcalls which do not require // ABI changes. - if (!GuaranteedTailCallOpt && isTailCall) + if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall) IsSibcall = true; if (isTailCall) @@ -2081,7 +2181,8 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, // This is a sibcall. The memory operands are available in caller's // own caller's stack. 
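// Standalone sketch (not from the patch): how the va_arg register-save-area
// offsets written above (setVarArgsGPOffset / setVarArgsFPOffset) relate to the
// number of GPR/XMM argument registers already consumed by named parameters on
// the SysV x86-64 path (6 integer registers, 8 XMM registers).  Names are
// invented for the example.
constexpr unsigned kTotalIntRegs = 6;   // RDI, RSI, RDX, RCX, R8, R9
constexpr unsigned kTotalXMMRegs = 8;   // XMM0 .. XMM7

constexpr unsigned varArgsGPOffset(unsigned NumIntRegsUsed) {
  return NumIntRegsUsed * 8;                        // first free GPR slot
}
constexpr unsigned varArgsFPOffset(unsigned NumXMMRegsUsed) {
  return kTotalIntRegs * 8 + NumXMMRegsUsed * 16;   // XMM slots follow the GPR area
}

// e.g. printf(const char *fmt, ...) consumes one GPR for 'fmt':
static_assert(varArgsGPOffset(1) == 8, "next integer vararg spills at byte 8");
static_assert(varArgsFPOffset(0) == 48, "XMM area starts after the 6 GPR slots");
static_assert(kTotalIntRegs * 8 + kTotalXMMRegs * 16 == 176,
              "full SysV register save area");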
NumBytes = 0; - else if (GuaranteedTailCallOpt && IsTailCallConvention(CallConv)) + else if (getTargetMachine().Options.GuaranteedTailCallOpt && + IsTailCallConvention(CallConv)) NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG); int FPDiff = 0; @@ -2231,12 +2332,12 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, // registers used and is in the range 0 - 8 inclusive. // Count the number of XMM registers allocated. - static const unsigned XMMArgRegs[] = { + static const uint16_t XMMArgRegs[] = { X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 }; unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8); - assert((Subtarget->hasXMM() || !NumXMMRegs) + assert((Subtarget->hasSSE1() || !NumXMMRegs) && "SSE registers cannot be used when SSE is disabled"); Chain = DAG.getCopyToReg(Chain, dl, X86::AL, @@ -2260,7 +2361,7 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, int FI = 0; // Do not flag preceding copytoreg stuff together with the following stuff. InFlag = SDValue(); - if (GuaranteedTailCallOpt) { + if (getTargetMachine().Options.GuaranteedTailCallOpt) { for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { CCValAssign &VA = ArgLocs[i]; if (VA.isRegLoc()) @@ -2368,7 +2469,7 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, if (ExtraLoad) Callee = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Callee, MachinePointerInfo::getGOT(), - false, false, 0); + false, false, false, 0); } } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) { unsigned char OpFlags = 0; @@ -2421,6 +2522,12 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, if (Is64Bit && isVarArg && !IsWin64) Ops.push_back(DAG.getRegister(X86::AL, MVT::i8)); + // Add a register mask operand representing the call-preserved registers. + const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo(); + const uint32_t *Mask = TRI->getCallPreservedMask(CallConv); + assert(Mask && "Missing call preserved mask for calling convention"); + Ops.push_back(DAG.getRegisterMask(Mask)); + if (InFlag.getNode()) Ops.push_back(InFlag); @@ -2440,12 +2547,15 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, // Create the CALLSEQ_END node. unsigned NumBytesForCalleeToPush; - if (X86::isCalleePop(CallConv, Is64Bit, isVarArg, GuaranteedTailCallOpt)) + if (X86::isCalleePop(CallConv, Is64Bit, isVarArg, + getTargetMachine().Options.GuaranteedTailCallOpt)) NumBytesForCalleeToPush = NumBytes; // Callee pops everything - else if (!Is64Bit && !IsTailCallConvention(CallConv) && IsStructRet) + else if (!Is64Bit && !IsTailCallConvention(CallConv) && !IsWindows && + IsStructRet) // If this is a call to a struct-return function, the callee // pops the hidden struct pointer, so we have to push it back. // This is common for Darwin/X86, Linux & Mingw32 targets. + // For MSVC Win32 targets, the caller pops the hidden struct pointer. NumBytesForCalleeToPush = 4; else NumBytesForCalleeToPush = 0; // Callee pops nothing. 
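// Standalone sketch (not from the patch): the callee-cleanup decision that both
// hunks above now make, including the new IsWindows carve-out.  `CalleePopCC`
// stands in for the X86::isCalleePop() query (e.g. stdcall-style conventions,
// or tail-call conventions under GuaranteedTailCallOpt); everything else is
// named for the example only.
constexpr unsigned bytesCalleePops(bool CalleePopCC, unsigned StackBytes,
                                   bool Is64Bit, bool IsTailCallCC,
                                   bool IsWindows, bool IsStructRet) {
  return CalleePopCC
             ? StackBytes                                   // callee pops everything
             : (!Is64Bit && !IsTailCallCC && !IsWindows && IsStructRet)
                   ? 4u                                     // callee pops the hidden sret pointer
                   : 0u;                                    // caller cleans up
}
static_assert(bytesCalleePops(false, 12, false, false, false, true) == 4,
              "Darwin/Linux/MinGW i386 sret: callee pops the hidden pointer");
static_assert(bytesCalleePops(false, 12, false, false, true, true) == 0,
              "MSVC Win32 sret: the caller pops the hidden pointer instead");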
@@ -2598,7 +2708,7 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, CallingConv::ID CallerCC = CallerF->getCallingConv(); bool CCMatch = CallerCC == CalleeCC; - if (GuaranteedTailCallOpt) { + if (getTargetMachine().Options.GuaranteedTailCallOpt) { if (IsTailCallConvention(CalleeCC) && CCMatch) return true; return false; @@ -2641,9 +2751,9 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, return false; } - // If the call result is in ST0 / ST1, it needs to be popped off the x87 stack. - // Therefore if it's not used by the call it is not safe to optimize this into - // a sibcall. + // If the call result is in ST0 / ST1, it needs to be popped off the x87 + // stack. Therefore, if it's not used by the call it is not safe to optimize + // this into a sibcall. bool Unused = false; for (unsigned i = 0, e = Ins.size(); i != e; ++i) { if (!Ins[i].Used) { @@ -2785,9 +2895,8 @@ static bool isTargetShuffle(unsigned Opcode) { case X86ISD::PSHUFD: case X86ISD::PSHUFHW: case X86ISD::PSHUFLW: - case X86ISD::SHUFPD: + case X86ISD::SHUFP: case X86ISD::PALIGN: - case X86ISD::SHUFPS: case X86ISD::MOVLHPS: case X86ISD::MOVLHPD: case X86ISD::MOVHLPS: @@ -2798,34 +2907,16 @@ static bool isTargetShuffle(unsigned Opcode) { case X86ISD::MOVDDUP: case X86ISD::MOVSS: case X86ISD::MOVSD: - case X86ISD::UNPCKLPS: - case X86ISD::UNPCKLPD: - case X86ISD::VUNPCKLPSY: - case X86ISD::VUNPCKLPDY: - case X86ISD::PUNPCKLWD: - case X86ISD::PUNPCKLBW: - case X86ISD::PUNPCKLDQ: - case X86ISD::PUNPCKLQDQ: - case X86ISD::UNPCKHPS: - case X86ISD::UNPCKHPD: - case X86ISD::VUNPCKHPSY: - case X86ISD::VUNPCKHPDY: - case X86ISD::PUNPCKHWD: - case X86ISD::PUNPCKHBW: - case X86ISD::PUNPCKHDQ: - case X86ISD::PUNPCKHQDQ: - case X86ISD::VPERMILPS: - case X86ISD::VPERMILPSY: - case X86ISD::VPERMILPD: - case X86ISD::VPERMILPDY: - case X86ISD::VPERM2F128: + case X86ISD::UNPCKL: + case X86ISD::UNPCKH: + case X86ISD::VPERMILP: + case X86ISD::VPERM2X128: return true; } - return false; } static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, - SDValue V1, SelectionDAG &DAG) { + SDValue V1, SelectionDAG &DAG) { switch(Opc) { default: llvm_unreachable("Unknown x86 shuffle node"); case X86ISD::MOVSHDUP: @@ -2833,39 +2924,32 @@ static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, case X86ISD::MOVDDUP: return DAG.getNode(Opc, dl, VT, V1); } - - return SDValue(); } static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, - SDValue V1, unsigned TargetMask, SelectionDAG &DAG) { + SDValue V1, unsigned TargetMask, + SelectionDAG &DAG) { switch(Opc) { default: llvm_unreachable("Unknown x86 shuffle node"); case X86ISD::PSHUFD: case X86ISD::PSHUFHW: case X86ISD::PSHUFLW: - case X86ISD::VPERMILPS: - case X86ISD::VPERMILPSY: - case X86ISD::VPERMILPD: - case X86ISD::VPERMILPDY: + case X86ISD::VPERMILP: return DAG.getNode(Opc, dl, VT, V1, DAG.getConstant(TargetMask, MVT::i8)); } - - return SDValue(); } static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, - SDValue V1, SDValue V2, unsigned TargetMask, SelectionDAG &DAG) { + SDValue V1, SDValue V2, unsigned TargetMask, + SelectionDAG &DAG) { switch(Opc) { default: llvm_unreachable("Unknown x86 shuffle node"); case X86ISD::PALIGN: - case X86ISD::SHUFPD: - case X86ISD::SHUFPS: - case X86ISD::VPERM2F128: + case X86ISD::SHUFP: + case X86ISD::VPERM2X128: return DAG.getNode(Opc, dl, VT, V1, V2, DAG.getConstant(TargetMask, MVT::i8)); } - return SDValue(); } static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc 
dl, EVT VT, @@ -2879,25 +2963,10 @@ static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, case X86ISD::MOVLPD: case X86ISD::MOVSS: case X86ISD::MOVSD: - case X86ISD::UNPCKLPS: - case X86ISD::UNPCKLPD: - case X86ISD::VUNPCKLPSY: - case X86ISD::VUNPCKLPDY: - case X86ISD::PUNPCKLWD: - case X86ISD::PUNPCKLBW: - case X86ISD::PUNPCKLDQ: - case X86ISD::PUNPCKLQDQ: - case X86ISD::UNPCKHPS: - case X86ISD::UNPCKHPD: - case X86ISD::VUNPCKHPSY: - case X86ISD::VUNPCKHPDY: - case X86ISD::PUNPCKHWD: - case X86ISD::PUNPCKHBW: - case X86ISD::PUNPCKHDQ: - case X86ISD::PUNPCKHQDQ: + case X86ISD::UNPCKL: + case X86ISD::UNPCKH: return DAG.getNode(Opc, dl, VT, V1, V2); } - return SDValue(); } SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const { @@ -3092,17 +3161,6 @@ static bool isUndefOrInRange(int Val, int Low, int Hi) { return (Val < 0) || (Val >= Low && Val < Hi); } -/// isUndefOrInRange - Return true if every element in Mask, begining -/// from position Pos and ending in Pos+Size, falls within the specified -/// range (L, L+Pos]. or is undef. -static bool isUndefOrInRange(const SmallVectorImpl<int> &Mask, - int Pos, int Size, int Low, int Hi) { - for (int i = Pos, e = Pos+Size; i != e; ++i) - if (!isUndefOrInRange(Mask[i], Low, Hi)) - return false; - return true; -} - /// isUndefOrEqual - Val is either less than zero (undef) or equal to the /// specified value. static bool isUndefOrEqual(int Val, int CmpVal) { @@ -3114,7 +3172,7 @@ static bool isUndefOrEqual(int Val, int CmpVal) { /// isSequentialOrUndefInRange - Return true if every element in Mask, begining /// from position Pos and ending in Pos+Size, falls within the specified /// sequential range (L, L+Pos]. or is undef. -static bool isSequentialOrUndefInRange(const SmallVectorImpl<int> &Mask, +static bool isSequentialOrUndefInRange(ArrayRef<int> Mask, int Pos, int Size, int Low) { for (int i = Pos, e = Pos+Size; i != e; ++i, ++Low) if (!isUndefOrEqual(Mask[i], Low)) @@ -3125,7 +3183,7 @@ static bool isSequentialOrUndefInRange(const SmallVectorImpl<int> &Mask, /// isPSHUFDMask - Return true if the node specifies a shuffle of elements that /// is suitable for input to PSHUFD or PSHUFW. That is, it doesn't reference /// the second operand. -static bool isPSHUFDMask(const SmallVectorImpl<int> &Mask, EVT VT) { +static bool isPSHUFDMask(ArrayRef<int> Mask, EVT VT) { if (VT == MVT::v4f32 || VT == MVT::v4i32 ) return (Mask[0] < 4 && Mask[1] < 4 && Mask[2] < 4 && Mask[3] < 4); if (VT == MVT::v2f64 || VT == MVT::v2i64) @@ -3133,302 +3191,188 @@ static bool isPSHUFDMask(const SmallVectorImpl<int> &Mask, EVT VT) { return false; } -bool X86::isPSHUFDMask(ShuffleVectorSDNode *N) { - SmallVector<int, 8> M; - N->getMask(M); - return ::isPSHUFDMask(M, N->getValueType(0)); -} - /// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that /// is suitable for input to PSHUFHW. -static bool isPSHUFHWMask(const SmallVectorImpl<int> &Mask, EVT VT) { +static bool isPSHUFHWMask(ArrayRef<int> Mask, EVT VT) { if (VT != MVT::v8i16) return false; // Lower quadword copied in order or undef. - for (int i = 0; i != 4; ++i) - if (Mask[i] >= 0 && Mask[i] != i) - return false; + if (!isSequentialOrUndefInRange(Mask, 0, 4, 0)) + return false; // Upper quadword shuffled. 
- for (int i = 4; i != 8; ++i) + for (unsigned i = 4; i != 8; ++i) if (Mask[i] >= 0 && (Mask[i] < 4 || Mask[i] > 7)) return false; return true; } -bool X86::isPSHUFHWMask(ShuffleVectorSDNode *N) { - SmallVector<int, 8> M; - N->getMask(M); - return ::isPSHUFHWMask(M, N->getValueType(0)); -} - /// isPSHUFLWMask - Return true if the node specifies a shuffle of elements that /// is suitable for input to PSHUFLW. -static bool isPSHUFLWMask(const SmallVectorImpl<int> &Mask, EVT VT) { +static bool isPSHUFLWMask(ArrayRef<int> Mask, EVT VT) { if (VT != MVT::v8i16) return false; // Upper quadword copied in order. - for (int i = 4; i != 8; ++i) - if (Mask[i] >= 0 && Mask[i] != i) - return false; + if (!isSequentialOrUndefInRange(Mask, 4, 4, 4)) + return false; // Lower quadword shuffled. - for (int i = 0; i != 4; ++i) + for (unsigned i = 0; i != 4; ++i) if (Mask[i] >= 4) return false; return true; } -bool X86::isPSHUFLWMask(ShuffleVectorSDNode *N) { - SmallVector<int, 8> M; - N->getMask(M); - return ::isPSHUFLWMask(M, N->getValueType(0)); -} - /// isPALIGNRMask - Return true if the node specifies a shuffle of elements that /// is suitable for input to PALIGNR. -static bool isPALIGNRMask(const SmallVectorImpl<int> &Mask, EVT VT, - bool hasSSSE3OrAVX) { - int i, e = VT.getVectorNumElements(); - if (VT.getSizeInBits() != 128 && VT.getSizeInBits() != 64) +static bool isPALIGNRMask(ArrayRef<int> Mask, EVT VT, + const X86Subtarget *Subtarget) { + if ((VT.getSizeInBits() == 128 && !Subtarget->hasSSSE3()) || + (VT.getSizeInBits() == 256 && !Subtarget->hasAVX2())) return false; - // Do not handle v2i64 / v2f64 shuffles with palignr. - if (e < 4 || !hasSSSE3OrAVX) + unsigned NumElts = VT.getVectorNumElements(); + unsigned NumLanes = VT.getSizeInBits()/128; + unsigned NumLaneElts = NumElts/NumLanes; + + // Do not handle 64-bit element shuffles with palignr. + if (NumLaneElts == 2) return false; - for (i = 0; i != e; ++i) - if (Mask[i] >= 0) - break; + for (unsigned l = 0; l != NumElts; l+=NumLaneElts) { + unsigned i; + for (i = 0; i != NumLaneElts; ++i) { + if (Mask[i+l] >= 0) + break; + } - // All undef, not a palignr. - if (i == e) - return false; + // Lane is all undef, go to next lane + if (i == NumLaneElts) + continue; - // Make sure we're shifting in the right direction. - if (Mask[i] <= i) - return false; + int Start = Mask[i+l]; - int s = Mask[i] - i; + // Make sure its in this lane in one of the sources + if (!isUndefOrInRange(Start, l, l+NumLaneElts) && + !isUndefOrInRange(Start, l+NumElts, l+NumElts+NumLaneElts)) + return false; - // Check the rest of the elements to see if they are consecutive. - for (++i; i != e; ++i) { - int m = Mask[i]; - if (m >= 0 && m != s+i) + // If not lane 0, then we must match lane 0 + if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Start, Mask[i]+l)) return false; - } - return true; -} -/// isVSHUFPSYMask - Return true if the specified VECTOR_SHUFFLE operand -/// specifies a shuffle of elements that is suitable for input to 256-bit -/// VSHUFPSY. -static bool isVSHUFPSYMask(const SmallVectorImpl<int> &Mask, EVT VT, - const X86Subtarget *Subtarget) { - int NumElems = VT.getVectorNumElements(); + // Correct second source to be contiguous with first source + if (Start >= (int)NumElts) + Start -= NumElts - NumLaneElts; - if (!Subtarget->hasAVX() || VT.getSizeInBits() != 256) - return false; + // Make sure we're shifting in the right direction. 
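// Standalone sketch (not from the patch): the shapes accepted by the
// isPSHUFDMask/isPSHUFHWMask/isPSHUFLWMask checks above, restated over a plain
// index array (negative entries mean "undef", as in LLVM shuffle masks).
// Function names are invented for the example.
static bool looksLikePSHUFD4(const int M[4]) {             // v4i32 / v4f32
  for (int i = 0; i != 4; ++i)
    if (M[i] >= 4) return false;                           // single-input only
  return true;
}
static bool looksLikePSHUFHW(const int M[8]) {             // v8i16
  for (int i = 0; i != 4; ++i)
    if (M[i] >= 0 && M[i] != i) return false;              // low quadword kept in order
  for (int i = 4; i != 8; ++i)
    if (M[i] >= 0 && (M[i] < 4 || M[i] > 7)) return false; // high quadword permuted in place
  return true;
}
static bool looksLikePSHUFLW(const int M[8]) {             // v8i16, mirror image
  for (int i = 4; i != 8; ++i)
    if (M[i] >= 0 && M[i] != i) return false;              // high quadword kept in order
  for (int i = 0; i != 4; ++i)
    if (M[i] >= 4) return false;                           // low quadword permuted in place
  return true;
}
// Example: <3,2,1,0> is a PSHUFD; <0,1,2,3,7,6,5,4> is a PSHUFHW;
// <0,4,1,5> is neither - it mixes two inputs, which is UNPCKLPS territory.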
+ if (Start <= (int)(i+l)) + return false; - if (NumElems != 8) - return false; + Start -= i; - // VSHUFPSY divides the resulting vector into 4 chunks. - // The sources are also splitted into 4 chunks, and each destination - // chunk must come from a different source chunk. - // - // SRC1 => X7 X6 X5 X4 X3 X2 X1 X0 - // SRC2 => Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y9 - // - // DST => Y7..Y4, Y7..Y4, X7..X4, X7..X4, - // Y3..Y0, Y3..Y0, X3..X0, X3..X0 - // - int QuarterSize = NumElems/4; - int HalfSize = QuarterSize*2; - for (int i = 0; i < QuarterSize; ++i) - if (!isUndefOrInRange(Mask[i], 0, HalfSize)) - return false; - for (int i = QuarterSize; i < QuarterSize*2; ++i) - if (!isUndefOrInRange(Mask[i], NumElems, NumElems+HalfSize)) - return false; + // Check the rest of the elements to see if they are consecutive. + for (++i; i != NumLaneElts; ++i) { + int Idx = Mask[i+l]; - // The mask of the second half must be the same as the first but with - // the appropriate offsets. This works in the same way as VPERMILPS - // works with masks. - for (int i = QuarterSize*2; i < QuarterSize*3; ++i) { - if (!isUndefOrInRange(Mask[i], HalfSize, NumElems)) - return false; - int FstHalfIdx = i-HalfSize; - if (Mask[FstHalfIdx] < 0) - continue; - if (!isUndefOrEqual(Mask[i], Mask[FstHalfIdx]+HalfSize)) - return false; - } - for (int i = QuarterSize*3; i < NumElems; ++i) { - if (!isUndefOrInRange(Mask[i], NumElems+HalfSize, NumElems*2)) - return false; - int FstHalfIdx = i-HalfSize; - if (Mask[FstHalfIdx] < 0) - continue; - if (!isUndefOrEqual(Mask[i], Mask[FstHalfIdx]+HalfSize)) - return false; + // Make sure its in this lane + if (!isUndefOrInRange(Idx, l, l+NumLaneElts) && + !isUndefOrInRange(Idx, l+NumElts, l+NumElts+NumLaneElts)) + return false; + + // If not lane 0, then we must match lane 0 + if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Idx, Mask[i]+l)) + return false; + + if (Idx >= (int)NumElts) + Idx -= NumElts - NumLaneElts; + if (!isUndefOrEqual(Idx, Start+i)) + return false; + + } } return true; } -/// getShuffleVSHUFPSYImmediate - Return the appropriate immediate to shuffle -/// the specified VECTOR_MASK mask with VSHUFPSY instruction. -static unsigned getShuffleVSHUFPSYImmediate(SDNode *N) { - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); - EVT VT = SVOp->getValueType(0); - int NumElems = VT.getVectorNumElements(); - - assert(NumElems == 8 && VT.getSizeInBits() == 256 && - "Only supports v8i32 and v8f32 types"); - - int HalfSize = NumElems/2; - unsigned Mask = 0; - for (int i = 0; i != NumElems ; ++i) { - if (SVOp->getMaskElt(i) < 0) +/// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming +/// the two vector operands have swapped position. +static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask, + unsigned NumElems) { + for (unsigned i = 0; i != NumElems; ++i) { + int idx = Mask[i]; + if (idx < 0) continue; - // The mask of the first half must be equal to the second one. - unsigned Shamt = (i%HalfSize)*2; - unsigned Elt = SVOp->getMaskElt(i) % HalfSize; - Mask |= Elt << Shamt; + else if (idx < (int)NumElems) + Mask[i] = idx + NumElems; + else + Mask[i] = idx - NumElems; } - - return Mask; } -/// isVSHUFPDYMask - Return true if the specified VECTOR_SHUFFLE operand -/// specifies a shuffle of elements that is suitable for input to 256-bit -/// VSHUFPDY. This shuffle doesn't have the same restriction as the PS -/// version and the mask of the second half isn't binded with the first -/// one. 
-static bool isVSHUFPDYMask(const SmallVectorImpl<int> &Mask, EVT VT, - const X86Subtarget *Subtarget) { - int NumElems = VT.getVectorNumElements(); - - if (!Subtarget->hasAVX() || VT.getSizeInBits() != 256) +/// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand +/// specifies a shuffle of elements that is suitable for input to 128/256-bit +/// SHUFPS and SHUFPD. If Commuted is true, then it checks for sources to be +/// reverse of what x86 shuffles want. +static bool isSHUFPMask(ArrayRef<int> Mask, EVT VT, bool HasAVX, + bool Commuted = false) { + if (!HasAVX && VT.getSizeInBits() == 256) return false; - if (NumElems != 4) + unsigned NumElems = VT.getVectorNumElements(); + unsigned NumLanes = VT.getSizeInBits()/128; + unsigned NumLaneElems = NumElems/NumLanes; + + if (NumLaneElems != 2 && NumLaneElems != 4) return false; // VSHUFPSY divides the resulting vector into 4 chunks. // The sources are also splitted into 4 chunks, and each destination // chunk must come from a different source chunk. // + // SRC1 => X7 X6 X5 X4 X3 X2 X1 X0 + // SRC2 => Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y9 + // + // DST => Y7..Y4, Y7..Y4, X7..X4, X7..X4, + // Y3..Y0, Y3..Y0, X3..X0, X3..X0 + // + // VSHUFPDY divides the resulting vector into 4 chunks. + // The sources are also splitted into 4 chunks, and each destination + // chunk must come from a different source chunk. + // // SRC1 => X3 X2 X1 X0 // SRC2 => Y3 Y2 Y1 Y0 // - // DST => Y2..Y3, X2..X3, Y1..Y0, X1..X0 + // DST => Y3..Y2, X3..X2, Y1..Y0, X1..X0 // - int QuarterSize = NumElems/4; - int HalfSize = QuarterSize*2; - for (int i = 0; i < QuarterSize; ++i) - if (!isUndefOrInRange(Mask[i], 0, HalfSize)) - return false; - for (int i = QuarterSize; i < QuarterSize*2; ++i) - if (!isUndefOrInRange(Mask[i], NumElems, NumElems+HalfSize)) - return false; - for (int i = QuarterSize*2; i < QuarterSize*3; ++i) - if (!isUndefOrInRange(Mask[i], HalfSize, NumElems)) - return false; - for (int i = QuarterSize*3; i < NumElems; ++i) - if (!isUndefOrInRange(Mask[i], NumElems+HalfSize, NumElems*2)) - return false; - - return true; -} - -/// getShuffleVSHUFPDYImmediate - Return the appropriate immediate to shuffle -/// the specified VECTOR_MASK mask with VSHUFPDY instruction. -static unsigned getShuffleVSHUFPDYImmediate(SDNode *N) { - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); - EVT VT = SVOp->getValueType(0); - int NumElems = VT.getVectorNumElements(); - - assert(NumElems == 4 && VT.getSizeInBits() == 256 && - "Only supports v4i64 and v4f64 types"); - - int HalfSize = NumElems/2; - unsigned Mask = 0; - for (int i = 0; i != NumElems ; ++i) { - if (SVOp->getMaskElt(i) < 0) - continue; - int Elt = SVOp->getMaskElt(i) % HalfSize; - Mask |= Elt << i; + unsigned HalfLaneElems = NumLaneElems/2; + for (unsigned l = 0; l != NumElems; l += NumLaneElems) { + for (unsigned i = 0; i != NumLaneElems; ++i) { + int Idx = Mask[i+l]; + unsigned RngStart = l + ((Commuted == (i<HalfLaneElems)) ? NumElems : 0); + if (!isUndefOrInRange(Idx, RngStart, RngStart+NumLaneElems)) + return false; + // For VSHUFPSY, the mask of the second half must be the same as the + // first but with the appropriate offsets. This works in the same way as + // VPERMILPS works with masks. + if (NumElems != 8 || l == 0 || Mask[i] < 0) + continue; + if (!isUndefOrEqual(Idx, Mask[i]+l)) + return false; + } } - return Mask; -} - -/// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand -/// specifies a shuffle of elements that is suitable for input to 128-bit -/// SHUFPS and SHUFPD. 
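// Standalone sketch (not from the patch): the 128-bit shape accepted by the
// unified isSHUFPMask above when Commuted is false - for v4f32/v4i32 the low
// two result elements must select from the first source and the high two from
// the second, undef (negative) entries always being fine.  Helper names are
// invented for the example.
static bool undefOrInRange(int Val, int Lo, int Hi) {
  return Val < 0 || (Val >= Lo && Val < Hi);
}
static bool looksLikeShufp128(const int M[4]) {
  return undefOrInRange(M[0], 0, 4) && undefOrInRange(M[1], 0, 4) &&   // from V1
         undefOrInRange(M[2], 4, 8) && undefOrInRange(M[3], 4, 8);     // from V2
}
// Example: <0,1,4,5> (keep V1's low pair, take V2's low pair) is a SHUFPS;
// <4,5,0,1> is the operand-swapped form that the Commuted == true path matches.
// For 256-bit types the same test runs per 128-bit lane, and for VSHUFPSY the
// second lane must repeat the first lane's selection.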
-static bool isSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT) { - int NumElems = VT.getVectorNumElements(); - - if (VT.getSizeInBits() != 128) - return false; - - if (NumElems != 2 && NumElems != 4) - return false; - - int Half = NumElems / 2; - for (int i = 0; i < Half; ++i) - if (!isUndefOrInRange(Mask[i], 0, NumElems)) - return false; - for (int i = Half; i < NumElems; ++i) - if (!isUndefOrInRange(Mask[i], NumElems, NumElems*2)) - return false; - - return true; -} - -bool X86::isSHUFPMask(ShuffleVectorSDNode *N) { - SmallVector<int, 8> M; - N->getMask(M); - return ::isSHUFPMask(M, N->getValueType(0)); -} - -/// isCommutedSHUFP - Returns true if the shuffle mask is exactly -/// the reverse of what x86 shuffles want. x86 shuffles requires the lower -/// half elements to come from vector 1 (which would equal the dest.) and -/// the upper half to come from vector 2. -static bool isCommutedSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT) { - int NumElems = VT.getVectorNumElements(); - - if (NumElems != 2 && NumElems != 4) - return false; - - int Half = NumElems / 2; - for (int i = 0; i < Half; ++i) - if (!isUndefOrInRange(Mask[i], NumElems, NumElems*2)) - return false; - for (int i = Half; i < NumElems; ++i) - if (!isUndefOrInRange(Mask[i], 0, NumElems)) - return false; return true; } -static bool isCommutedSHUFP(ShuffleVectorSDNode *N) { - SmallVector<int, 8> M; - N->getMask(M); - return isCommutedSHUFPMask(M, N->getValueType(0)); -} - /// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand /// specifies a shuffle of elements that is suitable for input to MOVHLPS. -bool X86::isMOVHLPSMask(ShuffleVectorSDNode *N) { - EVT VT = N->getValueType(0); +static bool isMOVHLPSMask(ArrayRef<int> Mask, EVT VT) { unsigned NumElems = VT.getVectorNumElements(); if (VT.getSizeInBits() != 128) @@ -3438,17 +3382,16 @@ bool X86::isMOVHLPSMask(ShuffleVectorSDNode *N) { return false; // Expect bit0 == 6, bit1 == 7, bit2 == 2, bit3 == 3 - return isUndefOrEqual(N->getMaskElt(0), 6) && - isUndefOrEqual(N->getMaskElt(1), 7) && - isUndefOrEqual(N->getMaskElt(2), 2) && - isUndefOrEqual(N->getMaskElt(3), 3); + return isUndefOrEqual(Mask[0], 6) && + isUndefOrEqual(Mask[1], 7) && + isUndefOrEqual(Mask[2], 2) && + isUndefOrEqual(Mask[3], 3); } /// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form /// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef, /// <2, 3, 2, 3> -bool X86::isMOVHLPS_v_undef_Mask(ShuffleVectorSDNode *N) { - EVT VT = N->getValueType(0); +static bool isMOVHLPS_v_undef_Mask(ArrayRef<int> Mask, EVT VT) { unsigned NumElems = VT.getVectorNumElements(); if (VT.getSizeInBits() != 128) @@ -3457,26 +3400,29 @@ bool X86::isMOVHLPS_v_undef_Mask(ShuffleVectorSDNode *N) { if (NumElems != 4) return false; - return isUndefOrEqual(N->getMaskElt(0), 2) && - isUndefOrEqual(N->getMaskElt(1), 3) && - isUndefOrEqual(N->getMaskElt(2), 2) && - isUndefOrEqual(N->getMaskElt(3), 3); + return isUndefOrEqual(Mask[0], 2) && + isUndefOrEqual(Mask[1], 3) && + isUndefOrEqual(Mask[2], 2) && + isUndefOrEqual(Mask[3], 3); } /// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand /// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}. 
-bool X86::isMOVLPMask(ShuffleVectorSDNode *N) { - unsigned NumElems = N->getValueType(0).getVectorNumElements(); +static bool isMOVLPMask(ArrayRef<int> Mask, EVT VT) { + if (VT.getSizeInBits() != 128) + return false; + + unsigned NumElems = VT.getVectorNumElements(); if (NumElems != 2 && NumElems != 4) return false; - for (unsigned i = 0; i < NumElems/2; ++i) - if (!isUndefOrEqual(N->getMaskElt(i), i + NumElems)) + for (unsigned i = 0; i != NumElems/2; ++i) + if (!isUndefOrEqual(Mask[i], i + NumElems)) return false; - for (unsigned i = NumElems/2; i < NumElems; ++i) - if (!isUndefOrEqual(N->getMaskElt(i), i)) + for (unsigned i = NumElems/2; i != NumElems; ++i) + if (!isUndefOrEqual(Mask[i], i)) return false; return true; @@ -3484,19 +3430,19 @@ bool X86::isMOVLPMask(ShuffleVectorSDNode *N) { /// isMOVLHPSMask - Return true if the specified VECTOR_SHUFFLE operand /// specifies a shuffle of elements that is suitable for input to MOVLHPS. -bool X86::isMOVLHPSMask(ShuffleVectorSDNode *N) { - unsigned NumElems = N->getValueType(0).getVectorNumElements(); +static bool isMOVLHPSMask(ArrayRef<int> Mask, EVT VT) { + unsigned NumElems = VT.getVectorNumElements(); if ((NumElems != 2 && NumElems != 4) - || N->getValueType(0).getSizeInBits() > 128) + || VT.getSizeInBits() > 128) return false; - for (unsigned i = 0; i < NumElems/2; ++i) - if (!isUndefOrEqual(N->getMaskElt(i), i)) + for (unsigned i = 0; i != NumElems/2; ++i) + if (!isUndefOrEqual(Mask[i], i)) return false; - for (unsigned i = 0; i < NumElems/2; ++i) - if (!isUndefOrEqual(N->getMaskElt(i + NumElems/2), i + NumElems)) + for (unsigned i = 0; i != NumElems/2; ++i) + if (!isUndefOrEqual(Mask[i + NumElems/2], i + NumElems)) return false; return true; @@ -3504,14 +3450,15 @@ bool X86::isMOVLHPSMask(ShuffleVectorSDNode *N) { /// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand /// specifies a shuffle of elements that is suitable for input to UNPCKL. -static bool isUNPCKLMask(const SmallVectorImpl<int> &Mask, EVT VT, - bool V2IsSplat = false) { - int NumElts = VT.getVectorNumElements(); +static bool isUNPCKLMask(ArrayRef<int> Mask, EVT VT, + bool HasAVX2, bool V2IsSplat = false) { + unsigned NumElts = VT.getVectorNumElements(); assert((VT.is128BitVector() || VT.is256BitVector()) && "Unsupported vector type for unpckh"); - if (VT.getSizeInBits() == 256 && NumElts != 4 && NumElts != 8) + if (VT.getSizeInBits() == 256 && NumElts != 4 && NumElts != 8 && + (!HasAVX2 || (NumElts != 16 && NumElts != 32))) return false; // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate @@ -3519,11 +3466,9 @@ static bool isUNPCKLMask(const SmallVectorImpl<int> &Mask, EVT VT, unsigned NumLanes = VT.getSizeInBits()/128; unsigned NumLaneElts = NumElts/NumLanes; - unsigned Start = 0; - unsigned End = NumLaneElts; - for (unsigned s = 0; s < NumLanes; ++s) { - for (unsigned i = Start, j = s * NumLaneElts; - i != End; + for (unsigned l = 0; l != NumLanes; ++l) { + for (unsigned i = l*NumLaneElts, j = l*NumLaneElts; + i != (l+1)*NumLaneElts; i += 2, ++j) { int BitI = Mask[i]; int BitI1 = Mask[i+1]; @@ -3537,30 +3482,22 @@ static bool isUNPCKLMask(const SmallVectorImpl<int> &Mask, EVT VT, return false; } } - // Process the next 128 bits. 
- Start += NumLaneElts; - End += NumLaneElts; } return true; } -bool X86::isUNPCKLMask(ShuffleVectorSDNode *N, bool V2IsSplat) { - SmallVector<int, 8> M; - N->getMask(M); - return ::isUNPCKLMask(M, N->getValueType(0), V2IsSplat); -} - /// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand /// specifies a shuffle of elements that is suitable for input to UNPCKH. -static bool isUNPCKHMask(const SmallVectorImpl<int> &Mask, EVT VT, - bool V2IsSplat = false) { - int NumElts = VT.getVectorNumElements(); +static bool isUNPCKHMask(ArrayRef<int> Mask, EVT VT, + bool HasAVX2, bool V2IsSplat = false) { + unsigned NumElts = VT.getVectorNumElements(); assert((VT.is128BitVector() || VT.is256BitVector()) && "Unsupported vector type for unpckh"); - if (VT.getSizeInBits() == 256 && NumElts != 4 && NumElts != 8) + if (VT.getSizeInBits() == 256 && NumElts != 4 && NumElts != 8 && + (!HasAVX2 || (NumElts != 16 && NumElts != 32))) return false; // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate @@ -3568,11 +3505,9 @@ static bool isUNPCKHMask(const SmallVectorImpl<int> &Mask, EVT VT, unsigned NumLanes = VT.getSizeInBits()/128; unsigned NumLaneElts = NumElts/NumLanes; - unsigned Start = 0; - unsigned End = NumLaneElts; for (unsigned l = 0; l != NumLanes; ++l) { - for (unsigned i = Start, j = (l*NumLaneElts)+NumLaneElts/2; - i != End; i += 2, ++j) { + for (unsigned i = l*NumLaneElts, j = (l*NumLaneElts)+NumLaneElts/2; + i != (l+1)*NumLaneElts; i += 2, ++j) { int BitI = Mask[i]; int BitI1 = Mask[i+1]; if (!isUndefOrEqual(BitI, j)) @@ -3585,42 +3520,39 @@ static bool isUNPCKHMask(const SmallVectorImpl<int> &Mask, EVT VT, return false; } } - // Process the next 128 bits. - Start += NumLaneElts; - End += NumLaneElts; } return true; } -bool X86::isUNPCKHMask(ShuffleVectorSDNode *N, bool V2IsSplat) { - SmallVector<int, 8> M; - N->getMask(M); - return ::isUNPCKHMask(M, N->getValueType(0), V2IsSplat); -} - /// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form /// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef, /// <0, 0, 1, 1> -static bool isUNPCKL_v_undef_Mask(const SmallVectorImpl<int> &Mask, EVT VT) { - int NumElems = VT.getVectorNumElements(); - if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16) +static bool isUNPCKL_v_undef_Mask(ArrayRef<int> Mask, EVT VT, + bool HasAVX2) { + unsigned NumElts = VT.getVectorNumElements(); + + assert((VT.is128BitVector() || VT.is256BitVector()) && + "Unsupported vector type for unpckh"); + + if (VT.getSizeInBits() == 256 && NumElts != 4 && NumElts != 8 && + (!HasAVX2 || (NumElts != 16 && NumElts != 32))) return false; // For 256-bit i64/f64, use MOVDDUPY instead, so reject the matching pattern // FIXME: Need a better way to get rid of this, there's no latency difference // between UNPCKLPD and MOVDDUP, the later should always be checked first and // the former later. We should also remove the "_undef" special mask. - if (NumElems == 4 && VT.getSizeInBits() == 256) + if (NumElts == 4 && VT.getSizeInBits() == 256) return false; // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate // independently on 128-bit lanes. 
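// Standalone sketch (not from the patch): the interleave pattern the rewritten
// isUNPCKLMask/isUNPCKHMask accept, expressed per 128-bit lane so one routine
// covers SSE, AVX and (with HasAVX2) the 256-bit integer types.  Names are
// invented for the example.
#include <cassert>

// Expected UNPCKL mask: within lane l of width LaneElts, result pairs are
// (j, j + NumElts) for j = l, l+1, ...  (second-source indices are offset by
// NumElts in LLVM shuffle masks).
static void buildUnpackLoMask(int NumElts, int LaneElts, int *Out) {
  for (int l = 0; l < NumElts; l += LaneElts)
    for (int i = l, j = l; i < l + LaneElts; i += 2, ++j) {
      Out[i]     = j;             // element j of the first source
      Out[i + 1] = j + NumElts;   // element j of the second source
    }
}

static void unpackMaskExamples() {
  int M4[4], M8[8];
  buildUnpackLoMask(4, 4, M4);    // v4i32 UNPCKLPS / PUNPCKLDQ: <0,4,1,5>
  assert(M4[0] == 0 && M4[1] == 4 && M4[2] == 1 && M4[3] == 5);
  buildUnpackLoMask(8, 4, M8);    // v8i32 AVX2, lane-local: <0,8,1,9, 4,12,5,13>
  const int Want[8] = {0, 8, 1, 9, 4, 12, 5, 13};
  for (int i = 0; i != 8; ++i)
    assert(M8[i] == Want[i]);
  // UNPCKH uses the same pattern with j starting at the middle of each lane:
  // v4i32 -> <2,6,3,7>, v8i32 -> <2,10,3,11, 6,14,7,15>.
}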
- unsigned NumLanes = VT.getSizeInBits() / 128; - unsigned NumLaneElts = NumElems / NumLanes; + unsigned NumLanes = VT.getSizeInBits()/128; + unsigned NumLaneElts = NumElts/NumLanes; - for (unsigned s = 0; s < NumLanes; ++s) { - for (unsigned i = s * NumLaneElts, j = s * NumLaneElts; - i != NumLaneElts * (s + 1); + for (unsigned l = 0; l != NumLanes; ++l) { + for (unsigned i = l*NumLaneElts, j = l*NumLaneElts; + i != (l+1)*NumLaneElts; i += 2, ++j) { int BitI = Mask[i]; int BitI1 = Mask[i+1]; @@ -3635,81 +3567,77 @@ static bool isUNPCKL_v_undef_Mask(const SmallVectorImpl<int> &Mask, EVT VT) { return true; } -bool X86::isUNPCKL_v_undef_Mask(ShuffleVectorSDNode *N) { - SmallVector<int, 8> M; - N->getMask(M); - return ::isUNPCKL_v_undef_Mask(M, N->getValueType(0)); -} - /// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form /// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef, /// <2, 2, 3, 3> -static bool isUNPCKH_v_undef_Mask(const SmallVectorImpl<int> &Mask, EVT VT) { - int NumElems = VT.getVectorNumElements(); - if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16) +static bool isUNPCKH_v_undef_Mask(ArrayRef<int> Mask, EVT VT, bool HasAVX2) { + unsigned NumElts = VT.getVectorNumElements(); + + assert((VT.is128BitVector() || VT.is256BitVector()) && + "Unsupported vector type for unpckh"); + + if (VT.getSizeInBits() == 256 && NumElts != 4 && NumElts != 8 && + (!HasAVX2 || (NumElts != 16 && NumElts != 32))) return false; - for (int i = 0, j = NumElems / 2; i != NumElems; i += 2, ++j) { - int BitI = Mask[i]; - int BitI1 = Mask[i+1]; - if (!isUndefOrEqual(BitI, j)) - return false; - if (!isUndefOrEqual(BitI1, j)) - return false; + // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate + // independently on 128-bit lanes. + unsigned NumLanes = VT.getSizeInBits()/128; + unsigned NumLaneElts = NumElts/NumLanes; + + for (unsigned l = 0; l != NumLanes; ++l) { + for (unsigned i = l*NumLaneElts, j = (l*NumLaneElts)+NumLaneElts/2; + i != (l+1)*NumLaneElts; i += 2, ++j) { + int BitI = Mask[i]; + int BitI1 = Mask[i+1]; + if (!isUndefOrEqual(BitI, j)) + return false; + if (!isUndefOrEqual(BitI1, j)) + return false; + } } return true; } -bool X86::isUNPCKH_v_undef_Mask(ShuffleVectorSDNode *N) { - SmallVector<int, 8> M; - N->getMask(M); - return ::isUNPCKH_v_undef_Mask(M, N->getValueType(0)); -} - /// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand /// specifies a shuffle of elements that is suitable for input to MOVSS, /// MOVSD, and MOVD, i.e. setting the lowest element. -static bool isMOVLMask(const SmallVectorImpl<int> &Mask, EVT VT) { +static bool isMOVLMask(ArrayRef<int> Mask, EVT VT) { if (VT.getVectorElementType().getSizeInBits() < 32) return false; + if (VT.getSizeInBits() == 256) + return false; - int NumElts = VT.getVectorNumElements(); + unsigned NumElts = VT.getVectorNumElements(); if (!isUndefOrEqual(Mask[0], NumElts)) return false; - for (int i = 1; i < NumElts; ++i) + for (unsigned i = 1; i != NumElts; ++i) if (!isUndefOrEqual(Mask[i], i)) return false; return true; } -bool X86::isMOVLMask(ShuffleVectorSDNode *N) { - SmallVector<int, 8> M; - N->getMask(M); - return ::isMOVLMask(M, N->getValueType(0)); -} - -/// isVPERM2F128Mask - Match 256-bit shuffles where the elements are considered +/// isVPERM2X128Mask - Match 256-bit shuffles where the elements are considered /// as permutations between 128-bit chunks or halves. 
As an example: this /// shuffle bellow: /// vector_shuffle <4, 5, 6, 7, 12, 13, 14, 15> /// The first half comes from the second half of V1 and the second half from the /// the second half of V2. -static bool isVPERM2F128Mask(const SmallVectorImpl<int> &Mask, EVT VT, - const X86Subtarget *Subtarget) { - if (!Subtarget->hasAVX() || VT.getSizeInBits() != 256) +static bool isVPERM2X128Mask(ArrayRef<int> Mask, EVT VT, bool HasAVX) { + if (!HasAVX || VT.getSizeInBits() != 256) return false; // The shuffle result is divided into half A and half B. In total the two // sources have 4 halves, namely: C, D, E, F. The final values of A and // B must come from C, D, E or F. - int HalfSize = VT.getVectorNumElements()/2; + unsigned HalfSize = VT.getVectorNumElements()/2; bool MatchA = false, MatchB = false; // Check if A comes from one of C, D, E, F. - for (int Half = 0; Half < 4; ++Half) { + for (unsigned Half = 0; Half != 4; ++Half) { if (isSequentialOrUndefInRange(Mask, 0, HalfSize, Half*HalfSize)) { MatchA = true; break; @@ -3717,7 +3645,7 @@ static bool isVPERM2F128Mask(const SmallVectorImpl<int> &Mask, EVT VT, } // Check if B comes from one of C, D, E, F. - for (int Half = 0; Half < 4; ++Half) { + for (unsigned Half = 0; Half != 4; ++Half) { if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, Half*HalfSize)) { MatchB = true; break; @@ -3727,22 +3655,21 @@ static bool isVPERM2F128Mask(const SmallVectorImpl<int> &Mask, EVT VT, return MatchA && MatchB; } -/// getShuffleVPERM2F128Immediate - Return the appropriate immediate to shuffle -/// the specified VECTOR_MASK mask with VPERM2F128 instructions. -static unsigned getShuffleVPERM2F128Immediate(SDNode *N) { - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); +/// getShuffleVPERM2X128Immediate - Return the appropriate immediate to shuffle +/// the specified VECTOR_MASK mask with VPERM2F128/VPERM2I128 instructions. +static unsigned getShuffleVPERM2X128Immediate(ShuffleVectorSDNode *SVOp) { EVT VT = SVOp->getValueType(0); - int HalfSize = VT.getVectorNumElements()/2; + unsigned HalfSize = VT.getVectorNumElements()/2; - int FstHalf = 0, SndHalf = 0; - for (int i = 0; i < HalfSize; ++i) { + unsigned FstHalf = 0, SndHalf = 0; + for (unsigned i = 0; i < HalfSize; ++i) { if (SVOp->getMaskElt(i) > 0) { FstHalf = SVOp->getMaskElt(i)/HalfSize; break; } } - for (int i = HalfSize; i < HalfSize*2; ++i) { + for (unsigned i = HalfSize; i < HalfSize*2; ++i) { if (SVOp->getMaskElt(i) > 0) { SndHalf = SVOp->getMaskElt(i)/HalfSize; break; @@ -3752,141 +3679,56 @@ static unsigned getShuffleVPERM2F128Immediate(SDNode *N) { return (FstHalf | (SndHalf << 4)); } -/// isVPERMILPDMask - Return true if the specified VECTOR_SHUFFLE operand +/// isVPERMILPMask - Return true if the specified VECTOR_SHUFFLE operand /// specifies a shuffle of elements that is suitable for input to VPERMILPD*. /// Note that VPERMIL mask matching is different depending whether theunderlying /// type is 32 or 64. In the VPERMILPS the high half of the mask should point /// to the same elements of the low, but to the higher half of the source. /// In VPERMILPD the two lanes could be shuffled independently of each other -/// with the same restriction that lanes can't be crossed. 
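// Standalone sketch (not from the patch): how getShuffleVPERM2X128Immediate
// encodes its 8-bit control.  Each nibble picks one of the four 128-bit halves
// C, D, E, F of the two sources: 0 = V1.lo, 1 = V1.hi, 2 = V2.lo, 3 = V2.hi.
// The helper name is invented for the example.
constexpr unsigned vperm2x128Imm(unsigned LoHalfSel, unsigned HiHalfSel) {
  return LoHalfSel | (HiHalfSel << 4);
}
// The example from the comment above, v8i32 mask <4,5,6,7, 12,13,14,15>:
// low result half = V1's high half (sel 1), high result half = V2's high half (sel 3).
static_assert(vperm2x128Imm(1, 3) == 0x31, "VPERM2F128/VPERM2I128 $0x31");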
-static bool isVPERMILPDMask(const SmallVectorImpl<int> &Mask, EVT VT, - const X86Subtarget *Subtarget) { - int NumElts = VT.getVectorNumElements(); - int NumLanes = VT.getSizeInBits()/128; - - if (!Subtarget->hasAVX()) - return false; - - // Only match 256-bit with 64-bit types - if (VT.getSizeInBits() != 256 || NumElts != 4) +/// with the same restriction that lanes can't be crossed. Also handles PSHUFDY. +static bool isVPERMILPMask(ArrayRef<int> Mask, EVT VT, bool HasAVX) { + if (!HasAVX) return false; - // The mask on the high lane is independent of the low. Both can match - // any element in inside its own lane, but can't cross. - int LaneSize = NumElts/NumLanes; - for (int l = 0; l < NumLanes; ++l) - for (int i = l*LaneSize; i < LaneSize*(l+1); ++i) { - int LaneStart = l*LaneSize; - if (!isUndefOrInRange(Mask[i], LaneStart, LaneStart+LaneSize)) - return false; - } - - return true; -} - -/// isVPERMILPSMask - Return true if the specified VECTOR_SHUFFLE operand -/// specifies a shuffle of elements that is suitable for input to VPERMILPS*. -/// Note that VPERMIL mask matching is different depending whether theunderlying -/// type is 32 or 64. In the VPERMILPS the high half of the mask should point -/// to the same elements of the low, but to the higher half of the source. -/// In VPERMILPD the two lanes could be shuffled independently of each other -/// with the same restriction that lanes can't be crossed. -static bool isVPERMILPSMask(const SmallVectorImpl<int> &Mask, EVT VT, - const X86Subtarget *Subtarget) { unsigned NumElts = VT.getVectorNumElements(); - unsigned NumLanes = VT.getSizeInBits()/128; - - if (!Subtarget->hasAVX()) + // Only match 256-bit with 32/64-bit types + if (VT.getSizeInBits() != 256 || (NumElts != 4 && NumElts != 8)) return false; - // Only match 256-bit with 32-bit types - if (VT.getSizeInBits() != 256 || NumElts != 8) - return false; - - // The mask on the high lane should be the same as the low. Actually, - // they can differ if any of the corresponding index in a lane is undef - // and the other stays in range. - int LaneSize = NumElts/NumLanes; - for (int i = 0; i < LaneSize; ++i) { - int HighElt = i+LaneSize; - bool HighValid = isUndefOrInRange(Mask[HighElt], LaneSize, NumElts); - bool LowValid = isUndefOrInRange(Mask[i], 0, LaneSize); - - if (!HighValid || !LowValid) - return false; - if (Mask[i] < 0 || Mask[HighElt] < 0) - continue; - if (Mask[HighElt]-Mask[i] != LaneSize) - return false; - } - - return true; -} - -/// getShuffleVPERMILPSImmediate - Return the appropriate immediate to shuffle -/// the specified VECTOR_MASK mask with VPERMILPS* instructions. -static unsigned getShuffleVPERMILPSImmediate(SDNode *N) { - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); - EVT VT = SVOp->getValueType(0); - - int NumElts = VT.getVectorNumElements(); - int NumLanes = VT.getSizeInBits()/128; - int LaneSize = NumElts/NumLanes; - - // Although the mask is equal for both lanes do it twice to get the cases - // where a mask will match because the same mask element is undef on the - // first half but valid on the second. This would get pathological cases - // such as: shuffle <u, 0, 1, 2, 4, 4, 5, 6>, which is completely valid. 
- unsigned Mask = 0; - for (int l = 0; l < NumLanes; ++l) { - for (int i = 0; i < LaneSize; ++i) { - int MaskElt = SVOp->getMaskElt(i+(l*LaneSize)); - if (MaskElt < 0) + unsigned NumLanes = VT.getSizeInBits()/128; + unsigned LaneSize = NumElts/NumLanes; + for (unsigned l = 0; l != NumElts; l += LaneSize) { + for (unsigned i = 0; i != LaneSize; ++i) { + if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize)) + return false; + if (NumElts != 8 || l == 0) continue; - if (MaskElt >= LaneSize) - MaskElt -= LaneSize; - Mask |= MaskElt << (i*2); - } - } - - return Mask; -} - -/// getShuffleVPERMILPDImmediate - Return the appropriate immediate to shuffle -/// the specified VECTOR_MASK mask with VPERMILPD* instructions. -static unsigned getShuffleVPERMILPDImmediate(SDNode *N) { - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); - EVT VT = SVOp->getValueType(0); - - int NumElts = VT.getVectorNumElements(); - int NumLanes = VT.getSizeInBits()/128; - - unsigned Mask = 0; - int LaneSize = NumElts/NumLanes; - for (int l = 0; l < NumLanes; ++l) - for (int i = l*LaneSize; i < LaneSize*(l+1); ++i) { - int MaskElt = SVOp->getMaskElt(i); - if (MaskElt < 0) + // VPERMILPS handling + if (Mask[i] < 0) continue; - Mask |= (MaskElt-l*LaneSize) << i; + if (!isUndefOrEqual(Mask[i+l], Mask[i]+l)) + return false; } + } - return Mask; + return true; } -/// isCommutedMOVL - Returns true if the shuffle mask is except the reverse +/// isCommutedMOVLMask - Returns true if the shuffle mask is except the reverse /// of what x86 movss want. X86 movs requires the lowest element to be lowest /// element of vector 2 and the other elements to come from vector 1 in order. -static bool isCommutedMOVLMask(const SmallVectorImpl<int> &Mask, EVT VT, +static bool isCommutedMOVLMask(ArrayRef<int> Mask, EVT VT, bool V2IsSplat = false, bool V2IsUndef = false) { - int NumOps = VT.getVectorNumElements(); + unsigned NumOps = VT.getVectorNumElements(); + if (VT.getSizeInBits() == 256) + return false; if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16) return false; if (!isUndefOrEqual(Mask[0], 0)) return false; - for (int i = 1; i < NumOps; ++i) + for (unsigned i = 1; i != NumOps; ++i) if (!(isUndefOrEqual(Mask[i], i+NumOps) || (V2IsUndef && isUndefOrInRange(Mask[i], NumOps, NumOps*2)) || (V2IsSplat && isUndefOrEqual(Mask[i], NumOps)))) @@ -3895,26 +3737,14 @@ static bool isCommutedMOVLMask(const SmallVectorImpl<int> &Mask, EVT VT, return true; } -static bool isCommutedMOVL(ShuffleVectorSDNode *N, bool V2IsSplat = false, - bool V2IsUndef = false) { - SmallVector<int, 8> M; - N->getMask(M); - return isCommutedMOVLMask(M, N->getValueType(0), V2IsSplat, V2IsUndef); -} - /// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand /// specifies a shuffle of elements that is suitable for input to MOVSHDUP. 
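// Standalone sketch (not from the patch): what the merged isVPERMILPMask above
// requires of a v8f32 mask - every index stays inside its own 128-bit lane, and
// (for the PS flavour) the high lane repeats the low lane's pattern so a single
// 2-bits-per-element immediate can drive both lanes.  The helper name is
// invented for the example.
constexpr unsigned vpermilpsImm(unsigned S0, unsigned S1, unsigned S2, unsigned S3) {
  return S0 | (S1 << 2) | (S2 << 4) | (S3 << 6);   // in-lane selectors, 0..3 each
}
// Mask <1,0,3,2, 5,4,7,6> swaps adjacent pairs in both lanes and is accepted;
// its immediate is the familiar pair-swap byte:
static_assert(vpermilpsImm(1, 0, 3, 2) == 0xB1, "VPERMILPS $0xB1");
// Mask <1,0,3,2, 4,5,6,7> is rejected: lane 1 does not repeat lane 0's pattern.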
/// Masks to match: <1, 1, 3, 3> or <1, 1, 3, 3, 5, 5, 7, 7> -bool X86::isMOVSHDUPMask(ShuffleVectorSDNode *N, - const X86Subtarget *Subtarget) { - if (!Subtarget->hasSSE3() && !Subtarget->hasAVX()) - return false; - - // The second vector must be undef - if (N->getOperand(1).getOpcode() != ISD::UNDEF) +static bool isMOVSHDUPMask(ArrayRef<int> Mask, EVT VT, + const X86Subtarget *Subtarget) { + if (!Subtarget->hasSSE3()) return false; - EVT VT = N->getValueType(0); unsigned NumElems = VT.getVectorNumElements(); if ((VT.getSizeInBits() == 128 && NumElems != 4) || @@ -3922,9 +3752,9 @@ bool X86::isMOVSHDUPMask(ShuffleVectorSDNode *N, return false; // "i+1" is the value the indexed mask element must have - for (unsigned i = 0; i < NumElems; i += 2) - if (!isUndefOrEqual(N->getMaskElt(i), i+1) || - !isUndefOrEqual(N->getMaskElt(i+1), i+1)) + for (unsigned i = 0; i != NumElems; i += 2) + if (!isUndefOrEqual(Mask[i], i+1) || + !isUndefOrEqual(Mask[i+1], i+1)) return false; return true; @@ -3933,16 +3763,11 @@ bool X86::isMOVSHDUPMask(ShuffleVectorSDNode *N, /// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand /// specifies a shuffle of elements that is suitable for input to MOVSLDUP. /// Masks to match: <0, 0, 2, 2> or <0, 0, 2, 2, 4, 4, 6, 6> -bool X86::isMOVSLDUPMask(ShuffleVectorSDNode *N, - const X86Subtarget *Subtarget) { - if (!Subtarget->hasSSE3() && !Subtarget->hasAVX()) - return false; - - // The second vector must be undef - if (N->getOperand(1).getOpcode() != ISD::UNDEF) +static bool isMOVSLDUPMask(ArrayRef<int> Mask, EVT VT, + const X86Subtarget *Subtarget) { + if (!Subtarget->hasSSE3()) return false; - EVT VT = N->getValueType(0); unsigned NumElems = VT.getVectorNumElements(); if ((VT.getSizeInBits() == 128 && NumElems != 4) || @@ -3950,9 +3775,9 @@ bool X86::isMOVSLDUPMask(ShuffleVectorSDNode *N, return false; // "i" is the value the indexed mask element must have - for (unsigned i = 0; i < NumElems; i += 2) - if (!isUndefOrEqual(N->getMaskElt(i), i) || - !isUndefOrEqual(N->getMaskElt(i+1), i)) + for (unsigned i = 0; i != NumElems; i += 2) + if (!isUndefOrEqual(Mask[i], i) || + !isUndefOrEqual(Mask[i+1], i)) return false; return true; @@ -3961,21 +3786,17 @@ bool X86::isMOVSLDUPMask(ShuffleVectorSDNode *N, /// isMOVDDUPYMask - Return true if the specified VECTOR_SHUFFLE operand /// specifies a shuffle of elements that is suitable for input to 256-bit /// version of MOVDDUP. 
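// Standalone sketch (not from the patch): the only mask shapes the rewritten
// isMOVSHDUPMask/isMOVSLDUPMask accept (modulo undef entries, which the real
// code tolerates via isUndefOrEqual) - duplicate the odd, respectively the
// even, elements of a single source.  Names are invented for the example.
static bool looksLikeMovShDup(const int *M, int NumElems) {   // 4 or 8 elements
  for (int i = 0; i < NumElems; i += 2)
    if (M[i] != i + 1 || M[i + 1] != i + 1)
      return false;                      // expect <1,1,3,3> / <1,1,3,3,5,5,7,7>
  return true;
}
static bool looksLikeMovSlDup(const int *M, int NumElems) {
  for (int i = 0; i < NumElems; i += 2)
    if (M[i] != i || M[i + 1] != i)
      return false;                      // expect <0,0,2,2> / <0,0,2,2,4,4,6,6>
  return true;
}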
-static bool isMOVDDUPYMask(ShuffleVectorSDNode *N, - const X86Subtarget *Subtarget) { - EVT VT = N->getValueType(0); - int NumElts = VT.getVectorNumElements(); - bool V2IsUndef = N->getOperand(1).getOpcode() == ISD::UNDEF; +static bool isMOVDDUPYMask(ArrayRef<int> Mask, EVT VT, bool HasAVX) { + unsigned NumElts = VT.getVectorNumElements(); - if (!Subtarget->hasAVX() || VT.getSizeInBits() != 256 || - !V2IsUndef || NumElts != 4) + if (!HasAVX || VT.getSizeInBits() != 256 || NumElts != 4) return false; - for (int i = 0; i != NumElts/2; ++i) - if (!isUndefOrEqual(N->getMaskElt(i), 0)) + for (unsigned i = 0; i != NumElts/2; ++i) + if (!isUndefOrEqual(Mask[i], 0)) return false; - for (int i = NumElts/2; i != NumElts; ++i) - if (!isUndefOrEqual(N->getMaskElt(i), NumElts/2)) + for (unsigned i = NumElts/2; i != NumElts; ++i) + if (!isUndefOrEqual(Mask[i], NumElts/2)) return false; return true; } @@ -3983,18 +3804,16 @@ static bool isMOVDDUPYMask(ShuffleVectorSDNode *N, /// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand /// specifies a shuffle of elements that is suitable for input to 128-bit /// version of MOVDDUP. -bool X86::isMOVDDUPMask(ShuffleVectorSDNode *N) { - EVT VT = N->getValueType(0); - +static bool isMOVDDUPMask(ArrayRef<int> Mask, EVT VT) { if (VT.getSizeInBits() != 128) return false; - int e = VT.getVectorNumElements() / 2; - for (int i = 0; i < e; ++i) - if (!isUndefOrEqual(N->getMaskElt(i), i)) + unsigned e = VT.getVectorNumElements() / 2; + for (unsigned i = 0; i != e; ++i) + if (!isUndefOrEqual(Mask[i], i)) return false; - for (int i = 0; i < e; ++i) - if (!isUndefOrEqual(N->getMaskElt(e+i), i)) + for (unsigned i = 0; i != e; ++i) + if (!isUndefOrEqual(Mask[e+i], i)) return false; return true; } @@ -4039,31 +3858,43 @@ bool X86::isVINSERTF128Index(SDNode *N) { /// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle /// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP* instructions. -unsigned X86::getShuffleSHUFImmediate(SDNode *N) { - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); - int NumOperands = SVOp->getValueType(0).getVectorNumElements(); +/// Handles 128-bit and 256-bit. +static unsigned getShuffleSHUFImmediate(ShuffleVectorSDNode *N) { + EVT VT = N->getValueType(0); + + assert((VT.is128BitVector() || VT.is256BitVector()) && + "Unsupported vector type for PSHUF/SHUFP"); - unsigned Shift = (NumOperands == 4) ? 2 : 1; + // Handle 128 and 256-bit vector lengths. AVX defines PSHUF/SHUFP to operate + // independently on 128-bit lanes. + unsigned NumElts = VT.getVectorNumElements(); + unsigned NumLanes = VT.getSizeInBits()/128; + unsigned NumLaneElts = NumElts/NumLanes; + + assert((NumLaneElts == 2 || NumLaneElts == 4) && + "Only supports 2 or 4 elements per lane"); + + unsigned Shift = (NumLaneElts == 4) ? 1 : 0; unsigned Mask = 0; - for (int i = 0; i < NumOperands; ++i) { - int Val = SVOp->getMaskElt(NumOperands-i-1); - if (Val < 0) Val = 0; - if (Val >= NumOperands) Val -= NumOperands; - Mask |= Val; - if (i != NumOperands - 1) - Mask <<= Shift; + for (unsigned i = 0; i != NumElts; ++i) { + int Elt = N->getMaskElt(i); + if (Elt < 0) continue; + Elt %= NumLaneElts; + unsigned ShAmt = i << Shift; + if (ShAmt >= 8) ShAmt -= 8; + Mask |= Elt << ShAmt; } + return Mask; } /// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle /// the specified VECTOR_SHUFFLE mask with the PSHUFHW instruction. 
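// Standalone sketch (not from the patch): the immediate the new
// getShuffleSHUFImmediate builds for a 4-element 128-bit lane (SHUFPS/PSHUFD
// style), two bits per destination element; SHUFPD lanes use one bit each.
// The helper name is invented for the example.
constexpr unsigned shufImm4(unsigned E0, unsigned E1, unsigned E2, unsigned E3) {
  return (E0 & 3) | ((E1 & 3) << 2) | ((E2 & 3) << 4) | ((E3 & 3) << 6);
}
// v4f32 shuffle mask <0,1,4,5> (V1's low pair, then V2's low pair): indices are
// reduced modulo the lane width, giving selectors 0,1,0,1.
static_assert(shufImm4(0, 1, 0, 1) == 0x44, "SHUFPS $0x44");
// For 256-bit types the second lane must repeat the first lane's selection, so
// the same byte drives both lanes - hence the "ShAmt -= 8" wrap in the loop.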
-unsigned X86::getShufflePSHUFHWImmediate(SDNode *N) { - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); +static unsigned getShufflePSHUFHWImmediate(ShuffleVectorSDNode *N) { unsigned Mask = 0; // 8 nodes, but we only care about the last 4. for (unsigned i = 7; i >= 4; --i) { - int Val = SVOp->getMaskElt(i); + int Val = N->getMaskElt(i); if (Val >= 0) Mask |= (Val - 4); if (i != 4) @@ -4074,12 +3905,11 @@ unsigned X86::getShufflePSHUFHWImmediate(SDNode *N) { /// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle /// the specified VECTOR_SHUFFLE mask with the PSHUFLW instruction. -unsigned X86::getShufflePSHUFLWImmediate(SDNode *N) { - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); +static unsigned getShufflePSHUFLWImmediate(ShuffleVectorSDNode *N) { unsigned Mask = 0; // 8 nodes, but we only care about the first 4. for (int i = 3; i >= 0; --i) { - int Val = SVOp->getMaskElt(i); + int Val = N->getMaskElt(i); if (Val >= 0) Mask |= Val; if (i != 0) @@ -4090,18 +3920,24 @@ unsigned X86::getShufflePSHUFLWImmediate(SDNode *N) { /// getShufflePALIGNRImmediate - Return the appropriate immediate to shuffle /// the specified VECTOR_SHUFFLE mask with the PALIGNR instruction. -unsigned X86::getShufflePALIGNRImmediate(SDNode *N) { - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); - EVT VVT = N->getValueType(0); - unsigned EltSize = VVT.getVectorElementType().getSizeInBits() >> 3; - int Val = 0; +static unsigned getShufflePALIGNRImmediate(ShuffleVectorSDNode *SVOp) { + EVT VT = SVOp->getValueType(0); + unsigned EltSize = VT.getVectorElementType().getSizeInBits() >> 3; + + unsigned NumElts = VT.getVectorNumElements(); + unsigned NumLanes = VT.getSizeInBits()/128; + unsigned NumLaneElts = NumElts/NumLanes; - unsigned i, e; - for (i = 0, e = VVT.getVectorNumElements(); i != e; ++i) { + int Val = 0; + unsigned i; + for (i = 0; i != NumElts; ++i) { Val = SVOp->getMaskElt(i); if (Val >= 0) break; } + if (Val >= (int)NumElts) + Val -= NumElts - NumLaneElts; + assert(Val - i > 0 && "PALIGNR imm should be positive"); return (Val - i) * EltSize; } @@ -4170,36 +4006,20 @@ static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp, SVOp->getOperand(0), &MaskVec[0]); } -/// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming -/// the two vector operands have swapped position. -static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask, EVT VT) { - unsigned NumElems = VT.getVectorNumElements(); - for (unsigned i = 0; i != NumElems; ++i) { - int idx = Mask[i]; - if (idx < 0) - continue; - else if (idx < (int)NumElems) - Mask[i] = idx + NumElems; - else - Mask[i] = idx - NumElems; - } -} - /// ShouldXformToMOVHLPS - Return true if the node should be transformed to /// match movhlps. The lower half elements should come from upper half of /// V1 (and in order), and the upper half elements should come from the upper /// half of V2 (and in order). 
-static bool ShouldXformToMOVHLPS(ShuffleVectorSDNode *Op) { - EVT VT = Op->getValueType(0); +static bool ShouldXformToMOVHLPS(ArrayRef<int> Mask, EVT VT) { if (VT.getSizeInBits() != 128) return false; if (VT.getVectorNumElements() != 4) return false; for (unsigned i = 0, e = 2; i != e; ++i) - if (!isUndefOrEqual(Op->getMaskElt(i), i+2)) + if (!isUndefOrEqual(Mask[i], i+2)) return false; for (unsigned i = 2; i != 4; ++i) - if (!isUndefOrEqual(Op->getMaskElt(i), i+4)) + if (!isUndefOrEqual(Mask[i], i+4)) return false; return true; } @@ -4218,14 +4038,36 @@ static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = NULL) { return true; } +// Test whether the given value is a vector value which will be legalized +// into a load. +static bool WillBeConstantPoolLoad(SDNode *N) { + if (N->getOpcode() != ISD::BUILD_VECTOR) + return false; + + // Check for any non-constant elements. + for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) + switch (N->getOperand(i).getNode()->getOpcode()) { + case ISD::UNDEF: + case ISD::ConstantFP: + case ISD::Constant: + break; + default: + return false; + } + + // Vectors of all-zeros and all-ones are materialized with special + // instructions rather than being loaded. + return !ISD::isBuildVectorAllZeros(N) && + !ISD::isBuildVectorAllOnes(N); +} + /// ShouldXformToMOVLP{S|D} - Return true if the node should be transformed to /// match movlp{s|d}. The lower half elements should come from lower half of /// V1 (and in order), and the upper half elements should come from the upper /// half of V2 (and in order). And since V1 will become the source of the /// MOVLP, it must be either a vector load or a scalar load to vector. static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2, - ShuffleVectorSDNode *Op) { - EVT VT = Op->getValueType(0); + ArrayRef<int> Mask, EVT VT) { if (VT.getSizeInBits() != 128) return false; @@ -4233,7 +4075,7 @@ static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2, return false; // Is V2 is a vector load, don't do this transformation. We will try to use // load folding shufps op. - if (ISD::isNON_EXTLoad(V2)) + if (ISD::isNON_EXTLoad(V2) || WillBeConstantPoolLoad(V2)) return false; unsigned NumElems = VT.getVectorNumElements(); @@ -4241,10 +4083,10 @@ static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2, if (NumElems != 2 && NumElems != 4) return false; for (unsigned i = 0, e = NumElems/2; i != e; ++i) - if (!isUndefOrEqual(Op->getMaskElt(i), i)) + if (!isUndefOrEqual(Mask[i], i)) return false; for (unsigned i = NumElems/2; i != NumElems; ++i) - if (!isUndefOrEqual(Op->getMaskElt(i), i+NumElems)) + if (!isUndefOrEqual(Mask[i], i+NumElems)) return false; return true; } @@ -4292,15 +4134,15 @@ static bool isZeroShuffle(ShuffleVectorSDNode *N) { /// getZeroVector - Returns a vector of specified type with all zero elements. /// -static SDValue getZeroVector(EVT VT, bool HasXMMInt, SelectionDAG &DAG, - DebugLoc dl) { +static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget, + SelectionDAG &DAG, DebugLoc dl) { assert(VT.isVector() && "Expected a vector type"); // Always build SSE zero vectors as <4 x i32> bitcasted // to their dest type. This ensures they get CSE'd. 
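// Standalone sketch (not from the patch): the v4f32 mask shape that
// ShouldXformToMOVHLPS above fires on - <2,3, 6,7>, i.e. the low result half
// from V1's high half and the high result half from V2's high half.  Swapping
// the two operands turns this into the <6,7, 2,3> pattern that isMOVHLPSMask
// matches directly.  The helper name is invented for the example.
static bool isXformToMovHlpsShape(const int M[4]) {
  return (M[0] < 0 || M[0] == 2) && (M[1] < 0 || M[1] == 3) &&
         (M[2] < 0 || M[2] == 6) && (M[3] < 0 || M[3] == 7);
}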
SDValue Vec; if (VT.getSizeInBits() == 128) { // SSE - if (HasXMMInt) { // SSE2 + if (Subtarget->hasSSE2()) { // SSE2 SDValue Cst = DAG.getTargetConstant(0, MVT::i32); Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); } else { // SSE1 @@ -4308,34 +4150,46 @@ static SDValue getZeroVector(EVT VT, bool HasXMMInt, SelectionDAG &DAG, Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst); } } else if (VT.getSizeInBits() == 256) { // AVX - // 256-bit logic and arithmetic instructions in AVX are - // all floating-point, no support for integer ops. Default - // to emitting fp zeroed vectors then. - SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32); - SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; - Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops, 8); + if (Subtarget->hasAVX2()) { // AVX2 + SDValue Cst = DAG.getTargetConstant(0, MVT::i32); + SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; + Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops, 8); + } else { + // 256-bit logic and arithmetic instructions in AVX are all + // floating-point, no support for integer ops. Emit fp zeroed vectors. + SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32); + SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; + Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops, 8); + } } return DAG.getNode(ISD::BITCAST, dl, VT, Vec); } /// getOnesVector - Returns a vector of specified type with all bits set. -/// Always build ones vectors as <4 x i32>. For 256-bit types, use two -/// <4 x i32> inserted in a <8 x i32> appropriately. Then bitcast to their -/// original type, ensuring they get CSE'd. -static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, DebugLoc dl) { +/// Always build ones vectors as <4 x i32> or <8 x i32>. For 256-bit types with +/// no AVX2 supprt, use two <4 x i32> inserted in a <8 x i32> appropriately. +/// Then bitcast to their original type, ensuring they get CSE'd. +static SDValue getOnesVector(EVT VT, bool HasAVX2, SelectionDAG &DAG, + DebugLoc dl) { assert(VT.isVector() && "Expected a vector type"); assert((VT.is128BitVector() || VT.is256BitVector()) && "Expected a 128-bit or 256-bit vector type"); SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32); - SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, - Cst, Cst, Cst, Cst); - - if (VT.is256BitVector()) { - SDValue InsV = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, MVT::v8i32), - Vec, DAG.getConstant(0, MVT::i32), DAG, dl); - Vec = Insert128BitVector(InsV, Vec, - DAG.getConstant(4 /* NumElems/2 */, MVT::i32), DAG, dl); + SDValue Vec; + if (VT.getSizeInBits() == 256) { + if (HasAVX2) { // AVX2 + SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; + Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops, 8); + } else { // AVX + Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); + SDValue InsV = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, MVT::v8i32), + Vec, DAG.getConstant(0, MVT::i32), DAG, dl); + Vec = Insert128BitVector(InsV, Vec, + DAG.getConstant(4 /* NumElems/2 */, MVT::i32), DAG, dl); + } + } else { + Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); } return DAG.getNode(ISD::BITCAST, dl, VT, Vec); @@ -4343,24 +4197,12 @@ static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, DebugLoc dl) { /// NormalizeMask - V2 is a splat, modify the mask (if needed) so all elements /// that point to V2 points to its first element. 
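getZeroVector and getOnesVector above deliberately build the constant as <4 x i32> or <8 x i32> and then bitcast to the requested type, so that zero and all-ones vectors of every element type CSE to a single node. The bitcast is safe because only the bit pattern matters; a small host-side illustration in ordinary C++ (nothing SelectionDAG-specific) follows.

#include <cassert>
#include <cstdint>
#include <cstring>

// An all-ones 32-bit lane pattern stays all-ones no matter how the 128 bits
// are later reinterpreted (i64, i8, float, ...), which is why building the
// canonical <N x i32> constant and bitcasting loses nothing.
int main() {
  uint32_t lanes[4] = {~0u, ~0u, ~0u, ~0u};   // the canonical v4i32 constant

  uint64_t asI64[2];
  std::memcpy(asI64, lanes, sizeof(lanes));
  assert(asI64[0] == ~0ull && asI64[1] == ~0ull);

  uint8_t asI8[16];
  std::memcpy(asI8, lanes, sizeof(lanes));
  for (int i = 0; i != 16; ++i)
    assert(asI8[i] == 0xFF);
  return 0;
}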
-static SDValue NormalizeMask(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { - EVT VT = SVOp->getValueType(0); - unsigned NumElems = VT.getVectorNumElements(); - - bool Changed = false; - SmallVector<int, 8> MaskVec; - SVOp->getMask(MaskVec); - +static void NormalizeMask(SmallVectorImpl<int> &Mask, unsigned NumElems) { for (unsigned i = 0; i != NumElems; ++i) { - if (MaskVec[i] > (int)NumElems) { - MaskVec[i] = NumElems; - Changed = true; + if (Mask[i] > (int)NumElems) { + Mask[i] = NumElems; } } - if (Changed) - return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(0), - SVOp->getOperand(1), &MaskVec[0]); - return SDValue(SVOp, 0); } /// getMOVLMask - Returns a vector_shuffle mask for an movs{s|d}, movd @@ -4464,7 +4306,7 @@ static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) { // Extract the 128-bit part containing the splat element and update // the splat element index when it refers to the higher register. if (Size == 256) { - unsigned Idx = (EltNo > NumElems/2) ? NumElems/2 : 0; + unsigned Idx = (EltNo >= NumElems/2) ? NumElems/2 : 0; V1 = Extract128BitVector(V1, DAG.getConstant(Idx, MVT::i32), DAG, dl); if (Idx > 0) EltNo -= NumElems/2; @@ -4496,11 +4338,12 @@ static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) { /// element of V2 is swizzled into the zero/undef vector, landing at element /// Idx. This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3). static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx, - bool isZero, bool HasXMMInt, + bool IsZero, + const X86Subtarget *Subtarget, SelectionDAG &DAG) { EVT VT = V2.getValueType(); - SDValue V1 = isZero - ? getZeroVector(VT, HasXMMInt, DAG, V2.getDebugLoc()) : DAG.getUNDEF(VT); + SDValue V1 = IsZero + ? getZeroVector(VT, Subtarget, DAG, V2.getDebugLoc()) : DAG.getUNDEF(VT); unsigned NumElems = VT.getVectorNumElements(); SmallVector<int, 16> MaskVec; for (unsigned i = 0; i != NumElems; ++i) @@ -4509,9 +4352,81 @@ static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx, return DAG.getVectorShuffle(VT, V2.getDebugLoc(), V1, V2, &MaskVec[0]); } +/// getTargetShuffleMask - Calculates the shuffle mask corresponding to the +/// target specific opcode. Returns true if the Mask could be calculated. +/// Sets IsUnary to true if only uses one source. 
+static bool getTargetShuffleMask(SDNode *N, EVT VT, + SmallVectorImpl<int> &Mask, bool &IsUnary) { + unsigned NumElems = VT.getVectorNumElements(); + SDValue ImmN; + + IsUnary = false; + switch(N->getOpcode()) { + case X86ISD::SHUFP: + ImmN = N->getOperand(N->getNumOperands()-1); + DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); + break; + case X86ISD::UNPCKH: + DecodeUNPCKHMask(VT, Mask); + break; + case X86ISD::UNPCKL: + DecodeUNPCKLMask(VT, Mask); + break; + case X86ISD::MOVHLPS: + DecodeMOVHLPSMask(NumElems, Mask); + break; + case X86ISD::MOVLHPS: + DecodeMOVLHPSMask(NumElems, Mask); + break; + case X86ISD::PSHUFD: + case X86ISD::VPERMILP: + ImmN = N->getOperand(N->getNumOperands()-1); + DecodePSHUFMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); + IsUnary = true; + break; + case X86ISD::PSHUFHW: + ImmN = N->getOperand(N->getNumOperands()-1); + DecodePSHUFHWMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); + IsUnary = true; + break; + case X86ISD::PSHUFLW: + ImmN = N->getOperand(N->getNumOperands()-1); + DecodePSHUFLWMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); + IsUnary = true; + break; + case X86ISD::MOVSS: + case X86ISD::MOVSD: { + // The index 0 always comes from the first element of the second source, + // this is why MOVSS and MOVSD are used in the first place. The other + // elements come from the other positions of the first source vector + Mask.push_back(NumElems); + for (unsigned i = 1; i != NumElems; ++i) { + Mask.push_back(i); + } + break; + } + case X86ISD::VPERM2X128: + ImmN = N->getOperand(N->getNumOperands()-1); + DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); + break; + case X86ISD::MOVDDUP: + case X86ISD::MOVLHPD: + case X86ISD::MOVLPD: + case X86ISD::MOVLPS: + case X86ISD::MOVSHDUP: + case X86ISD::MOVSLDUP: + case X86ISD::PALIGN: + // Not yet implemented + return false; + default: llvm_unreachable("unknown target shuffle node"); + } + + return true; +} + /// getShuffleScalarElt - Returns the scalar element that will make up the ith /// element of the result of the vector shuffle. -static SDValue getShuffleScalarElt(SDNode *N, int Index, SelectionDAG &DAG, +static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG, unsigned Depth) { if (Depth == 6) return SDValue(); // Limit search depth. @@ -4522,129 +4437,34 @@ static SDValue getShuffleScalarElt(SDNode *N, int Index, SelectionDAG &DAG, // Recurse into ISD::VECTOR_SHUFFLE node to find scalars. if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) { - Index = SV->getMaskElt(Index); + int Elt = SV->getMaskElt(Index); - if (Index < 0) + if (Elt < 0) return DAG.getUNDEF(VT.getVectorElementType()); - int NumElems = VT.getVectorNumElements(); - SDValue NewV = (Index < NumElems) ? SV->getOperand(0) : SV->getOperand(1); - return getShuffleScalarElt(NewV.getNode(), Index % NumElems, DAG, Depth+1); + unsigned NumElems = VT.getVectorNumElements(); + SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0) + : SV->getOperand(1); + return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1); } // Recurse into target specific vector shuffles to find scalars. 
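getTargetShuffleMask above turns each target-specific shuffle node back into a plain index mask via the Decode* helpers. For reference, the unpack patterns it relies on interleave the low (or high) halves of the two sources; the sketch below reproduces that index pattern for a single 128-bit lane. unpackLoMask/unpackHiMask are illustration-only helpers, not the llvm::Decode* functions, and the wider AVX forms repeat the same pattern per 128-bit lane.

#include <cassert>
#include <vector>

// Index pattern of UNPCKL/UNPCKH for one 128-bit lane: element i of the
// first source interleaved with element i of the second, taken from the
// low half (lo) or the high half (hi).
static std::vector<int> unpackLoMask(unsigned numElts) {
  std::vector<int> mask;
  for (unsigned i = 0; i != numElts / 2; ++i) {
    mask.push_back(i);
    mask.push_back(i + numElts);
  }
  return mask;
}

static std::vector<int> unpackHiMask(unsigned numElts) {
  std::vector<int> mask;
  for (unsigned i = numElts / 2; i != numElts; ++i) {
    mask.push_back(i);
    mask.push_back(i + numElts);
  }
  return mask;
}

int main() {
  // v4i32/v4f32: unpcklps is <0, 4, 1, 5>, unpckhps is <2, 6, 3, 7>.
  assert(unpackLoMask(4) == std::vector<int>({0, 4, 1, 5}));
  assert(unpackHiMask(4) == std::vector<int>({2, 6, 3, 7}));
  return 0;
}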
if (isTargetShuffle(Opcode)) { - int NumElems = VT.getVectorNumElements(); - SmallVector<unsigned, 16> ShuffleMask; + unsigned NumElems = VT.getVectorNumElements(); + SmallVector<int, 16> ShuffleMask; SDValue ImmN; + bool IsUnary; - switch(Opcode) { - case X86ISD::SHUFPS: - case X86ISD::SHUFPD: - ImmN = N->getOperand(N->getNumOperands()-1); - DecodeSHUFPSMask(NumElems, - cast<ConstantSDNode>(ImmN)->getZExtValue(), - ShuffleMask); - break; - case X86ISD::PUNPCKHBW: - case X86ISD::PUNPCKHWD: - case X86ISD::PUNPCKHDQ: - case X86ISD::PUNPCKHQDQ: - DecodePUNPCKHMask(NumElems, ShuffleMask); - break; - case X86ISD::UNPCKHPS: - case X86ISD::UNPCKHPD: - case X86ISD::VUNPCKHPSY: - case X86ISD::VUNPCKHPDY: - DecodeUNPCKHPMask(NumElems, ShuffleMask); - break; - case X86ISD::PUNPCKLBW: - case X86ISD::PUNPCKLWD: - case X86ISD::PUNPCKLDQ: - case X86ISD::PUNPCKLQDQ: - DecodePUNPCKLMask(VT, ShuffleMask); - break; - case X86ISD::UNPCKLPS: - case X86ISD::UNPCKLPD: - case X86ISD::VUNPCKLPSY: - case X86ISD::VUNPCKLPDY: - DecodeUNPCKLPMask(VT, ShuffleMask); - break; - case X86ISD::MOVHLPS: - DecodeMOVHLPSMask(NumElems, ShuffleMask); - break; - case X86ISD::MOVLHPS: - DecodeMOVLHPSMask(NumElems, ShuffleMask); - break; - case X86ISD::PSHUFD: - ImmN = N->getOperand(N->getNumOperands()-1); - DecodePSHUFMask(NumElems, - cast<ConstantSDNode>(ImmN)->getZExtValue(), - ShuffleMask); - break; - case X86ISD::PSHUFHW: - ImmN = N->getOperand(N->getNumOperands()-1); - DecodePSHUFHWMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), - ShuffleMask); - break; - case X86ISD::PSHUFLW: - ImmN = N->getOperand(N->getNumOperands()-1); - DecodePSHUFLWMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), - ShuffleMask); - break; - case X86ISD::MOVSS: - case X86ISD::MOVSD: { - // The index 0 always comes from the first element of the second source, - // this is why MOVSS and MOVSD are used in the first place. The other - // elements come from the other positions of the first source vector. - unsigned OpNum = (Index == 0) ? 1 : 0; - return getShuffleScalarElt(V.getOperand(OpNum).getNode(), Index, DAG, - Depth+1); - } - case X86ISD::VPERMILPS: - ImmN = N->getOperand(N->getNumOperands()-1); - DecodeVPERMILPSMask(4, cast<ConstantSDNode>(ImmN)->getZExtValue(), - ShuffleMask); - break; - case X86ISD::VPERMILPSY: - ImmN = N->getOperand(N->getNumOperands()-1); - DecodeVPERMILPSMask(8, cast<ConstantSDNode>(ImmN)->getZExtValue(), - ShuffleMask); - break; - case X86ISD::VPERMILPD: - ImmN = N->getOperand(N->getNumOperands()-1); - DecodeVPERMILPDMask(2, cast<ConstantSDNode>(ImmN)->getZExtValue(), - ShuffleMask); - break; - case X86ISD::VPERMILPDY: - ImmN = N->getOperand(N->getNumOperands()-1); - DecodeVPERMILPDMask(4, cast<ConstantSDNode>(ImmN)->getZExtValue(), - ShuffleMask); - break; - case X86ISD::VPERM2F128: - ImmN = N->getOperand(N->getNumOperands()-1); - DecodeVPERM2F128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), - ShuffleMask); - break; - case X86ISD::MOVDDUP: - case X86ISD::MOVLHPD: - case X86ISD::MOVLPD: - case X86ISD::MOVLPS: - case X86ISD::MOVSHDUP: - case X86ISD::MOVSLDUP: - case X86ISD::PALIGN: - return SDValue(); // Not yet implemented. - default: - assert(0 && "unknown target shuffle node"); + if (!getTargetShuffleMask(N, VT, ShuffleMask, IsUnary)) return SDValue(); - } - Index = ShuffleMask[Index]; - if (Index < 0) + int Elt = ShuffleMask[Index]; + if (Elt < 0) return DAG.getUNDEF(VT.getVectorElementType()); - SDValue NewV = (Index < NumElems) ? 
N->getOperand(0) : N->getOperand(1); - return getShuffleScalarElt(NewV.getNode(), Index % NumElems, DAG, + SDValue NewV = (Elt < (int)NumElems) ? N->getOperand(0) + : N->getOperand(1); + return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1); } @@ -4660,7 +4480,7 @@ static SDValue getShuffleScalarElt(SDNode *N, int Index, SelectionDAG &DAG, if (V.getOpcode() == ISD::SCALAR_TO_VECTOR) return (Index == 0) ? V.getOperand(0) - : DAG.getUNDEF(VT.getVectorElementType()); + : DAG.getUNDEF(VT.getVectorElementType()); if (V.getOpcode() == ISD::BUILD_VECTOR) return V.getOperand(Index); @@ -4672,38 +4492,37 @@ static SDValue getShuffleScalarElt(SDNode *N, int Index, SelectionDAG &DAG, /// shuffle operation which come from a consecutively from a zero. The /// search can start in two different directions, from left or right. static -unsigned getNumOfConsecutiveZeros(SDNode *N, int NumElems, +unsigned getNumOfConsecutiveZeros(ShuffleVectorSDNode *SVOp, unsigned NumElems, bool ZerosFromLeft, SelectionDAG &DAG) { - int i = 0; - - while (i < NumElems) { + unsigned i; + for (i = 0; i != NumElems; ++i) { unsigned Index = ZerosFromLeft ? i : NumElems-i-1; - SDValue Elt = getShuffleScalarElt(N, Index, DAG, 0); + SDValue Elt = getShuffleScalarElt(SVOp, Index, DAG, 0); if (!(Elt.getNode() && (Elt.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elt)))) break; - ++i; } return i; } -/// isShuffleMaskConsecutive - Check if the shuffle mask indicies from MaskI to -/// MaskE correspond consecutively to elements from one of the vector operands, +/// isShuffleMaskConsecutive - Check if the shuffle mask indicies [MaskI, MaskE) +/// correspond consecutively to elements from one of the vector operands, /// starting from its index OpIdx. Also tell OpNum which source vector operand. static -bool isShuffleMaskConsecutive(ShuffleVectorSDNode *SVOp, int MaskI, int MaskE, - int OpIdx, int NumElems, unsigned &OpNum) { +bool isShuffleMaskConsecutive(ShuffleVectorSDNode *SVOp, + unsigned MaskI, unsigned MaskE, unsigned OpIdx, + unsigned NumElems, unsigned &OpNum) { bool SeenV1 = false; bool SeenV2 = false; - for (int i = MaskI; i <= MaskE; ++i, ++OpIdx) { + for (unsigned i = MaskI; i != MaskE; ++i, ++OpIdx) { int Idx = SVOp->getMaskElt(i); // Ignore undef indicies if (Idx < 0) continue; - if (Idx < NumElems) + if (Idx < (int)NumElems) SeenV1 = true; else SeenV2 = true; @@ -4738,7 +4557,7 @@ static bool isVectorShiftRight(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG, // if (!isShuffleMaskConsecutive(SVOp, 0, // Mask Start Index - NumElems-NumZeros-1, // Mask End Index + NumElems-NumZeros, // Mask End Index(exclusive) NumZeros, // Where to start looking in the src vector NumElems, // Number of elements in vector OpSrc)) // Which source operand ? @@ -4771,7 +4590,7 @@ static bool isVectorShiftLeft(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG, // if (!isShuffleMaskConsecutive(SVOp, NumZeros, // Mask Start Index - NumElems-1, // Mask End Index + NumElems, // Mask End Index(exclusive) 0, // Where to start looking in the src vector NumElems, // Number of elements in vector OpSrc)) // Which source operand ? 
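The isVectorShiftLeft/isVectorShiftRight logic above recognizes shuffles that are really whole-register byte shifts: a run of zeros coming from one side followed by consecutive elements of a single source maps onto PSLLDQ/PSRLDQ, with the element count scaled to a byte immediate. Below is a simplified standalone sketch of the consecutiveness test; isShiftLeftMask is an invented name, and the leading numZeros entries are assumed to have already been proven zero or undef, as getNumOfConsecutiveZeros does in the real code.

#include <cassert>

// After numZeros leading zero elements, the remaining mask entries must
// select consecutive elements of one source starting at index 0; undef
// entries (-1) are ignored, mirroring isShuffleMaskConsecutive above.
static bool isShiftLeftMask(const int *mask, unsigned numElts,
                            unsigned numZeros) {
  for (unsigned i = numZeros; i != numElts; ++i) {
    int idx = mask[i];
    if (idx < 0)
      continue;                      // undef matches anything
    if (idx != (int)(i - numZeros))  // must be consecutive from element 0
      return false;
  }
  return true;
}

int main() {
  // v8i16 shuffled as <z, z, z, 0, 1, 2, 3, 4> is PSLLDQ by 3*2 = 6 bytes.
  int mask[8] = {-1, -1, -1, 0, 1, 2, 3, 4};
  assert(isShiftLeftMask(mask, 8, 3));

  int notShift[8] = {-1, -1, -1, 0, 2, 1, 3, 4};
  assert(!isShiftLeftMask(notShift, 8, 3));
  return 0;
}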
@@ -4804,6 +4623,7 @@ static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG, static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros, unsigned NumNonZero, unsigned NumZero, SelectionDAG &DAG, + const X86Subtarget* Subtarget, const TargetLowering &TLI) { if (NumNonZero > 8) return SDValue(); @@ -4815,7 +4635,7 @@ static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros, bool ThisIsNonZero = (NonZeros & (1 << i)) != 0; if (ThisIsNonZero && First) { if (NumZero) - V = getZeroVector(MVT::v8i16, true, DAG, dl); + V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl); else V = DAG.getUNDEF(MVT::v8i16); First = false; @@ -4851,6 +4671,7 @@ static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros, static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros, unsigned NumNonZero, unsigned NumZero, SelectionDAG &DAG, + const X86Subtarget* Subtarget, const TargetLowering &TLI) { if (NumNonZero > 4) return SDValue(); @@ -4863,7 +4684,7 @@ static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros, if (isNonZero) { if (First) { if (NumZero) - V = getZeroVector(MVT::v8i16, true, DAG, dl); + V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl); else V = DAG.getUNDEF(MVT::v8i16); First = false; @@ -4884,7 +4705,7 @@ static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, const TargetLowering &TLI, DebugLoc dl) { assert(VT.getSizeInBits() == 128 && "Unknown type for VShift"); EVT ShVT = MVT::v2i64; - unsigned Opc = isLeft ? X86ISD::VSHL : X86ISD::VSRL; + unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ; SrcOp = DAG.getNode(ISD::BITCAST, dl, ShVT, SrcOp); return DAG.getNode(ISD::BITCAST, dl, VT, DAG.getNode(Opc, dl, ShVT, SrcOp, @@ -4952,21 +4773,16 @@ X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl, int EltNo = (Offset - StartOffset) >> 2; int NumElems = VT.getVectorNumElements(); - EVT CanonVT = VT.getSizeInBits() == 128 ? MVT::v4i32 : MVT::v8i32; EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems); SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr, LD->getPointerInfo().getWithOffset(StartOffset), - false, false, 0); + false, false, false, 0); - // Canonicalize it to a v4i32 or v8i32 shuffle. 
SmallVector<int, 8> Mask; for (int i = 0; i < NumElems; ++i) Mask.push_back(EltNo); - V1 = DAG.getNode(ISD::BITCAST, dl, CanonVT, V1); - return DAG.getNode(ISD::BITCAST, dl, NVT, - DAG.getVectorShuffle(CanonVT, dl, V1, - DAG.getUNDEF(CanonVT),&Mask[0])); + return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), &Mask[0]); } return SDValue(); @@ -5021,11 +4837,12 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts, if (DAG.InferPtrAlignment(LDBase->getBasePtr()) >= 16) return DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(), LDBase->getPointerInfo(), - LDBase->isVolatile(), LDBase->isNonTemporal(), 0); + LDBase->isVolatile(), LDBase->isNonTemporal(), + LDBase->isInvariant(), 0); return DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(), LDBase->getPointerInfo(), LDBase->isVolatile(), LDBase->isNonTemporal(), - LDBase->getAlignment()); + LDBase->isInvariant(), LDBase->getAlignment()); } else if (NumElems == 4 && LastLoadedElt == 1 && DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) { SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other); @@ -5041,6 +4858,137 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts, return SDValue(); } +/// LowerVectorBroadcast - Attempt to use the vbroadcast instruction +/// to generate a splat value for the following cases: +/// 1. A splat BUILD_VECTOR which uses a single scalar load, or a constant. +/// 2. A splat shuffle which uses a scalar_to_vector node which comes from +/// a scalar load, or a constant. +/// The VBROADCAST node is returned when a pattern is found, +/// or SDValue() otherwise. +SDValue +X86TargetLowering::LowerVectorBroadcast(SDValue &Op, SelectionDAG &DAG) const { + if (!Subtarget->hasAVX()) + return SDValue(); + + EVT VT = Op.getValueType(); + DebugLoc dl = Op.getDebugLoc(); + + SDValue Ld; + bool ConstSplatVal; + + switch (Op.getOpcode()) { + default: + // Unknown pattern found. + return SDValue(); + + case ISD::BUILD_VECTOR: { + // The BUILD_VECTOR node must be a splat. + if (!isSplatVector(Op.getNode())) + return SDValue(); + + Ld = Op.getOperand(0); + ConstSplatVal = (Ld.getOpcode() == ISD::Constant || + Ld.getOpcode() == ISD::ConstantFP); + + // The suspected load node has several users. Make sure that all + // of its users are from the BUILD_VECTOR node. + // Constants may have multiple users. + if (!ConstSplatVal && !Ld->hasNUsesOfValue(VT.getVectorNumElements(), 0)) + return SDValue(); + break; + } + + case ISD::VECTOR_SHUFFLE: { + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + + // Shuffles must have a splat mask where the first element is + // broadcasted. + if ((!SVOp->isSplat()) || SVOp->getMaskElt(0) != 0) + return SDValue(); + + SDValue Sc = Op.getOperand(0); + if (Sc.getOpcode() != ISD::SCALAR_TO_VECTOR) + return SDValue(); + + Ld = Sc.getOperand(0); + ConstSplatVal = (Ld.getOpcode() == ISD::Constant || + Ld.getOpcode() == ISD::ConstantFP); + + // The scalar_to_vector node and the suspected + // load node must have exactly one user. + // Constants may have multiple users. + if (!ConstSplatVal && (!Sc.hasOneUse() || !Ld.hasOneUse())) + return SDValue(); + break; + } + } + + bool Is256 = VT.getSizeInBits() == 256; + bool Is128 = VT.getSizeInBits() == 128; + + // Handle the broadcasting a single constant scalar from the constant pool + // into a vector. On Sandybridge it is still better to load a constant vector + // from the constant pool and not to broadcast it from a scalar. 
+ if (ConstSplatVal && Subtarget->hasAVX2()) { + EVT CVT = Ld.getValueType(); + assert(!CVT.isVector() && "Must not broadcast a vector type"); + unsigned ScalarSize = CVT.getSizeInBits(); + + if ((Is256 && (ScalarSize == 32 || ScalarSize == 64)) || + (Is128 && (ScalarSize == 32))) { + + const Constant *C = 0; + if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld)) + C = CI->getConstantIntValue(); + else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld)) + C = CF->getConstantFPValue(); + + assert(C && "Invalid constant type"); + + SDValue CP = DAG.getConstantPool(C, getPointerTy()); + unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment(); + Ld = DAG.getLoad(CVT, dl, DAG.getEntryNode(), CP, + MachinePointerInfo::getConstantPool(), + false, false, false, Alignment); + + return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); + } + } + + // The scalar source must be a normal load. + if (!ISD::isNormalLoad(Ld.getNode())) + return SDValue(); + + // Reject loads that have uses of the chain result + if (Ld->hasAnyUseOfValue(1)) + return SDValue(); + + unsigned ScalarSize = Ld.getValueType().getSizeInBits(); + + // VBroadcast to YMM + if (Is256 && (ScalarSize == 32 || ScalarSize == 64)) + return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); + + // VBroadcast to XMM + if (Is128 && (ScalarSize == 32)) + return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); + + // The integer check is needed for the 64-bit into 128-bit so it doesn't match + // double since there is vbroadcastsd xmm + if (Subtarget->hasAVX2() && Ld.getValueType().isInteger()) { + // VBroadcast to YMM + if (Is256 && (ScalarSize == 8 || ScalarSize == 16)) + return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); + + // VBroadcast to XMM + if (Is128 && (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)) + return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); + } + + // Unsupported broadcast. + return SDValue(); +} + SDValue X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { DebugLoc dl = Op.getDebugLoc(); @@ -5053,22 +5001,26 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { if (ISD::isBuildVectorAllZeros(Op.getNode())) { // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd // and 2) ensure that i64 scalars are eliminated on x86-32 hosts. - if (Op.getValueType() == MVT::v4i32 || - Op.getValueType() == MVT::v8i32) + if (VT == MVT::v4i32 || VT == MVT::v8i32) return Op; - return getZeroVector(Op.getValueType(), Subtarget->hasXMMInt(), DAG, dl); + return getZeroVector(VT, Subtarget, DAG, dl); } // Vectors containing all ones can be matched by pcmpeqd on 128-bit width - // vectors or broken into v4i32 operations on 256-bit vectors. + // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use + // vpcmpeqd on 256-bit vectors. if (ISD::isBuildVectorAllOnes(Op.getNode())) { - if (Op.getValueType() == MVT::v4i32) + if (VT == MVT::v4i32 || (VT == MVT::v8i32 && Subtarget->hasAVX2())) return Op; - return getOnesVector(Op.getValueType(), DAG, dl); + return getOnesVector(VT, Subtarget->hasAVX2(), DAG, dl); } + SDValue Broadcast = LowerVectorBroadcast(Op, DAG); + if (Broadcast.getNode()) + return Broadcast; + unsigned EVTBits = ExtVT.getSizeInBits(); unsigned NumZero = 0; @@ -5118,8 +5070,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { // convert it to a vector with movd (S2V+shuffle to zero extend). 
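Condensing the size checks in LowerVectorBroadcast above: plain AVX can broadcast 32-bit scalars to XMM or YMM and 64-bit scalars to YMM, while the integer 8/16-bit forms (and 64-bit into XMM) require AVX2. The decision table below is a standalone sketch of those checks; canBroadcastLoad is an invented name, and the real code additionally insists that the scalar is a normal load or a constant-pool constant, which this sketch ignores.

#include <cassert>

// Which scalar sizes can become a VBROADCAST for a 128- or 256-bit result,
// following the checks in the hunk above.
static bool canBroadcastLoad(unsigned vecBits, unsigned scalarBits,
                             bool isInteger, bool hasAVX2) {
  bool is256 = vecBits == 256;
  bool is128 = vecBits == 128;

  // AVX (vbroadcastss/vbroadcastsd): 32-bit to either width, 64-bit to YMM.
  if (is256 && (scalarBits == 32 || scalarBits == 64))
    return true;
  if (is128 && scalarBits == 32)
    return true;

  // AVX2 adds the integer forms (vpbroadcastb/w/d/q).
  if (hasAVX2 && isInteger) {
    if (is256 && (scalarBits == 8 || scalarBits == 16))
      return true;
    if (is128 && (scalarBits == 8 || scalarBits == 16 || scalarBits == 64))
      return true;
  }
  return false;
}

int main() {
  assert(canBroadcastLoad(256, 64, /*isInteger=*/false, /*hasAVX2=*/false));
  assert(!canBroadcastLoad(128, 64, false, false)); // no vbroadcastsd xmm on AVX
  assert(canBroadcastLoad(128, 16, true, true));    // vpbroadcastw needs AVX2
  assert(!canBroadcastLoad(128, 16, true, false));
  return 0;
}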
Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item); Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item); - Item = getShuffleVectorZeroOrUndef(Item, 0, true, - Subtarget->hasXMMInt(), DAG); + Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG); // Now we have our 32-bit value zero extended in the low element of // a vector. If Idx != 0, swizzle it into place. @@ -5132,7 +5083,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { DAG.getUNDEF(Item.getValueType()), &Mask[0]); } - return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Item); + return DAG.getNode(ISD::BITCAST, dl, VT, Item); } } @@ -5141,21 +5092,33 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { // the rest of the elements. This will be matched as movd/movq/movss/movsd // depending on what the source datatype is. if (Idx == 0) { - if (NumZero == 0) { + if (NumZero == 0) return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); - } else if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 || + + if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 || (ExtVT == MVT::i64 && Subtarget->is64Bit())) { + if (VT.getSizeInBits() == 256) { + SDValue ZeroVec = getZeroVector(VT, Subtarget, DAG, dl); + return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, ZeroVec, + Item, DAG.getIntPtrConstant(0)); + } + assert(VT.getSizeInBits() == 128 && "Expected an SSE value type!"); Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector. - return getShuffleVectorZeroOrUndef(Item, 0, true,Subtarget->hasXMMInt(), - DAG); - } else if (ExtVT == MVT::i16 || ExtVT == MVT::i8) { + return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG); + } + + if (ExtVT == MVT::i16 || ExtVT == MVT::i8) { Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item); - assert(VT.getSizeInBits() == 128 && "Expected an SSE value type!"); - EVT MiddleVT = MVT::v4i32; - Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MiddleVT, Item); - Item = getShuffleVectorZeroOrUndef(Item, 0, true, - Subtarget->hasXMMInt(), DAG); + Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item); + if (VT.getSizeInBits() == 256) { + SDValue ZeroVec = getZeroVector(MVT::v8i32, Subtarget, DAG, dl); + Item = Insert128BitVector(ZeroVec, Item, DAG.getConstant(0, MVT::i32), + DAG, dl); + } else { + assert(VT.getSizeInBits() == 128 && "Expected an SSE value type!"); + Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG); + } return DAG.getNode(ISD::BITCAST, dl, VT, Item); } } @@ -5183,8 +5146,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); // Turn it into a shuffle of zero and zero-extended scalar to vector. - Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0, - Subtarget->hasXMMInt(), DAG); + Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0, Subtarget, DAG); SmallVector<int, 8> MaskVec; for (unsigned i = 0; i < NumElems; i++) MaskVec.push_back(i == Idx ? 0 : 1); @@ -5214,9 +5176,9 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { // For AVX-length vectors, build the individual 128-bit pieces and use // shuffles to put them in place. 
- if (VT.getSizeInBits() == 256 && !ISD::isBuildVectorAllZeros(Op.getNode())) { + if (VT.getSizeInBits() == 256) { SmallVector<SDValue, 32> V; - for (unsigned i = 0; i < NumElems; ++i) + for (unsigned i = 0; i != NumElems; ++i) V.push_back(Op.getOperand(i)); EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2); @@ -5240,8 +5202,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { unsigned Idx = CountTrailingZeros_32(NonZeros); SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(Idx)); - return getShuffleVectorZeroOrUndef(V2, Idx, true, - Subtarget->hasXMMInt(), DAG); + return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG); } return SDValue(); } @@ -5249,24 +5210,23 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { // If element VT is < 32 bits, convert it to inserts into a zero vector. if (EVTBits == 8 && NumElems == 16) { SDValue V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG, - *this); + Subtarget, *this); if (V.getNode()) return V; } if (EVTBits == 16 && NumElems == 8) { SDValue V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG, - *this); + Subtarget, *this); if (V.getNode()) return V; } // If element VT is == 32 bits, turn it into a number of shuffles. - SmallVector<SDValue, 8> V; - V.resize(NumElems); + SmallVector<SDValue, 8> V(NumElems); if (NumElems == 4 && NumZero > 0) { for (unsigned i = 0; i < 4; ++i) { bool isZero = !(NonZeros & (1 << i)); if (isZero) - V[i] = getZeroVector(VT, Subtarget->hasXMMInt(), DAG, dl); + V[i] = getZeroVector(VT, Subtarget, DAG, dl); else V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i)); } @@ -5289,13 +5249,14 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { } } - SmallVector<int, 8> MaskVec; - bool Reverse = (NonZeros & 0x3) == 2; - for (unsigned i = 0; i < 2; ++i) - MaskVec.push_back(Reverse ? 1-i : i); - Reverse = ((NonZeros & (0x3 << 2)) >> 2) == 2; - for (unsigned i = 0; i < 2; ++i) - MaskVec.push_back(Reverse ? 1-i+NumElems : i+NumElems); + bool Reverse1 = (NonZeros & 0x3) == 2; + bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2; + int MaskVec[] = { + Reverse1 ? 1 : 0, + Reverse1 ? 0 : 1, + static_cast<int>(Reverse2 ? NumElems+1 : NumElems), + static_cast<int>(Reverse2 ? NumElems : NumElems+1) + }; return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]); } @@ -5310,7 +5271,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { return LD; // For SSE 4.1, use insertps to put the high elements into the low element. - if (getSubtarget()->hasSSE41() || getSubtarget()->hasAVX()) { + if (getSubtarget()->hasSSE41()) { SDValue Result; if (Op.getOperand(0).getOpcode() != ISD::UNDEF) Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0)); @@ -5422,6 +5383,85 @@ X86TargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const { return LowerAVXCONCAT_VECTORS(Op, DAG); } +// Try to lower a shuffle node into a simple blend instruction. 
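Before the new lowering routine that follows, here is the blend-mask computation it performs, as a standalone sketch: a shuffle is a simple per-element blend only if every result element either stays in place (or is undef), which sets its immediate bit, or comes from the same position of the second input, which leaves the bit clear. computeBlendImm is an invented helper, and the bit convention shown is the one used by the routine below, not a statement about the hardware encoding.

#include <cassert>
#include <vector>

// Returns false if the mask is not expressible as a blend; otherwise fills
// imm with one bit per element, set when the element keeps its position in
// the first input (or is undef).
static bool computeBlendImm(const std::vector<int> &mask, unsigned &imm) {
  unsigned n = mask.size();
  imm = 0;
  for (unsigned i = 0; i != n; ++i) {
    int elt = mask[i];
    if (elt == (int)i || elt == -1)
      imm |= 1u << i;            // element i of the first input
    else if (elt == (int)(i + n))
      continue;                  // element i of the second input, bit stays 0
    else
      return false;              // a real permutation, not a blend
  }
  return true;
}

int main() {
  unsigned imm;
  // v4f32 <0, 5, 2, 7>: elements 0,2 from the first input, 1,3 from the second.
  assert(computeBlendImm({0, 5, 2, 7}, imm) && imm == 0x5);
  // <0, 2, 1, 3> permutes within one input; a blend cannot do that.
  assert(!computeBlendImm({0, 2, 1, 3}, imm));
  return 0;
}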
+static SDValue LowerVECTOR_SHUFFLEtoBlend(SDValue Op, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + SDValue V1 = SVOp->getOperand(0); + SDValue V2 = SVOp->getOperand(1); + DebugLoc dl = SVOp->getDebugLoc(); + EVT VT = Op.getValueType(); + EVT InVT = V1.getValueType(); + int MaskSize = VT.getVectorNumElements(); + int InSize = InVT.getVectorNumElements(); + + if (!Subtarget->hasSSE41()) + return SDValue(); + + if (MaskSize != InSize) + return SDValue(); + + int ISDNo = 0; + MVT OpTy; + + switch (VT.getSimpleVT().SimpleTy) { + default: return SDValue(); + case MVT::v8i16: + ISDNo = X86ISD::BLENDPW; + OpTy = MVT::v8i16; + break; + case MVT::v4i32: + case MVT::v4f32: + ISDNo = X86ISD::BLENDPS; + OpTy = MVT::v4f32; + break; + case MVT::v2i64: + case MVT::v2f64: + ISDNo = X86ISD::BLENDPD; + OpTy = MVT::v2f64; + break; + case MVT::v8i32: + case MVT::v8f32: + if (!Subtarget->hasAVX()) + return SDValue(); + ISDNo = X86ISD::BLENDPS; + OpTy = MVT::v8f32; + break; + case MVT::v4i64: + case MVT::v4f64: + if (!Subtarget->hasAVX()) + return SDValue(); + ISDNo = X86ISD::BLENDPD; + OpTy = MVT::v4f64; + break; + case MVT::v16i16: + if (!Subtarget->hasAVX2()) + return SDValue(); + ISDNo = X86ISD::BLENDPW; + OpTy = MVT::v16i16; + break; + } + assert(ISDNo && "Invalid Op Number"); + + unsigned MaskVals = 0; + + for (int i = 0; i < MaskSize; ++i) { + int EltIdx = SVOp->getMaskElt(i); + if (EltIdx == i || EltIdx == -1) + MaskVals |= (1<<i); + else if (EltIdx == (i + MaskSize)) + continue; // Bit is set to zero; + else return SDValue(); + } + + V1 = DAG.getNode(ISD::BITCAST, dl, OpTy, V1); + V2 = DAG.getNode(ISD::BITCAST, dl, OpTy, V2); + SDValue Ret = DAG.getNode(ISDNo, dl, OpTy, V1, V2, + DAG.getConstant(MaskVals, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Ret); +} + // v8i16 shuffles - Prefer shuffles in the following order: // 1. [all] pshuflw, pshufhw, optional move // 2. [ssse3] 1 x pshufb @@ -5439,11 +5479,11 @@ X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op, // Determine if more than 1 of the words in each of the low and high quadwords // of the result come from the same quadword of one of the two inputs. Undef // mask values count as coming from any quadword, for better codegen. - SmallVector<unsigned, 4> LoQuad(4); - SmallVector<unsigned, 4> HiQuad(4); - BitVector InputQuads(4); + unsigned LoQuad[] = { 0, 0, 0, 0 }; + unsigned HiQuad[] = { 0, 0, 0, 0 }; + std::bitset<4> InputQuads; for (unsigned i = 0; i < 8; ++i) { - SmallVectorImpl<unsigned> &Quad = i < 4 ? LoQuad : HiQuad; + unsigned *Quad = i < 4 ? LoQuad : HiQuad; int EltIdx = SVOp->getMaskElt(i); MaskVals.push_back(EltIdx); if (EltIdx < 0) { @@ -5481,10 +5521,10 @@ X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op, // quads, disable the next transformation since it does not help SSSE3. bool V1Used = InputQuads[0] || InputQuads[1]; bool V2Used = InputQuads[2] || InputQuads[3]; - if (Subtarget->hasSSSE3() || Subtarget->hasAVX()) { + if (Subtarget->hasSSSE3()) { if (InputQuads.count() == 2 && V1Used && V2Used) { - BestLoQuad = InputQuads.find_first(); - BestHiQuad = InputQuads.find_next(BestLoQuad); + BestLoQuad = InputQuads[0] ? 0 : 1; + BestHiQuad = InputQuads[2] ? 2 : 3; } if (InputQuads.count() > 2) { BestLoQuad = -1; @@ -5497,9 +5537,10 @@ X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op, // words from all 4 input quadwords. SDValue NewV; if (BestLoQuad >= 0 || BestHiQuad >= 0) { - SmallVector<int, 8> MaskV; - MaskV.push_back(BestLoQuad < 0 ? 
0 : BestLoQuad); - MaskV.push_back(BestHiQuad < 0 ? 1 : BestHiQuad); + int MaskV[] = { + BestLoQuad < 0 ? 0 : BestLoQuad, + BestHiQuad < 0 ? 1 : BestHiQuad + }; NewV = DAG.getVectorShuffle(MVT::v2i64, dl, DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V1), DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V2), &MaskV[0]); @@ -5544,8 +5585,9 @@ X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op, unsigned TargetMask = 0; NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16), &MaskVals[0]); - TargetMask = pshufhw ? X86::getShufflePSHUFHWImmediate(NewV.getNode()): - X86::getShufflePSHUFLWImmediate(NewV.getNode()); + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode()); + TargetMask = pshufhw ? getShufflePSHUFHWImmediate(SVOp): + getShufflePSHUFLWImmediate(SVOp); V1 = NewV.getOperand(0); return getTargetShuffleNode(Opc, dl, MVT::v8i16, V1, TargetMask, DAG); } @@ -5554,7 +5596,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op, // If we have SSSE3, and all words of the result are from 1 input vector, // case 2 is generated, otherwise case 3 is generated. If no SSSE3 // is present, fall back to case 4. - if (Subtarget->hasSSSE3() || Subtarget->hasAVX()) { + if (Subtarget->hasSSSE3()) { SmallVector<SDValue,16> pshufbMask; // If we have elements from both input vectors, set the high bit of the @@ -5602,61 +5644,51 @@ X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op, // If BestLoQuad >= 0, generate a pshuflw to put the low elements in order, // and update MaskVals with new element order. - BitVector InOrder(8); + std::bitset<8> InOrder; if (BestLoQuad >= 0) { - SmallVector<int, 8> MaskV; + int MaskV[] = { -1, -1, -1, -1, 4, 5, 6, 7 }; for (int i = 0; i != 4; ++i) { int idx = MaskVals[i]; if (idx < 0) { - MaskV.push_back(-1); InOrder.set(i); } else if ((idx / 4) == BestLoQuad) { - MaskV.push_back(idx & 3); + MaskV[i] = idx & 3; InOrder.set(i); - } else { - MaskV.push_back(-1); } } - for (unsigned i = 4; i != 8; ++i) - MaskV.push_back(i); NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16), &MaskV[0]); - if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && - (Subtarget->hasSSSE3() || Subtarget->hasAVX())) + if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3()) { + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode()); NewV = getTargetShuffleNode(X86ISD::PSHUFLW, dl, MVT::v8i16, - NewV.getOperand(0), - X86::getShufflePSHUFLWImmediate(NewV.getNode()), - DAG); + NewV.getOperand(0), + getShufflePSHUFLWImmediate(SVOp), DAG); + } } // If BestHi >= 0, generate a pshufhw to put the high elements in order, // and update MaskVals with the new element order. 
if (BestHiQuad >= 0) { - SmallVector<int, 8> MaskV; - for (unsigned i = 0; i != 4; ++i) - MaskV.push_back(i); + int MaskV[] = { 0, 1, 2, 3, -1, -1, -1, -1 }; for (unsigned i = 4; i != 8; ++i) { int idx = MaskVals[i]; if (idx < 0) { - MaskV.push_back(-1); InOrder.set(i); } else if ((idx / 4) == BestHiQuad) { - MaskV.push_back((idx & 3) + 4); + MaskV[i] = (idx & 3) + 4; InOrder.set(i); - } else { - MaskV.push_back(-1); } } NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16), &MaskV[0]); - if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && - (Subtarget->hasSSSE3() || Subtarget->hasAVX())) + if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3()) { + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode()); NewV = getTargetShuffleNode(X86ISD::PSHUFHW, dl, MVT::v8i16, - NewV.getOperand(0), - X86::getShufflePSHUFHWImmediate(NewV.getNode()), - DAG); + NewV.getOperand(0), + getShufflePSHUFHWImmediate(SVOp), DAG); + } } // In case BestHi & BestLo were both -1, which means each quadword has a word @@ -5698,8 +5730,7 @@ SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp, SDValue V1 = SVOp->getOperand(0); SDValue V2 = SVOp->getOperand(1); DebugLoc dl = SVOp->getDebugLoc(); - SmallVector<int, 16> MaskVals; - SVOp->getMask(MaskVals); + ArrayRef<int> MaskVals = SVOp->getMask(); // If we have SSSE3, case 1 is generated when all result bytes come from // one of the inputs. Otherwise, case 2 is generated. If no SSSE3 is @@ -5718,7 +5749,7 @@ SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp, } // If SSSE3, use 1 pshufb instruction per vector with elements in the result. - if (TLI.getSubtarget()->hasSSSE3() || TLI.getSubtarget()->hasAVX()) { + if (TLI.getSubtarget()->hasSSSE3()) { SmallVector<SDValue,16> pshufbMask; // If all result elements are from one input vector, then only translate @@ -5849,7 +5880,7 @@ SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp, unsigned NewWidth = (NumElems == 4) ? 2 : 4; EVT NewVT; switch (VT.getSimpleVT().SimpleTy) { - default: assert(false && "Unexpected!"); + default: llvm_unreachable("Unexpected!"); case MVT::v4f32: NewVT = MVT::v2f64; break; case MVT::v4i32: NewVT = MVT::v2i64; break; case MVT::v8i16: NewVT = MVT::v4i32; break; @@ -5915,96 +5946,89 @@ static SDValue getVZextMovL(EVT VT, EVT OpVT, OpVT, SrcOp))); } -/// areShuffleHalvesWithinDisjointLanes - Check whether each half of a vector -/// shuffle node referes to only one lane in the sources. -static bool areShuffleHalvesWithinDisjointLanes(ShuffleVectorSDNode *SVOp) { - EVT VT = SVOp->getValueType(0); - int NumElems = VT.getVectorNumElements(); - int HalfSize = NumElems/2; - SmallVector<int, 16> M; - SVOp->getMask(M); - bool MatchA = false, MatchB = false; - - for (int l = 0; l < NumElems*2; l += HalfSize) { - if (isUndefOrInRange(M, 0, HalfSize, l, l+HalfSize)) { - MatchA = true; - break; - } - } - - for (int l = 0; l < NumElems*2; l += HalfSize) { - if (isUndefOrInRange(M, HalfSize, HalfSize, l, l+HalfSize)) { - MatchB = true; - break; - } - } - - return MatchA && MatchB; -} - /// LowerVECTOR_SHUFFLE_256 - Handle all 256-bit wide vectors shuffles /// which could not be matched by any known target speficic shuffle static SDValue LowerVECTOR_SHUFFLE_256(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { - if (areShuffleHalvesWithinDisjointLanes(SVOp)) { - // If each half of a vector shuffle node referes to only one lane in the - // source vectors, extract each used 128-bit lane and shuffle them using - // 128-bit shuffles. 
Then, concatenate the results. Otherwise leave - // the work to the legalizer. - DebugLoc dl = SVOp->getDebugLoc(); - EVT VT = SVOp->getValueType(0); - int NumElems = VT.getVectorNumElements(); - int HalfSize = NumElems/2; - - // Extract the reference for each half - int FstVecExtractIdx = 0, SndVecExtractIdx = 0; - int FstVecOpNum = 0, SndVecOpNum = 0; - for (int i = 0; i < HalfSize; ++i) { - int Elt = SVOp->getMaskElt(i); - if (SVOp->getMaskElt(i) < 0) - continue; - FstVecOpNum = Elt/NumElems; - FstVecExtractIdx = Elt % NumElems < HalfSize ? 0 : HalfSize; - break; - } - for (int i = HalfSize; i < NumElems; ++i) { - int Elt = SVOp->getMaskElt(i); - if (SVOp->getMaskElt(i) < 0) + EVT VT = SVOp->getValueType(0); + + unsigned NumElems = VT.getVectorNumElements(); + unsigned NumLaneElems = NumElems / 2; + + DebugLoc dl = SVOp->getDebugLoc(); + MVT EltVT = VT.getVectorElementType().getSimpleVT(); + EVT NVT = MVT::getVectorVT(EltVT, NumLaneElems); + SDValue Shufs[2]; + + SmallVector<int, 16> Mask; + for (unsigned l = 0; l < 2; ++l) { + // Build a shuffle mask for the output, discovering on the fly which + // input vectors to use as shuffle operands (recorded in InputUsed). + // If building a suitable shuffle vector proves too hard, then bail + // out with useBuildVector set. + int InputUsed[2] = { -1, -1 }; // Not yet discovered. + unsigned LaneStart = l * NumLaneElems; + for (unsigned i = 0; i != NumLaneElems; ++i) { + // The mask element. This indexes into the input. + int Idx = SVOp->getMaskElt(i+LaneStart); + if (Idx < 0) { + // the mask element does not index into any input vector. + Mask.push_back(-1); continue; - SndVecOpNum = Elt/NumElems; - SndVecExtractIdx = Elt % NumElems < HalfSize ? 0 : HalfSize; - break; - } + } - // Extract the subvectors - SDValue V1 = Extract128BitVector(SVOp->getOperand(FstVecOpNum), - DAG.getConstant(FstVecExtractIdx, MVT::i32), DAG, dl); - SDValue V2 = Extract128BitVector(SVOp->getOperand(SndVecOpNum), - DAG.getConstant(SndVecExtractIdx, MVT::i32), DAG, dl); + // The input vector this mask element indexes into. + int Input = Idx / NumLaneElems; - // Generate 128-bit shuffles - SmallVector<int, 16> MaskV1, MaskV2; - for (int i = 0; i < HalfSize; ++i) { - int Elt = SVOp->getMaskElt(i); - MaskV1.push_back(Elt < 0 ? Elt : Elt % HalfSize); - } - for (int i = HalfSize; i < NumElems; ++i) { - int Elt = SVOp->getMaskElt(i); - MaskV2.push_back(Elt < 0 ? Elt : Elt % HalfSize); + // Turn the index into an offset from the start of the input vector. + Idx -= Input * NumLaneElems; + + // Find or create a shuffle vector operand to hold this input. + unsigned OpNo; + for (OpNo = 0; OpNo < array_lengthof(InputUsed); ++OpNo) { + if (InputUsed[OpNo] == Input) + // This input vector is already an operand. + break; + if (InputUsed[OpNo] < 0) { + // Create a new operand for this input vector. + InputUsed[OpNo] = Input; + break; + } + } + + if (OpNo >= array_lengthof(InputUsed)) { + // More than two input vectors used! Give up. + return SDValue(); + } + + // Add the mask index for the new shuffle vector. + Mask.push_back(Idx + OpNo * NumLaneElems); } - EVT NVT = V1.getValueType(); - V1 = DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), &MaskV1[0]); - V2 = DAG.getVectorShuffle(NVT, dl, V2, DAG.getUNDEF(NVT), &MaskV2[0]); + if (InputUsed[0] < 0) { + // No input vectors were used! The result is undefined. 
+ Shufs[l] = DAG.getUNDEF(NVT); + } else { + SDValue Op0 = Extract128BitVector(SVOp->getOperand(InputUsed[0] / 2), + DAG.getConstant((InputUsed[0] % 2) * NumLaneElems, MVT::i32), + DAG, dl); + // If only one input was used, use an undefined vector for the other. + SDValue Op1 = (InputUsed[1] < 0) ? DAG.getUNDEF(NVT) : + Extract128BitVector(SVOp->getOperand(InputUsed[1] / 2), + DAG.getConstant((InputUsed[1] % 2) * NumLaneElems, MVT::i32), + DAG, dl); + // At least one input vector was used. Create a new shuffle vector. + Shufs[l] = DAG.getVectorShuffle(NVT, dl, Op0, Op1, &Mask[0]); + } - // Concatenate the result back - SDValue V = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, VT), V1, - DAG.getConstant(0, MVT::i32), DAG, dl); - return Insert128BitVector(V, V2, DAG.getConstant(NumElems/2, MVT::i32), - DAG, dl); + Mask.clear(); } - return SDValue(); + // Concatenate the result back + SDValue V = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, VT), Shufs[0], + DAG.getConstant(0, MVT::i32), DAG, dl); + return Insert128BitVector(V, Shufs[1],DAG.getConstant(NumLaneElems, MVT::i32), + DAG, dl); } /// LowerVECTOR_SHUFFLE_128v4 - Handle all 128-bit wide vectors with @@ -6018,11 +6042,9 @@ LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { assert(VT.getSizeInBits() == 128 && "Unsupported vector size"); - SmallVector<std::pair<int, int>, 8> Locs; - Locs.resize(4); - SmallVector<int, 8> Mask1(4U, -1); - SmallVector<int, 8> PermMask; - SVOp->getMask(PermMask); + std::pair<int, int> Locs[4]; + int Mask1[] = { -1, -1, -1, -1 }; + SmallVector<int, 8> PermMask(SVOp->getMask().begin(), SVOp->getMask().end()); unsigned NumHi = 0; unsigned NumLo = 0; @@ -6052,17 +6074,14 @@ LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { // vector operands, put the elements into the right order. V1 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]); - SmallVector<int, 8> Mask2(4U, -1); + int Mask2[] = { -1, -1, -1, -1 }; - for (unsigned i = 0; i != 4; ++i) { - if (Locs[i].first == -1) - continue; - else { + for (unsigned i = 0; i != 4; ++i) + if (Locs[i].first != -1) { unsigned Idx = (i < 2) ? 0 : 4; Idx += Locs[i].first * 2 + Locs[i].second; Mask2[i] = Idx; } - } return DAG.getVectorShuffle(VT, dl, V1, V1, &Mask2[0]); } else if (NumLo == 3 || NumHi == 3) { @@ -6075,7 +6094,7 @@ LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { // from X. if (NumHi == 3) { // Normalize it so the 3 elements come from V1. - CommuteVectorShuffleMask(PermMask, VT); + CommuteVectorShuffleMask(PermMask, 4); std::swap(V1, V2); } @@ -6115,18 +6134,16 @@ LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { } // Break it into (shuffle shuffle_hi, shuffle_lo). 
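The rewritten LowerVECTOR_SHUFFLE_256 above no longer requires each half of the result to read from a single source lane; instead it builds each 128-bit half as its own shuffle over at most two extracted input halves, remapping indices on the fly. Below is a standalone sketch of that per-lane remapping; remapLane is an invented helper, and input halves are numbered 0..3 across the two concatenated sources, matching Idx / NumLaneElems in the hunk.

#include <cassert>
#include <vector>

// For one output lane, discover which (at most two) 128-bit input halves it
// reads and rebase the mask indices onto those operands. Returns false when
// a third input half would be needed, in which case the real code gives up.
static bool remapLane(const std::vector<int> &mask, int laneElts, int lane,
                      int inputUsed[2], std::vector<int> &laneMask) {
  inputUsed[0] = inputUsed[1] = -1;
  laneMask.clear();
  for (int i = 0; i != laneElts; ++i) {
    int idx = mask[lane * laneElts + i];
    if (idx < 0) {                     // undef passes through
      laneMask.push_back(-1);
      continue;
    }
    int input = idx / laneElts;        // which 128-bit half of which source
    idx -= input * laneElts;           // offset within that half

    int op = 0;
    for (; op != 2; ++op) {
      if (inputUsed[op] == input)
        break;                         // this half is already an operand
      if (inputUsed[op] < 0) {
        inputUsed[op] = input;         // claim a free operand slot
        break;
      }
    }
    if (op == 2)
      return false;                    // more than two input halves used
    laneMask.push_back(idx + op * laneElts);
  }
  return true;
}

int main() {
  // v8f32 mask <0,1,8,9, 6,7,14,15>: the low lane uses halves 0 and 2,
  // the high lane uses halves 1 and 3 -- both lanes are representable.
  std::vector<int> mask = {0, 1, 8, 9, 6, 7, 14, 15};
  int used[2];
  std::vector<int> lm;
  assert(remapLane(mask, 4, 0, used, lm));
  assert(used[0] == 0 && used[1] == 2);
  assert(lm == std::vector<int>({0, 1, 4, 5}));
  assert(remapLane(mask, 4, 1, used, lm));
  assert(used[0] == 1 && used[1] == 3);
  assert(lm == std::vector<int>({2, 3, 6, 7}));
  return 0;
}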
- Locs.clear(); - Locs.resize(4); - SmallVector<int,8> LoMask(4U, -1); - SmallVector<int,8> HiMask(4U, -1); + int LoMask[] = { -1, -1, -1, -1 }; + int HiMask[] = { -1, -1, -1, -1 }; - SmallVector<int,8> *MaskPtr = &LoMask; + int *MaskPtr = LoMask; unsigned MaskIdx = 0; unsigned LoIdx = 0; unsigned HiIdx = 2; for (unsigned i = 0; i != 4; ++i) { if (i == 2) { - MaskPtr = &HiMask; + MaskPtr = HiMask; MaskIdx = 1; LoIdx = 0; HiIdx = 2; @@ -6136,26 +6153,21 @@ LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { Locs[i] = std::make_pair(-1, -1); } else if (Idx < 4) { Locs[i] = std::make_pair(MaskIdx, LoIdx); - (*MaskPtr)[LoIdx] = Idx; + MaskPtr[LoIdx] = Idx; LoIdx++; } else { Locs[i] = std::make_pair(MaskIdx, HiIdx); - (*MaskPtr)[HiIdx] = Idx; + MaskPtr[HiIdx] = Idx; HiIdx++; } } SDValue LoShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &LoMask[0]); SDValue HiShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &HiMask[0]); - SmallVector<int, 8> MaskOps; - for (unsigned i = 0; i != 4; ++i) { - if (Locs[i].first == -1) { - MaskOps.push_back(-1); - } else { - unsigned Idx = Locs[i].first * 4 + Locs[i].second; - MaskOps.push_back(Idx); - } - } + int MaskOps[] = { -1, -1, -1, -1 }; + for (unsigned i = 0; i != 4; ++i) + if (Locs[i].first != -1) + MaskOps[i] = Locs[i].first * 4 + Locs[i].second; return DAG.getVectorShuffle(VT, dl, LoShuffle, HiShuffle, &MaskOps[0]); } @@ -6164,6 +6176,10 @@ static bool MayFoldVectorLoad(SDValue V) { V = V.getOperand(0); if (V.hasOneUse() && V.getOpcode() == ISD::SCALAR_TO_VECTOR) V = V.getOperand(0); + if (V.hasOneUse() && V.getOpcode() == ISD::BUILD_VECTOR && + V.getNumOperands() == 2 && V.getOperand(1).getOpcode() == ISD::UNDEF) + // BUILD_VECTOR (load), undef + V = V.getOperand(0); if (MayFoldLoad(V)) return true; return false; @@ -6186,82 +6202,6 @@ static bool RelaxedMayFoldVectorLoad(SDValue V) { return false; } -/// CanFoldShuffleIntoVExtract - Check if the current shuffle is used by -/// a vector extract, and if both can be later optimized into a single load. -/// This is done in visitEXTRACT_VECTOR_ELT and the conditions are checked -/// here because otherwise a target specific shuffle node is going to be -/// emitted for this shuffle, and the optimization not done. -/// FIXME: This is probably not the best approach, but fix the problem -/// until the right path is decided. -static -bool CanXFormVExtractWithShuffleIntoLoad(SDValue V, SelectionDAG &DAG, - const TargetLowering &TLI) { - EVT VT = V.getValueType(); - ShuffleVectorSDNode *SVOp = dyn_cast<ShuffleVectorSDNode>(V); - - // Be sure that the vector shuffle is present in a pattern like this: - // (vextract (v4f32 shuffle (load $addr), <1,u,u,u>), c) -> (f32 load $addr) - if (!V.hasOneUse()) - return false; - - SDNode *N = *V.getNode()->use_begin(); - if (N->getOpcode() != ISD::EXTRACT_VECTOR_ELT) - return false; - - SDValue EltNo = N->getOperand(1); - if (!isa<ConstantSDNode>(EltNo)) - return false; - - // If the bit convert changed the number of elements, it is unsafe - // to examine the mask. - bool HasShuffleIntoBitcast = false; - if (V.getOpcode() == ISD::BITCAST) { - EVT SrcVT = V.getOperand(0).getValueType(); - if (SrcVT.getVectorNumElements() != VT.getVectorNumElements()) - return false; - V = V.getOperand(0); - HasShuffleIntoBitcast = true; - } - - // Select the input vector, guarding against out of range extract vector. - unsigned NumElems = VT.getVectorNumElements(); - unsigned Elt = cast<ConstantSDNode>(EltNo)->getZExtValue(); - int Idx = (Elt > NumElems) ? 
-1 : SVOp->getMaskElt(Elt); - V = (Idx < (int)NumElems) ? V.getOperand(0) : V.getOperand(1); - - // Skip one more bit_convert if necessary - if (V.getOpcode() == ISD::BITCAST) - V = V.getOperand(0); - - if (ISD::isNormalLoad(V.getNode())) { - // Is the original load suitable? - LoadSDNode *LN0 = cast<LoadSDNode>(V); - - // FIXME: avoid the multi-use bug that is preventing lots of - // of foldings to be detected, this is still wrong of course, but - // give the temporary desired behavior, and if it happens that - // the load has real more uses, during isel it will not fold, and - // will generate poor code. - if (!LN0 || LN0->isVolatile()) // || !LN0->hasOneUse() - return false; - - if (!HasShuffleIntoBitcast) - return true; - - // If there's a bitcast before the shuffle, check if the load type and - // alignment is valid. - unsigned Align = LN0->getAlignment(); - unsigned NewAlign = - TLI.getTargetData()->getABITypeAlignment( - VT.getTypeForEVT(*DAG.getContext())); - - if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, VT)) - return false; - } - - return true; -} - static SDValue getMOVDDup(SDValue &Op, DebugLoc &dl, SDValue V1, SelectionDAG &DAG) { EVT VT = Op.getValueType(); @@ -6275,14 +6215,14 @@ SDValue getMOVDDup(SDValue &Op, DebugLoc &dl, SDValue V1, SelectionDAG &DAG) { static SDValue getMOVLowToHigh(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, - bool HasXMMInt) { + bool HasSSE2) { SDValue V1 = Op.getOperand(0); SDValue V2 = Op.getOperand(1); EVT VT = Op.getValueType(); assert(VT != MVT::v2i64 && "unsupported shuffle type"); - if (HasXMMInt && VT == MVT::v2f64) + if (HasSSE2 && VT == MVT::v2f64) return getTargetShuffleNode(X86ISD::MOVLHPD, dl, VT, V1, V2, DAG); // v4f32 or v4i32: canonizalized to v4f32 (which is legal for SSE1) @@ -6308,24 +6248,8 @@ SDValue getMOVHighToLow(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG) { return getTargetShuffleNode(X86ISD::MOVHLPS, dl, VT, V1, V2, DAG); } -static inline unsigned getSHUFPOpcode(EVT VT) { - switch(VT.getSimpleVT().SimpleTy) { - case MVT::v8i32: // Use fp unit for int unpack. - case MVT::v8f32: - case MVT::v4i32: // Use fp unit for int unpack. - case MVT::v4f32: return X86ISD::SHUFPS; - case MVT::v4i64: // Use fp unit for int unpack. - case MVT::v4f64: - case MVT::v2i64: // Use fp unit for int unpack. - case MVT::v2f64: return X86ISD::SHUFPD; - default: - llvm_unreachable("Unknown type for shufp*"); - } - return 0; -} - static -SDValue getMOVLP(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, bool HasXMMInt) { +SDValue getMOVLP(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, bool HasSSE2) { SDValue V1 = Op.getOperand(0); SDValue V2 = Op.getOperand(1); EVT VT = Op.getValueType(); @@ -6346,32 +6270,30 @@ SDValue getMOVLP(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, bool HasXMMInt) { // turns into: // (MOVLPSmr addr:$src1, VR128:$src2) // So, recognize this potential and also use MOVLPS or MOVLPD - if (MayFoldVectorLoad(V1) && MayFoldIntoStore(Op)) + else if (MayFoldVectorLoad(V1) && MayFoldIntoStore(Op)) CanFoldLoad = true; - // Both of them can't be memory operations though. - if (MayFoldVectorLoad(V1) && MayFoldVectorLoad(V2)) - CanFoldLoad = false; - + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); if (CanFoldLoad) { - if (HasXMMInt && NumElems == 2) + if (HasSSE2 && NumElems == 2) return getTargetShuffleNode(X86ISD::MOVLPD, dl, VT, V1, V2, DAG); if (NumElems == 4) - return getTargetShuffleNode(X86ISD::MOVLPS, dl, VT, V1, V2, DAG); + // If we don't care about the second element, procede to use movss. 
+ if (SVOp->getMaskElt(1) != -1) + return getTargetShuffleNode(X86ISD::MOVLPS, dl, VT, V1, V2, DAG); } - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); // movl and movlp will both match v2i64, but v2i64 is never matched by // movl earlier because we make it strict to avoid messing with the movlp load // folding logic (see the code above getMOVLP call). Match it here then, // this is horrible, but will stay like this until we move all shuffle // matching to x86 specific nodes. Note that for the 1st condition all // types are matched with movsd. - if (HasXMMInt) { + if (HasSSE2) { // FIXME: isMOVLMask should be checked and matched before getMOVLP, // as to remove this logic from here, as much as possible - if (NumElems == 2 || !X86::isMOVLMask(SVOp)) + if (NumElems == 2 || !isMOVLMask(SVOp->getMask(), VT)) return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG); return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG); } @@ -6379,112 +6301,12 @@ SDValue getMOVLP(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, bool HasXMMInt) { assert(VT != MVT::v4i32 && "unsupported shuffle type"); // Invert the operand order and use SHUFPS to match it. - return getTargetShuffleNode(getSHUFPOpcode(VT), dl, VT, V2, V1, - X86::getShuffleSHUFImmediate(SVOp), DAG); -} - -static inline unsigned getUNPCKLOpcode(EVT VT) { - switch(VT.getSimpleVT().SimpleTy) { - case MVT::v4i32: return X86ISD::PUNPCKLDQ; - case MVT::v2i64: return X86ISD::PUNPCKLQDQ; - case MVT::v4f32: return X86ISD::UNPCKLPS; - case MVT::v2f64: return X86ISD::UNPCKLPD; - case MVT::v8i32: // Use fp unit for int unpack. - case MVT::v8f32: return X86ISD::VUNPCKLPSY; - case MVT::v4i64: // Use fp unit for int unpack. - case MVT::v4f64: return X86ISD::VUNPCKLPDY; - case MVT::v16i8: return X86ISD::PUNPCKLBW; - case MVT::v8i16: return X86ISD::PUNPCKLWD; - default: - llvm_unreachable("Unknown type for unpckl"); - } - return 0; -} - -static inline unsigned getUNPCKHOpcode(EVT VT) { - switch(VT.getSimpleVT().SimpleTy) { - case MVT::v4i32: return X86ISD::PUNPCKHDQ; - case MVT::v2i64: return X86ISD::PUNPCKHQDQ; - case MVT::v4f32: return X86ISD::UNPCKHPS; - case MVT::v2f64: return X86ISD::UNPCKHPD; - case MVT::v8i32: // Use fp unit for int unpack. - case MVT::v8f32: return X86ISD::VUNPCKHPSY; - case MVT::v4i64: // Use fp unit for int unpack. - case MVT::v4f64: return X86ISD::VUNPCKHPDY; - case MVT::v16i8: return X86ISD::PUNPCKHBW; - case MVT::v8i16: return X86ISD::PUNPCKHWD; - default: - llvm_unreachable("Unknown type for unpckh"); - } - return 0; -} - -static inline unsigned getVPERMILOpcode(EVT VT) { - switch(VT.getSimpleVT().SimpleTy) { - case MVT::v4i32: - case MVT::v4f32: return X86ISD::VPERMILPS; - case MVT::v2i64: - case MVT::v2f64: return X86ISD::VPERMILPD; - case MVT::v8i32: - case MVT::v8f32: return X86ISD::VPERMILPSY; - case MVT::v4i64: - case MVT::v4f64: return X86ISD::VPERMILPDY; - default: - llvm_unreachable("Unknown type for vpermil"); - } - return 0; -} - -/// isVectorBroadcast - Check if the node chain is suitable to be xformed to -/// a vbroadcast node. The nodes are suitable whenever we can fold a load coming -/// from a 32 or 64 bit scalar. Update Op to the desired load to be folded. 
-static bool isVectorBroadcast(SDValue &Op) { - EVT VT = Op.getValueType(); - bool Is256 = VT.getSizeInBits() == 256; - - assert((VT.getSizeInBits() == 128 || Is256) && - "Unsupported type for vbroadcast node"); - - SDValue V = Op; - if (V.hasOneUse() && V.getOpcode() == ISD::BITCAST) - V = V.getOperand(0); - - if (Is256 && !(V.hasOneUse() && - V.getOpcode() == ISD::INSERT_SUBVECTOR && - V.getOperand(0).getOpcode() == ISD::UNDEF)) - return false; - - if (Is256) - V = V.getOperand(1); - - if (!V.hasOneUse()) - return false; - - // Check the source scalar_to_vector type. 256-bit broadcasts are - // supported for 32/64-bit sizes, while 128-bit ones are only supported - // for 32-bit scalars. - if (V.getOpcode() != ISD::SCALAR_TO_VECTOR) - return false; - - unsigned ScalarSize = V.getOperand(0).getValueType().getSizeInBits(); - if (ScalarSize != 32 && ScalarSize != 64) - return false; - if (!Is256 && ScalarSize == 64) - return false; - - V = V.getOperand(0); - if (!MayFoldLoad(V)) - return false; - - // Return the load node - Op = V; - return true; + return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V2, V1, + getShuffleSHUFImmediate(SVOp), DAG); } -static -SDValue NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG, - const TargetLowering &TLI, - const X86Subtarget *Subtarget) { +SDValue +X86TargetLowering::NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG) const { ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); EVT VT = Op.getValueType(); DebugLoc dl = Op.getDebugLoc(); @@ -6492,22 +6314,17 @@ SDValue NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG, SDValue V2 = Op.getOperand(1); if (isZeroShuffle(SVOp)) - return getZeroVector(VT, Subtarget->hasXMMInt(), DAG, dl); + return getZeroVector(VT, Subtarget, DAG, dl); // Handle splat operations if (SVOp->isSplat()) { unsigned NumElem = VT.getVectorNumElements(); int Size = VT.getSizeInBits(); - // Special case, this is the only place now where it's allowed to return - // a vector_shuffle operation without using a target specific node, because - // *hopefully* it will be optimized away by the dag combiner. FIXME: should - // this be moved to DAGCombine instead? - if (NumElem <= 4 && CanXFormVExtractWithShuffleIntoLoad(Op, DAG, TLI)) - return Op; // Use vbroadcast whenever the splat comes from a foldable load - if (Subtarget->hasAVX() && isVectorBroadcast(V1)) - return DAG.getNode(X86ISD::VBROADCAST, dl, VT, V1); + SDValue Broadcast = LowerVectorBroadcast(Op, DAG); + if (Broadcast.getNode()) + return Broadcast; // Handle splats by matching through known shuffle masks if ((Size == 128 && NumElem <= 4) || @@ -6525,21 +6342,26 @@ SDValue NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG, if (NewOp.getNode()) return DAG.getNode(ISD::BITCAST, dl, VT, NewOp); } else if ((VT == MVT::v4i32 || - (VT == MVT::v4f32 && Subtarget->hasXMMInt()))) { + (VT == MVT::v4f32 && Subtarget->hasSSE2()))) { // FIXME: Figure out a cleaner way to do this. // Try to make use of movq to zero out the top part. 
if (ISD::isBuildVectorAllZeros(V2.getNode())) { SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, dl); if (NewOp.getNode()) { - if (isCommutedMOVL(cast<ShuffleVectorSDNode>(NewOp), true, false)) - return getVZextMovL(VT, NewOp.getValueType(), NewOp.getOperand(0), + EVT NewVT = NewOp.getValueType(); + if (isCommutedMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(), + NewVT, true, false)) + return getVZextMovL(VT, NewVT, NewOp.getOperand(0), DAG, Subtarget, dl); } } else if (ISD::isBuildVectorAllZeros(V1.getNode())) { SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, dl); - if (NewOp.getNode() && X86::isMOVLMask(cast<ShuffleVectorSDNode>(NewOp))) - return getVZextMovL(VT, NewOp.getValueType(), NewOp.getOperand(1), - DAG, Subtarget, dl); + if (NewOp.getNode()) { + EVT NewVT = NewOp.getValueType(); + if (isMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(), NewVT)) + return getVZextMovL(VT, NewVT, NewOp.getOperand(1), + DAG, Subtarget, dl); + } } } return SDValue(); @@ -6553,18 +6375,22 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); DebugLoc dl = Op.getDebugLoc(); unsigned NumElems = VT.getVectorNumElements(); - bool isMMX = VT.getSizeInBits() == 64; bool V1IsUndef = V1.getOpcode() == ISD::UNDEF; bool V2IsUndef = V2.getOpcode() == ISD::UNDEF; bool V1IsSplat = false; bool V2IsSplat = false; - bool HasXMMInt = Subtarget->hasXMMInt(); + bool HasSSE2 = Subtarget->hasSSE2(); + bool HasAVX = Subtarget->hasAVX(); + bool HasAVX2 = Subtarget->hasAVX2(); MachineFunction &MF = DAG.getMachineFunction(); bool OptForSize = MF.getFunction()->hasFnAttr(Attribute::OptimizeForSize); - // Shuffle operations on MMX not supported. - if (isMMX) - return Op; + assert(VT.getSizeInBits() != 64 && "Can't lower MMX shuffles"); + + if (V1IsUndef && V2IsUndef) + return DAG.getUNDEF(VT); + + assert(!V1IsUndef && "Op 1 of shuffle should not be undef"); // Vector shuffle lowering takes 3 steps: // @@ -6576,50 +6402,54 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { // so the shuffle can be broken into other shuffles and the legalizer can // try the lowering again. // - // The general ideia is that no vector_shuffle operation should be left to + // The general idea is that no vector_shuffle operation should be left to // be matched during isel, all of them must be converted to a target specific // node here. // Normalize the input vectors. Here splats, zeroed vectors, profitable // narrowing and commutation of operands should be handled. The actual code // doesn't include all of those, work in progress... - SDValue NewOp = NormalizeVectorShuffle(Op, DAG, *this, Subtarget); + SDValue NewOp = NormalizeVectorShuffle(Op, DAG); if (NewOp.getNode()) return NewOp; + SmallVector<int, 8> M(SVOp->getMask().begin(), SVOp->getMask().end()); + // NOTE: isPSHUFDMask can also match both masks below (unpckl_undef and // unpckh_undef). Only use pshufd if speed is more important than size. 
- if (OptForSize && X86::isUNPCKL_v_undef_Mask(SVOp)) - return getTargetShuffleNode(getUNPCKLOpcode(VT), dl, VT, V1, V1, DAG); - if (OptForSize && X86::isUNPCKH_v_undef_Mask(SVOp)) - return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V1, DAG); + if (OptForSize && isUNPCKL_v_undef_Mask(M, VT, HasAVX2)) + return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG); + if (OptForSize && isUNPCKH_v_undef_Mask(M, VT, HasAVX2)) + return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG); - if (X86::isMOVDDUPMask(SVOp) && - (Subtarget->hasSSE3() || Subtarget->hasAVX()) && + if (isMOVDDUPMask(M, VT) && Subtarget->hasSSE3() && V2IsUndef && RelaxedMayFoldVectorLoad(V1)) return getMOVDDup(Op, dl, V1, DAG); - if (X86::isMOVHLPS_v_undef_Mask(SVOp)) + if (isMOVHLPS_v_undef_Mask(M, VT)) return getMOVHighToLow(Op, dl, DAG); // Use to match splats - if (HasXMMInt && X86::isUNPCKHMask(SVOp) && V2IsUndef && + if (HasSSE2 && isUNPCKHMask(M, VT, HasAVX2) && V2IsUndef && (VT == MVT::v2f64 || VT == MVT::v2i64)) - return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V1, DAG); + return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG); - if (X86::isPSHUFDMask(SVOp)) { + if (isPSHUFDMask(M, VT)) { // The actual implementation will match the mask in the if above and then // during isel it can match several different instructions, not only pshufd // as its name says, sad but true, emulate the behavior for now... - if (X86::isMOVDDUPMask(SVOp) && ((VT == MVT::v4f32 || VT == MVT::v2i64))) - return getTargetShuffleNode(X86ISD::MOVLHPS, dl, VT, V1, V1, DAG); + if (isMOVDDUPMask(M, VT) && ((VT == MVT::v4f32 || VT == MVT::v2i64))) + return getTargetShuffleNode(X86ISD::MOVLHPS, dl, VT, V1, V1, DAG); + + unsigned TargetMask = getShuffleSHUFImmediate(SVOp); - unsigned TargetMask = X86::getShuffleSHUFImmediate(SVOp); + if (HasAVX && (VT == MVT::v4f32 || VT == MVT::v2f64)) + return getTargetShuffleNode(X86ISD::VPERMILP, dl, VT, V1, TargetMask, DAG); - if (HasXMMInt && (VT == MVT::v4f32 || VT == MVT::v4i32)) + if (HasSSE2 && (VT == MVT::v4f32 || VT == MVT::v4i32)) return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1, TargetMask, DAG); - return getTargetShuffleNode(getSHUFPOpcode(VT), dl, VT, V1, V1, + return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V1, TargetMask, DAG); } @@ -6627,8 +6457,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { bool isLeft = false; unsigned ShAmt = 0; SDValue ShVal; - bool isShift = getSubtarget()->hasXMMInt() && - isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt); + bool isShift = HasSSE2 && isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt); if (isShift && ShVal.hasOneUse()) { // If the shifted value has multiple uses, it may be cheaper to use // v_set0 + movlhps or movhlps, etc. 
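[Editor's aside, not part of the patch: the hunk above repeatedly feeds getShuffleSHUFImmediate(SVOp) into PSHUFD/SHUFP/VPERMILP nodes. That immediate is just a 4-lane mask packed two bits per destination lane. A minimal sketch of that encoding, assuming a plain 4-element mask; encodeShufImm is an illustrative name, not an LLVM helper, and the real routine also handles the wider 256-bit cases.]

    // Illustrative sketch only (encodeShufImm is not an LLVM API): pack a
    // 4-element shuffle mask into the 8-bit PSHUFD/SHUFP immediate, two bits
    // per destination lane, lane 0 in the low bits; undef lanes (-1) map to 0.
    static unsigned encodeShufImm(const int Mask[4]) {
      unsigned Imm = 0;
      for (int i = 3; i >= 0; --i) {
        Imm <<= 2;
        Imm |= (Mask[i] < 0 ? 0 : Mask[i]) & 0x3;
      }
      return Imm; // e.g. the identity mask {0,1,2,3} encodes as 0xE4
    }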
@@ -6637,13 +6466,11 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl); } - if (X86::isMOVLMask(SVOp)) { - if (V1IsUndef) - return V2; + if (isMOVLMask(M, VT)) { if (ISD::isBuildVectorAllZeros(V1.getNode())) return getVZextMovL(VT, VT, V2, DAG, Subtarget, dl); - if (!X86::isMOVLPMask(SVOp)) { - if (HasXMMInt && (VT == MVT::v2i64 || VT == MVT::v2f64)) + if (!isMOVLPMask(M, VT)) { + if (HasSSE2 && (VT == MVT::v2i64 || VT == MVT::v2f64)) return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG); if (VT == MVT::v4i32 || VT == MVT::v4f32) @@ -6652,27 +6479,27 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { } // FIXME: fold these into legal mask. - if (X86::isMOVLHPSMask(SVOp) && !X86::isUNPCKLMask(SVOp)) - return getMOVLowToHigh(Op, dl, DAG, HasXMMInt); + if (isMOVLHPSMask(M, VT) && !isUNPCKLMask(M, VT, HasAVX2)) + return getMOVLowToHigh(Op, dl, DAG, HasSSE2); - if (X86::isMOVHLPSMask(SVOp)) + if (isMOVHLPSMask(M, VT)) return getMOVHighToLow(Op, dl, DAG); - if (X86::isMOVSHDUPMask(SVOp, Subtarget)) + if (V2IsUndef && isMOVSHDUPMask(M, VT, Subtarget)) return getTargetShuffleNode(X86ISD::MOVSHDUP, dl, VT, V1, DAG); - if (X86::isMOVSLDUPMask(SVOp, Subtarget)) + if (V2IsUndef && isMOVSLDUPMask(M, VT, Subtarget)) return getTargetShuffleNode(X86ISD::MOVSLDUP, dl, VT, V1, DAG); - if (X86::isMOVLPMask(SVOp)) - return getMOVLP(Op, dl, DAG, HasXMMInt); + if (isMOVLPMask(M, VT)) + return getMOVLP(Op, dl, DAG, HasSSE2); - if (ShouldXformToMOVHLPS(SVOp) || - ShouldXformToMOVLP(V1.getNode(), V2.getNode(), SVOp)) + if (ShouldXformToMOVHLPS(M, VT) || + ShouldXformToMOVLP(V1.getNode(), V2.getNode(), M, VT)) return CommuteVectorShuffle(SVOp, DAG); if (isShift) { - // No better options. Use a vshl / vsrl. + // No better options. Use a vshldq / vsrldq. EVT EltVT = VT.getVectorElementType(); ShAmt *= EltVT.getSizeInBits(); return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl); @@ -6685,17 +6512,14 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { V2IsSplat = isSplatVector(V2.getNode()); // Canonicalize the splat or undef, if present, to be on the RHS. - if ((V1IsSplat || V1IsUndef) && !(V2IsSplat || V2IsUndef)) { - Op = CommuteVectorShuffle(SVOp, DAG); - SVOp = cast<ShuffleVectorSDNode>(Op); - V1 = SVOp->getOperand(0); - V2 = SVOp->getOperand(1); + if (!V2IsUndef && V1IsSplat && !V2IsSplat) { + CommuteVectorShuffleMask(M, NumElems); + std::swap(V1, V2); std::swap(V1IsSplat, V2IsSplat); - std::swap(V1IsUndef, V2IsUndef); Commuted = true; } - if (isCommutedMOVL(SVOp, V2IsSplat, V2IsUndef)) { + if (isCommutedMOVLMask(M, VT, V2IsSplat, V2IsUndef)) { // Shuffling low element of v1 into undef, just return v1. if (V2IsUndef) return V1; @@ -6705,81 +6529,77 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { return getMOVL(DAG, dl, VT, V2, V1); } - if (X86::isUNPCKLMask(SVOp)) - return getTargetShuffleNode(getUNPCKLOpcode(VT), dl, VT, V1, V2, DAG); + if (isUNPCKLMask(M, VT, HasAVX2)) + return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG); - if (X86::isUNPCKHMask(SVOp)) - return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V2, DAG); + if (isUNPCKHMask(M, VT, HasAVX2)) + return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG); if (V2IsSplat) { // Normalize mask so all entries that point to V2 points to its first // element then try to match unpck{h|l} again. 
If match, return a - // new vector_shuffle with the corrected mask. - SDValue NewMask = NormalizeMask(SVOp, DAG); - ShuffleVectorSDNode *NSVOp = cast<ShuffleVectorSDNode>(NewMask); - if (NSVOp != SVOp) { - if (X86::isUNPCKLMask(NSVOp, true)) { - return NewMask; - } else if (X86::isUNPCKHMask(NSVOp, true)) { - return NewMask; - } + // new vector_shuffle with the corrected mask.p + SmallVector<int, 8> NewMask(M.begin(), M.end()); + NormalizeMask(NewMask, NumElems); + if (isUNPCKLMask(NewMask, VT, HasAVX2, true)) { + return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG); + } else if (isUNPCKHMask(NewMask, VT, HasAVX2, true)) { + return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG); } } if (Commuted) { // Commute is back and try unpck* again. // FIXME: this seems wrong. - SDValue NewOp = CommuteVectorShuffle(SVOp, DAG); - ShuffleVectorSDNode *NewSVOp = cast<ShuffleVectorSDNode>(NewOp); + CommuteVectorShuffleMask(M, NumElems); + std::swap(V1, V2); + std::swap(V1IsSplat, V2IsSplat); + Commuted = false; - if (X86::isUNPCKLMask(NewSVOp)) - return getTargetShuffleNode(getUNPCKLOpcode(VT), dl, VT, V2, V1, DAG); + if (isUNPCKLMask(M, VT, HasAVX2)) + return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG); - if (X86::isUNPCKHMask(NewSVOp)) - return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V2, V1, DAG); + if (isUNPCKHMask(M, VT, HasAVX2)) + return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG); } // Normalize the node to match x86 shuffle ops if needed - if (V2.getOpcode() != ISD::UNDEF && isCommutedSHUFP(SVOp)) + if (!V2IsUndef && (isSHUFPMask(M, VT, HasAVX, /* Commuted */ true))) return CommuteVectorShuffle(SVOp, DAG); // The checks below are all present in isShuffleMaskLegal, but they are // inlined here right now to enable us to directly emit target specific // nodes, and remove one by one until they don't return Op anymore. 
- SmallVector<int, 16> M; - SVOp->getMask(M); - if (isPALIGNRMask(M, VT, Subtarget->hasSSSE3() || Subtarget->hasAVX())) + if (isPALIGNRMask(M, VT, Subtarget)) return getTargetShuffleNode(X86ISD::PALIGN, dl, VT, V1, V2, - X86::getShufflePALIGNRImmediate(SVOp), + getShufflePALIGNRImmediate(SVOp), DAG); if (ShuffleVectorSDNode::isSplatMask(&M[0], VT) && SVOp->getSplatIndex() == 0 && V2IsUndef) { - if (VT == MVT::v2f64) - return getTargetShuffleNode(X86ISD::UNPCKLPD, dl, VT, V1, V1, DAG); - if (VT == MVT::v2i64) - return getTargetShuffleNode(X86ISD::PUNPCKLQDQ, dl, VT, V1, V1, DAG); + if (VT == MVT::v2f64 || VT == MVT::v2i64) + return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG); } if (isPSHUFHWMask(M, VT)) return getTargetShuffleNode(X86ISD::PSHUFHW, dl, VT, V1, - X86::getShufflePSHUFHWImmediate(SVOp), + getShufflePSHUFHWImmediate(SVOp), DAG); if (isPSHUFLWMask(M, VT)) return getTargetShuffleNode(X86ISD::PSHUFLW, dl, VT, V1, - X86::getShufflePSHUFLWImmediate(SVOp), + getShufflePSHUFLWImmediate(SVOp), DAG); - if (isSHUFPMask(M, VT)) - return getTargetShuffleNode(getSHUFPOpcode(VT), dl, VT, V1, V2, - X86::getShuffleSHUFImmediate(SVOp), DAG); + if (isSHUFPMask(M, VT, HasAVX)) + return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V2, + getShuffleSHUFImmediate(SVOp), DAG); - if (X86::isUNPCKL_v_undef_Mask(SVOp)) - return getTargetShuffleNode(getUNPCKLOpcode(VT), dl, VT, V1, V1, DAG); - if (X86::isUNPCKH_v_undef_Mask(SVOp)) - return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V1, DAG); + if (isUNPCKL_v_undef_Mask(M, VT, HasAVX2)) + return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG); + if (isUNPCKH_v_undef_Mask(M, VT, HasAVX2)) + return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG); //===--------------------------------------------------------------------===// // Generate target specific nodes for 128 or 256-bit shuffles only @@ -6787,33 +6607,26 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { // // Handle VMOVDDUPY permutations - if (isMOVDDUPYMask(SVOp, Subtarget)) + if (V2IsUndef && isMOVDDUPYMask(M, VT, HasAVX)) return getTargetShuffleNode(X86ISD::MOVDDUP, dl, VT, V1, DAG); - // Handle VPERMILPS* permutations - if (isVPERMILPSMask(M, VT, Subtarget)) - return getTargetShuffleNode(getVPERMILOpcode(VT), dl, VT, V1, - getShuffleVPERMILPSImmediate(SVOp), DAG); - - // Handle VPERMILPD* permutations - if (isVPERMILPDMask(M, VT, Subtarget)) - return getTargetShuffleNode(getVPERMILOpcode(VT), dl, VT, V1, - getShuffleVPERMILPDImmediate(SVOp), DAG); - - // Handle VPERM2F128 permutations - if (isVPERM2F128Mask(M, VT, Subtarget)) - return getTargetShuffleNode(X86ISD::VPERM2F128, dl, VT, V1, V2, - getShuffleVPERM2F128Immediate(SVOp), DAG); + // Handle VPERMILPS/D* permutations + if (isVPERMILPMask(M, VT, HasAVX)) { + if (HasAVX2 && VT == MVT::v8i32) + return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1, + getShuffleSHUFImmediate(SVOp), DAG); + return getTargetShuffleNode(X86ISD::VPERMILP, dl, VT, V1, + getShuffleSHUFImmediate(SVOp), DAG); + } - // Handle VSHUFPSY permutations - if (isVSHUFPSYMask(M, VT, Subtarget)) - return getTargetShuffleNode(getSHUFPOpcode(VT), dl, VT, V1, V2, - getShuffleVSHUFPSYImmediate(SVOp), DAG); + // Handle VPERM2F128/VPERM2I128 permutations + if (isVPERM2X128Mask(M, VT, HasAVX)) + return getTargetShuffleNode(X86ISD::VPERM2X128, dl, VT, V1, + V2, getShuffleVPERM2X128Immediate(SVOp), DAG); - // Handle VSHUFPDY permutations - if (isVSHUFPDYMask(M, VT, Subtarget)) - return 
getTargetShuffleNode(getSHUFPOpcode(VT), dl, VT, V1, V2, - getShuffleVSHUFPDYImmediate(SVOp), DAG); + SDValue BlendOp = LowerVECTOR_SHUFFLEtoBlend(Op, Subtarget, DAG); + if (BlendOp.getNode()) + return BlendOp; //===--------------------------------------------------------------------===// // Since no target specific shuffle was selected for this generic one, @@ -6896,8 +6709,8 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, Op.getOperand(0)), Op.getOperand(1)); return DAG.getNode(ISD::BITCAST, dl, MVT::f32, Extract); - } else if (VT == MVT::i32) { - // ExtractPS works with constant index. + } else if (VT == MVT::i32 || VT == MVT::i64) { + // ExtractPS/pextrq works with constant index. if (isa<ConstantSDNode>(Op.getOperand(1))) return Op; } @@ -6933,7 +6746,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, assert(Vec.getValueSizeInBits() <= 128 && "Unexpected vector length"); - if (Subtarget->hasSSE41() || Subtarget->hasAVX()) { + if (Subtarget->hasSSE41()) { SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG); if (Res.getNode()) return Res; @@ -7036,7 +6849,8 @@ X86TargetLowering::LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, // Create this as a scalar to vector.. N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1); return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2); - } else if (EltVT == MVT::i32 && isa<ConstantSDNode>(N2)) { + } else if ((EltVT == MVT::i32 || EltVT == MVT::i64) && + isa<ConstantSDNode>(N2)) { // PINSR* works with constant index. return Op; } @@ -7074,7 +6888,7 @@ X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { return Insert128BitVector(N0, V, Ins128Idx, DAG, dl); } - if (Subtarget->hasSSE41() || Subtarget->hasAVX()) + if (Subtarget->hasSSE41()) return LowerINSERT_VECTOR_ELT_SSE4(Op, DAG); if (EltVT == MVT::i8) @@ -7276,7 +7090,7 @@ X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const { // load. if (isGlobalStubReference(OpFlag)) Result = DAG.getLoad(getPointerTy(), DL, DAG.getEntryNode(), Result, - MachinePointerInfo::getGOT(), false, false, 0); + MachinePointerInfo::getGOT(), false, false, false, 0); return Result; } @@ -7344,7 +7158,7 @@ X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, DebugLoc dl, // load. if (isGlobalStubReference(OpFlags)) Result = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Result, - MachinePointerInfo::getGOT(), false, false, 0); + MachinePointerInfo::getGOT(), false, false, false, 0); // If there was a non-zero offset that we didn't fold, create an explicit // addition for it. @@ -7423,7 +7237,8 @@ static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, SDValue ThreadPointer = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0), - MachinePointerInfo(Ptr), false, false, 0); + MachinePointerInfo(Ptr), + false, false, false, 0); unsigned char OperandFlags = 0; // Most TLS accesses are not RIP relative, even on x86-64. One exception is @@ -7449,7 +7264,7 @@ static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, if (model == TLSModel::InitialExec) Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset, - MachinePointerInfo::getGOT(), false, false, 0); + MachinePointerInfo::getGOT(), false, false, false, 0); // The address of the thread local variable is the add of the thread // pointer with the offset of the variable. 
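[Editor's aside, not part of the patch: the LowerToTLSExecModel hunk above ends by adding the loaded thread pointer to the variable's offset. A minimal sketch of what the emitted initial-exec sequence computes, assuming x86-64 Linux conventions; tls_address is a hypothetical stand-in, not code from this patch.]

    // Hypothetical model of the initial-exec address computation; roughly:
    //   movq %fs:0, %rax                 // thread pointer
    //   movq var@GOTTPOFF(%rip), %rcx    // TP-relative offset, via the GOT
    //   addq %rcx, %rax                  // address of the TLS variable
    #include <cstdint>
    static std::uintptr_t tls_address(std::uintptr_t ThreadPointer,
                                      std::intptr_t TPOffset) {
      return ThreadPointer + static_cast<std::uintptr_t>(TPOffset);
    }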
@@ -7471,8 +7286,7 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV)) GV = GA->resolveAliasedGlobal(false); - TLSModel::Model model - = getTLSModel(GV, getTargetMachine().getRelocationModel()); + TLSModel::Model model = getTargetMachine().getTLSModel(GV); switch (model) { case TLSModel::GeneralDynamic: @@ -7529,19 +7343,77 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { unsigned Reg = Subtarget->is64Bit() ? X86::RAX : X86::EAX; return DAG.getCopyFromReg(Chain, DL, Reg, getPointerTy(), Chain.getValue(1)); - } + } else if (Subtarget->isTargetWindows()) { + // Just use the implicit TLS architecture + // Need to generate someting similar to: + // mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage + // ; from TEB + // mov ecx, dword [rel _tls_index]: Load index (from C runtime) + // mov rcx, qword [rdx+rcx*8] + // mov eax, .tls$:tlsvar + // [rax+rcx] contains the address + // Windows 64bit: gs:0x58 + // Windows 32bit: fs:__tls_array - assert(false && - "TLS not implemented for this target."); + // If GV is an alias then use the aliasee for determining + // thread-localness. + if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV)) + GV = GA->resolveAliasedGlobal(false); + DebugLoc dl = GA->getDebugLoc(); + SDValue Chain = DAG.getEntryNode(); - llvm_unreachable("Unreachable"); - return SDValue(); + // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or + // %gs:0x58 (64-bit). + Value *Ptr = Constant::getNullValue(Subtarget->is64Bit() + ? Type::getInt8PtrTy(*DAG.getContext(), + 256) + : Type::getInt32PtrTy(*DAG.getContext(), + 257)); + + SDValue ThreadPointer = DAG.getLoad(getPointerTy(), dl, Chain, + Subtarget->is64Bit() + ? DAG.getIntPtrConstant(0x58) + : DAG.getExternalSymbol("_tls_array", + getPointerTy()), + MachinePointerInfo(Ptr), + false, false, false, 0); + + // Load the _tls_index variable + SDValue IDX = DAG.getExternalSymbol("_tls_index", getPointerTy()); + if (Subtarget->is64Bit()) + IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, getPointerTy(), Chain, + IDX, MachinePointerInfo(), MVT::i32, + false, false, 0); + else + IDX = DAG.getLoad(getPointerTy(), dl, Chain, IDX, MachinePointerInfo(), + false, false, false, 0); + + SDValue Scale = DAG.getConstant(Log2_64_Ceil(TD->getPointerSize()), + getPointerTy()); + IDX = DAG.getNode(ISD::SHL, dl, getPointerTy(), IDX, Scale); + + SDValue res = DAG.getNode(ISD::ADD, dl, getPointerTy(), ThreadPointer, IDX); + res = DAG.getLoad(getPointerTy(), dl, Chain, res, MachinePointerInfo(), + false, false, false, 0); + + // Get the offset of start of .tls section + SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, + GA->getValueType(0), + GA->getOffset(), X86II::MO_SECREL); + SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), TGA); + + // The address of the thread local variable is the add of the thread + // pointer with the offset of the variable. + return DAG.getNode(ISD::ADD, dl, getPointerTy(), res, Offset); + } + + llvm_unreachable("TLS not implemented for this target."); } -/// LowerShiftParts - Lower SRA_PARTS and friends, which return two i32 values and -/// take a 2 x i32 value to shift plus a shift amount. -SDValue X86TargetLowering::LowerShiftParts(SDValue Op, SelectionDAG &DAG) const { +/// LowerShiftParts - Lower SRA_PARTS and friends, which return two i32 values +/// and take a 2 x i32 value to shift plus a shift amount. 
+SDValue X86TargetLowering::LowerShiftParts(SDValue Op, SelectionDAG &DAG) const{ assert(Op.getNumOperands() == 3 && "Not a double-shift!"); EVT VT = Op.getValueType(); unsigned VTBits = VT.getSizeInBits(); @@ -7673,7 +7545,7 @@ SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, Op.getValueType(), MMO); Result = DAG.getLoad(Op.getValueType(), DL, Chain, StackSlot, MachinePointerInfo::getFixedStack(SSFI), - false, false, 0); + false, false, false, 0); } return Result; @@ -7682,85 +7554,65 @@ SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, // LowerUINT_TO_FP_i64 - 64-bit unsigned integer to double expansion. SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG) const { - // This algorithm is not obvious. Here it is in C code, more or less: + // This algorithm is not obvious. Here it is what we're trying to output: /* - double uint64_to_double( uint32_t hi, uint32_t lo ) { - static const __m128i exp = { 0x4330000045300000ULL, 0 }; - static const __m128d bias = { 0x1.0p84, 0x1.0p52 }; - - // Copy ints to xmm registers. - __m128i xh = _mm_cvtsi32_si128( hi ); - __m128i xl = _mm_cvtsi32_si128( lo ); - - // Combine into low half of a single xmm register. - __m128i x = _mm_unpacklo_epi32( xh, xl ); - __m128d d; - double sd; - - // Merge in appropriate exponents to give the integer bits the right - // magnitude. - x = _mm_unpacklo_epi32( x, exp ); - - // Subtract away the biases to deal with the IEEE-754 double precision - // implicit 1. - d = _mm_sub_pd( (__m128d) x, bias ); - - // All conversions up to here are exact. The correctly rounded result is - // calculated using the current rounding mode using the following - // horizontal add. - d = _mm_add_sd( d, _mm_unpackhi_pd( d, d ) ); - _mm_store_sd( &sd, d ); // Because we are returning doubles in XMM, this - // store doesn't really need to be here (except - // maybe to zero the other double) - return sd; - } + movq %rax, %xmm0 + punpckldq (c0), %xmm0 // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U } + subpd (c1), %xmm0 // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 } + #ifdef __SSE3__ + haddpd %xmm0, %xmm0 + #else + pshufd $0x4e, %xmm0, %xmm1 + addpd %xmm1, %xmm0 + #endif */ DebugLoc dl = Op.getDebugLoc(); LLVMContext *Context = DAG.getContext(); // Build some magic constants. 
- std::vector<Constant*> CV0; - CV0.push_back(ConstantInt::get(*Context, APInt(32, 0x45300000))); - CV0.push_back(ConstantInt::get(*Context, APInt(32, 0x43300000))); - CV0.push_back(ConstantInt::get(*Context, APInt(32, 0))); - CV0.push_back(ConstantInt::get(*Context, APInt(32, 0))); - Constant *C0 = ConstantVector::get(CV0); + const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 }; + Constant *C0 = ConstantDataVector::get(*Context, CV0); SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 16); - std::vector<Constant*> CV1; + SmallVector<Constant*,2> CV1; CV1.push_back( - ConstantFP::get(*Context, APFloat(APInt(64, 0x4530000000000000ULL)))); + ConstantFP::get(*Context, APFloat(APInt(64, 0x4330000000000000ULL)))); CV1.push_back( - ConstantFP::get(*Context, APFloat(APInt(64, 0x4330000000000000ULL)))); + ConstantFP::get(*Context, APFloat(APInt(64, 0x4530000000000000ULL)))); Constant *C1 = ConstantVector::get(CV1); SDValue CPIdx1 = DAG.getConstantPool(C1, getPointerTy(), 16); - SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, - DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, - Op.getOperand(0), - DAG.getIntPtrConstant(1))); - SDValue XR2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, - DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, - Op.getOperand(0), - DAG.getIntPtrConstant(0))); - SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32, XR1, XR2); + // Load the 64-bit value into an XMM register. + SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, + Op.getOperand(0)); SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0, MachinePointerInfo::getConstantPool(), - false, false, 16); - SDValue Unpck2 = getUnpackl(DAG, dl, MVT::v4i32, Unpck1, CLod0); - SDValue XR2F = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Unpck2); + false, false, false, 16); + SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32, + DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, XR1), + CLod0); + SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1, MachinePointerInfo::getConstantPool(), - false, false, 16); + false, false, false, 16); + SDValue XR2F = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Unpck1); SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1); + SDValue Result; + + if (Subtarget->hasSSE3()) { + // FIXME: The 'haddpd' instruction may be slower than 'movhlps + addsd'. + Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub); + } else { + SDValue S2F = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Sub); + SDValue Shuffle = getTargetShuffleNode(X86ISD::PSHUFD, dl, MVT::v4i32, + S2F, 0x4E, DAG); + Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, + DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Shuffle), + Sub); + } - // Add the halves; easiest way is to swap them into another reg first. - int ShufMask[2] = { 1, -1 }; - SDValue Shuf = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, - DAG.getUNDEF(MVT::v2f64), ShufMask); - SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuf, Sub); - return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Add, + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result, DAG.getIntPtrConstant(0)); } @@ -7777,8 +7629,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op, Op.getOperand(0)); // Zero out the upper parts of the register. 
- Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget->hasXMMInt(), - DAG); + Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG); Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Load), @@ -7830,6 +7681,9 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, return LowerUINT_TO_FP_i64(Op, DAG); else if (SrcVT == MVT::i32 && X86ScalarSSEf64) return LowerUINT_TO_FP_i32(Op, DAG); + else if (Subtarget->is64Bit() && + SrcVT == MVT::i64 && DstVT == MVT::f32) + return SDValue(); // Make a 64-bit buffer, and use it to build an FILD. SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64); @@ -7849,7 +7703,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP"); SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), - StackSlot, MachinePointerInfo(), + StackSlot, MachinePointerInfo(), false, false, 0); // For i64 source, we need to add the appropriate power of 2 if the input // was negative. This is the same as the optimization in @@ -7897,19 +7751,19 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, } std::pair<SDValue,SDValue> X86TargetLowering:: -FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned) const { +FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned, bool IsReplace) const { DebugLoc DL = Op.getDebugLoc(); EVT DstTy = Op.getValueType(); - if (!IsSigned) { + if (!IsSigned && !isIntegerTypeFTOL(DstTy)) { assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT"); DstTy = MVT::i64; } assert(DstTy.getSimpleVT() <= MVT::i64 && DstTy.getSimpleVT() >= MVT::i16 && - "Unknown FP_TO_SINT to lower!"); + "Unknown FP_TO_INT to lower!"); // These are really Legal. if (DstTy == MVT::i32 && @@ -7920,26 +7774,29 @@ FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned) const { isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) return std::make_pair(SDValue(), SDValue()); - // We lower FP->sint64 into FISTP64, followed by a load, all to a temporary - // stack slot. + // We lower FP->int64 either into FISTP64 followed by a load from a temporary + // stack slot, or into the FTOL runtime function. MachineFunction &MF = DAG.getMachineFunction(); unsigned MemSize = DstTy.getSizeInBits()/8; int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false); SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); - - unsigned Opc; - switch (DstTy.getSimpleVT().SimpleTy) { - default: llvm_unreachable("Invalid FP_TO_SINT to lower!"); - case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break; - case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break; - case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break; - } + if (!IsSigned && isIntegerTypeFTOL(DstTy)) + Opc = X86ISD::WIN_FTOL; + else + switch (DstTy.getSimpleVT().SimpleTy) { + default: llvm_unreachable("Invalid FP_TO_SINT to lower!"); + case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break; + case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break; + case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break; + } SDValue Chain = DAG.getEntryNode(); SDValue Value = Op.getOperand(0); EVT TheVT = Op.getOperand(0).getValueType(); + // FIXME This causes a redundant load/store if the SSE-class value is already + // in memory, such as if it is on the callstack. 
if (isScalarFPTypeInSSEReg(TheVT)) { assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!"); Chain = DAG.getStore(Chain, DL, Value, StackSlot, @@ -7964,12 +7821,26 @@ FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned) const { MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), MachineMemOperand::MOStore, MemSize, MemSize); - // Build the FP_TO_INT*_IN_MEM - SDValue Ops[] = { Chain, Value, StackSlot }; - SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other), - Ops, 3, DstTy, MMO); - - return std::make_pair(FIST, StackSlot); + if (Opc != X86ISD::WIN_FTOL) { + // Build the FP_TO_INT*_IN_MEM + SDValue Ops[] = { Chain, Value, StackSlot }; + SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other), + Ops, 3, DstTy, MMO); + return std::make_pair(FIST, StackSlot); + } else { + SDValue ftol = DAG.getNode(X86ISD::WIN_FTOL, DL, + DAG.getVTList(MVT::Other, MVT::Glue), + Chain, Value); + SDValue eax = DAG.getCopyFromReg(ftol, DL, X86::EAX, + MVT::i32, ftol.getValue(1)); + SDValue edx = DAG.getCopyFromReg(eax.getValue(1), DL, X86::EDX, + MVT::i32, eax.getValue(2)); + SDValue Ops[] = { eax, edx }; + SDValue pair = IsReplace + ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops, 2) + : DAG.getMergeValues(Ops, 2, DL); + return std::make_pair(pair, SDValue()); + } } SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op, @@ -7977,25 +7848,37 @@ SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op, if (Op.getValueType().isVector()) return SDValue(); - std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, true); + std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, + /*IsSigned=*/ true, /*IsReplace=*/ false); SDValue FIST = Vals.first, StackSlot = Vals.second; // If FP_TO_INTHelper failed, the node is actually supposed to be Legal. if (FIST.getNode() == 0) return Op; - // Load the result. - return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(), - FIST, StackSlot, MachinePointerInfo(), false, false, 0); + if (StackSlot.getNode()) + // Load the result. + return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(), + FIST, StackSlot, MachinePointerInfo(), + false, false, false, 0); + else + // The node is the result. + return FIST; } SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) const { - std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, false); + std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, + /*IsSigned=*/ false, /*IsReplace=*/ false); SDValue FIST = Vals.first, StackSlot = Vals.second; assert(FIST.getNode() && "Unexpected failure"); - // Load the result. - return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(), - FIST, StackSlot, MachinePointerInfo(), false, false, 0); + if (StackSlot.getNode()) + // Load the result. + return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(), + FIST, StackSlot, MachinePointerInfo(), + false, false, false, 0); + else + // The node is the result. 
+ return FIST; } SDValue X86TargetLowering::LowerFABS(SDValue Op, @@ -8006,23 +7889,18 @@ SDValue X86TargetLowering::LowerFABS(SDValue Op, EVT EltVT = VT; if (VT.isVector()) EltVT = VT.getVectorElementType(); - std::vector<Constant*> CV; + Constant *C; if (EltVT == MVT::f64) { - Constant *C = ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63)))); - CV.push_back(C); - CV.push_back(C); + C = ConstantVector::getSplat(2, + ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63))))); } else { - Constant *C = ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31)))); - CV.push_back(C); - CV.push_back(C); - CV.push_back(C); - CV.push_back(C); + C = ConstantVector::getSplat(4, + ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31))))); } - Constant *C = ConstantVector::get(CV); SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, MachinePointerInfo::getConstantPool(), - false, false, 16); + false, false, false, 16); return DAG.getNode(X86ISD::FAND, dl, VT, Op.getOperand(0), Mask); } @@ -8031,31 +7909,28 @@ SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) const { DebugLoc dl = Op.getDebugLoc(); EVT VT = Op.getValueType(); EVT EltVT = VT; - if (VT.isVector()) + unsigned NumElts = VT == MVT::f64 ? 2 : 4; + if (VT.isVector()) { EltVT = VT.getVectorElementType(); - std::vector<Constant*> CV; - if (EltVT == MVT::f64) { - Constant *C = ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63))); - CV.push_back(C); - CV.push_back(C); - } else { - Constant *C = ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31))); - CV.push_back(C); - CV.push_back(C); - CV.push_back(C); - CV.push_back(C); + NumElts = VT.getVectorNumElements(); } - Constant *C = ConstantVector::get(CV); + Constant *C; + if (EltVT == MVT::f64) + C = ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63))); + else + C = ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31))); + C = ConstantVector::getSplat(NumElts, C); SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, MachinePointerInfo::getConstantPool(), - false, false, 16); + false, false, false, 16); if (VT.isVector()) { + MVT XORVT = VT.getSizeInBits() == 128 ? MVT::v2i64 : MVT::v4i64; return DAG.getNode(ISD::BITCAST, dl, VT, - DAG.getNode(ISD::XOR, dl, MVT::v2i64, - DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, + DAG.getNode(ISD::XOR, dl, XORVT, + DAG.getNode(ISD::BITCAST, dl, XORVT, Op.getOperand(0)), - DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Mask))); + DAG.getNode(ISD::BITCAST, dl, XORVT, Mask))); } else { return DAG.getNode(X86ISD::FXOR, dl, VT, Op.getOperand(0), Mask); } @@ -8084,7 +7959,7 @@ SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { // type, and that won't be f80 since that is not custom lowered. // First get the sign bit of second operand. 
- std::vector<Constant*> CV; + SmallVector<Constant*,4> CV; if (SrcVT == MVT::f64) { CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63)))); CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 0)))); @@ -8098,7 +7973,7 @@ SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx, MachinePointerInfo::getConstantPool(), - false, false, 16); + false, false, false, 16); SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, SrcVT, Op1, Mask1); // Shift sign bit right or left if the two operands have different types. @@ -8127,7 +8002,7 @@ SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); SDValue Mask2 = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, MachinePointerInfo::getConstantPool(), - false, false, 16); + false, false, false, 16); SDValue Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Mask2); // Or the value with the sign bit. @@ -8191,8 +8066,10 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, // climbing the DAG back to the root, and it doesn't seem to be worth the // effort. for (SDNode::use_iterator UI = Op.getNode()->use_begin(), - UE = Op.getNode()->use_end(); UI != UE; ++UI) - if (UI->getOpcode() != ISD::CopyToReg && UI->getOpcode() != ISD::SETCC) + UE = Op.getNode()->use_end(); UI != UE; ++UI) + if (UI->getOpcode() != ISD::CopyToReg && + UI->getOpcode() != ISD::SETCC && + UI->getOpcode() != ISD::STORE) goto default_case; if (ConstantSDNode *C = @@ -8325,8 +8202,8 @@ SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC, unsigned BitWidth = Op0.getValueSizeInBits(); unsigned AndBitWidth = And.getValueSizeInBits(); if (BitWidth > AndBitWidth) { - APInt Mask = APInt::getAllOnesValue(BitWidth), Zeros, Ones; - DAG.ComputeMaskedBits(Op0, Mask, Zeros, Ones); + APInt Zeros, Ones; + DAG.ComputeMaskedBits(Op0, Zeros, Ones); if (Zeros.countLeadingOnes() < BitWidth - AndBitWidth) return SDValue(); } @@ -8335,11 +8212,19 @@ SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC, } } else if (Op1.getOpcode() == ISD::Constant) { ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1); + uint64_t AndRHSVal = AndRHS->getZExtValue(); SDValue AndLHS = Op0; - if (AndRHS->getZExtValue() == 1 && AndLHS.getOpcode() == ISD::SRL) { + + if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) { LHS = AndLHS.getOperand(0); RHS = AndLHS.getOperand(1); } + + // Use BT if the immediate can't be encoded in a TEST instruction. + if (!isUInt<32>(AndRHSVal) && isPowerOf2_64(AndRHSVal)) { + LHS = AndLHS; + RHS = DAG.getConstant(Log2_64_Ceil(AndRHSVal), LHS.getValueType()); + } } if (LHS.getNode()) { @@ -8466,9 +8351,8 @@ SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const { if (isFP) { unsigned SSECC = 8; EVT EltVT = Op0.getValueType().getVectorElementType(); - assert(EltVT == MVT::f32 || EltVT == MVT::f64); + assert(EltVT == MVT::f32 || EltVT == MVT::f64); (void)EltVT; - unsigned Opc = EltVT == MVT::f32 ? 
X86ISD::CMPPS : X86ISD::CMPPD; bool Swap = false; // SSE Condition code mapping: @@ -8508,61 +8392,57 @@ SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const { if (SSECC == 8) { if (SetCCOpcode == ISD::SETUEQ) { SDValue UNORD, EQ; - UNORD = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(3, MVT::i8)); - EQ = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(0, MVT::i8)); + UNORD = DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1, + DAG.getConstant(3, MVT::i8)); + EQ = DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1, + DAG.getConstant(0, MVT::i8)); return DAG.getNode(ISD::OR, dl, VT, UNORD, EQ); - } - else if (SetCCOpcode == ISD::SETONE) { + } else if (SetCCOpcode == ISD::SETONE) { SDValue ORD, NEQ; - ORD = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(7, MVT::i8)); - NEQ = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(4, MVT::i8)); + ORD = DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1, + DAG.getConstant(7, MVT::i8)); + NEQ = DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1, + DAG.getConstant(4, MVT::i8)); return DAG.getNode(ISD::AND, dl, VT, ORD, NEQ); } llvm_unreachable("Illegal FP comparison"); } // Handle all other FP comparisons here. - return DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(SSECC, MVT::i8)); + return DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1, + DAG.getConstant(SSECC, MVT::i8)); } // Break 256-bit integer vector compare into smaller ones. - if (!isFP && VT.getSizeInBits() == 256) + if (VT.getSizeInBits() == 256 && !Subtarget->hasAVX2()) return Lower256IntVSETCC(Op, DAG); // We are handling one of the integer comparisons here. Since SSE only has // GT and EQ comparisons for integer, swapping operands and multiple // operations may be required for some comparisons. - unsigned Opc = 0, EQOpc = 0, GTOpc = 0; + unsigned Opc = 0; bool Swap = false, Invert = false, FlipSigns = false; - switch (VT.getSimpleVT().SimpleTy) { - default: break; - case MVT::v16i8: EQOpc = X86ISD::PCMPEQB; GTOpc = X86ISD::PCMPGTB; break; - case MVT::v8i16: EQOpc = X86ISD::PCMPEQW; GTOpc = X86ISD::PCMPGTW; break; - case MVT::v4i32: EQOpc = X86ISD::PCMPEQD; GTOpc = X86ISD::PCMPGTD; break; - case MVT::v2i64: EQOpc = X86ISD::PCMPEQQ; GTOpc = X86ISD::PCMPGTQ; break; - } - switch (SetCCOpcode) { default: break; case ISD::SETNE: Invert = true; - case ISD::SETEQ: Opc = EQOpc; break; + case ISD::SETEQ: Opc = X86ISD::PCMPEQ; break; case ISD::SETLT: Swap = true; - case ISD::SETGT: Opc = GTOpc; break; + case ISD::SETGT: Opc = X86ISD::PCMPGT; break; case ISD::SETGE: Swap = true; - case ISD::SETLE: Opc = GTOpc; Invert = true; break; + case ISD::SETLE: Opc = X86ISD::PCMPGT; Invert = true; break; case ISD::SETULT: Swap = true; - case ISD::SETUGT: Opc = GTOpc; FlipSigns = true; break; + case ISD::SETUGT: Opc = X86ISD::PCMPGT; FlipSigns = true; break; case ISD::SETUGE: Swap = true; - case ISD::SETULE: Opc = GTOpc; FlipSigns = true; Invert = true; break; + case ISD::SETULE: Opc = X86ISD::PCMPGT; FlipSigns = true; Invert = true; break; } if (Swap) std::swap(Op0, Op1); // Check that the operation in question is available (most are plain SSE2, // but PCMPGTQ and PCMPEQQ have different requirements). 
- if (Opc == X86ISD::PCMPGTQ && !Subtarget->hasSSE42() && !Subtarget->hasAVX()) + if (Opc == X86ISD::PCMPGT && VT == MVT::v2i64 && !Subtarget->hasSSE42()) return SDValue(); - if (Opc == X86ISD::PCMPEQQ && !Subtarget->hasSSE41() && !Subtarget->hasAVX()) + if (Opc == X86ISD::PCMPEQ && VT == MVT::v2i64 && !Subtarget->hasSSE41()) return SDValue(); // Since SSE has no unsigned integer comparisons, we need to flip the sign @@ -8679,8 +8559,9 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { // If condition flag is set by a X86ISD::CMP, then use it as the condition // setting operand in place of the X86ISD::SETCC. - if (Cond.getOpcode() == X86ISD::SETCC || - Cond.getOpcode() == X86ISD::SETCC_CARRY) { + unsigned CondOpcode = Cond.getOpcode(); + if (CondOpcode == X86ISD::SETCC || + CondOpcode == X86ISD::SETCC_CARRY) { CC = Cond.getOperand(0); SDValue Cmp = Cond.getOperand(1); @@ -8697,6 +8578,39 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { Cond = Cmp; addTest = false; } + } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO || + CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO || + ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) && + Cond.getOperand(0).getValueType() != MVT::i8)) { + SDValue LHS = Cond.getOperand(0); + SDValue RHS = Cond.getOperand(1); + unsigned X86Opcode; + unsigned X86Cond; + SDVTList VTs; + switch (CondOpcode) { + case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break; + case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break; + case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break; + case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break; + case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break; + case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break; + default: llvm_unreachable("unexpected overflowing operator"); + } + if (CondOpcode == ISD::UMULO) + VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(), + MVT::i32); + else + VTs = DAG.getVTList(LHS.getValueType(), MVT::i32); + + SDValue X86Op = DAG.getNode(X86Opcode, DL, VTs, LHS, RHS); + + if (CondOpcode == ISD::UMULO) + Cond = X86Op.getValue(2); + else + Cond = X86Op.getValue(1); + + CC = DAG.getConstant(X86Cond, MVT::i8); + addTest = false; } if (addTest) { @@ -8778,11 +8692,27 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { SDValue Dest = Op.getOperand(2); DebugLoc dl = Op.getDebugLoc(); SDValue CC; + bool Inverted = false; if (Cond.getOpcode() == ISD::SETCC) { - SDValue NewCond = LowerSETCC(Cond, DAG); - if (NewCond.getNode()) - Cond = NewCond; + // Check for setcc([su]{add,sub,mul}o == 0). + if (cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ && + isa<ConstantSDNode>(Cond.getOperand(1)) && + cast<ConstantSDNode>(Cond.getOperand(1))->isNullValue() && + Cond.getOperand(0).getResNo() == 1 && + (Cond.getOperand(0).getOpcode() == ISD::SADDO || + Cond.getOperand(0).getOpcode() == ISD::UADDO || + Cond.getOperand(0).getOpcode() == ISD::SSUBO || + Cond.getOperand(0).getOpcode() == ISD::USUBO || + Cond.getOperand(0).getOpcode() == ISD::SMULO || + Cond.getOperand(0).getOpcode() == ISD::UMULO)) { + Inverted = true; + Cond = Cond.getOperand(0); + } else { + SDValue NewCond = LowerSETCC(Cond, DAG); + if (NewCond.getNode()) + Cond = NewCond; + } } #if 0 // FIXME: LowerXALUO doesn't handle these!! 
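[Editor's aside, not part of the patch: the vector-setcc hunk above sets FlipSigns for the unsigned predicates because SSE only provides signed PCMPGT. The trick is that XOR'ing both operands with the sign bit turns an unsigned compare into a signed one. A minimal scalar sketch; ugt_via_signed is an illustrative name, not LLVM code.]

    // Illustrative only: a >u b  ==  (a ^ SIGN) >s (b ^ SIGN), which is what
    // flipping the sign bit before the signed PCMPGT relies on.
    #include <cstdint>
    static bool ugt_via_signed(std::uint32_t a, std::uint32_t b) {
      const std::uint32_t Sign = 0x80000000u;
      return static_cast<std::int32_t>(a ^ Sign) >
             static_cast<std::int32_t>(b ^ Sign);
    }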
@@ -8803,8 +8733,9 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { // If condition flag is set by a X86ISD::CMP, then use it as the condition // setting operand in place of the X86ISD::SETCC. - if (Cond.getOpcode() == X86ISD::SETCC || - Cond.getOpcode() == X86ISD::SETCC_CARRY) { + unsigned CondOpcode = Cond.getOpcode(); + if (CondOpcode == X86ISD::SETCC || + CondOpcode == X86ISD::SETCC_CARRY) { CC = Cond.getOperand(0); SDValue Cmp = Cond.getOperand(1); @@ -8825,6 +8756,43 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { break; } } + } + CondOpcode = Cond.getOpcode(); + if (CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO || + CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO || + ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) && + Cond.getOperand(0).getValueType() != MVT::i8)) { + SDValue LHS = Cond.getOperand(0); + SDValue RHS = Cond.getOperand(1); + unsigned X86Opcode; + unsigned X86Cond; + SDVTList VTs; + switch (CondOpcode) { + case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break; + case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break; + case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break; + case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break; + case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break; + case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break; + default: llvm_unreachable("unexpected overflowing operator"); + } + if (Inverted) + X86Cond = X86::GetOppositeBranchCondition((X86::CondCode)X86Cond); + if (CondOpcode == ISD::UMULO) + VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(), + MVT::i32); + else + VTs = DAG.getVTList(LHS.getValueType(), MVT::i32); + + SDValue X86Op = DAG.getNode(X86Opcode, dl, VTs, LHS, RHS); + + if (CondOpcode == ISD::UMULO) + Cond = X86Op.getValue(2); + else + Cond = X86Op.getValue(1); + + CC = DAG.getConstant(X86Cond, MVT::i8); + addTest = false; } else { unsigned CondOpc; if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) { @@ -8888,6 +8856,66 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { CC = DAG.getConstant(CCode, MVT::i8); Cond = Cond.getOperand(0).getOperand(1); addTest = false; + } else if (Cond.getOpcode() == ISD::SETCC && + cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETOEQ) { + // For FCMP_OEQ, we can emit + // two branches instead of an explicit AND instruction with a + // separate test. However, we only do this if this block doesn't + // have a fall-through edge, because this requires an explicit + // jmp when the condition is false. + if (Op.getNode()->hasOneUse()) { + SDNode *User = *Op.getNode()->use_begin(); + // Look for an unconditional branch following this conditional branch. + // We need this because we need to reverse the successors in order + // to implement FCMP_OEQ. 
+ if (User->getOpcode() == ISD::BR) { + SDValue FalseBB = User->getOperand(1); + SDNode *NewBR = + DAG.UpdateNodeOperands(User, User->getOperand(0), Dest); + assert(NewBR == User); + (void)NewBR; + Dest = FalseBB; + + SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32, + Cond.getOperand(0), Cond.getOperand(1)); + CC = DAG.getConstant(X86::COND_NE, MVT::i8); + Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), + Chain, Dest, CC, Cmp); + CC = DAG.getConstant(X86::COND_P, MVT::i8); + Cond = Cmp; + addTest = false; + } + } + } else if (Cond.getOpcode() == ISD::SETCC && + cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETUNE) { + // For FCMP_UNE, we can emit + // two branches instead of an explicit AND instruction with a + // separate test. However, we only do this if this block doesn't + // have a fall-through edge, because this requires an explicit + // jmp when the condition is false. + if (Op.getNode()->hasOneUse()) { + SDNode *User = *Op.getNode()->use_begin(); + // Look for an unconditional branch following this conditional branch. + // We need this because we need to reverse the successors in order + // to implement FCMP_UNE. + if (User->getOpcode() == ISD::BR) { + SDValue FalseBB = User->getOperand(1); + SDNode *NewBR = + DAG.UpdateNodeOperands(User, User->getOperand(0), Dest); + assert(NewBR == User); + (void)NewBR; + + SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32, + Cond.getOperand(0), Cond.getOperand(1)); + CC = DAG.getConstant(X86::COND_NE, MVT::i8); + Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), + Chain, Dest, CC, Cmp); + CC = DAG.getConstant(X86::COND_NP, MVT::i8); + Cond = Cmp; + addTest = false; + Dest = FalseBB; + } + } } } @@ -8926,7 +8954,7 @@ SDValue X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const { assert((Subtarget->isTargetCygMing() || Subtarget->isTargetWindows() || - EnableSegmentedStacks) && + getTargetMachine().Options.EnableSegmentedStacks) && "This should be used only on Windows targets or when segmented stacks " "are being used"); assert(!Subtarget->isTargetEnvMacho() && "Not implemented"); @@ -8940,7 +8968,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, bool Is64Bit = Subtarget->is64Bit(); EVT SPTy = Is64Bit ? MVT::i64 : MVT::i32; - if (EnableSegmentedStacks) { + if (getTargetMachine().Options.EnableSegmentedStacks) { MachineFunction &MF = DAG.getMachineFunction(); MachineRegisterInfo &MRI = MF.getRegInfo(); @@ -9076,10 +9104,10 @@ SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { if (ArgMode == 2) { // Sanity Check: Make sure using fp_offset makes sense. - assert(!UseSoftFloat && + assert(!getTargetMachine().Options.UseSoftFloat && !(DAG.getMachineFunction() .getFunction()->hasFnAttr(Attribute::NoImplicitFloat)) && - Subtarget->hasXMM()); + Subtarget->hasSSE1()); } // Insert VAARG_64 node into the DAG @@ -9106,7 +9134,7 @@ SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { Chain, VAARG, MachinePointerInfo(), - false, false, 0); + false, false, false, 0); } SDValue X86TargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) const { @@ -9125,6 +9153,43 @@ SDValue X86TargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) const { MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV)); } +// getTargetVShiftNOde - Handle vector element shifts where the shift amount +// may or may not be a constant. Takes immediate version of shift as input. 
+static SDValue getTargetVShiftNode(unsigned Opc, DebugLoc dl, EVT VT, + SDValue SrcOp, SDValue ShAmt, + SelectionDAG &DAG) { + assert(ShAmt.getValueType() == MVT::i32 && "ShAmt is not i32"); + + if (isa<ConstantSDNode>(ShAmt)) { + switch (Opc) { + default: llvm_unreachable("Unknown target vector shift node"); + case X86ISD::VSHLI: + case X86ISD::VSRLI: + case X86ISD::VSRAI: + return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt); + } + } + + // Change opcode to non-immediate version + switch (Opc) { + default: llvm_unreachable("Unknown target vector shift node"); + case X86ISD::VSHLI: Opc = X86ISD::VSHL; break; + case X86ISD::VSRLI: Opc = X86ISD::VSRL; break; + case X86ISD::VSRAI: Opc = X86ISD::VSRA; break; + } + + // Need to build a vector containing shift amount + // Shift amount is 32-bits, but SSE instructions read 64-bit, so fill with 0 + SDValue ShOps[4]; + ShOps[0] = ShAmt; + ShOps[1] = DAG.getConstant(0, MVT::i32); + ShOps[2] = DAG.getUNDEF(MVT::i32); + ShOps[3] = DAG.getUNDEF(MVT::i32); + ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, &ShOps[0], 4); + ShAmt = DAG.getNode(ISD::BITCAST, dl, VT, ShAmt); + return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt); +} + SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const { DebugLoc dl = Op.getDebugLoc(); @@ -9159,7 +9224,7 @@ X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const unsigned Opc = 0; ISD::CondCode CC = ISD::SETCC_INVALID; switch (IntNo) { - default: break; + default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. case Intrinsic::x86_sse_comieq_ss: case Intrinsic::x86_sse2_comieq_sd: Opc = X86ISD::COMI; @@ -9231,7 +9296,201 @@ X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const DAG.getConstant(X86CC, MVT::i8), Cond); return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); } + // XOP comparison intrinsics + case Intrinsic::x86_xop_vpcomltb: + case Intrinsic::x86_xop_vpcomltw: + case Intrinsic::x86_xop_vpcomltd: + case Intrinsic::x86_xop_vpcomltq: + case Intrinsic::x86_xop_vpcomltub: + case Intrinsic::x86_xop_vpcomltuw: + case Intrinsic::x86_xop_vpcomltud: + case Intrinsic::x86_xop_vpcomltuq: + case Intrinsic::x86_xop_vpcomleb: + case Intrinsic::x86_xop_vpcomlew: + case Intrinsic::x86_xop_vpcomled: + case Intrinsic::x86_xop_vpcomleq: + case Intrinsic::x86_xop_vpcomleub: + case Intrinsic::x86_xop_vpcomleuw: + case Intrinsic::x86_xop_vpcomleud: + case Intrinsic::x86_xop_vpcomleuq: + case Intrinsic::x86_xop_vpcomgtb: + case Intrinsic::x86_xop_vpcomgtw: + case Intrinsic::x86_xop_vpcomgtd: + case Intrinsic::x86_xop_vpcomgtq: + case Intrinsic::x86_xop_vpcomgtub: + case Intrinsic::x86_xop_vpcomgtuw: + case Intrinsic::x86_xop_vpcomgtud: + case Intrinsic::x86_xop_vpcomgtuq: + case Intrinsic::x86_xop_vpcomgeb: + case Intrinsic::x86_xop_vpcomgew: + case Intrinsic::x86_xop_vpcomged: + case Intrinsic::x86_xop_vpcomgeq: + case Intrinsic::x86_xop_vpcomgeub: + case Intrinsic::x86_xop_vpcomgeuw: + case Intrinsic::x86_xop_vpcomgeud: + case Intrinsic::x86_xop_vpcomgeuq: + case Intrinsic::x86_xop_vpcomeqb: + case Intrinsic::x86_xop_vpcomeqw: + case Intrinsic::x86_xop_vpcomeqd: + case Intrinsic::x86_xop_vpcomeqq: + case Intrinsic::x86_xop_vpcomequb: + case Intrinsic::x86_xop_vpcomequw: + case Intrinsic::x86_xop_vpcomequd: + case Intrinsic::x86_xop_vpcomequq: + case Intrinsic::x86_xop_vpcomneb: + case Intrinsic::x86_xop_vpcomnew: + case Intrinsic::x86_xop_vpcomned: + case Intrinsic::x86_xop_vpcomneq: + case Intrinsic::x86_xop_vpcomneub: + case 
Intrinsic::x86_xop_vpcomneuw: + case Intrinsic::x86_xop_vpcomneud: + case Intrinsic::x86_xop_vpcomneuq: + case Intrinsic::x86_xop_vpcomfalseb: + case Intrinsic::x86_xop_vpcomfalsew: + case Intrinsic::x86_xop_vpcomfalsed: + case Intrinsic::x86_xop_vpcomfalseq: + case Intrinsic::x86_xop_vpcomfalseub: + case Intrinsic::x86_xop_vpcomfalseuw: + case Intrinsic::x86_xop_vpcomfalseud: + case Intrinsic::x86_xop_vpcomfalseuq: + case Intrinsic::x86_xop_vpcomtrueb: + case Intrinsic::x86_xop_vpcomtruew: + case Intrinsic::x86_xop_vpcomtrued: + case Intrinsic::x86_xop_vpcomtrueq: + case Intrinsic::x86_xop_vpcomtrueub: + case Intrinsic::x86_xop_vpcomtrueuw: + case Intrinsic::x86_xop_vpcomtrueud: + case Intrinsic::x86_xop_vpcomtrueuq: { + unsigned CC = 0; + unsigned Opc = 0; + + switch (IntNo) { + default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. + case Intrinsic::x86_xop_vpcomltb: + case Intrinsic::x86_xop_vpcomltw: + case Intrinsic::x86_xop_vpcomltd: + case Intrinsic::x86_xop_vpcomltq: + CC = 0; + Opc = X86ISD::VPCOM; + break; + case Intrinsic::x86_xop_vpcomltub: + case Intrinsic::x86_xop_vpcomltuw: + case Intrinsic::x86_xop_vpcomltud: + case Intrinsic::x86_xop_vpcomltuq: + CC = 0; + Opc = X86ISD::VPCOMU; + break; + case Intrinsic::x86_xop_vpcomleb: + case Intrinsic::x86_xop_vpcomlew: + case Intrinsic::x86_xop_vpcomled: + case Intrinsic::x86_xop_vpcomleq: + CC = 1; + Opc = X86ISD::VPCOM; + break; + case Intrinsic::x86_xop_vpcomleub: + case Intrinsic::x86_xop_vpcomleuw: + case Intrinsic::x86_xop_vpcomleud: + case Intrinsic::x86_xop_vpcomleuq: + CC = 1; + Opc = X86ISD::VPCOMU; + break; + case Intrinsic::x86_xop_vpcomgtb: + case Intrinsic::x86_xop_vpcomgtw: + case Intrinsic::x86_xop_vpcomgtd: + case Intrinsic::x86_xop_vpcomgtq: + CC = 2; + Opc = X86ISD::VPCOM; + break; + case Intrinsic::x86_xop_vpcomgtub: + case Intrinsic::x86_xop_vpcomgtuw: + case Intrinsic::x86_xop_vpcomgtud: + case Intrinsic::x86_xop_vpcomgtuq: + CC = 2; + Opc = X86ISD::VPCOMU; + break; + case Intrinsic::x86_xop_vpcomgeb: + case Intrinsic::x86_xop_vpcomgew: + case Intrinsic::x86_xop_vpcomged: + case Intrinsic::x86_xop_vpcomgeq: + CC = 3; + Opc = X86ISD::VPCOM; + break; + case Intrinsic::x86_xop_vpcomgeub: + case Intrinsic::x86_xop_vpcomgeuw: + case Intrinsic::x86_xop_vpcomgeud: + case Intrinsic::x86_xop_vpcomgeuq: + CC = 3; + Opc = X86ISD::VPCOMU; + break; + case Intrinsic::x86_xop_vpcomeqb: + case Intrinsic::x86_xop_vpcomeqw: + case Intrinsic::x86_xop_vpcomeqd: + case Intrinsic::x86_xop_vpcomeqq: + CC = 4; + Opc = X86ISD::VPCOM; + break; + case Intrinsic::x86_xop_vpcomequb: + case Intrinsic::x86_xop_vpcomequw: + case Intrinsic::x86_xop_vpcomequd: + case Intrinsic::x86_xop_vpcomequq: + CC = 4; + Opc = X86ISD::VPCOMU; + break; + case Intrinsic::x86_xop_vpcomneb: + case Intrinsic::x86_xop_vpcomnew: + case Intrinsic::x86_xop_vpcomned: + case Intrinsic::x86_xop_vpcomneq: + CC = 5; + Opc = X86ISD::VPCOM; + break; + case Intrinsic::x86_xop_vpcomneub: + case Intrinsic::x86_xop_vpcomneuw: + case Intrinsic::x86_xop_vpcomneud: + case Intrinsic::x86_xop_vpcomneuq: + CC = 5; + Opc = X86ISD::VPCOMU; + break; + case Intrinsic::x86_xop_vpcomfalseb: + case Intrinsic::x86_xop_vpcomfalsew: + case Intrinsic::x86_xop_vpcomfalsed: + case Intrinsic::x86_xop_vpcomfalseq: + CC = 6; + Opc = X86ISD::VPCOM; + break; + case Intrinsic::x86_xop_vpcomfalseub: + case Intrinsic::x86_xop_vpcomfalseuw: + case Intrinsic::x86_xop_vpcomfalseud: + case Intrinsic::x86_xop_vpcomfalseuq: + CC = 6; + Opc = X86ISD::VPCOMU; + break; + case 
Intrinsic::x86_xop_vpcomtrueb: + case Intrinsic::x86_xop_vpcomtruew: + case Intrinsic::x86_xop_vpcomtrued: + case Intrinsic::x86_xop_vpcomtrueq: + CC = 7; + Opc = X86ISD::VPCOM; + break; + case Intrinsic::x86_xop_vpcomtrueub: + case Intrinsic::x86_xop_vpcomtrueuw: + case Intrinsic::x86_xop_vpcomtrueud: + case Intrinsic::x86_xop_vpcomtrueuq: + CC = 7; + Opc = X86ISD::VPCOMU; + break; + } + + SDValue LHS = Op.getOperand(1); + SDValue RHS = Op.getOperand(2); + return DAG.getNode(Opc, dl, Op.getValueType(), LHS, RHS, + DAG.getConstant(CC, MVT::i8)); + } + // Arithmetic intrinsics. + case Intrinsic::x86_sse2_pmulu_dq: + case Intrinsic::x86_avx2_pmulu_dq: + return DAG.getNode(X86ISD::PMULUDQ, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); case Intrinsic::x86_sse3_hadd_ps: case Intrinsic::x86_sse3_hadd_pd: case Intrinsic::x86_avx_hadd_ps_256: @@ -9244,6 +9503,62 @@ X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const case Intrinsic::x86_avx_hsub_pd_256: return DAG.getNode(X86ISD::FHSUB, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2)); + case Intrinsic::x86_ssse3_phadd_w_128: + case Intrinsic::x86_ssse3_phadd_d_128: + case Intrinsic::x86_avx2_phadd_w: + case Intrinsic::x86_avx2_phadd_d: + return DAG.getNode(X86ISD::HADD, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + case Intrinsic::x86_ssse3_phsub_w_128: + case Intrinsic::x86_ssse3_phsub_d_128: + case Intrinsic::x86_avx2_phsub_w: + case Intrinsic::x86_avx2_phsub_d: + return DAG.getNode(X86ISD::HSUB, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + case Intrinsic::x86_avx2_psllv_d: + case Intrinsic::x86_avx2_psllv_q: + case Intrinsic::x86_avx2_psllv_d_256: + case Intrinsic::x86_avx2_psllv_q_256: + return DAG.getNode(ISD::SHL, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + case Intrinsic::x86_avx2_psrlv_d: + case Intrinsic::x86_avx2_psrlv_q: + case Intrinsic::x86_avx2_psrlv_d_256: + case Intrinsic::x86_avx2_psrlv_q_256: + return DAG.getNode(ISD::SRL, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + case Intrinsic::x86_avx2_psrav_d: + case Intrinsic::x86_avx2_psrav_d_256: + return DAG.getNode(ISD::SRA, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + case Intrinsic::x86_ssse3_pshuf_b_128: + case Intrinsic::x86_avx2_pshuf_b: + return DAG.getNode(X86ISD::PSHUFB, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + case Intrinsic::x86_ssse3_psign_b_128: + case Intrinsic::x86_ssse3_psign_w_128: + case Intrinsic::x86_ssse3_psign_d_128: + case Intrinsic::x86_avx2_psign_b: + case Intrinsic::x86_avx2_psign_w: + case Intrinsic::x86_avx2_psign_d: + return DAG.getNode(X86ISD::PSIGN, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + case Intrinsic::x86_sse41_insertps: + return DAG.getNode(X86ISD::INSERTPS, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); + case Intrinsic::x86_avx_vperm2f128_ps_256: + case Intrinsic::x86_avx_vperm2f128_pd_256: + case Intrinsic::x86_avx_vperm2f128_si_256: + case Intrinsic::x86_avx2_vperm2i128: + return DAG.getNode(X86ISD::VPERM2X128, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); + case Intrinsic::x86_avx_vpermil_ps: + case Intrinsic::x86_avx_vpermil_pd: + case Intrinsic::x86_avx_vpermil_ps_256: + case Intrinsic::x86_avx_vpermil_pd_256: + return DAG.getNode(X86ISD::VPERMILP, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + // ptest and testp intrinsics. 
The intrinsic these come from are designed to // return an integer value, not just an instruction so lower it to the ptest // or testp pattern and a setcc for the result. @@ -9310,16 +9625,53 @@ X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); } - // Fix vector shift instructions where the last operand is a non-immediate - // i32 value. + // SSE/AVX shift intrinsics + case Intrinsic::x86_sse2_psll_w: + case Intrinsic::x86_sse2_psll_d: + case Intrinsic::x86_sse2_psll_q: + case Intrinsic::x86_avx2_psll_w: + case Intrinsic::x86_avx2_psll_d: + case Intrinsic::x86_avx2_psll_q: + return DAG.getNode(X86ISD::VSHL, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + case Intrinsic::x86_sse2_psrl_w: + case Intrinsic::x86_sse2_psrl_d: + case Intrinsic::x86_sse2_psrl_q: + case Intrinsic::x86_avx2_psrl_w: + case Intrinsic::x86_avx2_psrl_d: + case Intrinsic::x86_avx2_psrl_q: + return DAG.getNode(X86ISD::VSRL, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + case Intrinsic::x86_sse2_psra_w: + case Intrinsic::x86_sse2_psra_d: + case Intrinsic::x86_avx2_psra_w: + case Intrinsic::x86_avx2_psra_d: + return DAG.getNode(X86ISD::VSRA, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); case Intrinsic::x86_sse2_pslli_w: case Intrinsic::x86_sse2_pslli_d: case Intrinsic::x86_sse2_pslli_q: + case Intrinsic::x86_avx2_pslli_w: + case Intrinsic::x86_avx2_pslli_d: + case Intrinsic::x86_avx2_pslli_q: + return getTargetVShiftNode(X86ISD::VSHLI, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2), DAG); case Intrinsic::x86_sse2_psrli_w: case Intrinsic::x86_sse2_psrli_d: case Intrinsic::x86_sse2_psrli_q: + case Intrinsic::x86_avx2_psrli_w: + case Intrinsic::x86_avx2_psrli_d: + case Intrinsic::x86_avx2_psrli_q: + return getTargetVShiftNode(X86ISD::VSRLI, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2), DAG); case Intrinsic::x86_sse2_psrai_w: case Intrinsic::x86_sse2_psrai_d: + case Intrinsic::x86_avx2_psrai_w: + case Intrinsic::x86_avx2_psrai_d: + return getTargetVShiftNode(X86ISD::VSRAI, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2), DAG); + // Fix vector shift instructions where the last operand is a non-immediate + // i32 value. 
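For reference, assuming Clang's <emmintrin.h> maps these builtins onto the x86_sse2_pslli_d and x86_sse2_psll_d intrinsics handled above, the immediate and register forms of the same C-level shift reach the X86ISD::VSHLI and X86ISD::VSHL paths respectively:

#include <emmintrin.h>

// shift_imm shifts every 32-bit lane left by 3 (immediate count);
// shift_reg shifts by the count held in the low quadword of c.
__m128i shift_imm(__m128i v)            { return _mm_slli_epi32(v, 3); } // -> VSHLI
__m128i shift_reg(__m128i v, __m128i c) { return _mm_sll_epi32(v, c);  } // -> VSHL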
case Intrinsic::x86_mmx_pslli_w: case Intrinsic::x86_mmx_pslli_d: case Intrinsic::x86_mmx_pslli_q: @@ -9333,79 +9685,40 @@ X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const return SDValue(); unsigned NewIntNo = 0; - EVT ShAmtVT = MVT::v4i32; switch (IntNo) { - case Intrinsic::x86_sse2_pslli_w: - NewIntNo = Intrinsic::x86_sse2_psll_w; - break; - case Intrinsic::x86_sse2_pslli_d: - NewIntNo = Intrinsic::x86_sse2_psll_d; + case Intrinsic::x86_mmx_pslli_w: + NewIntNo = Intrinsic::x86_mmx_psll_w; break; - case Intrinsic::x86_sse2_pslli_q: - NewIntNo = Intrinsic::x86_sse2_psll_q; + case Intrinsic::x86_mmx_pslli_d: + NewIntNo = Intrinsic::x86_mmx_psll_d; break; - case Intrinsic::x86_sse2_psrli_w: - NewIntNo = Intrinsic::x86_sse2_psrl_w; + case Intrinsic::x86_mmx_pslli_q: + NewIntNo = Intrinsic::x86_mmx_psll_q; break; - case Intrinsic::x86_sse2_psrli_d: - NewIntNo = Intrinsic::x86_sse2_psrl_d; + case Intrinsic::x86_mmx_psrli_w: + NewIntNo = Intrinsic::x86_mmx_psrl_w; break; - case Intrinsic::x86_sse2_psrli_q: - NewIntNo = Intrinsic::x86_sse2_psrl_q; + case Intrinsic::x86_mmx_psrli_d: + NewIntNo = Intrinsic::x86_mmx_psrl_d; break; - case Intrinsic::x86_sse2_psrai_w: - NewIntNo = Intrinsic::x86_sse2_psra_w; + case Intrinsic::x86_mmx_psrli_q: + NewIntNo = Intrinsic::x86_mmx_psrl_q; break; - case Intrinsic::x86_sse2_psrai_d: - NewIntNo = Intrinsic::x86_sse2_psra_d; + case Intrinsic::x86_mmx_psrai_w: + NewIntNo = Intrinsic::x86_mmx_psra_w; break; - default: { - ShAmtVT = MVT::v2i32; - switch (IntNo) { - case Intrinsic::x86_mmx_pslli_w: - NewIntNo = Intrinsic::x86_mmx_psll_w; - break; - case Intrinsic::x86_mmx_pslli_d: - NewIntNo = Intrinsic::x86_mmx_psll_d; - break; - case Intrinsic::x86_mmx_pslli_q: - NewIntNo = Intrinsic::x86_mmx_psll_q; - break; - case Intrinsic::x86_mmx_psrli_w: - NewIntNo = Intrinsic::x86_mmx_psrl_w; - break; - case Intrinsic::x86_mmx_psrli_d: - NewIntNo = Intrinsic::x86_mmx_psrl_d; - break; - case Intrinsic::x86_mmx_psrli_q: - NewIntNo = Intrinsic::x86_mmx_psrl_q; - break; - case Intrinsic::x86_mmx_psrai_w: - NewIntNo = Intrinsic::x86_mmx_psra_w; - break; - case Intrinsic::x86_mmx_psrai_d: - NewIntNo = Intrinsic::x86_mmx_psra_d; - break; - default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. - } + case Intrinsic::x86_mmx_psrai_d: + NewIntNo = Intrinsic::x86_mmx_psra_d; break; - } + default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. } // The vector shift intrinsics with scalars uses 32b shift amounts but // the sse2/mmx shift instructions reads 64 bits. Set the upper 32 bits // to be zero. - SDValue ShOps[4]; - ShOps[0] = ShAmt; - ShOps[1] = DAG.getConstant(0, MVT::i32); - if (ShAmtVT == MVT::v4i32) { - ShOps[2] = DAG.getUNDEF(MVT::i32); - ShOps[3] = DAG.getUNDEF(MVT::i32); - ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, ShAmtVT, &ShOps[0], 4); - } else { - ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, ShAmtVT, &ShOps[0], 2); + ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, ShAmt, + DAG.getConstant(0, MVT::i32)); // FIXME this must be lowered to get rid of the invalid type. - } EVT VT = Op.getValueType(); ShAmt = DAG.getNode(ISD::BITCAST, dl, VT, ShAmt); @@ -9432,13 +9745,13 @@ SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op, return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), DAG.getNode(ISD::ADD, dl, getPointerTy(), FrameAddr, Offset), - MachinePointerInfo(), false, false, 0); + MachinePointerInfo(), false, false, false, 0); } // Just load the return address. 
SDValue RetAddrFI = getReturnAddressFrameIndex(DAG); return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), - RetAddrFI, MachinePointerInfo(), false, false, 0); + RetAddrFI, MachinePointerInfo(), false, false, false, 0); } SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { @@ -9453,7 +9766,7 @@ SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { while (Depth--) FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr, MachinePointerInfo(), - false, false, 0); + false, false, false, 0); return FrameAddr; } @@ -9685,7 +9998,7 @@ SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op, // Load FP Control Word from stack slot SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, - MachinePointerInfo(), false, false, 0); + MachinePointerInfo(), false, false, false, 0); // Transform as necessary SDValue CWD1 = @@ -9745,7 +10058,8 @@ SDValue X86TargetLowering::LowerCTLZ(SDValue Op, SelectionDAG &DAG) const { return Op; } -SDValue X86TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const { +SDValue X86TargetLowering::LowerCTLZ_ZERO_UNDEF(SDValue Op, + SelectionDAG &DAG) const { EVT VT = Op.getValueType(); EVT OpVT = VT; unsigned NumBits = VT.getSizeInBits(); @@ -9753,26 +10067,41 @@ SDValue X86TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const { Op = Op.getOperand(0); if (VT == MVT::i8) { + // Zero extend to i32 since there is not an i8 bsr. OpVT = MVT::i32; Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op); } - // Issue a bsf (scan bits forward) which also sets EFLAGS. + // Issue a bsr (scan bits in reverse). SDVTList VTs = DAG.getVTList(OpVT, MVT::i32); + Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op); + + // And xor with NumBits-1. + Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT)); + + if (VT == MVT::i8) + Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op); + return Op; +} + +SDValue X86TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const { + EVT VT = Op.getValueType(); + unsigned NumBits = VT.getSizeInBits(); + DebugLoc dl = Op.getDebugLoc(); + Op = Op.getOperand(0); + + // Issue a bsf (scan bits forward) which also sets EFLAGS. + SDVTList VTs = DAG.getVTList(VT, MVT::i32); Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op); // If src is zero (i.e. bsf sets ZF), returns NumBits. SDValue Ops[] = { Op, - DAG.getConstant(NumBits, OpVT), + DAG.getConstant(NumBits, VT), DAG.getConstant(X86::COND_E, MVT::i8), Op.getValue(1) }; - Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops, array_lengthof(Ops)); - - if (VT == MVT::i8) - Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op); - return Op; + return DAG.getNode(X86ISD::CMOV, dl, VT, Ops, array_lengthof(Ops)); } // Lower256IntArith - Break a 256-bit integer operation into two new 128-bit @@ -9824,49 +10153,49 @@ SDValue X86TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); // Decompose 256-bit ops into smaller 128-bit ops. 
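A small standalone model of the BSR-based CTLZ_ZERO_UNDEF lowering above, showing why the XOR with NumBits-1 equals the subtraction 31 - bsr(x): the index returned by bsr never exceeds 31, so no borrow can occur.

#include <cassert>
#include <cstdint>

static unsigned bsr32(uint32_t x) {   // index of the highest set bit, x != 0
  unsigned idx = 0;
  while (x >>= 1) ++idx;
  return idx;
}

static unsigned ctlz32(uint32_t x) { return bsr32(x) ^ 31u; }

int main() {
  assert(ctlz32(1u) == 31);            // highest bit at index 0
  assert(ctlz32(0x80000000u) == 0);    // highest bit at index 31
  assert(ctlz32(0x00010000u) == 15);
  return 0;
}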
- if (VT.getSizeInBits() == 256) + if (VT.getSizeInBits() == 256 && !Subtarget->hasAVX2()) return Lower256IntArith(Op, DAG); - assert(VT == MVT::v2i64 && "Only know how to lower V2I64 multiply"); + assert((VT == MVT::v2i64 || VT == MVT::v4i64) && + "Only know how to lower V2I64/V4I64 multiply"); + DebugLoc dl = Op.getDebugLoc(); - // ulong2 Ahi = __builtin_ia32_psrlqi128( a, 32); - // ulong2 Bhi = __builtin_ia32_psrlqi128( b, 32); - // ulong2 AloBlo = __builtin_ia32_pmuludq128( a, b ); - // ulong2 AloBhi = __builtin_ia32_pmuludq128( a, Bhi ); - // ulong2 AhiBlo = __builtin_ia32_pmuludq128( Ahi, b ); + // Ahi = psrlqi(a, 32); + // Bhi = psrlqi(b, 32); // - // AloBhi = __builtin_ia32_psllqi128( AloBhi, 32 ); - // AhiBlo = __builtin_ia32_psllqi128( AhiBlo, 32 ); + // AloBlo = pmuludq(a, b); + // AloBhi = pmuludq(a, Bhi); + // AhiBlo = pmuludq(Ahi, b); + + // AloBhi = psllqi(AloBhi, 32); + // AhiBlo = psllqi(AhiBlo, 32); // return AloBlo + AloBhi + AhiBlo; SDValue A = Op.getOperand(0); SDValue B = Op.getOperand(1); - SDValue Ahi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, - DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32), - A, DAG.getConstant(32, MVT::i32)); - SDValue Bhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, - DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32), - B, DAG.getConstant(32, MVT::i32)); - SDValue AloBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, - DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32), - A, B); - SDValue AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, - DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32), - A, Bhi); - SDValue AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, - DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32), - Ahi, B); - AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, - DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32), - AloBhi, DAG.getConstant(32, MVT::i32)); - AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, - DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32), - AhiBlo, DAG.getConstant(32, MVT::i32)); + SDValue ShAmt = DAG.getConstant(32, MVT::i32); + + SDValue Ahi = DAG.getNode(X86ISD::VSRLI, dl, VT, A, ShAmt); + SDValue Bhi = DAG.getNode(X86ISD::VSRLI, dl, VT, B, ShAmt); + + // Bit cast to 32-bit vectors for MULUDQ + EVT MulVT = (VT == MVT::v2i64) ? MVT::v4i32 : MVT::v8i32; + A = DAG.getNode(ISD::BITCAST, dl, MulVT, A); + B = DAG.getNode(ISD::BITCAST, dl, MulVT, B); + Ahi = DAG.getNode(ISD::BITCAST, dl, MulVT, Ahi); + Bhi = DAG.getNode(ISD::BITCAST, dl, MulVT, Bhi); + + SDValue AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B); + SDValue AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi); + SDValue AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B); + + AloBhi = DAG.getNode(X86ISD::VSHLI, dl, VT, AloBhi, ShAmt); + AhiBlo = DAG.getNode(X86ISD::VSHLI, dl, VT, AhiBlo, ShAmt); + SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi); - Res = DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo); - return Res; + return DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo); } SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const { @@ -9877,12 +10206,183 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const { SDValue Amt = Op.getOperand(1); LLVMContext *Context = DAG.getContext(); - if (!Subtarget->hasXMMInt()) + if (!Subtarget->hasSSE2()) return SDValue(); + // Optimize shl/srl/sra with constant shift amount. 
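A scalar sketch of the lane-wise 64-bit multiply decomposition above, written with plain uint64_t arithmetic: PMULUDQ is a 32x32->64 unsigned multiply, and the ahi*bhi term vanishes modulo 2^64.

#include <cassert>
#include <cstdint>

static uint64_t mul64_via_pmuludq(uint64_t a, uint64_t b) {
  uint64_t alo = a & 0xffffffffu, ahi = a >> 32;
  uint64_t blo = b & 0xffffffffu, bhi = b >> 32;
  uint64_t alo_blo = alo * blo;            // pmuludq(a, b)
  uint64_t alo_bhi = (alo * bhi) << 32;    // pmuludq(a, Bhi), then psllqi by 32
  uint64_t ahi_blo = (ahi * blo) << 32;    // pmuludq(Ahi, b), then psllqi by 32
  return alo_blo + alo_bhi + ahi_blo;      // == a * b modulo 2^64
}

int main() {
  uint64_t a = 0x123456789abcdef0ULL, b = 0x0fedcba987654321ULL;
  assert(mul64_via_pmuludq(a, b) == a * b);
  return 0;
}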
+ if (isSplatVector(Amt.getNode())) { + SDValue SclrAmt = Amt->getOperand(0); + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(SclrAmt)) { + uint64_t ShiftAmt = C->getZExtValue(); + + if (VT == MVT::v2i64 || VT == MVT::v4i32 || VT == MVT::v8i16 || + (Subtarget->hasAVX2() && + (VT == MVT::v4i64 || VT == MVT::v8i32 || VT == MVT::v16i16))) { + if (Op.getOpcode() == ISD::SHL) + return DAG.getNode(X86ISD::VSHLI, dl, VT, R, + DAG.getConstant(ShiftAmt, MVT::i32)); + if (Op.getOpcode() == ISD::SRL) + return DAG.getNode(X86ISD::VSRLI, dl, VT, R, + DAG.getConstant(ShiftAmt, MVT::i32)); + if (Op.getOpcode() == ISD::SRA && VT != MVT::v2i64 && VT != MVT::v4i64) + return DAG.getNode(X86ISD::VSRAI, dl, VT, R, + DAG.getConstant(ShiftAmt, MVT::i32)); + } + + if (VT == MVT::v16i8) { + if (Op.getOpcode() == ISD::SHL) { + // Make a large shift. + SDValue SHL = DAG.getNode(X86ISD::VSHLI, dl, MVT::v8i16, R, + DAG.getConstant(ShiftAmt, MVT::i32)); + SHL = DAG.getNode(ISD::BITCAST, dl, VT, SHL); + // Zero out the rightmost bits. + SmallVector<SDValue, 16> V(16, + DAG.getConstant(uint8_t(-1U << ShiftAmt), + MVT::i8)); + return DAG.getNode(ISD::AND, dl, VT, SHL, + DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 16)); + } + if (Op.getOpcode() == ISD::SRL) { + // Make a large shift. + SDValue SRL = DAG.getNode(X86ISD::VSRLI, dl, MVT::v8i16, R, + DAG.getConstant(ShiftAmt, MVT::i32)); + SRL = DAG.getNode(ISD::BITCAST, dl, VT, SRL); + // Zero out the leftmost bits. + SmallVector<SDValue, 16> V(16, + DAG.getConstant(uint8_t(-1U) >> ShiftAmt, + MVT::i8)); + return DAG.getNode(ISD::AND, dl, VT, SRL, + DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 16)); + } + if (Op.getOpcode() == ISD::SRA) { + if (ShiftAmt == 7) { + // R s>> 7 === R s< 0 + SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl); + return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R); + } + + // R s>> a === ((R u>> a) ^ m) - m + SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt); + SmallVector<SDValue, 16> V(16, DAG.getConstant(128 >> ShiftAmt, + MVT::i8)); + SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 16); + Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask); + Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask); + return Res; + } + } + + if (Subtarget->hasAVX2() && VT == MVT::v32i8) { + if (Op.getOpcode() == ISD::SHL) { + // Make a large shift. + SDValue SHL = DAG.getNode(X86ISD::VSHLI, dl, MVT::v16i16, R, + DAG.getConstant(ShiftAmt, MVT::i32)); + SHL = DAG.getNode(ISD::BITCAST, dl, VT, SHL); + // Zero out the rightmost bits. + SmallVector<SDValue, 32> V(32, + DAG.getConstant(uint8_t(-1U << ShiftAmt), + MVT::i8)); + return DAG.getNode(ISD::AND, dl, VT, SHL, + DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 32)); + } + if (Op.getOpcode() == ISD::SRL) { + // Make a large shift. + SDValue SRL = DAG.getNode(X86ISD::VSRLI, dl, MVT::v16i16, R, + DAG.getConstant(ShiftAmt, MVT::i32)); + SRL = DAG.getNode(ISD::BITCAST, dl, VT, SRL); + // Zero out the leftmost bits. 
+ SmallVector<SDValue, 32> V(32, + DAG.getConstant(uint8_t(-1U) >> ShiftAmt, + MVT::i8)); + return DAG.getNode(ISD::AND, dl, VT, SRL, + DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 32)); + } + if (Op.getOpcode() == ISD::SRA) { + if (ShiftAmt == 7) { + // R s>> 7 === R s< 0 + SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl); + return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R); + } + + // R s>> a === ((R u>> a) ^ m) - m + SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt); + SmallVector<SDValue, 32> V(32, DAG.getConstant(128 >> ShiftAmt, + MVT::i8)); + SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 32); + Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask); + Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask); + return Res; + } + } + } + } + + // Lower SHL with variable shift amount. + if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) { + Op = DAG.getNode(X86ISD::VSHLI, dl, VT, Op.getOperand(1), + DAG.getConstant(23, MVT::i32)); + + const uint32_t CV[] = { 0x3f800000U, 0x3f800000U, 0x3f800000U, 0x3f800000U}; + Constant *C = ConstantDataVector::get(*Context, CV); + SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); + SDValue Addend = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, + MachinePointerInfo::getConstantPool(), + false, false, false, 16); + + Op = DAG.getNode(ISD::ADD, dl, VT, Op, Addend); + Op = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, Op); + Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op); + return DAG.getNode(ISD::MUL, dl, VT, Op, R); + } + if (VT == MVT::v16i8 && Op->getOpcode() == ISD::SHL) { + assert(Subtarget->hasSSE2() && "Need SSE2 for pslli/pcmpeq."); + + // a = a << 5; + Op = DAG.getNode(X86ISD::VSHLI, dl, MVT::v8i16, Op.getOperand(1), + DAG.getConstant(5, MVT::i32)); + Op = DAG.getNode(ISD::BITCAST, dl, VT, Op); + + // Turn 'a' into a mask suitable for VSELECT + SDValue VSelM = DAG.getConstant(0x80, VT); + SDValue OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op); + OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM); + + SDValue CM1 = DAG.getConstant(0x0f, VT); + SDValue CM2 = DAG.getConstant(0x3f, VT); + + // r = VSELECT(r, psllw(r & (char16)15, 4), a); + SDValue M = DAG.getNode(ISD::AND, dl, VT, R, CM1); + M = getTargetVShiftNode(X86ISD::VSHLI, dl, MVT::v8i16, M, + DAG.getConstant(4, MVT::i32), DAG); + M = DAG.getNode(ISD::BITCAST, dl, VT, M); + R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, M, R); + + // a += a + Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op); + OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op); + OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM); + + // r = VSELECT(r, psllw(r & (char16)63, 2), a); + M = DAG.getNode(ISD::AND, dl, VT, R, CM2); + M = getTargetVShiftNode(X86ISD::VSHLI, dl, MVT::v8i16, M, + DAG.getConstant(2, MVT::i32), DAG); + M = DAG.getNode(ISD::BITCAST, dl, VT, M); + R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, M, R); + + // a += a + Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op); + OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op); + OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM); + + // return VSELECT(r, r+r, a); + R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, + DAG.getNode(ISD::ADD, dl, VT, R, R), R); + return R; + } + // Decompose 256-bit shifts into smaller 128-bit shifts. 
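A minimal scalar model of the float-exponent trick used for the variable v4i32 shift-left above: shifting the amount into the exponent field and adding the bias 0x3f800000 produces the IEEE-754 encoding of 2^amt, so converting back to an integer and multiplying performs x << amt per lane (the vector code does the same with VSHLI by 23, an ADD of the splatted constant, and FP_TO_SINT).

#include <cassert>
#include <cstdint>
#include <cstring>

static uint32_t shl_via_float(uint32_t x, uint32_t amt) {   // amt < 32
  uint32_t bits = (amt << 23) + 0x3f800000u;   // exponent field receives amt
  float pow2;
  std::memcpy(&pow2, &bits, sizeof pow2);      // pow2 is exactly 2^amt
  return x * static_cast<uint32_t>(pow2);
}

int main() {
  assert(shl_via_float(7u, 5) == (7u << 5));
  assert(shl_via_float(0x12345678u, 13) == (0x12345678u << 13));
  return 0;
}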
if (VT.getSizeInBits() == 256) { - int NumElems = VT.getVectorNumElements(); + unsigned NumElems = VT.getVectorNumElements(); MVT EltVT = VT.getVectorElementType().getSimpleVT(); EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); @@ -9897,9 +10397,9 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const { // Constant shift amount SmallVector<SDValue, 4> Amt1Csts; SmallVector<SDValue, 4> Amt2Csts; - for (int i = 0; i < NumElems/2; ++i) + for (unsigned i = 0; i != NumElems/2; ++i) Amt1Csts.push_back(Amt->getOperand(i)); - for (int i = NumElems/2; i < NumElems; ++i) + for (unsigned i = NumElems/2; i != NumElems; ++i) Amt2Csts.push_back(Amt->getOperand(i)); Amt1 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, @@ -9921,120 +10421,6 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const { return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, V1, V2); } - // Optimize shl/srl/sra with constant shift amount. - if (isSplatVector(Amt.getNode())) { - SDValue SclrAmt = Amt->getOperand(0); - if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(SclrAmt)) { - uint64_t ShiftAmt = C->getZExtValue(); - - if (VT == MVT::v2i64 && Op.getOpcode() == ISD::SHL) - return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, - DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32), - R, DAG.getConstant(ShiftAmt, MVT::i32)); - - if (VT == MVT::v4i32 && Op.getOpcode() == ISD::SHL) - return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, - DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32), - R, DAG.getConstant(ShiftAmt, MVT::i32)); - - if (VT == MVT::v8i16 && Op.getOpcode() == ISD::SHL) - return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, - DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), - R, DAG.getConstant(ShiftAmt, MVT::i32)); - - if (VT == MVT::v2i64 && Op.getOpcode() == ISD::SRL) - return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, - DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32), - R, DAG.getConstant(ShiftAmt, MVT::i32)); - - if (VT == MVT::v4i32 && Op.getOpcode() == ISD::SRL) - return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, - DAG.getConstant(Intrinsic::x86_sse2_psrli_d, MVT::i32), - R, DAG.getConstant(ShiftAmt, MVT::i32)); - - if (VT == MVT::v8i16 && Op.getOpcode() == ISD::SRL) - return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, - DAG.getConstant(Intrinsic::x86_sse2_psrli_w, MVT::i32), - R, DAG.getConstant(ShiftAmt, MVT::i32)); - - if (VT == MVT::v4i32 && Op.getOpcode() == ISD::SRA) - return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, - DAG.getConstant(Intrinsic::x86_sse2_psrai_d, MVT::i32), - R, DAG.getConstant(ShiftAmt, MVT::i32)); - - if (VT == MVT::v8i16 && Op.getOpcode() == ISD::SRA) - return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, - DAG.getConstant(Intrinsic::x86_sse2_psrai_w, MVT::i32), - R, DAG.getConstant(ShiftAmt, MVT::i32)); - } - } - - // Lower SHL with variable shift amount. 
- if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) { - Op = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, - DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32), - Op.getOperand(1), DAG.getConstant(23, MVT::i32)); - - ConstantInt *CI = ConstantInt::get(*Context, APInt(32, 0x3f800000U)); - - std::vector<Constant*> CV(4, CI); - Constant *C = ConstantVector::get(CV); - SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); - SDValue Addend = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, - MachinePointerInfo::getConstantPool(), - false, false, 16); - - Op = DAG.getNode(ISD::ADD, dl, VT, Op, Addend); - Op = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, Op); - Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op); - return DAG.getNode(ISD::MUL, dl, VT, Op, R); - } - if (VT == MVT::v16i8 && Op->getOpcode() == ISD::SHL) { - // a = a << 5; - Op = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, - DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), - Op.getOperand(1), DAG.getConstant(5, MVT::i32)); - - ConstantInt *CM1 = ConstantInt::get(*Context, APInt(8, 15)); - ConstantInt *CM2 = ConstantInt::get(*Context, APInt(8, 63)); - - std::vector<Constant*> CVM1(16, CM1); - std::vector<Constant*> CVM2(16, CM2); - Constant *C = ConstantVector::get(CVM1); - SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); - SDValue M = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, - MachinePointerInfo::getConstantPool(), - false, false, 16); - - // r = pblendv(r, psllw(r & (char16)15, 4), a); - M = DAG.getNode(ISD::AND, dl, VT, R, M); - M = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, - DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), M, - DAG.getConstant(4, MVT::i32)); - R = DAG.getNode(ISD::VSELECT, dl, VT, Op, R, M); - // a += a - Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op); - - C = ConstantVector::get(CVM2); - CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); - M = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, - MachinePointerInfo::getConstantPool(), - false, false, 16); - - // r = pblendv(r, psllw(r & (char16)63, 2), a); - M = DAG.getNode(ISD::AND, dl, VT, R, M); - M = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, - DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), M, - DAG.getConstant(2, MVT::i32)); - R = DAG.getNode(ISD::VSELECT, dl, VT, Op, R, M); - // a += a - Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op); - - // return pblendv(r, r+r, a); - R = DAG.getNode(ISD::VSELECT, dl, VT, Op, - R, DAG.getNode(ISD::ADD, dl, VT, R, R)); - return R; - } return SDValue(); } @@ -10113,46 +10499,58 @@ SDValue X86TargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) const { return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC); } -SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const{ +SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, + SelectionDAG &DAG) const { DebugLoc dl = Op.getDebugLoc(); - SDNode* Node = Op.getNode(); - EVT ExtraVT = cast<VTSDNode>(Node->getOperand(1))->getVT(); - EVT VT = Node->getValueType(0); - if (Subtarget->hasXMMInt() && VT.isVector()) { - unsigned BitsDiff = VT.getScalarType().getSizeInBits() - - ExtraVT.getScalarType().getSizeInBits(); - SDValue ShAmt = DAG.getConstant(BitsDiff, MVT::i32); - - unsigned SHLIntrinsicsID = 0; - unsigned SRAIntrinsicsID = 0; - switch (VT.getSimpleVT().SimpleTy) { - default: - return SDValue(); - case MVT::v4i32: { - SHLIntrinsicsID = Intrinsic::x86_sse2_pslli_d; - SRAIntrinsicsID = Intrinsic::x86_sse2_psrai_d; - break; - } - case MVT::v8i16: { - SHLIntrinsicsID = 
Intrinsic::x86_sse2_pslli_w; - SRAIntrinsicsID = Intrinsic::x86_sse2_psrai_w; - break; - } - } + EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT(); + EVT VT = Op.getValueType(); - SDValue Tmp1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, - DAG.getConstant(SHLIntrinsicsID, MVT::i32), - Node->getOperand(0), ShAmt); + if (!Subtarget->hasSSE2() || !VT.isVector()) + return SDValue(); - // In case of 1 bit sext, no need to shr - if (ExtraVT.getScalarType().getSizeInBits() == 1) return Tmp1; + unsigned BitsDiff = VT.getScalarType().getSizeInBits() - + ExtraVT.getScalarType().getSizeInBits(); + SDValue ShAmt = DAG.getConstant(BitsDiff, MVT::i32); - return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, - DAG.getConstant(SRAIntrinsicsID, MVT::i32), - Tmp1, ShAmt); + switch (VT.getSimpleVT().SimpleTy) { + default: return SDValue(); + case MVT::v8i32: + case MVT::v16i16: + if (!Subtarget->hasAVX()) + return SDValue(); + if (!Subtarget->hasAVX2()) { + // needs to be split + int NumElems = VT.getVectorNumElements(); + SDValue Idx0 = DAG.getConstant(0, MVT::i32); + SDValue Idx1 = DAG.getConstant(NumElems/2, MVT::i32); + + // Extract the LHS vectors + SDValue LHS = Op.getOperand(0); + SDValue LHS1 = Extract128BitVector(LHS, Idx0, DAG, dl); + SDValue LHS2 = Extract128BitVector(LHS, Idx1, DAG, dl); + + MVT EltVT = VT.getVectorElementType().getSimpleVT(); + EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); + + EVT ExtraEltVT = ExtraVT.getVectorElementType(); + int ExtraNumElems = ExtraVT.getVectorNumElements(); + ExtraVT = EVT::getVectorVT(*DAG.getContext(), ExtraEltVT, + ExtraNumElems/2); + SDValue Extra = DAG.getValueType(ExtraVT); + + LHS1 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, Extra); + LHS2 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, Extra); + + return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, LHS1, LHS2);; + } + // fall through + case MVT::v4i32: + case MVT::v8i16: { + SDValue Tmp1 = getTargetVShiftNode(X86ISD::VSHLI, dl, VT, + Op.getOperand(0), ShAmt, DAG); + return getTargetVShiftNode(X86ISD::VSRAI, dl, VT, Tmp1, ShAmt, DAG); + } } - - return SDValue(); } @@ -10161,7 +10559,7 @@ SDValue X86TargetLowering::LowerMEMBARRIER(SDValue Op, SelectionDAG &DAG) const{ // Go ahead and emit the fence on x86-64 even if we asked for no-sse2. // There isn't any reason to disable it if the target processor supports it. - if (!Subtarget->hasXMMInt() && !Subtarget->is64Bit()) { + if (!Subtarget->hasSSE2() && !Subtarget->is64Bit()) { SDValue Chain = Op.getOperand(0); SDValue Zero = DAG.getConstant(0, MVT::i32); SDValue Ops[] = { @@ -10215,7 +10613,7 @@ SDValue X86TargetLowering::LowerATOMIC_FENCE(SDValue Op, // Use mfence if we have SSE2 or we're on x86-64 (even if we asked for // no-sse2). There isn't any reason to disable it if the target processor // supports it. 
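A scalar model of the vector SIGN_EXTEND_INREG lowering above (shift left so the small type's sign bit reaches the top of the lane, then arithmetic-shift back), assuming the usual two's-complement arithmetic right shift:

#include <cassert>
#include <cstdint>

static int32_t sext_inreg(uint32_t x, unsigned FromBits) {
  unsigned BitsDiff = 32 - FromBits;                       // the ShAmt built above
  return static_cast<int32_t>(x << BitsDiff) >> BitsDiff;  // VSHLI then VSRAI
}

int main() {
  assert(sext_inreg(0x000000ffu, 8)  == -1);      // 0xff as an i8 is -1
  assert(sext_inreg(0x0000007fu, 8)  == 127);
  assert(sext_inreg(0x00008000u, 16) == -32768);
  return 0;
}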
- if (Subtarget->hasXMMInt() || Subtarget->is64Bit()) + if (Subtarget->hasSSE2() || Subtarget->is64Bit()) return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0)); SDValue Chain = Op.getOperand(0); @@ -10246,8 +10644,7 @@ SDValue X86TargetLowering::LowerCMP_SWAP(SDValue Op, SelectionDAG &DAG) const { unsigned Reg = 0; unsigned size = 0; switch(T.getSimpleVT().SimpleTy) { - default: - assert(false && "Invalid value type!"); + default: llvm_unreachable("Invalid value type!"); case MVT::i8: Reg = X86::AL; size = 1; break; case MVT::i16: Reg = X86::AX; size = 2; break; case MVT::i32: Reg = X86::EAX; size = 4; break; @@ -10295,7 +10692,7 @@ SDValue X86TargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const { EVT SrcVT = Op.getOperand(0).getValueType(); EVT DstVT = Op.getValueType(); - assert(Subtarget->is64Bit() && !Subtarget->hasXMMInt() && + assert(Subtarget->is64Bit() && !Subtarget->hasSSE2() && Subtarget->hasMMX() && "Unexpected custom BITCAST"); assert((DstVT == MVT::i64 || (DstVT.isVector() && DstVT.getSizeInBits()==64)) && @@ -10365,7 +10762,7 @@ static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) { unsigned Opc; bool ExtraOp = false; switch (Op.getOpcode()) { - default: assert(0 && "Invalid code"); + default: llvm_unreachable("Invalid code"); case ISD::ADDC: Opc = X86ISD::ADD; break; case ISD::ADDE: Opc = X86ISD::ADC; ExtraOp = true; break; case ISD::SUBC: Opc = X86ISD::SUB; break; @@ -10432,6 +10829,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG); case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG); case ISD::CTLZ: return LowerCTLZ(Op, DAG); + case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ_ZERO_UNDEF(Op, DAG); case ISD::CTTZ: return LowerCTTZ(Op, DAG); case ISD::MUL: return LowerMUL(Op, DAG); case ISD::SRA: @@ -10506,8 +10904,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, DebugLoc dl = N->getDebugLoc(); switch (N->getOpcode()) { default: - assert(false && "Do not know how to custom type legalize this operation!"); - return; + llvm_unreachable("Do not know how to custom type legalize this operation!"); case ISD::SIGN_EXTEND_INREG: case ISD::ADDC: case ISD::ADDE: @@ -10515,15 +10912,25 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, case ISD::SUBE: // We don't want to expand or promote these. return; - case ISD::FP_TO_SINT: { + case ISD::FP_TO_SINT: + case ISD::FP_TO_UINT: { + bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT; + + if (!IsSigned && !isIntegerTypeFTOL(SDValue(N, 0).getValueType())) + return; + std::pair<SDValue,SDValue> Vals = - FP_TO_INTHelper(SDValue(N, 0), DAG, true); + FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true); SDValue FIST = Vals.first, StackSlot = Vals.second; if (FIST.getNode() != 0) { EVT VT = N->getValueType(0); // Return a load from the stack slot. 
- Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot, - MachinePointerInfo(), false, false, 0)); + if (StackSlot.getNode() != 0) + Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot, + MachinePointerInfo(), + false, false, false, 0)); + else + Results.push_back(FIST); } return; } @@ -10657,15 +11064,19 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::PINSRW: return "X86ISD::PINSRW"; case X86ISD::PSHUFB: return "X86ISD::PSHUFB"; case X86ISD::ANDNP: return "X86ISD::ANDNP"; - case X86ISD::PSIGNB: return "X86ISD::PSIGNB"; - case X86ISD::PSIGNW: return "X86ISD::PSIGNW"; - case X86ISD::PSIGND: return "X86ISD::PSIGND"; + case X86ISD::PSIGN: return "X86ISD::PSIGN"; + case X86ISD::BLENDV: return "X86ISD::BLENDV"; + case X86ISD::BLENDPW: return "X86ISD::BLENDPW"; + case X86ISD::BLENDPS: return "X86ISD::BLENDPS"; + case X86ISD::BLENDPD: return "X86ISD::BLENDPD"; + case X86ISD::HADD: return "X86ISD::HADD"; + case X86ISD::HSUB: return "X86ISD::HSUB"; + case X86ISD::FHADD: return "X86ISD::FHADD"; + case X86ISD::FHSUB: return "X86ISD::FHSUB"; case X86ISD::FMAX: return "X86ISD::FMAX"; case X86ISD::FMIN: return "X86ISD::FMIN"; case X86ISD::FRSQRT: return "X86ISD::FRSQRT"; case X86ISD::FRCP: return "X86ISD::FRCP"; - case X86ISD::FHADD: return "X86ISD::FHADD"; - case X86ISD::FHSUB: return "X86ISD::FHSUB"; case X86ISD::TLSADDR: return "X86ISD::TLSADDR"; case X86ISD::TLSCALL: return "X86ISD::TLSCALL"; case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN"; @@ -10681,18 +11092,17 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::ATOMNAND64_DAG: return "X86ISD::ATOMNAND64_DAG"; case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL"; case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD"; + case X86ISD::VSHLDQ: return "X86ISD::VSHLDQ"; + case X86ISD::VSRLDQ: return "X86ISD::VSRLDQ"; case X86ISD::VSHL: return "X86ISD::VSHL"; case X86ISD::VSRL: return "X86ISD::VSRL"; - case X86ISD::CMPPD: return "X86ISD::CMPPD"; - case X86ISD::CMPPS: return "X86ISD::CMPPS"; - case X86ISD::PCMPEQB: return "X86ISD::PCMPEQB"; - case X86ISD::PCMPEQW: return "X86ISD::PCMPEQW"; - case X86ISD::PCMPEQD: return "X86ISD::PCMPEQD"; - case X86ISD::PCMPEQQ: return "X86ISD::PCMPEQQ"; - case X86ISD::PCMPGTB: return "X86ISD::PCMPGTB"; - case X86ISD::PCMPGTW: return "X86ISD::PCMPGTW"; - case X86ISD::PCMPGTD: return "X86ISD::PCMPGTD"; - case X86ISD::PCMPGTQ: return "X86ISD::PCMPGTQ"; + case X86ISD::VSRA: return "X86ISD::VSRA"; + case X86ISD::VSHLI: return "X86ISD::VSHLI"; + case X86ISD::VSRLI: return "X86ISD::VSRLI"; + case X86ISD::VSRAI: return "X86ISD::VSRAI"; + case X86ISD::CMPP: return "X86ISD::CMPP"; + case X86ISD::PCMPEQ: return "X86ISD::PCMPEQ"; + case X86ISD::PCMPGT: return "X86ISD::PCMPGT"; case X86ISD::ADD: return "X86ISD::ADD"; case X86ISD::SUB: return "X86ISD::SUB"; case X86ISD::ADC: return "X86ISD::ADC"; @@ -10705,54 +11115,39 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::XOR: return "X86ISD::XOR"; case X86ISD::AND: return "X86ISD::AND"; case X86ISD::ANDN: return "X86ISD::ANDN"; + case X86ISD::BLSI: return "X86ISD::BLSI"; + case X86ISD::BLSMSK: return "X86ISD::BLSMSK"; + case X86ISD::BLSR: return "X86ISD::BLSR"; case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM"; case X86ISD::PTEST: return "X86ISD::PTEST"; case X86ISD::TESTP: return "X86ISD::TESTP"; case X86ISD::PALIGN: return "X86ISD::PALIGN"; case X86ISD::PSHUFD: return "X86ISD::PSHUFD"; case X86ISD::PSHUFHW: return "X86ISD::PSHUFHW"; - case X86ISD::PSHUFHW_LD: return 
"X86ISD::PSHUFHW_LD"; case X86ISD::PSHUFLW: return "X86ISD::PSHUFLW"; - case X86ISD::PSHUFLW_LD: return "X86ISD::PSHUFLW_LD"; - case X86ISD::SHUFPS: return "X86ISD::SHUFPS"; - case X86ISD::SHUFPD: return "X86ISD::SHUFPD"; + case X86ISD::SHUFP: return "X86ISD::SHUFP"; case X86ISD::MOVLHPS: return "X86ISD::MOVLHPS"; case X86ISD::MOVLHPD: return "X86ISD::MOVLHPD"; case X86ISD::MOVHLPS: return "X86ISD::MOVHLPS"; - case X86ISD::MOVHLPD: return "X86ISD::MOVHLPD"; case X86ISD::MOVLPS: return "X86ISD::MOVLPS"; case X86ISD::MOVLPD: return "X86ISD::MOVLPD"; case X86ISD::MOVDDUP: return "X86ISD::MOVDDUP"; case X86ISD::MOVSHDUP: return "X86ISD::MOVSHDUP"; case X86ISD::MOVSLDUP: return "X86ISD::MOVSLDUP"; - case X86ISD::MOVSHDUP_LD: return "X86ISD::MOVSHDUP_LD"; - case X86ISD::MOVSLDUP_LD: return "X86ISD::MOVSLDUP_LD"; case X86ISD::MOVSD: return "X86ISD::MOVSD"; case X86ISD::MOVSS: return "X86ISD::MOVSS"; - case X86ISD::UNPCKLPS: return "X86ISD::UNPCKLPS"; - case X86ISD::UNPCKLPD: return "X86ISD::UNPCKLPD"; - case X86ISD::VUNPCKLPDY: return "X86ISD::VUNPCKLPDY"; - case X86ISD::UNPCKHPS: return "X86ISD::UNPCKHPS"; - case X86ISD::UNPCKHPD: return "X86ISD::UNPCKHPD"; - case X86ISD::PUNPCKLBW: return "X86ISD::PUNPCKLBW"; - case X86ISD::PUNPCKLWD: return "X86ISD::PUNPCKLWD"; - case X86ISD::PUNPCKLDQ: return "X86ISD::PUNPCKLDQ"; - case X86ISD::PUNPCKLQDQ: return "X86ISD::PUNPCKLQDQ"; - case X86ISD::PUNPCKHBW: return "X86ISD::PUNPCKHBW"; - case X86ISD::PUNPCKHWD: return "X86ISD::PUNPCKHWD"; - case X86ISD::PUNPCKHDQ: return "X86ISD::PUNPCKHDQ"; - case X86ISD::PUNPCKHQDQ: return "X86ISD::PUNPCKHQDQ"; + case X86ISD::UNPCKL: return "X86ISD::UNPCKL"; + case X86ISD::UNPCKH: return "X86ISD::UNPCKH"; case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST"; - case X86ISD::VPERMILPS: return "X86ISD::VPERMILPS"; - case X86ISD::VPERMILPSY: return "X86ISD::VPERMILPSY"; - case X86ISD::VPERMILPD: return "X86ISD::VPERMILPD"; - case X86ISD::VPERMILPDY: return "X86ISD::VPERMILPDY"; - case X86ISD::VPERM2F128: return "X86ISD::VPERM2F128"; + case X86ISD::VPERMILP: return "X86ISD::VPERMILP"; + case X86ISD::VPERM2X128: return "X86ISD::VPERM2X128"; + case X86ISD::PMULUDQ: return "X86ISD::PMULUDQ"; case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS"; case X86ISD::VAARG_64: return "X86ISD::VAARG_64"; case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA"; case X86ISD::MEMBARRIER: return "X86ISD::MEMBARRIER"; case X86ISD::SEG_ALLOCA: return "X86ISD::SEG_ALLOCA"; + case X86ISD::WIN_FTOL: return "X86ISD::WIN_FTOL"; } } @@ -10855,21 +11250,21 @@ X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M, EVT VT) const { // Very little shuffling can be done for 64-bit vectors right now. if (VT.getSizeInBits() == 64) - return isPALIGNRMask(M, VT, Subtarget->hasSSSE3() || Subtarget->hasAVX()); + return false; // FIXME: pshufb, blends, shifts. 
return (VT.getVectorNumElements() == 2 || ShuffleVectorSDNode::isSplatMask(&M[0], VT) || isMOVLMask(M, VT) || - isSHUFPMask(M, VT) || + isSHUFPMask(M, VT, Subtarget->hasAVX()) || isPSHUFDMask(M, VT) || isPSHUFHWMask(M, VT) || isPSHUFLWMask(M, VT) || - isPALIGNRMask(M, VT, Subtarget->hasSSSE3() || Subtarget->hasAVX()) || - isUNPCKLMask(M, VT) || - isUNPCKHMask(M, VT) || - isUNPCKL_v_undef_Mask(M, VT) || - isUNPCKH_v_undef_Mask(M, VT)); + isPALIGNRMask(M, VT, Subtarget) || + isUNPCKLMask(M, VT, Subtarget->hasAVX2()) || + isUNPCKHMask(M, VT, Subtarget->hasAVX2()) || + isUNPCKL_v_undef_Mask(M, VT, Subtarget->hasAVX2()) || + isUNPCKH_v_undef_Mask(M, VT, Subtarget->hasAVX2())); } bool @@ -10882,8 +11277,8 @@ X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask, if (NumElts == 4 && VT.getSizeInBits() == 128) { return (isMOVLMask(Mask, VT) || isCommutedMOVLMask(Mask, VT, true) || - isSHUFPMask(Mask, VT) || - isCommutedSHUFPMask(Mask, VT)); + isSHUFPMask(Mask, VT, Subtarget->hasAVX()) || + isSHUFPMask(Mask, VT, Subtarget->hasAVX(), /* Commuted */ true)); } return false; } @@ -10902,7 +11297,7 @@ X86TargetLowering::EmitAtomicBitwiseWithCustomInserter(MachineInstr *bInstr, unsigned CXchgOpc, unsigned notOpc, unsigned EAXreg, - TargetRegisterClass *RC, + const TargetRegisterClass *RC, bool invSrc) const { // For the atomic bitwise operator, we generate // thisMBB: @@ -11274,7 +11669,7 @@ X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr, MachineBasicBlock * X86TargetLowering::EmitPCMP(MachineInstr *MI, MachineBasicBlock *BB, unsigned numArgs, bool memArg) const { - assert((Subtarget->hasSSE42() || Subtarget->hasAVX()) && + assert(Subtarget->hasSSE42() && "Target must have SSE4.2 or AVX features enabled"); DebugLoc dl = MI->getDebugLoc(); @@ -11679,6 +12074,42 @@ X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter( return EndMBB; } +// The EFLAGS operand of SelectItr might be missing a kill marker +// because there were multiple uses of EFLAGS, and ISel didn't know +// which to mark. Figure out whether SelectItr should have had a +// kill marker, and set it if it should. Returns the correct kill +// marker value. +static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr, + MachineBasicBlock* BB, + const TargetRegisterInfo* TRI) { + // Scan forward through BB for a use/def of EFLAGS. + MachineBasicBlock::iterator miI(llvm::next(SelectItr)); + for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) { + const MachineInstr& mi = *miI; + if (mi.readsRegister(X86::EFLAGS)) + return false; + if (mi.definesRegister(X86::EFLAGS)) + break; // Should have kill-flag - update below. + } + + // If we hit the end of the block, check whether EFLAGS is live into a + // successor. + if (miI == BB->end()) { + for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(), + sEnd = BB->succ_end(); + sItr != sEnd; ++sItr) { + MachineBasicBlock* succ = *sItr; + if (succ->isLiveIn(X86::EFLAGS)) + return false; + } + } + + // We found a def, or hit the end of the basic block and EFLAGS wasn't live + // out. SelectMI should have a kill flag on EFLAGS. + SelectItr->addRegisterKilled(X86::EFLAGS, TRI); + return true; +} + MachineBasicBlock * X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, MachineBasicBlock *BB) const { @@ -11708,7 +12139,9 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, // If the EFLAGS register isn't dead in the terminator, then claim that it's // live into the sink and copy blocks. 
- if (!MI->killsRegister(X86::EFLAGS)) { + const TargetRegisterInfo* TRI = getTargetMachine().getRegisterInfo(); + if (!MI->killsRegister(X86::EFLAGS) && + !checkAndUpdateEFLAGSKill(MI, BB, TRI)) { copy0MBB->addLiveIn(X86::EFLAGS); sinkMBB->addLiveIn(X86::EFLAGS); } @@ -11753,7 +12186,7 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB, MachineFunction *MF = BB->getParent(); const BasicBlock *LLVM_BB = BB->getBasicBlock(); - assert(EnableSegmentedStacks); + assert(getTargetMachine().Options.EnableSegmentedStacks); unsigned TlsReg = Is64Bit ? X86::FS : X86::GS; unsigned TlsOffset = Is64Bit ? 0x70 : 0x30; @@ -11785,6 +12218,7 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB, unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass), bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass), tmpSPVReg = MRI.createVirtualRegister(AddrRegClass), + SPLimitVReg = MRI.createVirtualRegister(AddrRegClass), sizeVReg = MI->getOperand(1).getReg(), physSPReg = Is64Bit ? X86::RSP : X86::ESP; @@ -11802,33 +12236,39 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB, // Add code to the main basic block to check if the stack limit has been hit, // and if so, jump to mallocMBB otherwise to bumpMBB. BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg); - BuildMI(BB, DL, TII->get(Is64Bit ? X86::SUB64rr:X86::SUB32rr), tmpSPVReg) + BuildMI(BB, DL, TII->get(Is64Bit ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg) .addReg(tmpSPVReg).addReg(sizeVReg); BuildMI(BB, DL, TII->get(Is64Bit ? X86::CMP64mr:X86::CMP32mr)) - .addReg(0).addImm(0).addReg(0).addImm(TlsOffset).addReg(TlsReg) - .addReg(tmpSPVReg); + .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg) + .addReg(SPLimitVReg); BuildMI(BB, DL, TII->get(X86::JG_4)).addMBB(mallocMBB); // bumpMBB simply decreases the stack pointer, since we know the current // stacklet has enough space. BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg) - .addReg(tmpSPVReg); + .addReg(SPLimitVReg); BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg) - .addReg(tmpSPVReg); + .addReg(SPLimitVReg); BuildMI(bumpMBB, DL, TII->get(X86::JMP_4)).addMBB(continueMBB); // Calls into a routine in libgcc to allocate more space from the heap. + const uint32_t *RegMask = + getTargetMachine().getRegisterInfo()->getCallPreservedMask(CallingConv::C); if (Is64Bit) { BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI) .addReg(sizeVReg); BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32)) - .addExternalSymbol("__morestack_allocate_stack_space").addReg(X86::RDI); + .addExternalSymbol("__morestack_allocate_stack_space").addReg(X86::RDI) + .addRegMask(RegMask) + .addReg(X86::RAX, RegState::ImplicitDefine); } else { BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg) .addImm(12); BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg); BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32)) - .addExternalSymbol("__morestack_allocate_stack_space"); + .addExternalSymbol("__morestack_allocate_stack_space") + .addRegMask(RegMask) + .addReg(X86::EAX, RegState::ImplicitDefine); } if (!Is64Bit) @@ -11926,6 +12366,11 @@ X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI, assert(Subtarget->isTargetDarwin() && "Darwin only instr emitted?"); assert(MI->getOperand(3).isGlobal() && "This should be a global"); + // Get a register mask for the lowered call. + // FIXME: The 32-bit calls have non-standard calling conventions. 
Use a + // proper register mask. + const uint32_t *RegMask = + getTargetMachine().getRegisterInfo()->getCallPreservedMask(CallingConv::C); if (Subtarget->is64Bit()) { MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(X86::MOV64rm), X86::RDI) @@ -11936,6 +12381,7 @@ X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI, .addReg(0); MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m)); addDirectMem(MIB, X86::RDI); + MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask); } else if (getTargetMachine().getRelocationModel() != Reloc::PIC_) { MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX) @@ -11946,6 +12392,7 @@ X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI, .addReg(0); MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m)); addDirectMem(MIB, X86::EAX); + MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask); } else { MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX) @@ -11956,6 +12403,7 @@ X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI, .addReg(0); MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m)); addDirectMem(MIB, X86::EAX); + MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask); } MI->eraseFromParent(); // The pseudo instruction is gone now. @@ -11966,30 +12414,14 @@ MachineBasicBlock * X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, MachineBasicBlock *BB) const { switch (MI->getOpcode()) { - default: assert(0 && "Unexpected instr type to insert"); + default: llvm_unreachable("Unexpected instr type to insert"); case X86::TAILJMPd64: case X86::TAILJMPr64: case X86::TAILJMPm64: - assert(0 && "TAILJMP64 would not be touched here."); + llvm_unreachable("TAILJMP64 would not be touched here."); case X86::TCRETURNdi64: case X86::TCRETURNri64: case X86::TCRETURNmi64: - // Defs of TCRETURNxx64 has Win64's callee-saved registers, as subset. - // On AMD64, additional defs should be added before register allocation. - if (!Subtarget->isTargetWin64()) { - MI->addRegisterDefined(X86::RSI); - MI->addRegisterDefined(X86::RDI); - MI->addRegisterDefined(X86::XMM6); - MI->addRegisterDefined(X86::XMM7); - MI->addRegisterDefined(X86::XMM8); - MI->addRegisterDefined(X86::XMM9); - MI->addRegisterDefined(X86::XMM10); - MI->addRegisterDefined(X86::XMM11); - MI->addRegisterDefined(X86::XMM12); - MI->addRegisterDefined(X86::XMM13); - MI->addRegisterDefined(X86::XMM14); - MI->addRegisterDefined(X86::XMM15); - } return BB; case X86::WIN_ALLOCA: return EmitLoweredWinAlloca(MI, BB); @@ -12294,11 +12726,11 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, //===----------------------------------------------------------------------===// void X86TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op, - const APInt &Mask, APInt &KnownZero, APInt &KnownOne, const SelectionDAG &DAG, unsigned Depth) const { + unsigned BitWidth = KnownZero.getBitWidth(); unsigned Opc = Op.getOpcode(); assert((Opc >= ISD::BUILTIN_OP_END || Opc == ISD::INTRINSIC_WO_CHAIN || @@ -12307,7 +12739,7 @@ void X86TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op, "Should use MaskedValueIsZero if you don't know whether Op" " is a target node!"); - KnownZero = KnownOne = APInt(Mask.getBitWidth(), 0); // Don't know anything. + KnownZero = KnownOne = APInt(BitWidth, 0); // Don't know anything. 
switch (Opc) { default: break; case X86ISD::ADD: @@ -12326,8 +12758,7 @@ void X86TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op, break; // Fallthrough case X86ISD::SETCC: - KnownZero |= APInt::getHighBitsSet(Mask.getBitWidth(), - Mask.getBitWidth() - 1); + KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1); break; case ISD::INTRINSIC_WO_CHAIN: { unsigned IntId = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); @@ -12339,18 +12770,20 @@ void X86TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op, case Intrinsic::x86_sse2_movmsk_pd: case Intrinsic::x86_avx_movmsk_pd_256: case Intrinsic::x86_mmx_pmovmskb: - case Intrinsic::x86_sse2_pmovmskb_128: { + case Intrinsic::x86_sse2_pmovmskb_128: + case Intrinsic::x86_avx2_pmovmskb: { // High bits of movmskp{s|d}, pmovmskb are known zero. switch (IntId) { + default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. case Intrinsic::x86_sse_movmsk_ps: NumLoBits = 4; break; case Intrinsic::x86_avx_movmsk_ps_256: NumLoBits = 8; break; case Intrinsic::x86_sse2_movmsk_pd: NumLoBits = 2; break; case Intrinsic::x86_avx_movmsk_pd_256: NumLoBits = 4; break; case Intrinsic::x86_mmx_pmovmskb: NumLoBits = 8; break; case Intrinsic::x86_sse2_pmovmskb_128: NumLoBits = 16; break; + case Intrinsic::x86_avx2_pmovmskb: NumLoBits = 32; break; } - KnownZero = APInt::getHighBitsSet(Mask.getBitWidth(), - Mask.getBitWidth() - NumLoBits); + KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - NumLoBits); break; } } @@ -12418,7 +12851,8 @@ static bool isShuffleLow128VectorInsertHigh(ShuffleVectorSDNode *SVOp) { /// PerformShuffleCombine256 - Performs shuffle combines for 256-bit vectors. static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI) { + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget* Subtarget) { DebugLoc dl = N->getDebugLoc(); ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); SDValue V1 = SVOp->getOperand(0); @@ -12454,9 +12888,23 @@ static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG, !isUndefOrEqual(SVOp->getMaskElt(i+NumElems/2), NumElems)) return SDValue(); + // If V1 is coming from a vector load then just fold to a VZEXT_LOAD. + if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(V1.getOperand(0))) { + SDVTList Tys = DAG.getVTList(MVT::v4i64, MVT::Other); + SDValue Ops[] = { Ld->getChain(), Ld->getBasePtr() }; + SDValue ResNode = + DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, 2, + Ld->getMemoryVT(), + Ld->getPointerInfo(), + Ld->getAlignment(), + false/*isVolatile*/, true/*ReadMem*/, + false/*WriteMem*/); + return DAG.getNode(ISD::BITCAST, dl, VT, ResNode); + } + // Emit a zeroed vector and insert the desired subvector on its // first half. - SDValue Zeros = getZeroVector(VT, true /* HasXMMInt */, DAG, dl); + SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl); SDValue InsV = Insert128BitVector(Zeros, V1.getOperand(0), DAG.getConstant(0, MVT::i32), DAG, dl); return DCI.CombineTo(N, InsV); @@ -12501,7 +12949,7 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG, // Combine 256-bit vector shuffles. This is only profitable when in AVX mode if (Subtarget->hasAVX() && VT.getSizeInBits() == 256 && N->getOpcode() == ISD::VECTOR_SHUFFLE) - return PerformShuffleCombine256(N, DAG, DCI); + return PerformShuffleCombine256(N, DAG, DCI, Subtarget); // Only handle 128 wide vector from here on. 
if (VT.getSizeInBits() != 128) @@ -12517,11 +12965,185 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG, return EltsFromConsecutiveLoads(VT, Elts, dl, DAG); } + +/// PerformTruncateCombine - Converts truncate operation to +/// a sequence of vector shuffle operations. +/// It is possible when we truncate 256-bit vector to 128-bit vector + +SDValue X86TargetLowering::PerformTruncateCombine(SDNode *N, SelectionDAG &DAG, + DAGCombinerInfo &DCI) const { + if (!DCI.isBeforeLegalizeOps()) + return SDValue(); + + if (!Subtarget->hasAVX()) return SDValue(); + + EVT VT = N->getValueType(0); + SDValue Op = N->getOperand(0); + EVT OpVT = Op.getValueType(); + DebugLoc dl = N->getDebugLoc(); + + if ((VT == MVT::v4i32) && (OpVT == MVT::v4i64)) { + + SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Op, + DAG.getIntPtrConstant(0)); + + SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Op, + DAG.getIntPtrConstant(2)); + + OpLo = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, OpLo); + OpHi = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, OpHi); + + // PSHUFD + int ShufMask1[] = {0, 2, 0, 0}; + + OpLo = DAG.getVectorShuffle(VT, dl, OpLo, DAG.getUNDEF(VT), + ShufMask1); + OpHi = DAG.getVectorShuffle(VT, dl, OpHi, DAG.getUNDEF(VT), + ShufMask1); + + // MOVLHPS + int ShufMask2[] = {0, 1, 4, 5}; + + return DAG.getVectorShuffle(VT, dl, OpLo, OpHi, ShufMask2); + } + if ((VT == MVT::v8i16) && (OpVT == MVT::v8i32)) { + + SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i32, Op, + DAG.getIntPtrConstant(0)); + + SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i32, Op, + DAG.getIntPtrConstant(4)); + + OpLo = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, OpLo); + OpHi = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, OpHi); + + // PSHUFB + int ShufMask1[] = {0, 1, 4, 5, 8, 9, 12, 13, + -1, -1, -1, -1, -1, -1, -1, -1}; + + OpLo = DAG.getVectorShuffle(MVT::v16i8, dl, OpLo, + DAG.getUNDEF(MVT::v16i8), + ShufMask1); + OpHi = DAG.getVectorShuffle(MVT::v16i8, dl, OpHi, + DAG.getUNDEF(MVT::v16i8), + ShufMask1); + + OpLo = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, OpLo); + OpHi = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, OpHi); + + // MOVLHPS + int ShufMask2[] = {0, 1, 4, 5}; + + SDValue res = DAG.getVectorShuffle(MVT::v4i32, dl, OpLo, OpHi, ShufMask2); + return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, res); + } + + return SDValue(); +} + +/// XFormVExtractWithShuffleIntoLoad - Check if a vector extract from a target +/// specific shuffle of a load can be folded into a single element load. +/// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but +/// shuffles have been customed lowered so we need to handle those here. +static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI) { + if (DCI.isBeforeLegalizeOps()) + return SDValue(); + + SDValue InVec = N->getOperand(0); + SDValue EltNo = N->getOperand(1); + + if (!isa<ConstantSDNode>(EltNo)) + return SDValue(); + + EVT VT = InVec.getValueType(); + + bool HasShuffleIntoBitcast = false; + if (InVec.getOpcode() == ISD::BITCAST) { + // Don't duplicate a load with other uses. + if (!InVec.hasOneUse()) + return SDValue(); + EVT BCVT = InVec.getOperand(0).getValueType(); + if (BCVT.getVectorNumElements() != VT.getVectorNumElements()) + return SDValue(); + InVec = InVec.getOperand(0); + HasShuffleIntoBitcast = true; + } + + if (!isTargetShuffle(InVec.getOpcode())) + return SDValue(); + + // Don't duplicate a load with other uses. 
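Assuming a little-endian host, here is a scalar sketch of the v4i64 -> v4i32 path introduced above (the function name is invented): split the value into two 128-bit halves, reinterpret each half as four 32-bit lanes, keep lanes {0,2} of each half via the PSHUFD mask, then concatenate the two pairs with the MOVLHPS-style mask.

#include <cstdint>
#include <cassert>
#include <cstring>

void truncate_v4i64_to_v4i32(const uint64_t in[4], uint32_t out[4]) {
  uint32_t lo[4], hi[4];
  std::memcpy(lo, in,     sizeof lo);      // elements 0,1 viewed as 4 x i32
  std::memcpy(hi, in + 2, sizeof hi);      // elements 2,3 viewed as 4 x i32
  // PSHUFD {0,2,_,_} on each half, then a {0,1,4,5} concat of the halves.
  out[0] = lo[0]; out[1] = lo[2];
  out[2] = hi[0]; out[3] = hi[2];
}

int main() {
  const uint64_t v[4] = {0x1111111122222222ULL, 0x3333333344444444ULL,
                         0x5555555566666666ULL, 0x7777777788888888ULL};
  uint32_t r[4];
  truncate_v4i64_to_v4i32(v, r);
  assert(r[0] == 0x22222222u && r[1] == 0x44444444u &&
         r[2] == 0x66666666u && r[3] == 0x88888888u);
  return 0;
}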
+ if (!InVec.hasOneUse()) + return SDValue(); + + SmallVector<int, 16> ShuffleMask; + bool UnaryShuffle; + if (!getTargetShuffleMask(InVec.getNode(), VT, ShuffleMask, UnaryShuffle)) + return SDValue(); + + // Select the input vector, guarding against out of range extract vector. + unsigned NumElems = VT.getVectorNumElements(); + int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue(); + int Idx = (Elt > (int)NumElems) ? -1 : ShuffleMask[Elt]; + SDValue LdNode = (Idx < (int)NumElems) ? InVec.getOperand(0) + : InVec.getOperand(1); + + // If inputs to shuffle are the same for both ops, then allow 2 uses + unsigned AllowedUses = InVec.getOperand(0) == InVec.getOperand(1) ? 2 : 1; + + if (LdNode.getOpcode() == ISD::BITCAST) { + // Don't duplicate a load with other uses. + if (!LdNode.getNode()->hasNUsesOfValue(AllowedUses, 0)) + return SDValue(); + + AllowedUses = 1; // only allow 1 load use if we have a bitcast + LdNode = LdNode.getOperand(0); + } + + if (!ISD::isNormalLoad(LdNode.getNode())) + return SDValue(); + + LoadSDNode *LN0 = cast<LoadSDNode>(LdNode); + + if (!LN0 ||!LN0->hasNUsesOfValue(AllowedUses, 0) || LN0->isVolatile()) + return SDValue(); + + if (HasShuffleIntoBitcast) { + // If there's a bitcast before the shuffle, check if the load type and + // alignment is valid. + unsigned Align = LN0->getAlignment(); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + unsigned NewAlign = TLI.getTargetData()-> + getABITypeAlignment(VT.getTypeForEVT(*DAG.getContext())); + + if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, VT)) + return SDValue(); + } + + // All checks match so transform back to vector_shuffle so that DAG combiner + // can finish the job + DebugLoc dl = N->getDebugLoc(); + + // Create shuffle node taking into account the case that its a unary shuffle + SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(VT) : InVec.getOperand(1); + Shuffle = DAG.getVectorShuffle(InVec.getValueType(), dl, + InVec.getOperand(0), Shuffle, + &ShuffleMask[0]); + Shuffle = DAG.getNode(ISD::BITCAST, dl, VT, Shuffle); + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle, + EltNo); +} + /// PerformEXTRACT_VECTOR_ELTCombine - Detect vector gather/scatter index /// generation and convert it from being a bunch of shuffles and extracts /// to a simple store and scalar loads to extract the elements. static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG, - const TargetLowering &TLI) { + TargetLowering::DAGCombinerInfo &DCI) { + SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI); + if (NewOp.getNode()) + return NewOp; + SDValue InputVector = N->getOperand(0); // Only operate on vectors of 4 elements, where the alternative shuffling @@ -12582,6 +13204,7 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG, unsigned EltSize = InputVector.getValueType().getVectorElementType().getSizeInBits()/8; uint64_t Offset = EltSize * cast<ConstantSDNode>(Idx)->getZExtValue(); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); SDValue OffsetVal = DAG.getConstant(Offset, TLI.getPointerTy()); SDValue ScalarAddr = DAG.getNode(ISD::ADD, dl, TLI.getPointerTy(), @@ -12590,7 +13213,7 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG, // Load the scalar. SDValue LoadScalar = DAG.getLoad(Extract->getValueType(0), dl, Ch, ScalarAddr, MachinePointerInfo(), - false, false, 0); + false, false, false, 0); // Replace the exact with the load. 
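Once the shuffle mask has already resolved which source element the extract index refers to, the fold above reduces to a plain scalar load at a byte offset; roughly, with a hypothetical helper in standard C++ only:

#include <cstdint>
#include <cassert>
#include <cstring>
#include <cstddef>

uint32_t extract_via_scalar_load(const void *vecAddr, size_t eltSize,
                                 size_t idx) {
  uint32_t elt = 0;
  // One scalar load at offset eltSize * idx, no vector extract needed.
  std::memcpy(&elt, static_cast<const char *>(vecAddr) + eltSize * idx,
              eltSize);
  return elt;
}

int main() {
  const uint32_t vec[4] = {10, 20, 30, 40};
  assert(extract_via_scalar_load(vec, sizeof(uint32_t), 2) == 30u);
  return 0;
}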
DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), LoadScalar); @@ -12603,7 +13226,10 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG, /// PerformSELECTCombine - Do target-specific dag combines on SELECT and VSELECT /// nodes. static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget *Subtarget) { + + DebugLoc DL = N->getDebugLoc(); SDValue Cond = N->getOperand(0); // Get the LHS/RHS of the select. @@ -12617,7 +13243,7 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, // ignored in unsafe-math mode). if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() && VT != MVT::f80 && DAG.getTargetLoweringInfo().isTypeLegal(VT) && - (Subtarget->hasXMMInt() || + (Subtarget->hasSSE2() || (Subtarget->hasSSE1() && VT.getScalarType() == MVT::f32))) { ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); @@ -12632,7 +13258,7 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, // the operands would cause it to handle comparisons between positive // and negative zero incorrectly. if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) { - if (!UnsafeFPMath && + if (!DAG.getTarget().Options.UnsafeFPMath && !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) break; std::swap(LHS, RHS); @@ -12642,7 +13268,7 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, case ISD::SETOLE: // Converting this to a min would handle comparisons between positive // and negative zero incorrectly. - if (!UnsafeFPMath && + if (!DAG.getTarget().Options.UnsafeFPMath && !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) break; Opcode = X86ISD::FMIN; @@ -12660,7 +13286,7 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, case ISD::SETOGE: // Converting this to a max would handle comparisons between positive // and negative zero incorrectly. - if (!UnsafeFPMath && + if (!DAG.getTarget().Options.UnsafeFPMath && !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) break; Opcode = X86ISD::FMAX; @@ -12670,7 +13296,7 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, // the operands would cause it to handle comparisons between positive // and negative zero incorrectly. if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) { - if (!UnsafeFPMath && + if (!DAG.getTarget().Options.UnsafeFPMath && !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) break; std::swap(LHS, RHS); @@ -12696,7 +13322,7 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, // Converting this to a min would handle comparisons between positive // and negative zero incorrectly, and swapping the operands would // cause it to handle NaNs incorrectly. - if (!UnsafeFPMath && + if (!DAG.getTarget().Options.UnsafeFPMath && !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) { if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) break; @@ -12706,7 +13332,7 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, break; case ISD::SETUGT: // Converting this to a min would handle NaNs incorrectly. 
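The UnsafeFPMath guards threaded through this combine exist because +0.0 and -0.0 compare equal, so a min/max built from a compare-and-select cannot distinguish them and the result's sign depends on operand order. A small standalone illustration:

#include <cassert>
#include <cmath>

float select_min(float x, float y) { return x < y ? x : y; }

int main() {
  float pz = +0.0f, nz = -0.0f;
  assert(pz == nz);                            // they compare equal...
  assert(std::signbit(select_min(pz, nz)) !=
         std::signbit(select_min(nz, pz)));    // ...yet the selected sign
                                               // depends on operand order
  return 0;
}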
- if (!UnsafeFPMath && + if (!DAG.getTarget().Options.UnsafeFPMath && (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))) break; Opcode = X86ISD::FMIN; @@ -12731,7 +13357,7 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, // Converting this to a max would handle comparisons between positive // and negative zero incorrectly, and swapping the operands would // cause it to handle NaNs incorrectly. - if (!UnsafeFPMath && + if (!DAG.getTarget().Options.UnsafeFPMath && !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) { if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) break; @@ -12848,6 +13474,57 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, } } + // Canonicalize max and min: + // (x > y) ? x : y -> (x >= y) ? x : y + // (x < y) ? x : y -> (x <= y) ? x : y + // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates + // the need for an extra compare + // against zero. e.g. + // (x - y) > 0 : (x - y) ? 0 -> (x - y) >= 0 : (x - y) ? 0 + // subl %esi, %edi + // testl %edi, %edi + // movl $0, %eax + // cmovgl %edi, %eax + // => + // xorl %eax, %eax + // subl %esi, $edi + // cmovsl %eax, %edi + if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC && + DAG.isEqualTo(LHS, Cond.getOperand(0)) && + DAG.isEqualTo(RHS, Cond.getOperand(1))) { + ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); + switch (CC) { + default: break; + case ISD::SETLT: + case ISD::SETGT: { + ISD::CondCode NewCC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGE; + Cond = DAG.getSetCC(Cond.getDebugLoc(), Cond.getValueType(), + Cond.getOperand(0), Cond.getOperand(1), NewCC); + return DAG.getNode(ISD::SELECT, DL, VT, Cond, LHS, RHS); + } + } + } + + // If we know that this node is legal then we know that it is going to be + // matched by one of the SSE/AVX BLEND instructions. These instructions only + // depend on the highest bit in each word. Try to use SimplifyDemandedBits + // to simplify previous instructions. + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() && + !DCI.isBeforeLegalize() && + TLI.isOperationLegal(ISD::VSELECT, VT)) { + unsigned BitWidth = Cond.getValueType().getScalarType().getSizeInBits(); + assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size"); + APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 1); + + APInt KnownZero, KnownOne; + TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(), + DCI.isBeforeLegalizeOps()); + if (TLO.ShrinkDemandedConstant(Cond, DemandedMask) || + TLI.SimplifyDemandedBits(Cond, DemandedMask, KnownZero, KnownOne, TLO)) + DCI.CommitTargetLoweringOpt(TLO); + } + return SDValue(); } @@ -13042,7 +13719,8 @@ static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) { // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2)) // since the result of setcc_c is all zero's or all ones. - if (N1C && N0.getOpcode() == ISD::AND && + if (VT.isInteger() && !VT.isVector() && + N1C && N0.getOpcode() == ISD::AND && N0.getOperand(1).getOpcode() == ISD::Constant) { SDValue N00 = N0.getOperand(0); if (N00.getOpcode() == X86ISD::SETCC_CARRY || @@ -13058,26 +13736,46 @@ static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) { } } + + // Hardware support for vector shifts is sparse which makes us scalarize the + // vector operations in many cases. Also, on sandybridge ADD is faster than + // shl. 
+ // (shl V, 1) -> add V,V + if (isSplatVector(N1.getNode())) { + assert(N0.getValueType().isVector() && "Invalid vector shift type"); + ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1->getOperand(0)); + // We shift all of the values by one. In many cases we do not have + // hardware support for this operation. This is better expressed as an ADD + // of two values. + if (N1C && (1 == N1C->getZExtValue())) { + return DAG.getNode(ISD::ADD, N->getDebugLoc(), VT, N0, N0); + } + } + return SDValue(); } /// PerformShiftCombine - Transforms vector shift nodes to use vector shifts /// when possible. static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget *Subtarget) { EVT VT = N->getValueType(0); - if (!VT.isVector() && VT.isInteger() && - N->getOpcode() == ISD::SHL) - return PerformSHLCombine(N, DAG); + if (N->getOpcode() == ISD::SHL) { + SDValue V = PerformSHLCombine(N, DAG); + if (V.getNode()) return V; + } // On X86 with SSE2 support, we can transform this to a vector shift if // all elements are shifted by the same amount. We can't do this in legalize // because the a constant vector is typically transformed to a constant pool // so we have no knowledge of the shift amount. - if (!Subtarget->hasXMMInt()) + if (!Subtarget->hasSSE2()) return SDValue(); - if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16) + if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16 && + (!Subtarget->hasAVX2() || + (VT != MVT::v4i64 && VT != MVT::v8i32 && VT != MVT::v16i16))) return SDValue(); SDValue ShAmtOp = N->getOperand(1); @@ -13093,6 +13791,11 @@ static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG, BaseShAmt = Arg; break; } + // Handle the case where the build_vector is all undef + // FIXME: Should DAG allow this? + if (i == NumElts) + return SDValue(); + for (; i != NumElts; ++i) { SDValue Arg = ShAmtOp.getOperand(i); if (Arg.getOpcode() == ISD::UNDEF) continue; @@ -13119,9 +13822,16 @@ static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG, BaseShAmt = InVec.getOperand(1); } } - if (BaseShAmt.getNode() == 0) + if (BaseShAmt.getNode() == 0) { + // Don't create instructions with illegal types after legalize + // types has run. 
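A quick standalone check of the rewrite described in the comment above: shifting left by one and adding a value to itself are bit-identical per lane, and the add form sidesteps the missing or slower per-element vector shift.

#include <cstdint>
#include <cassert>

int main() {
  const uint32_t lanes[4] = {0u, 1u, 0x40000000u, 0xFFFFFFFFu};
  for (uint32_t v : lanes)
    assert(static_cast<uint32_t>(v << 1) == static_cast<uint32_t>(v + v));
  return 0;
}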
+ if (!DAG.getTargetLoweringInfo().isTypeLegal(EltVT) && + !DCI.isBeforeLegalize()) + return SDValue(); + BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, ShAmtOp, DAG.getIntPtrConstant(0)); + } } else return SDValue(); @@ -13136,47 +13846,38 @@ static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG, switch (N->getOpcode()) { default: llvm_unreachable("Unknown shift opcode!"); - break; case ISD::SHL: - if (VT == MVT::v2i64) - return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, - DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32), - ValOp, BaseShAmt); - if (VT == MVT::v4i32) - return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, - DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32), - ValOp, BaseShAmt); - if (VT == MVT::v8i16) - return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, - DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), - ValOp, BaseShAmt); - break; + switch (VT.getSimpleVT().SimpleTy) { + default: return SDValue(); + case MVT::v2i64: + case MVT::v4i32: + case MVT::v8i16: + case MVT::v4i64: + case MVT::v8i32: + case MVT::v16i16: + return getTargetVShiftNode(X86ISD::VSHLI, DL, VT, ValOp, BaseShAmt, DAG); + } case ISD::SRA: - if (VT == MVT::v4i32) - return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, - DAG.getConstant(Intrinsic::x86_sse2_psrai_d, MVT::i32), - ValOp, BaseShAmt); - if (VT == MVT::v8i16) - return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, - DAG.getConstant(Intrinsic::x86_sse2_psrai_w, MVT::i32), - ValOp, BaseShAmt); - break; + switch (VT.getSimpleVT().SimpleTy) { + default: return SDValue(); + case MVT::v4i32: + case MVT::v8i16: + case MVT::v8i32: + case MVT::v16i16: + return getTargetVShiftNode(X86ISD::VSRAI, DL, VT, ValOp, BaseShAmt, DAG); + } case ISD::SRL: - if (VT == MVT::v2i64) - return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, - DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32), - ValOp, BaseShAmt); - if (VT == MVT::v4i32) - return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, - DAG.getConstant(Intrinsic::x86_sse2_psrli_d, MVT::i32), - ValOp, BaseShAmt); - if (VT == MVT::v8i16) - return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, - DAG.getConstant(Intrinsic::x86_sse2_psrli_w, MVT::i32), - ValOp, BaseShAmt); - break; + switch (VT.getSimpleVT().SimpleTy) { + default: return SDValue(); + case MVT::v2i64: + case MVT::v4i32: + case MVT::v8i16: + case MVT::v4i64: + case MVT::v8i32: + case MVT::v16i16: + return getTargetVShiftNode(X86ISD::VSRLI, DL, VT, ValOp, BaseShAmt, DAG); + } } - return SDValue(); } @@ -13190,7 +13891,7 @@ static SDValue CMPEQCombine(SDNode *N, SelectionDAG &DAG, // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but // we're requiring SSE2 for both. 
- if (Subtarget->hasXMMInt() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) { + if (Subtarget->hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); SDValue CMP0 = N0->getOperand(1); @@ -13300,7 +14001,9 @@ static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG, EVT VT = N->getValueType(0); - // Create ANDN instructions + // Create ANDN, BLSI, and BLSR instructions + // BLSI is X & (-X) + // BLSR is X & (X-1) if (Subtarget->hasBMI() && (VT == MVT::i32 || VT == MVT::i64)) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); @@ -13313,6 +14016,26 @@ static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG, if (N1.getOpcode() == ISD::XOR && isAllOnes(N1.getOperand(1))) return DAG.getNode(X86ISD::ANDN, DL, VT, N1.getOperand(0), N0); + // Check LHS for neg + if (N0.getOpcode() == ISD::SUB && N0.getOperand(1) == N1 && + isZero(N0.getOperand(0))) + return DAG.getNode(X86ISD::BLSI, DL, VT, N1); + + // Check RHS for neg + if (N1.getOpcode() == ISD::SUB && N1.getOperand(1) == N0 && + isZero(N1.getOperand(0))) + return DAG.getNode(X86ISD::BLSI, DL, VT, N0); + + // Check LHS for X-1 + if (N0.getOpcode() == ISD::ADD && N0.getOperand(0) == N1 && + isAllOnes(N0.getOperand(1))) + return DAG.getNode(X86ISD::BLSR, DL, VT, N1); + + // Check RHS for X-1 + if (N1.getOpcode() == ISD::ADD && N1.getOperand(0) == N0 && + isAllOnes(N1.getOperand(1))) + return DAG.getNode(X86ISD::BLSR, DL, VT, N0); + return SDValue(); } @@ -13353,98 +14076,87 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG, return R; EVT VT = N->getValueType(0); - if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64 && VT != MVT::v2i64) - return SDValue(); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); // look for psign/blend - if (Subtarget->hasSSSE3() || Subtarget->hasAVX()) { - if (VT == MVT::v2i64) { - // Canonicalize pandn to RHS - if (N0.getOpcode() == X86ISD::ANDNP) - std::swap(N0, N1); - // or (and (m, x), (pandn m, y)) - if (N0.getOpcode() == ISD::AND && N1.getOpcode() == X86ISD::ANDNP) { - SDValue Mask = N1.getOperand(0); - SDValue X = N1.getOperand(1); - SDValue Y; - if (N0.getOperand(0) == Mask) - Y = N0.getOperand(1); - if (N0.getOperand(1) == Mask) - Y = N0.getOperand(0); - - // Check to see if the mask appeared in both the AND and ANDNP and - if (!Y.getNode()) - return SDValue(); - - // Validate that X, Y, and Mask are BIT_CONVERTS, and see through them. - if (Mask.getOpcode() != ISD::BITCAST || - X.getOpcode() != ISD::BITCAST || - Y.getOpcode() != ISD::BITCAST) - return SDValue(); - - // Look through mask bitcast. - Mask = Mask.getOperand(0); - EVT MaskVT = Mask.getValueType(); - - // Validate that the Mask operand is a vector sra node. The sra node - // will be an intrinsic. - if (Mask.getOpcode() != ISD::INTRINSIC_WO_CHAIN) - return SDValue(); - - // FIXME: what to do for bytes, since there is a psignb/pblendvb, but - // there is no psrai.b - switch (cast<ConstantSDNode>(Mask.getOperand(0))->getZExtValue()) { - case Intrinsic::x86_sse2_psrai_w: - case Intrinsic::x86_sse2_psrai_d: - break; - default: return SDValue(); - } - - // Check that the SRA is all signbits. 
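The identities behind the new BLSI/BLSR (and existing ANDN) patterns, verified on a few values in plain C++ (no BMI instructions involved, just the arithmetic): BLSI(x) = x & -x isolates the lowest set bit, BLSR(x) = x & (x - 1) clears it, and ANDN(x, y) = ~x & y.

#include <cstdint>
#include <cassert>

int main() {
  for (uint32_t x : {0x00000000u, 0x00000001u, 0x0000B0B0u, 0x80000000u}) {
    uint32_t blsi = x & (0u - x);            // lowest set bit, isolated
    uint32_t blsr = x & (x - 1u);            // x with its lowest set bit cleared
    assert((blsi & (blsi - 1u)) == 0u);      // at most one bit set
    assert((blsi | blsr) == x);              // together they rebuild x
    assert((blsi & blsr) == 0u);             // and they do not overlap
  }
  assert((~0x0Fu & 0xFFu) == 0xF0u);         // ANDN(0x0F, 0xFF) == 0xF0
  return 0;
}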
- SDValue SraC = Mask.getOperand(2); - unsigned SraAmt = cast<ConstantSDNode>(SraC)->getZExtValue(); - unsigned EltBits = MaskVT.getVectorElementType().getSizeInBits(); - if ((SraAmt + 1) != EltBits) - return SDValue(); + if (VT == MVT::v2i64 || VT == MVT::v4i64) { + if (!Subtarget->hasSSSE3() || + (VT == MVT::v4i64 && !Subtarget->hasAVX2())) + return SDValue(); - DebugLoc DL = N->getDebugLoc(); + // Canonicalize pandn to RHS + if (N0.getOpcode() == X86ISD::ANDNP) + std::swap(N0, N1); + // or (and (m, y), (pandn m, x)) + if (N0.getOpcode() == ISD::AND && N1.getOpcode() == X86ISD::ANDNP) { + SDValue Mask = N1.getOperand(0); + SDValue X = N1.getOperand(1); + SDValue Y; + if (N0.getOperand(0) == Mask) + Y = N0.getOperand(1); + if (N0.getOperand(1) == Mask) + Y = N0.getOperand(0); + + // Check to see if the mask appeared in both the AND and ANDNP and + if (!Y.getNode()) + return SDValue(); - // Now we know we at least have a plendvb with the mask val. See if - // we can form a psignb/w/d. - // psign = x.type == y.type == mask.type && y = sub(0, x); + // Validate that X, Y, and Mask are BIT_CONVERTS, and see through them. + // Look through mask bitcast. + if (Mask.getOpcode() == ISD::BITCAST) + Mask = Mask.getOperand(0); + if (X.getOpcode() == ISD::BITCAST) X = X.getOperand(0); + if (Y.getOpcode() == ISD::BITCAST) Y = Y.getOperand(0); - if (Y.getOpcode() == ISD::SUB && Y.getOperand(1) == X && - ISD::isBuildVectorAllZeros(Y.getOperand(0).getNode()) && - X.getValueType() == MaskVT && X.getValueType() == Y.getValueType()){ - unsigned Opc = 0; - switch (EltBits) { - case 8: Opc = X86ISD::PSIGNB; break; - case 16: Opc = X86ISD::PSIGNW; break; - case 32: Opc = X86ISD::PSIGND; break; - default: break; - } - if (Opc) { - SDValue Sign = DAG.getNode(Opc, DL, MaskVT, X, Mask.getOperand(1)); - return DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Sign); - } - } - // PBLENDVB only available on SSE 4.1 - if (!(Subtarget->hasSSE41() || Subtarget->hasAVX())) - return SDValue(); - - X = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, X); - Y = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Y); - Mask = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Mask); - Mask = DAG.getNode(ISD::VSELECT, DL, MVT::v16i8, Mask, X, Y); - return DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Mask); + + EVT MaskVT = Mask.getValueType(); + + // Validate that the Mask operand is a vector sra node. + // FIXME: what to do for bytes, since there is a psignb/pblendvb, but + // there is no psrai.b + if (Mask.getOpcode() != X86ISD::VSRAI) + return SDValue(); + + // Check that the SRA is all signbits. + SDValue SraC = Mask.getOperand(1); + unsigned SraAmt = cast<ConstantSDNode>(SraC)->getZExtValue(); + unsigned EltBits = MaskVT.getVectorElementType().getSizeInBits(); + if ((SraAmt + 1) != EltBits) + return SDValue(); + + DebugLoc DL = N->getDebugLoc(); + + // Now we know we at least have a plendvb with the mask val. See if + // we can form a psignb/w/d. + // psign = x.type == y.type == mask.type && y = sub(0, x); + if (Y.getOpcode() == ISD::SUB && Y.getOperand(1) == X && + ISD::isBuildVectorAllZeros(Y.getOperand(0).getNode()) && + X.getValueType() == MaskVT && Y.getValueType() == MaskVT) { + assert((EltBits == 8 || EltBits == 16 || EltBits == 32) && + "Unsupported VT for PSIGN"); + Mask = DAG.getNode(X86ISD::PSIGN, DL, MaskVT, X, Mask.getOperand(0)); + return DAG.getNode(ISD::BITCAST, DL, VT, Mask); } + // PBLENDVB only available on SSE 4.1 + if (!Subtarget->hasSSE41()) + return SDValue(); + + EVT BlendVT = (VT == MVT::v4i64) ? 
MVT::v32i8 : MVT::v16i8; + + X = DAG.getNode(ISD::BITCAST, DL, BlendVT, X); + Y = DAG.getNode(ISD::BITCAST, DL, BlendVT, Y); + Mask = DAG.getNode(ISD::BITCAST, DL, BlendVT, Mask); + Mask = DAG.getNode(ISD::VSELECT, DL, BlendVT, Mask, Y, X); + return DAG.getNode(ISD::BITCAST, DL, VT, Mask); } } + if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64) + return SDValue(); + // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c) if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL) std::swap(N0, N1); @@ -13500,6 +14212,36 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); } +// PerformXorCombine - Attempts to turn XOR nodes into BLSMSK nodes +static SDValue PerformXorCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget *Subtarget) { + if (DCI.isBeforeLegalizeOps()) + return SDValue(); + + EVT VT = N->getValueType(0); + + if (VT != MVT::i32 && VT != MVT::i64) + return SDValue(); + + assert(Subtarget->hasBMI() && "Creating BLSMSK requires BMI instructions"); + + // Create BLSMSK instructions by finding X ^ (X-1) + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + DebugLoc DL = N->getDebugLoc(); + + if (N0.getOpcode() == ISD::ADD && N0.getOperand(0) == N1 && + isAllOnes(N0.getOperand(1))) + return DAG.getNode(X86ISD::BLSMSK, DL, VT, N1); + + if (N1.getOpcode() == ISD::ADD && N1.getOperand(0) == N0 && + isAllOnes(N1.getOperand(1))) + return DAG.getNode(X86ISD::BLSMSK, DL, VT, N0); + + return SDValue(); +} + /// PerformLOADCombine - Do target-specific dag combines on LOAD nodes. static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG, const X86Subtarget *Subtarget) { @@ -13515,7 +14257,8 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG, // shuffle. We need SSE4 for the shuffles. // TODO: It is possible to support ZExt by zeroing the undef values // during the shuffle phase or after the shuffle. - if (RegVT.isVector() && Ext == ISD::EXTLOAD && Subtarget->hasSSE41()) { + if (RegVT.isVector() && RegVT.isInteger() && + Ext == ISD::EXTLOAD && Subtarget->hasSSE41()) { assert(MemVT != RegVT && "Cannot extend to the same type"); assert(MemVT.isVector() && "Must load a vector from memory"); @@ -13553,7 +14296,8 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG, SDValue ScalarLoad = DAG.getLoad(SclrLoadTy, dl, Ld->getChain(), Ld->getBasePtr(), Ld->getPointerInfo(), Ld->isVolatile(), - Ld->isNonTemporal(), Ld->getAlignment()); + Ld->isNonTemporal(), Ld->isInvariant(), + Ld->getAlignment()); // Insert the word loaded into a vector. SDValue ScalarInVector = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, @@ -13561,7 +14305,8 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG, // Bitcast the loaded value to a vector of the original element type, in // the size of the target vector type. - SDValue SlicedVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, ScalarInVector); + SDValue SlicedVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, + ScalarInVector); unsigned SizeRatio = RegSz/MemSz; // Redistribute the loaded elements into the different locations. @@ -13593,7 +14338,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, SDValue StoredVal = St->getOperand(1); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - // If we are saving a concatination of two XMM registers, perform two stores. + // If we are saving a concatenation of two XMM registers, perform two stores. 
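Likewise for the XOR combine: BLSMSK is x ^ (x - 1), a mask running from bit 0 up to and including the lowest set bit. A minimal check:

#include <cstdint>
#include <cassert>

int main() {
  assert((0x00000008u ^ (0x00000008u - 1u)) == 0x0000000Fu);
  assert((0x000B0000u ^ (0x000B0000u - 1u)) == 0x0001FFFFu);
  assert((0x00000001u ^ (0x00000001u - 1u)) == 0x00000001u);
  return 0;
}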
// This is better in Sandy Bridge cause one 256-bit mem op is done via two // 128-bit ones. If in the future the cost becomes only one memory access the // first version would be better. @@ -13703,8 +14448,8 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, const Function *F = DAG.getMachineFunction().getFunction(); bool NoImplicitFloatOps = F->hasFnAttr(Attribute::NoImplicitFloat); - bool F64IsLegal = !UseSoftFloat && !NoImplicitFloatOps - && Subtarget->hasXMMInt(); + bool F64IsLegal = !DAG.getTarget().Options.UseSoftFloat && !NoImplicitFloatOps + && Subtarget->hasSSE2(); if ((VT.isVector() || (VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit())) && isa<LoadSDNode>(St->getValue()) && @@ -13722,7 +14467,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, Ld = cast<LoadSDNode>(St->getChain()); else if (St->getValue().hasOneUse() && ChainVal->getOpcode() == ISD::TokenFactor) { - for (unsigned i=0, e = ChainVal->getNumOperands(); i != e; ++i) { + for (unsigned i = 0, e = ChainVal->getNumOperands(); i != e; ++i) { if (ChainVal->getOperand(i).getNode() == LdVal) { TokenFactorIndex = i; Ld = cast<LoadSDNode>(St->getValue()); @@ -13749,7 +14494,8 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, EVT LdVT = Subtarget->is64Bit() ? MVT::i64 : MVT::f64; SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(), Ld->getPointerInfo(), Ld->isVolatile(), - Ld->isNonTemporal(), Ld->getAlignment()); + Ld->isNonTemporal(), Ld->isInvariant(), + Ld->getAlignment()); SDValue NewChain = NewLd.getValue(1); if (TokenFactorIndex != -1) { Ops.push_back(NewChain); @@ -13770,10 +14516,11 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr, Ld->getPointerInfo(), Ld->isVolatile(), Ld->isNonTemporal(), - Ld->getAlignment()); + Ld->isInvariant(), Ld->getAlignment()); SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr, Ld->getPointerInfo().getWithOffset(4), Ld->isVolatile(), Ld->isNonTemporal(), + Ld->isInvariant(), MinAlign(Ld->getAlignment(), 4)); SDValue NewChain = LoLd.getValue(1); @@ -13817,7 +14564,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, /// set to A, RHS to B, and the routine returns 'true'. /// Note that the binary operation should have the property that if one of the /// operands is UNDEF then the result is UNDEF. -static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool isCommutative) { +static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) { // Look for the following pattern: if // A = < float a0, float a1, float a2, float a3 > // B = < float b0, float b1, float b2, float b3 > @@ -13833,7 +14580,18 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool isCommutative) { return false; EVT VT = LHS.getValueType(); - unsigned N = VT.getVectorNumElements(); + + assert((VT.is128BitVector() || VT.is256BitVector()) && + "Unsupported vector type for horizontal add/sub"); + + // Handle 128 and 256-bit vector lengths. AVX defines horizontal add/sub to + // operate independently on 128-bit lanes. 
+ unsigned NumElts = VT.getVectorNumElements(); + unsigned NumLanes = VT.getSizeInBits()/128; + unsigned NumLaneElts = NumElts / NumLanes; + assert((NumLaneElts % 2 == 0) && + "Vector type should have an even number of elements in each lane"); + unsigned HalfLaneElts = NumLaneElts/2; // View LHS in the form // LHS = VECTOR_SHUFFLE A, B, LMask @@ -13842,34 +14600,36 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool isCommutative) { // NOTE: in what follows a default initialized SDValue represents an UNDEF of // type VT. SDValue A, B; - SmallVector<int, 8> LMask(N); + SmallVector<int, 16> LMask(NumElts); if (LHS.getOpcode() == ISD::VECTOR_SHUFFLE) { if (LHS.getOperand(0).getOpcode() != ISD::UNDEF) A = LHS.getOperand(0); if (LHS.getOperand(1).getOpcode() != ISD::UNDEF) B = LHS.getOperand(1); - cast<ShuffleVectorSDNode>(LHS.getNode())->getMask(LMask); + ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(LHS.getNode())->getMask(); + std::copy(Mask.begin(), Mask.end(), LMask.begin()); } else { if (LHS.getOpcode() != ISD::UNDEF) A = LHS; - for (unsigned i = 0; i != N; ++i) + for (unsigned i = 0; i != NumElts; ++i) LMask[i] = i; } // Likewise, view RHS in the form // RHS = VECTOR_SHUFFLE C, D, RMask SDValue C, D; - SmallVector<int, 8> RMask(N); + SmallVector<int, 16> RMask(NumElts); if (RHS.getOpcode() == ISD::VECTOR_SHUFFLE) { if (RHS.getOperand(0).getOpcode() != ISD::UNDEF) C = RHS.getOperand(0); if (RHS.getOperand(1).getOpcode() != ISD::UNDEF) D = RHS.getOperand(1); - cast<ShuffleVectorSDNode>(RHS.getNode())->getMask(RMask); + ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(RHS.getNode())->getMask(); + std::copy(Mask.begin(), Mask.end(), RMask.begin()); } else { if (RHS.getOpcode() != ISD::UNDEF) C = RHS; - for (unsigned i = 0; i != N; ++i) + for (unsigned i = 0; i != NumElts; ++i) RMask[i] = i; } @@ -13884,30 +14644,28 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool isCommutative) { // If A and B occur in reverse order in RHS, then "swap" them (which means // rewriting the mask). if (A != C) - for (unsigned i = 0; i != N; ++i) { - unsigned Idx = RMask[i]; - if (Idx < N) - RMask[i] += N; - else if (Idx < 2*N) - RMask[i] -= N; - } + CommuteVectorShuffleMask(RMask, NumElts); // At this point LHS and RHS are equivalent to // LHS = VECTOR_SHUFFLE A, B, LMask // RHS = VECTOR_SHUFFLE A, B, RMask // Check that the masks correspond to performing a horizontal operation. - for (unsigned i = 0; i != N; ++i) { - unsigned LIdx = LMask[i], RIdx = RMask[i]; + for (unsigned i = 0; i != NumElts; ++i) { + int LIdx = LMask[i], RIdx = RMask[i]; // Ignore any UNDEF components. - if (LIdx >= 2*N || RIdx >= 2*N || (!A.getNode() && (LIdx < N || RIdx < N)) - || (!B.getNode() && (LIdx >= N || RIdx >= N))) + if (LIdx < 0 || RIdx < 0 || + (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) || + (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts))) continue; // Check that successive elements are being operated on. If not, this is // not a horizontal operation. 
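For reference, a scalar model of what the matched horizontal op computes on one 128-bit v4f32 lane (HADDPS pairs adjacent elements of each source and concatenates the sums); the helper name is illustrative only.

#include <cassert>

void haddps_model(const float a[4], const float b[4], float r[4]) {
  r[0] = a[0] + a[1];
  r[1] = a[2] + a[3];
  r[2] = b[0] + b[1];
  r[3] = b[2] + b[3];
}

int main() {
  const float a[4] = {1, 2, 3, 4}, b[4] = {10, 20, 30, 40};
  float r[4];
  haddps_model(a, b, r);
  assert(r[0] == 3 && r[1] == 7 && r[2] == 30 && r[3] == 70);
  return 0;
}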
- if (!(LIdx == 2*i && RIdx == 2*i + 1) && - !(isCommutative && LIdx == 2*i + 1 && RIdx == 2*i)) + unsigned Src = (i/HalfLaneElts) % 2; // each lane is split between srcs + unsigned LaneStart = (i/NumLaneElts) * NumLaneElts; + int Index = 2*(i%HalfLaneElts) + NumElts*Src + LaneStart; + if (!(LIdx == Index && RIdx == Index + 1) && + !(IsCommutative && LIdx == Index + 1 && RIdx == Index)) return false; } @@ -13924,8 +14682,8 @@ static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG, SDValue RHS = N->getOperand(1); // Try to synthesize horizontal adds from adds of shuffles. - if ((Subtarget->hasSSE3() || Subtarget->hasAVX()) && - (VT == MVT::v4f32 || VT == MVT::v2f64) && + if (((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) || + (Subtarget->hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) && isHorizontalBinOp(LHS, RHS, true)) return DAG.getNode(X86ISD::FHADD, N->getDebugLoc(), VT, LHS, RHS); return SDValue(); @@ -13939,8 +14697,8 @@ static SDValue PerformFSUBCombine(SDNode *N, SelectionDAG &DAG, SDValue RHS = N->getOperand(1); // Try to synthesize horizontal subs from subs of shuffles. - if ((Subtarget->hasSSE3() || Subtarget->hasAVX()) && - (VT == MVT::v4f32 || VT == MVT::v2f64) && + if (((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) || + (Subtarget->hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) && isHorizontalBinOp(LHS, RHS, false)) return DAG.getNode(X86ISD::FHSUB, N->getDebugLoc(), VT, LHS, RHS); return SDValue(); @@ -14006,7 +14764,58 @@ static SDValue PerformVZEXT_MOVLCombine(SDNode *N, SelectionDAG &DAG) { return SDValue(); } -static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG) { +static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget *Subtarget) { + if (!DCI.isBeforeLegalizeOps()) + return SDValue(); + + if (!Subtarget->hasAVX()) + return SDValue(); + + // Optimize vectors in AVX mode + // Sign extend v8i16 to v8i32 and + // v4i32 to v4i64 + // + // Divide input vector into two parts + // for v4i32 the shuffle mask will be { 0, 1, -1, -1} {2, 3, -1, -1} + // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32 + // concat the vectors to original VT + + EVT VT = N->getValueType(0); + SDValue Op = N->getOperand(0); + EVT OpVT = Op.getValueType(); + DebugLoc dl = N->getDebugLoc(); + + if ((VT == MVT::v4i64 && OpVT == MVT::v4i32) || + (VT == MVT::v8i32 && OpVT == MVT::v8i16)) { + + unsigned NumElems = OpVT.getVectorNumElements(); + SmallVector<int,8> ShufMask1(NumElems, -1); + for (unsigned i = 0; i < NumElems/2; i++) ShufMask1[i] = i; + + SDValue OpLo = DAG.getVectorShuffle(OpVT, dl, Op, DAG.getUNDEF(OpVT), + ShufMask1.data()); + + SmallVector<int,8> ShufMask2(NumElems, -1); + for (unsigned i = 0; i < NumElems/2; i++) ShufMask2[i] = i + NumElems/2; + + SDValue OpHi = DAG.getVectorShuffle(OpVT, dl, Op, DAG.getUNDEF(OpVT), + ShufMask2.data()); + + EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(), + VT.getVectorNumElements()/2); + + OpLo = DAG.getNode(X86ISD::VSEXT_MOVL, dl, HalfVT, OpLo); + OpHi = DAG.getNode(X86ISD::VSEXT_MOVL, dl, HalfVT, OpHi); + + return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi); + } + return SDValue(); +} + +static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { // (i32 zext (and (i8 x86isd::setcc_carry), 1)) -> // (and (i32 x86isd::setcc_carry), 1) // This eliminates the zext. 
This transformation is necessary because @@ -14014,6 +14823,8 @@ static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG) { DebugLoc dl = N->getDebugLoc(); SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); + EVT OpVT = N0.getValueType(); + if (N0.getOpcode() == ISD::AND && N0.hasOneUse() && N0.getOperand(0).hasOneUse()) { @@ -14028,6 +14839,37 @@ static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG) { N00.getOperand(0), N00.getOperand(1)), DAG.getConstant(1, VT)); } + // Optimize vectors in AVX mode: + // + // v8i16 -> v8i32 + // Use vpunpcklwd for 4 lower elements v8i16 -> v4i32. + // Use vpunpckhwd for 4 upper elements v8i16 -> v4i32. + // Concat upper and lower parts. + // + // v4i32 -> v4i64 + // Use vpunpckldq for 4 lower elements v4i32 -> v2i64. + // Use vpunpckhdq for 4 upper elements v4i32 -> v2i64. + // Concat upper and lower parts. + // + if (Subtarget->hasAVX()) { + + if (((VT == MVT::v8i32) && (OpVT == MVT::v8i16)) || + ((VT == MVT::v4i64) && (OpVT == MVT::v4i32))) { + + SDValue ZeroVec = getZeroVector(OpVT, Subtarget, DAG, dl); + SDValue OpLo = getTargetShuffleNode(X86ISD::UNPCKL, dl, OpVT, N0, ZeroVec, DAG); + SDValue OpHi = getTargetShuffleNode(X86ISD::UNPCKH, dl, OpVT, N0, ZeroVec, DAG); + + EVT HVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), + VT.getVectorNumElements()/2); + + OpLo = DAG.getNode(ISD::BITCAST, dl, HVT, OpLo); + OpHi = DAG.getNode(ISD::BITCAST, dl, HVT, OpHi); + + return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi); + } + } + return SDValue(); } @@ -14136,7 +14978,24 @@ static SDValue OptimizeConditionalInDecrement(SDNode *N, SelectionDAG &DAG) { DAG.getConstant(0, OtherVal.getValueType()), NewCmp); } -static SDValue PerformSubCombine(SDNode *N, SelectionDAG &DAG) { +/// PerformADDCombine - Do target-specific dag combines on integer adds. +static SDValue PerformAddCombine(SDNode *N, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + EVT VT = N->getValueType(0); + SDValue Op0 = N->getOperand(0); + SDValue Op1 = N->getOperand(1); + + // Try to synthesize horizontal adds from adds of shuffles. + if (((Subtarget->hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) || + (Subtarget->hasAVX2() && (VT == MVT::v16i16 || VT == MVT::v8i32))) && + isHorizontalBinOp(Op0, Op1, true)) + return DAG.getNode(X86ISD::HADD, N->getDebugLoc(), VT, Op0, Op1); + + return OptimizeConditionalInDecrement(N, DAG); +} + +static SDValue PerformSubCombine(SDNode *N, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { SDValue Op0 = N->getOperand(0); SDValue Op1 = N->getOperand(1); @@ -14158,6 +15017,13 @@ static SDValue PerformSubCombine(SDNode *N, SelectionDAG &DAG) { } } + // Try to synthesize horizontal adds from adds of shuffles. 
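A scalar sketch of the AVX zero-extension idiom used above, assuming a little-endian host (helper name invented): interleaving the source with a zero vector places a zero word above every 16-bit element, which is a zero-extension to 32 bits once the pairs are read back as 32-bit lanes.

#include <cstdint>
#include <cassert>

void unpack_with_zero(const uint16_t src[8], uint32_t out[8]) {
  const uint16_t zero[8] = {0};
  // vpunpcklwd handles elements 0..3, vpunpckhwd elements 4..7; after the
  // concat, each (src[i], zero[i]) pair reads back as zext(src[i]).
  for (int i = 0; i < 8; ++i)
    out[i] = static_cast<uint32_t>(src[i]) |
             (static_cast<uint32_t>(zero[i]) << 16);
}

int main() {
  const uint16_t v[8] = {0, 1, 0x8000, 0xFFFF, 5, 6, 7, 8};
  uint32_t r[8];
  unpack_with_zero(v, r);
  assert(r[2] == 0x00008000u && r[3] == 0x0000FFFFu);
  return 0;
}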
+ EVT VT = N->getValueType(0); + if (((Subtarget->hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) || + (Subtarget->hasAVX2() && (VT == MVT::v16i16 || VT == MVT::v8i32))) && + isHorizontalBinOp(Op0, Op1, true)) + return DAG.getNode(X86ISD::HSUB, N->getDebugLoc(), VT, Op0, Op1); + return OptimizeConditionalInDecrement(N, DAG); } @@ -14167,19 +15033,20 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, switch (N->getOpcode()) { default: break; case ISD::EXTRACT_VECTOR_ELT: - return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, *this); + return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, DCI); case ISD::VSELECT: - case ISD::SELECT: return PerformSELECTCombine(N, DAG, Subtarget); + case ISD::SELECT: return PerformSELECTCombine(N, DAG, DCI, Subtarget); case X86ISD::CMOV: return PerformCMOVCombine(N, DAG, DCI); - case ISD::ADD: return OptimizeConditionalInDecrement(N, DAG); - case ISD::SUB: return PerformSubCombine(N, DAG); + case ISD::ADD: return PerformAddCombine(N, DAG, Subtarget); + case ISD::SUB: return PerformSubCombine(N, DAG, Subtarget); case X86ISD::ADC: return PerformADCCombine(N, DAG, DCI); case ISD::MUL: return PerformMulCombine(N, DAG, DCI); case ISD::SHL: case ISD::SRA: - case ISD::SRL: return PerformShiftCombine(N, DAG, Subtarget); + case ISD::SRL: return PerformShiftCombine(N, DAG, DCI, Subtarget); case ISD::AND: return PerformAndCombine(N, DAG, DCI, Subtarget); case ISD::OR: return PerformOrCombine(N, DAG, DCI, Subtarget); + case ISD::XOR: return PerformXorCombine(N, DAG, DCI, Subtarget); case ISD::LOAD: return PerformLOADCombine(N, DAG, Subtarget); case ISD::STORE: return PerformSTORECombine(N, DAG, Subtarget); case ISD::SINT_TO_FP: return PerformSINT_TO_FPCombine(N, DAG, this); @@ -14190,27 +15057,14 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case X86ISD::FAND: return PerformFANDCombine(N, DAG); case X86ISD::BT: return PerformBTCombine(N, DAG, DCI); case X86ISD::VZEXT_MOVL: return PerformVZEXT_MOVLCombine(N, DAG); - case ISD::ZERO_EXTEND: return PerformZExtCombine(N, DAG); + case ISD::ZERO_EXTEND: return PerformZExtCombine(N, DAG, Subtarget); + case ISD::SIGN_EXTEND: return PerformSExtCombine(N, DAG, DCI, Subtarget); + case ISD::TRUNCATE: return PerformTruncateCombine(N, DAG, DCI); case X86ISD::SETCC: return PerformSETCCCombine(N, DAG); - case X86ISD::SHUFPS: // Handle all target specific shuffles - case X86ISD::SHUFPD: + case X86ISD::SHUFP: // Handle all target specific shuffles case X86ISD::PALIGN: - case X86ISD::PUNPCKHBW: - case X86ISD::PUNPCKHWD: - case X86ISD::PUNPCKHDQ: - case X86ISD::PUNPCKHQDQ: - case X86ISD::UNPCKHPS: - case X86ISD::UNPCKHPD: - case X86ISD::VUNPCKHPSY: - case X86ISD::VUNPCKHPDY: - case X86ISD::PUNPCKLBW: - case X86ISD::PUNPCKLWD: - case X86ISD::PUNPCKLDQ: - case X86ISD::PUNPCKLQDQ: - case X86ISD::UNPCKLPS: - case X86ISD::UNPCKLPD: - case X86ISD::VUNPCKLPSY: - case X86ISD::VUNPCKLPDY: + case X86ISD::UNPCKH: + case X86ISD::UNPCKL: case X86ISD::MOVHLPS: case X86ISD::MOVLHPS: case X86ISD::PSHUFD: @@ -14218,11 +15072,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case X86ISD::PSHUFLW: case X86ISD::MOVSS: case X86ISD::MOVSD: - case X86ISD::VPERMILPS: - case X86ISD::VPERMILPSY: - case X86ISD::VPERMILPD: - case X86ISD::VPERMILPDY: - case X86ISD::VPERM2F128: + case X86ISD::VPERMILP: + case X86ISD::VPERM2X128: case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI,Subtarget); } @@ -14330,11 +15181,38 @@ bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const { // X86 Inline Assembly Support 
//===----------------------------------------------------------------------===// +namespace { + // Helper to match a string separated by whitespace. + bool matchAsmImpl(StringRef s, ArrayRef<const StringRef *> args) { + s = s.substr(s.find_first_not_of(" \t")); // Skip leading whitespace. + + for (unsigned i = 0, e = args.size(); i != e; ++i) { + StringRef piece(*args[i]); + if (!s.startswith(piece)) // Check if the piece matches. + return false; + + s = s.substr(piece.size()); + StringRef::size_type pos = s.find_first_not_of(" \t"); + if (pos == 0) // We matched a prefix. + return false; + + s = s.substr(pos); + } + + return s.empty(); + } + const VariadicFunction1<bool, StringRef, StringRef, matchAsmImpl> matchAsm={}; +} + bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const { InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue()); std::string AsmStr = IA->getAsmString(); + IntegerType *Ty = dyn_cast<IntegerType>(CI->getType()); + if (!Ty || Ty->getBitWidth() % 16 != 0) + return false; + // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a" SmallVector<StringRef, 4> AsmPieces; SplitString(AsmStr, AsmPieces, ";\n"); @@ -14342,35 +15220,27 @@ bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const { switch (AsmPieces.size()) { default: return false; case 1: - AsmStr = AsmPieces[0]; - AsmPieces.clear(); - SplitString(AsmStr, AsmPieces, " \t"); // Split with whitespace. - // FIXME: this should verify that we are targeting a 486 or better. If not, - // we will turn this bswap into something that will be lowered to logical ops - // instead of emitting the bswap asm. For now, we don't support 486 or lower - // so don't worry about this. + // we will turn this bswap into something that will be lowered to logical + // ops instead of emitting the bswap asm. For now, we don't support 486 or + // lower so don't worry about this. // bswap $0 - if (AsmPieces.size() == 2 && - (AsmPieces[0] == "bswap" || - AsmPieces[0] == "bswapq" || - AsmPieces[0] == "bswapl") && - (AsmPieces[1] == "$0" || - AsmPieces[1] == "${0:q}")) { + if (matchAsm(AsmPieces[0], "bswap", "$0") || + matchAsm(AsmPieces[0], "bswapl", "$0") || + matchAsm(AsmPieces[0], "bswapq", "$0") || + matchAsm(AsmPieces[0], "bswap", "${0:q}") || + matchAsm(AsmPieces[0], "bswapl", "${0:q}") || + matchAsm(AsmPieces[0], "bswapq", "${0:q}")) { // No need to check constraints, nothing other than the equivalent of // "=r,0" would be valid here. 
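A rough standalone re-implementation of the matching idea (plain C++17 with std::string_view instead of StringRef and the VariadicFunction machinery; names and exact edge-case behavior are simplifications): each piece must appear in order, separated by whitespace, with nothing meaningful left over, and the recognized bswap/rorw patterns are then lowered to llvm.bswap.

#include <cassert>
#include <initializer_list>
#include <string_view>

bool matchAsmSketch(std::string_view s,
                    std::initializer_list<std::string_view> pieces) {
  auto skipWs = [](std::string_view v) {
    auto pos = v.find_first_not_of(" \t");
    return pos == std::string_view::npos ? std::string_view() : v.substr(pos);
  };
  s = skipWs(s);
  bool first = true;
  for (std::string_view piece : pieces) {
    if (!first) {
      std::string_view trimmed = skipWs(s);
      if (trimmed.size() == s.size())      // pieces must be whitespace-separated
        return false;
      s = trimmed;
    }
    if (s.substr(0, piece.size()) != piece)
      return false;
    s = s.substr(piece.size());
    first = false;
  }
  return skipWs(s).empty();                // nothing but whitespace may remain
}

int main() {
  assert(matchAsmSketch("  bswap   $0", {"bswap", "$0"}));
  assert(!matchAsmSketch("bswapl$0", {"bswapl", "$0"}));
  assert(matchAsmSketch("rorw $$8, ${0:w}", {"rorw", "$$8,", "${0:w}"}));
  return 0;
}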
- IntegerType *Ty = dyn_cast<IntegerType>(CI->getType()); - if (!Ty || Ty->getBitWidth() % 16 != 0) - return false; return IntrinsicLowering::LowerToByteSwap(CI); } + // rorw $$8, ${0:w} --> llvm.bswap.i16 if (CI->getType()->isIntegerTy(16) && - AsmPieces.size() == 3 && - (AsmPieces[0] == "rorw" || AsmPieces[0] == "rolw") && - AsmPieces[1] == "$$8," && - AsmPieces[2] == "${0:w}" && - IA->getConstraintString().compare(0, 5, "=r,0,") == 0) { + IA->getConstraintString().compare(0, 5, "=r,0,") == 0 && + (matchAsm(AsmPieces[0], "rorw", "$$8,", "${0:w}") || + matchAsm(AsmPieces[0], "rolw", "$$8,", "${0:w}"))) { AsmPieces.clear(); const std::string &ConstraintsStr = IA->getConstraintString(); SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ","); @@ -14379,46 +15249,26 @@ bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const { AsmPieces[0] == "~{cc}" && AsmPieces[1] == "~{dirflag}" && AsmPieces[2] == "~{flags}" && - AsmPieces[3] == "~{fpsr}") { - IntegerType *Ty = dyn_cast<IntegerType>(CI->getType()); - if (!Ty || Ty->getBitWidth() % 16 != 0) - return false; - return IntrinsicLowering::LowerToByteSwap(CI); - } + AsmPieces[3] == "~{fpsr}") + return IntrinsicLowering::LowerToByteSwap(CI); } break; case 3: if (CI->getType()->isIntegerTy(32) && - IA->getConstraintString().compare(0, 5, "=r,0,") == 0) { - SmallVector<StringRef, 4> Words; - SplitString(AsmPieces[0], Words, " \t,"); - if (Words.size() == 3 && Words[0] == "rorw" && Words[1] == "$$8" && - Words[2] == "${0:w}") { - Words.clear(); - SplitString(AsmPieces[1], Words, " \t,"); - if (Words.size() == 3 && Words[0] == "rorl" && Words[1] == "$$16" && - Words[2] == "$0") { - Words.clear(); - SplitString(AsmPieces[2], Words, " \t,"); - if (Words.size() == 3 && Words[0] == "rorw" && Words[1] == "$$8" && - Words[2] == "${0:w}") { - AsmPieces.clear(); - const std::string &ConstraintsStr = IA->getConstraintString(); - SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ","); - std::sort(AsmPieces.begin(), AsmPieces.end()); - if (AsmPieces.size() == 4 && - AsmPieces[0] == "~{cc}" && - AsmPieces[1] == "~{dirflag}" && - AsmPieces[2] == "~{flags}" && - AsmPieces[3] == "~{fpsr}") { - IntegerType *Ty = dyn_cast<IntegerType>(CI->getType()); - if (!Ty || Ty->getBitWidth() % 16 != 0) - return false; - return IntrinsicLowering::LowerToByteSwap(CI); - } - } - } - } + IA->getConstraintString().compare(0, 5, "=r,0,") == 0 && + matchAsm(AsmPieces[0], "rorw", "$$8,", "${0:w}") && + matchAsm(AsmPieces[1], "rorl", "$$16,", "$0") && + matchAsm(AsmPieces[2], "rorw", "$$8,", "${0:w}")) { + AsmPieces.clear(); + const std::string &ConstraintsStr = IA->getConstraintString(); + SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ","); + std::sort(AsmPieces.begin(), AsmPieces.end()); + if (AsmPieces.size() == 4 && + AsmPieces[0] == "~{cc}" && + AsmPieces[1] == "~{dirflag}" && + AsmPieces[2] == "~{flags}" && + AsmPieces[3] == "~{fpsr}") + return IntrinsicLowering::LowerToByteSwap(CI); } if (CI->getType()->isIntegerTy(64)) { @@ -14427,23 +15277,10 @@ bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const { Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" && Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") { // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64 - SmallVector<StringRef, 4> Words; - SplitString(AsmPieces[0], Words, " \t"); - if (Words.size() == 2 && Words[0] == "bswap" && Words[1] == "%eax") { - Words.clear(); - SplitString(AsmPieces[1], Words, " \t"); - if (Words.size() == 2 && 
Words[0] == "bswap" && Words[1] == "%edx") { - Words.clear(); - SplitString(AsmPieces[2], Words, " \t,"); - if (Words.size() == 3 && Words[0] == "xchgl" && Words[1] == "%eax" && - Words[2] == "%edx") { - IntegerType *Ty = dyn_cast<IntegerType>(CI->getType()); - if (!Ty || Ty->getBitWidth() % 16 != 0) - return false; - return IntrinsicLowering::LowerToByteSwap(CI); - } - } - } + if (matchAsm(AsmPieces[0], "bswap", "%eax") && + matchAsm(AsmPieces[1], "bswap", "%edx") && + matchAsm(AsmPieces[2], "xchgl", "%eax,", "%edx")) + return IntrinsicLowering::LowerToByteSwap(CI); } } break; @@ -14538,7 +15375,8 @@ TargetLowering::ConstraintWeight break; case 'x': case 'Y': - if ((type->getPrimitiveSizeInBits() == 128) && Subtarget->hasXMM()) + if (((type->getPrimitiveSizeInBits() == 128) && Subtarget->hasSSE1()) || + ((type->getPrimitiveSizeInBits() == 256) && Subtarget->hasAVX())) weight = CW_Register; break; case 'I': @@ -14608,9 +15446,9 @@ LowerXConstraint(EVT ConstraintVT) const { // FP X constraints get lowered to SSE1/2 registers if available, otherwise // 'f' like normal targets. if (ConstraintVT.isFloatingPoint()) { - if (Subtarget->hasXMMInt()) + if (Subtarget->hasSSE2()) return "Y"; - if (Subtarget->hasXMM()) + if (Subtarget->hasSSE1()) return "x"; } @@ -14816,10 +15654,10 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, if (!Subtarget->hasMMX()) break; return std::make_pair(0U, X86::VR64RegisterClass); case 'Y': // SSE_REGS if SSE2 allowed - if (!Subtarget->hasXMMInt()) break; + if (!Subtarget->hasSSE2()) break; // FALL THROUGH. - case 'x': // SSE_REGS if SSE1 allowed - if (!Subtarget->hasXMM()) break; + case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed + if (!Subtarget->hasSSE1()) break; switch (VT.getSimpleVT().SimpleTy) { default: break; @@ -14838,6 +15676,15 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, case MVT::v4f32: case MVT::v2f64: return std::make_pair(0U, X86::VR128RegisterClass); + // AVX types. + case MVT::v32i8: + case MVT::v16i16: + case MVT::v8i32: + case MVT::v4i64: + case MVT::v8f32: + case MVT::v4f64: + return std::make_pair(0U, X86::VR256RegisterClass); + } break; } diff --git a/contrib/llvm/lib/Target/X86/X86ISelLowering.h b/contrib/llvm/lib/Target/X86/X86ISelLowering.h index 342a5e6..4e00733 100644 --- a/contrib/llvm/lib/Target/X86/X86ISelLowering.h +++ b/contrib/llvm/lib/Target/X86/X86ISelLowering.h @@ -172,12 +172,23 @@ namespace llvm { /// ANDNP - Bitwise Logical AND NOT of Packed FP values. ANDNP, - /// PSIGNB/W/D - Copy integer sign. - PSIGNB, PSIGNW, PSIGND, + /// PSIGN - Copy integer sign. + PSIGN, - /// BLEND family of opcodes + /// BLENDV - Blend where the selector is an XMM. BLENDV, + /// BLENDxx - Blend where the selector is an immediate. + BLENDPW, + BLENDPS, + BLENDPD, + + /// HADD - Integer horizontal add. + HADD, + + /// HSUB - Integer horizontal sub. + HSUB, + /// FHADD - Floating point horizontal add. FHADD, @@ -213,16 +224,26 @@ namespace llvm { // VZEXT_MOVL - Vector move low and zero extend. VZEXT_MOVL, - // VSHL, VSRL - Vector logical left / right shift. - VSHL, VSRL, + // VSEXT_MOVL - Vector move low and sign extend. + VSEXT_MOVL, + + // VSHL, VSRL - 128-bit vector logical left / right shift + VSHLDQ, VSRLDQ, + + // VSHL, VSRL, VSRA - Vector shift elements + VSHL, VSRL, VSRA, - // CMPPD, CMPPS - Vector double/float comparison. - // CMPPD, CMPPS - Vector double/float comparison. 
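A scalar model of the PSIGN lanes behind the consolidated opcode above (helper name invented): the result is the first operand negated, kept, or zeroed depending on the sign of the second.

#include <cstdint>
#include <cassert>

int32_t psign_lane(int32_t a, int32_t b) {
  if (b < 0)  return -a;                   // copy the negated value
  if (b == 0) return 0;                    // zero the lane
  return a;                                // keep the value unchanged
}

int main() {
  assert(psign_lane(7, -3) == -7);
  assert(psign_lane(7,  0) ==  0);
  assert(psign_lane(7,  5) ==  7);
  return 0;
}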
- CMPPD, CMPPS, + // VSHLI, VSRLI, VSRAI - Vector shift elements by immediate + VSHLI, VSRLI, VSRAI, + + // CMPP - Vector packed double/float comparison. + CMPP, // PCMP* - Vector integer comparisons. - PCMPEQB, PCMPEQW, PCMPEQD, PCMPEQQ, - PCMPGTB, PCMPGTW, PCMPGTD, PCMPGTQ, + PCMPEQ, PCMPGT, + + // VPCOM, VPCOMU - XOP Vector integer comparisons. + VPCOM, VPCOMU, // ADD, SUB, SMUL, etc. - Arithmetic operations with FLAGS results. ADD, SUB, ADC, SBB, SMUL, @@ -230,6 +251,10 @@ namespace llvm { ANDN, // ANDN - Bitwise AND NOT with FLAGS results. + BLSI, // BLSI - Extract lowest set isolated bit + BLSMSK, // BLSMSK - Get mask up to lowest set bit + BLSR, // BLSR - Reset lowest set bit + UMUL, // LOW, HI, FLAGS = umul LHS, RHS // MUL_IMM - X86 specific multiply by immediate. @@ -246,46 +271,26 @@ namespace llvm { PSHUFD, PSHUFHW, PSHUFLW, - PSHUFHW_LD, - PSHUFLW_LD, - SHUFPD, - SHUFPS, + SHUFP, MOVDDUP, MOVSHDUP, MOVSLDUP, - MOVSHDUP_LD, - MOVSLDUP_LD, MOVLHPS, MOVLHPD, MOVHLPS, - MOVHLPD, MOVLPS, MOVLPD, MOVSD, MOVSS, - UNPCKLPS, - UNPCKLPD, - VUNPCKLPSY, - VUNPCKLPDY, - UNPCKHPS, - UNPCKHPD, - VUNPCKHPSY, - VUNPCKHPDY, - PUNPCKLBW, - PUNPCKLWD, - PUNPCKLDQ, - PUNPCKLQDQ, - PUNPCKHBW, - PUNPCKHWD, - PUNPCKHDQ, - PUNPCKHQDQ, - VPERMILPS, - VPERMILPSY, - VPERMILPD, - VPERMILPDY, - VPERM2F128, + UNPCKL, + UNPCKH, + VPERMILP, + VPERM2X128, VBROADCAST, + // PMULUDQ - Vector multiply packed unsigned doubleword integers + PMULUDQ, + // VASTART_SAVE_XMM_REGS - Save xmm argument registers to the stack, // according to %al. An operator is needed so that this can be expanded // with control flow. @@ -299,6 +304,9 @@ namespace llvm { // falls back to heap allocation if not. SEG_ALLOCA, + // WIN_FTOL - Windows's _ftol2 runtime routine to do fptoui. + WIN_FTOL, + // Memory barrier MEMBARRIER, MFENCE, @@ -368,75 +376,6 @@ namespace llvm { /// Define some predicates that are used for node matching. namespace X86 { - /// isPSHUFDMask - Return true if the specified VECTOR_SHUFFLE operand - /// specifies a shuffle of elements that is suitable for input to PSHUFD. - bool isPSHUFDMask(ShuffleVectorSDNode *N); - - /// isPSHUFHWMask - Return true if the specified VECTOR_SHUFFLE operand - /// specifies a shuffle of elements that is suitable for input to PSHUFD. - bool isPSHUFHWMask(ShuffleVectorSDNode *N); - - /// isPSHUFLWMask - Return true if the specified VECTOR_SHUFFLE operand - /// specifies a shuffle of elements that is suitable for input to PSHUFD. - bool isPSHUFLWMask(ShuffleVectorSDNode *N); - - /// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand - /// specifies a shuffle of elements that is suitable for input to SHUFP*. - bool isSHUFPMask(ShuffleVectorSDNode *N); - - /// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand - /// specifies a shuffle of elements that is suitable for input to MOVHLPS. - bool isMOVHLPSMask(ShuffleVectorSDNode *N); - - /// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form - /// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef, - /// <2, 3, 2, 3> - bool isMOVHLPS_v_undef_Mask(ShuffleVectorSDNode *N); - - /// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand - /// specifies a shuffle of elements that is suitable for MOVLP{S|D}. - bool isMOVLPMask(ShuffleVectorSDNode *N); - - /// isMOVHPMask - Return true if the specified VECTOR_SHUFFLE operand - /// specifies a shuffle of elements that is suitable for MOVHP{S|D}. - /// as well as MOVLHPS. 
- bool isMOVLHPSMask(ShuffleVectorSDNode *N); - - /// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand - /// specifies a shuffle of elements that is suitable for input to UNPCKL. - bool isUNPCKLMask(ShuffleVectorSDNode *N, bool V2IsSplat = false); - - /// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand - /// specifies a shuffle of elements that is suitable for input to UNPCKH. - bool isUNPCKHMask(ShuffleVectorSDNode *N, bool V2IsSplat = false); - - /// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form - /// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef, - /// <0, 0, 1, 1> - bool isUNPCKL_v_undef_Mask(ShuffleVectorSDNode *N); - - /// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form - /// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef, - /// <2, 2, 3, 3> - bool isUNPCKH_v_undef_Mask(ShuffleVectorSDNode *N); - - /// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand - /// specifies a shuffle of elements that is suitable for input to MOVSS, - /// MOVSD, and MOVD, i.e. setting the lowest element. - bool isMOVLMask(ShuffleVectorSDNode *N); - - /// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand - /// specifies a shuffle of elements that is suitable for input to MOVSHDUP. - bool isMOVSHDUPMask(ShuffleVectorSDNode *N, const X86Subtarget *Subtarget); - - /// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand - /// specifies a shuffle of elements that is suitable for input to MOVSLDUP. - bool isMOVSLDUPMask(ShuffleVectorSDNode *N, const X86Subtarget *Subtarget); - - /// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand - /// specifies a shuffle of elements that is suitable for input to MOVDDUP. - bool isMOVDDUPMask(ShuffleVectorSDNode *N); - /// isVEXTRACTF128Index - Return true if the specified /// EXTRACT_SUBVECTOR operand specifies a vector extract that is /// suitable for input to VEXTRACTF128. @@ -447,23 +386,6 @@ namespace llvm { /// suitable for input to VINSERTF128. bool isVINSERTF128Index(SDNode *N); - /// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle - /// the specified isShuffleMask VECTOR_SHUFFLE mask with PSHUF* and SHUFP* - /// instructions. - unsigned getShuffleSHUFImmediate(SDNode *N); - - /// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle - /// the specified VECTOR_SHUFFLE mask with PSHUFHW instruction. - unsigned getShufflePSHUFHWImmediate(SDNode *N); - - /// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle - /// the specified VECTOR_SHUFFLE mask with PSHUFLW instruction. - unsigned getShufflePSHUFLWImmediate(SDNode *N); - - /// getShufflePALIGNRImmediate - Return the appropriate immediate to shuffle - /// the specified VECTOR_SHUFFLE mask with the PALIGNR instruction. - unsigned getShufflePALIGNRImmediate(SDNode *N); - /// getExtractVEXTRACTF128Immediate - Return the appropriate /// immediate to extract the specified EXTRACT_SUBVECTOR index /// with VEXTRACTF128 instructions. @@ -529,7 +451,7 @@ namespace llvm { /// alignment can satisfy any constraint. Similarly if SrcAlign is zero it /// means there isn't a need to check it against alignment requirement, /// probably because the source does not need to be loaded. If - /// 'NonScalarIntSafe' is true, that means it's safe to return a + /// 'IsZeroVal' is true, that means it's safe to return a /// non-scalar-integer type, e.g. 
empty string source, constant, or loaded /// from memory. 'MemcpyStrSrc' indicates whether the memcpy source is /// constant so it does not need to be loaded. @@ -537,7 +459,7 @@ namespace llvm { /// target-independent logic. virtual EVT getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign, - bool NonScalarIntSafe, bool MemcpyStrSrc, + bool IsZeroVal, bool MemcpyStrSrc, MachineFunction &MF) const; /// allowsUnalignedMemoryAccesses - Returns true if the target allows @@ -587,7 +509,6 @@ namespace llvm { /// in Mask are known to be either zero or one and return them in the /// KnownZero/KnownOne bitsets. virtual void computeMaskedBitsForTargetNode(const SDValue Op, - const APInt &Mask, APInt &KnownZero, APInt &KnownOne, const SelectionDAG &DAG, @@ -697,6 +618,18 @@ namespace llvm { (VT == MVT::f32 && X86ScalarSSEf32); // f32 is when SSE1 } + /// isTargetFTOL - Return true if the target uses the MSVC _ftol2 routine + /// for fptoui. + bool isTargetFTOL() const { + return Subtarget->isTargetWindows() && !Subtarget->is64Bit(); + } + + /// isIntegerTypeFTOL - Return true if the MSVC _ftol2 routine should be + /// used for fptoui to the given type. + bool isIntegerTypeFTOL(EVT VT) const { + return isTargetFTOL() && VT == MVT::i64; + } + /// createFastISel - This method returns a target specific FastISel object, /// or null if the target does not support "fast" ISel. virtual FastISel *createFastISel(FunctionLoweringInfo &funcInfo) const; @@ -779,7 +712,8 @@ namespace llvm { SelectionDAG &DAG) const; std::pair<SDValue,SDValue> FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, - bool isSigned) const; + bool isSigned, + bool isReplace) const; SDValue LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl, SelectionDAG &DAG) const; @@ -833,6 +767,7 @@ namespace llvm { SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) const; SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerCTLZ_ZERO_UNDEF(SDValue Op, SelectionDAG &DAG) const; SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) const; SDValue LowerADD(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSUB(SDValue Op, SelectionDAG &DAG) const; @@ -846,9 +781,12 @@ namespace llvm { SDValue LowerMEMBARRIER(SDValue Op, SelectionDAG &DAG) const; SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const; + SDValue PerformTruncateCombine(SDNode* N, SelectionDAG &DAG, DAGCombinerInfo &DCI) const; // Utility functions to help LowerVECTOR_SHUFFLE SDValue LowerVECTOR_SHUFFLEv8i16(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerVectorBroadcast(SDValue &Op, SelectionDAG &DAG) const; + SDValue NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG) const; virtual SDValue LowerFormalArguments(SDValue Chain, @@ -857,8 +795,8 @@ namespace llvm { DebugLoc dl, SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const; virtual SDValue - LowerCall(SDValue Chain, SDValue Callee, - CallingConv::ID CallConv, bool isVarArg, bool &isTailCall, + LowerCall(SDValue Chain, SDValue Callee, CallingConv::ID CallConv, + bool isVarArg, bool doesNotRet, bool &isTailCall, const SmallVectorImpl<ISD::OutputArg> &Outs, const SmallVectorImpl<SDValue> &OutVals, const SmallVectorImpl<ISD::InputArg> &Ins, @@ -872,7 +810,7 @@ namespace llvm { const SmallVectorImpl<SDValue> &OutVals, DebugLoc dl, SelectionDAG &DAG) const; - virtual bool isUsedByReturnOnly(SDNode *N) const; + virtual bool 
isUsedByReturnOnly(SDNode *N, SDValue &Chain) const; virtual bool mayBeEmittedAsTailCall(CallInst *CI) const; @@ -916,7 +854,7 @@ namespace llvm { unsigned cxchgOpc, unsigned notOpc, unsigned EAXreg, - TargetRegisterClass *RC, + const TargetRegisterClass *RC, bool invSrc = false) const; MachineBasicBlock *EmitAtomicBit6432WithCustomInserter( diff --git a/contrib/llvm/lib/Target/X86/X86Instr3DNow.td b/contrib/llvm/lib/Target/X86/X86Instr3DNow.td index dd4f6a5..54b91c3 100644 --- a/contrib/llvm/lib/Target/X86/X86Instr3DNow.td +++ b/contrib/llvm/lib/Target/X86/X86Instr3DNow.td @@ -1,4 +1,4 @@ -//====- X86Instr3DNow.td - The 3DNow! Instruction Set ------*- tablegen -*-===// +//===-- X86Instr3DNow.td - The 3DNow! Instruction Set ------*- tablegen -*-===// // // The LLVM Compiler Infrastructure // diff --git a/contrib/llvm/lib/Target/X86/X86InstrArithmetic.td b/contrib/llvm/lib/Target/X86/X86InstrArithmetic.td index 74b647a..0eee083 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrArithmetic.td +++ b/contrib/llvm/lib/Target/X86/X86InstrArithmetic.td @@ -1,10 +1,10 @@ -//===- X86InstrArithmetic.td - Integer Arithmetic Instrs ---*- tablegen -*-===// -// +//===-- X86InstrArithmetic.td - Integer Arithmetic Instrs --*- tablegen -*-===// +// // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. -// +// //===----------------------------------------------------------------------===// // // This file describes the integer arithmetic instructions in the X86 @@ -18,22 +18,24 @@ let neverHasSideEffects = 1 in def LEA16r : I<0x8D, MRMSrcMem, (outs GR16:$dst), (ins i32mem:$src), - "lea{w}\t{$src|$dst}, {$dst|$src}", []>, OpSize; + "lea{w}\t{$src|$dst}, {$dst|$src}", [], IIC_LEA_16>, OpSize; let isReMaterializable = 1 in def LEA32r : I<0x8D, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), "lea{l}\t{$src|$dst}, {$dst|$src}", - [(set GR32:$dst, lea32addr:$src)]>, Requires<[In32BitMode]>; + [(set GR32:$dst, lea32addr:$src)], IIC_LEA>, + Requires<[In32BitMode]>; def LEA64_32r : I<0x8D, MRMSrcMem, (outs GR32:$dst), (ins lea64_32mem:$src), "lea{l}\t{$src|$dst}, {$dst|$src}", - [(set GR32:$dst, lea32addr:$src)]>, Requires<[In64BitMode]>; + [(set GR32:$dst, lea32addr:$src)], IIC_LEA>, + Requires<[In64BitMode]>; let isReMaterializable = 1 in def LEA64r : RI<0x8D, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), "lea{q}\t{$src|$dst}, {$dst|$src}", - [(set GR64:$dst, lea64addr:$src)]>; + [(set GR64:$dst, lea64addr:$src)], IIC_LEA>; @@ -51,21 +53,23 @@ def MUL8r : I<0xF6, MRM4r, (outs), (ins GR8:$src), "mul{b}\t$src", // This probably ought to be moved to a def : Pat<> if the // syntax can be accepted. 
[(set AL, (mul AL, GR8:$src)), - (implicit EFLAGS)]>; // AL,AH = AL*GR8 + (implicit EFLAGS)], IIC_MUL8>; // AL,AH = AL*GR8 let Defs = [AX,DX,EFLAGS], Uses = [AX], neverHasSideEffects = 1 in def MUL16r : I<0xF7, MRM4r, (outs), (ins GR16:$src), "mul{w}\t$src", - []>, OpSize; // AX,DX = AX*GR16 + [], IIC_MUL16_REG>, OpSize; // AX,DX = AX*GR16 let Defs = [EAX,EDX,EFLAGS], Uses = [EAX], neverHasSideEffects = 1 in def MUL32r : I<0xF7, MRM4r, (outs), (ins GR32:$src), "mul{l}\t$src", // EAX,EDX = EAX*GR32 - [/*(set EAX, EDX, EFLAGS, (X86umul_flag EAX, GR32:$src))*/]>; + [/*(set EAX, EDX, EFLAGS, (X86umul_flag EAX, GR32:$src))*/], + IIC_MUL32_REG>; let Defs = [RAX,RDX,EFLAGS], Uses = [RAX], neverHasSideEffects = 1 in def MUL64r : RI<0xF7, MRM4r, (outs), (ins GR64:$src), "mul{q}\t$src", // RAX,RDX = RAX*GR64 - [/*(set RAX, RDX, EFLAGS, (X86umul_flag RAX, GR64:$src))*/]>; + [/*(set RAX, RDX, EFLAGS, (X86umul_flag RAX, GR64:$src))*/], + IIC_MUL64>; let Defs = [AL,EFLAGS,AX], Uses = [AL] in def MUL8m : I<0xF6, MRM4m, (outs), (ins i8mem :$src), @@ -74,50 +78,51 @@ def MUL8m : I<0xF6, MRM4m, (outs), (ins i8mem :$src), // This probably ought to be moved to a def : Pat<> if the // syntax can be accepted. [(set AL, (mul AL, (loadi8 addr:$src))), - (implicit EFLAGS)]>; // AL,AH = AL*[mem8] + (implicit EFLAGS)], IIC_MUL8>; // AL,AH = AL*[mem8] let mayLoad = 1, neverHasSideEffects = 1 in { let Defs = [AX,DX,EFLAGS], Uses = [AX] in def MUL16m : I<0xF7, MRM4m, (outs), (ins i16mem:$src), "mul{w}\t$src", - []>, OpSize; // AX,DX = AX*[mem16] + [], IIC_MUL16_MEM>, OpSize; // AX,DX = AX*[mem16] let Defs = [EAX,EDX,EFLAGS], Uses = [EAX] in def MUL32m : I<0xF7, MRM4m, (outs), (ins i32mem:$src), "mul{l}\t$src", - []>; // EAX,EDX = EAX*[mem32] -let Defs = [RAX,RDX,EFLAGS], Uses = [RAX], neverHasSideEffects = 1 in + [], IIC_MUL32_MEM>; // EAX,EDX = EAX*[mem32] +let Defs = [RAX,RDX,EFLAGS], Uses = [RAX] in def MUL64m : RI<0xF7, MRM4m, (outs), (ins i64mem:$src), - "mul{q}\t$src", []>; // RAX,RDX = RAX*[mem64] + "mul{q}\t$src", [], IIC_MUL64>; // RAX,RDX = RAX*[mem64] } let neverHasSideEffects = 1 in { let Defs = [AL,EFLAGS,AX], Uses = [AL] in -def IMUL8r : I<0xF6, MRM5r, (outs), (ins GR8:$src), "imul{b}\t$src", []>; - // AL,AH = AL*GR8 +def IMUL8r : I<0xF6, MRM5r, (outs), (ins GR8:$src), "imul{b}\t$src", [], + IIC_IMUL8>; // AL,AH = AL*GR8 let Defs = [AX,DX,EFLAGS], Uses = [AX] in -def IMUL16r : I<0xF7, MRM5r, (outs), (ins GR16:$src), "imul{w}\t$src", []>, - OpSize; // AX,DX = AX*GR16 +def IMUL16r : I<0xF7, MRM5r, (outs), (ins GR16:$src), "imul{w}\t$src", [], + IIC_IMUL16_RR>, OpSize; // AX,DX = AX*GR16 let Defs = [EAX,EDX,EFLAGS], Uses = [EAX] in -def IMUL32r : I<0xF7, MRM5r, (outs), (ins GR32:$src), "imul{l}\t$src", []>; - // EAX,EDX = EAX*GR32 -let Defs = [RAX,RDX,EFLAGS], Uses = [RAX], neverHasSideEffects = 1 in -def IMUL64r : RI<0xF7, MRM5r, (outs), (ins GR64:$src), "imul{q}\t$src", []>; - // RAX,RDX = RAX*GR64 +def IMUL32r : I<0xF7, MRM5r, (outs), (ins GR32:$src), "imul{l}\t$src", [], + IIC_IMUL32_RR>; // EAX,EDX = EAX*GR32 +let Defs = [RAX,RDX,EFLAGS], Uses = [RAX] in +def IMUL64r : RI<0xF7, MRM5r, (outs), (ins GR64:$src), "imul{q}\t$src", [], + IIC_IMUL64_RR>; // RAX,RDX = RAX*GR64 let mayLoad = 1 in { let Defs = [AL,EFLAGS,AX], Uses = [AL] in def IMUL8m : I<0xF6, MRM5m, (outs), (ins i8mem :$src), - "imul{b}\t$src", []>; // AL,AH = AL*[mem8] + "imul{b}\t$src", [], IIC_IMUL8>; // AL,AH = AL*[mem8] let Defs = [AX,DX,EFLAGS], Uses = [AX] in def IMUL16m : I<0xF7, MRM5m, (outs), (ins i16mem:$src), - "imul{w}\t$src", []>, 
OpSize; // AX,DX = AX*[mem16] + "imul{w}\t$src", [], IIC_IMUL16_MEM>, OpSize; + // AX,DX = AX*[mem16] let Defs = [EAX,EDX,EFLAGS], Uses = [EAX] in def IMUL32m : I<0xF7, MRM5m, (outs), (ins i32mem:$src), - "imul{l}\t$src", []>; // EAX,EDX = EAX*[mem32] -let Defs = [RAX,RDX,EFLAGS], Uses = [RAX], neverHasSideEffects = 1 in + "imul{l}\t$src", [], IIC_IMUL32_MEM>; // EAX,EDX = EAX*[mem32] +let Defs = [RAX,RDX,EFLAGS], Uses = [RAX] in def IMUL64m : RI<0xF7, MRM5m, (outs), (ins i64mem:$src), - "imul{q}\t$src", []>; // RAX,RDX = RAX*[mem64] + "imul{q}\t$src", [], IIC_IMUL64>; // RAX,RDX = RAX*[mem64] } } // neverHasSideEffects @@ -130,16 +135,19 @@ let isCommutable = 1 in { // X = IMUL Y, Z --> X = IMUL Z, Y def IMUL16rr : I<0xAF, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src1,GR16:$src2), "imul{w}\t{$src2, $dst|$dst, $src2}", [(set GR16:$dst, EFLAGS, - (X86smul_flag GR16:$src1, GR16:$src2))]>, TB, OpSize; + (X86smul_flag GR16:$src1, GR16:$src2))], IIC_IMUL16_RR>, + TB, OpSize; def IMUL32rr : I<0xAF, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src1,GR32:$src2), "imul{l}\t{$src2, $dst|$dst, $src2}", [(set GR32:$dst, EFLAGS, - (X86smul_flag GR32:$src1, GR32:$src2))]>, TB; + (X86smul_flag GR32:$src1, GR32:$src2))], IIC_IMUL32_RR>, + TB; def IMUL64rr : RI<0xAF, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2), "imul{q}\t{$src2, $dst|$dst, $src2}", [(set GR64:$dst, EFLAGS, - (X86smul_flag GR64:$src1, GR64:$src2))]>, TB; + (X86smul_flag GR64:$src1, GR64:$src2))], IIC_IMUL64_RR>, + TB; } // Register-Memory Signed Integer Multiply @@ -147,18 +155,23 @@ def IMUL16rm : I<0xAF, MRMSrcMem, (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2), "imul{w}\t{$src2, $dst|$dst, $src2}", [(set GR16:$dst, EFLAGS, - (X86smul_flag GR16:$src1, (load addr:$src2)))]>, + (X86smul_flag GR16:$src1, (load addr:$src2)))], + IIC_IMUL16_RM>, TB, OpSize; def IMUL32rm : I<0xAF, MRMSrcMem, (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2), "imul{l}\t{$src2, $dst|$dst, $src2}", [(set GR32:$dst, EFLAGS, - (X86smul_flag GR32:$src1, (load addr:$src2)))]>, TB; + (X86smul_flag GR32:$src1, (load addr:$src2)))], + IIC_IMUL32_RM>, + TB; def IMUL64rm : RI<0xAF, MRMSrcMem, (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2), "imul{q}\t{$src2, $dst|$dst, $src2}", [(set GR64:$dst, EFLAGS, - (X86smul_flag GR64:$src1, (load addr:$src2)))]>, TB; + (X86smul_flag GR64:$src1, (load addr:$src2)))], + IIC_IMUL64_RM>, + TB; } // Constraints = "$src1 = $dst" } // Defs = [EFLAGS] @@ -170,33 +183,39 @@ def IMUL16rri : Ii16<0x69, MRMSrcReg, // GR16 = GR16*I16 (outs GR16:$dst), (ins GR16:$src1, i16imm:$src2), "imul{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set GR16:$dst, EFLAGS, - (X86smul_flag GR16:$src1, imm:$src2))]>, OpSize; + (X86smul_flag GR16:$src1, imm:$src2))], + IIC_IMUL16_RRI>, OpSize; def IMUL16rri8 : Ii8<0x6B, MRMSrcReg, // GR16 = GR16*I8 (outs GR16:$dst), (ins GR16:$src1, i16i8imm:$src2), "imul{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set GR16:$dst, EFLAGS, - (X86smul_flag GR16:$src1, i16immSExt8:$src2))]>, + (X86smul_flag GR16:$src1, i16immSExt8:$src2))], + IIC_IMUL16_RRI>, OpSize; def IMUL32rri : Ii32<0x69, MRMSrcReg, // GR32 = GR32*I32 (outs GR32:$dst), (ins GR32:$src1, i32imm:$src2), "imul{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set GR32:$dst, EFLAGS, - (X86smul_flag GR32:$src1, imm:$src2))]>; + (X86smul_flag GR32:$src1, imm:$src2))], + IIC_IMUL32_RRI>; def IMUL32rri8 : Ii8<0x6B, MRMSrcReg, // GR32 = GR32*I8 (outs GR32:$dst), (ins GR32:$src1, i32i8imm:$src2), "imul{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set 
GR32:$dst, EFLAGS, - (X86smul_flag GR32:$src1, i32immSExt8:$src2))]>; + (X86smul_flag GR32:$src1, i32immSExt8:$src2))], + IIC_IMUL32_RRI>; def IMUL64rri32 : RIi32<0x69, MRMSrcReg, // GR64 = GR64*I32 (outs GR64:$dst), (ins GR64:$src1, i64i32imm:$src2), "imul{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set GR64:$dst, EFLAGS, - (X86smul_flag GR64:$src1, i64immSExt32:$src2))]>; + (X86smul_flag GR64:$src1, i64immSExt32:$src2))], + IIC_IMUL64_RRI>; def IMUL64rri8 : RIi8<0x6B, MRMSrcReg, // GR64 = GR64*I8 (outs GR64:$dst), (ins GR64:$src1, i64i8imm:$src2), "imul{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set GR64:$dst, EFLAGS, - (X86smul_flag GR64:$src1, i64immSExt8:$src2))]>; + (X86smul_flag GR64:$src1, i64immSExt8:$src2))], + IIC_IMUL64_RRI>; // Memory-Integer Signed Integer Multiply @@ -204,37 +223,43 @@ def IMUL16rmi : Ii16<0x69, MRMSrcMem, // GR16 = [mem16]*I16 (outs GR16:$dst), (ins i16mem:$src1, i16imm:$src2), "imul{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set GR16:$dst, EFLAGS, - (X86smul_flag (load addr:$src1), imm:$src2))]>, + (X86smul_flag (load addr:$src1), imm:$src2))], + IIC_IMUL16_RMI>, OpSize; def IMUL16rmi8 : Ii8<0x6B, MRMSrcMem, // GR16 = [mem16]*I8 (outs GR16:$dst), (ins i16mem:$src1, i16i8imm :$src2), "imul{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set GR16:$dst, EFLAGS, (X86smul_flag (load addr:$src1), - i16immSExt8:$src2))]>, OpSize; + i16immSExt8:$src2))], IIC_IMUL16_RMI>, + OpSize; def IMUL32rmi : Ii32<0x69, MRMSrcMem, // GR32 = [mem32]*I32 (outs GR32:$dst), (ins i32mem:$src1, i32imm:$src2), "imul{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set GR32:$dst, EFLAGS, - (X86smul_flag (load addr:$src1), imm:$src2))]>; + (X86smul_flag (load addr:$src1), imm:$src2))], + IIC_IMUL32_RMI>; def IMUL32rmi8 : Ii8<0x6B, MRMSrcMem, // GR32 = [mem32]*I8 (outs GR32:$dst), (ins i32mem:$src1, i32i8imm: $src2), "imul{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set GR32:$dst, EFLAGS, (X86smul_flag (load addr:$src1), - i32immSExt8:$src2))]>; + i32immSExt8:$src2))], + IIC_IMUL32_RMI>; def IMUL64rmi32 : RIi32<0x69, MRMSrcMem, // GR64 = [mem64]*I32 (outs GR64:$dst), (ins i64mem:$src1, i64i32imm:$src2), "imul{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set GR64:$dst, EFLAGS, (X86smul_flag (load addr:$src1), - i64immSExt32:$src2))]>; + i64immSExt32:$src2))], + IIC_IMUL64_RMI>; def IMUL64rmi8 : RIi8<0x6B, MRMSrcMem, // GR64 = [mem64]*I8 (outs GR64:$dst), (ins i64mem:$src1, i64i8imm: $src2), "imul{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set GR64:$dst, EFLAGS, (X86smul_flag (load addr:$src1), - i64immSExt8:$src2))]>; + i64immSExt8:$src2))], + IIC_IMUL64_RMI>; } // Defs = [EFLAGS] @@ -243,62 +268,62 @@ def IMUL64rmi8 : RIi8<0x6B, MRMSrcMem, // GR64 = [mem64]*I8 // unsigned division/remainder let Defs = [AL,EFLAGS,AX], Uses = [AX] in def DIV8r : I<0xF6, MRM6r, (outs), (ins GR8:$src), // AX/r8 = AL,AH - "div{b}\t$src", []>; + "div{b}\t$src", [], IIC_DIV8_REG>; let Defs = [AX,DX,EFLAGS], Uses = [AX,DX] in def DIV16r : I<0xF7, MRM6r, (outs), (ins GR16:$src), // DX:AX/r16 = AX,DX - "div{w}\t$src", []>, OpSize; + "div{w}\t$src", [], IIC_DIV16>, OpSize; let Defs = [EAX,EDX,EFLAGS], Uses = [EAX,EDX] in def DIV32r : I<0xF7, MRM6r, (outs), (ins GR32:$src), // EDX:EAX/r32 = EAX,EDX - "div{l}\t$src", []>; + "div{l}\t$src", [], IIC_DIV32>; // RDX:RAX/r64 = RAX,RDX let Defs = [RAX,RDX,EFLAGS], Uses = [RAX,RDX] in def DIV64r : RI<0xF7, MRM6r, (outs), (ins GR64:$src), - "div{q}\t$src", []>; + "div{q}\t$src", [], IIC_DIV64>; let mayLoad = 1 in { let Defs = [AL,EFLAGS,AX], Uses = 
[AX] in def DIV8m : I<0xF6, MRM6m, (outs), (ins i8mem:$src), // AX/[mem8] = AL,AH - "div{b}\t$src", []>; + "div{b}\t$src", [], IIC_DIV8_MEM>; let Defs = [AX,DX,EFLAGS], Uses = [AX,DX] in def DIV16m : I<0xF7, MRM6m, (outs), (ins i16mem:$src), // DX:AX/[mem16] = AX,DX - "div{w}\t$src", []>, OpSize; + "div{w}\t$src", [], IIC_DIV16>, OpSize; let Defs = [EAX,EDX,EFLAGS], Uses = [EAX,EDX] in // EDX:EAX/[mem32] = EAX,EDX def DIV32m : I<0xF7, MRM6m, (outs), (ins i32mem:$src), - "div{l}\t$src", []>; + "div{l}\t$src", [], IIC_DIV32>; // RDX:RAX/[mem64] = RAX,RDX let Defs = [RAX,RDX,EFLAGS], Uses = [RAX,RDX] in def DIV64m : RI<0xF7, MRM6m, (outs), (ins i64mem:$src), - "div{q}\t$src", []>; + "div{q}\t$src", [], IIC_DIV64>; } // Signed division/remainder. let Defs = [AL,EFLAGS,AX], Uses = [AX] in def IDIV8r : I<0xF6, MRM7r, (outs), (ins GR8:$src), // AX/r8 = AL,AH - "idiv{b}\t$src", []>; + "idiv{b}\t$src", [], IIC_IDIV8>; let Defs = [AX,DX,EFLAGS], Uses = [AX,DX] in def IDIV16r: I<0xF7, MRM7r, (outs), (ins GR16:$src), // DX:AX/r16 = AX,DX - "idiv{w}\t$src", []>, OpSize; + "idiv{w}\t$src", [], IIC_IDIV16>, OpSize; let Defs = [EAX,EDX,EFLAGS], Uses = [EAX,EDX] in def IDIV32r: I<0xF7, MRM7r, (outs), (ins GR32:$src), // EDX:EAX/r32 = EAX,EDX - "idiv{l}\t$src", []>; + "idiv{l}\t$src", [], IIC_IDIV32>; // RDX:RAX/r64 = RAX,RDX let Defs = [RAX,RDX,EFLAGS], Uses = [RAX,RDX] in def IDIV64r: RI<0xF7, MRM7r, (outs), (ins GR64:$src), - "idiv{q}\t$src", []>; - -let mayLoad = 1, mayLoad = 1 in { + "idiv{q}\t$src", [], IIC_IDIV64>; + +let mayLoad = 1 in { let Defs = [AL,EFLAGS,AX], Uses = [AX] in def IDIV8m : I<0xF6, MRM7m, (outs), (ins i8mem:$src), // AX/[mem8] = AL,AH - "idiv{b}\t$src", []>; + "idiv{b}\t$src", [], IIC_IDIV8>; let Defs = [AX,DX,EFLAGS], Uses = [AX,DX] in def IDIV16m: I<0xF7, MRM7m, (outs), (ins i16mem:$src), // DX:AX/[mem16] = AX,DX - "idiv{w}\t$src", []>, OpSize; + "idiv{w}\t$src", [], IIC_IDIV16>, OpSize; let Defs = [EAX,EDX,EFLAGS], Uses = [EAX,EDX] in // EDX:EAX/[mem32] = EAX,EDX def IDIV32m: I<0xF7, MRM7m, (outs), (ins i32mem:$src), - "idiv{l}\t$src", []>; + "idiv{l}\t$src", [], IIC_IDIV32>; let Defs = [RAX,RDX,EFLAGS], Uses = [RAX,RDX] in // RDX:RAX/[mem64] = RAX,RDX def IDIV64m: RI<0xF7, MRM7m, (outs), (ins i64mem:$src), - "idiv{q}\t$src", []>; + "idiv{q}\t$src", [], IIC_IDIV64>; } //===----------------------------------------------------------------------===// @@ -312,35 +337,35 @@ let Constraints = "$src1 = $dst" in { def NEG8r : I<0xF6, MRM3r, (outs GR8 :$dst), (ins GR8 :$src1), "neg{b}\t$dst", [(set GR8:$dst, (ineg GR8:$src1)), - (implicit EFLAGS)]>; + (implicit EFLAGS)], IIC_UNARY_REG>; def NEG16r : I<0xF7, MRM3r, (outs GR16:$dst), (ins GR16:$src1), "neg{w}\t$dst", [(set GR16:$dst, (ineg GR16:$src1)), - (implicit EFLAGS)]>, OpSize; + (implicit EFLAGS)], IIC_UNARY_REG>, OpSize; def NEG32r : I<0xF7, MRM3r, (outs GR32:$dst), (ins GR32:$src1), "neg{l}\t$dst", [(set GR32:$dst, (ineg GR32:$src1)), - (implicit EFLAGS)]>; + (implicit EFLAGS)], IIC_UNARY_REG>; def NEG64r : RI<0xF7, MRM3r, (outs GR64:$dst), (ins GR64:$src1), "neg{q}\t$dst", [(set GR64:$dst, (ineg GR64:$src1)), - (implicit EFLAGS)]>; + (implicit EFLAGS)], IIC_UNARY_REG>; } // Constraints = "$src1 = $dst" def NEG8m : I<0xF6, MRM3m, (outs), (ins i8mem :$dst), "neg{b}\t$dst", [(store (ineg (loadi8 addr:$dst)), addr:$dst), - (implicit EFLAGS)]>; + (implicit EFLAGS)], IIC_UNARY_MEM>; def NEG16m : I<0xF7, MRM3m, (outs), (ins i16mem:$dst), "neg{w}\t$dst", [(store (ineg (loadi16 addr:$dst)), addr:$dst), - (implicit EFLAGS)]>, OpSize; + 
(implicit EFLAGS)], IIC_UNARY_MEM>, OpSize; def NEG32m : I<0xF7, MRM3m, (outs), (ins i32mem:$dst), "neg{l}\t$dst", [(store (ineg (loadi32 addr:$dst)), addr:$dst), - (implicit EFLAGS)]>; + (implicit EFLAGS)], IIC_UNARY_MEM>; def NEG64m : RI<0xF7, MRM3m, (outs), (ins i64mem:$dst), "neg{q}\t$dst", [(store (ineg (loadi64 addr:$dst)), addr:$dst), - (implicit EFLAGS)]>; + (implicit EFLAGS)], IIC_UNARY_MEM>; } // Defs = [EFLAGS] @@ -351,29 +376,30 @@ let Constraints = "$src1 = $dst" in { let AddedComplexity = 15 in { def NOT8r : I<0xF6, MRM2r, (outs GR8 :$dst), (ins GR8 :$src1), "not{b}\t$dst", - [(set GR8:$dst, (not GR8:$src1))]>; + [(set GR8:$dst, (not GR8:$src1))], IIC_UNARY_REG>; def NOT16r : I<0xF7, MRM2r, (outs GR16:$dst), (ins GR16:$src1), "not{w}\t$dst", - [(set GR16:$dst, (not GR16:$src1))]>, OpSize; + [(set GR16:$dst, (not GR16:$src1))], IIC_UNARY_REG>, OpSize; def NOT32r : I<0xF7, MRM2r, (outs GR32:$dst), (ins GR32:$src1), "not{l}\t$dst", - [(set GR32:$dst, (not GR32:$src1))]>; + [(set GR32:$dst, (not GR32:$src1))], IIC_UNARY_REG>; def NOT64r : RI<0xF7, MRM2r, (outs GR64:$dst), (ins GR64:$src1), "not{q}\t$dst", - [(set GR64:$dst, (not GR64:$src1))]>; + [(set GR64:$dst, (not GR64:$src1))], IIC_UNARY_REG>; } } // Constraints = "$src1 = $dst" def NOT8m : I<0xF6, MRM2m, (outs), (ins i8mem :$dst), "not{b}\t$dst", - [(store (not (loadi8 addr:$dst)), addr:$dst)]>; + [(store (not (loadi8 addr:$dst)), addr:$dst)], IIC_UNARY_MEM>; def NOT16m : I<0xF7, MRM2m, (outs), (ins i16mem:$dst), "not{w}\t$dst", - [(store (not (loadi16 addr:$dst)), addr:$dst)]>, OpSize; + [(store (not (loadi16 addr:$dst)), addr:$dst)], IIC_UNARY_MEM>, + OpSize; def NOT32m : I<0xF7, MRM2m, (outs), (ins i32mem:$dst), "not{l}\t$dst", - [(store (not (loadi32 addr:$dst)), addr:$dst)]>; + [(store (not (loadi32 addr:$dst)), addr:$dst)], IIC_UNARY_MEM>; def NOT64m : RI<0xF7, MRM2m, (outs), (ins i64mem:$dst), "not{q}\t$dst", - [(store (not (loadi64 addr:$dst)), addr:$dst)]>; + [(store (not (loadi64 addr:$dst)), addr:$dst)], IIC_UNARY_MEM>; } // CodeSize // TODO: inc/dec is slow for P4, but fast for Pentium-M. @@ -382,19 +408,22 @@ let Constraints = "$src1 = $dst" in { let CodeSize = 2 in def INC8r : I<0xFE, MRM0r, (outs GR8 :$dst), (ins GR8 :$src1), "inc{b}\t$dst", - [(set GR8:$dst, EFLAGS, (X86inc_flag GR8:$src1))]>; + [(set GR8:$dst, EFLAGS, (X86inc_flag GR8:$src1))], + IIC_UNARY_REG>; let isConvertibleToThreeAddress = 1, CodeSize = 1 in { // Can xform into LEA. def INC16r : I<0x40, AddRegFrm, (outs GR16:$dst), (ins GR16:$src1), "inc{w}\t$dst", - [(set GR16:$dst, EFLAGS, (X86inc_flag GR16:$src1))]>, + [(set GR16:$dst, EFLAGS, (X86inc_flag GR16:$src1))], IIC_UNARY_REG>, OpSize, Requires<[In32BitMode]>; def INC32r : I<0x40, AddRegFrm, (outs GR32:$dst), (ins GR32:$src1), "inc{l}\t$dst", - [(set GR32:$dst, EFLAGS, (X86inc_flag GR32:$src1))]>, + [(set GR32:$dst, EFLAGS, (X86inc_flag GR32:$src1))], + IIC_UNARY_REG>, Requires<[In32BitMode]>; def INC64r : RI<0xFF, MRM0r, (outs GR64:$dst), (ins GR64:$src1), "inc{q}\t$dst", - [(set GR64:$dst, EFLAGS, (X86inc_flag GR64:$src1))]>; + [(set GR64:$dst, EFLAGS, (X86inc_flag GR64:$src1))], + IIC_UNARY_REG>; } // isConvertibleToThreeAddress = 1, CodeSize = 1 @@ -403,19 +432,23 @@ let isConvertibleToThreeAddress = 1, CodeSize = 2 in { // Can transform into LEA. 
def INC64_16r : I<0xFF, MRM0r, (outs GR16:$dst), (ins GR16:$src1), "inc{w}\t$dst", - [(set GR16:$dst, EFLAGS, (X86inc_flag GR16:$src1))]>, + [(set GR16:$dst, EFLAGS, (X86inc_flag GR16:$src1))], + IIC_UNARY_REG>, OpSize, Requires<[In64BitMode]>; def INC64_32r : I<0xFF, MRM0r, (outs GR32:$dst), (ins GR32:$src1), "inc{l}\t$dst", - [(set GR32:$dst, EFLAGS, (X86inc_flag GR32:$src1))]>, + [(set GR32:$dst, EFLAGS, (X86inc_flag GR32:$src1))], + IIC_UNARY_REG>, Requires<[In64BitMode]>; def DEC64_16r : I<0xFF, MRM1r, (outs GR16:$dst), (ins GR16:$src1), "dec{w}\t$dst", - [(set GR16:$dst, EFLAGS, (X86dec_flag GR16:$src1))]>, + [(set GR16:$dst, EFLAGS, (X86dec_flag GR16:$src1))], + IIC_UNARY_REG>, OpSize, Requires<[In64BitMode]>; def DEC64_32r : I<0xFF, MRM1r, (outs GR32:$dst), (ins GR32:$src1), "dec{l}\t$dst", - [(set GR32:$dst, EFLAGS, (X86dec_flag GR32:$src1))]>, + [(set GR32:$dst, EFLAGS, (X86dec_flag GR32:$src1))], + IIC_UNARY_REG>, Requires<[In64BitMode]>; } // isConvertibleToThreeAddress = 1, CodeSize = 2 @@ -424,37 +457,37 @@ def DEC64_32r : I<0xFF, MRM1r, (outs GR32:$dst), (ins GR32:$src1), let CodeSize = 2 in { def INC8m : I<0xFE, MRM0m, (outs), (ins i8mem :$dst), "inc{b}\t$dst", [(store (add (loadi8 addr:$dst), 1), addr:$dst), - (implicit EFLAGS)]>; + (implicit EFLAGS)], IIC_UNARY_MEM>; def INC16m : I<0xFF, MRM0m, (outs), (ins i16mem:$dst), "inc{w}\t$dst", [(store (add (loadi16 addr:$dst), 1), addr:$dst), - (implicit EFLAGS)]>, + (implicit EFLAGS)], IIC_UNARY_MEM>, OpSize, Requires<[In32BitMode]>; def INC32m : I<0xFF, MRM0m, (outs), (ins i32mem:$dst), "inc{l}\t$dst", [(store (add (loadi32 addr:$dst), 1), addr:$dst), - (implicit EFLAGS)]>, + (implicit EFLAGS)], IIC_UNARY_MEM>, Requires<[In32BitMode]>; def INC64m : RI<0xFF, MRM0m, (outs), (ins i64mem:$dst), "inc{q}\t$dst", [(store (add (loadi64 addr:$dst), 1), addr:$dst), - (implicit EFLAGS)]>; + (implicit EFLAGS)], IIC_UNARY_MEM>; // These are duplicates of their 32-bit counterparts. Only needed so X86 knows // how to unfold them. // FIXME: What is this for?? def INC64_16m : I<0xFF, MRM0m, (outs), (ins i16mem:$dst), "inc{w}\t$dst", [(store (add (loadi16 addr:$dst), 1), addr:$dst), - (implicit EFLAGS)]>, + (implicit EFLAGS)], IIC_UNARY_MEM>, OpSize, Requires<[In64BitMode]>; def INC64_32m : I<0xFF, MRM0m, (outs), (ins i32mem:$dst), "inc{l}\t$dst", [(store (add (loadi32 addr:$dst), 1), addr:$dst), - (implicit EFLAGS)]>, + (implicit EFLAGS)], IIC_UNARY_MEM>, Requires<[In64BitMode]>; def DEC64_16m : I<0xFF, MRM1m, (outs), (ins i16mem:$dst), "dec{w}\t$dst", [(store (add (loadi16 addr:$dst), -1), addr:$dst), - (implicit EFLAGS)]>, + (implicit EFLAGS)], IIC_UNARY_MEM>, OpSize, Requires<[In64BitMode]>; def DEC64_32m : I<0xFF, MRM1m, (outs), (ins i32mem:$dst), "dec{l}\t$dst", [(store (add (loadi32 addr:$dst), -1), addr:$dst), - (implicit EFLAGS)]>, + (implicit EFLAGS)], IIC_UNARY_MEM>, Requires<[In64BitMode]>; } // CodeSize = 2 @@ -462,18 +495,22 @@ let Constraints = "$src1 = $dst" in { let CodeSize = 2 in def DEC8r : I<0xFE, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1), "dec{b}\t$dst", - [(set GR8:$dst, EFLAGS, (X86dec_flag GR8:$src1))]>; + [(set GR8:$dst, EFLAGS, (X86dec_flag GR8:$src1))], + IIC_UNARY_REG>; let isConvertibleToThreeAddress = 1, CodeSize = 1 in { // Can xform into LEA. 
def DEC16r : I<0x48, AddRegFrm, (outs GR16:$dst), (ins GR16:$src1), "dec{w}\t$dst", - [(set GR16:$dst, EFLAGS, (X86dec_flag GR16:$src1))]>, + [(set GR16:$dst, EFLAGS, (X86dec_flag GR16:$src1))], + IIC_UNARY_REG>, OpSize, Requires<[In32BitMode]>; def DEC32r : I<0x48, AddRegFrm, (outs GR32:$dst), (ins GR32:$src1), "dec{l}\t$dst", - [(set GR32:$dst, EFLAGS, (X86dec_flag GR32:$src1))]>, + [(set GR32:$dst, EFLAGS, (X86dec_flag GR32:$src1))], + IIC_UNARY_REG>, Requires<[In32BitMode]>; def DEC64r : RI<0xFF, MRM1r, (outs GR64:$dst), (ins GR64:$src1), "dec{q}\t$dst", - [(set GR64:$dst, EFLAGS, (X86dec_flag GR64:$src1))]>; + [(set GR64:$dst, EFLAGS, (X86dec_flag GR64:$src1))], + IIC_UNARY_REG>; } // CodeSize = 2 } // Constraints = "$src1 = $dst" @@ -481,18 +518,18 @@ def DEC64r : RI<0xFF, MRM1r, (outs GR64:$dst), (ins GR64:$src1), "dec{q}\t$dst", let CodeSize = 2 in { def DEC8m : I<0xFE, MRM1m, (outs), (ins i8mem :$dst), "dec{b}\t$dst", [(store (add (loadi8 addr:$dst), -1), addr:$dst), - (implicit EFLAGS)]>; + (implicit EFLAGS)], IIC_UNARY_MEM>; def DEC16m : I<0xFF, MRM1m, (outs), (ins i16mem:$dst), "dec{w}\t$dst", [(store (add (loadi16 addr:$dst), -1), addr:$dst), - (implicit EFLAGS)]>, + (implicit EFLAGS)], IIC_UNARY_MEM>, OpSize, Requires<[In32BitMode]>; def DEC32m : I<0xFF, MRM1m, (outs), (ins i32mem:$dst), "dec{l}\t$dst", [(store (add (loadi32 addr:$dst), -1), addr:$dst), - (implicit EFLAGS)]>, + (implicit EFLAGS)], IIC_UNARY_MEM>, Requires<[In32BitMode]>; def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst", [(store (add (loadi64 addr:$dst), -1), addr:$dst), - (implicit EFLAGS)]>; + (implicit EFLAGS)], IIC_UNARY_MEM>; } // CodeSize = 2 } // Defs = [EFLAGS] @@ -588,11 +625,13 @@ def Xi64 : X86TypeInfo<i64, "q", GR64, loadi64, i64mem, /// 4. Infers whether the low bit of the opcode should be 0 (for i8 operations) /// or 1 (for i16,i32,i64 operations). class ITy<bits<8> opcode, Format f, X86TypeInfo typeinfo, dag outs, dag ins, - string mnemonic, string args, list<dag> pattern> + string mnemonic, string args, list<dag> pattern, + InstrItinClass itin = IIC_BIN_NONMEM> : I<{opcode{7}, opcode{6}, opcode{5}, opcode{4}, opcode{3}, opcode{2}, opcode{1}, typeinfo.HasOddOpcode }, f, outs, ins, - !strconcat(mnemonic, "{", typeinfo.InstrSuffix, "}\t", args), pattern> { + !strconcat(mnemonic, "{", typeinfo.InstrSuffix, "}\t", args), pattern, + itin> { // Infer instruction prefixes from type info. let hasOpSizePrefix = typeinfo.HasOpSizePrefix; @@ -601,10 +640,11 @@ class ITy<bits<8> opcode, Format f, X86TypeInfo typeinfo, dag outs, dag ins, // BinOpRR - Instructions like "add reg, reg, reg". class BinOpRR<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, - dag outlist, list<dag> pattern, Format f = MRMDestReg> + dag outlist, list<dag> pattern, InstrItinClass itin, + Format f = MRMDestReg> : ITy<opcode, f, typeinfo, outlist, (ins typeinfo.RegClass:$src1, typeinfo.RegClass:$src2), - mnemonic, "{$src2, $src1|$src1, $src2}", pattern>; + mnemonic, "{$src2, $src1|$src1, $src2}", pattern, itin>; // BinOpRR_R - Instructions like "add reg, reg, reg", where the pattern has // just a regclass (no eflags) as a result. 
@@ -612,7 +652,8 @@ class BinOpRR_R<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, SDNode opnode> : BinOpRR<opcode, mnemonic, typeinfo, (outs typeinfo.RegClass:$dst), [(set typeinfo.RegClass:$dst, - (opnode typeinfo.RegClass:$src1, typeinfo.RegClass:$src2))]>; + (opnode typeinfo.RegClass:$src1, typeinfo.RegClass:$src2))], + IIC_BIN_NONMEM>; // BinOpRR_F - Instructions like "cmp reg, Reg", where the pattern has // just a EFLAGS as a result. @@ -621,7 +662,7 @@ class BinOpRR_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, : BinOpRR<opcode, mnemonic, typeinfo, (outs), [(set EFLAGS, (opnode typeinfo.RegClass:$src1, typeinfo.RegClass:$src2))], - f>; + IIC_BIN_NONMEM, f>; // BinOpRR_RF - Instructions like "add reg, reg, reg", where the pattern has // both a regclass and EFLAGS as a result. @@ -629,7 +670,8 @@ class BinOpRR_RF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, SDNode opnode> : BinOpRR<opcode, mnemonic, typeinfo, (outs typeinfo.RegClass:$dst), [(set typeinfo.RegClass:$dst, EFLAGS, - (opnode typeinfo.RegClass:$src1, typeinfo.RegClass:$src2))]>; + (opnode typeinfo.RegClass:$src1, typeinfo.RegClass:$src2))], + IIC_BIN_NONMEM>; // BinOpRR_RFF - Instructions like "adc reg, reg, reg", where the pattern has // both a regclass and EFLAGS as a result, and has EFLAGS as input. @@ -638,14 +680,14 @@ class BinOpRR_RFF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, : BinOpRR<opcode, mnemonic, typeinfo, (outs typeinfo.RegClass:$dst), [(set typeinfo.RegClass:$dst, EFLAGS, (opnode typeinfo.RegClass:$src1, typeinfo.RegClass:$src2, - EFLAGS))]>; + EFLAGS))], IIC_BIN_NONMEM>; // BinOpRR_Rev - Instructions like "add reg, reg, reg" (reversed encoding). class BinOpRR_Rev<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo> : ITy<opcode, MRMSrcReg, typeinfo, (outs typeinfo.RegClass:$dst), (ins typeinfo.RegClass:$src1, typeinfo.RegClass:$src2), - mnemonic, "{$src2, $dst|$dst, $src2}", []> { + mnemonic, "{$src2, $dst|$dst, $src2}", [], IIC_BIN_NONMEM> { // The disassembler should know about this, but not the asmparser. let isCodeGenOnly = 1; } @@ -654,7 +696,7 @@ class BinOpRR_Rev<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo> class BinOpRR_F_Rev<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo> : ITy<opcode, MRMSrcReg, typeinfo, (outs), (ins typeinfo.RegClass:$src1, typeinfo.RegClass:$src2), - mnemonic, "{$src2, $src1|$src1, $src2}", []> { + mnemonic, "{$src2, $src1|$src1, $src2}", [], IIC_BIN_NONMEM> { // The disassembler should know about this, but not the asmparser. let isCodeGenOnly = 1; } @@ -664,7 +706,7 @@ class BinOpRM<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, dag outlist, list<dag> pattern> : ITy<opcode, MRMSrcMem, typeinfo, outlist, (ins typeinfo.RegClass:$src1, typeinfo.MemOperand:$src2), - mnemonic, "{$src2, $src1|$src1, $src2}", pattern>; + mnemonic, "{$src2, $src1|$src1, $src2}", pattern, IIC_BIN_NONMEM>; // BinOpRM_R - Instructions like "add reg, reg, [mem]". 
class BinOpRM_R<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, @@ -700,7 +742,7 @@ class BinOpRI<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, Format f, dag outlist, list<dag> pattern> : ITy<opcode, f, typeinfo, outlist, (ins typeinfo.RegClass:$src1, typeinfo.ImmOperand:$src2), - mnemonic, "{$src2, $src1|$src1, $src2}", pattern> { + mnemonic, "{$src2, $src1|$src1, $src2}", pattern, IIC_BIN_NONMEM> { let ImmT = typeinfo.ImmEncoding; } @@ -724,7 +766,6 @@ class BinOpRI_RF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, : BinOpRI<opcode, mnemonic, typeinfo, f, (outs typeinfo.RegClass:$dst), [(set typeinfo.RegClass:$dst, EFLAGS, (opnode typeinfo.RegClass:$src1, typeinfo.ImmOperator:$src2))]>; - // BinOpRI_RFF - Instructions like "adc reg, reg, imm". class BinOpRI_RFF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, SDNode opnode, Format f> @@ -738,7 +779,7 @@ class BinOpRI8<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, Format f, dag outlist, list<dag> pattern> : ITy<opcode, f, typeinfo, outlist, (ins typeinfo.RegClass:$src1, typeinfo.Imm8Operand:$src2), - mnemonic, "{$src2, $src1|$src1, $src2}", pattern> { + mnemonic, "{$src2, $src1|$src1, $src2}", pattern, IIC_BIN_NONMEM> { let ImmT = Imm8; // Always 8-bit immediate. } @@ -776,7 +817,7 @@ class BinOpMR<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, list<dag> pattern> : ITy<opcode, MRMDestMem, typeinfo, (outs), (ins typeinfo.MemOperand:$dst, typeinfo.RegClass:$src), - mnemonic, "{$src, $dst|$dst, $src}", pattern>; + mnemonic, "{$src, $dst|$dst, $src}", pattern, IIC_BIN_MEM>; // BinOpMR_RMW - Instructions like "add [mem], reg". class BinOpMR_RMW<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, @@ -804,7 +845,7 @@ class BinOpMI<string mnemonic, X86TypeInfo typeinfo, Format f, list<dag> pattern, bits<8> opcode = 0x80> : ITy<opcode, f, typeinfo, (outs), (ins typeinfo.MemOperand:$dst, typeinfo.ImmOperand:$src), - mnemonic, "{$src, $dst|$dst, $src}", pattern> { + mnemonic, "{$src, $dst|$dst, $src}", pattern, IIC_BIN_MEM> { let ImmT = typeinfo.ImmEncoding; } @@ -815,7 +856,6 @@ class BinOpMI_RMW<string mnemonic, X86TypeInfo typeinfo, [(store (opnode (typeinfo.VT (load addr:$dst)), typeinfo.ImmOperator:$src), addr:$dst), (implicit EFLAGS)]>; - // BinOpMI_RMW_FF - Instructions like "adc [mem], imm". class BinOpMI_RMW_FF<string mnemonic, X86TypeInfo typeinfo, SDNode opnode, Format f> @@ -837,7 +877,7 @@ class BinOpMI8<string mnemonic, X86TypeInfo typeinfo, Format f, list<dag> pattern> : ITy<0x82, f, typeinfo, (outs), (ins typeinfo.MemOperand:$dst, typeinfo.Imm8Operand:$src), - mnemonic, "{$src, $dst|$dst, $src}", pattern> { + mnemonic, "{$src, $dst|$dst, $src}", pattern, IIC_BIN_MEM> { let ImmT = Imm8; // Always 8-bit immediate. } @@ -1150,7 +1190,7 @@ let Defs = [EFLAGS] in { // register class is constrained to GR8_NOREX. 
let isPseudo = 1 in def TEST8ri_NOREX : I<0, Pseudo, (outs), (ins GR8_NOREX:$src, i8imm:$mask), - "", []>; + "", [], IIC_BIN_NONMEM>; } //===----------------------------------------------------------------------===// @@ -1160,14 +1200,39 @@ multiclass bmi_andn<string mnemonic, RegisterClass RC, X86MemOperand x86memop, PatFrag ld_frag> { def rr : I<0xF2, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set RC:$dst, EFLAGS, (X86andn_flag RC:$src1, RC:$src2))]>; + [(set RC:$dst, EFLAGS, (X86andn_flag RC:$src1, RC:$src2))], + IIC_BIN_NONMEM>; def rm : I<0xF2, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2), !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set RC:$dst, EFLAGS, - (X86andn_flag RC:$src1, (ld_frag addr:$src2)))]>; + (X86andn_flag RC:$src1, (ld_frag addr:$src2)))], IIC_BIN_MEM>; } let Predicates = [HasBMI], Defs = [EFLAGS] in { defm ANDN32 : bmi_andn<"andn{l}", GR32, i32mem, loadi32>, T8, VEX_4V; defm ANDN64 : bmi_andn<"andn{q}", GR64, i64mem, loadi64>, T8, VEX_4V, VEX_W; } + +//===----------------------------------------------------------------------===// +// MULX Instruction +// +multiclass bmi_mulx<string mnemonic, RegisterClass RC, X86MemOperand x86memop> { +let neverHasSideEffects = 1 in { + let isCommutable = 1 in + def rr : I<0xF6, MRMSrcReg, (outs RC:$dst1, RC:$dst2), (ins RC:$src), + !strconcat(mnemonic, "\t{$src, $dst2, $dst1|$dst1, $dst2, $src}"), + [], IIC_MUL8>, T8XD, VEX_4V; + + let mayLoad = 1 in + def rm : I<0xF6, MRMSrcMem, (outs RC:$dst1, RC:$dst2), (ins x86memop:$src), + !strconcat(mnemonic, "\t{$src, $dst2, $dst1|$dst1, $dst2, $src}"), + [], IIC_MUL8>, T8XD, VEX_4V; +} +} + +let Predicates = [HasBMI2] in { + let Uses = [EDX] in + defm MULX32 : bmi_mulx<"mulx{l}", GR32, i32mem>; + let Uses = [RDX] in + defm MULX64 : bmi_mulx<"mulx{q}", GR64, i64mem>, VEX_W; +} diff --git a/contrib/llvm/lib/Target/X86/X86InstrBuilder.h b/contrib/llvm/lib/Target/X86/X86InstrBuilder.h index 0245e5c..fa1d676 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrBuilder.h +++ b/contrib/llvm/lib/Target/X86/X86InstrBuilder.h @@ -27,7 +27,6 @@ #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineMemOperand.h" -#include "llvm/CodeGen/PseudoSourceValue.h" namespace llvm { diff --git a/contrib/llvm/lib/Target/X86/X86InstrCMovSetCC.td b/contrib/llvm/lib/Target/X86/X86InstrCMovSetCC.td index 3a43b22..adeaf54 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrCMovSetCC.td +++ b/contrib/llvm/lib/Target/X86/X86InstrCMovSetCC.td @@ -1,10 +1,10 @@ -//===- X86InstrCMovSetCC.td - Conditional Move and SetCC ---*- tablegen -*-===// -// +//===-- X86InstrCMovSetCC.td - Conditional Move and SetCC --*- tablegen -*-===// +// // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. 
-// +// //===----------------------------------------------------------------------===// // // This file describes the X86 conditional move and set on condition @@ -21,17 +21,20 @@ multiclass CMOV<bits<8> opc, string Mnemonic, PatLeaf CondNode> { : I<opc, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2), !strconcat(Mnemonic, "{w}\t{$src2, $dst|$dst, $src2}"), [(set GR16:$dst, - (X86cmov GR16:$src1, GR16:$src2, CondNode, EFLAGS))]>,TB,OpSize; + (X86cmov GR16:$src1, GR16:$src2, CondNode, EFLAGS))], + IIC_CMOV16_RR>,TB,OpSize; def #NAME#32rr : I<opc, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2), !strconcat(Mnemonic, "{l}\t{$src2, $dst|$dst, $src2}"), [(set GR32:$dst, - (X86cmov GR32:$src1, GR32:$src2, CondNode, EFLAGS))]>, TB; + (X86cmov GR32:$src1, GR32:$src2, CondNode, EFLAGS))], + IIC_CMOV32_RR>, TB; def #NAME#64rr :RI<opc, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2), !strconcat(Mnemonic, "{q}\t{$src2, $dst|$dst, $src2}"), [(set GR64:$dst, - (X86cmov GR64:$src1, GR64:$src2, CondNode, EFLAGS))]>, TB; + (X86cmov GR64:$src1, GR64:$src2, CondNode, EFLAGS))], + IIC_CMOV32_RR>, TB; } let Uses = [EFLAGS], Predicates = [HasCMov], Constraints = "$src1 = $dst" in { @@ -39,17 +42,18 @@ multiclass CMOV<bits<8> opc, string Mnemonic, PatLeaf CondNode> { : I<opc, MRMSrcMem, (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2), !strconcat(Mnemonic, "{w}\t{$src2, $dst|$dst, $src2}"), [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2), - CondNode, EFLAGS))]>, TB, OpSize; + CondNode, EFLAGS))], IIC_CMOV16_RM>, + TB, OpSize; def #NAME#32rm : I<opc, MRMSrcMem, (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2), !strconcat(Mnemonic, "{l}\t{$src2, $dst|$dst, $src2}"), [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2), - CondNode, EFLAGS))]>, TB; + CondNode, EFLAGS))], IIC_CMOV32_RM>, TB; def #NAME#64rm :RI<opc, MRMSrcMem, (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2), !strconcat(Mnemonic, "{q}\t{$src2, $dst|$dst, $src2}"), [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2), - CondNode, EFLAGS))]>, TB; + CondNode, EFLAGS))], IIC_CMOV32_RM>, TB; } // Uses = [EFLAGS], Predicates = [HasCMov], Constraints = "$src1 = $dst" } // end multiclass @@ -78,10 +82,12 @@ multiclass SETCC<bits<8> opc, string Mnemonic, PatLeaf OpNode> { let Uses = [EFLAGS] in { def r : I<opc, MRM0r, (outs GR8:$dst), (ins), !strconcat(Mnemonic, "\t$dst"), - [(set GR8:$dst, (X86setcc OpNode, EFLAGS))]>, TB; + [(set GR8:$dst, (X86setcc OpNode, EFLAGS))], + IIC_SET_R>, TB; def m : I<opc, MRM0m, (outs), (ins i8mem:$dst), !strconcat(Mnemonic, "\t$dst"), - [(store (X86setcc OpNode, EFLAGS), addr:$dst)]>, TB; + [(store (X86setcc OpNode, EFLAGS), addr:$dst)], + IIC_SET_M>, TB; } // Uses = [EFLAGS] } diff --git a/contrib/llvm/lib/Target/X86/X86InstrCompiler.td b/contrib/llvm/lib/Target/X86/X86InstrCompiler.td index 612b2fa..6f9e849 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrCompiler.td +++ b/contrib/llvm/lib/Target/X86/X86InstrCompiler.td @@ -112,23 +112,39 @@ let Defs = [EAX, ESP, EFLAGS], Uses = [ESP] in // allocated by bumping the stack pointer. Otherwise memory is allocated from // the heap. 
-let Defs = [EAX, ESP, EFLAGS], Uses = [ESP, EAX] in +let Defs = [EAX, ESP, EFLAGS], Uses = [ESP] in def SEG_ALLOCA_32 : I<0, Pseudo, (outs GR32:$dst), (ins GR32:$size), "# variable sized alloca for segmented stacks", [(set GR32:$dst, (X86SegAlloca GR32:$size))]>, Requires<[In32BitMode]>; -let Defs = [RAX, RSP, EFLAGS], Uses = [RSP, RAX] in +let Defs = [RAX, RSP, EFLAGS], Uses = [RSP] in def SEG_ALLOCA_64 : I<0, Pseudo, (outs GR64:$dst), (ins GR64:$size), "# variable sized alloca for segmented stacks", [(set GR64:$dst, (X86SegAlloca GR64:$size))]>, Requires<[In64BitMode]>; - } +// The MSVC runtime contains an _ftol2 routine for converting floating-point +// to integer values. It has a strange calling convention: the input is +// popped from the x87 stack, and the return value is given in EDX:EAX. No +// other registers (aside from flags) are touched. +// Microsoft toolchains do not support 80-bit precision, so a WIN_FTOL_80 +// variant is unnecessary. + +let Defs = [EAX, EDX, EFLAGS], FPForm = SpecialFP in { + def WIN_FTOL_32 : I<0, Pseudo, (outs), (ins RFP32:$src), + "# win32 fptoui", + [(X86WinFTOL RFP32:$src)]>, + Requires<[In32BitMode]>; + def WIN_FTOL_64 : I<0, Pseudo, (outs), (ins RFP64:$src), + "# win32 fptoui", + [(X86WinFTOL RFP64:$src)]>, + Requires<[In32BitMode]>; +} //===----------------------------------------------------------------------===// // EH Pseudo Instructions @@ -137,7 +153,7 @@ let isTerminator = 1, isReturn = 1, isBarrier = 1, hasCtrlDep = 1, isCodeGenOnly = 1 in { def EH_RETURN : I<0xC3, RawFrm, (outs), (ins GR32:$addr), "ret\t#eh_return, addr: $addr", - [(X86ehret GR32:$addr)]>; + [(X86ehret GR32:$addr)], IIC_RET>; } @@ -145,8 +161,26 @@ let isTerminator = 1, isReturn = 1, isBarrier = 1, hasCtrlDep = 1, isCodeGenOnly = 1 in { def EH_RETURN64 : I<0xC3, RawFrm, (outs), (ins GR64:$addr), "ret\t#eh_return, addr: $addr", - [(X86ehret GR64:$addr)]>; + [(X86ehret GR64:$addr)], IIC_RET>; + +} + +//===----------------------------------------------------------------------===// +// Pseudo instructions used by segmented stacks. +// +// This is lowered into a RET instruction by MCInstLower. We need +// this so that we don't have to have a MachineBasicBlock which ends +// with a RET and also has successors. +let isPseudo = 1 in { +def MORESTACK_RET: I<0, Pseudo, (outs), (ins), + "", []>; + +// This instruction is lowered to a RET followed by a MOV. The two +// instructions are not generated on a higher level since then the +// verifier sees a MachineBasicBlock ending with a non-terminator. +def MORESTACK_RET_RESTORE_R10 : I<0, Pseudo, (outs), (ins), + "", []>; } //===----------------------------------------------------------------------===// @@ -159,7 +193,7 @@ def EH_RETURN64 : I<0xC3, RawFrm, (outs), (ins GR64:$addr), let Defs = [EFLAGS], isReMaterializable = 1, isAsCheapAsAMove = 1, isCodeGenOnly = 1 in { def MOV8r0 : I<0x30, MRMInitReg, (outs GR8 :$dst), (ins), "", - [(set GR8:$dst, 0)]>; + [(set GR8:$dst, 0)], IIC_ALU_NONMEM>; // We want to rewrite MOV16r0 in terms of MOV32r0, because it's a smaller // encoding and avoids a partial-register update sometimes, but doing so @@ -168,11 +202,11 @@ def MOV8r0 : I<0x30, MRMInitReg, (outs GR8 :$dst), (ins), "", // to an MCInst. def MOV16r0 : I<0x31, MRMInitReg, (outs GR16:$dst), (ins), "", - [(set GR16:$dst, 0)]>, OpSize; + [(set GR16:$dst, 0)], IIC_ALU_NONMEM>, OpSize; // FIXME: Set encoding to pseudo. 
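A brief aside, not part of the patch: at the source level, the case the WIN_FTOL_32/WIN_FTOL_64 pseudos above exist for is a float or double converted to an unsigned 64-bit integer on 32-bit Windows. A minimal sketch under that assumption (the function name is hypothetical):

// Lowered through the WIN_FTOL path described above: the input is passed to
// _ftol2 on the x87 stack and the 64-bit result comes back in EDX:EAX.
unsigned long long to_u64(double d) {
  return static_cast<unsigned long long>(d);
}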
def MOV32r0 : I<0x31, MRMInitReg, (outs GR32:$dst), (ins), "", - [(set GR32:$dst, 0)]>; + [(set GR32:$dst, 0)], IIC_ALU_NONMEM>; } // We want to rewrite MOV64r0 in terms of MOV32r0, because it's sometimes a @@ -184,7 +218,7 @@ def MOV32r0 : I<0x31, MRMInitReg, (outs GR32:$dst), (ins), "", let Defs = [EFLAGS], isCodeGenOnly=1, AddedComplexity = 1, isReMaterializable = 1, isAsCheapAsAMove = 1 in def MOV64r0 : I<0x31, MRMInitReg, (outs GR64:$dst), (ins), "", - [(set GR64:$dst, 0)]>; + [(set GR64:$dst, 0)], IIC_ALU_NONMEM>; // Materialize i64 constant where top 32-bits are zero. This could theoretically // use MOV32ri with a SUBREG_TO_REG to represent the zero-extension, however @@ -192,7 +226,8 @@ def MOV64r0 : I<0x31, MRMInitReg, (outs GR64:$dst), (ins), "", let AddedComplexity = 1, isReMaterializable = 1, isAsCheapAsAMove = 1, isCodeGenOnly = 1 in def MOV64ri64i32 : Ii32<0xB8, AddRegFrm, (outs GR64:$dst), (ins i64i32imm:$src), - "", [(set GR64:$dst, i64immZExt32:$src)]>; + "", [(set GR64:$dst, i64immZExt32:$src)], + IIC_ALU_NONMEM>; // Use sbb to materialize carry bit. let Uses = [EFLAGS], Defs = [EFLAGS], isCodeGenOnly = 1 in { @@ -202,14 +237,18 @@ let Uses = [EFLAGS], Defs = [EFLAGS], isCodeGenOnly = 1 in { // FIXME: Change these to have encoding Pseudo when X86MCCodeEmitter replaces // X86CodeEmitter. def SETB_C8r : I<0x18, MRMInitReg, (outs GR8:$dst), (ins), "", - [(set GR8:$dst, (X86setcc_c X86_COND_B, EFLAGS))]>; + [(set GR8:$dst, (X86setcc_c X86_COND_B, EFLAGS))], + IIC_ALU_NONMEM>; def SETB_C16r : I<0x19, MRMInitReg, (outs GR16:$dst), (ins), "", - [(set GR16:$dst, (X86setcc_c X86_COND_B, EFLAGS))]>, + [(set GR16:$dst, (X86setcc_c X86_COND_B, EFLAGS))], + IIC_ALU_NONMEM>, OpSize; def SETB_C32r : I<0x19, MRMInitReg, (outs GR32:$dst), (ins), "", - [(set GR32:$dst, (X86setcc_c X86_COND_B, EFLAGS))]>; + [(set GR32:$dst, (X86setcc_c X86_COND_B, EFLAGS))], + IIC_ALU_NONMEM>; def SETB_C64r : RI<0x19, MRMInitReg, (outs GR64:$dst), (ins), "", - [(set GR64:$dst, (X86setcc_c X86_COND_B, EFLAGS))]>; + [(set GR64:$dst, (X86setcc_c X86_COND_B, EFLAGS))], + IIC_ALU_NONMEM>; } // isCodeGenOnly @@ -262,34 +301,67 @@ def : Pat<(sub GR64:$op, (i64 (X86setcc_c X86_COND_B, EFLAGS))), // String Pseudo Instructions // let Defs = [ECX,EDI,ESI], Uses = [ECX,EDI,ESI], isCodeGenOnly = 1 in { -def REP_MOVSB : I<0xA4, RawFrm, (outs), (ins), "{rep;movsb|rep movsb}", - [(X86rep_movs i8)]>, REP; -def REP_MOVSW : I<0xA5, RawFrm, (outs), (ins), "{rep;movsw|rep movsw}", - [(X86rep_movs i16)]>, REP, OpSize; -def REP_MOVSD : I<0xA5, RawFrm, (outs), (ins), "{rep;movsl|rep movsd}", - [(X86rep_movs i32)]>, REP; +def REP_MOVSB_32 : I<0xA4, RawFrm, (outs), (ins), "{rep;movsb|rep movsb}", + [(X86rep_movs i8)], IIC_REP_MOVS>, REP, + Requires<[In32BitMode]>; +def REP_MOVSW_32 : I<0xA5, RawFrm, (outs), (ins), "{rep;movsw|rep movsw}", + [(X86rep_movs i16)], IIC_REP_MOVS>, REP, OpSize, + Requires<[In32BitMode]>; +def REP_MOVSD_32 : I<0xA5, RawFrm, (outs), (ins), "{rep;movsl|rep movsd}", + [(X86rep_movs i32)], IIC_REP_MOVS>, REP, + Requires<[In32BitMode]>; } -let Defs = [RCX,RDI,RSI], Uses = [RCX,RDI,RSI], isCodeGenOnly = 1 in -def REP_MOVSQ : RI<0xA5, RawFrm, (outs), (ins), "{rep;movsq|rep movsq}", - [(X86rep_movs i64)]>, REP; - +let Defs = [RCX,RDI,RSI], Uses = [RCX,RDI,RSI], isCodeGenOnly = 1 in { +def REP_MOVSB_64 : I<0xA4, RawFrm, (outs), (ins), "{rep;movsb|rep movsb}", + [(X86rep_movs i8)], IIC_REP_MOVS>, REP, + Requires<[In64BitMode]>; +def REP_MOVSW_64 : I<0xA5, RawFrm, (outs), (ins), "{rep;movsw|rep movsw}", + 
[(X86rep_movs i16)], IIC_REP_MOVS>, REP, OpSize, + Requires<[In64BitMode]>; +def REP_MOVSD_64 : I<0xA5, RawFrm, (outs), (ins), "{rep;movsl|rep movsd}", + [(X86rep_movs i32)], IIC_REP_MOVS>, REP, + Requires<[In64BitMode]>; +def REP_MOVSQ_64 : RI<0xA5, RawFrm, (outs), (ins), "{rep;movsq|rep movsq}", + [(X86rep_movs i64)], IIC_REP_MOVS>, REP, + Requires<[In64BitMode]>; +} // FIXME: Should use "(X86rep_stos AL)" as the pattern. -let Defs = [ECX,EDI], Uses = [AL,ECX,EDI], isCodeGenOnly = 1 in -def REP_STOSB : I<0xAA, RawFrm, (outs), (ins), "{rep;stosb|rep stosb}", - [(X86rep_stos i8)]>, REP; -let Defs = [ECX,EDI], Uses = [AX,ECX,EDI], isCodeGenOnly = 1 in -def REP_STOSW : I<0xAB, RawFrm, (outs), (ins), "{rep;stosw|rep stosw}", - [(X86rep_stos i16)]>, REP, OpSize; -let Defs = [ECX,EDI], Uses = [EAX,ECX,EDI], isCodeGenOnly = 1 in -def REP_STOSD : I<0xAB, RawFrm, (outs), (ins), "{rep;stosl|rep stosd}", - [(X86rep_stos i32)]>, REP; - -let Defs = [RCX,RDI], Uses = [RAX,RCX,RDI], isCodeGenOnly = 1 in -def REP_STOSQ : RI<0xAB, RawFrm, (outs), (ins), "{rep;stosq|rep stosq}", - [(X86rep_stos i64)]>, REP; +let Defs = [ECX,EDI], isCodeGenOnly = 1 in { + let Uses = [AL,ECX,EDI] in + def REP_STOSB_32 : I<0xAA, RawFrm, (outs), (ins), "{rep;stosb|rep stosb}", + [(X86rep_stos i8)], IIC_REP_STOS>, REP, + Requires<[In32BitMode]>; + let Uses = [AX,ECX,EDI] in + def REP_STOSW_32 : I<0xAB, RawFrm, (outs), (ins), "{rep;stosw|rep stosw}", + [(X86rep_stos i16)], IIC_REP_STOS>, REP, OpSize, + Requires<[In32BitMode]>; + let Uses = [EAX,ECX,EDI] in + def REP_STOSD_32 : I<0xAB, RawFrm, (outs), (ins), "{rep;stosl|rep stosd}", + [(X86rep_stos i32)], IIC_REP_STOS>, REP, + Requires<[In32BitMode]>; +} +let Defs = [RCX,RDI], isCodeGenOnly = 1 in { + let Uses = [AL,RCX,RDI] in + def REP_STOSB_64 : I<0xAA, RawFrm, (outs), (ins), "{rep;stosb|rep stosb}", + [(X86rep_stos i8)], IIC_REP_STOS>, REP, + Requires<[In64BitMode]>; + let Uses = [AX,RCX,RDI] in + def REP_STOSW_64 : I<0xAB, RawFrm, (outs), (ins), "{rep;stosw|rep stosw}", + [(X86rep_stos i16)], IIC_REP_STOS>, REP, OpSize, + Requires<[In64BitMode]>; + let Uses = [RAX,RCX,RDI] in + def REP_STOSD_64 : I<0xAB, RawFrm, (outs), (ins), "{rep;stosl|rep stosd}", + [(X86rep_stos i32)], IIC_REP_STOS>, REP, + Requires<[In64BitMode]>; + + let Uses = [RAX,RCX,RDI] in + def REP_STOSQ_64 : RI<0xAB, RawFrm, (outs), (ins), "{rep;stosq|rep stosq}", + [(X86rep_stos i64)], IIC_REP_STOS>, REP, + Requires<[In64BitMode]>; +} //===----------------------------------------------------------------------===// // Thread Local Storage Instructions @@ -537,22 +609,13 @@ let isCodeGenOnly = 1, Defs = [EFLAGS] in def OR32mrLocked : I<0x09, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$zero), "lock\n\t" "or{l}\t{$zero, $dst|$dst, $zero}", - []>, Requires<[In32BitMode]>, LOCK; + [], IIC_ALU_MEM>, Requires<[In32BitMode]>, LOCK; let hasSideEffects = 1 in def Int_MemBarrier : I<0, Pseudo, (outs), (ins), "#MEMBARRIER", [(X86MemBarrier)]>; -// TODO: Get this to fold the constant into the instruction. 
-let hasSideEffects = 1, Defs = [ESP], isCodeGenOnly = 1 in -def Int_MemBarrierNoSSE64 : RI<0x09, MRM1r, (outs), (ins GR64:$zero), - "lock\n\t" - "or{q}\t{$zero, (%rsp)|(%rsp), $zero}", - [(X86MemBarrierNoSSE GR64:$zero)]>, - Requires<[In64BitMode]>, LOCK; - - // RegOpc corresponds to the mr version of the instruction // ImmOpc corresponds to the mi version of the instruction // ImmOpc8 corresponds to the mi8 version of the instruction @@ -566,72 +629,72 @@ def #NAME#8mr : I<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4}, MRMDestMem, (outs), (ins i8mem:$dst, GR8:$src2), !strconcat("lock\n\t", mnemonic, "{b}\t", "{$src2, $dst|$dst, $src2}"), - []>, LOCK; + [], IIC_ALU_NONMEM>, LOCK; def #NAME#16mr : I<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4}, RegOpc{3}, RegOpc{2}, RegOpc{1}, 1 }, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2), !strconcat("lock\n\t", mnemonic, "{w}\t", "{$src2, $dst|$dst, $src2}"), - []>, OpSize, LOCK; + [], IIC_ALU_NONMEM>, OpSize, LOCK; def #NAME#32mr : I<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4}, RegOpc{3}, RegOpc{2}, RegOpc{1}, 1 }, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2), !strconcat("lock\n\t", mnemonic, "{l}\t", "{$src2, $dst|$dst, $src2}"), - []>, LOCK; + [], IIC_ALU_NONMEM>, LOCK; def #NAME#64mr : RI<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4}, RegOpc{3}, RegOpc{2}, RegOpc{1}, 1 }, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2), !strconcat("lock\n\t", mnemonic, "{q}\t", "{$src2, $dst|$dst, $src2}"), - []>, LOCK; + [], IIC_ALU_NONMEM>, LOCK; def #NAME#8mi : Ii8<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4}, ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 0 }, ImmMod, (outs), (ins i8mem :$dst, i8imm :$src2), !strconcat("lock\n\t", mnemonic, "{b}\t", "{$src2, $dst|$dst, $src2}"), - []>, LOCK; + [], IIC_ALU_MEM>, LOCK; def #NAME#16mi : Ii16<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4}, ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 1 }, ImmMod, (outs), (ins i16mem :$dst, i16imm :$src2), !strconcat("lock\n\t", mnemonic, "{w}\t", "{$src2, $dst|$dst, $src2}"), - []>, LOCK; + [], IIC_ALU_MEM>, LOCK; def #NAME#32mi : Ii32<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4}, ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 1 }, ImmMod, (outs), (ins i32mem :$dst, i32imm :$src2), !strconcat("lock\n\t", mnemonic, "{l}\t", "{$src2, $dst|$dst, $src2}"), - []>, LOCK; + [], IIC_ALU_MEM>, LOCK; def #NAME#64mi32 : RIi32<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4}, ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 1 }, ImmMod, (outs), (ins i64mem :$dst, i64i32imm :$src2), !strconcat("lock\n\t", mnemonic, "{q}\t", "{$src2, $dst|$dst, $src2}"), - []>, LOCK; + [], IIC_ALU_MEM>, LOCK; def #NAME#16mi8 : Ii8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4}, ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 }, ImmMod, (outs), (ins i16mem :$dst, i16i8imm :$src2), !strconcat("lock\n\t", mnemonic, "{w}\t", "{$src2, $dst|$dst, $src2}"), - []>, LOCK; + [], IIC_ALU_MEM>, LOCK; def #NAME#32mi8 : Ii8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4}, ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 }, ImmMod, (outs), (ins i32mem :$dst, i32i8imm :$src2), !strconcat("lock\n\t", mnemonic, "{l}\t", "{$src2, $dst|$dst, $src2}"), - []>, LOCK; + [], IIC_ALU_MEM>, LOCK; def #NAME#64mi8 : RIi8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4}, ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 }, ImmMod, (outs), (ins i64mem :$dst, i64i8imm :$src2), !strconcat("lock\n\t", mnemonic, "{q}\t", "{$src2, $dst|$dst, $src2}"), - []>, LOCK; + [], IIC_ALU_MEM>, LOCK; } @@ -648,29 +711,29 @@ let Defs = [EFLAGS], mayLoad = 1, mayStore = 1, isCodeGenOnly = 1 in { def LOCK_INC8m : I<0xFE, MRM0m, (outs), (ins i8mem 
:$dst), "lock\n\t" - "inc{b}\t$dst", []>, LOCK; + "inc{b}\t$dst", [], IIC_UNARY_MEM>, LOCK; def LOCK_INC16m : I<0xFF, MRM0m, (outs), (ins i16mem:$dst), "lock\n\t" - "inc{w}\t$dst", []>, OpSize, LOCK; + "inc{w}\t$dst", [], IIC_UNARY_MEM>, OpSize, LOCK; def LOCK_INC32m : I<0xFF, MRM0m, (outs), (ins i32mem:$dst), "lock\n\t" - "inc{l}\t$dst", []>, LOCK; + "inc{l}\t$dst", [], IIC_UNARY_MEM>, LOCK; def LOCK_INC64m : RI<0xFF, MRM0m, (outs), (ins i64mem:$dst), "lock\n\t" - "inc{q}\t$dst", []>, LOCK; + "inc{q}\t$dst", [], IIC_UNARY_MEM>, LOCK; def LOCK_DEC8m : I<0xFE, MRM1m, (outs), (ins i8mem :$dst), "lock\n\t" - "dec{b}\t$dst", []>, LOCK; + "dec{b}\t$dst", [], IIC_UNARY_MEM>, LOCK; def LOCK_DEC16m : I<0xFF, MRM1m, (outs), (ins i16mem:$dst), "lock\n\t" - "dec{w}\t$dst", []>, OpSize, LOCK; + "dec{w}\t$dst", [], IIC_UNARY_MEM>, OpSize, LOCK; def LOCK_DEC32m : I<0xFF, MRM1m, (outs), (ins i32mem:$dst), "lock\n\t" - "dec{l}\t$dst", []>, LOCK; + "dec{l}\t$dst", [], IIC_UNARY_MEM>, LOCK; def LOCK_DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "lock\n\t" - "dec{q}\t$dst", []>, LOCK; + "dec{q}\t$dst", [], IIC_UNARY_MEM>, LOCK; } // Atomic compare and swap. @@ -679,42 +742,42 @@ let Defs = [EAX, EDX, EFLAGS], Uses = [EAX, EBX, ECX, EDX], def LCMPXCHG8B : I<0xC7, MRM1m, (outs), (ins i64mem:$ptr), "lock\n\t" "cmpxchg8b\t$ptr", - [(X86cas8 addr:$ptr)]>, TB, LOCK; + [(X86cas8 addr:$ptr)], IIC_CMPX_LOCK_8B>, TB, LOCK; let Defs = [RAX, RDX, EFLAGS], Uses = [RAX, RBX, RCX, RDX], isCodeGenOnly = 1 in def LCMPXCHG16B : RI<0xC7, MRM1m, (outs), (ins i128mem:$ptr), "lock\n\t" "cmpxchg16b\t$ptr", - [(X86cas16 addr:$ptr)]>, TB, LOCK, + [(X86cas16 addr:$ptr)], IIC_CMPX_LOCK_16B>, TB, LOCK, Requires<[HasCmpxchg16b]>; let Defs = [AL, EFLAGS], Uses = [AL], isCodeGenOnly = 1 in { def LCMPXCHG8 : I<0xB0, MRMDestMem, (outs), (ins i8mem:$ptr, GR8:$swap), "lock\n\t" "cmpxchg{b}\t{$swap, $ptr|$ptr, $swap}", - [(X86cas addr:$ptr, GR8:$swap, 1)]>, TB, LOCK; + [(X86cas addr:$ptr, GR8:$swap, 1)], IIC_CMPX_LOCK_8>, TB, LOCK; } let Defs = [AX, EFLAGS], Uses = [AX], isCodeGenOnly = 1 in { def LCMPXCHG16 : I<0xB1, MRMDestMem, (outs), (ins i16mem:$ptr, GR16:$swap), "lock\n\t" "cmpxchg{w}\t{$swap, $ptr|$ptr, $swap}", - [(X86cas addr:$ptr, GR16:$swap, 2)]>, TB, OpSize, LOCK; + [(X86cas addr:$ptr, GR16:$swap, 2)], IIC_CMPX_LOCK>, TB, OpSize, LOCK; } let Defs = [EAX, EFLAGS], Uses = [EAX], isCodeGenOnly = 1 in { def LCMPXCHG32 : I<0xB1, MRMDestMem, (outs), (ins i32mem:$ptr, GR32:$swap), "lock\n\t" "cmpxchg{l}\t{$swap, $ptr|$ptr, $swap}", - [(X86cas addr:$ptr, GR32:$swap, 4)]>, TB, LOCK; + [(X86cas addr:$ptr, GR32:$swap, 4)], IIC_CMPX_LOCK>, TB, LOCK; } let Defs = [RAX, EFLAGS], Uses = [RAX], isCodeGenOnly = 1 in { def LCMPXCHG64 : RI<0xB1, MRMDestMem, (outs), (ins i64mem:$ptr, GR64:$swap), "lock\n\t" "cmpxchg{q}\t{$swap, $ptr|$ptr, $swap}", - [(X86cas addr:$ptr, GR64:$swap, 8)]>, TB, LOCK; + [(X86cas addr:$ptr, GR64:$swap, 8)], IIC_CMPX_LOCK>, TB, LOCK; } // Atomic exchange and add @@ -722,22 +785,26 @@ let Constraints = "$val = $dst", Defs = [EFLAGS], isCodeGenOnly = 1 in { def LXADD8 : I<0xC0, MRMSrcMem, (outs GR8:$dst), (ins GR8:$val, i8mem:$ptr), "lock\n\t" "xadd{b}\t{$val, $ptr|$ptr, $val}", - [(set GR8:$dst, (atomic_load_add_8 addr:$ptr, GR8:$val))]>, + [(set GR8:$dst, (atomic_load_add_8 addr:$ptr, GR8:$val))], + IIC_XADD_LOCK_MEM8>, TB, LOCK; def LXADD16 : I<0xC1, MRMSrcMem, (outs GR16:$dst), (ins GR16:$val, i16mem:$ptr), "lock\n\t" "xadd{w}\t{$val, $ptr|$ptr, $val}", - [(set GR16:$dst, (atomic_load_add_16 addr:$ptr, GR16:$val))]>, 
+ [(set GR16:$dst, (atomic_load_add_16 addr:$ptr, GR16:$val))], + IIC_XADD_LOCK_MEM>, TB, OpSize, LOCK; def LXADD32 : I<0xC1, MRMSrcMem, (outs GR32:$dst), (ins GR32:$val, i32mem:$ptr), "lock\n\t" "xadd{l}\t{$val, $ptr|$ptr, $val}", - [(set GR32:$dst, (atomic_load_add_32 addr:$ptr, GR32:$val))]>, + [(set GR32:$dst, (atomic_load_add_32 addr:$ptr, GR32:$val))], + IIC_XADD_LOCK_MEM>, TB, LOCK; def LXADD64 : RI<0xC1, MRMSrcMem, (outs GR64:$dst), (ins GR64:$val,i64mem:$ptr), "lock\n\t" "xadd{q}\t{$val, $ptr|$ptr, $val}", - [(set GR64:$dst, (atomic_load_add_64 addr:$ptr, GR64:$val))]>, + [(set GR64:$dst, (atomic_load_add_64 addr:$ptr, GR64:$val))], + IIC_XADD_LOCK_MEM>, TB, LOCK; } @@ -936,14 +1003,9 @@ def : Pat<(load (i64 (X86Wrapper tglobaltlsaddr :$dst))), // Direct PC relative function call for small code model. 32-bit displacement // sign extended to 64-bit. def : Pat<(X86call (i64 tglobaladdr:$dst)), - (CALL64pcrel32 tglobaladdr:$dst)>, Requires<[NotWin64]>; + (CALL64pcrel32 tglobaladdr:$dst)>; def : Pat<(X86call (i64 texternalsym:$dst)), - (CALL64pcrel32 texternalsym:$dst)>, Requires<[NotWin64]>; - -def : Pat<(X86call (i64 tglobaladdr:$dst)), - (WINCALL64pcrel32 tglobaladdr:$dst)>, Requires<[IsWin64]>; -def : Pat<(X86call (i64 texternalsym:$dst)), - (WINCALL64pcrel32 texternalsym:$dst)>, Requires<[IsWin64]>; + (CALL64pcrel32 texternalsym:$dst)>; // tailcall stuff def : Pat<(X86tcret GR32_TC:$dst, imm:$off), @@ -1105,12 +1167,10 @@ def or_is_add : PatFrag<(ops node:$lhs, node:$rhs), (or node:$lhs, node:$rhs),[{ if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N->getOperand(1))) return CurDAG->MaskedValueIsZero(N->getOperand(0), CN->getAPIntValue()); - unsigned BitWidth = N->getValueType(0).getScalarType().getSizeInBits(); - APInt Mask = APInt::getAllOnesValue(BitWidth); APInt KnownZero0, KnownOne0; - CurDAG->ComputeMaskedBits(N->getOperand(0), Mask, KnownZero0, KnownOne0, 0); + CurDAG->ComputeMaskedBits(N->getOperand(0), KnownZero0, KnownOne0, 0); APInt KnownZero1, KnownOne1; - CurDAG->ComputeMaskedBits(N->getOperand(1), Mask, KnownZero1, KnownOne1, 0); + CurDAG->ComputeMaskedBits(N->getOperand(1), KnownZero1, KnownOne1, 0); return (~KnownZero0 & ~KnownZero1) == 0; }]>; @@ -1440,58 +1500,62 @@ def : Pat<(shl GR16:$src1, (i8 1)), (ADD16rr GR16:$src1, GR16:$src1)>; def : Pat<(shl GR32:$src1, (i8 1)), (ADD32rr GR32:$src1, GR32:$src1)>; def : Pat<(shl GR64:$src1, (i8 1)), (ADD64rr GR64:$src1, GR64:$src1)>; +// Helper imms that check if a mask doesn't change significant shift bits. 
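The helper immediates named in the comment above are defined immediately below. The reasoning they encode is that a 32-bit shift instruction only consumes the low five bits of the count in CL (six bits for 64-bit shifts), so ANDing the count with any immediate whose low five bits are all ones cannot change the result; that is exactly what CountTrailingOnes_32(Imm) >= 5 tests. The stand-alone C++ check of that identity below is illustrative only and not part of the diff.

#include <cassert>
#include <cstdint>

// A 32-bit shift looks only at the low 5 bits of its count, so masking the
// count with 0x1f, 0x3f, 0x7f, 0xff, ... is a no-op; those are precisely the
// immediates with at least 5 trailing ones.  The 64-bit case uses 6 bits.
static uint32_t shl32(uint32_t x, unsigned count) {
  return x << (count & 31);           // models SHL r32, CL masking the count
}

int main() {
  const unsigned masks[] = {0x1f, 0x3f, 0x7f, 0xff};
  for (unsigned c = 0; c < 256; ++c)
    for (unsigned m : masks)
      assert(shl32(0xDEADBEEFu, c & m) == shl32(0xDEADBEEFu, c));
  return 0;
}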
+def immShift32 : ImmLeaf<i8, [{ return CountTrailingOnes_32(Imm) >= 5; }]>; +def immShift64 : ImmLeaf<i8, [{ return CountTrailingOnes_32(Imm) >= 6; }]>; + // (shl x (and y, 31)) ==> (shl x, y) -def : Pat<(shl GR8:$src1, (and CL, 31)), +def : Pat<(shl GR8:$src1, (and CL, immShift32)), (SHL8rCL GR8:$src1)>; -def : Pat<(shl GR16:$src1, (and CL, 31)), +def : Pat<(shl GR16:$src1, (and CL, immShift32)), (SHL16rCL GR16:$src1)>; -def : Pat<(shl GR32:$src1, (and CL, 31)), +def : Pat<(shl GR32:$src1, (and CL, immShift32)), (SHL32rCL GR32:$src1)>; -def : Pat<(store (shl (loadi8 addr:$dst), (and CL, 31)), addr:$dst), +def : Pat<(store (shl (loadi8 addr:$dst), (and CL, immShift32)), addr:$dst), (SHL8mCL addr:$dst)>; -def : Pat<(store (shl (loadi16 addr:$dst), (and CL, 31)), addr:$dst), +def : Pat<(store (shl (loadi16 addr:$dst), (and CL, immShift32)), addr:$dst), (SHL16mCL addr:$dst)>; -def : Pat<(store (shl (loadi32 addr:$dst), (and CL, 31)), addr:$dst), +def : Pat<(store (shl (loadi32 addr:$dst), (and CL, immShift32)), addr:$dst), (SHL32mCL addr:$dst)>; -def : Pat<(srl GR8:$src1, (and CL, 31)), +def : Pat<(srl GR8:$src1, (and CL, immShift32)), (SHR8rCL GR8:$src1)>; -def : Pat<(srl GR16:$src1, (and CL, 31)), +def : Pat<(srl GR16:$src1, (and CL, immShift32)), (SHR16rCL GR16:$src1)>; -def : Pat<(srl GR32:$src1, (and CL, 31)), +def : Pat<(srl GR32:$src1, (and CL, immShift32)), (SHR32rCL GR32:$src1)>; -def : Pat<(store (srl (loadi8 addr:$dst), (and CL, 31)), addr:$dst), +def : Pat<(store (srl (loadi8 addr:$dst), (and CL, immShift32)), addr:$dst), (SHR8mCL addr:$dst)>; -def : Pat<(store (srl (loadi16 addr:$dst), (and CL, 31)), addr:$dst), +def : Pat<(store (srl (loadi16 addr:$dst), (and CL, immShift32)), addr:$dst), (SHR16mCL addr:$dst)>; -def : Pat<(store (srl (loadi32 addr:$dst), (and CL, 31)), addr:$dst), +def : Pat<(store (srl (loadi32 addr:$dst), (and CL, immShift32)), addr:$dst), (SHR32mCL addr:$dst)>; -def : Pat<(sra GR8:$src1, (and CL, 31)), +def : Pat<(sra GR8:$src1, (and CL, immShift32)), (SAR8rCL GR8:$src1)>; -def : Pat<(sra GR16:$src1, (and CL, 31)), +def : Pat<(sra GR16:$src1, (and CL, immShift32)), (SAR16rCL GR16:$src1)>; -def : Pat<(sra GR32:$src1, (and CL, 31)), +def : Pat<(sra GR32:$src1, (and CL, immShift32)), (SAR32rCL GR32:$src1)>; -def : Pat<(store (sra (loadi8 addr:$dst), (and CL, 31)), addr:$dst), +def : Pat<(store (sra (loadi8 addr:$dst), (and CL, immShift32)), addr:$dst), (SAR8mCL addr:$dst)>; -def : Pat<(store (sra (loadi16 addr:$dst), (and CL, 31)), addr:$dst), +def : Pat<(store (sra (loadi16 addr:$dst), (and CL, immShift32)), addr:$dst), (SAR16mCL addr:$dst)>; -def : Pat<(store (sra (loadi32 addr:$dst), (and CL, 31)), addr:$dst), +def : Pat<(store (sra (loadi32 addr:$dst), (and CL, immShift32)), addr:$dst), (SAR32mCL addr:$dst)>; // (shl x (and y, 63)) ==> (shl x, y) -def : Pat<(shl GR64:$src1, (and CL, 63)), +def : Pat<(shl GR64:$src1, (and CL, immShift64)), (SHL64rCL GR64:$src1)>; def : Pat<(store (shl (loadi64 addr:$dst), (and CL, 63)), addr:$dst), (SHL64mCL addr:$dst)>; -def : Pat<(srl GR64:$src1, (and CL, 63)), +def : Pat<(srl GR64:$src1, (and CL, immShift64)), (SHR64rCL GR64:$src1)>; def : Pat<(store (srl (loadi64 addr:$dst), (and CL, 63)), addr:$dst), (SHR64mCL addr:$dst)>; -def : Pat<(sra GR64:$src1, (and CL, 63)), +def : Pat<(sra GR64:$src1, (and CL, immShift64)), (SAR64rCL GR64:$src1)>; def : Pat<(store (sra (loadi64 addr:$dst), (and CL, 63)), addr:$dst), (SAR64mCL addr:$dst)>; @@ -1735,3 +1799,11 @@ def : Pat<(and GR64:$src1, i64immSExt8:$src2), (AND64ri8 GR64:$src1, 
i64immSExt8:$src2)>; def : Pat<(and GR64:$src1, i64immSExt32:$src2), (AND64ri32 GR64:$src1, i64immSExt32:$src2)>; + +// Bit scan instruction patterns to match explicit zero-undef behavior. +def : Pat<(cttz_zero_undef GR16:$src), (BSF16rr GR16:$src)>; +def : Pat<(cttz_zero_undef GR32:$src), (BSF32rr GR32:$src)>; +def : Pat<(cttz_zero_undef GR64:$src), (BSF64rr GR64:$src)>; +def : Pat<(cttz_zero_undef (loadi16 addr:$src)), (BSF16rm addr:$src)>; +def : Pat<(cttz_zero_undef (loadi32 addr:$src)), (BSF32rm addr:$src)>; +def : Pat<(cttz_zero_undef (loadi64 addr:$src)), (BSF64rm addr:$src)>; diff --git a/contrib/llvm/lib/Target/X86/X86InstrControl.td b/contrib/llvm/lib/Target/X86/X86InstrControl.td index c228a0ae..bf11fde 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrControl.td +++ b/contrib/llvm/lib/Target/X86/X86InstrControl.td @@ -1,4 +1,4 @@ -//===- X86InstrControl.td - Control Flow Instructions ------*- tablegen -*-===// +//===-- X86InstrControl.td - Control Flow Instructions -----*- tablegen -*-===// // // The LLVM Compiler Infrastructure // @@ -20,39 +20,47 @@ let isTerminator = 1, isReturn = 1, isBarrier = 1, hasCtrlDep = 1, FPForm = SpecialFP in { def RET : I <0xC3, RawFrm, (outs), (ins variable_ops), "ret", - [(X86retflag 0)]>; + [(X86retflag 0)], IIC_RET>; + def RETW : I <0xC3, RawFrm, (outs), (ins variable_ops), + "ret{w}", + [], IIC_RET>, OpSize; def RETI : Ii16<0xC2, RawFrm, (outs), (ins i16imm:$amt, variable_ops), "ret\t$amt", - [(X86retflag timm:$amt)]>; + [(X86retflag timm:$amt)], IIC_RET_IMM>; def RETIW : Ii16<0xC2, RawFrm, (outs), (ins i16imm:$amt, variable_ops), - "retw\t$amt", - []>, OpSize; + "ret{w}\t$amt", + [], IIC_RET_IMM>, OpSize; def LRETL : I <0xCB, RawFrm, (outs), (ins), - "lretl", []>; + "{l}ret{l|f}", [], IIC_RET>; + def LRETW : I <0xCB, RawFrm, (outs), (ins), + "{l}ret{w|f}", [], IIC_RET>, OpSize; def LRETQ : RI <0xCB, RawFrm, (outs), (ins), - "lretq", []>; + "{l}ret{q|f}", [], IIC_RET>; def LRETI : Ii16<0xCA, RawFrm, (outs), (ins i16imm:$amt), - "lret\t$amt", []>; + "{l}ret{l|f}\t$amt", [], IIC_RET>; def LRETIW : Ii16<0xCA, RawFrm, (outs), (ins i16imm:$amt), - "lretw\t$amt", []>, OpSize; + "{l}ret{w|f}\t$amt", [], IIC_RET>, OpSize; } // Unconditional branches. let isBarrier = 1, isBranch = 1, isTerminator = 1 in { def JMP_4 : Ii32PCRel<0xE9, RawFrm, (outs), (ins brtarget:$dst), - "jmp\t$dst", [(br bb:$dst)]>; + "jmp\t$dst", [(br bb:$dst)], IIC_JMP_REL>; def JMP_1 : Ii8PCRel<0xEB, RawFrm, (outs), (ins brtarget8:$dst), - "jmp\t$dst", []>; + "jmp\t$dst", [], IIC_JMP_REL>; + // FIXME : Intel syntax for JMP64pcrel32 such that it is not ambiguious + // with JMP_1. def JMP64pcrel32 : I<0xE9, RawFrm, (outs), (ins brtarget:$dst), - "jmp{q}\t$dst", []>; + "jmpq\t$dst", [], IIC_JMP_REL>; } // Conditional Branches. let isBranch = 1, isTerminator = 1, Uses = [EFLAGS] in { multiclass ICBr<bits<8> opc1, bits<8> opc4, string asm, PatFrag Cond> { - def _1 : Ii8PCRel <opc1, RawFrm, (outs), (ins brtarget8:$dst), asm, []>; + def _1 : Ii8PCRel <opc1, RawFrm, (outs), (ins brtarget8:$dst), asm, [], + IIC_Jcc>; def _4 : Ii32PCRel<opc4, RawFrm, (outs), (ins brtarget:$dst), asm, - [(X86brcond bb:$dst, Cond, EFLAGS)]>, TB; + [(X86brcond bb:$dst, Cond, EFLAGS)], IIC_Jcc>, TB; } } @@ -74,61 +82,61 @@ defm JLE : ICBr<0x7E, 0x8E, "jle\t$dst", X86_COND_LE>; defm JG : ICBr<0x7F, 0x8F, "jg\t$dst" , X86_COND_G>; // jcx/jecx/jrcx instructions. 
-let isAsmParserOnly = 1, isBranch = 1, isTerminator = 1 in { +let isBranch = 1, isTerminator = 1 in { // These are the 32-bit versions of this instruction for the asmparser. In // 32-bit mode, the address size prefix is jcxz and the unprefixed version is // jecxz. let Uses = [CX] in def JCXZ : Ii8PCRel<0xE3, RawFrm, (outs), (ins brtarget8:$dst), - "jcxz\t$dst", []>, AdSize, Requires<[In32BitMode]>; + "jcxz\t$dst", [], IIC_JCXZ>, AdSize, Requires<[In32BitMode]>; let Uses = [ECX] in def JECXZ_32 : Ii8PCRel<0xE3, RawFrm, (outs), (ins brtarget8:$dst), - "jecxz\t$dst", []>, Requires<[In32BitMode]>; + "jecxz\t$dst", [], IIC_JCXZ>, Requires<[In32BitMode]>; // J*CXZ instruction: 64-bit versions of this instruction for the asmparser. // In 64-bit mode, the address size prefix is jecxz and the unprefixed version // is jrcxz. let Uses = [ECX] in def JECXZ_64 : Ii8PCRel<0xE3, RawFrm, (outs), (ins brtarget8:$dst), - "jecxz\t$dst", []>, AdSize, Requires<[In64BitMode]>; + "jecxz\t$dst", [], IIC_JCXZ>, AdSize, Requires<[In64BitMode]>; let Uses = [RCX] in def JRCXZ : Ii8PCRel<0xE3, RawFrm, (outs), (ins brtarget8:$dst), - "jrcxz\t$dst", []>, Requires<[In64BitMode]>; + "jrcxz\t$dst", [], IIC_JCXZ>, Requires<[In64BitMode]>; } // Indirect branches let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in { def JMP32r : I<0xFF, MRM4r, (outs), (ins GR32:$dst), "jmp{l}\t{*}$dst", - [(brind GR32:$dst)]>, Requires<[In32BitMode]>; + [(brind GR32:$dst)], IIC_JMP_REG>, Requires<[In32BitMode]>; def JMP32m : I<0xFF, MRM4m, (outs), (ins i32mem:$dst), "jmp{l}\t{*}$dst", - [(brind (loadi32 addr:$dst))]>, Requires<[In32BitMode]>; + [(brind (loadi32 addr:$dst))], IIC_JMP_MEM>, Requires<[In32BitMode]>; def JMP64r : I<0xFF, MRM4r, (outs), (ins GR64:$dst), "jmp{q}\t{*}$dst", - [(brind GR64:$dst)]>, Requires<[In64BitMode]>; + [(brind GR64:$dst)], IIC_JMP_REG>, Requires<[In64BitMode]>; def JMP64m : I<0xFF, MRM4m, (outs), (ins i64mem:$dst), "jmp{q}\t{*}$dst", - [(brind (loadi64 addr:$dst))]>, Requires<[In64BitMode]>; + [(brind (loadi64 addr:$dst))], IIC_JMP_MEM>, Requires<[In64BitMode]>; def FARJMP16i : Iseg16<0xEA, RawFrmImm16, (outs), (ins i16imm:$off, i16imm:$seg), - "ljmp{w}\t{$seg, $off|$off, $seg}", []>, OpSize; + "ljmp{w}\t{$seg, $off|$off, $seg}", [], IIC_JMP_FAR_PTR>, OpSize; def FARJMP32i : Iseg32<0xEA, RawFrmImm16, (outs), (ins i32imm:$off, i16imm:$seg), - "ljmp{l}\t{$seg, $off|$off, $seg}", []>; + "ljmp{l}\t{$seg, $off|$off, $seg}", [], IIC_JMP_FAR_PTR>; def FARJMP64 : RI<0xFF, MRM5m, (outs), (ins opaque80mem:$dst), - "ljmp{q}\t{*}$dst", []>; + "ljmp{q}\t{*}$dst", [], IIC_JMP_FAR_MEM>; def FARJMP16m : I<0xFF, MRM5m, (outs), (ins opaque32mem:$dst), - "ljmp{w}\t{*}$dst", []>, OpSize; + "ljmp{w}\t{*}$dst", [], IIC_JMP_FAR_MEM>, OpSize; def FARJMP32m : I<0xFF, MRM5m, (outs), (ins opaque48mem:$dst), - "ljmp{l}\t{*}$dst", []>; + "ljmp{l}\t{*}$dst", [], IIC_JMP_FAR_MEM>; } // Loop instructions -def LOOP : Ii8PCRel<0xE2, RawFrm, (outs), (ins brtarget8:$dst), "loop\t$dst", []>; -def LOOPE : Ii8PCRel<0xE1, RawFrm, (outs), (ins brtarget8:$dst), "loope\t$dst", []>; -def LOOPNE : Ii8PCRel<0xE0, RawFrm, (outs), (ins brtarget8:$dst), "loopne\t$dst", []>; +def LOOP : Ii8PCRel<0xE2, RawFrm, (outs), (ins brtarget8:$dst), "loop\t$dst", [], IIC_LOOP>; +def LOOPE : Ii8PCRel<0xE1, RawFrm, (outs), (ins brtarget8:$dst), "loope\t$dst", [], IIC_LOOPE>; +def LOOPNE : Ii8PCRel<0xE0, RawFrm, (outs), (ins brtarget8:$dst), "loopne\t$dst", [], IIC_LOOPNE>; 
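As a reference for the jcxz/jecxz/jrcxz and loop definitions above: opcode 0xE3 branches when the count register is zero, and which register it examines depends on the effective address size, which is why the same opcode byte is paired with an AdSize prefix and mode predicates; 0xE2 (loop) decrements the count register and branches while it is non-zero. The small C++ model below is illustrative only and not part of the diff.

#include <cstdint>

// 0xE3 family: branch taken when the count register is zero.
static bool jcxz_taken (uint16_t cx)  { return cx  == 0; }
static bool jecxz_taken(uint32_t ecx) { return ecx == 0; }
static bool jrcxz_taken(uint64_t rcx) { return rcx == 0; }

// 0xE2 (loop): decrement the count register, branch back while non-zero.
static bool loop_taken(uint64_t &rcx) { return --rcx != 0; }

int main() {
  uint64_t rcx = 3;
  int iterations = 1;                  // the body runs once before 'loop'
  while (loop_taken(rcx))
    ++iterations;
  return (iterations == 3 && jecxz_taken(0) &&
          !jcxz_taken(7) && !jrcxz_taken(1)) ? 0 : 1;
}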
//===----------------------------------------------------------------------===// // Call Instructions... @@ -138,32 +146,30 @@ let isCall = 1 in // a use to prevent stack-pointer assignments that appear immediately // before calls from potentially appearing dead. Uses for argument // registers are added manually. - let Defs = [EAX, ECX, EDX, FP0, FP1, FP2, FP3, FP4, FP5, FP6, ST0, - MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7, - XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7, - XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS], - Uses = [ESP] in { + let Uses = [ESP] in { def CALLpcrel32 : Ii32PCRel<0xE8, RawFrm, (outs), (ins i32imm_pcrel:$dst,variable_ops), - "call{l}\t$dst", []>, Requires<[In32BitMode]>; + "call{l}\t$dst", [], IIC_CALL_RI>, Requires<[In32BitMode]>; def CALL32r : I<0xFF, MRM2r, (outs), (ins GR32:$dst, variable_ops), - "call{l}\t{*}$dst", [(X86call GR32:$dst)]>, + "call{l}\t{*}$dst", [(X86call GR32:$dst)], IIC_CALL_RI>, Requires<[In32BitMode]>; def CALL32m : I<0xFF, MRM2m, (outs), (ins i32mem:$dst, variable_ops), - "call{l}\t{*}$dst", [(X86call (loadi32 addr:$dst))]>, + "call{l}\t{*}$dst", [(X86call (loadi32 addr:$dst))], IIC_CALL_MEM>, Requires<[In32BitMode]>; def FARCALL16i : Iseg16<0x9A, RawFrmImm16, (outs), (ins i16imm:$off, i16imm:$seg), - "lcall{w}\t{$seg, $off|$off, $seg}", []>, OpSize; + "lcall{w}\t{$seg, $off|$off, $seg}", [], + IIC_CALL_FAR_PTR>, OpSize; def FARCALL32i : Iseg32<0x9A, RawFrmImm16, (outs), (ins i32imm:$off, i16imm:$seg), - "lcall{l}\t{$seg, $off|$off, $seg}", []>; + "lcall{l}\t{$seg, $off|$off, $seg}", [], + IIC_CALL_FAR_PTR>; def FARCALL16m : I<0xFF, MRM3m, (outs), (ins opaque32mem:$dst), - "lcall{w}\t{*}$dst", []>, OpSize; + "lcall{w}\t{*}$dst", [], IIC_CALL_FAR_MEM>, OpSize; def FARCALL32m : I<0xFF, MRM3m, (outs), (ins opaque48mem:$dst), - "lcall{l}\t{*}$dst", []>; + "lcall{l}\t{*}$dst", [], IIC_CALL_FAR_MEM>; // callw for 16 bit code for the assembler. let isAsmParserOnly = 1 in @@ -177,11 +183,7 @@ let isCall = 1 in let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, isCodeGenOnly = 1 in - let Defs = [EAX, ECX, EDX, FP0, FP1, FP2, FP3, FP4, FP5, FP6, ST0, - MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7, - XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7, - XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS], - Uses = [ESP] in { + let Uses = [ESP] in { def TCRETURNdi : PseudoI<(outs), (ins i32imm_pcrel:$dst, i32imm:$offset, variable_ops), []>; def TCRETURNri : PseudoI<(outs), @@ -194,74 +196,43 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, // mcinst. def TAILJMPd : Ii32PCRel<0xE9, RawFrm, (outs), (ins i32imm_pcrel:$dst, variable_ops), - "jmp\t$dst # TAILCALL", - []>; + "jmp\t$dst # TAILCALL", + [], IIC_JMP_REL>; def TAILJMPr : I<0xFF, MRM4r, (outs), (ins GR32_TC:$dst, variable_ops), - "", []>; // FIXME: Remove encoding when JIT is dead. + "", [], IIC_JMP_REG>; // FIXME: Remove encoding when JIT is dead. let mayLoad = 1 in def TAILJMPm : I<0xFF, MRM4m, (outs), (ins i32mem_TC:$dst, variable_ops), - "jmp{l}\t{*}$dst # TAILCALL", []>; + "jmp{l}\t{*}$dst # TAILCALL", [], IIC_JMP_MEM>; } //===----------------------------------------------------------------------===// // Call Instructions... // -let isCall = 1 in - // All calls clobber the non-callee saved registers. RSP is marked as - // a use to prevent stack-pointer assignments that appear immediately - // before calls from potentially appearing dead. Uses for argument - // registers are added manually. 
- let Defs = [RAX, RCX, RDX, RSI, RDI, R8, R9, R10, R11, - FP0, FP1, FP2, FP3, FP4, FP5, FP6, ST0, ST1, - MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7, - XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7, - XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS], - Uses = [RSP] in { - - // NOTE: this pattern doesn't match "X86call imm", because we do not know - // that the offset between an arbitrary immediate and the call will fit in - // the 32-bit pcrel field that we have. - def CALL64pcrel32 : Ii32PCRel<0xE8, RawFrm, - (outs), (ins i64i32imm_pcrel:$dst, variable_ops), - "call{q}\t$dst", []>, - Requires<[In64BitMode, NotWin64]>; - def CALL64r : I<0xFF, MRM2r, (outs), (ins GR64:$dst, variable_ops), - "call{q}\t{*}$dst", [(X86call GR64:$dst)]>, - Requires<[In64BitMode, NotWin64]>; - def CALL64m : I<0xFF, MRM2m, (outs), (ins i64mem:$dst, variable_ops), - "call{q}\t{*}$dst", [(X86call (loadi64 addr:$dst))]>, - Requires<[In64BitMode, NotWin64]>; - def FARCALL64 : RI<0xFF, MRM3m, (outs), (ins opaque80mem:$dst), - "lcall{q}\t{*}$dst", []>; - } +// RSP is marked as a use to prevent stack-pointer assignments that appear +// immediately before calls from potentially appearing dead. Uses for argument +// registers are added manually. +let isCall = 1, Uses = [RSP] in { + // NOTE: this pattern doesn't match "X86call imm", because we do not know + // that the offset between an arbitrary immediate and the call will fit in + // the 32-bit pcrel field that we have. + def CALL64pcrel32 : Ii32PCRel<0xE8, RawFrm, + (outs), (ins i64i32imm_pcrel:$dst, variable_ops), + "call{q}\t$dst", [], IIC_CALL_RI>, + Requires<[In64BitMode]>; + def CALL64r : I<0xFF, MRM2r, (outs), (ins GR64:$dst, variable_ops), + "call{q}\t{*}$dst", [(X86call GR64:$dst)], + IIC_CALL_RI>, + Requires<[In64BitMode]>; + def CALL64m : I<0xFF, MRM2m, (outs), (ins i64mem:$dst, variable_ops), + "call{q}\t{*}$dst", [(X86call (loadi64 addr:$dst))], + IIC_CALL_MEM>, + Requires<[In64BitMode]>; - // FIXME: We need to teach codegen about single list of call-clobbered - // registers. -let isCall = 1, isCodeGenOnly = 1 in - // All calls clobber the non-callee saved registers. RSP is marked as - // a use to prevent stack-pointer assignments that appear immediately - // before calls from potentially appearing dead. Uses for argument - // registers are added manually. - let Defs = [RAX, RCX, RDX, R8, R9, R10, R11, - FP0, FP1, FP2, FP3, FP4, FP5, FP6, ST0, ST1, - MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7, - XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, EFLAGS], - Uses = [RSP] in { - def WINCALL64pcrel32 : Ii32PCRel<0xE8, RawFrm, - (outs), (ins i64i32imm_pcrel:$dst, variable_ops), - "call{q}\t$dst", []>, - Requires<[IsWin64]>; - def WINCALL64r : I<0xFF, MRM2r, (outs), (ins GR64:$dst, variable_ops), - "call{q}\t{*}$dst", - [(X86call GR64:$dst)]>, Requires<[IsWin64]>; - def WINCALL64m : I<0xFF, MRM2m, (outs), - (ins i64mem:$dst,variable_ops), - "call{q}\t{*}$dst", - [(X86call (loadi64 addr:$dst))]>, - Requires<[IsWin64]>; - } + def FARCALL64 : RI<0xFF, MRM3m, (outs), (ins opaque80mem:$dst), + "lcall{q}\t{*}$dst", [], IIC_CALL_FAR_MEM>; +} let isCall = 1, isCodeGenOnly = 1 in // __chkstk(MSVC): clobber R10, R11 and EFLAGS. 
@@ -270,18 +241,13 @@ let isCall = 1, isCodeGenOnly = 1 in Uses = [RSP] in { def W64ALLOCA : Ii32PCRel<0xE8, RawFrm, (outs), (ins i64i32imm_pcrel:$dst, variable_ops), - "call{q}\t$dst", []>, + "call{q}\t$dst", [], IIC_CALL_RI>, Requires<[IsWin64]>; } let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, isCodeGenOnly = 1 in - // AMD64 cc clobbers RSI, RDI, XMM6-XMM15. - let Defs = [RAX, RCX, RDX, R8, R9, R10, R11, - FP0, FP1, FP2, FP3, FP4, FP5, FP6, ST0, ST1, - MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7, - XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, EFLAGS], - Uses = [RSP], + let Uses = [RSP], usesCustomInserter = 1 in { def TCRETURNdi64 : PseudoI<(outs), (ins i64i32imm_pcrel:$dst, i32imm:$offset, variable_ops), @@ -294,11 +260,11 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, def TAILJMPd64 : Ii32PCRel<0xE9, RawFrm, (outs), (ins i64i32imm_pcrel:$dst, variable_ops), - "jmp\t$dst # TAILCALL", []>; + "jmp\t$dst # TAILCALL", [], IIC_JMP_REL>; def TAILJMPr64 : I<0xFF, MRM4r, (outs), (ins ptr_rc_tailcall:$dst, variable_ops), - "jmp{q}\t{*}$dst # TAILCALL", []>; + "jmp{q}\t{*}$dst # TAILCALL", [], IIC_JMP_MEM>; let mayLoad = 1 in def TAILJMPm64 : I<0xFF, MRM4m, (outs), (ins i64mem_TC:$dst, variable_ops), - "jmp{q}\t{*}$dst # TAILCALL", []>; + "jmp{q}\t{*}$dst # TAILCALL", [], IIC_JMP_MEM>; } diff --git a/contrib/llvm/lib/Target/X86/X86InstrExtension.td b/contrib/llvm/lib/Target/X86/X86InstrExtension.td index e62e6b7..0d5490a 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrExtension.td +++ b/contrib/llvm/lib/Target/X86/X86InstrExtension.td @@ -1,10 +1,10 @@ -//===- X86InstrExtension.td - Sign and Zero Extensions -----*- tablegen -*-===// -// +//===-- X86InstrExtension.td - Sign and Zero Extensions ----*- tablegen -*-===// +// // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. -// +// //===----------------------------------------------------------------------===// // // This file describes the sign and zero extension operations. 
@@ -37,40 +37,47 @@ let neverHasSideEffects = 1 in { } + // Sign/Zero extenders def MOVSX16rr8 : I<0xBE, MRMSrcReg, (outs GR16:$dst), (ins GR8:$src), - "movs{bw|x}\t{$src, $dst|$dst, $src}", []>, TB, OpSize; + "movs{bw|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVSX_R16_R8>, + TB, OpSize; def MOVSX16rm8 : I<0xBE, MRMSrcMem, (outs GR16:$dst), (ins i8mem:$src), - "movs{bw|x}\t{$src, $dst|$dst, $src}", []>, TB, OpSize; + "movs{bw|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVSX_R16_M8>, + TB, OpSize; def MOVSX32rr8 : I<0xBE, MRMSrcReg, (outs GR32:$dst), (ins GR8:$src), "movs{bl|x}\t{$src, $dst|$dst, $src}", - [(set GR32:$dst, (sext GR8:$src))]>, TB; + [(set GR32:$dst, (sext GR8:$src))], IIC_MOVSX>, TB; def MOVSX32rm8 : I<0xBE, MRMSrcMem, (outs GR32:$dst), (ins i8mem :$src), "movs{bl|x}\t{$src, $dst|$dst, $src}", - [(set GR32:$dst, (sextloadi32i8 addr:$src))]>, TB; + [(set GR32:$dst, (sextloadi32i8 addr:$src))], IIC_MOVSX>, TB; def MOVSX32rr16: I<0xBF, MRMSrcReg, (outs GR32:$dst), (ins GR16:$src), "movs{wl|x}\t{$src, $dst|$dst, $src}", - [(set GR32:$dst, (sext GR16:$src))]>, TB; + [(set GR32:$dst, (sext GR16:$src))], IIC_MOVSX>, TB; def MOVSX32rm16: I<0xBF, MRMSrcMem, (outs GR32:$dst), (ins i16mem:$src), "movs{wl|x}\t{$src, $dst|$dst, $src}", - [(set GR32:$dst, (sextloadi32i16 addr:$src))]>, TB; + [(set GR32:$dst, (sextloadi32i16 addr:$src))], IIC_MOVSX>, + TB; def MOVZX16rr8 : I<0xB6, MRMSrcReg, (outs GR16:$dst), (ins GR8:$src), - "movz{bw|x}\t{$src, $dst|$dst, $src}", []>, TB, OpSize; + "movz{bw|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX_R16_R8>, + TB, OpSize; def MOVZX16rm8 : I<0xB6, MRMSrcMem, (outs GR16:$dst), (ins i8mem:$src), - "movz{bw|x}\t{$src, $dst|$dst, $src}", []>, TB, OpSize; + "movz{bw|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX_R16_M8>, + TB, OpSize; def MOVZX32rr8 : I<0xB6, MRMSrcReg, (outs GR32:$dst), (ins GR8 :$src), "movz{bl|x}\t{$src, $dst|$dst, $src}", - [(set GR32:$dst, (zext GR8:$src))]>, TB; + [(set GR32:$dst, (zext GR8:$src))], IIC_MOVZX>, TB; def MOVZX32rm8 : I<0xB6, MRMSrcMem, (outs GR32:$dst), (ins i8mem :$src), "movz{bl|x}\t{$src, $dst|$dst, $src}", - [(set GR32:$dst, (zextloadi32i8 addr:$src))]>, TB; + [(set GR32:$dst, (zextloadi32i8 addr:$src))], IIC_MOVZX>, TB; def MOVZX32rr16: I<0xB7, MRMSrcReg, (outs GR32:$dst), (ins GR16:$src), "movz{wl|x}\t{$src, $dst|$dst, $src}", - [(set GR32:$dst, (zext GR16:$src))]>, TB; + [(set GR32:$dst, (zext GR16:$src))], IIC_MOVZX>, TB; def MOVZX32rm16: I<0xB7, MRMSrcMem, (outs GR32:$dst), (ins i16mem:$src), "movz{wl|x}\t{$src, $dst|$dst, $src}", - [(set GR32:$dst, (zextloadi32i16 addr:$src))]>, TB; + [(set GR32:$dst, (zextloadi32i16 addr:$src))], IIC_MOVZX>, + TB; // These are the same as the regular MOVZX32rr8 and MOVZX32rm8 // except that they use GR32_NOREX for the output operand register class @@ -78,12 +85,12 @@ def MOVZX32rm16: I<0xB7, MRMSrcMem, (outs GR32:$dst), (ins i16mem:$src), def MOVZX32_NOREXrr8 : I<0xB6, MRMSrcReg, (outs GR32_NOREX:$dst), (ins GR8_NOREX:$src), "movz{bl|x}\t{$src, $dst|$dst, $src}", - []>, TB; + [], IIC_MOVZX>, TB; let mayLoad = 1 in def MOVZX32_NOREXrm8 : I<0xB6, MRMSrcMem, (outs GR32_NOREX:$dst), (ins i8mem_NOREX:$src), "movz{bl|x}\t{$src, $dst|$dst, $src}", - []>, TB; + [], IIC_MOVZX>, TB; // MOVSX64rr8 always has a REX prefix and it has an 8-bit register // operand, which makes it a rare instruction with an 8-bit register @@ -91,32 +98,38 @@ def MOVZX32_NOREXrm8 : I<0xB6, MRMSrcMem, // were generalized, this would require a special register class. 
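Before the 64-bit forms that follow, it may help to spell out the three behaviours these definitions encode: movsx replicates the sign bit, movzx fills with zeros, and in 64-bit mode any write to a 32-bit register already clears bits 63..32, which is why the comment further down prefers a plain 32-bit mov over a hypothetical movzlq. The minimal C++ model below is illustrative only and not part of the diff.

#include <cstdint>

// movsx widens by replicating the sign bit; movzx widens with zeros.
static int32_t  movsx32_8(int8_t  s) { return s; }        // MOVSX32rr8
static uint32_t movzx32_8(uint8_t s) { return s; }        // MOVZX32rr8
// In 64-bit mode a 32-bit register write zero-extends implicitly, so a
// plain 32-bit mov gives the behaviour MOVZX64rr32 models.
static uint64_t mov_zext64_32(uint32_t s) { return s; }

int main() {
  return (movsx32_8(-1) == -1 &&                  // 0xFF -> 0xFFFFFFFF
          movzx32_8(0xFF) == 0xFF &&              // 0xFF -> 0x000000FF
          mov_zext64_32(0x80000000u) == 0x80000000ULL) ? 0 : 1;
}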
def MOVSX64rr8 : RI<0xBE, MRMSrcReg, (outs GR64:$dst), (ins GR8 :$src), "movs{bq|x}\t{$src, $dst|$dst, $src}", - [(set GR64:$dst, (sext GR8:$src))]>, TB; + [(set GR64:$dst, (sext GR8:$src))], IIC_MOVSX>, TB; def MOVSX64rm8 : RI<0xBE, MRMSrcMem, (outs GR64:$dst), (ins i8mem :$src), "movs{bq|x}\t{$src, $dst|$dst, $src}", - [(set GR64:$dst, (sextloadi64i8 addr:$src))]>, TB; + [(set GR64:$dst, (sextloadi64i8 addr:$src))], IIC_MOVSX>, + TB; def MOVSX64rr16: RI<0xBF, MRMSrcReg, (outs GR64:$dst), (ins GR16:$src), "movs{wq|x}\t{$src, $dst|$dst, $src}", - [(set GR64:$dst, (sext GR16:$src))]>, TB; + [(set GR64:$dst, (sext GR16:$src))], IIC_MOVSX>, TB; def MOVSX64rm16: RI<0xBF, MRMSrcMem, (outs GR64:$dst), (ins i16mem:$src), "movs{wq|x}\t{$src, $dst|$dst, $src}", - [(set GR64:$dst, (sextloadi64i16 addr:$src))]>, TB; + [(set GR64:$dst, (sextloadi64i16 addr:$src))], IIC_MOVSX>, + TB; def MOVSX64rr32: RI<0x63, MRMSrcReg, (outs GR64:$dst), (ins GR32:$src), "movs{lq|xd}\t{$src, $dst|$dst, $src}", - [(set GR64:$dst, (sext GR32:$src))]>; + [(set GR64:$dst, (sext GR32:$src))], IIC_MOVSX>; def MOVSX64rm32: RI<0x63, MRMSrcMem, (outs GR64:$dst), (ins i32mem:$src), "movs{lq|xd}\t{$src, $dst|$dst, $src}", - [(set GR64:$dst, (sextloadi64i32 addr:$src))]>; + [(set GR64:$dst, (sextloadi64i32 addr:$src))], IIC_MOVSX>; // movzbq and movzwq encodings for the disassembler def MOVZX64rr8_Q : RI<0xB6, MRMSrcReg, (outs GR64:$dst), (ins GR8:$src), - "movz{bq|x}\t{$src, $dst|$dst, $src}", []>, TB; + "movz{bq|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX>, + TB; def MOVZX64rm8_Q : RI<0xB6, MRMSrcMem, (outs GR64:$dst), (ins i8mem:$src), - "movz{bq|x}\t{$src, $dst|$dst, $src}", []>, TB; + "movz{bq|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX>, + TB; def MOVZX64rr16_Q : RI<0xB7, MRMSrcReg, (outs GR64:$dst), (ins GR16:$src), - "movz{wq|x}\t{$src, $dst|$dst, $src}", []>, TB; + "movz{wq|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX>, + TB; def MOVZX64rm16_Q : RI<0xB7, MRMSrcMem, (outs GR64:$dst), (ins i16mem:$src), - "movz{wq|x}\t{$src, $dst|$dst, $src}", []>, TB; + "movz{wq|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX>, + TB; // FIXME: These should be Pat patterns. let isCodeGenOnly = 1 in { @@ -124,15 +137,17 @@ let isCodeGenOnly = 1 in { // Use movzbl instead of movzbq when the destination is a register; it's // equivalent due to implicit zero-extending, and it has a smaller encoding. def MOVZX64rr8 : I<0xB6, MRMSrcReg, (outs GR64:$dst), (ins GR8 :$src), - "", [(set GR64:$dst, (zext GR8:$src))]>, TB; + "", [(set GR64:$dst, (zext GR8:$src))], IIC_MOVZX>, TB; def MOVZX64rm8 : I<0xB6, MRMSrcMem, (outs GR64:$dst), (ins i8mem :$src), - "", [(set GR64:$dst, (zextloadi64i8 addr:$src))]>, TB; + "", [(set GR64:$dst, (zextloadi64i8 addr:$src))], IIC_MOVZX>, + TB; // Use movzwl instead of movzwq when the destination is a register; it's // equivalent due to implicit zero-extending, and it has a smaller encoding. def MOVZX64rr16: I<0xB7, MRMSrcReg, (outs GR64:$dst), (ins GR16:$src), - "", [(set GR64:$dst, (zext GR16:$src))]>, TB; + "", [(set GR64:$dst, (zext GR16:$src))], IIC_MOVZX>, TB; def MOVZX64rm16: I<0xB7, MRMSrcMem, (outs GR64:$dst), (ins i16mem:$src), - "", [(set GR64:$dst, (zextloadi64i16 addr:$src))]>, TB; + "", [(set GR64:$dst, (zextloadi64i16 addr:$src))], + IIC_MOVZX>, TB; // There's no movzlq instruction, but movl can be used for this purpose, using // implicit zero-extension. 
The preferred way to do 32-bit-to-64-bit zero @@ -142,10 +157,9 @@ def MOVZX64rm16: I<0xB7, MRMSrcMem, (outs GR64:$dst), (ins i16mem:$src), // necessarily all zero. In such cases, we fall back to these explicit zext // instructions. def MOVZX64rr32 : I<0x89, MRMDestReg, (outs GR64:$dst), (ins GR32:$src), - "", [(set GR64:$dst, (zext GR32:$src))]>; + "", [(set GR64:$dst, (zext GR32:$src))], IIC_MOVZX>; def MOVZX64rm32 : I<0x8B, MRMSrcMem, (outs GR64:$dst), (ins i32mem:$src), - "", [(set GR64:$dst, (zextloadi64i32 addr:$src))]>; - - + "", [(set GR64:$dst, (zextloadi64i32 addr:$src))], + IIC_MOVZX>; } diff --git a/contrib/llvm/lib/Target/X86/X86InstrFMA.td b/contrib/llvm/lib/Target/X86/X86InstrFMA.td index d868773..d57937b 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrFMA.td +++ b/contrib/llvm/lib/Target/X86/X86InstrFMA.td @@ -1,4 +1,4 @@ -//====- X86InstrFMA.td - Describe the X86 Instruction Set --*- tablegen -*-===// +//===-- X86InstrFMA.td - FMA Instruction Set ---------------*- tablegen -*-===// // // The LLVM Compiler Infrastructure // @@ -15,7 +15,7 @@ // FMA3 - Intel 3 operand Fused Multiply-Add instructions //===----------------------------------------------------------------------===// -multiclass fma_rm<bits<8> opc, string OpcodeStr> { +multiclass fma3p_rm<bits<8> opc, string OpcodeStr> { def r : FMA3<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), @@ -34,27 +34,187 @@ multiclass fma_rm<bits<8> opc, string OpcodeStr> { []>; } -multiclass fma_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231, - string OpcodeStr, string PackTy> { - defm r132 : fma_rm<opc132, !strconcat(OpcodeStr, !strconcat("132", PackTy))>; - defm r213 : fma_rm<opc213, !strconcat(OpcodeStr, !strconcat("213", PackTy))>; - defm r231 : fma_rm<opc231, !strconcat(OpcodeStr, !strconcat("231", PackTy))>; +multiclass fma3p_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231, + string OpcodeStr, string PackTy> { + defm r132 : fma3p_rm<opc132, !strconcat(OpcodeStr, !strconcat("132", PackTy))>; + defm r213 : fma3p_rm<opc213, !strconcat(OpcodeStr, !strconcat("213", PackTy))>; + defm r231 : fma3p_rm<opc231, !strconcat(OpcodeStr, !strconcat("231", PackTy))>; } -let isAsmParserOnly = 1 in { - // Fused Multiply-Add - defm VFMADDPS : fma_forms<0x98, 0xA8, 0xB8, "vfmadd", "ps">; - defm VFMADDPD : fma_forms<0x98, 0xA8, 0xB8, "vfmadd", "pd">, VEX_W; - defm VFMADDSUBPS : fma_forms<0x96, 0xA6, 0xB6, "vfmaddsub", "ps">; - defm VFMADDSUBPD : fma_forms<0x96, 0xA6, 0xB6, "vfmaddsub", "pd">, VEX_W; - defm VFMSUBADDPS : fma_forms<0x97, 0xA7, 0xB7, "vfmsubadd", "ps">; - defm VFMSUBADDPD : fma_forms<0x97, 0xA7, 0xB7, "vfmsubadd", "pd">, VEX_W; - defm VFMSUBPS : fma_forms<0x9A, 0xAA, 0xBA, "vfmsub", "ps">; - defm VFMSUBPD : fma_forms<0x9A, 0xAA, 0xBA, "vfmsub", "pd">, VEX_W; +// Fused Multiply-Add +let ExeDomain = SSEPackedSingle in { + defm VFMADDPS : fma3p_forms<0x98, 0xA8, 0xB8, "vfmadd", "ps">; + defm VFMSUBPS : fma3p_forms<0x9A, 0xAA, 0xBA, "vfmsub", "ps">; + defm VFMADDSUBPS : fma3p_forms<0x96, 0xA6, 0xB6, "vfmaddsub", "ps">; + defm VFMSUBADDPS : fma3p_forms<0x97, 0xA7, 0xB7, "vfmsubadd", "ps">; +} + +let ExeDomain = SSEPackedDouble in { + defm VFMADDPD : fma3p_forms<0x98, 0xA8, 0xB8, "vfmadd", "pd">, VEX_W; + defm VFMSUBPD : fma3p_forms<0x9A, 0xAA, 0xBA, "vfmsub", "pd">, VEX_W; + defm VFMADDSUBPD : fma3p_forms<0x96, 0xA6, 0xB6, "vfmaddsub", "pd">, VEX_W; + defm VFMSUBADDPD : fma3p_forms<0x97, 0xA7, 0xB7, "vfmsubadd", "pd">, VEX_W; +} + +// Fused 
Negative Multiply-Add +let ExeDomain = SSEPackedSingle in { + defm VFNMADDPS : fma3p_forms<0x9C, 0xAC, 0xBC, "vfnmadd", "ps">; + defm VFNMSUBPS : fma3p_forms<0x9E, 0xAE, 0xBE, "vfnmsub", "ps">; +} +let ExeDomain = SSEPackedDouble in { + defm VFNMADDPD : fma3p_forms<0x9C, 0xAC, 0xBC, "vfnmadd", "pd">, VEX_W; + defm VFNMSUBPD : fma3p_forms<0x9E, 0xAE, 0xBE, "vfnmsub", "pd">, VEX_W; +} + +multiclass fma3s_rm<bits<8> opc, string OpcodeStr, X86MemOperand x86memop> { + def r : FMA3<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + []>; + def m : FMA3<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, x86memop:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + []>; +} - // Fused Negative Multiply-Add - defm VFNMADDPS : fma_forms<0x9C, 0xAC, 0xBC, "vfnmadd", "ps">; - defm VFNMADDPD : fma_forms<0x9C, 0xAC, 0xBC, "vfnmadd", "pd">, VEX_W; - defm VFNMSUBPS : fma_forms<0x9E, 0xAE, 0xBE, "vfnmsub", "ps">; - defm VFNMSUBPD : fma_forms<0x9E, 0xAE, 0xBE, "vfnmsub", "pd">, VEX_W; +multiclass fma3s_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231, + string OpcodeStr> { + defm SSr132 : fma3s_rm<opc132, !strconcat(OpcodeStr, "132ss"), f32mem>; + defm SSr213 : fma3s_rm<opc213, !strconcat(OpcodeStr, "213ss"), f32mem>; + defm SSr231 : fma3s_rm<opc231, !strconcat(OpcodeStr, "231ss"), f32mem>; + defm SDr132 : fma3s_rm<opc132, !strconcat(OpcodeStr, "132sd"), f64mem>, VEX_W; + defm SDr213 : fma3s_rm<opc213, !strconcat(OpcodeStr, "213sd"), f64mem>, VEX_W; + defm SDr231 : fma3s_rm<opc231, !strconcat(OpcodeStr, "231sd"), f64mem>, VEX_W; } + +defm VFMADD : fma3s_forms<0x99, 0xA9, 0xB9, "vfmadd">, VEX_LIG; +defm VFMSUB : fma3s_forms<0x9B, 0xAB, 0xBB, "vfmsub">, VEX_LIG; + +defm VFNMADD : fma3s_forms<0x9D, 0xAD, 0xBD, "vfnmadd">, VEX_LIG; +defm VFNMSUB : fma3s_forms<0x9F, 0xAF, 0xBF, "vfnmsub">, VEX_LIG; + +//===----------------------------------------------------------------------===// +// FMA4 - AMD 4 operand Fused Multiply-Add instructions +//===----------------------------------------------------------------------===// + + +multiclass fma4s<bits<8> opc, string OpcodeStr, Operand memop, + ComplexPattern mem_cpat, Intrinsic Int> { + def rr : FMA4<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, VR128:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set VR128:$dst, + (Int VR128:$src1, VR128:$src2, VR128:$src3))]>, VEX_W, MemOp4; + def rm : FMA4<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, memop:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set VR128:$dst, + (Int VR128:$src1, VR128:$src2, mem_cpat:$src3))]>, VEX_W, MemOp4; + def mr : FMA4<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, memop:$src2, VR128:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set VR128:$dst, + (Int VR128:$src1, mem_cpat:$src2, VR128:$src3))]>; +// For disassembler +let isCodeGenOnly = 1 in + def rr_REV : FMA4<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, VR128:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), []>; +} + +multiclass fma4p<bits<8> opc, string OpcodeStr, + Intrinsic Int128, Intrinsic Int256, + PatFrag ld_frag128, PatFrag ld_frag256> { + def rr : FMA4<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, VR128:$src3), + 
!strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set VR128:$dst, + (Int128 VR128:$src1, VR128:$src2, VR128:$src3))]>, VEX_W, MemOp4; + def rm : FMA4<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, f128mem:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set VR128:$dst, (Int128 VR128:$src1, VR128:$src2, + (ld_frag128 addr:$src3)))]>, VEX_W, MemOp4; + def mr : FMA4<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, f128mem:$src2, VR128:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set VR128:$dst, + (Int128 VR128:$src1, (ld_frag128 addr:$src2), VR128:$src3))]>; + def rrY : FMA4<opc, MRMSrcReg, (outs VR256:$dst), + (ins VR256:$src1, VR256:$src2, VR256:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set VR256:$dst, + (Int256 VR256:$src1, VR256:$src2, VR256:$src3))]>, VEX_W, MemOp4; + def rmY : FMA4<opc, MRMSrcMem, (outs VR256:$dst), + (ins VR256:$src1, VR256:$src2, f256mem:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set VR256:$dst, (Int256 VR256:$src1, VR256:$src2, + (ld_frag256 addr:$src3)))]>, VEX_W, MemOp4; + def mrY : FMA4<opc, MRMSrcMem, (outs VR256:$dst), + (ins VR256:$src1, f256mem:$src2, VR256:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set VR256:$dst, + (Int256 VR256:$src1, (ld_frag256 addr:$src2), VR256:$src3))]>; +// For disassembler +let isCodeGenOnly = 1 in { + def rr_REV : FMA4<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, VR128:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), []>; + def rrY_REV : FMA4<opc, MRMSrcReg, (outs VR256:$dst), + (ins VR256:$src1, VR256:$src2, VR256:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), []>; +} // isCodeGenOnly = 1 +} + +defm VFMADDSS4 : fma4s<0x6A, "vfmaddss", ssmem, sse_load_f32, + int_x86_fma4_vfmadd_ss>; +defm VFMADDSD4 : fma4s<0x6B, "vfmaddsd", sdmem, sse_load_f64, + int_x86_fma4_vfmadd_sd>; +defm VFMADDPS4 : fma4p<0x68, "vfmaddps", int_x86_fma4_vfmadd_ps, + int_x86_fma4_vfmadd_ps_256, memopv4f32, memopv8f32>; +defm VFMADDPD4 : fma4p<0x69, "vfmaddpd", int_x86_fma4_vfmadd_pd, + int_x86_fma4_vfmadd_pd_256, memopv2f64, memopv4f64>; +defm VFMSUBSS4 : fma4s<0x6E, "vfmsubss", ssmem, sse_load_f32, + int_x86_fma4_vfmsub_ss>; +defm VFMSUBSD4 : fma4s<0x6F, "vfmsubsd", sdmem, sse_load_f64, + int_x86_fma4_vfmsub_sd>; +defm VFMSUBPS4 : fma4p<0x6C, "vfmsubps", int_x86_fma4_vfmsub_ps, + int_x86_fma4_vfmsub_ps_256, memopv4f32, memopv8f32>; +defm VFMSUBPD4 : fma4p<0x6D, "vfmsubpd", int_x86_fma4_vfmsub_pd, + int_x86_fma4_vfmsub_pd_256, memopv2f64, memopv4f64>; +defm VFNMADDSS4 : fma4s<0x7A, "vfnmaddss", ssmem, sse_load_f32, + int_x86_fma4_vfnmadd_ss>; +defm VFNMADDSD4 : fma4s<0x7B, "vfnmaddsd", sdmem, sse_load_f64, + int_x86_fma4_vfnmadd_sd>; +defm VFNMADDPS4 : fma4p<0x78, "vfnmaddps", int_x86_fma4_vfnmadd_ps, + int_x86_fma4_vfnmadd_ps_256, memopv4f32, memopv8f32>; +defm VFNMADDPD4 : fma4p<0x79, "vfnmaddpd", int_x86_fma4_vfnmadd_pd, + int_x86_fma4_vfnmadd_pd_256, memopv2f64, memopv4f64>; +defm VFNMSUBSS4 : fma4s<0x7E, "vfnmsubss", ssmem, sse_load_f32, + int_x86_fma4_vfnmsub_ss>; +defm VFNMSUBSD4 : fma4s<0x7F, "vfnmsubsd", sdmem, sse_load_f64, + int_x86_fma4_vfnmsub_sd>; +defm VFNMSUBPS4 : fma4p<0x7C, "vfnmsubps", 
int_x86_fma4_vfnmsub_ps, + int_x86_fma4_vfnmsub_ps_256, memopv4f32, memopv8f32>; +defm VFNMSUBPD4 : fma4p<0x7D, "vfnmsubpd", int_x86_fma4_vfnmsub_pd, + int_x86_fma4_vfnmsub_pd_256, memopv2f64, memopv4f64>; +defm VFMADDSUBPS4 : fma4p<0x5C, "vfmaddsubps", int_x86_fma4_vfmaddsub_ps, + int_x86_fma4_vfmaddsub_ps_256, memopv4f32, memopv8f32>; +defm VFMADDSUBPD4 : fma4p<0x5D, "vfmaddsubpd", int_x86_fma4_vfmaddsub_pd, + int_x86_fma4_vfmaddsub_pd_256, memopv2f64, memopv4f64>; +defm VFMSUBADDPS4 : fma4p<0x5E, "vfmsubaddps", int_x86_fma4_vfmsubadd_ps, + int_x86_fma4_vfmsubadd_ps_256, memopv4f32, memopv8f32>; +defm VFMSUBADDPD4 : fma4p<0x5F, "vfmsubaddpd", int_x86_fma4_vfmsubadd_pd, + int_x86_fma4_vfmsubadd_pd_256, memopv2f64, memopv4f64>; diff --git a/contrib/llvm/lib/Target/X86/X86InstrFPStack.td b/contrib/llvm/lib/Target/X86/X86InstrFPStack.td index 7cb870f..a13887e 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrFPStack.td +++ b/contrib/llvm/lib/Target/X86/X86InstrFPStack.td @@ -1,10 +1,10 @@ -//==- X86InstrFPStack.td - Describe the X86 Instruction Set --*- tablegen -*-=// -// +//===- X86InstrFPStack.td - FPU Instruction Set ------------*- tablegen -*-===// +// // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. -// +// //===----------------------------------------------------------------------===// // // This file describes the X86 x87 FPU instruction set, defining the @@ -225,22 +225,22 @@ class FPrST0PInst<bits<8> o, string asm> // of some of the 'reverse' forms of the fsub and fdiv instructions. As such, // we have to put some 'r's in and take them out of weird places. def ADD_FST0r : FPST0rInst <0xC0, "fadd\t$op">; -def ADD_FrST0 : FPrST0Inst <0xC0, "fadd\t{%st(0), $op|$op, %ST(0)}">; +def ADD_FrST0 : FPrST0Inst <0xC0, "fadd\t{%st(0), $op|$op, ST(0)}">; def ADD_FPrST0 : FPrST0PInst<0xC0, "faddp\t$op">; def SUBR_FST0r : FPST0rInst <0xE8, "fsubr\t$op">; -def SUB_FrST0 : FPrST0Inst <0xE8, "fsub{r}\t{%st(0), $op|$op, %ST(0)}">; +def SUB_FrST0 : FPrST0Inst <0xE8, "fsub{r}\t{%st(0), $op|$op, ST(0)}">; def SUB_FPrST0 : FPrST0PInst<0xE8, "fsub{r}p\t$op">; def SUB_FST0r : FPST0rInst <0xE0, "fsub\t$op">; -def SUBR_FrST0 : FPrST0Inst <0xE0, "fsub{|r}\t{%st(0), $op|$op, %ST(0)}">; +def SUBR_FrST0 : FPrST0Inst <0xE0, "fsub{|r}\t{%st(0), $op|$op, ST(0)}">; def SUBR_FPrST0 : FPrST0PInst<0xE0, "fsub{|r}p\t$op">; def MUL_FST0r : FPST0rInst <0xC8, "fmul\t$op">; -def MUL_FrST0 : FPrST0Inst <0xC8, "fmul\t{%st(0), $op|$op, %ST(0)}">; +def MUL_FrST0 : FPrST0Inst <0xC8, "fmul\t{%st(0), $op|$op, ST(0)}">; def MUL_FPrST0 : FPrST0PInst<0xC8, "fmulp\t$op">; def DIVR_FST0r : FPST0rInst <0xF8, "fdivr\t$op">; -def DIV_FrST0 : FPrST0Inst <0xF8, "fdiv{r}\t{%st(0), $op|$op, %ST(0)}">; +def DIV_FrST0 : FPrST0Inst <0xF8, "fdiv{r}\t{%st(0), $op|$op, ST(0)}">; def DIV_FPrST0 : FPrST0PInst<0xF8, "fdiv{r}p\t$op">; def DIV_FST0r : FPST0rInst <0xF0, "fdiv\t$op">; -def DIVR_FrST0 : FPrST0Inst <0xF0, "fdiv{|r}\t{%st(0), $op|$op, %ST(0)}">; +def DIVR_FrST0 : FPrST0Inst <0xF0, "fdiv{|r}\t{%st(0), $op|$op, ST(0)}">; def DIVR_FPrST0 : FPrST0PInst<0xF0, "fdiv{|r}p\t$op">; def COM_FST0r : FPST0rInst <0xD0, "fcom\t$op">; @@ -330,21 +330,21 @@ defm CMOVNP : FPCMov<X86_COND_NP>; let Predicates = [HasCMov] in { // These are not factored because there's no clean way to pass DA/DB. 
def CMOVB_F : FPI<0xC0, AddRegFrm, (outs RST:$op), (ins), - "fcmovb\t{$op, %st(0)|%ST(0), $op}">, DA; + "fcmovb\t{$op, %st(0)|ST(0), $op}">, DA; def CMOVBE_F : FPI<0xD0, AddRegFrm, (outs RST:$op), (ins), - "fcmovbe\t{$op, %st(0)|%ST(0), $op}">, DA; + "fcmovbe\t{$op, %st(0)|ST(0), $op}">, DA; def CMOVE_F : FPI<0xC8, AddRegFrm, (outs RST:$op), (ins), - "fcmove\t{$op, %st(0)|%ST(0), $op}">, DA; + "fcmove\t{$op, %st(0)|ST(0), $op}">, DA; def CMOVP_F : FPI<0xD8, AddRegFrm, (outs RST:$op), (ins), - "fcmovu\t {$op, %st(0)|%ST(0), $op}">, DA; + "fcmovu\t {$op, %st(0)|ST(0), $op}">, DA; def CMOVNB_F : FPI<0xC0, AddRegFrm, (outs RST:$op), (ins), - "fcmovnb\t{$op, %st(0)|%ST(0), $op}">, DB; + "fcmovnb\t{$op, %st(0)|ST(0), $op}">, DB; def CMOVNBE_F: FPI<0xD0, AddRegFrm, (outs RST:$op), (ins), - "fcmovnbe\t{$op, %st(0)|%ST(0), $op}">, DB; + "fcmovnbe\t{$op, %st(0)|ST(0), $op}">, DB; def CMOVNE_F : FPI<0xC8, AddRegFrm, (outs RST:$op), (ins), - "fcmovne\t{$op, %st(0)|%ST(0), $op}">, DB; + "fcmovne\t{$op, %st(0)|ST(0), $op}">, DB; def CMOVNP_F : FPI<0xD8, AddRegFrm, (outs RST:$op), (ins), - "fcmovnu\t{$op, %st(0)|%ST(0), $op}">, DB; + "fcmovnu\t{$op, %st(0)|ST(0), $op}">, DB; } // Predicates = [HasCMov] // Floating point loads & stores. @@ -437,33 +437,26 @@ def IST_FP64m : FPI<0xDF, MRM7m, (outs), (ins i64mem:$dst), "fistp{ll}\t$dst">; } // FISTTP requires SSE3 even though it's a FPStack op. +let Predicates = [HasSSE3] in { def ISTT_Fp16m32 : FpI_<(outs), (ins i16mem:$op, RFP32:$src), OneArgFP, - [(X86fp_to_i16mem RFP32:$src, addr:$op)]>, - Requires<[HasSSE3]>; + [(X86fp_to_i16mem RFP32:$src, addr:$op)]>; def ISTT_Fp32m32 : FpI_<(outs), (ins i32mem:$op, RFP32:$src), OneArgFP, - [(X86fp_to_i32mem RFP32:$src, addr:$op)]>, - Requires<[HasSSE3]>; + [(X86fp_to_i32mem RFP32:$src, addr:$op)]>; def ISTT_Fp64m32 : FpI_<(outs), (ins i64mem:$op, RFP32:$src), OneArgFP, - [(X86fp_to_i64mem RFP32:$src, addr:$op)]>, - Requires<[HasSSE3]>; + [(X86fp_to_i64mem RFP32:$src, addr:$op)]>; def ISTT_Fp16m64 : FpI_<(outs), (ins i16mem:$op, RFP64:$src), OneArgFP, - [(X86fp_to_i16mem RFP64:$src, addr:$op)]>, - Requires<[HasSSE3]>; + [(X86fp_to_i16mem RFP64:$src, addr:$op)]>; def ISTT_Fp32m64 : FpI_<(outs), (ins i32mem:$op, RFP64:$src), OneArgFP, - [(X86fp_to_i32mem RFP64:$src, addr:$op)]>, - Requires<[HasSSE3]>; + [(X86fp_to_i32mem RFP64:$src, addr:$op)]>; def ISTT_Fp64m64 : FpI_<(outs), (ins i64mem:$op, RFP64:$src), OneArgFP, - [(X86fp_to_i64mem RFP64:$src, addr:$op)]>, - Requires<[HasSSE3]>; + [(X86fp_to_i64mem RFP64:$src, addr:$op)]>; def ISTT_Fp16m80 : FpI_<(outs), (ins i16mem:$op, RFP80:$src), OneArgFP, - [(X86fp_to_i16mem RFP80:$src, addr:$op)]>, - Requires<[HasSSE3]>; + [(X86fp_to_i16mem RFP80:$src, addr:$op)]>; def ISTT_Fp32m80 : FpI_<(outs), (ins i32mem:$op, RFP80:$src), OneArgFP, - [(X86fp_to_i32mem RFP80:$src, addr:$op)]>, - Requires<[HasSSE3]>; + [(X86fp_to_i32mem RFP80:$src, addr:$op)]>; def ISTT_Fp64m80 : FpI_<(outs), (ins i64mem:$op, RFP80:$src), OneArgFP, - [(X86fp_to_i64mem RFP80:$src, addr:$op)]>, - Requires<[HasSSE3]>; + [(X86fp_to_i64mem RFP80:$src, addr:$op)]>; +} // Predicates = [HasSSE3] let mayStore = 1 in { def ISTT_FP16m : FPI<0xDF, MRM1m, (outs), (ins i16mem:$dst), "fisttp{s}\t$dst">; diff --git a/contrib/llvm/lib/Target/X86/X86InstrFormats.td b/contrib/llvm/lib/Target/X86/X86InstrFormats.td index 0a1590b..b387090 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrFormats.td +++ b/contrib/llvm/lib/Target/X86/X86InstrFormats.td @@ -1,10 +1,10 @@ -//===- X86InstrFormats.td - X86 Instruction Formats 
--------*- tablegen -*-===// -// +//===-- X86InstrFormats.td - X86 Instruction Formats -------*- tablegen -*-===// +// // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. -// +// //===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===// @@ -43,6 +43,15 @@ def RawFrmImm8 : Format<43>; def RawFrmImm16 : Format<44>; def MRM_D0 : Format<45>; def MRM_D1 : Format<46>; +def MRM_D4 : Format<47>; +def MRM_D8 : Format<48>; +def MRM_D9 : Format<49>; +def MRM_DA : Format<50>; +def MRM_DB : Format<51>; +def MRM_DC : Format<52>; +def MRM_DD : Format<53>; +def MRM_DE : Format<54>; +def MRM_DF : Format<55>; // ImmType - This specifies the immediate type used by an instruction. This is // part of the ad-hoc solution used to emit machine instruction encodings by our @@ -107,17 +116,25 @@ class T8 { bits<5> Prefix = 13; } class TA { bits<5> Prefix = 14; } class A6 { bits<5> Prefix = 15; } class A7 { bits<5> Prefix = 16; } -class TF { bits<5> Prefix = 17; } +class T8XD { bits<5> Prefix = 17; } +class T8XS { bits<5> Prefix = 18; } +class TAXD { bits<5> Prefix = 19; } +class XOP8 { bits<5> Prefix = 20; } +class XOP9 { bits<5> Prefix = 21; } class VEX { bit hasVEXPrefix = 1; } class VEX_W { bit hasVEX_WPrefix = 1; } class VEX_4V : VEX { bit hasVEX_4VPrefix = 1; } +class VEX_4VOp3 : VEX { bit hasVEX_4VOp3Prefix = 1; } class VEX_I8IMM { bit hasVEX_i8ImmReg = 1; } class VEX_L { bit hasVEX_L = 1; } class VEX_LIG { bit ignoresVEX_L = 1; } class Has3DNow0F0FOpcode { bit has3DNow0F0FOpcode = 1; } - +class MemOp4 { bit hasMemOp4Prefix = 1; } +class XOP { bit hasXOP_Prefix = 1; } class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins, - string AsmStr, Domain d = GenericDomain> + string AsmStr, + InstrItinClass itin, + Domain d = GenericDomain> : Instruction { let Namespace = "X86"; @@ -133,6 +150,8 @@ class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins, // If this is a pseudo instruction, mark it isCodeGenOnly. let isCodeGenOnly = !eq(!cast<string>(f), "Pseudo"); + let Itinerary = itin; + // // Attributes specific to X86 instructions... // @@ -148,11 +167,15 @@ class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins, bit hasVEXPrefix = 0; // Does this inst require a VEX prefix? bit hasVEX_WPrefix = 0; // Does this inst set the VEX_W field? bit hasVEX_4VPrefix = 0; // Does this inst require the VEX.VVVV field? + bit hasVEX_4VOp3Prefix = 0; // Does this inst require the VEX.VVVV field to + // encode the third operand? bit hasVEX_i8ImmReg = 0; // Does this inst require the last source register // to be encoded in a immediate field? bit hasVEX_L = 0; // Does this inst use large (256-bit) registers? bit ignoresVEX_L = 0; // Does this instruction ignore the L-bit bit has3DNow0F0FOpcode =0;// Wacky 3dNow! encoding? + bit hasMemOp4Prefix = 0; // Same bit as VEX_W, but used for swapping operands + bit hasXOP_Prefix = 0; // Does this inst require an XOP prefix? // TSFlags layout should be kept in sync with X86InstrInfo.h. 
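The sync requirement stated above exists because the C++ side recovers these bits from TSFlags with fixed shifts and masks, so the positions assigned just below must match the enum in X86InstrInfo.h exactly. The sketch that follows shows the usual extraction pattern; the namespace and enumerator names are placeholders chosen for illustration, only the bit positions echo the assignments below, and it is not part of the diff.

#include <cstdint>

// Hypothetical mirror of the packed description word.  The authoritative
// positions live in X86InstrInfo.h; these simply echo the TableGen
// assignments that follow (TSFlags{5-0} = FormBits, {34} = hasVEX_WPrefix,
// {41} = hasMemOp4Prefix).
namespace demo {
enum : uint64_t {
  FormShift = 0,
  FormMask  = 0x3f,
  HasVEX_W  = 1ULL << 34,
  HasMemOp4 = 1ULL << 41
};

inline unsigned getForm(uint64_t TSFlags) {
  return static_cast<unsigned>((TSFlags >> FormShift) & FormMask);
}
inline bool hasMemOp4(uint64_t TSFlags) { return (TSFlags & HasMemOp4) != 0; }
}  // namespace demo

int main() {
  uint64_t flags = demo::HasMemOp4 | 5;       // Form = 5, MemOp4 bit set
  return (demo::getForm(flags) == 5 && demo::hasMemOp4(flags)) ? 0 : 1;
}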
let TSFlags{5-0} = FormBits; @@ -169,58 +192,63 @@ class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins, let TSFlags{33} = hasVEXPrefix; let TSFlags{34} = hasVEX_WPrefix; let TSFlags{35} = hasVEX_4VPrefix; - let TSFlags{36} = hasVEX_i8ImmReg; - let TSFlags{37} = hasVEX_L; - let TSFlags{38} = ignoresVEX_L; - let TSFlags{39} = has3DNow0F0FOpcode; + let TSFlags{36} = hasVEX_4VOp3Prefix; + let TSFlags{37} = hasVEX_i8ImmReg; + let TSFlags{38} = hasVEX_L; + let TSFlags{39} = ignoresVEX_L; + let TSFlags{40} = has3DNow0F0FOpcode; + let TSFlags{41} = hasMemOp4Prefix; + let TSFlags{42} = hasXOP_Prefix; } class PseudoI<dag oops, dag iops, list<dag> pattern> - : X86Inst<0, Pseudo, NoImm, oops, iops, ""> { + : X86Inst<0, Pseudo, NoImm, oops, iops, "", NoItinerary> { let Pattern = pattern; } class I<bits<8> o, Format f, dag outs, dag ins, string asm, - list<dag> pattern, Domain d = GenericDomain> - : X86Inst<o, f, NoImm, outs, ins, asm, d> { + list<dag> pattern, InstrItinClass itin = IIC_DEFAULT, + Domain d = GenericDomain> + : X86Inst<o, f, NoImm, outs, ins, asm, itin, d> { let Pattern = pattern; let CodeSize = 3; } class Ii8 <bits<8> o, Format f, dag outs, dag ins, string asm, - list<dag> pattern, Domain d = GenericDomain> - : X86Inst<o, f, Imm8, outs, ins, asm, d> { + list<dag> pattern, InstrItinClass itin = IIC_DEFAULT, + Domain d = GenericDomain> + : X86Inst<o, f, Imm8, outs, ins, asm, itin, d> { let Pattern = pattern; let CodeSize = 3; } class Ii8PCRel<bits<8> o, Format f, dag outs, dag ins, string asm, - list<dag> pattern> - : X86Inst<o, f, Imm8PCRel, outs, ins, asm> { + list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> + : X86Inst<o, f, Imm8PCRel, outs, ins, asm, itin> { let Pattern = pattern; let CodeSize = 3; } class Ii16<bits<8> o, Format f, dag outs, dag ins, string asm, - list<dag> pattern> - : X86Inst<o, f, Imm16, outs, ins, asm> { + list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> + : X86Inst<o, f, Imm16, outs, ins, asm, itin> { let Pattern = pattern; let CodeSize = 3; } class Ii32<bits<8> o, Format f, dag outs, dag ins, string asm, - list<dag> pattern> - : X86Inst<o, f, Imm32, outs, ins, asm> { + list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> + : X86Inst<o, f, Imm32, outs, ins, asm, itin> { let Pattern = pattern; let CodeSize = 3; } class Ii16PCRel<bits<8> o, Format f, dag outs, dag ins, string asm, - list<dag> pattern> - : X86Inst<o, f, Imm16PCRel, outs, ins, asm> { + list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> + : X86Inst<o, f, Imm16PCRel, outs, ins, asm, itin> { let Pattern = pattern; let CodeSize = 3; } class Ii32PCRel<bits<8> o, Format f, dag outs, dag ins, string asm, - list<dag> pattern> - : X86Inst<o, f, Imm32PCRel, outs, ins, asm> { + list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> + : X86Inst<o, f, Imm32PCRel, outs, ins, asm, itin> { let Pattern = pattern; let CodeSize = 3; } @@ -231,8 +259,9 @@ class FPI<bits<8> o, Format F, dag outs, dag ins, string asm> : I<o, F, outs, ins, asm, []> {} // FpI_ - Floating Point Pseudo Instruction template. Not Predicated. 
-class FpI_<dag outs, dag ins, FPFormat fp, list<dag> pattern> - : X86Inst<0, Pseudo, NoImm, outs, ins, ""> { +class FpI_<dag outs, dag ins, FPFormat fp, list<dag> pattern, + InstrItinClass itin = IIC_DEFAULT> + : X86Inst<0, Pseudo, NoImm, outs, ins, "", itin> { let FPForm = fp; let Pattern = pattern; } @@ -244,20 +273,23 @@ class FpI_<dag outs, dag ins, FPFormat fp, list<dag> pattern> // Iseg32 - 16-bit segment selector, 32-bit offset class Iseg16 <bits<8> o, Format f, dag outs, dag ins, string asm, - list<dag> pattern> : X86Inst<o, f, Imm16, outs, ins, asm> { + list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> + : X86Inst<o, f, Imm16, outs, ins, asm, itin> { let Pattern = pattern; let CodeSize = 3; } class Iseg32 <bits<8> o, Format f, dag outs, dag ins, string asm, - list<dag> pattern> : X86Inst<o, f, Imm32, outs, ins, asm> { + list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> + : X86Inst<o, f, Imm32, outs, ins, asm, itin> { let Pattern = pattern; let CodeSize = 3; } // SI - SSE 1 & 2 scalar instructions -class SI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern> - : I<o, F, outs, ins, asm, pattern> { +class SI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> + : I<o, F, outs, ins, asm, pattern, itin> { let Predicates = !if(hasVEXPrefix /* VEX */, [HasAVX], !if(!eq(Prefix, 12 /* XS */), [HasSSE1], [HasSSE2])); @@ -267,8 +299,8 @@ class SI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern> // SIi8 - SSE 1 & 2 scalar instructions class SIi8<bits<8> o, Format F, dag outs, dag ins, string asm, - list<dag> pattern> - : Ii8<o, F, outs, ins, asm, pattern> { + list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> + : Ii8<o, F, outs, ins, asm, pattern, itin> { let Predicates = !if(hasVEXPrefix /* VEX */, [HasAVX], !if(!eq(Prefix, 12 /* XS */), [HasSSE1], [HasSSE2])); @@ -278,8 +310,8 @@ class SIi8<bits<8> o, Format F, dag outs, dag ins, string asm, // PI - SSE 1 & 2 packed instructions class PI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, - Domain d> - : I<o, F, outs, ins, asm, pattern, d> { + InstrItinClass itin, Domain d> + : I<o, F, outs, ins, asm, pattern, itin, d> { let Predicates = !if(hasVEXPrefix /* VEX */, [HasAVX], !if(hasOpSizePrefix /* OpSize */, [HasSSE2], [HasSSE1])); @@ -289,8 +321,8 @@ class PI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, // PIi8 - SSE 1 & 2 packed instructions with immediate class PIi8<bits<8> o, Format F, dag outs, dag ins, string asm, - list<dag> pattern, Domain d> - : Ii8<o, F, outs, ins, asm, pattern, d> { + list<dag> pattern, InstrItinClass itin, Domain d> + : Ii8<o, F, outs, ins, asm, pattern, itin, d> { let Predicates = !if(hasVEX_4VPrefix /* VEX */, [HasAVX], !if(hasOpSizePrefix /* OpSize */, [HasSSE2], [HasSSE1])); @@ -306,25 +338,27 @@ class PIi8<bits<8> o, Format F, dag outs, dag ins, string asm, // VSSI - SSE1 instructions with XS prefix in AVX form. // VPSI - SSE1 instructions with TB prefix in AVX form. 
-class SSI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern> - : I<o, F, outs, ins, asm, pattern>, XS, Requires<[HasSSE1]>; +class SSI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> + : I<o, F, outs, ins, asm, pattern, itin>, XS, Requires<[HasSSE1]>; class SSIi8<bits<8> o, Format F, dag outs, dag ins, string asm, - list<dag> pattern> - : Ii8<o, F, outs, ins, asm, pattern>, XS, Requires<[HasSSE1]>; -class PSI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern> - : I<o, F, outs, ins, asm, pattern, SSEPackedSingle>, TB, + list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> + : Ii8<o, F, outs, ins, asm, pattern, itin>, XS, Requires<[HasSSE1]>; +class PSI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> + : I<o, F, outs, ins, asm, pattern, itin, SSEPackedSingle>, TB, Requires<[HasSSE1]>; class PSIi8<bits<8> o, Format F, dag outs, dag ins, string asm, - list<dag> pattern> - : Ii8<o, F, outs, ins, asm, pattern, SSEPackedSingle>, TB, + list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> + : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedSingle>, TB, Requires<[HasSSE1]>; class VSSI<bits<8> o, Format F, dag outs, dag ins, string asm, - list<dag> pattern> - : I<o, F, outs, ins, !strconcat("v", asm), pattern>, XS, + list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> + : I<o, F, outs, ins, !strconcat("v", asm), pattern, itin>, XS, Requires<[HasAVX]>; class VPSI<bits<8> o, Format F, dag outs, dag ins, string asm, - list<dag> pattern> - : I<o, F, outs, ins, !strconcat("v", asm), pattern, SSEPackedSingle>, TB, + list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> + : I<o, F, outs, ins, !strconcat("v", asm), pattern, itin, SSEPackedSingle>, TB, Requires<[HasAVX]>; // SSE2 Instruction Templates: @@ -337,28 +371,30 @@ class VPSI<bits<8> o, Format F, dag outs, dag ins, string asm, // VSDI - SSE2 instructions with XD prefix in AVX form. // VPDI - SSE2 instructions with TB and OpSize prefixes in AVX form. 
-class SDI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern> - : I<o, F, outs, ins, asm, pattern>, XD, Requires<[HasSSE2]>; +class SDI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> + : I<o, F, outs, ins, asm, pattern, itin>, XD, Requires<[HasSSE2]>; class SDIi8<bits<8> o, Format F, dag outs, dag ins, string asm, - list<dag> pattern> - : Ii8<o, F, outs, ins, asm, pattern>, XD, Requires<[HasSSE2]>; + list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> + : Ii8<o, F, outs, ins, asm, pattern, itin>, XD, Requires<[HasSSE2]>; class SSDIi8<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern> : Ii8<o, F, outs, ins, asm, pattern>, XS, Requires<[HasSSE2]>; -class PDI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern> - : I<o, F, outs, ins, asm, pattern, SSEPackedDouble>, TB, OpSize, +class PDI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> + : I<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>, TB, OpSize, Requires<[HasSSE2]>; class PDIi8<bits<8> o, Format F, dag outs, dag ins, string asm, - list<dag> pattern> - : Ii8<o, F, outs, ins, asm, pattern, SSEPackedDouble>, TB, OpSize, + list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> + : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>, TB, OpSize, Requires<[HasSSE2]>; class VSDI<bits<8> o, Format F, dag outs, dag ins, string asm, - list<dag> pattern> - : I<o, F, outs, ins, !strconcat("v", asm), pattern>, XD, + list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> + : I<o, F, outs, ins, !strconcat("v", asm), pattern, itin>, XD, Requires<[HasAVX]>; class VPDI<bits<8> o, Format F, dag outs, dag ins, string asm, - list<dag> pattern> - : I<o, F, outs, ins, !strconcat("v", asm), pattern, SSEPackedDouble>, TB, + list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> + : I<o, F, outs, ins, !strconcat("v", asm), pattern, itin, SSEPackedDouble>, TB, OpSize, Requires<[HasAVX]>; // SSE3 Instruction Templates: @@ -368,15 +404,16 @@ class VPDI<bits<8> o, Format F, dag outs, dag ins, string asm, // S3DI - SSE3 instructions with XD prefix. class S3SI<bits<8> o, Format F, dag outs, dag ins, string asm, - list<dag> pattern> - : I<o, F, outs, ins, asm, pattern, SSEPackedSingle>, XS, + list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> + : I<o, F, outs, ins, asm, pattern, itin, SSEPackedSingle>, XS, Requires<[HasSSE3]>; class S3DI<bits<8> o, Format F, dag outs, dag ins, string asm, - list<dag> pattern> - : I<o, F, outs, ins, asm, pattern, SSEPackedDouble>, XD, + list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> + : I<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>, XD, Requires<[HasSSE3]>; -class S3I<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern> - : I<o, F, outs, ins, asm, pattern, SSEPackedDouble>, TB, OpSize, +class S3I<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> + : I<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>, TB, OpSize, Requires<[HasSSE3]>; @@ -386,16 +423,16 @@ class S3I<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern> // SS3AI - SSSE3 instructions with TA prefix. // // Note: SSSE3 instructions have 64-bit and 128-bit versions. The 64-bit version -// uses the MMX registers. We put those instructions here because they better -// fit into the SSSE3 instruction category rather than the MMX category. +// uses the MMX registers. 
The 64-bit versions are grouped with the MMX +// classes. They need to be enabled even if AVX is enabled. class SS38I<bits<8> o, Format F, dag outs, dag ins, string asm, - list<dag> pattern> - : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, T8, + list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> + : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8, Requires<[HasSSSE3]>; class SS3AI<bits<8> o, Format F, dag outs, dag ins, string asm, - list<dag> pattern> - : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, TA, + list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> + : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA, Requires<[HasSSSE3]>; // SSE4.1 Instruction Templates: @@ -404,31 +441,31 @@ class SS3AI<bits<8> o, Format F, dag outs, dag ins, string asm, // SS41AIi8 - SSE 4.1 instructions with TA prefix and ImmT == Imm8. // class SS48I<bits<8> o, Format F, dag outs, dag ins, string asm, - list<dag> pattern> - : I<o, F, outs, ins, asm, pattern, SSEPackedInt>, T8, + list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> + : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8, Requires<[HasSSE41]>; class SS4AIi8<bits<8> o, Format F, dag outs, dag ins, string asm, - list<dag> pattern> - : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, TA, + list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> + : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA, Requires<[HasSSE41]>; // SSE4.2 Instruction Templates: // // SS428I - SSE 4.2 instructions with T8 prefix. class SS428I<bits<8> o, Format F, dag outs, dag ins, string asm, - list<dag> pattern> - : I<o, F, outs, ins, asm, pattern, SSEPackedInt>, T8, + list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> + : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8, Requires<[HasSSE42]>; -// SS42FI - SSE 4.2 instructions with TF prefix. +// SS42FI - SSE 4.2 instructions with T8XD prefix. class SS42FI<bits<8> o, Format F, dag outs, dag ins, string asm, - list<dag> pattern> - : I<o, F, outs, ins, asm, pattern>, TF, Requires<[HasSSE42]>; - + list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> + : I<o, F, outs, ins, asm, pattern, itin>, T8XD, Requires<[HasSSE42]>; + // SS42AI = SSE 4.2 instructions with TA prefix class SS42AI<bits<8> o, Format F, dag outs, dag ins, string asm, - list<dag> pattern> - : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, TA, + list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> + : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA, Requires<[HasSSE42]>; // AVX Instruction Templates: @@ -437,76 +474,115 @@ class SS42AI<bits<8> o, Format F, dag outs, dag ins, string asm, // AVX8I - AVX instructions with T8 and OpSize prefix. // AVXAIi8 - AVX instructions with TA, OpSize prefix and ImmT = Imm8. class AVX8I<bits<8> o, Format F, dag outs, dag ins, string asm, - list<dag> pattern> - : I<o, F, outs, ins, asm, pattern, SSEPackedInt>, T8, OpSize, + list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> + : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8, OpSize, Requires<[HasAVX]>; class AVXAIi8<bits<8> o, Format F, dag outs, dag ins, string asm, - list<dag> pattern> - : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, TA, OpSize, + list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> + : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA, OpSize, Requires<[HasAVX]>; +// AVX2 Instruction Templates: +// Instructions introduced in AVX2 (no SSE equivalent forms) +// +// AVX28I - AVX2 instructions with T8 and OpSize prefix. 
+// AVX2AIi8 - AVX2 instructions with TA, OpSize prefix and ImmT = Imm8. +class AVX28I<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> + : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8, OpSize, + Requires<[HasAVX2]>; +class AVX2AIi8<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> + : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA, OpSize, + Requires<[HasAVX2]>; + // AES Instruction Templates: // // AES8I // These use the same encoding as the SSE4.2 T8 and TA encodings. class AES8I<bits<8> o, Format F, dag outs, dag ins, string asm, - list<dag>pattern> - : I<o, F, outs, ins, asm, pattern, SSEPackedInt>, T8, - Requires<[HasAES]>; + list<dag>pattern, InstrItinClass itin = IIC_DEFAULT> + : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8, + Requires<[HasSSE2, HasAES]>; class AESAI<bits<8> o, Format F, dag outs, dag ins, string asm, - list<dag> pattern> - : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, TA, - Requires<[HasAES]>; + list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> + : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA, + Requires<[HasSSE2, HasAES]>; // CLMUL Instruction Templates class CLMULIi8<bits<8> o, Format F, dag outs, dag ins, string asm, - list<dag>pattern> - : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, TA, - OpSize, Requires<[HasCLMUL]>; + list<dag>pattern, InstrItinClass itin = IIC_DEFAULT> + : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA, + OpSize, Requires<[HasSSE2, HasCLMUL]>; class AVXCLMULIi8<bits<8> o, Format F, dag outs, dag ins, string asm, - list<dag>pattern> - : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, TA, + list<dag>pattern, InstrItinClass itin = IIC_DEFAULT> + : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA, OpSize, VEX_4V, Requires<[HasAVX, HasCLMUL]>; // FMA3 Instruction Templates class FMA3<bits<8> o, Format F, dag outs, dag ins, string asm, - list<dag>pattern> - : I<o, F, outs, ins, asm, pattern, SSEPackedInt>, T8, + list<dag>pattern, InstrItinClass itin = IIC_DEFAULT> + : I<o, F, outs, ins, asm, pattern, itin>, T8, OpSize, VEX_4V, Requires<[HasFMA3]>; +// FMA4 Instruction Templates +class FMA4<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag>pattern, InstrItinClass itin = IIC_DEFAULT> + : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA, + OpSize, VEX_4V, VEX_I8IMM, Requires<[HasFMA4]>; + +// XOP 2, 3 and 4 Operand Instruction Template +class IXOP<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> + : I<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>, + XOP, XOP9, Requires<[HasXOP]>; + +// XOP 2, 3 and 4 Operand Instruction Templates with imm byte +class IXOPi8<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> + : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>, + XOP, XOP8, Requires<[HasXOP]>; + +// XOP 5 operand instruction (VEX encoding!) +class IXOP5<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag>pattern, InstrItinClass itin = IIC_DEFAULT> + : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA, + OpSize, VEX_4V, VEX_I8IMM, Requires<[HasXOP]>; + // X86-64 Instruction templates... 
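Beyond the itinerary plumbing, the AES and CLMUL templates in this hunk tighten their predicates: the legacy-encoded forms now require HasSSE2 in addition to the feature bit, while the VEX-encoded CLMUL form pairs HasCLMUL with HasAVX. A hedged sketch of that conjunction, with made-up feature-bit names rather than the real subtarget flags:

#include <cstdint>

// Illustrative feature bits; the real subtarget feature flags live elsewhere.
enum Feature : uint32_t {
  FeatureSSE2  = 1u << 0,
  FeatureAES   = 1u << 1,
  FeatureCLMUL = 1u << 2,
  FeatureAVX   = 1u << 3
};

// Legacy-encoded AES forms: Requires<[HasSSE2, HasAES]>.
inline bool canSelectAES(uint32_t Features) {
  return (Features & FeatureSSE2) && (Features & FeatureAES);
}

// VEX-encoded CLMUL form: Requires<[HasAVX, HasCLMUL]>.
inline bool canSelectVEXCLMUL(uint32_t Features) {
  return (Features & FeatureAVX) && (Features & FeatureCLMUL);
}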
// -class RI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern> - : I<o, F, outs, ins, asm, pattern>, REX_W; +class RI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> + : I<o, F, outs, ins, asm, pattern, itin>, REX_W; class RIi8 <bits<8> o, Format F, dag outs, dag ins, string asm, - list<dag> pattern> - : Ii8<o, F, outs, ins, asm, pattern>, REX_W; + list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> + : Ii8<o, F, outs, ins, asm, pattern, itin>, REX_W; class RIi32 <bits<8> o, Format F, dag outs, dag ins, string asm, - list<dag> pattern> - : Ii32<o, F, outs, ins, asm, pattern>, REX_W; + list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> + : Ii32<o, F, outs, ins, asm, pattern, itin>, REX_W; class RIi64<bits<8> o, Format f, dag outs, dag ins, string asm, - list<dag> pattern> - : X86Inst<o, f, Imm64, outs, ins, asm>, REX_W { + list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> + : X86Inst<o, f, Imm64, outs, ins, asm, itin>, REX_W { let Pattern = pattern; let CodeSize = 3; } class RSSI<bits<8> o, Format F, dag outs, dag ins, string asm, - list<dag> pattern> - : SSI<o, F, outs, ins, asm, pattern>, REX_W; + list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> + : SSI<o, F, outs, ins, asm, pattern, itin>, REX_W; class RSDI<bits<8> o, Format F, dag outs, dag ins, string asm, - list<dag> pattern> - : SDI<o, F, outs, ins, asm, pattern>, REX_W; + list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> + : SDI<o, F, outs, ins, asm, pattern, itin>, REX_W; class RPDI<bits<8> o, Format F, dag outs, dag ins, string asm, - list<dag> pattern> - : PDI<o, F, outs, ins, asm, pattern>, REX_W; + list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> + : PDI<o, F, outs, ins, asm, pattern, itin>, REX_W; class VRPDI<bits<8> o, Format F, dag outs, dag ins, string asm, - list<dag> pattern> - : VPDI<o, F, outs, ins, asm, pattern>, VEX_W; + list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> + : VPDI<o, F, outs, ins, asm, pattern, itin>, VEX_W; // MMX Instruction templates // @@ -519,23 +595,23 @@ class VRPDI<bits<8> o, Format F, dag outs, dag ins, string asm, // MMXID - MMX instructions with XD prefix. // MMXIS - MMX instructions with XS prefix. 
class MMXI<bits<8> o, Format F, dag outs, dag ins, string asm, - list<dag> pattern> - : I<o, F, outs, ins, asm, pattern>, TB, Requires<[HasMMX]>; + list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> + : I<o, F, outs, ins, asm, pattern, itin>, TB, Requires<[HasMMX]>; class MMXI64<bits<8> o, Format F, dag outs, dag ins, string asm, - list<dag> pattern> - : I<o, F, outs, ins, asm, pattern>, TB, Requires<[HasMMX,In64BitMode]>; + list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> + : I<o, F, outs, ins, asm, pattern, itin>, TB, Requires<[HasMMX,In64BitMode]>; class MMXRI<bits<8> o, Format F, dag outs, dag ins, string asm, - list<dag> pattern> - : I<o, F, outs, ins, asm, pattern>, TB, REX_W, Requires<[HasMMX]>; + list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> + : I<o, F, outs, ins, asm, pattern, itin>, TB, REX_W, Requires<[HasMMX]>; class MMX2I<bits<8> o, Format F, dag outs, dag ins, string asm, - list<dag> pattern> - : I<o, F, outs, ins, asm, pattern>, TB, OpSize, Requires<[HasMMX]>; + list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> + : I<o, F, outs, ins, asm, pattern, itin>, TB, OpSize, Requires<[HasMMX]>; class MMXIi8<bits<8> o, Format F, dag outs, dag ins, string asm, - list<dag> pattern> - : Ii8<o, F, outs, ins, asm, pattern>, TB, Requires<[HasMMX]>; + list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> + : Ii8<o, F, outs, ins, asm, pattern, itin>, TB, Requires<[HasMMX]>; class MMXID<bits<8> o, Format F, dag outs, dag ins, string asm, - list<dag> pattern> - : Ii8<o, F, outs, ins, asm, pattern>, XD, Requires<[HasMMX]>; + list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> + : Ii8<o, F, outs, ins, asm, pattern, itin>, XD, Requires<[HasMMX]>; class MMXIS<bits<8> o, Format F, dag outs, dag ins, string asm, - list<dag> pattern> - : Ii8<o, F, outs, ins, asm, pattern>, XS, Requires<[HasMMX]>; + list<dag> pattern, InstrItinClass itin = IIC_DEFAULT> + : Ii8<o, F, outs, ins, asm, pattern, itin>, XS, Requires<[HasMMX]>; diff --git a/contrib/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/contrib/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td index af919fb..041a64f 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/contrib/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -1,10 +1,10 @@ -//======- X86InstrFragmentsSIMD.td - x86 ISA -------------*- tablegen -*-=====// +//===-- X86InstrFragmentsSIMD.td - x86 SIMD ISA ------------*- tablegen -*-===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. -// +// //===----------------------------------------------------------------------===// // // This file provides pattern fragments useful for SIMD instructions. 
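The recurring change across X86InstrFormats.td above is mechanical: every instruction template gains a trailing InstrItinClass itin = IIC_DEFAULT parameter and forwards it to its base class, so existing definitions keep compiling while new ones can name an itinerary. The same defaulted-parameter threading in plain C++, with stand-in type and value names:

// Stand-ins for the TableGen itinerary classes; not LLVM's definitions.
enum InstrItinClass { IIC_DEFAULT, IIC_ALU, IIC_SSE_MOV };

struct X86InstBase {
  InstrItinClass Itin;
  explicit X86InstBase(InstrItinClass itin = IIC_DEFAULT) : Itin(itin) {}
};

struct ScalarSSEInst : X86InstBase {
  // Forward the (optionally provided) itinerary, exactly like
  //   class SI<...> : I<o, F, outs, ins, asm, pattern, itin>
  explicit ScalarSSEInst(InstrItinClass itin = IIC_DEFAULT) : X86InstBase(itin) {}
};

int main() {
  ScalarSSEInst OldStyle;              // existing definitions still compile
  ScalarSSEInst NewStyle(IIC_SSE_MOV); // new definitions can pass an itinerary
  return (OldStyle.Itin == IIC_DEFAULT && NewStyle.Itin == IIC_SSE_MOV) ? 0 : 1;
}

The default keeps the change source-compatible; only definitions that care about scheduling need to pass anything.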
@@ -41,24 +41,20 @@ def X86fsrl : SDNode<"X86ISD::FSRL", SDTX86FPShiftOp>; def X86fgetsign: SDNode<"X86ISD::FGETSIGNx86",SDTFPToIntOp>; def X86fhadd : SDNode<"X86ISD::FHADD", SDTFPBinOp>; def X86fhsub : SDNode<"X86ISD::FHSUB", SDTFPBinOp>; +def X86hadd : SDNode<"X86ISD::HADD", SDTIntBinOp>; +def X86hsub : SDNode<"X86ISD::HSUB", SDTIntBinOp>; def X86comi : SDNode<"X86ISD::COMI", SDTX86CmpTest>; def X86ucomi : SDNode<"X86ISD::UCOMI", SDTX86CmpTest>; def X86cmpss : SDNode<"X86ISD::FSETCCss", SDTX86Cmpss>; def X86cmpsd : SDNode<"X86ISD::FSETCCsd", SDTX86Cmpsd>; def X86pshufb : SDNode<"X86ISD::PSHUFB", - SDTypeProfile<1, 2, [SDTCisVT<0, v16i8>, SDTCisSameAs<0,1>, + SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, SDTCisSameAs<0,2>]>>; def X86andnp : SDNode<"X86ISD::ANDNP", SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, SDTCisSameAs<0,2>]>>; -def X86psignb : SDNode<"X86ISD::PSIGNB", - SDTypeProfile<1, 2, [SDTCisVT<0, v16i8>, SDTCisSameAs<0,1>, - SDTCisSameAs<0,2>]>>; -def X86psignw : SDNode<"X86ISD::PSIGNW", - SDTypeProfile<1, 2, [SDTCisVT<0, v8i16>, SDTCisSameAs<0,1>, - SDTCisSameAs<0,2>]>>; -def X86psignd : SDNode<"X86ISD::PSIGND", - SDTypeProfile<1, 2, [SDTCisVT<0, v4i32>, SDTCisSameAs<0,1>, +def X86psign : SDNode<"X86ISD::PSIGN", + SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, SDTCisSameAs<0,2>]>>; def X86pextrb : SDNode<"X86ISD::PEXTRB", SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisPtrTy<2>]>>; @@ -75,20 +71,30 @@ def X86insrtps : SDNode<"X86ISD::INSERTPS", SDTCisVT<2, v4f32>, SDTCisPtrTy<3>]>>; def X86vzmovl : SDNode<"X86ISD::VZEXT_MOVL", SDTypeProfile<1, 1, [SDTCisSameAs<0,1>]>>; +def X86vsmovl : SDNode<"X86ISD::VSEXT_MOVL", + SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisInt<1>, SDTCisInt<0>]>>; + def X86vzload : SDNode<"X86ISD::VZEXT_LOAD", SDTLoad, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; -def X86vshl : SDNode<"X86ISD::VSHL", SDTIntShiftOp>; -def X86vshr : SDNode<"X86ISD::VSRL", SDTIntShiftOp>; -def X86cmpps : SDNode<"X86ISD::CMPPS", SDTX86VFCMP>; -def X86cmppd : SDNode<"X86ISD::CMPPD", SDTX86VFCMP>; -def X86pcmpeqb : SDNode<"X86ISD::PCMPEQB", SDTIntBinOp, [SDNPCommutative]>; -def X86pcmpeqw : SDNode<"X86ISD::PCMPEQW", SDTIntBinOp, [SDNPCommutative]>; -def X86pcmpeqd : SDNode<"X86ISD::PCMPEQD", SDTIntBinOp, [SDNPCommutative]>; -def X86pcmpeqq : SDNode<"X86ISD::PCMPEQQ", SDTIntBinOp, [SDNPCommutative]>; -def X86pcmpgtb : SDNode<"X86ISD::PCMPGTB", SDTIntBinOp>; -def X86pcmpgtw : SDNode<"X86ISD::PCMPGTW", SDTIntBinOp>; -def X86pcmpgtd : SDNode<"X86ISD::PCMPGTD", SDTIntBinOp>; -def X86pcmpgtq : SDNode<"X86ISD::PCMPGTQ", SDTIntBinOp>; +def X86vshldq : SDNode<"X86ISD::VSHLDQ", SDTIntShiftOp>; +def X86vshrdq : SDNode<"X86ISD::VSRLDQ", SDTIntShiftOp>; +def X86cmpp : SDNode<"X86ISD::CMPP", SDTX86VFCMP>; +def X86pcmpeq : SDNode<"X86ISD::PCMPEQ", SDTIntBinOp, [SDNPCommutative]>; +def X86pcmpgt : SDNode<"X86ISD::PCMPGT", SDTIntBinOp>; + +def X86vshl : SDNode<"X86ISD::VSHL", + SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisVec<2>]>>; +def X86vsrl : SDNode<"X86ISD::VSRL", + SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisVec<2>]>>; +def X86vsra : SDNode<"X86ISD::VSRA", + SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisVec<2>]>>; + +def X86vshli : SDNode<"X86ISD::VSHLI", SDTIntShiftOp>; +def X86vsrli : SDNode<"X86ISD::VSRLI", SDTIntShiftOp>; +def X86vsrai : SDNode<"X86ISD::VSRAI", SDTIntShiftOp>; def SDTX86CmpPTest : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisVec<1>, @@ -96,6 +102,17 @@ def SDTX86CmpPTest : SDTypeProfile<1, 2, 
[SDTCisVT<0, i32>, def X86ptest : SDNode<"X86ISD::PTEST", SDTX86CmpPTest>; def X86testp : SDNode<"X86ISD::TESTP", SDTX86CmpPTest>; +def X86vpcom : SDNode<"X86ISD::VPCOM", + SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisSameAs<0,2>, SDTCisVT<3, i8>]>>; +def X86vpcomu : SDNode<"X86ISD::VPCOMU", + SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisSameAs<0,2>, SDTCisVT<3, i8>]>>; + +def X86pmuludq : SDNode<"X86ISD::PMULUDQ", + SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, + SDTCisSameAs<1,2>]>>; + // Specific shuffle nodes - At some point ISD::VECTOR_SHUFFLE will always get // translated into one of the target nodes below during lowering. // Note: this is a work in progress... @@ -109,6 +126,8 @@ def SDTShuff3OpI : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>, SDTCisSameAs<0,2>, SDTCisInt<3>]>; def SDTVBroadcast : SDTypeProfile<1, 1, [SDTCisVec<0>]>; +def SDTBlend : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>, +SDTCisSameAs<1,2>, SDTCisVT<3, i32>]>; def X86PAlign : SDNode<"X86ISD::PALIGN", SDTShuff3OpI>; @@ -116,8 +135,7 @@ def X86PShufd : SDNode<"X86ISD::PSHUFD", SDTShuff2OpI>; def X86PShufhw : SDNode<"X86ISD::PSHUFHW", SDTShuff2OpI>; def X86PShuflw : SDNode<"X86ISD::PSHUFLW", SDTShuff2OpI>; -def X86Shufpd : SDNode<"X86ISD::SHUFPD", SDTShuff3OpI>; -def X86Shufps : SDNode<"X86ISD::SHUFPS", SDTShuff3OpI>; +def X86Shufp : SDNode<"X86ISD::SHUFP", SDTShuff3OpI>; def X86Movddup : SDNode<"X86ISD::MOVDDUP", SDTShuff1Op>; def X86Movshdup : SDNode<"X86ISD::MOVSHDUP", SDTShuff1Op>; @@ -129,40 +147,23 @@ def X86Movss : SDNode<"X86ISD::MOVSS", SDTShuff2Op>; def X86Movlhps : SDNode<"X86ISD::MOVLHPS", SDTShuff2Op>; def X86Movlhpd : SDNode<"X86ISD::MOVLHPD", SDTShuff2Op>; def X86Movhlps : SDNode<"X86ISD::MOVHLPS", SDTShuff2Op>; -def X86Movhlpd : SDNode<"X86ISD::MOVHLPD", SDTShuff2Op>; def X86Movlps : SDNode<"X86ISD::MOVLPS", SDTShuff2Op>; def X86Movlpd : SDNode<"X86ISD::MOVLPD", SDTShuff2Op>; -def X86Unpcklps : SDNode<"X86ISD::UNPCKLPS", SDTShuff2Op>; -def X86Unpcklpd : SDNode<"X86ISD::UNPCKLPD", SDTShuff2Op>; -def X86Unpcklpsy : SDNode<"X86ISD::VUNPCKLPSY", SDTShuff2Op>; -def X86Unpcklpdy : SDNode<"X86ISD::VUNPCKLPDY", SDTShuff2Op>; - -def X86Unpckhps : SDNode<"X86ISD::UNPCKHPS", SDTShuff2Op>; -def X86Unpckhpd : SDNode<"X86ISD::UNPCKHPD", SDTShuff2Op>; -def X86Unpckhpsy : SDNode<"X86ISD::VUNPCKHPSY", SDTShuff2Op>; -def X86Unpckhpdy : SDNode<"X86ISD::VUNPCKHPDY", SDTShuff2Op>; - -def X86Punpcklbw : SDNode<"X86ISD::PUNPCKLBW", SDTShuff2Op>; -def X86Punpcklwd : SDNode<"X86ISD::PUNPCKLWD", SDTShuff2Op>; -def X86Punpckldq : SDNode<"X86ISD::PUNPCKLDQ", SDTShuff2Op>; -def X86Punpcklqdq : SDNode<"X86ISD::PUNPCKLQDQ", SDTShuff2Op>; - -def X86Punpckhbw : SDNode<"X86ISD::PUNPCKHBW", SDTShuff2Op>; -def X86Punpckhwd : SDNode<"X86ISD::PUNPCKHWD", SDTShuff2Op>; -def X86Punpckhdq : SDNode<"X86ISD::PUNPCKHDQ", SDTShuff2Op>; -def X86Punpckhqdq : SDNode<"X86ISD::PUNPCKHQDQ", SDTShuff2Op>; +def X86Unpckl : SDNode<"X86ISD::UNPCKL", SDTShuff2Op>; +def X86Unpckh : SDNode<"X86ISD::UNPCKH", SDTShuff2Op>; -def X86VPermilps : SDNode<"X86ISD::VPERMILPS", SDTShuff2OpI>; -def X86VPermilpsy : SDNode<"X86ISD::VPERMILPSY", SDTShuff2OpI>; -def X86VPermilpd : SDNode<"X86ISD::VPERMILPD", SDTShuff2OpI>; -def X86VPermilpdy : SDNode<"X86ISD::VPERMILPDY", SDTShuff2OpI>; +def X86VPermilp : SDNode<"X86ISD::VPERMILP", SDTShuff2OpI>; -def X86VPerm2f128 : SDNode<"X86ISD::VPERM2F128", SDTShuff3OpI>; +def X86VPerm2x128 : SDNode<"X86ISD::VPERM2X128", SDTShuff3OpI>; def X86VBroadcast : 
SDNode<"X86ISD::VBROADCAST", SDTVBroadcast>; +def X86Blendpw : SDNode<"X86ISD::BLENDPW", SDTBlend>; +def X86Blendps : SDNode<"X86ISD::BLENDPS", SDTBlend>; +def X86Blendpd : SDNode<"X86ISD::BLENDPD", SDTBlend>; + //===----------------------------------------------------------------------===// // SSE Complex Patterns //===----------------------------------------------------------------------===// @@ -195,15 +196,15 @@ def sdmem : Operand<v2f64> { //===----------------------------------------------------------------------===// // 128-bit load pattern fragments +// NOTE: all 128-bit integer vector loads are promoted to v2i64 def loadv4f32 : PatFrag<(ops node:$ptr), (v4f32 (load node:$ptr))>; def loadv2f64 : PatFrag<(ops node:$ptr), (v2f64 (load node:$ptr))>; -def loadv4i32 : PatFrag<(ops node:$ptr), (v4i32 (load node:$ptr))>; def loadv2i64 : PatFrag<(ops node:$ptr), (v2i64 (load node:$ptr))>; // 256-bit load pattern fragments +// NOTE: all 256-bit integer vector loads are promoted to v4i64 def loadv8f32 : PatFrag<(ops node:$ptr), (v8f32 (load node:$ptr))>; def loadv4f64 : PatFrag<(ops node:$ptr), (v4f64 (load node:$ptr))>; -def loadv8i32 : PatFrag<(ops node:$ptr), (v8i32 (load node:$ptr))>; def loadv4i64 : PatFrag<(ops node:$ptr), (v4i64 (load node:$ptr))>; // Like 'store', but always requires 128-bit vector alignment. @@ -223,6 +224,11 @@ def alignedload : PatFrag<(ops node:$ptr), (load node:$ptr), [{ return cast<LoadSDNode>(N)->getAlignment() >= 16; }]>; +// Like 'X86vzload', but always requires 128-bit vector alignment. +def alignedX86vzload : PatFrag<(ops node:$ptr), (X86vzload node:$ptr), [{ + return cast<MemSDNode>(N)->getAlignment() >= 16; +}]>; + // Like 'load', but always requires 256-bit vector alignment. def alignedload256 : PatFrag<(ops node:$ptr), (load node:$ptr), [{ return cast<LoadSDNode>(N)->getAlignment() >= 32; @@ -234,22 +240,20 @@ def alignedloadfsf64 : PatFrag<(ops node:$ptr), (f64 (alignedload node:$ptr))>; // 128-bit aligned load pattern fragments +// NOTE: all 128-bit integer vector loads are promoted to v2i64 def alignedloadv4f32 : PatFrag<(ops node:$ptr), (v4f32 (alignedload node:$ptr))>; def alignedloadv2f64 : PatFrag<(ops node:$ptr), (v2f64 (alignedload node:$ptr))>; -def alignedloadv4i32 : PatFrag<(ops node:$ptr), - (v4i32 (alignedload node:$ptr))>; def alignedloadv2i64 : PatFrag<(ops node:$ptr), (v2i64 (alignedload node:$ptr))>; // 256-bit aligned load pattern fragments +// NOTE: all 256-bit integer vector loads are promoted to v4i64 def alignedloadv8f32 : PatFrag<(ops node:$ptr), (v8f32 (alignedload256 node:$ptr))>; def alignedloadv4f64 : PatFrag<(ops node:$ptr), (v4f64 (alignedload256 node:$ptr))>; -def alignedloadv8i32 : PatFrag<(ops node:$ptr), - (v8i32 (alignedload256 node:$ptr))>; def alignedloadv4i64 : PatFrag<(ops node:$ptr), (v4i64 (alignedload256 node:$ptr))>; @@ -268,19 +272,16 @@ def memopfsf32 : PatFrag<(ops node:$ptr), (f32 (memop node:$ptr))>; def memopfsf64 : PatFrag<(ops node:$ptr), (f64 (memop node:$ptr))>; // 128-bit memop pattern fragments +// NOTE: all 128-bit integer vector loads are promoted to v2i64 def memopv4f32 : PatFrag<(ops node:$ptr), (v4f32 (memop node:$ptr))>; def memopv2f64 : PatFrag<(ops node:$ptr), (v2f64 (memop node:$ptr))>; -def memopv4i32 : PatFrag<(ops node:$ptr), (v4i32 (memop node:$ptr))>; def memopv2i64 : PatFrag<(ops node:$ptr), (v2i64 (memop node:$ptr))>; -def memopv8i16 : PatFrag<(ops node:$ptr), (v8i16 (memop node:$ptr))>; -def memopv16i8 : PatFrag<(ops node:$ptr), (v16i8 (memop node:$ptr))>; // 256-bit memop pattern 
fragments -def memopv32i8 : PatFrag<(ops node:$ptr), (v32i8 (memop node:$ptr))>; +// NOTE: all 256-bit integer vector loads are promoted to v4i64 def memopv8f32 : PatFrag<(ops node:$ptr), (v8f32 (memop node:$ptr))>; def memopv4f64 : PatFrag<(ops node:$ptr), (v4f64 (memop node:$ptr))>; def memopv4i64 : PatFrag<(ops node:$ptr), (v4i64 (memop node:$ptr))>; -def memopv8i32 : PatFrag<(ops node:$ptr), (v8i32 (memop node:$ptr))>; // SSSE3 uses MMX registers for some instructions. They aren't aligned on a // 16-byte boundary. @@ -326,6 +327,8 @@ def bc_v4i32 : PatFrag<(ops node:$in), (v4i32 (bitconvert node:$in))>; def bc_v2i64 : PatFrag<(ops node:$in), (v2i64 (bitconvert node:$in))>; // 256-bit bitconvert pattern fragments +def bc_v32i8 : PatFrag<(ops node:$in), (v32i8 (bitconvert node:$in))>; +def bc_v16i16 : PatFrag<(ops node:$in), (v16i16 (bitconvert node:$in))>; def bc_v8i32 : PatFrag<(ops node:$in), (v8i32 (bitconvert node:$in))>; def bc_v4i64 : PatFrag<(ops node:$in), (v4i64 (bitconvert node:$in))>; @@ -350,30 +353,6 @@ def BYTE_imm : SDNodeXForm<imm, [{ return getI32Imm(N->getZExtValue() >> 3); }]>; -// SHUFFLE_get_shuf_imm xform function: convert vector_shuffle mask to PSHUF*, -// SHUFP* etc. imm. -def SHUFFLE_get_shuf_imm : SDNodeXForm<vector_shuffle, [{ - return getI8Imm(X86::getShuffleSHUFImmediate(N)); -}]>; - -// SHUFFLE_get_pshufhw_imm xform function: convert vector_shuffle mask to -// PSHUFHW imm. -def SHUFFLE_get_pshufhw_imm : SDNodeXForm<vector_shuffle, [{ - return getI8Imm(X86::getShufflePSHUFHWImmediate(N)); -}]>; - -// SHUFFLE_get_pshuflw_imm xform function: convert vector_shuffle mask to -// PSHUFLW imm. -def SHUFFLE_get_pshuflw_imm : SDNodeXForm<vector_shuffle, [{ - return getI8Imm(X86::getShufflePSHUFLWImmediate(N)); -}]>; - -// SHUFFLE_get_palign_imm xform function: convert vector_shuffle mask to -// a PALIGNR imm. -def SHUFFLE_get_palign_imm : SDNodeXForm<vector_shuffle, [{ - return getI8Imm(X86::getShufflePALIGNRImmediate(N)); -}]>; - // EXTRACT_get_vextractf128_imm xform function: convert extract_subvector index // to VEXTRACTF128 imm. 
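Two of the helpers that survive this cleanup are worth spelling out: BYTE_imm turns a bit count into a byte count by shifting right by three, and the alignedload/alignedload256 (and alignedX86vzload) fragments accept a load only when its alignment is at least the vector width in bytes. A minimal standalone restatement:

#include <cstdint>

// BYTE_imm: getI32Imm(N->getZExtValue() >> 3) -- bits to bytes.
inline uint32_t byteImmFromBits(uint64_t Bits) {
  return static_cast<uint32_t>(Bits >> 3);
}

// alignedload (>= 16 bytes) and alignedload256 (>= 32 bytes): the required
// alignment is the vector width in bytes.
inline bool isAlignedVectorLoad(uint64_t LoadAlign, unsigned VectorBytes) {
  return LoadAlign >= VectorBytes;
}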
def EXTRACT_get_vextractf128_imm : SDNodeXForm<extract_subvector, [{ @@ -386,72 +365,6 @@ def INSERT_get_vinsertf128_imm : SDNodeXForm<insert_subvector, [{ return getI8Imm(X86::getInsertVINSERTF128Immediate(N)); }]>; -def splat_lo : PatFrag<(ops node:$lhs, node:$rhs), - (vector_shuffle node:$lhs, node:$rhs), [{ - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); - return SVOp->isSplat() && SVOp->getSplatIndex() == 0; -}]>; - -def movddup : PatFrag<(ops node:$lhs, node:$rhs), - (vector_shuffle node:$lhs, node:$rhs), [{ - return X86::isMOVDDUPMask(cast<ShuffleVectorSDNode>(N)); -}]>; - -def movhlps : PatFrag<(ops node:$lhs, node:$rhs), - (vector_shuffle node:$lhs, node:$rhs), [{ - return X86::isMOVHLPSMask(cast<ShuffleVectorSDNode>(N)); -}]>; - -def movhlps_undef : PatFrag<(ops node:$lhs, node:$rhs), - (vector_shuffle node:$lhs, node:$rhs), [{ - return X86::isMOVHLPS_v_undef_Mask(cast<ShuffleVectorSDNode>(N)); -}]>; - -def movlhps : PatFrag<(ops node:$lhs, node:$rhs), - (vector_shuffle node:$lhs, node:$rhs), [{ - return X86::isMOVLHPSMask(cast<ShuffleVectorSDNode>(N)); -}]>; - -def movlp : PatFrag<(ops node:$lhs, node:$rhs), - (vector_shuffle node:$lhs, node:$rhs), [{ - return X86::isMOVLPMask(cast<ShuffleVectorSDNode>(N)); -}]>; - -def movl : PatFrag<(ops node:$lhs, node:$rhs), - (vector_shuffle node:$lhs, node:$rhs), [{ - return X86::isMOVLMask(cast<ShuffleVectorSDNode>(N)); -}]>; - -def unpckl : PatFrag<(ops node:$lhs, node:$rhs), - (vector_shuffle node:$lhs, node:$rhs), [{ - return X86::isUNPCKLMask(cast<ShuffleVectorSDNode>(N)); -}]>; - -def unpckh : PatFrag<(ops node:$lhs, node:$rhs), - (vector_shuffle node:$lhs, node:$rhs), [{ - return X86::isUNPCKHMask(cast<ShuffleVectorSDNode>(N)); -}]>; - -def pshufd : PatFrag<(ops node:$lhs, node:$rhs), - (vector_shuffle node:$lhs, node:$rhs), [{ - return X86::isPSHUFDMask(cast<ShuffleVectorSDNode>(N)); -}], SHUFFLE_get_shuf_imm>; - -def shufp : PatFrag<(ops node:$lhs, node:$rhs), - (vector_shuffle node:$lhs, node:$rhs), [{ - return X86::isSHUFPMask(cast<ShuffleVectorSDNode>(N)); -}], SHUFFLE_get_shuf_imm>; - -def pshufhw : PatFrag<(ops node:$lhs, node:$rhs), - (vector_shuffle node:$lhs, node:$rhs), [{ - return X86::isPSHUFHWMask(cast<ShuffleVectorSDNode>(N)); -}], SHUFFLE_get_pshufhw_imm>; - -def pshuflw : PatFrag<(ops node:$lhs, node:$rhs), - (vector_shuffle node:$lhs, node:$rhs), [{ - return X86::isPSHUFLWMask(cast<ShuffleVectorSDNode>(N)); -}], SHUFFLE_get_pshuflw_imm>; - def vextractf128_extract : PatFrag<(ops node:$bigvec, node:$index), (extract_subvector node:$bigvec, node:$index), [{ diff --git a/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp b/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp index 3a02de0..307c96b 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -1,4 +1,4 @@ -//===- X86InstrInfo.cpp - X86 Instruction Information -----------*- C++ -*-===// +//===-- X86InstrInfo.cpp - X86 Instruction Information --------------------===// // // The LLVM Compiler Infrastructure // @@ -25,14 +25,13 @@ #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/LiveVariables.h" -#include "llvm/CodeGen/PseudoSourceValue.h" +#include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCInst.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetOptions.h" -#include "llvm/MC/MCAsmInfo.h" #include <limits> #define 
GET_INSTRINFO_CTOR @@ -83,6 +82,12 @@ enum { TB_FOLDED_STORE = 1 << 19 }; +struct X86OpTblEntry { + uint16_t RegOp; + uint16_t MemOp; + uint32_t Flags; +}; + X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) : X86GenInstrInfo((tm.getSubtarget<X86Subtarget>().is64Bit() ? X86::ADJCALLSTACKDOWN64 @@ -92,7 +97,7 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) : X86::ADJCALLSTACKUP32)), TM(tm), RI(tm, *this) { - static const unsigned OpTbl2Addr[][3] = { + static const X86OpTblEntry OpTbl2Addr[] = { { X86::ADC32ri, X86::ADC32mi, 0 }, { X86::ADC32ri8, X86::ADC32mi8, 0 }, { X86::ADC32rr, X86::ADC32mr, 0 }, @@ -260,22 +265,21 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) }; for (unsigned i = 0, e = array_lengthof(OpTbl2Addr); i != e; ++i) { - unsigned RegOp = OpTbl2Addr[i][0]; - unsigned MemOp = OpTbl2Addr[i][1]; - unsigned Flags = OpTbl2Addr[i][2]; + unsigned RegOp = OpTbl2Addr[i].RegOp; + unsigned MemOp = OpTbl2Addr[i].MemOp; + unsigned Flags = OpTbl2Addr[i].Flags; AddTableEntry(RegOp2MemOpTable2Addr, MemOp2RegOpTable, RegOp, MemOp, // Index 0, folded load and store, no alignment requirement. Flags | TB_INDEX_0 | TB_FOLDED_LOAD | TB_FOLDED_STORE); } - static const unsigned OpTbl0[][3] = { + static const X86OpTblEntry OpTbl0[] = { { X86::BT16ri8, X86::BT16mi8, TB_FOLDED_LOAD }, { X86::BT32ri8, X86::BT32mi8, TB_FOLDED_LOAD }, { X86::BT64ri8, X86::BT64mi8, TB_FOLDED_LOAD }, { X86::CALL32r, X86::CALL32m, TB_FOLDED_LOAD }, { X86::CALL64r, X86::CALL64m, TB_FOLDED_LOAD }, - { X86::WINCALL64r, X86::WINCALL64m, TB_FOLDED_LOAD }, { X86::CMP16ri, X86::CMP16mi, TB_FOLDED_LOAD }, { X86::CMP16ri8, X86::CMP16mi8, TB_FOLDED_LOAD }, { X86::CMP16rr, X86::CMP16mr, TB_FOLDED_LOAD }, @@ -352,6 +356,7 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) { X86::VEXTRACTPSrr,X86::VEXTRACTPSmr, TB_FOLDED_STORE | TB_ALIGN_16 }, { X86::FsVMOVAPDrr, X86::VMOVSDmr, TB_FOLDED_STORE | TB_NO_REVERSE }, { X86::FsVMOVAPSrr, X86::VMOVSSmr, TB_FOLDED_STORE | TB_NO_REVERSE }, + { X86::VEXTRACTF128rr, X86::VEXTRACTF128mr, TB_FOLDED_STORE | TB_ALIGN_16 }, { X86::VMOVAPDrr, X86::VMOVAPDmr, TB_FOLDED_STORE | TB_ALIGN_16 }, { X86::VMOVAPSrr, X86::VMOVAPSmr, TB_FOLDED_STORE | TB_ALIGN_16 }, { X86::VMOVDQArr, X86::VMOVDQAmr, TB_FOLDED_STORE | TB_ALIGN_16 }, @@ -362,6 +367,7 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) { X86::VMOVUPDrr, X86::VMOVUPDmr, TB_FOLDED_STORE }, { X86::VMOVUPSrr, X86::VMOVUPSmr, TB_FOLDED_STORE }, // AVX 256-bit foldable instructions + { X86::VEXTRACTI128rr, X86::VEXTRACTI128mr, TB_FOLDED_STORE | TB_ALIGN_16 }, { X86::VMOVAPDYrr, X86::VMOVAPDYmr, TB_FOLDED_STORE | TB_ALIGN_32 }, { X86::VMOVAPSYrr, X86::VMOVAPSYmr, TB_FOLDED_STORE | TB_ALIGN_32 }, { X86::VMOVDQAYrr, X86::VMOVDQAYmr, TB_FOLDED_STORE | TB_ALIGN_32 }, @@ -370,14 +376,14 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) }; for (unsigned i = 0, e = array_lengthof(OpTbl0); i != e; ++i) { - unsigned RegOp = OpTbl0[i][0]; - unsigned MemOp = OpTbl0[i][1]; - unsigned Flags = OpTbl0[i][2]; + unsigned RegOp = OpTbl0[i].RegOp; + unsigned MemOp = OpTbl0[i].MemOp; + unsigned Flags = OpTbl0[i].Flags; AddTableEntry(RegOp2MemOpTable0, MemOp2RegOpTable, RegOp, MemOp, TB_INDEX_0 | Flags); } - static const unsigned OpTbl1[][3] = { + static const X86OpTblEntry OpTbl1[] = { { X86::CMP16rr, X86::CMP16rm, 0 }, { X86::CMP32rr, X86::CMP32rm, 0 }, { X86::CMP64rr, X86::CMP64rm, 0 }, @@ -456,6 +462,9 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) { X86::MOVZX64rr16, X86::MOVZX64rm16, 0 }, { X86::MOVZX64rr32, X86::MOVZX64rm32, 0 }, { X86::MOVZX64rr8, 
X86::MOVZX64rm8, 0 }, + { X86::PABSBrr128, X86::PABSBrm128, TB_ALIGN_16 }, + { X86::PABSDrr128, X86::PABSDrm128, TB_ALIGN_16 }, + { X86::PABSWrr128, X86::PABSWrm128, TB_ALIGN_16 }, { X86::PSHUFDri, X86::PSHUFDmi, TB_ALIGN_16 }, { X86::PSHUFHWri, X86::PSHUFHWmi, TB_ALIGN_16 }, { X86::PSHUFLWri, X86::PSHUFLWmi, TB_ALIGN_16 }, @@ -508,6 +517,11 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) { X86::VMOVZDI2PDIrr, X86::VMOVZDI2PDIrm, 0 }, { X86::VMOVZQI2PQIrr, X86::VMOVZQI2PQIrm, 0 }, { X86::VMOVZPQILo2PQIrr,X86::VMOVZPQILo2PQIrm, TB_ALIGN_16 }, + { X86::VPABSBrr128, X86::VPABSBrm128, TB_ALIGN_16 }, + { X86::VPABSDrr128, X86::VPABSDrm128, TB_ALIGN_16 }, + { X86::VPABSWrr128, X86::VPABSWrm128, TB_ALIGN_16 }, + { X86::VPERMILPDri, X86::VPERMILPDmi, TB_ALIGN_16 }, + { X86::VPERMILPSri, X86::VPERMILPSmi, TB_ALIGN_16 }, { X86::VPSHUFDri, X86::VPSHUFDmi, TB_ALIGN_16 }, { X86::VPSHUFHWri, X86::VPSHUFHWmi, TB_ALIGN_16 }, { X86::VPSHUFLWri, X86::VPSHUFLWmi, TB_ALIGN_16 }, @@ -524,22 +538,39 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) // AVX 256-bit foldable instructions { X86::VMOVAPDYrr, X86::VMOVAPDYrm, TB_ALIGN_32 }, { X86::VMOVAPSYrr, X86::VMOVAPSYrm, TB_ALIGN_32 }, - { X86::VMOVDQAYrr, X86::VMOVDQAYrm, TB_ALIGN_16 }, + { X86::VMOVDQAYrr, X86::VMOVDQAYrm, TB_ALIGN_32 }, { X86::VMOVUPDYrr, X86::VMOVUPDYrm, 0 }, - { X86::VMOVUPSYrr, X86::VMOVUPSYrm, 0 } + { X86::VMOVUPSYrr, X86::VMOVUPSYrm, 0 }, + { X86::VPERMILPDYri, X86::VPERMILPDYmi, TB_ALIGN_32 }, + { X86::VPERMILPSYri, X86::VPERMILPSYmi, TB_ALIGN_32 }, + // AVX2 foldable instructions + { X86::VPABSBrr256, X86::VPABSBrm256, TB_ALIGN_32 }, + { X86::VPABSDrr256, X86::VPABSDrm256, TB_ALIGN_32 }, + { X86::VPABSWrr256, X86::VPABSWrm256, TB_ALIGN_32 }, + { X86::VPSHUFDYri, X86::VPSHUFDYmi, TB_ALIGN_32 }, + { X86::VPSHUFHWYri, X86::VPSHUFHWYmi, TB_ALIGN_32 }, + { X86::VPSHUFLWYri, X86::VPSHUFLWYmi, TB_ALIGN_32 }, + { X86::VRCPPSYr, X86::VRCPPSYm, TB_ALIGN_32 }, + { X86::VRCPPSYr_Int, X86::VRCPPSYm_Int, TB_ALIGN_32 }, + { X86::VRSQRTPSYr, X86::VRSQRTPSYm, TB_ALIGN_32 }, + { X86::VRSQRTPSYr_Int, X86::VRSQRTPSYm_Int, TB_ALIGN_32 }, + { X86::VSQRTPDYr, X86::VSQRTPDYm, TB_ALIGN_32 }, + { X86::VSQRTPDYr_Int, X86::VSQRTPDYm_Int, TB_ALIGN_32 }, + { X86::VSQRTPSYr, X86::VSQRTPSYm, TB_ALIGN_32 }, + { X86::VSQRTPSYr_Int, X86::VSQRTPSYm_Int, TB_ALIGN_32 }, }; for (unsigned i = 0, e = array_lengthof(OpTbl1); i != e; ++i) { - unsigned RegOp = OpTbl1[i][0]; - unsigned MemOp = OpTbl1[i][1]; - unsigned Flags = OpTbl1[i][2]; + unsigned RegOp = OpTbl1[i].RegOp; + unsigned MemOp = OpTbl1[i].MemOp; + unsigned Flags = OpTbl1[i].Flags; AddTableEntry(RegOp2MemOpTable1, MemOp2RegOpTable, RegOp, MemOp, // Index 1, folded load Flags | TB_INDEX_1 | TB_FOLDED_LOAD); } - static const unsigned OpTbl2[][3] = { + static const X86OpTblEntry OpTbl2[] = { { X86::ADC32rr, X86::ADC32rm, 0 }, { X86::ADC64rr, X86::ADC64rm, 0 }, { X86::ADD16rr, X86::ADD16rm, 0 }, @@ -563,6 +594,10 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) { X86::ANDNPSrr, X86::ANDNPSrm, TB_ALIGN_16 }, { X86::ANDPDrr, X86::ANDPDrm, TB_ALIGN_16 }, { X86::ANDPSrr, X86::ANDPSrm, TB_ALIGN_16 }, + { X86::BLENDPDrri, X86::BLENDPDrmi, TB_ALIGN_16 }, + { X86::BLENDPSrri, X86::BLENDPSrmi, TB_ALIGN_16 }, + { X86::BLENDVPDrr0, X86::BLENDVPDrm0, TB_ALIGN_16 }, + { X86::BLENDVPSrr0, X86::BLENDVPSrm0, TB_ALIGN_16 }, { X86::CMOVA16rr, X86::CMOVA16rm, 0 }, { X86::CMOVA32rr, X86::CMOVA32rm, 0 }, { X86::CMOVA64rr, X86::CMOVA64rm, 0 }, @@ -652,6 +687,7 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) { X86::MINSDrr_Int, 
X86::MINSDrm_Int, 0 }, { X86::MINSSrr, X86::MINSSrm, 0 }, { X86::MINSSrr_Int, X86::MINSSrm_Int, 0 }, + { X86::MPSADBWrri, X86::MPSADBWrmi, TB_ALIGN_16 }, { X86::MULPDrr, X86::MULPDrm, TB_ALIGN_16 }, { X86::MULPSrr, X86::MULPSrm, TB_ALIGN_16 }, { X86::MULSDrr, X86::MULSDrm, 0 }, @@ -664,30 +700,45 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) { X86::ORPSrr, X86::ORPSrm, TB_ALIGN_16 }, { X86::PACKSSDWrr, X86::PACKSSDWrm, TB_ALIGN_16 }, { X86::PACKSSWBrr, X86::PACKSSWBrm, TB_ALIGN_16 }, + { X86::PACKUSDWrr, X86::PACKUSDWrm, TB_ALIGN_16 }, { X86::PACKUSWBrr, X86::PACKUSWBrm, TB_ALIGN_16 }, { X86::PADDBrr, X86::PADDBrm, TB_ALIGN_16 }, { X86::PADDDrr, X86::PADDDrm, TB_ALIGN_16 }, { X86::PADDQrr, X86::PADDQrm, TB_ALIGN_16 }, { X86::PADDSBrr, X86::PADDSBrm, TB_ALIGN_16 }, { X86::PADDSWrr, X86::PADDSWrm, TB_ALIGN_16 }, + { X86::PADDUSBrr, X86::PADDUSBrm, TB_ALIGN_16 }, + { X86::PADDUSWrr, X86::PADDUSWrm, TB_ALIGN_16 }, { X86::PADDWrr, X86::PADDWrm, TB_ALIGN_16 }, + { X86::PALIGNR128rr, X86::PALIGNR128rm, TB_ALIGN_16 }, { X86::PANDNrr, X86::PANDNrm, TB_ALIGN_16 }, { X86::PANDrr, X86::PANDrm, TB_ALIGN_16 }, { X86::PAVGBrr, X86::PAVGBrm, TB_ALIGN_16 }, { X86::PAVGWrr, X86::PAVGWrm, TB_ALIGN_16 }, + { X86::PBLENDWrri, X86::PBLENDWrmi, TB_ALIGN_16 }, { X86::PCMPEQBrr, X86::PCMPEQBrm, TB_ALIGN_16 }, { X86::PCMPEQDrr, X86::PCMPEQDrm, TB_ALIGN_16 }, + { X86::PCMPEQQrr, X86::PCMPEQQrm, TB_ALIGN_16 }, { X86::PCMPEQWrr, X86::PCMPEQWrm, TB_ALIGN_16 }, { X86::PCMPGTBrr, X86::PCMPGTBrm, TB_ALIGN_16 }, { X86::PCMPGTDrr, X86::PCMPGTDrm, TB_ALIGN_16 }, + { X86::PCMPGTQrr, X86::PCMPGTQrm, TB_ALIGN_16 }, { X86::PCMPGTWrr, X86::PCMPGTWrm, TB_ALIGN_16 }, + { X86::PHADDDrr, X86::PHADDDrm, TB_ALIGN_16 }, + { X86::PHADDWrr, X86::PHADDWrm, TB_ALIGN_16 }, + { X86::PHADDSWrr128, X86::PHADDSWrm128, TB_ALIGN_16 }, + { X86::PHSUBDrr, X86::PHSUBDrm, TB_ALIGN_16 }, + { X86::PHSUBSWrr128, X86::PHSUBSWrm128, TB_ALIGN_16 }, + { X86::PHSUBWrr, X86::PHSUBWrm, TB_ALIGN_16 }, { X86::PINSRWrri, X86::PINSRWrmi, TB_ALIGN_16 }, + { X86::PMADDUBSWrr128, X86::PMADDUBSWrm128, TB_ALIGN_16 }, { X86::PMADDWDrr, X86::PMADDWDrm, TB_ALIGN_16 }, { X86::PMAXSWrr, X86::PMAXSWrm, TB_ALIGN_16 }, { X86::PMAXUBrr, X86::PMAXUBrm, TB_ALIGN_16 }, { X86::PMINSWrr, X86::PMINSWrm, TB_ALIGN_16 }, { X86::PMINUBrr, X86::PMINUBrm, TB_ALIGN_16 }, { X86::PMULDQrr, X86::PMULDQrm, TB_ALIGN_16 }, + { X86::PMULHRSWrr128, X86::PMULHRSWrm128, TB_ALIGN_16 }, { X86::PMULHUWrr, X86::PMULHUWrm, TB_ALIGN_16 }, { X86::PMULHWrr, X86::PMULHWrm, TB_ALIGN_16 }, { X86::PMULLDrr, X86::PMULLDrm, TB_ALIGN_16 }, @@ -695,6 +746,10 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) { X86::PMULUDQrr, X86::PMULUDQrm, TB_ALIGN_16 }, { X86::PORrr, X86::PORrm, TB_ALIGN_16 }, { X86::PSADBWrr, X86::PSADBWrm, TB_ALIGN_16 }, + { X86::PSHUFBrr, X86::PSHUFBrm, TB_ALIGN_16 }, + { X86::PSIGNBrr, X86::PSIGNBrm, TB_ALIGN_16 }, + { X86::PSIGNWrr, X86::PSIGNWrm, TB_ALIGN_16 }, + { X86::PSIGNDrr, X86::PSIGNDrm, TB_ALIGN_16 }, { X86::PSLLDrr, X86::PSLLDrm, TB_ALIGN_16 }, { X86::PSLLQrr, X86::PSLLQrm, TB_ALIGN_16 }, { X86::PSLLWrr, X86::PSLLWrm, TB_ALIGN_16 }, @@ -778,6 +833,10 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) { X86::VANDNPSrr, X86::VANDNPSrm, TB_ALIGN_16 }, { X86::VANDPDrr, X86::VANDPDrm, TB_ALIGN_16 }, { X86::VANDPSrr, X86::VANDPSrm, TB_ALIGN_16 }, + { X86::VBLENDPDrri, X86::VBLENDPDrmi, TB_ALIGN_16 }, + { X86::VBLENDPSrri, X86::VBLENDPSrmi, TB_ALIGN_16 }, + { X86::VBLENDVPDrr, X86::VBLENDVPDrm, TB_ALIGN_16 }, + { X86::VBLENDVPSrr, X86::VBLENDVPSrm, TB_ALIGN_16 }, { X86::VCMPPDrri, 
X86::VCMPPDrmi, TB_ALIGN_16 }, { X86::VCMPPSrri, X86::VCMPPSrmi, TB_ALIGN_16 }, { X86::VCMPSDrr, X86::VCMPSDrm, 0 }, @@ -816,6 +875,7 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) { X86::VMINSDrr_Int, X86::VMINSDrm_Int, 0 }, { X86::VMINSSrr, X86::VMINSSrm, 0 }, { X86::VMINSSrr_Int, X86::VMINSSrm_Int, 0 }, + { X86::VMPSADBWrri, X86::VMPSADBWrmi, TB_ALIGN_16 }, { X86::VMULPDrr, X86::VMULPDrm, TB_ALIGN_16 }, { X86::VMULPSrr, X86::VMULPSrm, TB_ALIGN_16 }, { X86::VMULSDrr, X86::VMULSDrm, 0 }, @@ -824,28 +884,47 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) { X86::VORPSrr, X86::VORPSrm, TB_ALIGN_16 }, { X86::VPACKSSDWrr, X86::VPACKSSDWrm, TB_ALIGN_16 }, { X86::VPACKSSWBrr, X86::VPACKSSWBrm, TB_ALIGN_16 }, + { X86::VPACKUSDWrr, X86::VPACKUSDWrm, TB_ALIGN_16 }, { X86::VPACKUSWBrr, X86::VPACKUSWBrm, TB_ALIGN_16 }, { X86::VPADDBrr, X86::VPADDBrm, TB_ALIGN_16 }, { X86::VPADDDrr, X86::VPADDDrm, TB_ALIGN_16 }, { X86::VPADDQrr, X86::VPADDQrm, TB_ALIGN_16 }, { X86::VPADDSBrr, X86::VPADDSBrm, TB_ALIGN_16 }, { X86::VPADDSWrr, X86::VPADDSWrm, TB_ALIGN_16 }, + { X86::VPADDUSBrr, X86::VPADDUSBrm, TB_ALIGN_16 }, + { X86::VPADDUSWrr, X86::VPADDUSWrm, TB_ALIGN_16 }, { X86::VPADDWrr, X86::VPADDWrm, TB_ALIGN_16 }, + { X86::VPALIGNR128rr, X86::VPALIGNR128rm, TB_ALIGN_16 }, { X86::VPANDNrr, X86::VPANDNrm, TB_ALIGN_16 }, { X86::VPANDrr, X86::VPANDrm, TB_ALIGN_16 }, + { X86::VPAVGBrr, X86::VPAVGBrm, TB_ALIGN_16 }, + { X86::VPAVGWrr, X86::VPAVGWrm, TB_ALIGN_16 }, + { X86::VPBLENDWrri, X86::VPBLENDWrmi, TB_ALIGN_16 }, { X86::VPCMPEQBrr, X86::VPCMPEQBrm, TB_ALIGN_16 }, { X86::VPCMPEQDrr, X86::VPCMPEQDrm, TB_ALIGN_16 }, + { X86::VPCMPEQQrr, X86::VPCMPEQQrm, TB_ALIGN_16 }, { X86::VPCMPEQWrr, X86::VPCMPEQWrm, TB_ALIGN_16 }, { X86::VPCMPGTBrr, X86::VPCMPGTBrm, TB_ALIGN_16 }, { X86::VPCMPGTDrr, X86::VPCMPGTDrm, TB_ALIGN_16 }, + { X86::VPCMPGTQrr, X86::VPCMPGTQrm, TB_ALIGN_16 }, { X86::VPCMPGTWrr, X86::VPCMPGTWrm, TB_ALIGN_16 }, + { X86::VPHADDDrr, X86::VPHADDDrm, TB_ALIGN_16 }, + { X86::VPHADDSWrr128, X86::VPHADDSWrm128, TB_ALIGN_16 }, + { X86::VPHADDWrr, X86::VPHADDWrm, TB_ALIGN_16 }, + { X86::VPHSUBDrr, X86::VPHSUBDrm, TB_ALIGN_16 }, + { X86::VPHSUBSWrr128, X86::VPHSUBSWrm128, TB_ALIGN_16 }, + { X86::VPHSUBWrr, X86::VPHSUBWrm, TB_ALIGN_16 }, + { X86::VPERMILPDrr, X86::VPERMILPDrm, TB_ALIGN_16 }, + { X86::VPERMILPSrr, X86::VPERMILPSrm, TB_ALIGN_16 }, { X86::VPINSRWrri, X86::VPINSRWrmi, TB_ALIGN_16 }, + { X86::VPMADDUBSWrr128, X86::VPMADDUBSWrm128, TB_ALIGN_16 }, { X86::VPMADDWDrr, X86::VPMADDWDrm, TB_ALIGN_16 }, { X86::VPMAXSWrr, X86::VPMAXSWrm, TB_ALIGN_16 }, { X86::VPMAXUBrr, X86::VPMAXUBrm, TB_ALIGN_16 }, { X86::VPMINSWrr, X86::VPMINSWrm, TB_ALIGN_16 }, { X86::VPMINUBrr, X86::VPMINUBrm, TB_ALIGN_16 }, { X86::VPMULDQrr, X86::VPMULDQrm, TB_ALIGN_16 }, + { X86::VPMULHRSWrr128, X86::VPMULHRSWrm128, TB_ALIGN_16 }, { X86::VPMULHUWrr, X86::VPMULHUWrm, TB_ALIGN_16 }, { X86::VPMULHWrr, X86::VPMULHWrm, TB_ALIGN_16 }, { X86::VPMULLDrr, X86::VPMULLDrm, TB_ALIGN_16 }, @@ -853,6 +932,10 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) { X86::VPMULUDQrr, X86::VPMULUDQrm, TB_ALIGN_16 }, { X86::VPORrr, X86::VPORrm, TB_ALIGN_16 }, { X86::VPSADBWrr, X86::VPSADBWrm, TB_ALIGN_16 }, + { X86::VPSHUFBrr, X86::VPSHUFBrm, TB_ALIGN_16 }, + { X86::VPSIGNBrr, X86::VPSIGNBrm, TB_ALIGN_16 }, + { X86::VPSIGNWrr, X86::VPSIGNWrm, TB_ALIGN_16 }, + { X86::VPSIGNDrr, X86::VPSIGNDrm, TB_ALIGN_16 }, { X86::VPSLLDrr, X86::VPSLLDrm, TB_ALIGN_16 }, { X86::VPSLLQrr, X86::VPSLLQrm, TB_ALIGN_16 }, { X86::VPSLLWrr, X86::VPSLLWrm, TB_ALIGN_16 }, @@ 
-886,14 +969,154 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) { X86::VUNPCKLPDrr, X86::VUNPCKLPDrm, TB_ALIGN_16 }, { X86::VUNPCKLPSrr, X86::VUNPCKLPSrm, TB_ALIGN_16 }, { X86::VXORPDrr, X86::VXORPDrm, TB_ALIGN_16 }, - { X86::VXORPSrr, X86::VXORPSrm, TB_ALIGN_16 } + { X86::VXORPSrr, X86::VXORPSrm, TB_ALIGN_16 }, + // AVX 256-bit foldable instructions + { X86::VADDPDYrr, X86::VADDPDYrm, TB_ALIGN_32 }, + { X86::VADDPSYrr, X86::VADDPSYrm, TB_ALIGN_32 }, + { X86::VADDSUBPDYrr, X86::VADDSUBPDYrm, TB_ALIGN_32 }, + { X86::VADDSUBPSYrr, X86::VADDSUBPSYrm, TB_ALIGN_32 }, + { X86::VANDNPDYrr, X86::VANDNPDYrm, TB_ALIGN_32 }, + { X86::VANDNPSYrr, X86::VANDNPSYrm, TB_ALIGN_32 }, + { X86::VANDPDYrr, X86::VANDPDYrm, TB_ALIGN_32 }, + { X86::VANDPSYrr, X86::VANDPSYrm, TB_ALIGN_32 }, + { X86::VBLENDPDYrri, X86::VBLENDPDYrmi, TB_ALIGN_32 }, + { X86::VBLENDPSYrri, X86::VBLENDPSYrmi, TB_ALIGN_32 }, + { X86::VBLENDVPDYrr, X86::VBLENDVPDYrm, TB_ALIGN_32 }, + { X86::VBLENDVPSYrr, X86::VBLENDVPSYrm, TB_ALIGN_32 }, + { X86::VCMPPDYrri, X86::VCMPPDYrmi, TB_ALIGN_32 }, + { X86::VCMPPSYrri, X86::VCMPPSYrmi, TB_ALIGN_32 }, + { X86::VDIVPDYrr, X86::VDIVPDYrm, TB_ALIGN_32 }, + { X86::VDIVPSYrr, X86::VDIVPSYrm, TB_ALIGN_32 }, + { X86::VHADDPDYrr, X86::VHADDPDYrm, TB_ALIGN_32 }, + { X86::VHADDPSYrr, X86::VHADDPSYrm, TB_ALIGN_32 }, + { X86::VHSUBPDYrr, X86::VHSUBPDYrm, TB_ALIGN_32 }, + { X86::VHSUBPSYrr, X86::VHSUBPSYrm, TB_ALIGN_32 }, + { X86::VINSERTF128rr, X86::VINSERTF128rm, TB_ALIGN_32 }, + { X86::VMAXPDYrr, X86::VMAXPDYrm, TB_ALIGN_32 }, + { X86::VMAXPDYrr_Int, X86::VMAXPDYrm_Int, TB_ALIGN_32 }, + { X86::VMAXPSYrr, X86::VMAXPSYrm, TB_ALIGN_32 }, + { X86::VMAXPSYrr_Int, X86::VMAXPSYrm_Int, TB_ALIGN_32 }, + { X86::VMINPDYrr, X86::VMINPDYrm, TB_ALIGN_32 }, + { X86::VMINPDYrr_Int, X86::VMINPDYrm_Int, TB_ALIGN_32 }, + { X86::VMINPSYrr, X86::VMINPSYrm, TB_ALIGN_32 }, + { X86::VMINPSYrr_Int, X86::VMINPSYrm_Int, TB_ALIGN_32 }, + { X86::VMULPDYrr, X86::VMULPDYrm, TB_ALIGN_32 }, + { X86::VMULPSYrr, X86::VMULPSYrm, TB_ALIGN_32 }, + { X86::VORPDYrr, X86::VORPDYrm, TB_ALIGN_32 }, + { X86::VORPSYrr, X86::VORPSYrm, TB_ALIGN_32 }, + { X86::VPERM2F128rr, X86::VPERM2F128rm, TB_ALIGN_32 }, + { X86::VPERMILPDYrr, X86::VPERMILPDYrm, TB_ALIGN_32 }, + { X86::VPERMILPSYrr, X86::VPERMILPSYrm, TB_ALIGN_32 }, + { X86::VSHUFPDYrri, X86::VSHUFPDYrmi, TB_ALIGN_32 }, + { X86::VSHUFPSYrri, X86::VSHUFPSYrmi, TB_ALIGN_32 }, + { X86::VSUBPDYrr, X86::VSUBPDYrm, TB_ALIGN_32 }, + { X86::VSUBPSYrr, X86::VSUBPSYrm, TB_ALIGN_32 }, + { X86::VUNPCKHPDYrr, X86::VUNPCKHPDYrm, TB_ALIGN_32 }, + { X86::VUNPCKHPSYrr, X86::VUNPCKHPSYrm, TB_ALIGN_32 }, + { X86::VUNPCKLPDYrr, X86::VUNPCKLPDYrm, TB_ALIGN_32 }, + { X86::VUNPCKLPSYrr, X86::VUNPCKLPSYrm, TB_ALIGN_32 }, + { X86::VXORPDYrr, X86::VXORPDYrm, TB_ALIGN_32 }, + { X86::VXORPSYrr, X86::VXORPSYrm, TB_ALIGN_32 }, + // AVX2 foldable instructions + { X86::VINSERTI128rr, X86::VINSERTI128rm, TB_ALIGN_16 }, + { X86::VPACKSSDWYrr, X86::VPACKSSDWYrm, TB_ALIGN_32 }, + { X86::VPACKSSWBYrr, X86::VPACKSSWBYrm, TB_ALIGN_32 }, + { X86::VPACKUSDWYrr, X86::VPACKUSDWYrm, TB_ALIGN_32 }, + { X86::VPACKUSWBYrr, X86::VPACKUSWBYrm, TB_ALIGN_32 }, + { X86::VPADDBYrr, X86::VPADDBYrm, TB_ALIGN_32 }, + { X86::VPADDDYrr, X86::VPADDDYrm, TB_ALIGN_32 }, + { X86::VPADDQYrr, X86::VPADDQYrm, TB_ALIGN_32 }, + { X86::VPADDSBYrr, X86::VPADDSBYrm, TB_ALIGN_32 }, + { X86::VPADDSWYrr, X86::VPADDSWYrm, TB_ALIGN_32 }, + { X86::VPADDUSBYrr, X86::VPADDUSBYrm, TB_ALIGN_32 }, + { X86::VPADDUSWYrr, X86::VPADDUSWYrm, TB_ALIGN_32 }, + { X86::VPADDWYrr, 
X86::VPADDWYrm, TB_ALIGN_32 }, + { X86::VPALIGNR256rr, X86::VPALIGNR256rm, TB_ALIGN_32 }, + { X86::VPANDNYrr, X86::VPANDNYrm, TB_ALIGN_32 }, + { X86::VPANDYrr, X86::VPANDYrm, TB_ALIGN_32 }, + { X86::VPAVGBYrr, X86::VPAVGBYrm, TB_ALIGN_32 }, + { X86::VPAVGWYrr, X86::VPAVGWYrm, TB_ALIGN_32 }, + { X86::VPBLENDDrri, X86::VPBLENDDrmi, TB_ALIGN_32 }, + { X86::VPBLENDDYrri, X86::VPBLENDDYrmi, TB_ALIGN_32 }, + { X86::VPBLENDWYrri, X86::VPBLENDWYrmi, TB_ALIGN_32 }, + { X86::VPCMPEQBYrr, X86::VPCMPEQBYrm, TB_ALIGN_32 }, + { X86::VPCMPEQDYrr, X86::VPCMPEQDYrm, TB_ALIGN_32 }, + { X86::VPCMPEQQYrr, X86::VPCMPEQQYrm, TB_ALIGN_32 }, + { X86::VPCMPEQWYrr, X86::VPCMPEQWYrm, TB_ALIGN_32 }, + { X86::VPCMPGTBYrr, X86::VPCMPGTBYrm, TB_ALIGN_32 }, + { X86::VPCMPGTDYrr, X86::VPCMPGTDYrm, TB_ALIGN_32 }, + { X86::VPCMPGTQYrr, X86::VPCMPGTQYrm, TB_ALIGN_32 }, + { X86::VPCMPGTWYrr, X86::VPCMPGTWYrm, TB_ALIGN_32 }, + { X86::VPERM2I128rr, X86::VPERM2I128rm, TB_ALIGN_32 }, + { X86::VPERMDYrr, X86::VPERMDYrm, TB_ALIGN_32 }, + { X86::VPERMPDYrr, X86::VPERMPDYrm, TB_ALIGN_32 }, + { X86::VPERMPSYrr, X86::VPERMPSYrm, TB_ALIGN_32 }, + { X86::VPERMQYrr, X86::VPERMQYrm, TB_ALIGN_32 }, + { X86::VPHADDDYrr, X86::VPHADDDYrm, TB_ALIGN_32 }, + { X86::VPHADDSWrr256, X86::VPHADDSWrm256, TB_ALIGN_32 }, + { X86::VPHADDWYrr, X86::VPHADDWYrm, TB_ALIGN_32 }, + { X86::VPHSUBDYrr, X86::VPHSUBDYrm, TB_ALIGN_32 }, + { X86::VPHSUBSWrr256, X86::VPHSUBSWrm256, TB_ALIGN_32 }, + { X86::VPHSUBWYrr, X86::VPHSUBWYrm, TB_ALIGN_32 }, + { X86::VPMADDUBSWrr256, X86::VPMADDUBSWrm256, TB_ALIGN_32 }, + { X86::VPMADDWDYrr, X86::VPMADDWDYrm, TB_ALIGN_32 }, + { X86::VPMAXSWYrr, X86::VPMAXSWYrm, TB_ALIGN_32 }, + { X86::VPMAXUBYrr, X86::VPMAXUBYrm, TB_ALIGN_32 }, + { X86::VPMINSWYrr, X86::VPMINSWYrm, TB_ALIGN_32 }, + { X86::VPMINUBYrr, X86::VPMINUBYrm, TB_ALIGN_32 }, + { X86::VMPSADBWYrri, X86::VMPSADBWYrmi, TB_ALIGN_32 }, + { X86::VPMULDQYrr, X86::VPMULDQYrm, TB_ALIGN_32 }, + { X86::VPMULHRSWrr256, X86::VPMULHRSWrm256, TB_ALIGN_32 }, + { X86::VPMULHUWYrr, X86::VPMULHUWYrm, TB_ALIGN_32 }, + { X86::VPMULHWYrr, X86::VPMULHWYrm, TB_ALIGN_32 }, + { X86::VPMULLDYrr, X86::VPMULLDYrm, TB_ALIGN_32 }, + { X86::VPMULLWYrr, X86::VPMULLWYrm, TB_ALIGN_32 }, + { X86::VPMULUDQYrr, X86::VPMULUDQYrm, TB_ALIGN_32 }, + { X86::VPORYrr, X86::VPORYrm, TB_ALIGN_32 }, + { X86::VPSADBWYrr, X86::VPSADBWYrm, TB_ALIGN_32 }, + { X86::VPSHUFBYrr, X86::VPSHUFBYrm, TB_ALIGN_32 }, + { X86::VPSIGNBYrr, X86::VPSIGNBYrm, TB_ALIGN_32 }, + { X86::VPSIGNWYrr, X86::VPSIGNWYrm, TB_ALIGN_32 }, + { X86::VPSIGNDYrr, X86::VPSIGNDYrm, TB_ALIGN_32 }, + { X86::VPSLLDYrr, X86::VPSLLDYrm, TB_ALIGN_16 }, + { X86::VPSLLQYrr, X86::VPSLLQYrm, TB_ALIGN_16 }, + { X86::VPSLLWYrr, X86::VPSLLWYrm, TB_ALIGN_16 }, + { X86::VPSLLVDrr, X86::VPSLLVDrm, TB_ALIGN_16 }, + { X86::VPSLLVDYrr, X86::VPSLLVDYrm, TB_ALIGN_32 }, + { X86::VPSLLVQrr, X86::VPSLLVQrm, TB_ALIGN_16 }, + { X86::VPSLLVQYrr, X86::VPSLLVQYrm, TB_ALIGN_32 }, + { X86::VPSRADYrr, X86::VPSRADYrm, TB_ALIGN_16 }, + { X86::VPSRAWYrr, X86::VPSRAWYrm, TB_ALIGN_16 }, + { X86::VPSRAVDrr, X86::VPSRAVDrm, TB_ALIGN_16 }, + { X86::VPSRAVDYrr, X86::VPSRAVDYrm, TB_ALIGN_32 }, + { X86::VPSRLDYrr, X86::VPSRLDYrm, TB_ALIGN_16 }, + { X86::VPSRLQYrr, X86::VPSRLQYrm, TB_ALIGN_16 }, + { X86::VPSRLWYrr, X86::VPSRLWYrm, TB_ALIGN_16 }, + { X86::VPSRLVDrr, X86::VPSRLVDrm, TB_ALIGN_16 }, + { X86::VPSRLVDYrr, X86::VPSRLVDYrm, TB_ALIGN_32 }, + { X86::VPSRLVQrr, X86::VPSRLVQrm, TB_ALIGN_16 }, + { X86::VPSRLVQYrr, X86::VPSRLVQYrm, TB_ALIGN_32 }, + { X86::VPSUBBYrr, X86::VPSUBBYrm, TB_ALIGN_32 
}, + { X86::VPSUBDYrr, X86::VPSUBDYrm, TB_ALIGN_32 }, + { X86::VPSUBSBYrr, X86::VPSUBSBYrm, TB_ALIGN_32 }, + { X86::VPSUBSWYrr, X86::VPSUBSWYrm, TB_ALIGN_32 }, + { X86::VPSUBWYrr, X86::VPSUBWYrm, TB_ALIGN_32 }, + { X86::VPUNPCKHBWYrr, X86::VPUNPCKHBWYrm, TB_ALIGN_32 }, + { X86::VPUNPCKHDQYrr, X86::VPUNPCKHDQYrm, TB_ALIGN_32 }, + { X86::VPUNPCKHQDQYrr, X86::VPUNPCKHQDQYrm, TB_ALIGN_16 }, + { X86::VPUNPCKHWDYrr, X86::VPUNPCKHWDYrm, TB_ALIGN_32 }, + { X86::VPUNPCKLBWYrr, X86::VPUNPCKLBWYrm, TB_ALIGN_32 }, + { X86::VPUNPCKLDQYrr, X86::VPUNPCKLDQYrm, TB_ALIGN_32 }, + { X86::VPUNPCKLQDQYrr, X86::VPUNPCKLQDQYrm, TB_ALIGN_32 }, + { X86::VPUNPCKLWDYrr, X86::VPUNPCKLWDYrm, TB_ALIGN_32 }, + { X86::VPXORYrr, X86::VPXORYrm, TB_ALIGN_32 }, // FIXME: add AVX 256-bit foldable instructions }; for (unsigned i = 0, e = array_lengthof(OpTbl2); i != e; ++i) { - unsigned RegOp = OpTbl2[i][0]; - unsigned MemOp = OpTbl2[i][1]; - unsigned Flags = OpTbl2[i][2]; + unsigned RegOp = OpTbl2[i].RegOp; + unsigned MemOp = OpTbl2[i].MemOp; + unsigned Flags = OpTbl2[i].Flags; AddTableEntry(RegOp2MemOpTable2, MemOp2RegOpTable, RegOp, MemOp, // Index 2, folded load @@ -946,7 +1169,6 @@ X86InstrInfo::isCoalescableExtInstr(const MachineInstr &MI, switch (MI.getOpcode()) { default: llvm_unreachable(0); - break; case X86::MOVSX16rr8: case X86::MOVZX16rr8: case X86::MOVSX32rr8: @@ -989,7 +1211,8 @@ bool X86InstrInfo::isFrameOperand(const MachineInstr *MI, unsigned int Op, static bool isFrameLoadOpcode(int Opcode) { switch (Opcode) { - default: break; + default: + return false; case X86::MOV8rm: case X86::MOV16rm: case X86::MOV32rm: @@ -1011,9 +1234,7 @@ static bool isFrameLoadOpcode(int Opcode) { case X86::MMX_MOVD64rm: case X86::MMX_MOVQ64rm: return true; - break; } - return false; } static bool isFrameStoreOpcode(int Opcode) { @@ -1203,6 +1424,8 @@ static bool isSafeToClobberEFLAGS(MachineBasicBlock &MBB, bool SeenDef = false; for (unsigned j = 0, e = Iter->getNumOperands(); j != e; ++j) { MachineOperand &MO = Iter->getOperand(j); + if (MO.isRegMask() && MO.clobbersPhysReg(X86::EFLAGS)) + SeenDef = true; if (!MO.isReg()) continue; if (MO.getReg() == X86::EFLAGS) { @@ -1247,6 +1470,10 @@ static bool isSafeToClobberEFLAGS(MachineBasicBlock &MBB, bool SawKill = false; for (unsigned j = 0, e = Iter->getNumOperands(); j != e; ++j) { MachineOperand &MO = Iter->getOperand(j); + // A register mask may clobber EFLAGS, but we should still look for a + // live EFLAGS def. + if (MO.isRegMask() && MO.clobbersPhysReg(X86::EFLAGS)) + SawKill = true; if (MO.isReg() && MO.getReg() == X86::EFLAGS) { if (MO.isDef()) return MO.isDead(); if (MO.isKill()) SawKill = true; @@ -1357,7 +1584,6 @@ X86InstrInfo::convertToThreeAddressWithLEA(unsigned MIOpc, switch (MIOpc) { default: llvm_unreachable(0); - break; case X86::SHL16ri: { unsigned ShAmt = MI->getOperand(2).getImm(); MIB.addReg(0).addImm(1 << ShAmt) @@ -1392,9 +1618,9 @@ X86InstrInfo::convertToThreeAddressWithLEA(unsigned MIOpc, leaInReg2 = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass); // Build and insert into an implicit UNDEF value. This is OK because // well be shifting and then extracting the lower 16-bits. 
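The constructor changes above replace the old unsigned[][3] folding tables with arrays of the new X86OpTblEntry struct and read the named RegOp/MemOp/Flags fields when populating the maps, OR-ing in a per-table mask such as TB_INDEX_1 | TB_FOLDED_LOAD. A sketch of that pattern follows; the map type, opcode values, and flag constants here are stand-ins, not LLVM's:

#include <cstdint>
#include <unordered_map>
#include <utility>

// Same shape as the X86OpTblEntry struct introduced above.
struct OpTblEntry {
  uint16_t RegOp;
  uint16_t MemOp;
  uint32_t Flags;
};

using FoldMap = std::unordered_map<uint16_t, std::pair<uint16_t, uint32_t>>;

// One entry per table row, with the per-table flags OR'ed in -- the same
// loop structure used for OpTbl0, OpTbl1, OpTbl2 and OpTbl2Addr.
void addTableEntries(FoldMap &RegToMem, const OpTblEntry *Tbl,
                     unsigned NumEntries, uint32_t ExtraFlags) {
  for (unsigned i = 0; i != NumEntries; ++i)
    RegToMem[Tbl[i].RegOp] = std::make_pair(Tbl[i].MemOp,
                                            Tbl[i].Flags | ExtraFlags);
}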
- BuildMI(*MFI, MIB, MI->getDebugLoc(), get(X86::IMPLICIT_DEF), leaInReg2); + BuildMI(*MFI, &*MIB, MI->getDebugLoc(), get(X86::IMPLICIT_DEF),leaInReg2); InsMI2 = - BuildMI(*MFI, MIB, MI->getDebugLoc(), get(TargetOpcode::COPY)) + BuildMI(*MFI, &*MIB, MI->getDebugLoc(), get(TargetOpcode::COPY)) .addReg(leaInReg2, RegState::Define, X86::sub_16bit) .addReg(Src2, getKillRegState(isKill2)); addRegReg(MIB, leaInReg, true, leaInReg2, true); @@ -1469,6 +1695,24 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, .addReg(B, getKillRegState(isKill)).addImm(M); break; } + case X86::SHUFPDrri: { + assert(MI->getNumOperands() == 4 && "Unknown shufpd instruction!"); + if (!TM.getSubtarget<X86Subtarget>().hasSSE2()) return 0; + + unsigned B = MI->getOperand(1).getReg(); + unsigned C = MI->getOperand(2).getReg(); + if (B != C) return 0; + unsigned A = MI->getOperand(0).getReg(); + unsigned M = MI->getOperand(3).getImm(); + + // Convert to PSHUFD mask. + M = ((M & 1) << 1) | ((M & 1) << 3) | ((M & 2) << 4) | ((M & 2) << 6)| 0x44; + + NewMI = BuildMI(MF, MI->getDebugLoc(), get(X86::PSHUFDri)) + .addReg(A, RegState::Define | getDeadRegState(isDead)) + .addReg(B, getKillRegState(isKill)).addImm(M); + break; + } case X86::SHL64ri: { assert(MI->getNumOperands() >= 3 && "Unknown shift instruction!"); // NOTE: LEA doesn't produce flags like shift does, but LLVM never uses @@ -1597,7 +1841,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, case X86::ADD32rr_DB: { assert(MI->getNumOperands() >= 3 && "Unknown add instruction!"); unsigned Opc; - TargetRegisterClass *RC; + const TargetRegisterClass *RC; if (MIOpc == X86::ADD64rr || MIOpc == X86::ADD64rr_DB) { Opc = X86::LEA64r; RC = X86::GR64_NOSPRegisterClass; @@ -1904,13 +2148,12 @@ X86::CondCode X86::GetOppositeBranchCondition(X86::CondCode CC) { } bool X86InstrInfo::isUnpredicatedTerminator(const MachineInstr *MI) const { - const MCInstrDesc &MCID = MI->getDesc(); - if (!MCID.isTerminator()) return false; + if (!MI->isTerminator()) return false; // Conditional branch is a special case. - if (MCID.isBranch() && !MCID.isBarrier()) + if (MI->isBranch() && !MI->isBarrier()) return true; - if (!MCID.isPredicable()) + if (!MI->isPredicable()) return true; return !isPredicated(MI); } @@ -1936,7 +2179,7 @@ bool X86InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, // A terminator that isn't a branch can't easily be handled by this // analysis. - if (!I->getDesc().isBranch()) + if (!I->isBranch()) return true; // Handle unconditional branches. @@ -2420,7 +2663,9 @@ bool X86InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { bool HasAVX = TM.getSubtarget<X86Subtarget>().hasAVX(); switch (MI->getOpcode()) { case X86::V_SET0: - return Expand2AddrUndef(MI, get(HasAVX ? X86::VPXORrr : X86::PXORrr)); + case X86::FsFLD0SS: + case X86::FsFLD0SD: + return Expand2AddrUndef(MI, get(HasAVX ? 
X86::VXORPSrr : X86::XORPSrr)); case X86::TEST8ri_NOREX: MI->setDesc(get(X86::TEST8ri)); return true; @@ -2624,6 +2869,10 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, /// static bool hasPartialRegUpdate(unsigned Opcode) { switch (Opcode) { + case X86::CVTSI2SSrr: + case X86::CVTSI2SS64rr: + case X86::CVTSI2SDrr: + case X86::CVTSI2SD64rr: case X86::CVTSD2SSrr: case X86::Int_CVTSD2SSrr: case X86::CVTSS2SDrr: @@ -2631,7 +2880,9 @@ static bool hasPartialRegUpdate(unsigned Opcode) { case X86::RCPSSr: case X86::RCPSSr_Int: case X86::ROUNDSDr: + case X86::ROUNDSDr_Int: case X86::ROUNDSSr: + case X86::ROUNDSSr_Int: case X86::RSQRTSSr: case X86::RSQRTSSr_Int: case X86::SQRTSSr: @@ -2643,7 +2894,9 @@ static bool hasPartialRegUpdate(unsigned Opcode) { case X86::Int_VCVTSS2SDrr: case X86::VRCPSSr: case X86::VROUNDSDr: + case X86::VROUNDSDr_Int: case X86::VROUNDSSr: + case X86::VROUNDSSr_Int: case X86::VRSQRTSSr: case X86::VSQRTSSr: return true; @@ -2652,6 +2905,54 @@ static bool hasPartialRegUpdate(unsigned Opcode) { return false; } +/// getPartialRegUpdateClearance - Inform the ExeDepsFix pass how many idle +/// instructions we would like before a partial register update. +unsigned X86InstrInfo:: +getPartialRegUpdateClearance(const MachineInstr *MI, unsigned OpNum, + const TargetRegisterInfo *TRI) const { + if (OpNum != 0 || !hasPartialRegUpdate(MI->getOpcode())) + return 0; + + // If MI is marked as reading Reg, the partial register update is wanted. + const MachineOperand &MO = MI->getOperand(0); + unsigned Reg = MO.getReg(); + if (TargetRegisterInfo::isVirtualRegister(Reg)) { + if (MO.readsReg() || MI->readsVirtualRegister(Reg)) + return 0; + } else { + if (MI->readsRegister(Reg, TRI)) + return 0; + } + + // If any of the preceding 16 instructions are reading Reg, insert a + // dependency breaking instruction. The magic number is based on a few + // Nehalem experiments. + return 16; +} + +void X86InstrInfo:: +breakPartialRegDependency(MachineBasicBlock::iterator MI, unsigned OpNum, + const TargetRegisterInfo *TRI) const { + unsigned Reg = MI->getOperand(OpNum).getReg(); + if (X86::VR128RegClass.contains(Reg)) { + // These instructions are all floating point domain, so xorps is the best + // choice. + bool HasAVX = TM.getSubtarget<X86Subtarget>().hasAVX(); + unsigned Opc = HasAVX ? X86::VXORPSrr : X86::XORPSrr; + BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), get(Opc), Reg) + .addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef); + } else if (X86::VR256RegClass.contains(Reg)) { + // Use vxorps to clear the full ymm register. + // It wants to read and write the xmm sub-register. 
+ unsigned XReg = TRI->getSubReg(Reg, X86::sub_xmm); + BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), get(X86::VXORPSrr), XReg) + .addReg(XReg, RegState::Undef).addReg(XReg, RegState::Undef) + .addReg(Reg, RegState::ImplicitDefine); + } else + return; + MI->addRegisterKilled(Reg, TRI, true); +} + MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI, const SmallVectorImpl<unsigned> &Ops, @@ -2714,6 +3015,8 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, switch (LoadMI->getOpcode()) { case X86::AVX_SET0PSY: case X86::AVX_SET0PDY: + case X86::AVX2_SETALLONES: + case X86::AVX2_SET0: Alignment = 32; break; case X86::V_SET0: @@ -2722,11 +3025,9 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, Alignment = 16; break; case X86::FsFLD0SD: - case X86::VFsFLD0SD: Alignment = 8; break; case X86::FsFLD0SS: - case X86::VFsFLD0SS: Alignment = 4; break; default: @@ -2759,10 +3060,10 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, case X86::AVX_SET0PSY: case X86::AVX_SET0PDY: case X86::AVX_SETALLONES: + case X86::AVX2_SETALLONES: + case X86::AVX2_SET0: case X86::FsFLD0SD: - case X86::FsFLD0SS: - case X86::VFsFLD0SD: - case X86::VFsFLD0SS: { + case X86::FsFLD0SS: { // Folding a V_SET0 or V_SETALLONES as a load, to ease register pressure. // Create a constant-pool entry and operands to load from it. @@ -2788,16 +3089,19 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineConstantPool &MCP = *MF.getConstantPool(); Type *Ty; unsigned Opc = LoadMI->getOpcode(); - if (Opc == X86::FsFLD0SS || Opc == X86::VFsFLD0SS) + if (Opc == X86::FsFLD0SS) Ty = Type::getFloatTy(MF.getFunction()->getContext()); - else if (Opc == X86::FsFLD0SD || Opc == X86::VFsFLD0SD) + else if (Opc == X86::FsFLD0SD) Ty = Type::getDoubleTy(MF.getFunction()->getContext()); else if (Opc == X86::AVX_SET0PSY || Opc == X86::AVX_SET0PDY) Ty = VectorType::get(Type::getFloatTy(MF.getFunction()->getContext()), 8); + else if (Opc == X86::AVX2_SETALLONES || Opc == X86::AVX2_SET0) + Ty = VectorType::get(Type::getInt32Ty(MF.getFunction()->getContext()), 8); else Ty = VectorType::get(Type::getInt32Ty(MF.getFunction()->getContext()), 4); - bool IsAllOnes = (Opc == X86::V_SETALLONES || Opc == X86::AVX_SETALLONES); + bool IsAllOnes = (Opc == X86::V_SETALLONES || Opc == X86::AVX_SETALLONES || + Opc == X86::AVX2_SETALLONES); const Constant *C = IsAllOnes ? Constant::getAllOnesValue(Ty) : Constant::getNullValue(Ty); unsigned CPI = MCP.getConstantPoolIndex(C, Alignment); @@ -3329,7 +3633,7 @@ unsigned X86InstrInfo::getGlobalBaseReg(MachineFunction *MF) const { // These are the replaceable SSE instructions. Some of these have Int variants // that we don't include here. We don't want to replace instructions selected // by intrinsics. 
-static const unsigned ReplaceableInstrs[][3] = { +static const uint16_t ReplaceableInstrs[][3] = { //PackedSingle PackedDouble PackedInt { X86::MOVAPSmr, X86::MOVAPDmr, X86::MOVDQAmr }, { X86::MOVAPSrm, X86::MOVAPDrm, X86::MOVDQArm }, @@ -3366,31 +3670,66 @@ static const unsigned ReplaceableInstrs[][3] = { { X86::VMOVAPSYrr, X86::VMOVAPDYrr, X86::VMOVDQAYrr }, { X86::VMOVUPSYmr, X86::VMOVUPDYmr, X86::VMOVDQUYmr }, { X86::VMOVUPSYrm, X86::VMOVUPDYrm, X86::VMOVDQUYrm }, - { X86::VMOVNTPSYmr, X86::VMOVNTPDYmr, X86::VMOVNTDQYmr }, + { X86::VMOVNTPSYmr, X86::VMOVNTPDYmr, X86::VMOVNTDQYmr } +}; + +static const uint16_t ReplaceableInstrsAVX2[][3] = { + //PackedSingle PackedDouble PackedInt + { X86::VANDNPSYrm, X86::VANDNPDYrm, X86::VPANDNYrm }, + { X86::VANDNPSYrr, X86::VANDNPDYrr, X86::VPANDNYrr }, + { X86::VANDPSYrm, X86::VANDPDYrm, X86::VPANDYrm }, + { X86::VANDPSYrr, X86::VANDPDYrr, X86::VPANDYrr }, + { X86::VORPSYrm, X86::VORPDYrm, X86::VPORYrm }, + { X86::VORPSYrr, X86::VORPDYrr, X86::VPORYrr }, + { X86::VXORPSYrm, X86::VXORPDYrm, X86::VPXORYrm }, + { X86::VXORPSYrr, X86::VXORPDYrr, X86::VPXORYrr }, + { X86::VEXTRACTF128mr, X86::VEXTRACTF128mr, X86::VEXTRACTI128mr }, + { X86::VEXTRACTF128rr, X86::VEXTRACTF128rr, X86::VEXTRACTI128rr }, + { X86::VINSERTF128rm, X86::VINSERTF128rm, X86::VINSERTI128rm }, + { X86::VINSERTF128rr, X86::VINSERTF128rr, X86::VINSERTI128rr }, + { X86::VPERM2F128rm, X86::VPERM2F128rm, X86::VPERM2I128rm }, + { X86::VPERM2F128rr, X86::VPERM2F128rr, X86::VPERM2I128rr } }; // FIXME: Some shuffle and unpack instructions have equivalents in different // domains, but they require a bit more work than just switching opcodes. -static const unsigned *lookup(unsigned opcode, unsigned domain) { +static const uint16_t *lookup(unsigned opcode, unsigned domain) { for (unsigned i = 0, e = array_lengthof(ReplaceableInstrs); i != e; ++i) if (ReplaceableInstrs[i][domain-1] == opcode) return ReplaceableInstrs[i]; return 0; } +static const uint16_t *lookupAVX2(unsigned opcode, unsigned domain) { + for (unsigned i = 0, e = array_lengthof(ReplaceableInstrsAVX2); i != e; ++i) + if (ReplaceableInstrsAVX2[i][domain-1] == opcode) + return ReplaceableInstrsAVX2[i]; + return 0; +} + std::pair<uint16_t, uint16_t> X86InstrInfo::getExecutionDomain(const MachineInstr *MI) const { uint16_t domain = (MI->getDesc().TSFlags >> X86II::SSEDomainShift) & 3; - return std::make_pair(domain, - domain && lookup(MI->getOpcode(), domain) ? 0xe : 0); + bool hasAVX2 = TM.getSubtarget<X86Subtarget>().hasAVX2(); + uint16_t validDomains = 0; + if (domain && lookup(MI->getOpcode(), domain)) + validDomains = 0xe; + else if (domain && lookupAVX2(MI->getOpcode(), domain)) + validDomains = hasAVX2 ? 
0xe : 0x6; + return std::make_pair(domain, validDomains); } void X86InstrInfo::setExecutionDomain(MachineInstr *MI, unsigned Domain) const { assert(Domain>0 && Domain<4 && "Invalid execution domain"); uint16_t dom = (MI->getDesc().TSFlags >> X86II::SSEDomainShift) & 3; assert(dom && "Not an SSE instruction"); - const unsigned *table = lookup(MI->getOpcode(), dom); + const uint16_t *table = lookup(MI->getOpcode(), dom); + if (!table) { // try the other table + assert((TM.getSubtarget<X86Subtarget>().hasAVX2() || Domain < 3) && + "256-bit vector operations only available in AVX2"); + table = lookupAVX2(MI->getOpcode(), dom); + } assert(table && "Cannot change domain"); MI->setDesc(get(table[Domain-1])); } diff --git a/contrib/llvm/lib/Target/X86/X86InstrInfo.h b/contrib/llvm/lib/Target/X86/X86InstrInfo.h index 97009db..b23d756 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrInfo.h +++ b/contrib/llvm/lib/Target/X86/X86InstrInfo.h @@ -1,4 +1,4 @@ -//===- X86InstrInfo.h - X86 Instruction Information ------------*- C++ -*- ===// +//===-- X86InstrInfo.h - X86 Instruction Information ------------*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -14,10 +14,10 @@ #ifndef X86INSTRUCTIONINFO_H #define X86INSTRUCTIONINFO_H -#include "llvm/Target/TargetInstrInfo.h" #include "X86.h" #include "X86RegisterInfo.h" #include "llvm/ADT/DenseMap.h" +#include "llvm/Target/TargetInstrInfo.h" #define GET_INSTRINFO_HEADER #include "X86GenInstrInfo.inc" @@ -345,6 +345,11 @@ public: void setExecutionDomain(MachineInstr *MI, unsigned Domain) const; + unsigned getPartialRegUpdateClearance(const MachineInstr *MI, unsigned OpNum, + const TargetRegisterInfo *TRI) const; + void breakPartialRegDependency(MachineBasicBlock::iterator MI, unsigned OpNum, + const TargetRegisterInfo *TRI) const; + MachineInstr* foldMemoryOperandImpl(MachineFunction &MF, MachineInstr* MI, unsigned OpNum, diff --git a/contrib/llvm/lib/Target/X86/X86InstrInfo.td b/contrib/llvm/lib/Target/X86/X86InstrInfo.td index d54bf27..6a25312 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrInfo.td +++ b/contrib/llvm/lib/Target/X86/X86InstrInfo.td @@ -1,4 +1,4 @@ -//===- X86InstrInfo.td - Main X86 Instruction Definition ---*- tablegen -*-===// +//===-- X86InstrInfo.td - Main X86 Instruction Definition --*- tablegen -*-===// // // The LLVM Compiler Infrastructure // @@ -99,17 +99,16 @@ def SDT_X86TLSCALL : SDTypeProfile<0, 1, [SDTCisInt<0>]>; def SDT_X86SEG_ALLOCA : SDTypeProfile<1, 1, [SDTCisVT<0, iPTR>, SDTCisVT<1, iPTR>]>; +def SDT_X86WIN_FTOL : SDTypeProfile<0, 1, [SDTCisFP<0>]>; + def SDT_X86EHRET : SDTypeProfile<0, 1, [SDTCisInt<0>]>; def SDT_X86TCRET : SDTypeProfile<0, 2, [SDTCisPtrTy<0>, SDTCisVT<1, i32>]>; def SDT_X86MEMBARRIER : SDTypeProfile<0, 0, []>; -def SDT_X86MEMBARRIERNoSSE : SDTypeProfile<0, 1, [SDTCisInt<0>]>; def X86MemBarrier : SDNode<"X86ISD::MEMBARRIER", SDT_X86MEMBARRIER, [SDNPHasChain]>; -def X86MemBarrierNoSSE : SDNode<"X86ISD::MEMBARRIER", SDT_X86MEMBARRIERNoSSE, - [SDNPHasChain]>; def X86MFence : SDNode<"X86ISD::MFENCE", SDT_X86MEMBARRIER, [SDNPHasChain]>; def X86SFence : SDNode<"X86ISD::SFENCE", SDT_X86MEMBARRIER, @@ -226,6 +225,10 @@ def X86and_flag : SDNode<"X86ISD::AND", SDTBinaryArithWithFlags, [SDNPCommutative]>; def X86andn_flag : SDNode<"X86ISD::ANDN", SDTBinaryArithWithFlags>; +def X86blsi_flag : SDNode<"X86ISD::BLSI", SDTUnaryArithWithFlags>; +def X86blsmsk_flag : SDNode<"X86ISD::BLSMSK", SDTUnaryArithWithFlags>; +def X86blsr_flag : SDNode<"X86ISD::BLSR", SDTUnaryArithWithFlags>; + def X86mul_imm : 
SDNode<"X86ISD::MUL_IMM", SDTIntBinOp>; def X86WinAlloca : SDNode<"X86ISD::WIN_ALLOCA", SDTX86Void, @@ -237,6 +240,9 @@ def X86SegAlloca : SDNode<"X86ISD::SEG_ALLOCA", SDT_X86SEG_ALLOCA, def X86TLSCall : SDNode<"X86ISD::TLSCALL", SDT_X86TLSCALL, [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; +def X86WinFTOL : SDNode<"X86ISD::WIN_FTOL", SDT_X86WIN_FTOL, + [SDNPHasChain, SDNPOutGlue]>; + //===----------------------------------------------------------------------===// // X86 Operand Definitions. // @@ -247,10 +253,31 @@ def ptr_rc_nosp : PointerLikeRegClass<1>; // *mem - Operand definitions for the funky X86 addressing mode operands. // -def X86MemAsmOperand : AsmOperandClass { - let Name = "Mem"; - let SuperClasses = []; +def X86MemAsmOperand : AsmOperandClass { + let Name = "Mem"; let PredicateMethod = "isMem"; +} +def X86Mem8AsmOperand : AsmOperandClass { + let Name = "Mem8"; let PredicateMethod = "isMem8"; +} +def X86Mem16AsmOperand : AsmOperandClass { + let Name = "Mem16"; let PredicateMethod = "isMem16"; +} +def X86Mem32AsmOperand : AsmOperandClass { + let Name = "Mem32"; let PredicateMethod = "isMem32"; +} +def X86Mem64AsmOperand : AsmOperandClass { + let Name = "Mem64"; let PredicateMethod = "isMem64"; +} +def X86Mem80AsmOperand : AsmOperandClass { + let Name = "Mem80"; let PredicateMethod = "isMem80"; +} +def X86Mem128AsmOperand : AsmOperandClass { + let Name = "Mem128"; let PredicateMethod = "isMem128"; } +def X86Mem256AsmOperand : AsmOperandClass { + let Name = "Mem256"; let PredicateMethod = "isMem256"; +} + def X86AbsMemAsmOperand : AsmOperandClass { let Name = "AbsMem"; let SuperClasses = [X86MemAsmOperand]; @@ -267,17 +294,28 @@ def opaque48mem : X86MemOperand<"printopaquemem">; def opaque80mem : X86MemOperand<"printopaquemem">; def opaque512mem : X86MemOperand<"printopaquemem">; -def i8mem : X86MemOperand<"printi8mem">; -def i16mem : X86MemOperand<"printi16mem">; -def i32mem : X86MemOperand<"printi32mem">; -def i64mem : X86MemOperand<"printi64mem">; -def i128mem : X86MemOperand<"printi128mem">; -def i256mem : X86MemOperand<"printi256mem">; -def f32mem : X86MemOperand<"printf32mem">; -def f64mem : X86MemOperand<"printf64mem">; -def f80mem : X86MemOperand<"printf80mem">; -def f128mem : X86MemOperand<"printf128mem">; -def f256mem : X86MemOperand<"printf256mem">; +def i8mem : X86MemOperand<"printi8mem"> { + let ParserMatchClass = X86Mem8AsmOperand; } +def i16mem : X86MemOperand<"printi16mem"> { + let ParserMatchClass = X86Mem16AsmOperand; } +def i32mem : X86MemOperand<"printi32mem"> { + let ParserMatchClass = X86Mem32AsmOperand; } +def i64mem : X86MemOperand<"printi64mem"> { + let ParserMatchClass = X86Mem64AsmOperand; } +def i128mem : X86MemOperand<"printi128mem"> { + let ParserMatchClass = X86Mem128AsmOperand; } +def i256mem : X86MemOperand<"printi256mem"> { + let ParserMatchClass = X86Mem256AsmOperand; } +def f32mem : X86MemOperand<"printf32mem"> { + let ParserMatchClass = X86Mem32AsmOperand; } +def f64mem : X86MemOperand<"printf64mem"> { + let ParserMatchClass = X86Mem64AsmOperand; } +def f80mem : X86MemOperand<"printf80mem"> { + let ParserMatchClass = X86Mem80AsmOperand; } +def f128mem : X86MemOperand<"printf128mem"> { + let ParserMatchClass = X86Mem128AsmOperand; } +def f256mem : X86MemOperand<"printf256mem">{ + let ParserMatchClass = X86Mem256AsmOperand; } } // A version of i8mem for use on x86-64 that uses GR64_NOREX instead of @@ -285,7 +323,7 @@ def f256mem : X86MemOperand<"printf256mem">; def i8mem_NOREX : Operand<i64> { let PrintMethod = "printi8mem"; let MIOperandInfo = 
(ops GR64_NOREX, i8imm, GR64_NOREX_NOSP, i32imm, i8imm); - let ParserMatchClass = X86MemAsmOperand; + let ParserMatchClass = X86Mem8AsmOperand; let OperandType = "OPERAND_MEMORY"; } @@ -299,7 +337,7 @@ def ptr_rc_tailcall : PointerLikeRegClass<2>; def i32mem_TC : Operand<i32> { let PrintMethod = "printi32mem"; let MIOperandInfo = (ops GR32_TC, i8imm, GR32_TC, i32imm, i8imm); - let ParserMatchClass = X86MemAsmOperand; + let ParserMatchClass = X86Mem32AsmOperand; let OperandType = "OPERAND_MEMORY"; } @@ -310,7 +348,7 @@ def i64mem_TC : Operand<i64> { let PrintMethod = "printi64mem"; let MIOperandInfo = (ops ptr_rc_tailcall, i8imm, ptr_rc_tailcall, i32imm, i8imm); - let ParserMatchClass = X86MemAsmOperand; + let ParserMatchClass = X86Mem64AsmOperand; let OperandType = "OPERAND_MEMORY"; } @@ -336,6 +374,11 @@ def SSECC : Operand<i8> { let OperandType = "OPERAND_IMMEDIATE"; } +def AVXCC : Operand<i8> { + let PrintMethod = "printSSECC"; + let OperandType = "OPERAND_IMMEDIATE"; +} + class ImmSExtAsmOperandClass : AsmOperandClass { let SuperClasses = [ImmAsmOperand]; let RenderMethod = "addImmOperands"; @@ -466,37 +509,32 @@ def HasSSSE3 : Predicate<"Subtarget->hasSSSE3()">; def HasSSE41 : Predicate<"Subtarget->hasSSE41()">; def HasSSE42 : Predicate<"Subtarget->hasSSE42()">; def HasSSE4A : Predicate<"Subtarget->hasSSE4A()">; - def HasAVX : Predicate<"Subtarget->hasAVX()">; -def HasXMMInt : Predicate<"Subtarget->hasXMMInt()">; +def HasAVX2 : Predicate<"Subtarget->hasAVX2()">; def HasPOPCNT : Predicate<"Subtarget->hasPOPCNT()">; def HasAES : Predicate<"Subtarget->hasAES()">; def HasCLMUL : Predicate<"Subtarget->hasCLMUL()">; def HasFMA3 : Predicate<"Subtarget->hasFMA3()">; def HasFMA4 : Predicate<"Subtarget->hasFMA4()">; +def HasXOP : Predicate<"Subtarget->hasXOP()">; def HasMOVBE : Predicate<"Subtarget->hasMOVBE()">; def HasRDRAND : Predicate<"Subtarget->hasRDRAND()">; def HasF16C : Predicate<"Subtarget->hasF16C()">; +def HasFSGSBase : Predicate<"Subtarget->hasFSGSBase()">; def HasLZCNT : Predicate<"Subtarget->hasLZCNT()">; def HasBMI : Predicate<"Subtarget->hasBMI()">; -def FPStackf32 : Predicate<"!Subtarget->hasXMM()">; -def FPStackf64 : Predicate<"!Subtarget->hasXMMInt()">; +def HasBMI2 : Predicate<"Subtarget->hasBMI2()">; +def FPStackf32 : Predicate<"!Subtarget->hasSSE1()">; +def FPStackf64 : Predicate<"!Subtarget->hasSSE2()">; def HasCmpxchg16b: Predicate<"Subtarget->hasCmpxchg16b()">; def In32BitMode : Predicate<"!Subtarget->is64Bit()">, AssemblerPredicate<"!Mode64Bit">; def In64BitMode : Predicate<"Subtarget->is64Bit()">, AssemblerPredicate<"Mode64Bit">; def IsWin64 : Predicate<"Subtarget->isTargetWin64()">; -def NotWin64 : Predicate<"!Subtarget->isTargetWin64()">; -def IsNaCl : Predicate<"Subtarget->isTargetNaCl()">, - AssemblerPredicate<"ModeNaCl">; -def IsNaCl32 : Predicate<"Subtarget->isTargetNaCl32()">, - AssemblerPredicate<"ModeNaCl,!Mode64Bit">; -def IsNaCl64 : Predicate<"Subtarget->isTargetNaCl64()">, - AssemblerPredicate<"ModeNaCl,Mode64Bit">; -def NotNaCl : Predicate<"!Subtarget->isTargetNaCl()">, - AssemblerPredicate<"!ModeNaCl">; +def IsNaCl : Predicate<"Subtarget->isTargetNaCl()">; +def NotNaCl : Predicate<"!Subtarget->isTargetNaCl()">; def SmallCode : Predicate<"TM.getCodeModel() == CodeModel::Small">; def KernelCode : Predicate<"TM.getCodeModel() == CodeModel::Kernel">; def FarData : Predicate<"TM.getCodeModel() != CodeModel::Small &&" @@ -1375,7 +1413,7 @@ let Predicates = [HasLZCNT], Defs = [EFLAGS] in { } 
//===----------------------------------------------------------------------===// -// TZCNT Instruction +// BMI Instructions // let Predicates = [HasBMI], Defs = [EFLAGS] in { def TZCNT16rr : I<0xBC, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src), @@ -1405,6 +1443,83 @@ let Predicates = [HasBMI], Defs = [EFLAGS] in { (implicit EFLAGS)]>, XS; } +multiclass bmi_bls<string mnemonic, Format RegMRM, Format MemMRM, + RegisterClass RC, X86MemOperand x86memop, SDNode OpNode, + PatFrag ld_frag> { + def rr : I<0xF3, RegMRM, (outs RC:$dst), (ins RC:$src), + !strconcat(mnemonic, "\t{$src, $dst|$dst, $src}"), + [(set RC:$dst, EFLAGS, (OpNode RC:$src))]>, T8, VEX_4V; + def rm : I<0xF3, MemMRM, (outs RC:$dst), (ins x86memop:$src), + !strconcat(mnemonic, "\t{$src, $dst|$dst, $src}"), + [(set RC:$dst, EFLAGS, (OpNode (ld_frag addr:$src)))]>, + T8, VEX_4V; +} + +let Predicates = [HasBMI], Defs = [EFLAGS] in { + defm BLSR32 : bmi_bls<"blsr{l}", MRM1r, MRM1m, GR32, i32mem, + X86blsr_flag, loadi32>; + defm BLSR64 : bmi_bls<"blsr{q}", MRM1r, MRM1m, GR64, i64mem, + X86blsr_flag, loadi64>, VEX_W; + defm BLSMSK32 : bmi_bls<"blsmsk{l}", MRM2r, MRM2m, GR32, i32mem, + X86blsmsk_flag, loadi32>; + defm BLSMSK64 : bmi_bls<"blsmsk{q}", MRM2r, MRM2m, GR64, i64mem, + X86blsmsk_flag, loadi64>, VEX_W; + defm BLSI32 : bmi_bls<"blsi{l}", MRM3r, MRM3m, GR32, i32mem, + X86blsi_flag, loadi32>; + defm BLSI64 : bmi_bls<"blsi{q}", MRM3r, MRM3m, GR64, i64mem, + X86blsi_flag, loadi64>, VEX_W; +} + +multiclass bmi_bextr_bzhi<bits<8> opc, string mnemonic, RegisterClass RC, + X86MemOperand x86memop, Intrinsic Int, + PatFrag ld_frag> { + def rr : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), + !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set RC:$dst, (Int RC:$src1, RC:$src2)), (implicit EFLAGS)]>, + T8, VEX_4VOp3; + def rm : I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src1, RC:$src2), + !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set RC:$dst, (Int (ld_frag addr:$src1), RC:$src2)), + (implicit EFLAGS)]>, T8, VEX_4VOp3; +} + +let Predicates = [HasBMI], Defs = [EFLAGS] in { + defm BEXTR32 : bmi_bextr_bzhi<0xF7, "bextr{l}", GR32, i32mem, + int_x86_bmi_bextr_32, loadi32>; + defm BEXTR64 : bmi_bextr_bzhi<0xF7, "bextr{q}", GR64, i64mem, + int_x86_bmi_bextr_64, loadi64>, VEX_W; +} + +let Predicates = [HasBMI2], Defs = [EFLAGS] in { + defm BZHI32 : bmi_bextr_bzhi<0xF5, "bzhi{l}", GR32, i32mem, + int_x86_bmi_bzhi_32, loadi32>; + defm BZHI64 : bmi_bextr_bzhi<0xF5, "bzhi{q}", GR64, i64mem, + int_x86_bmi_bzhi_64, loadi64>, VEX_W; +} + +multiclass bmi_pdep_pext<string mnemonic, RegisterClass RC, + X86MemOperand x86memop, Intrinsic Int, + PatFrag ld_frag> { + def rr : I<0xF5, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), + !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set RC:$dst, (Int RC:$src1, RC:$src2))]>, + VEX_4V; + def rm : I<0xF5, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2), + !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set RC:$dst, (Int RC:$src1, (ld_frag addr:$src2)))]>, VEX_4V; +} + +let Predicates = [HasBMI2] in { + defm PDEP32 : bmi_pdep_pext<"pdep{l}", GR32, i32mem, + int_x86_bmi_pdep_32, loadi32>, T8XD; + defm PDEP64 : bmi_pdep_pext<"pdep{q}", GR64, i64mem, + int_x86_bmi_pdep_64, loadi64>, T8XD, VEX_W; + defm PEXT32 : bmi_pdep_pext<"pext{l}", GR32, i32mem, + int_x86_bmi_pext_32, loadi32>, T8XS; + defm PEXT64 : bmi_pdep_pext<"pext{q}", GR64, i64mem, + int_x86_bmi_pext_64, loadi64>, T8XS, VEX_W; +} + 
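For readers unfamiliar with the BMI/BMI2 operations that the patterns above select for, the scalar semantics they model can be sketched in plain C++ as below. This is an illustrative reference only: the helper names are invented for the example, none of this code is part of the patch, and it simply restates the well-known architectural definitions of BLSR/BLSMSK/BLSI, BEXTR, BZHI and PDEP/PEXT.

  // Reference model of the bit operations computed by the BMI/BMI2
  // instructions defined above (illustrative only, not LLVM code).
  #include <cstdint>

  uint32_t blsr(uint32_t x)   { return x & (x - 1); }   // clear lowest set bit
  uint32_t blsmsk(uint32_t x) { return x ^ (x - 1); }   // mask through lowest set bit
  uint32_t blsi(uint32_t x)   { return x & (0u - x); }  // isolate lowest set bit

  // BEXTR: extract 'len' bits starting at bit 'start'; both fields come from
  // the low two bytes of the control operand.
  uint32_t bextr(uint32_t x, uint32_t ctrl) {
    uint32_t start = ctrl & 0xff, len = (ctrl >> 8) & 0xff;
    if (start >= 32 || len == 0)
      return 0;
    uint64_t v = (uint64_t)x >> start;
    return len >= 32 ? (uint32_t)v : (uint32_t)(v & ((1u << len) - 1));
  }

  // BZHI: zero all bits at positions >= n (low byte of the index operand).
  uint32_t bzhi(uint32_t x, uint32_t index) {
    uint32_t n = index & 0xff;
    return n >= 32 ? x : (x & ((1u << n) - 1));
  }

  // PDEP: deposit the low bits of 'x' into the bit positions selected by 'mask'.
  uint32_t pdep(uint32_t x, uint32_t mask) {
    uint32_t result = 0;
    for (uint32_t bit = 1; mask != 0; bit <<= 1) {
      uint32_t lowest = mask & (0u - mask);
      if (x & bit)
        result |= lowest;
      mask &= mask - 1;
    }
    return result;
  }

  // PEXT: gather the bits of 'x' selected by 'mask' into the low bits of the
  // result.
  uint32_t pext(uint32_t x, uint32_t mask) {
    uint32_t result = 0;
    for (uint32_t bit = 1; mask != 0; bit <<= 1) {
      uint32_t lowest = mask & (0u - mask);
      if (x & lowest)
        result |= bit;
      mask &= mask - 1;
    }
    return result;
  }

The BLS* forms in the patch also set flags from their result, which is why those definitions are wrapped in Defs = [EFLAGS] and use the flag-producing X86blsr/X86blsmsk/X86blsi nodes.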
//===----------------------------------------------------------------------===// // Subsystems. //===----------------------------------------------------------------------===// @@ -1424,12 +1539,16 @@ include "X86InstrFragmentsSIMD.td" // FMA - Fused Multiply-Add support (requires FMA) include "X86InstrFMA.td" +// XOP +include "X86InstrXOP.td" + // SSE, MMX and 3DNow! vector support. include "X86InstrSSE.td" include "X86InstrMMX.td" include "X86Instr3DNow.td" include "X86InstrVMX.td" +include "X86InstrSVM.td" // System instructions. include "X86InstrSystem.td" @@ -1445,10 +1564,11 @@ def : MnemonicAlias<"call", "calll">, Requires<[In32BitMode]>; def : MnemonicAlias<"call", "callq">, Requires<[In64BitMode]>; def : MnemonicAlias<"cbw", "cbtw">; +def : MnemonicAlias<"cwde", "cwtl">; def : MnemonicAlias<"cwd", "cwtd">; def : MnemonicAlias<"cdq", "cltd">; -def : MnemonicAlias<"cwde", "cwtl">; def : MnemonicAlias<"cdqe", "cltq">; +def : MnemonicAlias<"cqo", "cqto">; // lret maps to lretl, it is not ambiguous with lretq. def : MnemonicAlias<"lret", "lretl">; @@ -1497,6 +1617,7 @@ def : MnemonicAlias<"verrw", "verr">; // System instruction aliases. def : MnemonicAlias<"iret", "iretl">; def : MnemonicAlias<"sysret", "sysretl">; +def : MnemonicAlias<"sysexit", "sysexitl">; def : MnemonicAlias<"lgdtl", "lgdt">, Requires<[In32BitMode]>; def : MnemonicAlias<"lgdtq", "lgdt">, Requires<[In64BitMode]>; @@ -1516,6 +1637,8 @@ def : MnemonicAlias<"fcmovna", "fcmovbe">; def : MnemonicAlias<"fcmovae", "fcmovnb">; def : MnemonicAlias<"fcomip", "fcompi">; def : MnemonicAlias<"fildq", "fildll">; +def : MnemonicAlias<"fistpq", "fistpll">; +def : MnemonicAlias<"fisttpq", "fisttpll">; def : MnemonicAlias<"fldcww", "fldcw">; def : MnemonicAlias<"fnstcww", "fnstcw">; def : MnemonicAlias<"fnstsww", "fnstsw">; @@ -1737,20 +1860,20 @@ def : InstAlias<"outl $port", (OUT32ir i8imm:$port)>; // errors, since its encoding is the most compact. 
def : InstAlias<"sldt $mem", (SLDT16m i16mem:$mem)>; -// shld/shrd op,op -> shld op, op, 1 -def : InstAlias<"shldw $r1, $r2", (SHLD16rri8 GR16:$r1, GR16:$r2, 1)>; -def : InstAlias<"shldl $r1, $r2", (SHLD32rri8 GR32:$r1, GR32:$r2, 1)>; -def : InstAlias<"shldq $r1, $r2", (SHLD64rri8 GR64:$r1, GR64:$r2, 1)>; -def : InstAlias<"shrdw $r1, $r2", (SHRD16rri8 GR16:$r1, GR16:$r2, 1)>; -def : InstAlias<"shrdl $r1, $r2", (SHRD32rri8 GR32:$r1, GR32:$r2, 1)>; -def : InstAlias<"shrdq $r1, $r2", (SHRD64rri8 GR64:$r1, GR64:$r2, 1)>; - -def : InstAlias<"shldw $mem, $reg", (SHLD16mri8 i16mem:$mem, GR16:$reg, 1)>; -def : InstAlias<"shldl $mem, $reg", (SHLD32mri8 i32mem:$mem, GR32:$reg, 1)>; -def : InstAlias<"shldq $mem, $reg", (SHLD64mri8 i64mem:$mem, GR64:$reg, 1)>; -def : InstAlias<"shrdw $mem, $reg", (SHRD16mri8 i16mem:$mem, GR16:$reg, 1)>; -def : InstAlias<"shrdl $mem, $reg", (SHRD32mri8 i32mem:$mem, GR32:$reg, 1)>; -def : InstAlias<"shrdq $mem, $reg", (SHRD64mri8 i64mem:$mem, GR64:$reg, 1)>; +// shld/shrd op,op -> shld op, op, CL +def : InstAlias<"shldw $r2, $r1", (SHLD16rrCL GR16:$r1, GR16:$r2)>; +def : InstAlias<"shldl $r2, $r1", (SHLD32rrCL GR32:$r1, GR32:$r2)>; +def : InstAlias<"shldq $r2, $r1", (SHLD64rrCL GR64:$r1, GR64:$r2)>; +def : InstAlias<"shrdw $r2, $r1", (SHRD16rrCL GR16:$r1, GR16:$r2)>; +def : InstAlias<"shrdl $r2, $r1", (SHRD32rrCL GR32:$r1, GR32:$r2)>; +def : InstAlias<"shrdq $r2, $r1", (SHRD64rrCL GR64:$r1, GR64:$r2)>; + +def : InstAlias<"shldw $reg, $mem", (SHLD16mrCL i16mem:$mem, GR16:$reg)>; +def : InstAlias<"shldl $reg, $mem", (SHLD32mrCL i32mem:$mem, GR32:$reg)>; +def : InstAlias<"shldq $reg, $mem", (SHLD64mrCL i64mem:$mem, GR64:$reg)>; +def : InstAlias<"shrdw $reg, $mem", (SHRD16mrCL i16mem:$mem, GR16:$reg)>; +def : InstAlias<"shrdl $reg, $mem", (SHRD32mrCL i32mem:$mem, GR32:$reg)>; +def : InstAlias<"shrdq $reg, $mem", (SHRD64mrCL i64mem:$mem, GR64:$reg)>; /* FIXME: This is disabled because the asm matcher is currently incapable of * matching a fixed immediate like $1. 
diff --git a/contrib/llvm/lib/Target/X86/X86InstrMMX.td b/contrib/llvm/lib/Target/X86/X86InstrMMX.td index b2d9fca..63f96b6 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrMMX.td +++ b/contrib/llvm/lib/Target/X86/X86InstrMMX.td @@ -1,4 +1,4 @@ -//====- X86InstrMMX.td - Describe the MMX Instruction Set --*- tablegen -*-===// +//===-- X86InstrMMX.td - Describe the MMX Instruction Set --*- tablegen -*-===// // // The LLVM Compiler Infrastructure // @@ -105,19 +105,23 @@ multiclass sse12_cvt_pint<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC, Intrinsic Int, X86MemOperand x86memop, PatFrag ld_frag, string asm, Domain d> { def irr : PI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), asm, - [(set DstRC:$dst, (Int SrcRC:$src))], d>; + [(set DstRC:$dst, (Int SrcRC:$src))], + IIC_DEFAULT, d>; def irm : PI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), asm, - [(set DstRC:$dst, (Int (ld_frag addr:$src)))], d>; + [(set DstRC:$dst, (Int (ld_frag addr:$src)))], + IIC_DEFAULT, d>; } multiclass sse12_cvt_pint_3addr<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC, Intrinsic Int, X86MemOperand x86memop, PatFrag ld_frag, string asm, Domain d> { def irr : PI<opc, MRMSrcReg, (outs DstRC:$dst),(ins DstRC:$src1, SrcRC:$src2), - asm, [(set DstRC:$dst, (Int DstRC:$src1, SrcRC:$src2))], d>; + asm, [(set DstRC:$dst, (Int DstRC:$src1, SrcRC:$src2))], + IIC_DEFAULT, d>; def irm : PI<opc, MRMSrcMem, (outs DstRC:$dst), (ins DstRC:$src1, x86memop:$src2), asm, - [(set DstRC:$dst, (Int DstRC:$src1, (ld_frag addr:$src2)))], d>; + [(set DstRC:$dst, (Int DstRC:$src1, (ld_frag addr:$src2)))], + IIC_DEFAULT, d>; } //===----------------------------------------------------------------------===// @@ -175,25 +179,25 @@ def MMX_MOVQ64mr : MMXI<0x7F, MRMDestMem, (outs), (ins i64mem:$dst, VR64:$src), "movq\t{$src, $dst|$dst, $src}", [(store (x86mmx VR64:$src), addr:$dst)]>; -def MMX_MOVDQ2Qrr : SDIi8<0xD6, MRMSrcReg, (outs VR64:$dst), (ins VR128:$src), - "movdq2q\t{$src, $dst|$dst, $src}", +def MMX_MOVDQ2Qrr : SDIi8<0xD6, MRMSrcReg, (outs VR64:$dst), + (ins VR128:$src), "movdq2q\t{$src, $dst|$dst, $src}", [(set VR64:$dst, (x86mmx (bitconvert (i64 (vector_extract (v2i64 VR128:$src), (iPTR 0))))))]>; -def MMX_MOVQ2DQrr : SSDIi8<0xD6, MRMSrcReg, (outs VR128:$dst), (ins VR64:$src), - "movq2dq\t{$src, $dst|$dst, $src}", +def MMX_MOVQ2DQrr : SSDIi8<0xD6, MRMSrcReg, (outs VR128:$dst), + (ins VR64:$src), "movq2dq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (v2i64 (scalar_to_vector (i64 (bitconvert (x86mmx VR64:$src))))))]>; let neverHasSideEffects = 1 in -def MMX_MOVQ2FR64rr: SSDIi8<0xD6, MRMSrcReg, (outs FR64:$dst), (ins VR64:$src), - "movq2dq\t{$src, $dst|$dst, $src}", []>; +def MMX_MOVQ2FR64rr: SSDIi8<0xD6, MRMSrcReg, (outs FR64:$dst), + (ins VR64:$src), "movq2dq\t{$src, $dst|$dst, $src}", []>; -def MMX_MOVFR642Qrr: SDIi8<0xD6, MRMSrcReg, (outs VR64:$dst), (ins FR64:$src), - "movdq2q\t{$src, $dst|$dst, $src}", []>; +def MMX_MOVFR642Qrr: SDIi8<0xD6, MRMSrcReg, (outs VR64:$dst), + (ins FR64:$src), "movdq2q\t{$src, $dst|$dst, $src}", []>; def MMX_MOVNTQmr : MMXI<0xE7, MRMDestMem, (outs), (ins i64mem:$dst, VR64:$src), "movntq\t{$src, $dst|$dst, $src}", diff --git a/contrib/llvm/lib/Target/X86/X86InstrSSE.td b/contrib/llvm/lib/Target/X86/X86InstrSSE.td index d3ced23..408ab167 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrSSE.td +++ b/contrib/llvm/lib/Target/X86/X86InstrSSE.td @@ -1,4 +1,4 @@ -//====- X86InstrSSE.td - Describe the X86 Instruction Set --*- tablegen -*-===// +//===-- X86InstrSSE.td - SSE Instruction 
Set ---------------*- tablegen -*-===// // // The LLVM Compiler Infrastructure // @@ -13,6 +13,126 @@ // //===----------------------------------------------------------------------===// +class OpndItins<InstrItinClass arg_rr, InstrItinClass arg_rm> { + InstrItinClass rr = arg_rr; + InstrItinClass rm = arg_rm; +} + +class SizeItins<OpndItins arg_s, OpndItins arg_d> { + OpndItins s = arg_s; + OpndItins d = arg_d; +} + + +class ShiftOpndItins<InstrItinClass arg_rr, InstrItinClass arg_rm, + InstrItinClass arg_ri> { + InstrItinClass rr = arg_rr; + InstrItinClass rm = arg_rm; + InstrItinClass ri = arg_ri; +} + + +// scalar +def SSE_ALU_F32S : OpndItins< + IIC_SSE_ALU_F32S_RR, IIC_SSE_ALU_F32S_RM +>; + +def SSE_ALU_F64S : OpndItins< + IIC_SSE_ALU_F64S_RR, IIC_SSE_ALU_F64S_RM +>; + +def SSE_ALU_ITINS_S : SizeItins< + SSE_ALU_F32S, SSE_ALU_F64S +>; + +def SSE_MUL_F32S : OpndItins< + IIC_SSE_MUL_F32S_RR, IIC_SSE_MUL_F64S_RM +>; + +def SSE_MUL_F64S : OpndItins< + IIC_SSE_MUL_F64S_RR, IIC_SSE_MUL_F64S_RM +>; + +def SSE_MUL_ITINS_S : SizeItins< + SSE_MUL_F32S, SSE_MUL_F64S +>; + +def SSE_DIV_F32S : OpndItins< + IIC_SSE_DIV_F32S_RR, IIC_SSE_DIV_F64S_RM +>; + +def SSE_DIV_F64S : OpndItins< + IIC_SSE_DIV_F64S_RR, IIC_SSE_DIV_F64S_RM +>; + +def SSE_DIV_ITINS_S : SizeItins< + SSE_DIV_F32S, SSE_DIV_F64S +>; + +// parallel +def SSE_ALU_F32P : OpndItins< + IIC_SSE_ALU_F32P_RR, IIC_SSE_ALU_F32P_RM +>; + +def SSE_ALU_F64P : OpndItins< + IIC_SSE_ALU_F64P_RR, IIC_SSE_ALU_F64P_RM +>; + +def SSE_ALU_ITINS_P : SizeItins< + SSE_ALU_F32P, SSE_ALU_F64P +>; + +def SSE_MUL_F32P : OpndItins< + IIC_SSE_MUL_F32P_RR, IIC_SSE_MUL_F64P_RM +>; + +def SSE_MUL_F64P : OpndItins< + IIC_SSE_MUL_F64P_RR, IIC_SSE_MUL_F64P_RM +>; + +def SSE_MUL_ITINS_P : SizeItins< + SSE_MUL_F32P, SSE_MUL_F64P +>; + +def SSE_DIV_F32P : OpndItins< + IIC_SSE_DIV_F32P_RR, IIC_SSE_DIV_F64P_RM +>; + +def SSE_DIV_F64P : OpndItins< + IIC_SSE_DIV_F64P_RR, IIC_SSE_DIV_F64P_RM +>; + +def SSE_DIV_ITINS_P : SizeItins< + SSE_DIV_F32P, SSE_DIV_F64P +>; + +def SSE_BIT_ITINS_P : OpndItins< + IIC_SSE_BIT_P_RR, IIC_SSE_BIT_P_RM +>; + +def SSE_INTALU_ITINS_P : OpndItins< + IIC_SSE_INTALU_P_RR, IIC_SSE_INTALU_P_RM +>; + +def SSE_INTALUQ_ITINS_P : OpndItins< + IIC_SSE_INTALUQ_P_RR, IIC_SSE_INTALUQ_P_RM +>; + +def SSE_INTMUL_ITINS_P : OpndItins< + IIC_SSE_INTMUL_P_RR, IIC_SSE_INTMUL_P_RM +>; + +def SSE_INTSHIFT_ITINS_P : ShiftOpndItins< + IIC_SSE_INTSH_P_RR, IIC_SSE_INTSH_P_RM, IIC_SSE_INTSH_P_RI +>; + +def SSE_MOVA_ITINS : OpndItins< + IIC_SSE_MOVA_P_RR, IIC_SSE_MOVA_P_RM +>; + +def SSE_MOVU_ITINS : OpndItins< + IIC_SSE_MOVU_P_RR, IIC_SSE_MOVU_P_RM +>; //===----------------------------------------------------------------------===// // SSE 1 & 2 Instructions Classes @@ -21,25 +141,27 @@ /// sse12_fp_scalar - SSE 1 & 2 scalar instructions class multiclass sse12_fp_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode, RegisterClass RC, X86MemOperand x86memop, + OpndItins itins, bit Is2Addr = 1> { let isCommutable = 1 in { def rr : SI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), !if(Is2Addr, !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), - [(set RC:$dst, (OpNode RC:$src1, RC:$src2))]>; + [(set RC:$dst, (OpNode RC:$src1, RC:$src2))], itins.rr>; } def rm : SI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2), !if(Is2Addr, !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), - [(set RC:$dst, (OpNode RC:$src1, 
(load addr:$src2)))]>; + [(set RC:$dst, (OpNode RC:$src1, (load addr:$src2)))], itins.rm>; } /// sse12_fp_scalar_int - SSE 1 & 2 scalar instructions intrinsics class multiclass sse12_fp_scalar_int<bits<8> opc, string OpcodeStr, RegisterClass RC, string asm, string SSEVer, string FPSizeStr, Operand memopr, ComplexPattern mem_cpat, + OpndItins itins, bit Is2Addr = 1> { def rr_Int : SI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), !if(Is2Addr, @@ -47,72 +169,74 @@ multiclass sse12_fp_scalar_int<bits<8> opc, string OpcodeStr, RegisterClass RC, !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), [(set RC:$dst, (!cast<Intrinsic>( !strconcat("int_x86_sse", SSEVer, "_", OpcodeStr, FPSizeStr)) - RC:$src1, RC:$src2))]>; + RC:$src1, RC:$src2))], itins.rr>; def rm_Int : SI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, memopr:$src2), !if(Is2Addr, !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"), !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), [(set RC:$dst, (!cast<Intrinsic>(!strconcat("int_x86_sse", SSEVer, "_", OpcodeStr, FPSizeStr)) - RC:$src1, mem_cpat:$src2))]>; + RC:$src1, mem_cpat:$src2))], itins.rm>; } /// sse12_fp_packed - SSE 1 & 2 packed instructions class multiclass sse12_fp_packed<bits<8> opc, string OpcodeStr, SDNode OpNode, RegisterClass RC, ValueType vt, X86MemOperand x86memop, PatFrag mem_frag, - Domain d, bit Is2Addr = 1> { + Domain d, OpndItins itins, bit Is2Addr = 1> { let isCommutable = 1 in def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), !if(Is2Addr, !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), - [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], d>; + [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], itins.rr, d>; let mayLoad = 1 in def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2), !if(Is2Addr, !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), - [(set RC:$dst, (OpNode RC:$src1, (mem_frag addr:$src2)))], d>; + [(set RC:$dst, (OpNode RC:$src1, (mem_frag addr:$src2)))], + itins.rm, d>; } /// sse12_fp_packed_logical_rm - SSE 1 & 2 packed instructions class multiclass sse12_fp_packed_logical_rm<bits<8> opc, RegisterClass RC, Domain d, string OpcodeStr, X86MemOperand x86memop, list<dag> pat_rr, list<dag> pat_rm, - bit Is2Addr = 1> { - let isCommutable = 1 in + bit Is2Addr = 1, + bit rr_hasSideEffects = 0> { + let isCommutable = 1, neverHasSideEffects = rr_hasSideEffects in def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), !if(Is2Addr, !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), - pat_rr, d>; + pat_rr, IIC_DEFAULT, d>; def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2), !if(Is2Addr, !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), - pat_rm, d>; + pat_rm, IIC_DEFAULT, d>; } /// sse12_fp_packed_int - SSE 1 & 2 packed instructions intrinsics class multiclass sse12_fp_packed_int<bits<8> opc, string OpcodeStr, RegisterClass RC, string asm, string SSEVer, string FPSizeStr, X86MemOperand x86memop, PatFrag mem_frag, - Domain d, bit Is2Addr = 1> { + Domain d, OpndItins itins, bit Is2Addr = 1> { def rr_Int : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), !if(Is2Addr, !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"), !strconcat(asm, "\t{$src2, $src1, 
$dst|$dst, $src1, $src2}")), [(set RC:$dst, (!cast<Intrinsic>( !strconcat("int_x86_", SSEVer, "_", OpcodeStr, FPSizeStr)) - RC:$src1, RC:$src2))], d>; + RC:$src1, RC:$src2))], IIC_DEFAULT, d>; def rm_Int : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1,x86memop:$src2), !if(Is2Addr, !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"), !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), [(set RC:$dst, (!cast<Intrinsic>( !strconcat("int_x86_", SSEVer, "_", OpcodeStr, FPSizeStr)) - RC:$src1, (mem_frag addr:$src2)))], d>; + RC:$src1, (mem_frag addr:$src2)))], IIC_DEFAULT, d>; } //===----------------------------------------------------------------------===// @@ -170,7 +294,7 @@ def : Pat<(v4f64 (scalar_to_vector FR64:$src)), // Bitcasts between 128-bit vector types. Return the original type since // no instruction is needed for the conversion -let Predicates = [HasXMMInt] in { +let Predicates = [HasSSE2] in { def : Pat<(v2i64 (bitconvert (v4i32 VR128:$src))), (v2i64 VR128:$src)>; def : Pat<(v2i64 (bitconvert (v8i16 VR128:$src))), (v2i64 VR128:$src)>; def : Pat<(v2i64 (bitconvert (v16i8 VR128:$src))), (v2i64 VR128:$src)>; @@ -239,21 +363,13 @@ let Predicates = [HasAVX] in { } // Alias instructions that map fld0 to pxor for sse. -// FIXME: Set encoding to pseudo! -let isReMaterializable = 1, isAsCheapAsAMove = 1, isCodeGenOnly = 1, - canFoldAsLoad = 1 in { - def FsFLD0SS : I<0xEF, MRMInitReg, (outs FR32:$dst), (ins), "", - [(set FR32:$dst, fp32imm0)]>, - Requires<[HasSSE1]>, TB, OpSize; - def FsFLD0SD : I<0xEF, MRMInitReg, (outs FR64:$dst), (ins), "", - [(set FR64:$dst, fpimm0)]>, - Requires<[HasSSE2]>, TB, OpSize; - def VFsFLD0SS : I<0xEF, MRMInitReg, (outs FR32:$dst), (ins), "", - [(set FR32:$dst, fp32imm0)]>, - Requires<[HasAVX]>, TB, OpSize, VEX_4V; - def VFsFLD0SD : I<0xEF, MRMInitReg, (outs FR64:$dst), (ins), "", - [(set FR64:$dst, fpimm0)]>, - Requires<[HasAVX]>, TB, OpSize, VEX_4V; +// This is expanded by ExpandPostRAPseudos. +let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, + isPseudo = 1 in { + def FsFLD0SS : I<0, Pseudo, (outs FR32:$dst), (ins), "", + [(set FR32:$dst, fp32imm0)]>, Requires<[HasSSE1]>; + def FsFLD0SD : I<0, Pseudo, (outs FR64:$dst), (ins), "", + [(set FR64:$dst, fpimm0)]>, Requires<[HasSSE2]>; } //===----------------------------------------------------------------------===// @@ -286,16 +402,35 @@ def : Pat<(v16i8 immAllZerosV), (V_SET0)>; // JIT implementatioan, it does not expand the instructions below like // X86MCInstLower does. let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, - isCodeGenOnly = 1, Predicates = [HasAVX] in { + isCodeGenOnly = 1 in { +let Predicates = [HasAVX] in { def AVX_SET0PSY : PSI<0x57, MRMInitReg, (outs VR256:$dst), (ins), "", [(set VR256:$dst, (v8f32 immAllZerosV))]>, VEX_4V; def AVX_SET0PDY : PDI<0x57, MRMInitReg, (outs VR256:$dst), (ins), "", [(set VR256:$dst, (v4f64 immAllZerosV))]>, VEX_4V; } +let Predicates = [HasAVX2], neverHasSideEffects = 1 in +def AVX2_SET0 : PDI<0xef, MRMInitReg, (outs VR256:$dst), (ins), "", + []>, VEX_4V; +} +let Predicates = [HasAVX2], AddedComplexity = 5 in { + def : Pat<(v4i64 immAllZerosV), (AVX2_SET0)>; + def : Pat<(v8i32 immAllZerosV), (AVX2_SET0)>; + def : Pat<(v16i16 immAllZerosV), (AVX2_SET0)>; + def : Pat<(v32i8 immAllZerosV), (AVX2_SET0)>; +} // AVX has no support for 256-bit integer instructions, but since the 128-bit // VPXOR instruction writes zero to its upper part, it's safe build zeros. 
+def : Pat<(v32i8 immAllZerosV), (SUBREG_TO_REG (i8 0), (V_SET0), sub_xmm)>; +def : Pat<(bc_v32i8 (v8f32 immAllZerosV)), + (SUBREG_TO_REG (i8 0), (V_SET0), sub_xmm)>; + +def : Pat<(v16i16 immAllZerosV), (SUBREG_TO_REG (i16 0), (V_SET0), sub_xmm)>; +def : Pat<(bc_v16i16 (v8f32 immAllZerosV)), + (SUBREG_TO_REG (i16 0), (V_SET0), sub_xmm)>; + def : Pat<(v8i32 immAllZerosV), (SUBREG_TO_REG (i32 0), (V_SET0), sub_xmm)>; def : Pat<(bc_v8i32 (v8f32 immAllZerosV)), (SUBREG_TO_REG (i32 0), (V_SET0), sub_xmm)>; @@ -310,13 +445,16 @@ def : Pat<(bc_v4i64 (v8f32 immAllZerosV)), // JIT implementation, it does not expand the instructions below like // X86MCInstLower does. let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, - isCodeGenOnly = 1, ExeDomain = SSEPackedInt in - def V_SETALLONES : PDI<0x76, MRMInitReg, (outs VR128:$dst), (ins), "", - [(set VR128:$dst, (v4i32 immAllOnesV))]>; -let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, - isCodeGenOnly = 1, ExeDomain = SSEPackedInt, Predicates = [HasAVX] in + isCodeGenOnly = 1, ExeDomain = SSEPackedInt in { + let Predicates = [HasAVX] in def AVX_SETALLONES : PDI<0x76, MRMInitReg, (outs VR128:$dst), (ins), "", [(set VR128:$dst, (v4i32 immAllOnesV))]>, VEX_4V; + def V_SETALLONES : PDI<0x76, MRMInitReg, (outs VR128:$dst), (ins), "", + [(set VR128:$dst, (v4i32 immAllOnesV))]>; + let Predicates = [HasAVX2] in + def AVX2_SETALLONES : PDI<0x76, MRMInitReg, (outs VR256:$dst), (ins), "", + [(set VR256:$dst, (v8i32 immAllOnesV))]>, VEX_4V; +} //===----------------------------------------------------------------------===// @@ -329,22 +467,25 @@ let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, // in terms of a copy, and just mentioned, we don't use movss/movsd for copies. //===----------------------------------------------------------------------===// -class sse12_move_rr<RegisterClass RC, ValueType vt, string asm> : +class sse12_move_rr<RegisterClass RC, SDNode OpNode, ValueType vt, string asm> : SI<0x10, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, RC:$src2), asm, - [(set (vt VR128:$dst), (movl VR128:$src1, (scalar_to_vector RC:$src2)))]>; + [(set VR128:$dst, (vt (OpNode VR128:$src1, + (scalar_to_vector RC:$src2))))], + IIC_SSE_MOV_S_RR>; // Loading from memory automatically zeroing upper bits. 
class sse12_move_rm<RegisterClass RC, X86MemOperand x86memop, PatFrag mem_pat, string OpcodeStr> : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set RC:$dst, (mem_pat addr:$src))]>; + [(set RC:$dst, (mem_pat addr:$src))], + IIC_SSE_MOV_S_RM>; // AVX -def VMOVSSrr : sse12_move_rr<FR32, v4f32, +def VMOVSSrr : sse12_move_rr<FR32, X86Movss, v4f32, "movss\t{$src2, $src1, $dst|$dst, $src1, $src2}">, XS, VEX_4V, VEX_LIG; -def VMOVSDrr : sse12_move_rr<FR64, v2f64, +def VMOVSDrr : sse12_move_rr<FR64, X86Movsd, v2f64, "movsd\t{$src2, $src1, $dst|$dst, $src1, $src2}">, XD, VEX_4V, VEX_LIG; @@ -352,11 +493,13 @@ def VMOVSDrr : sse12_move_rr<FR64, v2f64, let isCodeGenOnly = 1 in { def VMOVSSrr_REV : SI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src1, FR32:$src2), - "movss\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, + "movss\t{$src2, $src1, $dst|$dst, $src1, $src2}", [], + IIC_SSE_MOV_S_RR>, XS, VEX_4V, VEX_LIG; def VMOVSDrr_REV : SI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src1, FR64:$src2), - "movsd\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, + "movsd\t{$src2, $src1, $dst|$dst, $src1, $src2}", [], + IIC_SSE_MOV_S_RR>, XD, VEX_4V, VEX_LIG; } @@ -370,26 +513,30 @@ let canFoldAsLoad = 1, isReMaterializable = 1 in { def VMOVSSmr : SI<0x11, MRMDestMem, (outs), (ins f32mem:$dst, FR32:$src), "movss\t{$src, $dst|$dst, $src}", - [(store FR32:$src, addr:$dst)]>, XS, VEX, VEX_LIG; + [(store FR32:$src, addr:$dst)], IIC_SSE_MOV_S_MR>, + XS, VEX, VEX_LIG; def VMOVSDmr : SI<0x11, MRMDestMem, (outs), (ins f64mem:$dst, FR64:$src), "movsd\t{$src, $dst|$dst, $src}", - [(store FR64:$src, addr:$dst)]>, XD, VEX, VEX_LIG; + [(store FR64:$src, addr:$dst)], IIC_SSE_MOV_S_MR>, + XD, VEX, VEX_LIG; // SSE1 & 2 let Constraints = "$src1 = $dst" in { - def MOVSSrr : sse12_move_rr<FR32, v4f32, + def MOVSSrr : sse12_move_rr<FR32, X86Movss, v4f32, "movss\t{$src2, $dst|$dst, $src2}">, XS; - def MOVSDrr : sse12_move_rr<FR64, v2f64, + def MOVSDrr : sse12_move_rr<FR64, X86Movsd, v2f64, "movsd\t{$src2, $dst|$dst, $src2}">, XD; // For the disassembler let isCodeGenOnly = 1 in { def MOVSSrr_REV : SI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src1, FR32:$src2), - "movss\t{$src2, $dst|$dst, $src2}", []>, XS; + "movss\t{$src2, $dst|$dst, $src2}", [], + IIC_SSE_MOV_S_RR>, XS; def MOVSDrr_REV : SI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src1, FR64:$src2), - "movsd\t{$src2, $dst|$dst, $src2}", []>, XD; + "movsd\t{$src2, $dst|$dst, $src2}", [], + IIC_SSE_MOV_S_RR>, XD; } } @@ -402,153 +549,14 @@ let canFoldAsLoad = 1, isReMaterializable = 1 in { def MOVSSmr : SSI<0x11, MRMDestMem, (outs), (ins f32mem:$dst, FR32:$src), "movss\t{$src, $dst|$dst, $src}", - [(store FR32:$src, addr:$dst)]>; + [(store FR32:$src, addr:$dst)], IIC_SSE_MOV_S_MR>; def MOVSDmr : SDI<0x11, MRMDestMem, (outs), (ins f64mem:$dst, FR64:$src), "movsd\t{$src, $dst|$dst, $src}", - [(store FR64:$src, addr:$dst)]>; + [(store FR64:$src, addr:$dst)], IIC_SSE_MOV_S_MR>; // Patterns -let Predicates = [HasSSE1] in { - let AddedComplexity = 15 in { - // Extract the low 32-bit value from one vector and insert it into another. - def : Pat<(v4f32 (movl VR128:$src1, VR128:$src2)), - (MOVSSrr (v4f32 VR128:$src1), - (EXTRACT_SUBREG (v4f32 VR128:$src2), sub_ss))>; - def : Pat<(v4i32 (movl VR128:$src1, VR128:$src2)), - (MOVSSrr (v4i32 VR128:$src1), - (EXTRACT_SUBREG (v4i32 VR128:$src2), sub_ss))>; - - // Move scalar to XMM zero-extended, zeroing a VR128 then do a - // MOVSS to the lower bits. 
- def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32:$src)))), - (MOVSSrr (v4f32 (V_SET0)), FR32:$src)>; - def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))), - (MOVSSrr (v4f32 (V_SET0)), - (f32 (EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss)))>; - def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))), - (MOVSSrr (v4i32 (V_SET0)), - (EXTRACT_SUBREG (v4i32 VR128:$src), sub_ss))>; - } - - let AddedComplexity = 20 in { - // MOVSSrm zeros the high parts of the register; represent this - // with SUBREG_TO_REG. - def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))), - (SUBREG_TO_REG (i32 0), (MOVSSrm addr:$src), sub_ss)>; - def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))), - (SUBREG_TO_REG (i32 0), (MOVSSrm addr:$src), sub_ss)>; - def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))), - (SUBREG_TO_REG (i32 0), (MOVSSrm addr:$src), sub_ss)>; - } - - // Extract and store. - def : Pat<(store (f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))), - addr:$dst), - (MOVSSmr addr:$dst, - (EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss))>; - - // Shuffle with MOVSS - def : Pat<(v4f32 (X86Movss VR128:$src1, (scalar_to_vector FR32:$src2))), - (MOVSSrr VR128:$src1, FR32:$src2)>; - def : Pat<(v4i32 (X86Movss VR128:$src1, VR128:$src2)), - (MOVSSrr (v4i32 VR128:$src1), - (EXTRACT_SUBREG (v4i32 VR128:$src2), sub_ss))>; - def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)), - (MOVSSrr (v4f32 VR128:$src1), - (EXTRACT_SUBREG (v4f32 VR128:$src2), sub_ss))>; -} - -let Predicates = [HasSSE2] in { - let AddedComplexity = 15 in { - // Extract the low 64-bit value from one vector and insert it into another. - def : Pat<(v2f64 (movl VR128:$src1, VR128:$src2)), - (MOVSDrr (v2f64 VR128:$src1), - (EXTRACT_SUBREG (v2f64 VR128:$src2), sub_sd))>; - def : Pat<(v2i64 (movl VR128:$src1, VR128:$src2)), - (MOVSDrr (v2i64 VR128:$src1), - (EXTRACT_SUBREG (v2i64 VR128:$src2), sub_sd))>; - - // vector_shuffle v1, v2 <4, 5, 2, 3> using movsd - def : Pat<(v4f32 (movlp VR128:$src1, VR128:$src2)), - (MOVSDrr VR128:$src1, (EXTRACT_SUBREG VR128:$src2, sub_sd))>; - def : Pat<(v4i32 (movlp VR128:$src1, VR128:$src2)), - (MOVSDrr VR128:$src1, (EXTRACT_SUBREG VR128:$src2, sub_sd))>; - - // Move scalar to XMM zero-extended, zeroing a VR128 then do a - // MOVSD to the lower bits. - def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64:$src)))), - (MOVSDrr (v2f64 (V_SET0)), FR64:$src)>; - } - - let AddedComplexity = 20 in { - // MOVSDrm zeros the high parts of the register; represent this - // with SUBREG_TO_REG. - def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))), - (SUBREG_TO_REG (i64 0), (MOVSDrm addr:$src), sub_sd)>; - def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))), - (SUBREG_TO_REG (i64 0), (MOVSDrm addr:$src), sub_sd)>; - def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))), - (SUBREG_TO_REG (i64 0), (MOVSDrm addr:$src), sub_sd)>; - def : Pat<(v2f64 (X86vzmovl (bc_v2f64 (loadv4f32 addr:$src)))), - (SUBREG_TO_REG (i64 0), (MOVSDrm addr:$src), sub_sd)>; - def : Pat<(v2f64 (X86vzload addr:$src)), - (SUBREG_TO_REG (i64 0), (MOVSDrm addr:$src), sub_sd)>; - } - - // Extract and store. 
- def : Pat<(store (f64 (vector_extract (v2f64 VR128:$src), (iPTR 0))), - addr:$dst), - (MOVSDmr addr:$dst, - (EXTRACT_SUBREG (v2f64 VR128:$src), sub_sd))>; - - // Shuffle with MOVSD - def : Pat<(v2f64 (X86Movsd VR128:$src1, (scalar_to_vector FR64:$src2))), - (MOVSDrr VR128:$src1, FR64:$src2)>; - def : Pat<(v2i64 (X86Movsd VR128:$src1, VR128:$src2)), - (MOVSDrr (v2i64 VR128:$src1), - (EXTRACT_SUBREG (v2i64 VR128:$src2), sub_sd))>; - def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)), - (MOVSDrr (v2f64 VR128:$src1), - (EXTRACT_SUBREG (v2f64 VR128:$src2), sub_sd))>; - def : Pat<(v4f32 (X86Movsd VR128:$src1, VR128:$src2)), - (MOVSDrr VR128:$src1, (EXTRACT_SUBREG (v4f32 VR128:$src2),sub_sd))>; - def : Pat<(v4i32 (X86Movsd VR128:$src1, VR128:$src2)), - (MOVSDrr VR128:$src1, (EXTRACT_SUBREG (v4i32 VR128:$src2),sub_sd))>; - - // FIXME: Instead of a X86Movlps there should be a X86Movsd here, the problem - // is during lowering, where it's not possible to recognize the fold cause - // it has two uses through a bitcast. One use disappears at isel time and the - // fold opportunity reappears. - def : Pat<(v4f32 (X86Movlps VR128:$src1, VR128:$src2)), - (MOVSDrr VR128:$src1, (EXTRACT_SUBREG (v4f32 VR128:$src2),sub_sd))>; - def : Pat<(v4i32 (X86Movlps VR128:$src1, VR128:$src2)), - (MOVSDrr VR128:$src1, (EXTRACT_SUBREG (v4i32 VR128:$src2),sub_sd))>; -} - let Predicates = [HasAVX] in { let AddedComplexity = 15 in { - // Extract the low 32-bit value from one vector and insert it into another. - def : Pat<(v4f32 (movl VR128:$src1, VR128:$src2)), - (VMOVSSrr (v4f32 VR128:$src1), - (EXTRACT_SUBREG (v4f32 VR128:$src2), sub_ss))>; - def : Pat<(v4i32 (movl VR128:$src1, VR128:$src2)), - (VMOVSSrr (v4i32 VR128:$src1), - (EXTRACT_SUBREG (v4i32 VR128:$src2), sub_ss))>; - - // Extract the low 64-bit value from one vector and insert it into another. - def : Pat<(v2f64 (movl VR128:$src1, VR128:$src2)), - (VMOVSDrr (v2f64 VR128:$src1), - (EXTRACT_SUBREG (v2f64 VR128:$src2), sub_sd))>; - def : Pat<(v2i64 (movl VR128:$src1, VR128:$src2)), - (VMOVSDrr (v2i64 VR128:$src1), - (EXTRACT_SUBREG (v2i64 VR128:$src2), sub_sd))>; - - // vector_shuffle v1, v2 <4, 5, 2, 3> using movsd - def : Pat<(v4f32 (movlp VR128:$src1, VR128:$src2)), - (VMOVSDrr VR128:$src1, (EXTRACT_SUBREG VR128:$src2, sub_sd))>; - def : Pat<(v4i32 (movlp VR128:$src1, VR128:$src2)), - (VMOVSDrr VR128:$src1, (EXTRACT_SUBREG VR128:$src2, sub_sd))>; - // Move scalar to XMM zero-extended, zeroing a VR128 then do a // MOVS{S,D} to the lower bits. def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32:$src)))), @@ -561,6 +569,16 @@ let Predicates = [HasAVX] in { (EXTRACT_SUBREG (v4i32 VR128:$src), sub_ss))>; def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64:$src)))), (VMOVSDrr (v2f64 (V_SET0)), FR64:$src)>; + + // Move low f32 and clear high bits. 
+ def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))), + (SUBREG_TO_REG (i32 0), + (VMOVSSrr (v4f32 (V_SET0)), + (EXTRACT_SUBREG (v8f32 VR256:$src), sub_ss)), sub_xmm)>; + def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))), + (SUBREG_TO_REG (i32 0), + (VMOVSSrr (v4i32 (V_SET0)), + (EXTRACT_SUBREG (v8i32 VR256:$src), sub_ss)), sub_xmm)>; } let AddedComplexity = 20 in { @@ -588,6 +606,9 @@ let Predicates = [HasAVX] in { // Represent the same patterns above but in the form they appear for // 256-bit types + def : Pat<(v8i32 (X86vzmovl (insert_subvector undef, + (v4i32 (scalar_to_vector (loadi32 addr:$src))), (i32 0)))), + (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_ss)>; def : Pat<(v8f32 (X86vzmovl (insert_subvector undef, (v4f32 (scalar_to_vector (loadf32 addr:$src))), (i32 0)))), (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_ss)>; @@ -605,6 +626,20 @@ let Predicates = [HasAVX] in { (SUBREG_TO_REG (i64 0), (v2f64 (VMOVSDrr (v2f64 (V_SET0)), FR64:$src)), sub_xmm)>; + def : Pat<(v4i64 (X86vzmovl (insert_subvector undef, + (v2i64 (scalar_to_vector (loadi64 addr:$src))), (i32 0)))), + (SUBREG_TO_REG (i64 0), (VMOVSDrm addr:$src), sub_sd)>; + + // Move low f64 and clear high bits. + def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))), + (SUBREG_TO_REG (i32 0), + (VMOVSDrr (v2f64 (V_SET0)), + (EXTRACT_SUBREG (v4f64 VR256:$src), sub_sd)), sub_xmm)>; + + def : Pat<(v4i64 (X86vzmovl (v4i64 VR256:$src))), + (SUBREG_TO_REG (i32 0), + (VMOVSDrr (v2i64 (V_SET0)), + (EXTRACT_SUBREG (v4i64 VR256:$src), sub_sd)), sub_xmm)>; // Extract and store. def : Pat<(store (f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))), @@ -617,8 +652,6 @@ let Predicates = [HasAVX] in { (EXTRACT_SUBREG (v2f64 VR128:$src), sub_sd))>; // Shuffle with VMOVSS - def : Pat<(v4f32 (X86Movss VR128:$src1, (scalar_to_vector FR32:$src2))), - (VMOVSSrr VR128:$src1, FR32:$src2)>; def : Pat<(v4i32 (X86Movss VR128:$src1, VR128:$src2)), (VMOVSSrr (v4i32 VR128:$src1), (EXTRACT_SUBREG (v4i32 VR128:$src2), sub_ss))>; @@ -626,9 +659,17 @@ let Predicates = [HasAVX] in { (VMOVSSrr (v4f32 VR128:$src1), (EXTRACT_SUBREG (v4f32 VR128:$src2), sub_ss))>; + // 256-bit variants + def : Pat<(v8i32 (X86Movss VR256:$src1, VR256:$src2)), + (SUBREG_TO_REG (i32 0), + (VMOVSSrr (EXTRACT_SUBREG (v8i32 VR256:$src1), sub_ss), + (EXTRACT_SUBREG (v8i32 VR256:$src2), sub_ss)), sub_xmm)>; + def : Pat<(v8f32 (X86Movss VR256:$src1, VR256:$src2)), + (SUBREG_TO_REG (i32 0), + (VMOVSSrr (EXTRACT_SUBREG (v8f32 VR256:$src1), sub_ss), + (EXTRACT_SUBREG (v8f32 VR256:$src2), sub_ss)), sub_xmm)>; + // Shuffle with VMOVSD - def : Pat<(v2f64 (X86Movsd VR128:$src1, (scalar_to_vector FR64:$src2))), - (VMOVSDrr VR128:$src1, FR64:$src2)>; def : Pat<(v2i64 (X86Movsd VR128:$src1, VR128:$src2)), (VMOVSDrr (v2i64 VR128:$src1), (EXTRACT_SUBREG (v2i64 VR128:$src2), sub_sd))>; @@ -642,10 +683,27 @@ let Predicates = [HasAVX] in { (VMOVSDrr VR128:$src1, (EXTRACT_SUBREG (v4i32 VR128:$src2), sub_sd))>; + // 256-bit variants + def : Pat<(v4i64 (X86Movsd VR256:$src1, VR256:$src2)), + (SUBREG_TO_REG (i32 0), + (VMOVSDrr (EXTRACT_SUBREG (v4i64 VR256:$src1), sub_sd), + (EXTRACT_SUBREG (v4i64 VR256:$src2), sub_sd)), sub_xmm)>; + def : Pat<(v4f64 (X86Movsd VR256:$src1, VR256:$src2)), + (SUBREG_TO_REG (i32 0), + (VMOVSDrr (EXTRACT_SUBREG (v4f64 VR256:$src1), sub_sd), + (EXTRACT_SUBREG (v4f64 VR256:$src2), sub_sd)), sub_xmm)>; + + // FIXME: Instead of a X86Movlps there should be a X86Movsd here, the problem // is during lowering, where it's not possible to recognize the fold cause // it has two uses through 
a bitcast. One use disappears at isel time and the // fold opportunity reappears. + def : Pat<(v2f64 (X86Movlpd VR128:$src1, VR128:$src2)), + (VMOVSDrr VR128:$src1, (EXTRACT_SUBREG (v2f64 VR128:$src2), + sub_sd))>; + def : Pat<(v2i64 (X86Movlpd VR128:$src1, VR128:$src2)), + (VMOVSDrr VR128:$src1, (EXTRACT_SUBREG (v2i64 VR128:$src2), + sub_sd))>; def : Pat<(v4f32 (X86Movlps VR128:$src1, VR128:$src2)), (VMOVSDrr VR128:$src1, (EXTRACT_SUBREG (v4f32 VR128:$src2), sub_sd))>; @@ -654,6 +712,101 @@ let Predicates = [HasAVX] in { sub_sd))>; } +let Predicates = [HasSSE1] in { + let AddedComplexity = 15 in { + // Move scalar to XMM zero-extended, zeroing a VR128 then do a + // MOVSS to the lower bits. + def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32:$src)))), + (MOVSSrr (v4f32 (V_SET0)), FR32:$src)>; + def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))), + (MOVSSrr (v4f32 (V_SET0)), + (f32 (EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss)))>; + def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))), + (MOVSSrr (v4i32 (V_SET0)), + (EXTRACT_SUBREG (v4i32 VR128:$src), sub_ss))>; + } + + let AddedComplexity = 20 in { + // MOVSSrm zeros the high parts of the register; represent this + // with SUBREG_TO_REG. + def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))), + (SUBREG_TO_REG (i32 0), (MOVSSrm addr:$src), sub_ss)>; + def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))), + (SUBREG_TO_REG (i32 0), (MOVSSrm addr:$src), sub_ss)>; + def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))), + (SUBREG_TO_REG (i32 0), (MOVSSrm addr:$src), sub_ss)>; + } + + // Extract and store. + def : Pat<(store (f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))), + addr:$dst), + (MOVSSmr addr:$dst, + (EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss))>; + + // Shuffle with MOVSS + def : Pat<(v4i32 (X86Movss VR128:$src1, VR128:$src2)), + (MOVSSrr (v4i32 VR128:$src1), + (EXTRACT_SUBREG (v4i32 VR128:$src2), sub_ss))>; + def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)), + (MOVSSrr (v4f32 VR128:$src1), + (EXTRACT_SUBREG (v4f32 VR128:$src2), sub_ss))>; +} + +let Predicates = [HasSSE2] in { + let AddedComplexity = 15 in { + // Move scalar to XMM zero-extended, zeroing a VR128 then do a + // MOVSD to the lower bits. + def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64:$src)))), + (MOVSDrr (v2f64 (V_SET0)), FR64:$src)>; + } + + let AddedComplexity = 20 in { + // MOVSDrm zeros the high parts of the register; represent this + // with SUBREG_TO_REG. + def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))), + (SUBREG_TO_REG (i64 0), (MOVSDrm addr:$src), sub_sd)>; + def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))), + (SUBREG_TO_REG (i64 0), (MOVSDrm addr:$src), sub_sd)>; + def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))), + (SUBREG_TO_REG (i64 0), (MOVSDrm addr:$src), sub_sd)>; + def : Pat<(v2f64 (X86vzmovl (bc_v2f64 (loadv4f32 addr:$src)))), + (SUBREG_TO_REG (i64 0), (MOVSDrm addr:$src), sub_sd)>; + def : Pat<(v2f64 (X86vzload addr:$src)), + (SUBREG_TO_REG (i64 0), (MOVSDrm addr:$src), sub_sd)>; + } + + // Extract and store. 
+ def : Pat<(store (f64 (vector_extract (v2f64 VR128:$src), (iPTR 0))), + addr:$dst), + (MOVSDmr addr:$dst, + (EXTRACT_SUBREG (v2f64 VR128:$src), sub_sd))>; + + // Shuffle with MOVSD + def : Pat<(v2i64 (X86Movsd VR128:$src1, VR128:$src2)), + (MOVSDrr (v2i64 VR128:$src1), + (EXTRACT_SUBREG (v2i64 VR128:$src2), sub_sd))>; + def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)), + (MOVSDrr (v2f64 VR128:$src1), + (EXTRACT_SUBREG (v2f64 VR128:$src2), sub_sd))>; + def : Pat<(v4f32 (X86Movsd VR128:$src1, VR128:$src2)), + (MOVSDrr VR128:$src1, (EXTRACT_SUBREG (v4f32 VR128:$src2),sub_sd))>; + def : Pat<(v4i32 (X86Movsd VR128:$src1, VR128:$src2)), + (MOVSDrr VR128:$src1, (EXTRACT_SUBREG (v4i32 VR128:$src2),sub_sd))>; + + // FIXME: Instead of a X86Movlps there should be a X86Movsd here, the problem + // is during lowering, where it's not possible to recognize the fold cause + // it has two uses through a bitcast. One use disappears at isel time and the + // fold opportunity reappears. + def : Pat<(v2f64 (X86Movlpd VR128:$src1, VR128:$src2)), + (MOVSDrr VR128:$src1, (EXTRACT_SUBREG (v2f64 VR128:$src2),sub_sd))>; + def : Pat<(v2i64 (X86Movlpd VR128:$src1, VR128:$src2)), + (MOVSDrr VR128:$src1, (EXTRACT_SUBREG (v2i64 VR128:$src2),sub_sd))>; + def : Pat<(v4f32 (X86Movlps VR128:$src1, VR128:$src2)), + (MOVSDrr VR128:$src1, (EXTRACT_SUBREG (v4f32 VR128:$src2),sub_sd))>; + def : Pat<(v4i32 (X86Movlps VR128:$src1, VR128:$src2)), + (MOVSDrr VR128:$src1, (EXTRACT_SUBREG (v4i32 VR128:$src2),sub_sd))>; +} + //===----------------------------------------------------------------------===// // SSE 1 & 2 - Move Aligned/Unaligned FP Instructions //===----------------------------------------------------------------------===// @@ -661,126 +814,176 @@ let Predicates = [HasAVX] in { multiclass sse12_mov_packed<bits<8> opc, RegisterClass RC, X86MemOperand x86memop, PatFrag ld_frag, string asm, Domain d, + OpndItins itins, bit IsReMaterializable = 1> { let neverHasSideEffects = 1 in def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src), - !strconcat(asm, "\t{$src, $dst|$dst, $src}"), [], d>; + !strconcat(asm, "\t{$src, $dst|$dst, $src}"), [], itins.rr, d>; let canFoldAsLoad = 1, isReMaterializable = IsReMaterializable in def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), !strconcat(asm, "\t{$src, $dst|$dst, $src}"), - [(set RC:$dst, (ld_frag addr:$src))], d>; + [(set RC:$dst, (ld_frag addr:$src))], itins.rm, d>; } defm VMOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32, - "movaps", SSEPackedSingle>, TB, VEX; + "movaps", SSEPackedSingle, SSE_MOVA_ITINS>, + TB, VEX; defm VMOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64, - "movapd", SSEPackedDouble>, TB, OpSize, VEX; + "movapd", SSEPackedDouble, SSE_MOVA_ITINS>, + TB, OpSize, VEX; defm VMOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32, - "movups", SSEPackedSingle>, TB, VEX; + "movups", SSEPackedSingle, SSE_MOVU_ITINS>, + TB, VEX; defm VMOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64, - "movupd", SSEPackedDouble, 0>, TB, OpSize, VEX; + "movupd", SSEPackedDouble, SSE_MOVU_ITINS, 0>, + TB, OpSize, VEX; defm VMOVAPSY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv8f32, - "movaps", SSEPackedSingle>, TB, VEX; + "movaps", SSEPackedSingle, SSE_MOVA_ITINS>, + TB, VEX; defm VMOVAPDY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv4f64, - "movapd", SSEPackedDouble>, TB, OpSize, VEX; + "movapd", SSEPackedDouble, SSE_MOVA_ITINS>, + TB, OpSize, VEX; defm VMOVUPSY : sse12_mov_packed<0x10, VR256, f256mem, 
loadv8f32, - "movups", SSEPackedSingle>, TB, VEX; + "movups", SSEPackedSingle, SSE_MOVU_ITINS>, + TB, VEX; defm VMOVUPDY : sse12_mov_packed<0x10, VR256, f256mem, loadv4f64, - "movupd", SSEPackedDouble, 0>, TB, OpSize, VEX; + "movupd", SSEPackedDouble, SSE_MOVU_ITINS, 0>, + TB, OpSize, VEX; defm MOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32, - "movaps", SSEPackedSingle>, TB; + "movaps", SSEPackedSingle, SSE_MOVA_ITINS>, + TB; defm MOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64, - "movapd", SSEPackedDouble>, TB, OpSize; + "movapd", SSEPackedDouble, SSE_MOVA_ITINS>, + TB, OpSize; defm MOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32, - "movups", SSEPackedSingle>, TB; + "movups", SSEPackedSingle, SSE_MOVU_ITINS>, + TB; defm MOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64, - "movupd", SSEPackedDouble, 0>, TB, OpSize; + "movupd", SSEPackedDouble, SSE_MOVU_ITINS, 0>, + TB, OpSize; def VMOVAPSmr : VPSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), "movaps\t{$src, $dst|$dst, $src}", - [(alignedstore (v4f32 VR128:$src), addr:$dst)]>, VEX; + [(alignedstore (v4f32 VR128:$src), addr:$dst)], + IIC_SSE_MOVA_P_MR>, VEX; def VMOVAPDmr : VPDI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), "movapd\t{$src, $dst|$dst, $src}", - [(alignedstore (v2f64 VR128:$src), addr:$dst)]>, VEX; + [(alignedstore (v2f64 VR128:$src), addr:$dst)], + IIC_SSE_MOVA_P_MR>, VEX; def VMOVUPSmr : VPSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), "movups\t{$src, $dst|$dst, $src}", - [(store (v4f32 VR128:$src), addr:$dst)]>, VEX; + [(store (v4f32 VR128:$src), addr:$dst)], + IIC_SSE_MOVU_P_MR>, VEX; def VMOVUPDmr : VPDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), "movupd\t{$src, $dst|$dst, $src}", - [(store (v2f64 VR128:$src), addr:$dst)]>, VEX; + [(store (v2f64 VR128:$src), addr:$dst)], + IIC_SSE_MOVU_P_MR>, VEX; def VMOVAPSYmr : VPSI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src), "movaps\t{$src, $dst|$dst, $src}", - [(alignedstore256 (v8f32 VR256:$src), addr:$dst)]>, VEX; + [(alignedstore256 (v8f32 VR256:$src), addr:$dst)], + IIC_SSE_MOVA_P_MR>, VEX; def VMOVAPDYmr : VPDI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src), "movapd\t{$src, $dst|$dst, $src}", - [(alignedstore256 (v4f64 VR256:$src), addr:$dst)]>, VEX; + [(alignedstore256 (v4f64 VR256:$src), addr:$dst)], + IIC_SSE_MOVA_P_MR>, VEX; def VMOVUPSYmr : VPSI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src), "movups\t{$src, $dst|$dst, $src}", - [(store (v8f32 VR256:$src), addr:$dst)]>, VEX; + [(store (v8f32 VR256:$src), addr:$dst)], + IIC_SSE_MOVU_P_MR>, VEX; def VMOVUPDYmr : VPDI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src), "movupd\t{$src, $dst|$dst, $src}", - [(store (v4f64 VR256:$src), addr:$dst)]>, VEX; + [(store (v4f64 VR256:$src), addr:$dst)], + IIC_SSE_MOVU_P_MR>, VEX; // For disassembler let isCodeGenOnly = 1 in { def VMOVAPSrr_REV : VPSI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), - "movaps\t{$src, $dst|$dst, $src}", []>, VEX; + "movaps\t{$src, $dst|$dst, $src}", [], + IIC_SSE_MOVA_P_RR>, VEX; def VMOVAPDrr_REV : VPDI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), - "movapd\t{$src, $dst|$dst, $src}", []>, VEX; + "movapd\t{$src, $dst|$dst, $src}", [], + IIC_SSE_MOVA_P_RR>, VEX; def VMOVUPSrr_REV : VPSI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), - "movups\t{$src, $dst|$dst, $src}", []>, VEX; + "movups\t{$src, $dst|$dst, $src}", [], + IIC_SSE_MOVU_P_RR>, VEX; def VMOVUPDrr_REV : VPDI<0x11, MRMDestReg, 
(outs VR128:$dst), (ins VR128:$src), - "movupd\t{$src, $dst|$dst, $src}", []>, VEX; + "movupd\t{$src, $dst|$dst, $src}", [], + IIC_SSE_MOVU_P_RR>, VEX; def VMOVAPSYrr_REV : VPSI<0x29, MRMDestReg, (outs VR256:$dst), (ins VR256:$src), - "movaps\t{$src, $dst|$dst, $src}", []>, VEX; + "movaps\t{$src, $dst|$dst, $src}", [], + IIC_SSE_MOVA_P_RR>, VEX; def VMOVAPDYrr_REV : VPDI<0x29, MRMDestReg, (outs VR256:$dst), (ins VR256:$src), - "movapd\t{$src, $dst|$dst, $src}", []>, VEX; + "movapd\t{$src, $dst|$dst, $src}", [], + IIC_SSE_MOVA_P_RR>, VEX; def VMOVUPSYrr_REV : VPSI<0x11, MRMDestReg, (outs VR256:$dst), (ins VR256:$src), - "movups\t{$src, $dst|$dst, $src}", []>, VEX; + "movups\t{$src, $dst|$dst, $src}", [], + IIC_SSE_MOVU_P_RR>, VEX; def VMOVUPDYrr_REV : VPDI<0x11, MRMDestReg, (outs VR256:$dst), (ins VR256:$src), - "movupd\t{$src, $dst|$dst, $src}", []>, VEX; + "movupd\t{$src, $dst|$dst, $src}", [], + IIC_SSE_MOVU_P_RR>, VEX; +} + +let Predicates = [HasAVX] in { +def : Pat<(v8i32 (X86vzmovl + (insert_subvector undef, (v4i32 VR128:$src), (i32 0)))), + (SUBREG_TO_REG (i32 0), (VMOVAPSrr VR128:$src), sub_xmm)>; +def : Pat<(v4i64 (X86vzmovl + (insert_subvector undef, (v2i64 VR128:$src), (i32 0)))), + (SUBREG_TO_REG (i32 0), (VMOVAPSrr VR128:$src), sub_xmm)>; +def : Pat<(v8f32 (X86vzmovl + (insert_subvector undef, (v4f32 VR128:$src), (i32 0)))), + (SUBREG_TO_REG (i32 0), (VMOVAPSrr VR128:$src), sub_xmm)>; +def : Pat<(v4f64 (X86vzmovl + (insert_subvector undef, (v2f64 VR128:$src), (i32 0)))), + (SUBREG_TO_REG (i32 0), (VMOVAPSrr VR128:$src), sub_xmm)>; } -def : Pat<(int_x86_avx_loadu_ps_256 addr:$src), (VMOVUPSYrm addr:$src)>; + def : Pat<(int_x86_avx_storeu_ps_256 addr:$dst, VR256:$src), (VMOVUPSYmr addr:$dst, VR256:$src)>; - -def : Pat<(int_x86_avx_loadu_pd_256 addr:$src), (VMOVUPDYrm addr:$src)>; def : Pat<(int_x86_avx_storeu_pd_256 addr:$dst, VR256:$src), (VMOVUPDYmr addr:$dst, VR256:$src)>; def MOVAPSmr : PSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), "movaps\t{$src, $dst|$dst, $src}", - [(alignedstore (v4f32 VR128:$src), addr:$dst)]>; + [(alignedstore (v4f32 VR128:$src), addr:$dst)], + IIC_SSE_MOVA_P_MR>; def MOVAPDmr : PDI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), "movapd\t{$src, $dst|$dst, $src}", - [(alignedstore (v2f64 VR128:$src), addr:$dst)]>; + [(alignedstore (v2f64 VR128:$src), addr:$dst)], + IIC_SSE_MOVA_P_MR>; def MOVUPSmr : PSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), "movups\t{$src, $dst|$dst, $src}", - [(store (v4f32 VR128:$src), addr:$dst)]>; + [(store (v4f32 VR128:$src), addr:$dst)], + IIC_SSE_MOVU_P_MR>; def MOVUPDmr : PDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), "movupd\t{$src, $dst|$dst, $src}", - [(store (v2f64 VR128:$src), addr:$dst)]>; + [(store (v2f64 VR128:$src), addr:$dst)], + IIC_SSE_MOVU_P_MR>; // For disassembler let isCodeGenOnly = 1 in { def MOVAPSrr_REV : PSI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), - "movaps\t{$src, $dst|$dst, $src}", []>; + "movaps\t{$src, $dst|$dst, $src}", [], + IIC_SSE_MOVA_P_RR>; def MOVAPDrr_REV : PDI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), - "movapd\t{$src, $dst|$dst, $src}", []>; + "movapd\t{$src, $dst|$dst, $src}", [], + IIC_SSE_MOVA_P_RR>; def MOVUPSrr_REV : PSI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), - "movups\t{$src, $dst|$dst, $src}", []>; + "movups\t{$src, $dst|$dst, $src}", [], + IIC_SSE_MOVU_P_RR>; def MOVUPDrr_REV : PDI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), - "movupd\t{$src, $dst|$dst, $src}", []>; + 
"movupd\t{$src, $dst|$dst, $src}", [], + IIC_SSE_MOVU_P_RR>; } let Predicates = [HasAVX] in { @@ -797,44 +1000,9 @@ let Predicates = [HasSSE2] in def : Pat<(int_x86_sse2_storeu_pd addr:$dst, VR128:$src), (MOVUPDmr addr:$dst, VR128:$src)>; -// Use movaps / movups for SSE integer load / store (one byte shorter). -// The instructions selected below are then converted to MOVDQA/MOVDQU -// during the SSE domain pass. -let Predicates = [HasSSE1] in { - def : Pat<(alignedloadv4i32 addr:$src), - (MOVAPSrm addr:$src)>; - def : Pat<(loadv4i32 addr:$src), - (MOVUPSrm addr:$src)>; - def : Pat<(alignedloadv2i64 addr:$src), - (MOVAPSrm addr:$src)>; - def : Pat<(loadv2i64 addr:$src), - (MOVUPSrm addr:$src)>; - - def : Pat<(alignedstore (v2i64 VR128:$src), addr:$dst), - (MOVAPSmr addr:$dst, VR128:$src)>; - def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst), - (MOVAPSmr addr:$dst, VR128:$src)>; - def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst), - (MOVAPSmr addr:$dst, VR128:$src)>; - def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst), - (MOVAPSmr addr:$dst, VR128:$src)>; - def : Pat<(store (v2i64 VR128:$src), addr:$dst), - (MOVUPSmr addr:$dst, VR128:$src)>; - def : Pat<(store (v4i32 VR128:$src), addr:$dst), - (MOVUPSmr addr:$dst, VR128:$src)>; - def : Pat<(store (v8i16 VR128:$src), addr:$dst), - (MOVUPSmr addr:$dst, VR128:$src)>; - def : Pat<(store (v16i8 VR128:$src), addr:$dst), - (MOVUPSmr addr:$dst, VR128:$src)>; -} - // Use vmovaps/vmovups for AVX integer load/store. let Predicates = [HasAVX] in { // 128-bit load/store - def : Pat<(alignedloadv4i32 addr:$src), - (VMOVAPSrm addr:$src)>; - def : Pat<(loadv4i32 addr:$src), - (VMOVUPSrm addr:$src)>; def : Pat<(alignedloadv2i64 addr:$src), (VMOVAPSrm addr:$src)>; def : Pat<(loadv2i64 addr:$src), @@ -862,10 +1030,6 @@ let Predicates = [HasAVX] in { (VMOVAPSYrm addr:$src)>; def : Pat<(loadv4i64 addr:$src), (VMOVUPSYrm addr:$src)>; - def : Pat<(alignedloadv8i32 addr:$src), - (VMOVAPSYrm addr:$src)>; - def : Pat<(loadv8i32 addr:$src), - (VMOVUPSYrm addr:$src)>; def : Pat<(alignedstore256 (v4i64 VR256:$src), addr:$dst), (VMOVAPSYmr addr:$dst, VR256:$src)>; def : Pat<(alignedstore256 (v8i32 VR256:$src), addr:$dst), @@ -884,36 +1048,71 @@ let Predicates = [HasAVX] in { (VMOVUPSYmr addr:$dst, VR256:$src)>; } +// Use movaps / movups for SSE integer load / store (one byte shorter). +// The instructions selected below are then converted to MOVDQA/MOVDQU +// during the SSE domain pass. +let Predicates = [HasSSE1] in { + def : Pat<(alignedloadv2i64 addr:$src), + (MOVAPSrm addr:$src)>; + def : Pat<(loadv2i64 addr:$src), + (MOVUPSrm addr:$src)>; + + def : Pat<(alignedstore (v2i64 VR128:$src), addr:$dst), + (MOVAPSmr addr:$dst, VR128:$src)>; + def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst), + (MOVAPSmr addr:$dst, VR128:$src)>; + def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst), + (MOVAPSmr addr:$dst, VR128:$src)>; + def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst), + (MOVAPSmr addr:$dst, VR128:$src)>; + def : Pat<(store (v2i64 VR128:$src), addr:$dst), + (MOVUPSmr addr:$dst, VR128:$src)>; + def : Pat<(store (v4i32 VR128:$src), addr:$dst), + (MOVUPSmr addr:$dst, VR128:$src)>; + def : Pat<(store (v8i16 VR128:$src), addr:$dst), + (MOVUPSmr addr:$dst, VR128:$src)>; + def : Pat<(store (v16i8 VR128:$src), addr:$dst), + (MOVUPSmr addr:$dst, VR128:$src)>; +} + // Alias instruction to do FR32 or FR64 reg-to-reg copy using movaps. Upper // bits are disregarded. FIXME: Set encoding to pseudo! 
let neverHasSideEffects = 1 in { -def FsMOVAPSrr : PSI<0x28, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src), - "movaps\t{$src, $dst|$dst, $src}", []>; -def FsMOVAPDrr : PDI<0x28, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src), - "movapd\t{$src, $dst|$dst, $src}", []>; def FsVMOVAPSrr : VPSI<0x28, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src), - "movaps\t{$src, $dst|$dst, $src}", []>, VEX; + "movaps\t{$src, $dst|$dst, $src}", [], + IIC_SSE_MOVA_P_RR>, VEX; def FsVMOVAPDrr : VPDI<0x28, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src), - "movapd\t{$src, $dst|$dst, $src}", []>, VEX; + "movapd\t{$src, $dst|$dst, $src}", [], + IIC_SSE_MOVA_P_RR>, VEX; +def FsMOVAPSrr : PSI<0x28, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src), + "movaps\t{$src, $dst|$dst, $src}", [], + IIC_SSE_MOVA_P_RR>; +def FsMOVAPDrr : PDI<0x28, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src), + "movapd\t{$src, $dst|$dst, $src}", [], + IIC_SSE_MOVA_P_RR>; } // Alias instruction to load FR32 or FR64 from f128mem using movaps. Upper // bits are disregarded. FIXME: Set encoding to pseudo! let canFoldAsLoad = 1, isReMaterializable = 1 in { -def FsMOVAPSrm : PSI<0x28, MRMSrcMem, (outs FR32:$dst), (ins f128mem:$src), - "movaps\t{$src, $dst|$dst, $src}", - [(set FR32:$dst, (alignedloadfsf32 addr:$src))]>; -def FsMOVAPDrm : PDI<0x28, MRMSrcMem, (outs FR64:$dst), (ins f128mem:$src), - "movapd\t{$src, $dst|$dst, $src}", - [(set FR64:$dst, (alignedloadfsf64 addr:$src))]>; let isCodeGenOnly = 1 in { def FsVMOVAPSrm : VPSI<0x28, MRMSrcMem, (outs FR32:$dst), (ins f128mem:$src), "movaps\t{$src, $dst|$dst, $src}", - [(set FR32:$dst, (alignedloadfsf32 addr:$src))]>, VEX; + [(set FR32:$dst, (alignedloadfsf32 addr:$src))], + IIC_SSE_MOVA_P_RM>, VEX; def FsVMOVAPDrm : VPDI<0x28, MRMSrcMem, (outs FR64:$dst), (ins f128mem:$src), "movapd\t{$src, $dst|$dst, $src}", - [(set FR64:$dst, (alignedloadfsf64 addr:$src))]>, VEX; + [(set FR64:$dst, (alignedloadfsf64 addr:$src))], + IIC_SSE_MOVA_P_RM>, VEX; } +def FsMOVAPSrm : PSI<0x28, MRMSrcMem, (outs FR32:$dst), (ins f128mem:$src), + "movaps\t{$src, $dst|$dst, $src}", + [(set FR32:$dst, (alignedloadfsf32 addr:$src))], + IIC_SSE_MOVA_P_RM>; +def FsMOVAPDrm : PDI<0x28, MRMSrcMem, (outs FR64:$dst), (ins f128mem:$src), + "movapd\t{$src, $dst|$dst, $src}", + [(set FR64:$dst, (alignedloadfsf64 addr:$src))], + IIC_SSE_MOVA_P_RM>; } //===----------------------------------------------------------------------===// @@ -921,94 +1120,68 @@ let isCodeGenOnly = 1 in { //===----------------------------------------------------------------------===// multiclass sse12_mov_hilo_packed<bits<8>opc, RegisterClass RC, - PatFrag mov_frag, string base_opc, - string asm_opr> { + SDNode psnode, SDNode pdnode, string base_opc, + string asm_opr, InstrItinClass itin> { def PSrm : PI<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2), !strconcat(base_opc, "s", asm_opr), [(set RC:$dst, - (mov_frag RC:$src1, + (psnode RC:$src1, (bc_v4f32 (v2f64 (scalar_to_vector (loadf64 addr:$src2))))))], - SSEPackedSingle>, TB; + itin, SSEPackedSingle>, TB; def PDrm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, f64mem:$src2), !strconcat(base_opc, "d", asm_opr), - [(set RC:$dst, (v2f64 (mov_frag RC:$src1, + [(set RC:$dst, (v2f64 (pdnode RC:$src1, (scalar_to_vector (loadf64 addr:$src2)))))], - SSEPackedDouble>, TB, OpSize; + itin, SSEPackedDouble>, TB, OpSize; } let AddedComplexity = 20 in { - defm VMOVL : sse12_mov_hilo_packed<0x12, VR128, movlp, "movlp", - "\t{$src2, $src1, $dst|$dst, $src1, $src2}">, VEX_4V; + defm VMOVL : 
sse12_mov_hilo_packed<0x12, VR128, X86Movlps, X86Movlpd, "movlp", + "\t{$src2, $src1, $dst|$dst, $src1, $src2}", + IIC_SSE_MOV_LH>, VEX_4V; } let Constraints = "$src1 = $dst", AddedComplexity = 20 in { - defm MOVL : sse12_mov_hilo_packed<0x12, VR128, movlp, "movlp", - "\t{$src2, $dst|$dst, $src2}">; + defm MOVL : sse12_mov_hilo_packed<0x12, VR128, X86Movlps, X86Movlpd, "movlp", + "\t{$src2, $dst|$dst, $src2}", + IIC_SSE_MOV_LH>; } def VMOVLPSmr : VPSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), "movlps\t{$src, $dst|$dst, $src}", [(store (f64 (vector_extract (bc_v2f64 (v4f32 VR128:$src)), - (iPTR 0))), addr:$dst)]>, VEX; + (iPTR 0))), addr:$dst)], + IIC_SSE_MOV_LH>, VEX; def VMOVLPDmr : VPDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), "movlpd\t{$src, $dst|$dst, $src}", [(store (f64 (vector_extract (v2f64 VR128:$src), - (iPTR 0))), addr:$dst)]>, VEX; + (iPTR 0))), addr:$dst)], + IIC_SSE_MOV_LH>, VEX; def MOVLPSmr : PSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), "movlps\t{$src, $dst|$dst, $src}", [(store (f64 (vector_extract (bc_v2f64 (v4f32 VR128:$src)), - (iPTR 0))), addr:$dst)]>; + (iPTR 0))), addr:$dst)], + IIC_SSE_MOV_LH>; def MOVLPDmr : PDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), "movlpd\t{$src, $dst|$dst, $src}", [(store (f64 (vector_extract (v2f64 VR128:$src), - (iPTR 0))), addr:$dst)]>; + (iPTR 0))), addr:$dst)], + IIC_SSE_MOV_LH>; let Predicates = [HasAVX] in { - let AddedComplexity = 20 in { - // vector_shuffle v1, (load v2) <4, 5, 2, 3> using MOVLPS - def : Pat<(v4f32 (movlp VR128:$src1, (load addr:$src2))), - (VMOVLPSrm VR128:$src1, addr:$src2)>; - def : Pat<(v4i32 (movlp VR128:$src1, (load addr:$src2))), - (VMOVLPSrm VR128:$src1, addr:$src2)>; - // vector_shuffle v1, (load v2) <2, 1> using MOVLPS - def : Pat<(v2f64 (movlp VR128:$src1, (load addr:$src2))), - (VMOVLPDrm VR128:$src1, addr:$src2)>; - def : Pat<(v2i64 (movlp VR128:$src1, (load addr:$src2))), - (VMOVLPDrm VR128:$src1, addr:$src2)>; - } - - // (store (vector_shuffle (load addr), v2, <4, 5, 2, 3>), addr) using MOVLPS - def : Pat<(store (v4f32 (movlp (load addr:$src1), VR128:$src2)), addr:$src1), - (VMOVLPSmr addr:$src1, VR128:$src2)>; - def : Pat<(store (v4i32 (movlp (bc_v4i32 (loadv2i64 addr:$src1)), - VR128:$src2)), addr:$src1), - (VMOVLPSmr addr:$src1, VR128:$src2)>; - - // (store (vector_shuffle (load addr), v2, <2, 1>), addr) using MOVLPS - def : Pat<(store (v2f64 (movlp (load addr:$src1), VR128:$src2)), addr:$src1), - (VMOVLPDmr addr:$src1, VR128:$src2)>; - def : Pat<(store (v2i64 (movlp (load addr:$src1), VR128:$src2)), addr:$src1), - (VMOVLPDmr addr:$src1, VR128:$src2)>; - // Shuffle with VMOVLPS def : Pat<(v4f32 (X86Movlps VR128:$src1, (load addr:$src2))), (VMOVLPSrm VR128:$src1, addr:$src2)>; def : Pat<(v4i32 (X86Movlps VR128:$src1, (load addr:$src2))), (VMOVLPSrm VR128:$src1, addr:$src2)>; - def : Pat<(X86Movlps VR128:$src1, - (bc_v4f32 (v2f64 (scalar_to_vector (loadf64 addr:$src2))))), - (VMOVLPSrm VR128:$src1, addr:$src2)>; // Shuffle with VMOVLPD def : Pat<(v2f64 (X86Movlpd VR128:$src1, (load addr:$src2))), (VMOVLPDrm VR128:$src1, addr:$src2)>; def : Pat<(v2i64 (X86Movlpd VR128:$src1, (load addr:$src2))), (VMOVLPDrm VR128:$src1, addr:$src2)>; - def : Pat<(v2f64 (X86Movlpd VR128:$src1, - (scalar_to_vector (loadf64 addr:$src2)))), - (VMOVLPDrm VR128:$src1, addr:$src2)>; // Store patterns def : Pat<(store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)), @@ -1026,19 +1199,9 @@ let Predicates = [HasAVX] in { } let Predicates = [HasSSE1] in { - let 
AddedComplexity = 20 in { - // vector_shuffle v1, (load v2) <4, 5, 2, 3> using MOVLPS - def : Pat<(v4f32 (movlp VR128:$src1, (load addr:$src2))), - (MOVLPSrm VR128:$src1, addr:$src2)>; - def : Pat<(v4i32 (movlp VR128:$src1, (load addr:$src2))), - (MOVLPSrm VR128:$src1, addr:$src2)>; - } - // (store (vector_shuffle (load addr), v2, <4, 5, 2, 3>), addr) using MOVLPS - def : Pat<(store (v4f32 (movlp (load addr:$src1), VR128:$src2)), addr:$src1), - (MOVLPSmr addr:$src1, VR128:$src2)>; - def : Pat<(store (v4i32 (movlp (bc_v4i32 (loadv2i64 addr:$src1)), - VR128:$src2)), addr:$src1), + def : Pat<(store (i64 (vector_extract (bc_v2i64 (v4f32 VR128:$src2)), + (iPTR 0))), addr:$src1), (MOVLPSmr addr:$src1, VR128:$src2)>; // Shuffle with MOVLPS @@ -1047,7 +1210,7 @@ let Predicates = [HasSSE1] in { def : Pat<(v4i32 (X86Movlps VR128:$src1, (load addr:$src2))), (MOVLPSrm VR128:$src1, addr:$src2)>; def : Pat<(X86Movlps VR128:$src1, - (bc_v4f32 (v2f64 (scalar_to_vector (loadf64 addr:$src2))))), + (bc_v4f32 (v2i64 (scalar_to_vector (loadi64 addr:$src2))))), (MOVLPSrm VR128:$src1, addr:$src2)>; // Store patterns @@ -1061,28 +1224,11 @@ let Predicates = [HasSSE1] in { } let Predicates = [HasSSE2] in { - let AddedComplexity = 20 in { - // vector_shuffle v1, (load v2) <2, 1> using MOVLPS - def : Pat<(v2f64 (movlp VR128:$src1, (load addr:$src2))), - (MOVLPDrm VR128:$src1, addr:$src2)>; - def : Pat<(v2i64 (movlp VR128:$src1, (load addr:$src2))), - (MOVLPDrm VR128:$src1, addr:$src2)>; - } - - // (store (vector_shuffle (load addr), v2, <2, 1>), addr) using MOVLPS - def : Pat<(store (v2f64 (movlp (load addr:$src1), VR128:$src2)), addr:$src1), - (MOVLPDmr addr:$src1, VR128:$src2)>; - def : Pat<(store (v2i64 (movlp (load addr:$src1), VR128:$src2)), addr:$src1), - (MOVLPDmr addr:$src1, VR128:$src2)>; - // Shuffle with MOVLPD def : Pat<(v2f64 (X86Movlpd VR128:$src1, (load addr:$src2))), (MOVLPDrm VR128:$src1, addr:$src2)>; def : Pat<(v2i64 (X86Movlpd VR128:$src1, (load addr:$src2))), (MOVLPDrm VR128:$src1, addr:$src2)>; - def : Pat<(v2f64 (X86Movlpd VR128:$src1, - (scalar_to_vector (loadf64 addr:$src2)))), - (MOVLPDrm VR128:$src1, addr:$src2)>; // Store patterns def : Pat<(store (v2f64 (X86Movlpd (load addr:$src1), VR128:$src2)), @@ -1098,12 +1244,14 @@ let Predicates = [HasSSE2] in { //===----------------------------------------------------------------------===// let AddedComplexity = 20 in { - defm VMOVH : sse12_mov_hilo_packed<0x16, VR128, movlhps, "movhp", - "\t{$src2, $src1, $dst|$dst, $src1, $src2}">, VEX_4V; + defm VMOVH : sse12_mov_hilo_packed<0x16, VR128, X86Movlhps, X86Movlhpd, "movhp", + "\t{$src2, $src1, $dst|$dst, $src1, $src2}", + IIC_SSE_MOV_LH>, VEX_4V; } let Constraints = "$src1 = $dst", AddedComplexity = 20 in { - defm MOVH : sse12_mov_hilo_packed<0x16, VR128, movlhps, "movhp", - "\t{$src2, $dst|$dst, $src2}">; + defm MOVH : sse12_mov_hilo_packed<0x16, VR128, X86Movlhps, X86Movlhpd, "movhp", + "\t{$src2, $dst|$dst, $src2}", + IIC_SSE_MOV_LH>; } // v2f64 extract element 1 is always custom lowered to unpack high to low @@ -1111,94 +1259,62 @@ let Constraints = "$src1 = $dst", AddedComplexity = 20 in { def VMOVHPSmr : VPSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), "movhps\t{$src, $dst|$dst, $src}", [(store (f64 (vector_extract - (unpckh (bc_v2f64 (v4f32 VR128:$src)), - (undef)), (iPTR 0))), addr:$dst)]>, - VEX; + (X86Unpckh (bc_v2f64 (v4f32 VR128:$src)), + (bc_v2f64 (v4f32 VR128:$src))), + (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>, VEX; def VMOVHPDmr : VPDI<0x17, MRMDestMem, (outs), (ins 
f64mem:$dst, VR128:$src), "movhpd\t{$src, $dst|$dst, $src}", [(store (f64 (vector_extract - (v2f64 (unpckh VR128:$src, (undef))), - (iPTR 0))), addr:$dst)]>, - VEX; + (v2f64 (X86Unpckh VR128:$src, VR128:$src)), + (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>, VEX; def MOVHPSmr : PSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), "movhps\t{$src, $dst|$dst, $src}", [(store (f64 (vector_extract - (unpckh (bc_v2f64 (v4f32 VR128:$src)), - (undef)), (iPTR 0))), addr:$dst)]>; + (X86Unpckh (bc_v2f64 (v4f32 VR128:$src)), + (bc_v2f64 (v4f32 VR128:$src))), + (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>; def MOVHPDmr : PDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), "movhpd\t{$src, $dst|$dst, $src}", [(store (f64 (vector_extract - (v2f64 (unpckh VR128:$src, (undef))), - (iPTR 0))), addr:$dst)]>; + (v2f64 (X86Unpckh VR128:$src, VR128:$src)), + (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>; let Predicates = [HasAVX] in { // VMOVHPS patterns - def : Pat<(movlhps VR128:$src1, (bc_v4i32 (v2i64 (X86vzload addr:$src2)))), - (VMOVHPSrm (v4i32 VR128:$src1), addr:$src2)>; def : Pat<(X86Movlhps VR128:$src1, - (bc_v4f32 (v2f64 (scalar_to_vector (loadf64 addr:$src2))))), + (bc_v4f32 (v2i64 (scalar_to_vector (loadi64 addr:$src2))))), (VMOVHPSrm VR128:$src1, addr:$src2)>; def : Pat<(X86Movlhps VR128:$src1, (bc_v4i32 (v2i64 (X86vzload addr:$src2)))), (VMOVHPSrm VR128:$src1, addr:$src2)>; - // FIXME: Instead of X86Unpcklpd, there should be a X86Movlhpd here, the problem - // is during lowering, where it's not possible to recognize the load fold cause - // it has two uses through a bitcast. One use disappears at isel time and the - // fold opportunity reappears. - def : Pat<(v2f64 (X86Unpcklpd VR128:$src1, + // FIXME: Instead of X86Unpckl, there should be a X86Movlhpd here, the problem + // is during lowering, where it's not possible to recognize the load fold + // cause it has two uses through a bitcast. One use disappears at isel time + // and the fold opportunity reappears. + def : Pat<(v2f64 (X86Unpckl VR128:$src1, (scalar_to_vector (loadf64 addr:$src2)))), (VMOVHPDrm VR128:$src1, addr:$src2)>; - - // FIXME: This should be matched by a X86Movhpd instead. Same as above - def : Pat<(v2f64 (X86Movlhpd VR128:$src1, - (scalar_to_vector (loadf64 addr:$src2)))), - (VMOVHPDrm VR128:$src1, addr:$src2)>; - - // Store patterns - def : Pat<(store (f64 (vector_extract - (v2f64 (X86Unpckhps VR128:$src, (undef))), (iPTR 0))), addr:$dst), - (VMOVHPSmr addr:$dst, VR128:$src)>; - def : Pat<(store (f64 (vector_extract - (v2f64 (X86Unpckhpd VR128:$src, (undef))), (iPTR 0))), addr:$dst), - (VMOVHPDmr addr:$dst, VR128:$src)>; } let Predicates = [HasSSE1] in { // MOVHPS patterns - def : Pat<(movlhps VR128:$src1, (bc_v4i32 (v2i64 (X86vzload addr:$src2)))), - (MOVHPSrm (v4i32 VR128:$src1), addr:$src2)>; def : Pat<(X86Movlhps VR128:$src1, - (bc_v4f32 (v2f64 (scalar_to_vector (loadf64 addr:$src2))))), + (bc_v4f32 (v2i64 (scalar_to_vector (loadi64 addr:$src2))))), (MOVHPSrm VR128:$src1, addr:$src2)>; def : Pat<(X86Movlhps VR128:$src1, (bc_v4f32 (v2i64 (X86vzload addr:$src2)))), (MOVHPSrm VR128:$src1, addr:$src2)>; - - // Store patterns - def : Pat<(store (f64 (vector_extract - (v2f64 (X86Unpckhps VR128:$src, (undef))), (iPTR 0))), addr:$dst), - (MOVHPSmr addr:$dst, VR128:$src)>; } let Predicates = [HasSSE2] in { - // FIXME: Instead of X86Unpcklpd, there should be a X86Movlhpd here, the problem - // is during lowering, where it's not possible to recognize the load fold cause - // it has two uses through a bitcast. 
One use disappears at isel time and the - // fold opportunity reappears. - def : Pat<(v2f64 (X86Unpcklpd VR128:$src1, + // FIXME: Instead of X86Unpckl, there should be a X86Movlhpd here, the problem + // is during lowering, where it's not possible to recognize the load fold + // cause it has two uses through a bitcast. One use disappears at isel time + // and the fold opportunity reappears. + def : Pat<(v2f64 (X86Unpckl VR128:$src1, (scalar_to_vector (loadf64 addr:$src2)))), (MOVHPDrm VR128:$src1, addr:$src2)>; - - // FIXME: This should be matched by a X86Movhpd instead. Same as above - def : Pat<(v2f64 (X86Movlhpd VR128:$src1, - (scalar_to_vector (loadf64 addr:$src2)))), - (MOVHPDrm VR128:$src1, addr:$src2)>; - - // Store patterns - def : Pat<(store (f64 (vector_extract - (v2f64 (X86Unpckhpd VR128:$src, (undef))), (iPTR 0))),addr:$dst), - (MOVHPDmr addr:$dst, VR128:$src)>; } //===----------------------------------------------------------------------===// @@ -1210,13 +1326,15 @@ let AddedComplexity = 20 in { (ins VR128:$src1, VR128:$src2), "movlhps\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set VR128:$dst, - (v4f32 (movlhps VR128:$src1, VR128:$src2)))]>, + (v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))], + IIC_SSE_MOV_LH>, VEX_4V; def VMOVHLPSrr : VPSI<0x12, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), "movhlps\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set VR128:$dst, - (v4f32 (movhlps VR128:$src1, VR128:$src2)))]>, + (v4f32 (X86Movhlps VR128:$src1, VR128:$src2)))], + IIC_SSE_MOV_LH>, VEX_4V; } let Constraints = "$src1 = $dst", AddedComplexity = 20 in { @@ -1224,86 +1342,36 @@ let Constraints = "$src1 = $dst", AddedComplexity = 20 in { (ins VR128:$src1, VR128:$src2), "movlhps\t{$src2, $dst|$dst, $src2}", [(set VR128:$dst, - (v4f32 (movlhps VR128:$src1, VR128:$src2)))]>; + (v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))], + IIC_SSE_MOV_LH>; def MOVHLPSrr : PSI<0x12, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), "movhlps\t{$src2, $dst|$dst, $src2}", [(set VR128:$dst, - (v4f32 (movhlps VR128:$src1, VR128:$src2)))]>; + (v4f32 (X86Movhlps VR128:$src1, VR128:$src2)))], + IIC_SSE_MOV_LH>; } let Predicates = [HasAVX] in { // MOVLHPS patterns - let AddedComplexity = 20 in { - def : Pat<(v4f32 (movddup VR128:$src, (undef))), - (VMOVLHPSrr (v4f32 VR128:$src), (v4f32 VR128:$src))>; - def : Pat<(v2i64 (movddup VR128:$src, (undef))), - (VMOVLHPSrr (v2i64 VR128:$src), (v2i64 VR128:$src))>; - - // vector_shuffle v1, v2 <0, 1, 4, 5> using MOVLHPS - def : Pat<(v4i32 (movlhps VR128:$src1, VR128:$src2)), - (VMOVLHPSrr VR128:$src1, VR128:$src2)>; - } - def : Pat<(v4f32 (X86Movlhps VR128:$src1, VR128:$src2)), - (VMOVLHPSrr VR128:$src1, VR128:$src2)>; def : Pat<(v4i32 (X86Movlhps VR128:$src1, VR128:$src2)), (VMOVLHPSrr VR128:$src1, VR128:$src2)>; def : Pat<(v2i64 (X86Movlhps VR128:$src1, VR128:$src2)), (VMOVLHPSrr (v2i64 VR128:$src1), VR128:$src2)>; // MOVHLPS patterns - let AddedComplexity = 20 in { - // vector_shuffle v1, v2 <6, 7, 2, 3> using MOVHLPS - def : Pat<(v4i32 (movhlps VR128:$src1, VR128:$src2)), - (VMOVHLPSrr VR128:$src1, VR128:$src2)>; - - // vector_shuffle v1, undef <2, ?, ?, ?> using MOVHLPS - def : Pat<(v4f32 (movhlps_undef VR128:$src1, (undef))), - (VMOVHLPSrr VR128:$src1, VR128:$src1)>; - def : Pat<(v4i32 (movhlps_undef VR128:$src1, (undef))), - (VMOVHLPSrr VR128:$src1, VR128:$src1)>; - } - - def : Pat<(v4f32 (X86Movhlps VR128:$src1, VR128:$src2)), - (VMOVHLPSrr VR128:$src1, VR128:$src2)>; def : Pat<(v4i32 (X86Movhlps VR128:$src1, VR128:$src2)), (VMOVHLPSrr 
VR128:$src1, VR128:$src2)>; } let Predicates = [HasSSE1] in { // MOVLHPS patterns - let AddedComplexity = 20 in { - def : Pat<(v4f32 (movddup VR128:$src, (undef))), - (MOVLHPSrr (v4f32 VR128:$src), (v4f32 VR128:$src))>; - def : Pat<(v2i64 (movddup VR128:$src, (undef))), - (MOVLHPSrr (v2i64 VR128:$src), (v2i64 VR128:$src))>; - - // vector_shuffle v1, v2 <0, 1, 4, 5> using MOVLHPS - def : Pat<(v4i32 (movlhps VR128:$src1, VR128:$src2)), - (MOVLHPSrr VR128:$src1, VR128:$src2)>; - } - def : Pat<(v4f32 (X86Movlhps VR128:$src1, VR128:$src2)), - (MOVLHPSrr VR128:$src1, VR128:$src2)>; def : Pat<(v4i32 (X86Movlhps VR128:$src1, VR128:$src2)), (MOVLHPSrr VR128:$src1, VR128:$src2)>; def : Pat<(v2i64 (X86Movlhps VR128:$src1, VR128:$src2)), (MOVLHPSrr (v2i64 VR128:$src1), VR128:$src2)>; // MOVHLPS patterns - let AddedComplexity = 20 in { - // vector_shuffle v1, v2 <6, 7, 2, 3> using MOVHLPS - def : Pat<(v4i32 (movhlps VR128:$src1, VR128:$src2)), - (MOVHLPSrr VR128:$src1, VR128:$src2)>; - - // vector_shuffle v1, undef <2, ?, ?, ?> using MOVHLPS - def : Pat<(v4f32 (movhlps_undef VR128:$src1, (undef))), - (MOVHLPSrr VR128:$src1, VR128:$src1)>; - def : Pat<(v4i32 (movhlps_undef VR128:$src1, (undef))), - (MOVHLPSrr VR128:$src1, VR128:$src1)>; - } - - def : Pat<(v4f32 (X86Movhlps VR128:$src1, VR128:$src2)), - (MOVHLPSrr VR128:$src1, VR128:$src2)>; def : Pat<(v4i32 (X86Movhlps VR128:$src1, VR128:$src2)), (MOVHLPSrr VR128:$src1, VR128:$src2)>; } @@ -1312,70 +1380,97 @@ let Predicates = [HasSSE1] in { // SSE 1 & 2 - Conversion Instructions //===----------------------------------------------------------------------===// +def SSE_CVT_PD : OpndItins< + IIC_SSE_CVT_PD_RR, IIC_SSE_CVT_PD_RM +>; + +def SSE_CVT_PS : OpndItins< + IIC_SSE_CVT_PS_RR, IIC_SSE_CVT_PS_RM +>; + +def SSE_CVT_Scalar : OpndItins< + IIC_SSE_CVT_Scalar_RR, IIC_SSE_CVT_Scalar_RM +>; + +def SSE_CVT_SS2SI_32 : OpndItins< + IIC_SSE_CVT_SS2SI32_RR, IIC_SSE_CVT_SS2SI32_RM +>; + +def SSE_CVT_SS2SI_64 : OpndItins< + IIC_SSE_CVT_SS2SI64_RR, IIC_SSE_CVT_SS2SI64_RM +>; + +def SSE_CVT_SD2SI : OpndItins< + IIC_SSE_CVT_SD2SI_RR, IIC_SSE_CVT_SD2SI_RM +>; + multiclass sse12_cvt_s<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC, SDNode OpNode, X86MemOperand x86memop, PatFrag ld_frag, - string asm> { + string asm, OpndItins itins> { def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), asm, - [(set DstRC:$dst, (OpNode SrcRC:$src))]>; + [(set DstRC:$dst, (OpNode SrcRC:$src))], + itins.rr>; def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), asm, - [(set DstRC:$dst, (OpNode (ld_frag addr:$src)))]>; -} - -multiclass sse12_cvt_s_np<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC, - X86MemOperand x86memop, string asm> { - def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), asm, []>; - let mayLoad = 1 in - def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), asm, []>; + [(set DstRC:$dst, (OpNode (ld_frag addr:$src)))], + itins.rm>; } multiclass sse12_cvt_p<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC, SDNode OpNode, X86MemOperand x86memop, PatFrag ld_frag, - string asm, Domain d> { + string asm, Domain d, OpndItins itins> { def rr : PI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), asm, - [(set DstRC:$dst, (OpNode SrcRC:$src))], d>; + [(set DstRC:$dst, (OpNode SrcRC:$src))], + itins.rr, d>; def rm : PI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), asm, - [(set DstRC:$dst, (OpNode (ld_frag addr:$src)))], d>; + [(set DstRC:$dst, (OpNode (ld_frag addr:$src)))], + itins.rm, d>; } 
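[Editor's note, not part of the diff: the scheduling changes in the hunks above all follow one pattern. Each SSE/AVX instruction family gets an OpndItins bundle pairing a register-register itinerary with a register-memory one, and the multiclasses (sse12_cvt_s, sse12_cvt_p, sse12_mov_packed, ...) now take that bundle and pick itins.rr or itins.rm for the form they emit. The OpndItins helper itself is not shown in this hunk; assuming it is the usual two-member wrapper, the moving parts look roughly like this sketch, shown only for orientation:

    // Sketch only -- OpndItins as implied by the itins.rr / itins.rm uses above.
    class OpndItins<InstrItinClass arg_rr, InstrItinClass arg_rm> {
      InstrItinClass rr = arg_rr;   // itinerary for the reg-reg encoding
      InstrItinClass rm = arg_rm;   // itinerary for the reg-mem (load) encoding
    }

    // One bundle per family (these defs do appear in the hunk above), e.g.
    //   def SSE_CVT_SD2SI : OpndItins<IIC_SSE_CVT_SD2SI_RR, IIC_SSE_CVT_SD2SI_RM>;
    // so an instantiation such as
    //   defm CVTTSD2SI : sse12_cvt_s<..., "cvttsd2si...", SSE_CVT_SD2SI>, XD;
    // hands IIC_SSE_CVT_SD2SI_RR to its reg-reg variant and
    // IIC_SSE_CVT_SD2SI_RM to its load variant, without repeating the
    // itineraries at every individual def.]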
multiclass sse12_vcvt_avx<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC, X86MemOperand x86memop, string asm> { +let neverHasSideEffects = 1 in { def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src), !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>; let mayLoad = 1 in def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins DstRC:$src1, x86memop:$src), !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>; +} // neverHasSideEffects = 1 } defm VCVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, fp_to_sint, f32mem, loadf32, - "cvttss2si\t{$src, $dst|$dst, $src}">, XS, VEX, - VEX_LIG; + "cvttss2si\t{$src, $dst|$dst, $src}", + SSE_CVT_SS2SI_32>, + XS, VEX, VEX_LIG; defm VCVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, fp_to_sint, f32mem, loadf32, - "cvttss2si\t{$src, $dst|$dst, $src}">, XS, VEX, - VEX_W, VEX_LIG; + "cvttss2si\t{$src, $dst|$dst, $src}", + SSE_CVT_SS2SI_64>, + XS, VEX, VEX_W, VEX_LIG; defm VCVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, fp_to_sint, f64mem, loadf64, - "cvttsd2si\t{$src, $dst|$dst, $src}">, XD, VEX, - VEX_LIG; + "cvttsd2si\t{$src, $dst|$dst, $src}", + SSE_CVT_SD2SI>, + XD, VEX, VEX_LIG; defm VCVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, fp_to_sint, f64mem, loadf64, - "cvttsd2si\t{$src, $dst|$dst, $src}">, XD, - VEX, VEX_W, VEX_LIG; + "cvttsd2si\t{$src, $dst|$dst, $src}", + SSE_CVT_SD2SI>, + XD, VEX, VEX_W, VEX_LIG; // The assembler can recognize rr 64-bit instructions by seeing a rxx // register, but the same isn't true when only using memory operands, // provide other assembly "l" and "q" forms to address this explicitly // where appropriate to do so. -defm VCVTSI2SS : sse12_vcvt_avx<0x2A, GR32, FR32, i32mem, "cvtsi2ss">, XS, - VEX_4V, VEX_LIG; -defm VCVTSI2SS64 : sse12_vcvt_avx<0x2A, GR64, FR32, i64mem, "cvtsi2ss{q}">, XS, - VEX_4V, VEX_W, VEX_LIG; -defm VCVTSI2SD : sse12_vcvt_avx<0x2A, GR32, FR64, i32mem, "cvtsi2sd">, XD, - VEX_4V, VEX_LIG; -defm VCVTSI2SDL : sse12_vcvt_avx<0x2A, GR32, FR64, i32mem, "cvtsi2sd{l}">, XD, - VEX_4V, VEX_LIG; -defm VCVTSI2SD64 : sse12_vcvt_avx<0x2A, GR64, FR64, i64mem, "cvtsi2sd{q}">, XD, - VEX_4V, VEX_W, VEX_LIG; - -let Predicates = [HasAVX] in { +defm VCVTSI2SS : sse12_vcvt_avx<0x2A, GR32, FR32, i32mem, "cvtsi2ss">, + XS, VEX_4V, VEX_LIG; +defm VCVTSI2SS64 : sse12_vcvt_avx<0x2A, GR64, FR32, i64mem, "cvtsi2ss{q}">, + XS, VEX_4V, VEX_W, VEX_LIG; +defm VCVTSI2SD : sse12_vcvt_avx<0x2A, GR32, FR64, i32mem, "cvtsi2sd">, + XD, VEX_4V, VEX_LIG; +defm VCVTSI2SDL : sse12_vcvt_avx<0x2A, GR32, FR64, i32mem, "cvtsi2sd{l}">, + XD, VEX_4V, VEX_LIG; +defm VCVTSI2SD64 : sse12_vcvt_avx<0x2A, GR64, FR64, i64mem, "cvtsi2sd{q}">, + XD, VEX_4V, VEX_W, VEX_LIG; + +let Predicates = [HasAVX], AddedComplexity = 1 in { def : Pat<(f32 (sint_to_fp (loadi32 addr:$src))), (VCVTSI2SSrm (f32 (IMPLICIT_DEF)), addr:$src)>; def : Pat<(f32 (sint_to_fp (loadi64 addr:$src))), @@ -1396,169 +1491,185 @@ let Predicates = [HasAVX] in { } defm CVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, fp_to_sint, f32mem, loadf32, - "cvttss2si\t{$src, $dst|$dst, $src}">, XS; + "cvttss2si\t{$src, $dst|$dst, $src}", + SSE_CVT_SS2SI_32>, XS; defm CVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, fp_to_sint, f32mem, loadf32, - "cvttss2si{q}\t{$src, $dst|$dst, $src}">, XS, REX_W; + "cvttss2si{q}\t{$src, $dst|$dst, $src}", + SSE_CVT_SS2SI_64>, XS, REX_W; defm CVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, fp_to_sint, f64mem, loadf64, - "cvttsd2si\t{$src, $dst|$dst, $src}">, XD; + "cvttsd2si\t{$src, $dst|$dst, $src}", + SSE_CVT_SD2SI>, XD; defm CVTTSD2SI64 : sse12_cvt_s<0x2C, 
FR64, GR64, fp_to_sint, f64mem, loadf64, - "cvttsd2si{q}\t{$src, $dst|$dst, $src}">, XD, REX_W; + "cvttsd2si{q}\t{$src, $dst|$dst, $src}", + SSE_CVT_SD2SI>, XD, REX_W; defm CVTSI2SS : sse12_cvt_s<0x2A, GR32, FR32, sint_to_fp, i32mem, loadi32, - "cvtsi2ss\t{$src, $dst|$dst, $src}">, XS; + "cvtsi2ss\t{$src, $dst|$dst, $src}", + SSE_CVT_Scalar>, XS; defm CVTSI2SS64 : sse12_cvt_s<0x2A, GR64, FR32, sint_to_fp, i64mem, loadi64, - "cvtsi2ss{q}\t{$src, $dst|$dst, $src}">, XS, REX_W; + "cvtsi2ss{q}\t{$src, $dst|$dst, $src}", + SSE_CVT_Scalar>, XS, REX_W; defm CVTSI2SD : sse12_cvt_s<0x2A, GR32, FR64, sint_to_fp, i32mem, loadi32, - "cvtsi2sd\t{$src, $dst|$dst, $src}">, XD; + "cvtsi2sd\t{$src, $dst|$dst, $src}", + SSE_CVT_Scalar>, XD; defm CVTSI2SD64 : sse12_cvt_s<0x2A, GR64, FR64, sint_to_fp, i64mem, loadi64, - "cvtsi2sd{q}\t{$src, $dst|$dst, $src}">, XD, REX_W; + "cvtsi2sd{q}\t{$src, $dst|$dst, $src}", + SSE_CVT_Scalar>, XD, REX_W; // Conversion Instructions Intrinsics - Match intrinsics which expect MM // and/or XMM operand(s). multiclass sse12_cvt_sint<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC, Intrinsic Int, X86MemOperand x86memop, PatFrag ld_frag, - string asm> { + string asm, OpndItins itins> { def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), !strconcat(asm, "\t{$src, $dst|$dst, $src}"), - [(set DstRC:$dst, (Int SrcRC:$src))]>; + [(set DstRC:$dst, (Int SrcRC:$src))], itins.rr>; def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), !strconcat(asm, "\t{$src, $dst|$dst, $src}"), - [(set DstRC:$dst, (Int (ld_frag addr:$src)))]>; + [(set DstRC:$dst, (Int (ld_frag addr:$src)))], itins.rm>; } multiclass sse12_cvt_sint_3addr<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC, Intrinsic Int, X86MemOperand x86memop, - PatFrag ld_frag, string asm, bit Is2Addr = 1> { + PatFrag ld_frag, string asm, OpndItins itins, + bit Is2Addr = 1> { def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src2), !if(Is2Addr, !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"), !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), - [(set DstRC:$dst, (Int DstRC:$src1, SrcRC:$src2))]>; + [(set DstRC:$dst, (Int DstRC:$src1, SrcRC:$src2))], + itins.rr>; def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins DstRC:$src1, x86memop:$src2), !if(Is2Addr, !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"), !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), - [(set DstRC:$dst, (Int DstRC:$src1, (ld_frag addr:$src2)))]>; + [(set DstRC:$dst, (Int DstRC:$src1, (ld_frag addr:$src2)))], + itins.rm>; } -defm Int_VCVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse2_cvtsd2si, - f128mem, load, "cvtsd2si">, XD, VEX; -defm Int_VCVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, - int_x86_sse2_cvtsd2si64, f128mem, load, "cvtsd2si">, - XD, VEX, VEX_W; - -// FIXME: The asm matcher has a hack to ignore instructions with _Int and Int_ -// Get rid of this hack or rename the intrinsics, there are several -// intructions that only match with the intrinsic form, why create duplicates -// to let them be recognized by the assembler? 
-defm VCVTSD2SI : sse12_cvt_s_np<0x2D, FR64, GR32, f64mem, - "cvtsd2si\t{$src, $dst|$dst, $src}">, XD, VEX, VEX_LIG; -defm VCVTSD2SI64 : sse12_cvt_s_np<0x2D, FR64, GR64, f64mem, - "cvtsd2si\t{$src, $dst|$dst, $src}">, XD, VEX, VEX_W, - VEX_LIG; +defm VCVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse2_cvtsd2si, + f128mem, load, "cvtsd2si", SSE_CVT_SD2SI>, XD, VEX, VEX_LIG; +defm VCVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, + int_x86_sse2_cvtsd2si64, f128mem, load, "cvtsd2si", + SSE_CVT_SD2SI>, XD, VEX, VEX_W, VEX_LIG; defm CVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse2_cvtsd2si, - f128mem, load, "cvtsd2si{l}">, XD; + f128mem, load, "cvtsd2si{l}", SSE_CVT_SD2SI>, XD; defm CVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse2_cvtsd2si64, - f128mem, load, "cvtsd2si{q}">, XD, REX_W; + f128mem, load, "cvtsd2si{q}", SSE_CVT_SD2SI>, XD, REX_W; defm Int_VCVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128, - int_x86_sse_cvtsi2ss, i32mem, loadi32, "cvtsi2ss", 0>, XS, VEX_4V; + int_x86_sse_cvtsi2ss, i32mem, loadi32, "cvtsi2ss", + SSE_CVT_Scalar, 0>, XS, VEX_4V; defm Int_VCVTSI2SS64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128, - int_x86_sse_cvtsi642ss, i64mem, loadi64, "cvtsi2ss", 0>, XS, VEX_4V, + int_x86_sse_cvtsi642ss, i64mem, loadi64, "cvtsi2ss", + SSE_CVT_Scalar, 0>, XS, VEX_4V, VEX_W; defm Int_VCVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128, - int_x86_sse2_cvtsi2sd, i32mem, loadi32, "cvtsi2sd", 0>, XD, VEX_4V; + int_x86_sse2_cvtsi2sd, i32mem, loadi32, "cvtsi2sd", + SSE_CVT_Scalar, 0>, XD, VEX_4V; defm Int_VCVTSI2SD64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128, - int_x86_sse2_cvtsi642sd, i64mem, loadi64, "cvtsi2sd", 0>, XD, + int_x86_sse2_cvtsi642sd, i64mem, loadi64, "cvtsi2sd", + SSE_CVT_Scalar, 0>, XD, VEX_4V, VEX_W; let Constraints = "$src1 = $dst" in { defm Int_CVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128, int_x86_sse_cvtsi2ss, i32mem, loadi32, - "cvtsi2ss">, XS; + "cvtsi2ss", SSE_CVT_Scalar>, XS; defm Int_CVTSI2SS64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128, int_x86_sse_cvtsi642ss, i64mem, loadi64, - "cvtsi2ss{q}">, XS, REX_W; + "cvtsi2ss{q}", SSE_CVT_Scalar>, XS, REX_W; defm Int_CVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128, int_x86_sse2_cvtsi2sd, i32mem, loadi32, - "cvtsi2sd">, XD; + "cvtsi2sd", SSE_CVT_Scalar>, XD; defm Int_CVTSI2SD64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128, int_x86_sse2_cvtsi642sd, i64mem, loadi64, - "cvtsi2sd">, XD, REX_W; + "cvtsi2sd", SSE_CVT_Scalar>, XD, REX_W; } /// SSE 1 Only // Aliases for intrinsics defm Int_VCVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse_cvttss2si, - f32mem, load, "cvttss2si">, XS, VEX; + f32mem, load, "cvttss2si", + SSE_CVT_SS2SI_32>, XS, VEX; defm Int_VCVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, int_x86_sse_cvttss2si64, f32mem, load, - "cvttss2si">, XS, VEX, VEX_W; + "cvttss2si", SSE_CVT_SS2SI_64>, + XS, VEX, VEX_W; defm Int_VCVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse2_cvttsd2si, - f128mem, load, "cvttsd2si">, XD, VEX; + f128mem, load, "cvttsd2si", SSE_CVT_SD2SI>, + XD, VEX; defm Int_VCVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, int_x86_sse2_cvttsd2si64, f128mem, load, - "cvttsd2si">, XD, VEX, VEX_W; + "cvttsd2si", SSE_CVT_SD2SI>, + XD, VEX, VEX_W; defm Int_CVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse_cvttss2si, - f32mem, load, "cvttss2si">, XS; + f32mem, load, "cvttss2si", + SSE_CVT_SS2SI_32>, XS; defm Int_CVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, int_x86_sse_cvttss2si64, f32mem, load, - "cvttss2si{q}">, XS, REX_W; + "cvttss2si{q}", SSE_CVT_SS2SI_64>, + XS, 
REX_W; defm Int_CVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse2_cvttsd2si, - f128mem, load, "cvttsd2si">, XD; + f128mem, load, "cvttsd2si", SSE_CVT_SD2SI>, + XD; defm Int_CVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, int_x86_sse2_cvttsd2si64, f128mem, load, - "cvttsd2si{q}">, XD, REX_W; + "cvttsd2si{q}", SSE_CVT_SD2SI>, + XD, REX_W; let Pattern = []<dag> in { defm VCVTSS2SI : sse12_cvt_s<0x2D, FR32, GR32, undef, f32mem, load, - "cvtss2si{l}\t{$src, $dst|$dst, $src}">, XS, - VEX, VEX_LIG; + "cvtss2si{l}\t{$src, $dst|$dst, $src}", + SSE_CVT_SS2SI_32>, XS, VEX, VEX_LIG; defm VCVTSS2SI64 : sse12_cvt_s<0x2D, FR32, GR64, undef, f32mem, load, - "cvtss2si\t{$src, $dst|$dst, $src}">, XS, VEX, - VEX_W, VEX_LIG; + "cvtss2si\t{$src, $dst|$dst, $src}", + SSE_CVT_SS2SI_64>, XS, VEX, VEX_W, VEX_LIG; defm VCVTDQ2PS : sse12_cvt_p<0x5B, VR128, VR128, undef, i128mem, load, "cvtdq2ps\t{$src, $dst|$dst, $src}", - SSEPackedSingle>, TB, VEX; + SSEPackedSingle, SSE_CVT_PS>, TB, VEX; defm VCVTDQ2PSY : sse12_cvt_p<0x5B, VR256, VR256, undef, i256mem, load, "cvtdq2ps\t{$src, $dst|$dst, $src}", - SSEPackedSingle>, TB, VEX; + SSEPackedSingle, SSE_CVT_PS>, TB, VEX; } let Pattern = []<dag> in { defm CVTSS2SI : sse12_cvt_s<0x2D, FR32, GR32, undef, f32mem, load /*dummy*/, - "cvtss2si{l}\t{$src, $dst|$dst, $src}">, XS; + "cvtss2si{l}\t{$src, $dst|$dst, $src}", + SSE_CVT_SS2SI_32>, XS; defm CVTSS2SI64 : sse12_cvt_s<0x2D, FR32, GR64, undef, f32mem, load /*dummy*/, - "cvtss2si{q}\t{$src, $dst|$dst, $src}">, XS, REX_W; + "cvtss2si{q}\t{$src, $dst|$dst, $src}", + SSE_CVT_SS2SI_64>, XS, REX_W; defm CVTDQ2PS : sse12_cvt_p<0x5B, VR128, VR128, undef, i128mem, load /*dummy*/, "cvtdq2ps\t{$src, $dst|$dst, $src}", - SSEPackedSingle>, TB; /* PD SSE3 form is avaiable */ + SSEPackedSingle, SSE_CVT_PS>, + TB; /* PD SSE3 form is avaiable */ } -let Predicates = [HasSSE1] in { +let Predicates = [HasAVX] in { def : Pat<(int_x86_sse_cvtss2si VR128:$src), - (CVTSS2SIrr (EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss))>; + (VCVTSS2SIrr (EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss))>; def : Pat<(int_x86_sse_cvtss2si (load addr:$src)), - (CVTSS2SIrm addr:$src)>; + (VCVTSS2SIrm addr:$src)>; def : Pat<(int_x86_sse_cvtss2si64 VR128:$src), - (CVTSS2SI64rr (EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss))>; + (VCVTSS2SI64rr (EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss))>; def : Pat<(int_x86_sse_cvtss2si64 (load addr:$src)), - (CVTSS2SI64rm addr:$src)>; + (VCVTSS2SI64rm addr:$src)>; } -let Predicates = [HasAVX] in { +let Predicates = [HasSSE1] in { def : Pat<(int_x86_sse_cvtss2si VR128:$src), - (VCVTSS2SIrr (EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss))>; + (CVTSS2SIrr (EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss))>; def : Pat<(int_x86_sse_cvtss2si (load addr:$src)), - (VCVTSS2SIrm addr:$src)>; + (CVTSS2SIrm addr:$src)>; def : Pat<(int_x86_sse_cvtss2si64 VR128:$src), - (VCVTSS2SI64rr (EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss))>; + (CVTSS2SI64rr (EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss))>; def : Pat<(int_x86_sse_cvtss2si64 (load addr:$src)), - (VCVTSS2SI64rm addr:$src)>; + (CVTSS2SI64rm addr:$src)>; } /// SSE 2 Only @@ -1566,43 +1677,51 @@ let Predicates = [HasAVX] in { // Convert scalar double to scalar single def VCVTSD2SSrr : VSDI<0x5A, MRMSrcReg, (outs FR32:$dst), (ins FR64:$src1, FR64:$src2), - "cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, - VEX_4V, VEX_LIG; + "cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", [], + IIC_SSE_CVT_Scalar_RR>, VEX_4V, VEX_LIG; let mayLoad = 1 in def VCVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst), (ins 
FR64:$src1, f64mem:$src2), "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", - []>, XD, Requires<[HasAVX, OptForSize]>, VEX_4V, VEX_LIG; + [], IIC_SSE_CVT_Scalar_RM>, + XD, Requires<[HasAVX, OptForSize]>, VEX_4V, VEX_LIG; def : Pat<(f32 (fround FR64:$src)), (VCVTSD2SSrr FR64:$src, FR64:$src)>, Requires<[HasAVX]>; def CVTSD2SSrr : SDI<0x5A, MRMSrcReg, (outs FR32:$dst), (ins FR64:$src), "cvtsd2ss\t{$src, $dst|$dst, $src}", - [(set FR32:$dst, (fround FR64:$src))]>; + [(set FR32:$dst, (fround FR64:$src))], + IIC_SSE_CVT_Scalar_RR>; def CVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst), (ins f64mem:$src), "cvtsd2ss\t{$src, $dst|$dst, $src}", - [(set FR32:$dst, (fround (loadf64 addr:$src)))]>, XD, + [(set FR32:$dst, (fround (loadf64 addr:$src)))], + IIC_SSE_CVT_Scalar_RM>, + XD, Requires<[HasSSE2, OptForSize]>; defm Int_VCVTSD2SS: sse12_cvt_sint_3addr<0x5A, VR128, VR128, - int_x86_sse2_cvtsd2ss, f64mem, load, "cvtsd2ss", 0>, + int_x86_sse2_cvtsd2ss, f64mem, load, "cvtsd2ss", + SSE_CVT_Scalar, 0>, XS, VEX_4V; let Constraints = "$src1 = $dst" in defm Int_CVTSD2SS: sse12_cvt_sint_3addr<0x5A, VR128, VR128, - int_x86_sse2_cvtsd2ss, f64mem, load, "cvtsd2ss">, XS; + int_x86_sse2_cvtsd2ss, f64mem, load, "cvtsd2ss", + SSE_CVT_Scalar>, XS; // Convert scalar single to scalar double // SSE2 instructions with XS prefix def VCVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR32:$src1, FR32:$src2), "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", - []>, XS, Requires<[HasAVX]>, VEX_4V, VEX_LIG; + [], IIC_SSE_CVT_Scalar_RR>, + XS, Requires<[HasAVX]>, VEX_4V, VEX_LIG; let mayLoad = 1 in def VCVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), (ins FR32:$src1, f32mem:$src2), "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", - []>, XS, VEX_4V, VEX_LIG, Requires<[HasAVX, OptForSize]>; + [], IIC_SSE_CVT_Scalar_RM>, + XS, VEX_4V, VEX_LIG, Requires<[HasAVX, OptForSize]>; let Predicates = [HasAVX] in { def : Pat<(f64 (fextend FR32:$src)), @@ -1619,11 +1738,13 @@ def : Pat<(extloadf32 addr:$src), def CVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR32:$src), "cvtss2sd\t{$src, $dst|$dst, $src}", - [(set FR64:$dst, (fextend FR32:$src))]>, XS, + [(set FR64:$dst, (fextend FR32:$src))], + IIC_SSE_CVT_Scalar_RR>, XS, Requires<[HasSSE2]>; def CVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), (ins f32mem:$src), "cvtss2sd\t{$src, $dst|$dst, $src}", - [(set FR64:$dst, (extloadf32 addr:$src))]>, XS, + [(set FR64:$dst, (extloadf32 addr:$src))], + IIC_SSE_CVT_Scalar_RM>, XS, Requires<[HasSSE2, OptForSize]>; // extload f32 -> f64. 
This matches load+fextend because we have a hack in @@ -1640,26 +1761,30 @@ def Int_VCVTSS2SDrr: I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set VR128:$dst, (int_x86_sse2_cvtss2sd VR128:$src1, - VR128:$src2))]>, XS, VEX_4V, + VR128:$src2))], + IIC_SSE_CVT_Scalar_RR>, XS, VEX_4V, Requires<[HasAVX]>; def Int_VCVTSS2SDrm: I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, f32mem:$src2), "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set VR128:$dst, (int_x86_sse2_cvtss2sd VR128:$src1, - (load addr:$src2)))]>, XS, VEX_4V, + (load addr:$src2)))], + IIC_SSE_CVT_Scalar_RM>, XS, VEX_4V, Requires<[HasAVX]>; let Constraints = "$src1 = $dst" in { // SSE2 instructions with XS prefix def Int_CVTSS2SDrr: I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), "cvtss2sd\t{$src2, $dst|$dst, $src2}", [(set VR128:$dst, (int_x86_sse2_cvtss2sd VR128:$src1, - VR128:$src2))]>, XS, + VR128:$src2))], + IIC_SSE_CVT_Scalar_RR>, XS, Requires<[HasSSE2]>; def Int_CVTSS2SDrm: I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, f32mem:$src2), "cvtss2sd\t{$src2, $dst|$dst, $src2}", [(set VR128:$dst, (int_x86_sse2_cvtss2sd VR128:$src1, - (load addr:$src2)))]>, XS, + (load addr:$src2)))], + IIC_SSE_CVT_Scalar_RM>, XS, Requires<[HasSSE2]>; } @@ -1667,216 +1792,275 @@ def Int_CVTSS2SDrm: I<0x5A, MRMSrcMem, // SSE2 instructions without OpSize prefix def Int_VCVTDQ2PSrr : I<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "vcvtdq2ps\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (int_x86_sse2_cvtdq2ps VR128:$src))]>, + [(set VR128:$dst, (int_x86_sse2_cvtdq2ps VR128:$src))], + IIC_SSE_CVT_PS_RR>, TB, VEX, Requires<[HasAVX]>; def Int_VCVTDQ2PSrm : I<0x5B, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), "vcvtdq2ps\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvtdq2ps - (bitconvert (memopv2i64 addr:$src))))]>, + (bitconvert (memopv2i64 addr:$src))))], + IIC_SSE_CVT_PS_RM>, TB, VEX, Requires<[HasAVX]>; def Int_CVTDQ2PSrr : I<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvtdq2ps\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (int_x86_sse2_cvtdq2ps VR128:$src))]>, + [(set VR128:$dst, (int_x86_sse2_cvtdq2ps VR128:$src))], + IIC_SSE_CVT_PS_RR>, TB, Requires<[HasSSE2]>; def Int_CVTDQ2PSrm : I<0x5B, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), "cvtdq2ps\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvtdq2ps - (bitconvert (memopv2i64 addr:$src))))]>, + (bitconvert (memopv2i64 addr:$src))))], + IIC_SSE_CVT_PS_RM>, TB, Requires<[HasSSE2]>; // FIXME: why the non-intrinsic version is described as SSE3? 
// SSE2 instructions with XS prefix def Int_VCVTDQ2PDrr : I<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "vcvtdq2pd\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (int_x86_sse2_cvtdq2pd VR128:$src))]>, + [(set VR128:$dst, (int_x86_sse2_cvtdq2pd VR128:$src))], + IIC_SSE_CVT_PD_RR>, XS, VEX, Requires<[HasAVX]>; def Int_VCVTDQ2PDrm : I<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), "vcvtdq2pd\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvtdq2pd - (bitconvert (memopv2i64 addr:$src))))]>, + (bitconvert (memopv2i64 addr:$src))))], + IIC_SSE_CVT_PD_RM>, XS, VEX, Requires<[HasAVX]>; def Int_CVTDQ2PDrr : I<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvtdq2pd\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (int_x86_sse2_cvtdq2pd VR128:$src))]>, + [(set VR128:$dst, (int_x86_sse2_cvtdq2pd VR128:$src))], + IIC_SSE_CVT_PD_RR>, XS, Requires<[HasSSE2]>; def Int_CVTDQ2PDrm : I<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), "cvtdq2pd\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvtdq2pd - (bitconvert (memopv2i64 addr:$src))))]>, + (bitconvert (memopv2i64 addr:$src))))], + IIC_SSE_CVT_PD_RM>, XS, Requires<[HasSSE2]>; // Convert packed single/double fp to doubleword def VCVTPS2DQrr : VPDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), - "cvtps2dq\t{$src, $dst|$dst, $src}", []>, VEX; + "cvtps2dq\t{$src, $dst|$dst, $src}", [], + IIC_SSE_CVT_PS_RR>, VEX; def VCVTPS2DQrm : VPDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), - "cvtps2dq\t{$src, $dst|$dst, $src}", []>, VEX; + "cvtps2dq\t{$src, $dst|$dst, $src}", [], + IIC_SSE_CVT_PS_RM>, VEX; def VCVTPS2DQYrr : VPDI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), - "cvtps2dq\t{$src, $dst|$dst, $src}", []>, VEX; + "cvtps2dq\t{$src, $dst|$dst, $src}", [], + IIC_SSE_CVT_PS_RR>, VEX; def VCVTPS2DQYrm : VPDI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), - "cvtps2dq\t{$src, $dst|$dst, $src}", []>, VEX; + "cvtps2dq\t{$src, $dst|$dst, $src}", [], + IIC_SSE_CVT_PS_RM>, VEX; def CVTPS2DQrr : PDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), - "cvtps2dq\t{$src, $dst|$dst, $src}", []>; + "cvtps2dq\t{$src, $dst|$dst, $src}", [], + IIC_SSE_CVT_PS_RR>; def CVTPS2DQrm : PDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), - "cvtps2dq\t{$src, $dst|$dst, $src}", []>; + "cvtps2dq\t{$src, $dst|$dst, $src}", [], + IIC_SSE_CVT_PS_RM>; def Int_VCVTPS2DQrr : VPDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvtps2dq\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (int_x86_sse2_cvtps2dq VR128:$src))]>, + [(set VR128:$dst, (int_x86_sse2_cvtps2dq VR128:$src))], + IIC_SSE_CVT_PS_RR>, VEX; def Int_VCVTPS2DQrm : VPDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), "cvtps2dq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvtps2dq - (memop addr:$src)))]>, VEX; + (memop addr:$src)))], + IIC_SSE_CVT_PS_RM>, VEX; def Int_CVTPS2DQrr : PDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvtps2dq\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (int_x86_sse2_cvtps2dq VR128:$src))]>; + [(set VR128:$dst, (int_x86_sse2_cvtps2dq VR128:$src))], + IIC_SSE_CVT_PS_RR>; def Int_CVTPS2DQrm : PDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), "cvtps2dq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvtps2dq - (memop addr:$src)))]>; + (memop addr:$src)))], + IIC_SSE_CVT_PS_RM>; // SSE2 packed instructions with XD prefix def Int_VCVTPD2DQrr : I<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "vcvtpd2dq\t{$src, $dst|$dst, $src}", - [(set 
VR128:$dst, (int_x86_sse2_cvtpd2dq VR128:$src))]>, + [(set VR128:$dst, (int_x86_sse2_cvtpd2dq VR128:$src))], + IIC_SSE_CVT_PD_RR>, XD, VEX, Requires<[HasAVX]>; def Int_VCVTPD2DQrm : I<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), "vcvtpd2dq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvtpd2dq - (memop addr:$src)))]>, + (memop addr:$src)))], + IIC_SSE_CVT_PD_RM>, XD, VEX, Requires<[HasAVX]>; def Int_CVTPD2DQrr : I<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvtpd2dq\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (int_x86_sse2_cvtpd2dq VR128:$src))]>, + [(set VR128:$dst, (int_x86_sse2_cvtpd2dq VR128:$src))], + IIC_SSE_CVT_PD_RR>, XD, Requires<[HasSSE2]>; def Int_CVTPD2DQrm : I<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), "cvtpd2dq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvtpd2dq - (memop addr:$src)))]>, + (memop addr:$src)))], + IIC_SSE_CVT_PD_RM>, XD, Requires<[HasSSE2]>; // Convert with truncation packed single/double fp to doubleword // SSE2 packed instructions with XS prefix def VCVTTPS2DQrr : VSSI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), - "cvttps2dq\t{$src, $dst|$dst, $src}", []>, VEX; -let mayLoad = 1 in + "cvttps2dq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (int_x86_sse2_cvttps2dq VR128:$src))], + IIC_SSE_CVT_PS_RR>, VEX; def VCVTTPS2DQrm : VSSI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), - "cvttps2dq\t{$src, $dst|$dst, $src}", []>, VEX; + "cvttps2dq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse2_cvttps2dq + (memop addr:$src)))], + IIC_SSE_CVT_PS_RM>, VEX; def VCVTTPS2DQYrr : VSSI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), - "cvttps2dq\t{$src, $dst|$dst, $src}", []>, VEX; -let mayLoad = 1 in + "cvttps2dq\t{$src, $dst|$dst, $src}", + [(set VR256:$dst, + (int_x86_avx_cvtt_ps2dq_256 VR256:$src))], + IIC_SSE_CVT_PS_RR>, VEX; def VCVTTPS2DQYrm : VSSI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), - "cvttps2dq\t{$src, $dst|$dst, $src}", []>, VEX; + "cvttps2dq\t{$src, $dst|$dst, $src}", + [(set VR256:$dst, (int_x86_avx_cvtt_ps2dq_256 + (memopv8f32 addr:$src)))], + IIC_SSE_CVT_PS_RM>, VEX; + def CVTTPS2DQrr : SSI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvttps2dq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, - (int_x86_sse2_cvttps2dq VR128:$src))]>; + (int_x86_sse2_cvttps2dq VR128:$src))], + IIC_SSE_CVT_PS_RR>; def CVTTPS2DQrm : SSI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), "cvttps2dq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, - (int_x86_sse2_cvttps2dq (memop addr:$src)))]>; - -def Int_VCVTTPS2DQrr : I<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), - "vcvttps2dq\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, - (int_x86_sse2_cvttps2dq VR128:$src))]>, - XS, VEX, Requires<[HasAVX]>; -def Int_VCVTTPS2DQrm : I<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), - "vcvttps2dq\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (int_x86_sse2_cvttps2dq - (memop addr:$src)))]>, - XS, VEX, Requires<[HasAVX]>; - -let Predicates = [HasSSE2] in { - def : Pat<(v4f32 (sint_to_fp (v4i32 VR128:$src))), - (Int_CVTDQ2PSrr VR128:$src)>; - def : Pat<(v4i32 (fp_to_sint (v4f32 VR128:$src))), - (CVTTPS2DQrr VR128:$src)>; -} + (int_x86_sse2_cvttps2dq (memop addr:$src)))], + IIC_SSE_CVT_PS_RM>; let Predicates = [HasAVX] in { def : Pat<(v4f32 (sint_to_fp (v4i32 VR128:$src))), (Int_VCVTDQ2PSrr VR128:$src)>; + def : Pat<(v4f32 (sint_to_fp (bc_v4i32 (memopv2i64 addr:$src)))), + (Int_VCVTDQ2PSrm addr:$src)>; + def : Pat<(v4i32 (fp_to_sint (v4f32 
VR128:$src))), (VCVTTPS2DQrr VR128:$src)>; + def : Pat<(v4i32 (fp_to_sint (memopv4f32 addr:$src))), + (VCVTTPS2DQrm addr:$src)>; + def : Pat<(v8f32 (sint_to_fp (v8i32 VR256:$src))), (VCVTDQ2PSYrr VR256:$src)>; + def : Pat<(v8f32 (sint_to_fp (bc_v8i32 (memopv4i64 addr:$src)))), + (VCVTDQ2PSYrm addr:$src)>; + def : Pat<(v8i32 (fp_to_sint (v8f32 VR256:$src))), (VCVTTPS2DQYrr VR256:$src)>; + def : Pat<(v8i32 (fp_to_sint (memopv8f32 addr:$src))), + (VCVTTPS2DQYrm addr:$src)>; +} + +let Predicates = [HasSSE2] in { + def : Pat<(v4f32 (sint_to_fp (v4i32 VR128:$src))), + (Int_CVTDQ2PSrr VR128:$src)>; + def : Pat<(v4f32 (sint_to_fp (bc_v4i32 (memopv2i64 addr:$src)))), + (Int_CVTDQ2PSrm addr:$src)>; + + def : Pat<(v4i32 (fp_to_sint (v4f32 VR128:$src))), + (CVTTPS2DQrr VR128:$src)>; + def : Pat<(v4i32 (fp_to_sint (memopv4f32 addr:$src))), + (CVTTPS2DQrm addr:$src)>; } def VCVTTPD2DQrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvttpd2dq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, - (int_x86_sse2_cvttpd2dq VR128:$src))]>, VEX; + (int_x86_sse2_cvttpd2dq VR128:$src))], + IIC_SSE_CVT_PD_RR>, VEX; let isCodeGenOnly = 1 in def VCVTTPD2DQrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), "cvttpd2dq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvttpd2dq - (memop addr:$src)))]>, VEX; + (memop addr:$src)))], + IIC_SSE_CVT_PD_RM>, VEX; def CVTTPD2DQrr : PDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvttpd2dq\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (int_x86_sse2_cvttpd2dq VR128:$src))]>; + [(set VR128:$dst, (int_x86_sse2_cvttpd2dq VR128:$src))], + IIC_SSE_CVT_PD_RR>; def CVTTPD2DQrm : PDI<0xE6, MRMSrcMem, (outs VR128:$dst),(ins f128mem:$src), "cvttpd2dq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvttpd2dq - (memop addr:$src)))]>; + (memop addr:$src)))], + IIC_SSE_CVT_PD_RM>; // The assembler can recognize rr 256-bit instructions by seeing a ymm // register, but the same isn't true when using memory operands instead. // Provide other assembly rr and rm forms to address this explicitly. 
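As the comment above notes, only the register forms are self-describing to the assembler: it can read the width off an xmm versus ymm source register, but cvttpd2dq (%rax), %xmm0 could mean either a 128-bit or a 256-bit load, so the memory forms that follow carry explicit x/y mnemonics. A hedged illustration with throwaway records, where only the asm strings matter and the real operand classes are the ones in the defs below:

class DemoInst<string asm> { string AsmString = asm; }
// Register forms: the source register class (xmm vs ymm) disambiguates.
def DEMO_RR_128 : DemoInst<"cvttpd2dq\t{$src, $dst|$dst, $src}">;   // $src : VR128
def DEMO_RR_256 : DemoInst<"cvttpd2dq\t{$src, $dst|$dst, $src}">;   // $src : VR256
// Memory forms: the operand text looks identical either way, so the
// mnemonic itself has to say how wide the load is.
def DEMO_RM_128 : DemoInst<"cvttpd2dqx\t{$src, $dst|$dst, $src}">;  // $src : f128mem
def DEMO_RM_256 : DemoInst<"cvttpd2dqy\t{$src, $dst|$dst, $src}">;  // $src : f256mem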
def VCVTTPD2DQXrYr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src), - "cvttpd2dq\t{$src, $dst|$dst, $src}", []>, VEX; + "cvttpd2dq\t{$src, $dst|$dst, $src}", [], + IIC_SSE_CVT_PD_RR>, VEX; // XMM only def VCVTTPD2DQXrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), - "cvttpd2dqx\t{$src, $dst|$dst, $src}", []>, VEX; + "cvttpd2dqx\t{$src, $dst|$dst, $src}", [], + IIC_SSE_CVT_PD_RR>, VEX; def VCVTTPD2DQXrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), - "cvttpd2dqx\t{$src, $dst|$dst, $src}", []>, VEX; + "cvttpd2dqx\t{$src, $dst|$dst, $src}", [], + IIC_SSE_CVT_PD_RM>, VEX; // YMM only def VCVTTPD2DQYrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src), - "cvttpd2dqy\t{$src, $dst|$dst, $src}", []>, VEX; + "cvttpd2dqy\t{$src, $dst|$dst, $src}", [], + IIC_SSE_CVT_PD_RR>, VEX; def VCVTTPD2DQYrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src), - "cvttpd2dqy\t{$src, $dst|$dst, $src}", []>, VEX, VEX_L; + "cvttpd2dqy\t{$src, $dst|$dst, $src}", [], + IIC_SSE_CVT_PD_RM>, VEX, VEX_L; // Convert packed single to packed double let Predicates = [HasAVX] in { // SSE2 instructions without OpSize prefix def VCVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), - "vcvtps2pd\t{$src, $dst|$dst, $src}", []>, TB, VEX; + "vcvtps2pd\t{$src, $dst|$dst, $src}", [], + IIC_SSE_CVT_PD_RR>, TB, VEX; def VCVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src), - "vcvtps2pd\t{$src, $dst|$dst, $src}", []>, TB, VEX; + "vcvtps2pd\t{$src, $dst|$dst, $src}", [], + IIC_SSE_CVT_PD_RM>, TB, VEX; def VCVTPS2PDYrr : I<0x5A, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src), - "vcvtps2pd\t{$src, $dst|$dst, $src}", []>, TB, VEX; + "vcvtps2pd\t{$src, $dst|$dst, $src}", [], + IIC_SSE_CVT_PD_RR>, TB, VEX; def VCVTPS2PDYrm : I<0x5A, MRMSrcMem, (outs VR256:$dst), (ins f128mem:$src), - "vcvtps2pd\t{$src, $dst|$dst, $src}", []>, TB, VEX; + "vcvtps2pd\t{$src, $dst|$dst, $src}", [], + IIC_SSE_CVT_PD_RM>, TB, VEX; } def CVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), - "cvtps2pd\t{$src, $dst|$dst, $src}", []>, TB; + "cvtps2pd\t{$src, $dst|$dst, $src}", [], + IIC_SSE_CVT_PD_RR>, TB; def CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src), - "cvtps2pd\t{$src, $dst|$dst, $src}", []>, TB; + "cvtps2pd\t{$src, $dst|$dst, $src}", [], + IIC_SSE_CVT_PD_RM>, TB; def Int_VCVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "vcvtps2pd\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (int_x86_sse2_cvtps2pd VR128:$src))]>, + [(set VR128:$dst, (int_x86_sse2_cvtps2pd VR128:$src))], + IIC_SSE_CVT_PD_RR>, TB, VEX, Requires<[HasAVX]>; def Int_VCVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src), "vcvtps2pd\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvtps2pd - (load addr:$src)))]>, + (load addr:$src)))], + IIC_SSE_CVT_PD_RM>, TB, VEX, Requires<[HasAVX]>; def Int_CVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvtps2pd\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (int_x86_sse2_cvtps2pd VR128:$src))]>, + [(set VR128:$dst, (int_x86_sse2_cvtps2pd VR128:$src))], + IIC_SSE_CVT_PD_RR>, TB, Requires<[HasSSE2]>; def Int_CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src), "cvtps2pd\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvtps2pd - (load addr:$src)))]>, + (load addr:$src)))], + IIC_SSE_CVT_PD_RM>, TB, Requires<[HasSSE2]>; // Convert packed double to packed single @@ -1884,49 +2068,61 @@ def Int_CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs 
VR128:$dst), (ins f64mem:$src), // register, but the same isn't true when using memory operands instead. // Provide other assembly rr and rm forms to address this explicitly. def VCVTPD2PSrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), - "cvtpd2ps\t{$src, $dst|$dst, $src}", []>, VEX; + "cvtpd2ps\t{$src, $dst|$dst, $src}", [], + IIC_SSE_CVT_PD_RR>, VEX; def VCVTPD2PSXrYr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src), - "cvtpd2ps\t{$src, $dst|$dst, $src}", []>, VEX; + "cvtpd2ps\t{$src, $dst|$dst, $src}", [], + IIC_SSE_CVT_PD_RR>, VEX; // XMM only def VCVTPD2PSXrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), - "cvtpd2psx\t{$src, $dst|$dst, $src}", []>, VEX; + "cvtpd2psx\t{$src, $dst|$dst, $src}", [], + IIC_SSE_CVT_PD_RR>, VEX; def VCVTPD2PSXrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), - "cvtpd2psx\t{$src, $dst|$dst, $src}", []>, VEX; + "cvtpd2psx\t{$src, $dst|$dst, $src}", [], + IIC_SSE_CVT_PD_RM>, VEX; // YMM only def VCVTPD2PSYrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src), - "cvtpd2psy\t{$src, $dst|$dst, $src}", []>, VEX; + "cvtpd2psy\t{$src, $dst|$dst, $src}", [], + IIC_SSE_CVT_PD_RR>, VEX; def VCVTPD2PSYrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src), - "cvtpd2psy\t{$src, $dst|$dst, $src}", []>, VEX, VEX_L; + "cvtpd2psy\t{$src, $dst|$dst, $src}", [], + IIC_SSE_CVT_PD_RM>, VEX, VEX_L; def CVTPD2PSrr : PDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), - "cvtpd2ps\t{$src, $dst|$dst, $src}", []>; + "cvtpd2ps\t{$src, $dst|$dst, $src}", [], + IIC_SSE_CVT_PD_RR>; def CVTPD2PSrm : PDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), - "cvtpd2ps\t{$src, $dst|$dst, $src}", []>; + "cvtpd2ps\t{$src, $dst|$dst, $src}", [], + IIC_SSE_CVT_PD_RM>; def Int_VCVTPD2PSrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvtpd2ps\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (int_x86_sse2_cvtpd2ps VR128:$src))]>; + [(set VR128:$dst, (int_x86_sse2_cvtpd2ps VR128:$src))], + IIC_SSE_CVT_PD_RR>; def Int_VCVTPD2PSrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), "cvtpd2ps\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvtpd2ps - (memop addr:$src)))]>; + (memop addr:$src)))], + IIC_SSE_CVT_PD_RM>; def Int_CVTPD2PSrr : PDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvtpd2ps\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (int_x86_sse2_cvtpd2ps VR128:$src))]>; + [(set VR128:$dst, (int_x86_sse2_cvtpd2ps VR128:$src))], + IIC_SSE_CVT_PD_RR>; def Int_CVTPD2PSrm : PDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), "cvtpd2ps\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvtpd2ps - (memop addr:$src)))]>; + (memop addr:$src)))], + IIC_SSE_CVT_PD_RM>; // AVX 256-bit register conversion intrinsics // FIXME: Migrate SSE conversion intrinsics matching to use patterns as below // whenever possible to avoid declaring two versions of each one. 
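The patterns just below are the style that FIXME asks for: rather than declaring a second, intrinsic-only copy of each conversion (the Int_* defs seen earlier in this file), a bare Pat<> maps the intrinsic onto the instruction that already exists. A minimal self-contained sketch with hypothetical names, stubbing out the pieces that normally come from the common target descriptions:

def int_demo_cvt;   // stands in for an intrinsic such as int_x86_avx_cvtdq2_ps_256
def VR;             // stands in for a register class such as VR256
def DEMO_CVTrr;     // stands in for an instruction def that already exists
// Simplified Pat: match the first dag, emit the second.
class DemoPat<dag pattern, dag result> {
  dag PatternToMatch = pattern;
  dag ResultInstr   = result;
}
def : DemoPat<(int_demo_cvt VR:$src), (DEMO_CVTrr VR:$src)>;

One instruction record, any number of patterns pointing at it; the intrinsic and the generic ISD node can then share the same def instead of needing two declarations per opcode.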
def : Pat<(int_x86_avx_cvtdq2_ps_256 VR256:$src), (VCVTDQ2PSYrr VR256:$src)>; -def : Pat<(int_x86_avx_cvtdq2_ps_256 (memopv8i32 addr:$src)), +def : Pat<(int_x86_avx_cvtdq2_ps_256 (bitconvert (memopv4i64 addr:$src))), (VCVTDQ2PSYrm addr:$src)>; def : Pat<(int_x86_avx_cvt_pd2_ps_256 VR256:$src), @@ -1949,11 +2145,6 @@ def : Pat<(int_x86_avx_cvtt_pd2dq_256 VR256:$src), def : Pat<(int_x86_avx_cvtt_pd2dq_256 (memopv4f64 addr:$src)), (VCVTTPD2DQYrm addr:$src)>; -def : Pat<(int_x86_avx_cvtt_ps2dq_256 VR256:$src), - (VCVTTPS2DQYrr VR256:$src)>; -def : Pat<(int_x86_avx_cvtt_ps2dq_256 (memopv8f32 addr:$src)), - (VCVTTPS2DQYrm addr:$src)>; - // Match fround and fextend for 128/256-bit conversions def : Pat<(v4f32 (fround (v4f64 VR256:$src))), (VCVTPD2PSYrr VR256:$src)>; @@ -1971,70 +2162,85 @@ def : Pat<(v4f64 (fextend (loadv4f32 addr:$src))), // sse12_cmp_scalar - sse 1 & 2 compare scalar instructions multiclass sse12_cmp_scalar<RegisterClass RC, X86MemOperand x86memop, - SDNode OpNode, ValueType VT, PatFrag ld_frag, - string asm, string asm_alt> { + Operand CC, SDNode OpNode, ValueType VT, + PatFrag ld_frag, string asm, string asm_alt, + OpndItins itins> { def rr : SIi8<0xC2, MRMSrcReg, - (outs RC:$dst), (ins RC:$src1, RC:$src2, SSECC:$cc), asm, - [(set RC:$dst, (OpNode (VT RC:$src1), RC:$src2, imm:$cc))]>; + (outs RC:$dst), (ins RC:$src1, RC:$src2, CC:$cc), asm, + [(set RC:$dst, (OpNode (VT RC:$src1), RC:$src2, imm:$cc))], + itins.rr>; def rm : SIi8<0xC2, MRMSrcMem, - (outs RC:$dst), (ins RC:$src1, x86memop:$src2, SSECC:$cc), asm, + (outs RC:$dst), (ins RC:$src1, x86memop:$src2, CC:$cc), asm, [(set RC:$dst, (OpNode (VT RC:$src1), - (ld_frag addr:$src2), imm:$cc))]>; + (ld_frag addr:$src2), imm:$cc))], + itins.rm>; // Accept explicit immediate argument form instead of comparison code. 
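These comparison defs come in pairs: the primary one bakes the condition into the mnemonic via ${cc} and carries the selection pattern, with the CC operand class (SSECC or AVXCC here) limiting the immediate to the range of named predicates for that form, while the rr_alt/rm_alt definitions that follow exist only so the assembler also accepts the generic spelling with an explicit byte immediate; they have no pattern and sit under neverHasSideEffects. A hedged sketch with hypothetical records, keeping only the asm strings and pattern lists:

class DemoInst<string asm, list<dag> pattern> {
  string AsmString = asm;
  list<dag> Pattern = pattern;
}
// Condition folded into the mnemonic; the real def's pattern uses imm:$cc.
def DEMO_CMPSSrr     : DemoInst<"cmp${cc}ss\t{$src2, $dst|$dst, $src2}", []>;
// Assembler-only alternate form: explicit immediate, no pattern.
def DEMO_CMPSSrr_alt : DemoInst<"cmpss\t{$cc, $src2, $dst|$dst, $src2, $cc}", []>;

So, for example, cmpeqss %xmm1, %xmm0 and cmpss $0, %xmm1, %xmm0 assemble to the same encoding; the first is matched through the primary def, the second through the _alt def.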
let neverHasSideEffects = 1 in { def rr_alt : SIi8<0xC2, MRMSrcReg, (outs RC:$dst), - (ins RC:$src1, RC:$src2, i8imm:$cc), asm_alt, []>; + (ins RC:$src1, RC:$src2, i8imm:$cc), asm_alt, [], + IIC_SSE_ALU_F32S_RR>; let mayLoad = 1 in def rm_alt : SIi8<0xC2, MRMSrcMem, (outs RC:$dst), - (ins RC:$src1, x86memop:$src2, i8imm:$cc), asm_alt, []>; + (ins RC:$src1, x86memop:$src2, i8imm:$cc), asm_alt, [], + IIC_SSE_ALU_F32S_RM>; } } -defm VCMPSS : sse12_cmp_scalar<FR32, f32mem, X86cmpss, f32, loadf32, +defm VCMPSS : sse12_cmp_scalar<FR32, f32mem, AVXCC, X86cmpss, f32, loadf32, "cmp${cc}ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", - "cmpss\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}">, + "cmpss\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", + SSE_ALU_F32S>, XS, VEX_4V, VEX_LIG; -defm VCMPSD : sse12_cmp_scalar<FR64, f64mem, X86cmpsd, f64, loadf64, +defm VCMPSD : sse12_cmp_scalar<FR64, f64mem, AVXCC, X86cmpsd, f64, loadf64, "cmp${cc}sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", - "cmpsd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}">, + "cmpsd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", + SSE_ALU_F32S>, // same latency as 32 bit compare XD, VEX_4V, VEX_LIG; let Constraints = "$src1 = $dst" in { - defm CMPSS : sse12_cmp_scalar<FR32, f32mem, X86cmpss, f32, loadf32, + defm CMPSS : sse12_cmp_scalar<FR32, f32mem, SSECC, X86cmpss, f32, loadf32, "cmp${cc}ss\t{$src2, $dst|$dst, $src2}", - "cmpss\t{$cc, $src2, $dst|$dst, $src2, $cc}">, + "cmpss\t{$cc, $src2, $dst|$dst, $src2, $cc}", SSE_ALU_F32S>, XS; - defm CMPSD : sse12_cmp_scalar<FR64, f64mem, X86cmpsd, f64, loadf64, + defm CMPSD : sse12_cmp_scalar<FR64, f64mem, SSECC, X86cmpsd, f64, loadf64, "cmp${cc}sd\t{$src2, $dst|$dst, $src2}", - "cmpsd\t{$cc, $src2, $dst|$dst, $src2, $cc}">, + "cmpsd\t{$cc, $src2, $dst|$dst, $src2, $cc}", + SSE_ALU_F32S>, // same latency as 32 bit compare XD; } -multiclass sse12_cmp_scalar_int<RegisterClass RC, X86MemOperand x86memop, - Intrinsic Int, string asm> { +multiclass sse12_cmp_scalar_int<X86MemOperand x86memop, Operand CC, + Intrinsic Int, string asm, OpndItins itins> { def rr : SIi8<0xC2, MRMSrcReg, (outs VR128:$dst), - (ins VR128:$src1, VR128:$src, SSECC:$cc), asm, + (ins VR128:$src1, VR128:$src, CC:$cc), asm, [(set VR128:$dst, (Int VR128:$src1, - VR128:$src, imm:$cc))]>; + VR128:$src, imm:$cc))], + itins.rr>; def rm : SIi8<0xC2, MRMSrcMem, (outs VR128:$dst), - (ins VR128:$src1, f32mem:$src, SSECC:$cc), asm, + (ins VR128:$src1, x86memop:$src, CC:$cc), asm, [(set VR128:$dst, (Int VR128:$src1, - (load addr:$src), imm:$cc))]>; + (load addr:$src), imm:$cc))], + itins.rm>; } // Aliases to match intrinsics which expect XMM operand(s). 
-defm Int_VCMPSS : sse12_cmp_scalar_int<VR128, f32mem, int_x86_sse_cmp_ss, - "cmp${cc}ss\t{$src, $src1, $dst|$dst, $src1, $src}">, +defm Int_VCMPSS : sse12_cmp_scalar_int<f32mem, AVXCC, int_x86_sse_cmp_ss, + "cmp${cc}ss\t{$src, $src1, $dst|$dst, $src1, $src}", + SSE_ALU_F32S>, XS, VEX_4V; -defm Int_VCMPSD : sse12_cmp_scalar_int<VR128, f64mem, int_x86_sse2_cmp_sd, - "cmp${cc}sd\t{$src, $src1, $dst|$dst, $src1, $src}">, +defm Int_VCMPSD : sse12_cmp_scalar_int<f64mem, AVXCC, int_x86_sse2_cmp_sd, + "cmp${cc}sd\t{$src, $src1, $dst|$dst, $src1, $src}", + SSE_ALU_F32S>, // same latency as f32 XD, VEX_4V; let Constraints = "$src1 = $dst" in { - defm Int_CMPSS : sse12_cmp_scalar_int<VR128, f32mem, int_x86_sse_cmp_ss, - "cmp${cc}ss\t{$src, $dst|$dst, $src}">, XS; - defm Int_CMPSD : sse12_cmp_scalar_int<VR128, f64mem, int_x86_sse2_cmp_sd, - "cmp${cc}sd\t{$src, $dst|$dst, $src}">, XD; + defm Int_CMPSS : sse12_cmp_scalar_int<f32mem, SSECC, int_x86_sse_cmp_ss, + "cmp${cc}ss\t{$src, $dst|$dst, $src}", + SSE_ALU_F32S>, XS; + defm Int_CMPSD : sse12_cmp_scalar_int<f64mem, SSECC, int_x86_sse2_cmp_sd, + "cmp${cc}sd\t{$src, $dst|$dst, $src}", + SSE_ALU_F32S>, // same latency as f32 + XD; } @@ -2044,11 +2250,13 @@ multiclass sse12_ord_cmp<bits<8> opc, RegisterClass RC, SDNode OpNode, PatFrag ld_frag, string OpcodeStr, Domain d> { def rr: PI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"), - [(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))], d>; + [(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))], + IIC_SSE_COMIS_RR, d>; def rm: PI<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"), [(set EFLAGS, (OpNode (vt RC:$src1), - (ld_frag addr:$src2)))], d>; + (ld_frag addr:$src2)))], + IIC_SSE_COMIS_RM, d>; } let Defs = [EFLAGS] in { @@ -2098,89 +2306,91 @@ let Defs = [EFLAGS] in { "comisd", SSEPackedDouble>, TB, OpSize; } // Defs = [EFLAGS] -// sse12_cmp_packed - sse 1 & 2 compared packed instructions +// sse12_cmp_packed - sse 1 & 2 compare packed instructions multiclass sse12_cmp_packed<RegisterClass RC, X86MemOperand x86memop, - Intrinsic Int, string asm, string asm_alt, - Domain d> { - let isAsmParserOnly = 1 in { - def rri : PIi8<0xC2, MRMSrcReg, - (outs RC:$dst), (ins RC:$src1, RC:$src2, SSECC:$cc), asm, - [(set RC:$dst, (Int RC:$src1, RC:$src2, imm:$cc))], d>; - def rmi : PIi8<0xC2, MRMSrcMem, - (outs RC:$dst), (ins RC:$src1, f128mem:$src2, SSECC:$cc), asm, - [(set RC:$dst, (Int RC:$src1, (memop addr:$src2), imm:$cc))], d>; - } + Operand CC, Intrinsic Int, string asm, + string asm_alt, Domain d> { + def rri : PIi8<0xC2, MRMSrcReg, + (outs RC:$dst), (ins RC:$src1, RC:$src2, CC:$cc), asm, + [(set RC:$dst, (Int RC:$src1, RC:$src2, imm:$cc))], + IIC_SSE_CMPP_RR, d>; + def rmi : PIi8<0xC2, MRMSrcMem, + (outs RC:$dst), (ins RC:$src1, x86memop:$src2, CC:$cc), asm, + [(set RC:$dst, (Int RC:$src1, (memop addr:$src2), imm:$cc))], + IIC_SSE_CMPP_RM, d>; // Accept explicit immediate argument form instead of comparison code. 
- def rri_alt : PIi8<0xC2, MRMSrcReg, - (outs RC:$dst), (ins RC:$src1, RC:$src2, i8imm:$cc), - asm_alt, [], d>; - def rmi_alt : PIi8<0xC2, MRMSrcMem, - (outs RC:$dst), (ins RC:$src1, f128mem:$src2, i8imm:$cc), - asm_alt, [], d>; + let neverHasSideEffects = 1 in { + def rri_alt : PIi8<0xC2, MRMSrcReg, + (outs RC:$dst), (ins RC:$src1, RC:$src2, i8imm:$cc), + asm_alt, [], IIC_SSE_CMPP_RR, d>; + def rmi_alt : PIi8<0xC2, MRMSrcMem, + (outs RC:$dst), (ins RC:$src1, x86memop:$src2, i8imm:$cc), + asm_alt, [], IIC_SSE_CMPP_RM, d>; + } } -defm VCMPPS : sse12_cmp_packed<VR128, f128mem, int_x86_sse_cmp_ps, +defm VCMPPS : sse12_cmp_packed<VR128, f128mem, AVXCC, int_x86_sse_cmp_ps, "cmp${cc}ps\t{$src2, $src1, $dst|$dst, $src1, $src2}", "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", SSEPackedSingle>, TB, VEX_4V; -defm VCMPPD : sse12_cmp_packed<VR128, f128mem, int_x86_sse2_cmp_pd, +defm VCMPPD : sse12_cmp_packed<VR128, f128mem, AVXCC, int_x86_sse2_cmp_pd, "cmp${cc}pd\t{$src2, $src1, $dst|$dst, $src1, $src2}", "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", SSEPackedDouble>, TB, OpSize, VEX_4V; -defm VCMPPSY : sse12_cmp_packed<VR256, f256mem, int_x86_avx_cmp_ps_256, +defm VCMPPSY : sse12_cmp_packed<VR256, f256mem, AVXCC, int_x86_avx_cmp_ps_256, "cmp${cc}ps\t{$src2, $src1, $dst|$dst, $src1, $src2}", "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", SSEPackedSingle>, TB, VEX_4V; -defm VCMPPDY : sse12_cmp_packed<VR256, f256mem, int_x86_avx_cmp_pd_256, +defm VCMPPDY : sse12_cmp_packed<VR256, f256mem, AVXCC, int_x86_avx_cmp_pd_256, "cmp${cc}pd\t{$src2, $src1, $dst|$dst, $src1, $src2}", "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", SSEPackedDouble>, TB, OpSize, VEX_4V; let Constraints = "$src1 = $dst" in { - defm CMPPS : sse12_cmp_packed<VR128, f128mem, int_x86_sse_cmp_ps, + defm CMPPS : sse12_cmp_packed<VR128, f128mem, SSECC, int_x86_sse_cmp_ps, "cmp${cc}ps\t{$src2, $dst|$dst, $src2}", "cmpps\t{$cc, $src2, $dst|$dst, $src2, $cc}", SSEPackedSingle>, TB; - defm CMPPD : sse12_cmp_packed<VR128, f128mem, int_x86_sse2_cmp_pd, + defm CMPPD : sse12_cmp_packed<VR128, f128mem, SSECC, int_x86_sse2_cmp_pd, "cmp${cc}pd\t{$src2, $dst|$dst, $src2}", "cmppd\t{$cc, $src2, $dst|$dst, $src2, $cc}", SSEPackedDouble>, TB, OpSize; } -let Predicates = [HasSSE1] in { -def : Pat<(v4i32 (X86cmpps (v4f32 VR128:$src1), VR128:$src2, imm:$cc)), - (CMPPSrri (v4f32 VR128:$src1), (v4f32 VR128:$src2), imm:$cc)>; -def : Pat<(v4i32 (X86cmpps (v4f32 VR128:$src1), (memop addr:$src2), imm:$cc)), - (CMPPSrmi (v4f32 VR128:$src1), addr:$src2, imm:$cc)>; -} - -let Predicates = [HasSSE2] in { -def : Pat<(v2i64 (X86cmppd (v2f64 VR128:$src1), VR128:$src2, imm:$cc)), - (CMPPDrri VR128:$src1, VR128:$src2, imm:$cc)>; -def : Pat<(v2i64 (X86cmppd (v2f64 VR128:$src1), (memop addr:$src2), imm:$cc)), - (CMPPDrmi VR128:$src1, addr:$src2, imm:$cc)>; -} - let Predicates = [HasAVX] in { -def : Pat<(v4i32 (X86cmpps (v4f32 VR128:$src1), VR128:$src2, imm:$cc)), +def : Pat<(v4i32 (X86cmpp (v4f32 VR128:$src1), VR128:$src2, imm:$cc)), (VCMPPSrri (v4f32 VR128:$src1), (v4f32 VR128:$src2), imm:$cc)>; -def : Pat<(v4i32 (X86cmpps (v4f32 VR128:$src1), (memop addr:$src2), imm:$cc)), +def : Pat<(v4i32 (X86cmpp (v4f32 VR128:$src1), (memop addr:$src2), imm:$cc)), (VCMPPSrmi (v4f32 VR128:$src1), addr:$src2, imm:$cc)>; -def : Pat<(v2i64 (X86cmppd (v2f64 VR128:$src1), VR128:$src2, imm:$cc)), +def : Pat<(v2i64 (X86cmpp (v2f64 VR128:$src1), VR128:$src2, imm:$cc)), (VCMPPDrri VR128:$src1, VR128:$src2, imm:$cc)>; -def : Pat<(v2i64 (X86cmppd 
(v2f64 VR128:$src1), (memop addr:$src2), imm:$cc)), +def : Pat<(v2i64 (X86cmpp (v2f64 VR128:$src1), (memop addr:$src2), imm:$cc)), (VCMPPDrmi VR128:$src1, addr:$src2, imm:$cc)>; -def : Pat<(v8i32 (X86cmpps (v8f32 VR256:$src1), VR256:$src2, imm:$cc)), +def : Pat<(v8i32 (X86cmpp (v8f32 VR256:$src1), VR256:$src2, imm:$cc)), (VCMPPSYrri (v8f32 VR256:$src1), (v8f32 VR256:$src2), imm:$cc)>; -def : Pat<(v8i32 (X86cmpps (v8f32 VR256:$src1), (memop addr:$src2), imm:$cc)), +def : Pat<(v8i32 (X86cmpp (v8f32 VR256:$src1), (memop addr:$src2), imm:$cc)), (VCMPPSYrmi (v8f32 VR256:$src1), addr:$src2, imm:$cc)>; -def : Pat<(v4i64 (X86cmppd (v4f64 VR256:$src1), VR256:$src2, imm:$cc)), +def : Pat<(v4i64 (X86cmpp (v4f64 VR256:$src1), VR256:$src2, imm:$cc)), (VCMPPDYrri VR256:$src1, VR256:$src2, imm:$cc)>; -def : Pat<(v4i64 (X86cmppd (v4f64 VR256:$src1), (memop addr:$src2), imm:$cc)), +def : Pat<(v4i64 (X86cmpp (v4f64 VR256:$src1), (memop addr:$src2), imm:$cc)), (VCMPPDYrmi VR256:$src1, addr:$src2, imm:$cc)>; } +let Predicates = [HasSSE1] in { +def : Pat<(v4i32 (X86cmpp (v4f32 VR128:$src1), VR128:$src2, imm:$cc)), + (CMPPSrri (v4f32 VR128:$src1), (v4f32 VR128:$src2), imm:$cc)>; +def : Pat<(v4i32 (X86cmpp (v4f32 VR128:$src1), (memop addr:$src2), imm:$cc)), + (CMPPSrmi (v4f32 VR128:$src1), addr:$src2, imm:$cc)>; +} + +let Predicates = [HasSSE2] in { +def : Pat<(v2i64 (X86cmpp (v2f64 VR128:$src1), VR128:$src2, imm:$cc)), + (CMPPDrri VR128:$src1, VR128:$src2, imm:$cc)>; +def : Pat<(v2i64 (X86cmpp (v2f64 VR128:$src1), (memop addr:$src2), imm:$cc)), + (CMPPDrmi VR128:$src1, addr:$src2, imm:$cc)>; +} + //===----------------------------------------------------------------------===// // SSE 1 & 2 - Shuffle Instructions //===----------------------------------------------------------------------===// @@ -2190,14 +2400,14 @@ multiclass sse12_shuffle<RegisterClass RC, X86MemOperand x86memop, ValueType vt, string asm, PatFrag mem_frag, Domain d, bit IsConvertibleToThreeAddress = 0> { def rmi : PIi8<0xC6, MRMSrcMem, (outs RC:$dst), - (ins RC:$src1, f128mem:$src2, i8imm:$src3), asm, - [(set RC:$dst, (vt (shufp:$src3 - RC:$src1, (mem_frag addr:$src2))))], d>; + (ins RC:$src1, x86memop:$src2, i8imm:$src3), asm, + [(set RC:$dst, (vt (X86Shufp RC:$src1, (mem_frag addr:$src2), + (i8 imm:$src3))))], IIC_SSE_SHUFP, d>; let isConvertibleToThreeAddress = IsConvertibleToThreeAddress in def rri : PIi8<0xC6, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2, i8imm:$src3), asm, - [(set RC:$dst, - (vt (shufp:$src3 RC:$src1, RC:$src2)))], d>; + [(set RC:$dst, (vt (X86Shufp RC:$src1, RC:$src2, + (i8 imm:$src3))))], IIC_SSE_SHUFP, d>; } defm VSHUFPS : sse12_shuffle<VR128, f128mem, v4f32, @@ -2220,133 +2430,52 @@ let Constraints = "$src1 = $dst" in { TB; defm SHUFPD : sse12_shuffle<VR128, f128mem, v2f64, "shufpd\t{$src3, $src2, $dst|$dst, $src2, $src3}", - memopv2f64, SSEPackedDouble>, TB, OpSize; -} - -let Predicates = [HasSSE1] in { - def : Pat<(v4f32 (X86Shufps VR128:$src1, - (memopv4f32 addr:$src2), (i8 imm:$imm))), - (SHUFPSrmi VR128:$src1, addr:$src2, imm:$imm)>; - def : Pat<(v4f32 (X86Shufps VR128:$src1, VR128:$src2, (i8 imm:$imm))), - (SHUFPSrri VR128:$src1, VR128:$src2, imm:$imm)>; - def : Pat<(v4i32 (X86Shufps VR128:$src1, - (bc_v4i32 (memopv2i64 addr:$src2)), (i8 imm:$imm))), - (SHUFPSrmi VR128:$src1, addr:$src2, imm:$imm)>; - def : Pat<(v4i32 (X86Shufps VR128:$src1, VR128:$src2, (i8 imm:$imm))), - (SHUFPSrri VR128:$src1, VR128:$src2, imm:$imm)>; - // vector_shuffle v1, v2 <4, 5, 2, 3> using SHUFPSrri (we prefer movsd, but - // 
fall back to this for SSE1) - def : Pat<(v4f32 (movlp:$src3 VR128:$src1, (v4f32 VR128:$src2))), - (SHUFPSrri VR128:$src2, VR128:$src1, - (SHUFFLE_get_shuf_imm VR128:$src3))>; - // Special unary SHUFPSrri case. - def : Pat<(v4f32 (pshufd:$src3 VR128:$src1, (undef))), - (SHUFPSrri VR128:$src1, VR128:$src1, - (SHUFFLE_get_shuf_imm VR128:$src3))>; -} - -let Predicates = [HasSSE2] in { - // Special binary v4i32 shuffle cases with SHUFPS. - def : Pat<(v4i32 (shufp:$src3 VR128:$src1, (v4i32 VR128:$src2))), - (SHUFPSrri VR128:$src1, VR128:$src2, - (SHUFFLE_get_shuf_imm VR128:$src3))>; - def : Pat<(v4i32 (shufp:$src3 VR128:$src1, - (bc_v4i32 (memopv2i64 addr:$src2)))), - (SHUFPSrmi VR128:$src1, addr:$src2, - (SHUFFLE_get_shuf_imm VR128:$src3))>; - // Special unary SHUFPDrri cases. - def : Pat<(v2i64 (pshufd:$src3 VR128:$src1, (undef))), - (SHUFPDrri VR128:$src1, VR128:$src1, - (SHUFFLE_get_shuf_imm VR128:$src3))>; - def : Pat<(v2f64 (pshufd:$src3 VR128:$src1, (undef))), - (SHUFPDrri VR128:$src1, VR128:$src1, - (SHUFFLE_get_shuf_imm VR128:$src3))>; - // Special binary v2i64 shuffle cases using SHUFPDrri. - def : Pat<(v2i64 (shufp:$src3 VR128:$src1, VR128:$src2)), - (SHUFPDrri VR128:$src1, VR128:$src2, - (SHUFFLE_get_shuf_imm VR128:$src3))>; - // Generic SHUFPD patterns - def : Pat<(v2f64 (X86Shufps VR128:$src1, - (memopv2f64 addr:$src2), (i8 imm:$imm))), - (SHUFPDrmi VR128:$src1, addr:$src2, imm:$imm)>; - def : Pat<(v2i64 (X86Shufpd VR128:$src1, VR128:$src2, (i8 imm:$imm))), - (SHUFPDrri VR128:$src1, VR128:$src2, imm:$imm)>; - def : Pat<(v2f64 (X86Shufpd VR128:$src1, VR128:$src2, (i8 imm:$imm))), - (SHUFPDrri VR128:$src1, VR128:$src2, imm:$imm)>; + memopv2f64, SSEPackedDouble, 1 /* cvt to pshufd */>, + TB, OpSize; } let Predicates = [HasAVX] in { - def : Pat<(v4f32 (X86Shufps VR128:$src1, - (memopv4f32 addr:$src2), (i8 imm:$imm))), - (VSHUFPSrmi VR128:$src1, addr:$src2, imm:$imm)>; - def : Pat<(v4f32 (X86Shufps VR128:$src1, VR128:$src2, (i8 imm:$imm))), - (VSHUFPSrri VR128:$src1, VR128:$src2, imm:$imm)>; - def : Pat<(v4i32 (X86Shufps VR128:$src1, + def : Pat<(v4i32 (X86Shufp VR128:$src1, (bc_v4i32 (memopv2i64 addr:$src2)), (i8 imm:$imm))), (VSHUFPSrmi VR128:$src1, addr:$src2, imm:$imm)>; - def : Pat<(v4i32 (X86Shufps VR128:$src1, VR128:$src2, (i8 imm:$imm))), + def : Pat<(v4i32 (X86Shufp VR128:$src1, VR128:$src2, (i8 imm:$imm))), (VSHUFPSrri VR128:$src1, VR128:$src2, imm:$imm)>; - // vector_shuffle v1, v2 <4, 5, 2, 3> using SHUFPSrri (we prefer movsd, but - // fall back to this for SSE1) - def : Pat<(v4f32 (movlp:$src3 VR128:$src1, (v4f32 VR128:$src2))), - (VSHUFPSrri VR128:$src2, VR128:$src1, - (SHUFFLE_get_shuf_imm VR128:$src3))>; - // Special unary SHUFPSrri case. - def : Pat<(v4f32 (pshufd:$src3 VR128:$src1, (undef))), - (VSHUFPSrri VR128:$src1, VR128:$src1, - (SHUFFLE_get_shuf_imm VR128:$src3))>; - // Special binary v4i32 shuffle cases with SHUFPS. - def : Pat<(v4i32 (shufp:$src3 VR128:$src1, (v4i32 VR128:$src2))), - (VSHUFPSrri VR128:$src1, VR128:$src2, - (SHUFFLE_get_shuf_imm VR128:$src3))>; - def : Pat<(v4i32 (shufp:$src3 VR128:$src1, - (bc_v4i32 (memopv2i64 addr:$src2)))), - (VSHUFPSrmi VR128:$src1, addr:$src2, - (SHUFFLE_get_shuf_imm VR128:$src3))>; - // Special unary SHUFPDrri cases. 
- def : Pat<(v2i64 (pshufd:$src3 VR128:$src1, (undef))), - (VSHUFPDrri VR128:$src1, VR128:$src1, - (SHUFFLE_get_shuf_imm VR128:$src3))>; - def : Pat<(v2f64 (pshufd:$src3 VR128:$src1, (undef))), - (VSHUFPDrri VR128:$src1, VR128:$src1, - (SHUFFLE_get_shuf_imm VR128:$src3))>; - // Special binary v2i64 shuffle cases using SHUFPDrri. - def : Pat<(v2i64 (shufp:$src3 VR128:$src1, VR128:$src2)), - (VSHUFPDrri VR128:$src1, VR128:$src2, - (SHUFFLE_get_shuf_imm VR128:$src3))>; - - def : Pat<(v2f64 (X86Shufps VR128:$src1, - (memopv2f64 addr:$src2), (i8 imm:$imm))), + + def : Pat<(v2i64 (X86Shufp VR128:$src1, + (memopv2i64 addr:$src2), (i8 imm:$imm))), (VSHUFPDrmi VR128:$src1, addr:$src2, imm:$imm)>; - def : Pat<(v2i64 (X86Shufpd VR128:$src1, VR128:$src2, (i8 imm:$imm))), - (VSHUFPDrri VR128:$src1, VR128:$src2, imm:$imm)>; - def : Pat<(v2f64 (X86Shufpd VR128:$src1, VR128:$src2, (i8 imm:$imm))), + def : Pat<(v2i64 (X86Shufp VR128:$src1, VR128:$src2, (i8 imm:$imm))), (VSHUFPDrri VR128:$src1, VR128:$src2, imm:$imm)>; // 256-bit patterns - def : Pat<(v8i32 (X86Shufps VR256:$src1, VR256:$src2, (i8 imm:$imm))), + def : Pat<(v8i32 (X86Shufp VR256:$src1, VR256:$src2, (i8 imm:$imm))), (VSHUFPSYrri VR256:$src1, VR256:$src2, imm:$imm)>; - def : Pat<(v8i32 (X86Shufps VR256:$src1, + def : Pat<(v8i32 (X86Shufp VR256:$src1, (bc_v8i32 (memopv4i64 addr:$src2)), (i8 imm:$imm))), (VSHUFPSYrmi VR256:$src1, addr:$src2, imm:$imm)>; - def : Pat<(v8f32 (X86Shufps VR256:$src1, VR256:$src2, (i8 imm:$imm))), - (VSHUFPSYrri VR256:$src1, VR256:$src2, imm:$imm)>; - def : Pat<(v8f32 (X86Shufps VR256:$src1, - (memopv8f32 addr:$src2), (i8 imm:$imm))), - (VSHUFPSYrmi VR256:$src1, addr:$src2, imm:$imm)>; - - def : Pat<(v4i64 (X86Shufpd VR256:$src1, VR256:$src2, (i8 imm:$imm))), + def : Pat<(v4i64 (X86Shufp VR256:$src1, VR256:$src2, (i8 imm:$imm))), (VSHUFPDYrri VR256:$src1, VR256:$src2, imm:$imm)>; - def : Pat<(v4i64 (X86Shufpd VR256:$src1, + def : Pat<(v4i64 (X86Shufp VR256:$src1, (memopv4i64 addr:$src2), (i8 imm:$imm))), (VSHUFPDYrmi VR256:$src1, addr:$src2, imm:$imm)>; +} - def : Pat<(v4f64 (X86Shufpd VR256:$src1, VR256:$src2, (i8 imm:$imm))), - (VSHUFPDYrri VR256:$src1, VR256:$src2, imm:$imm)>; - def : Pat<(v4f64 (X86Shufpd VR256:$src1, - (memopv4f64 addr:$src2), (i8 imm:$imm))), - (VSHUFPDYrmi VR256:$src1, addr:$src2, imm:$imm)>; +let Predicates = [HasSSE1] in { + def : Pat<(v4i32 (X86Shufp VR128:$src1, + (bc_v4i32 (memopv2i64 addr:$src2)), (i8 imm:$imm))), + (SHUFPSrmi VR128:$src1, addr:$src2, imm:$imm)>; + def : Pat<(v4i32 (X86Shufp VR128:$src1, VR128:$src2, (i8 imm:$imm))), + (SHUFPSrri VR128:$src1, VR128:$src2, imm:$imm)>; +} + +let Predicates = [HasSSE2] in { + // Generic SHUFPD patterns + def : Pat<(v2i64 (X86Shufp VR128:$src1, + (memopv2i64 addr:$src2), (i8 imm:$imm))), + (SHUFPDrmi VR128:$src1, addr:$src2, imm:$imm)>; + def : Pat<(v2i64 (X86Shufp VR128:$src1, VR128:$src2, (i8 imm:$imm))), + (SHUFPDrri VR128:$src1, VR128:$src2, imm:$imm)>; } //===----------------------------------------------------------------------===// @@ -2354,159 +2483,80 @@ let Predicates = [HasAVX] in { //===----------------------------------------------------------------------===// /// sse12_unpack_interleave - sse 1 & 2 unpack and interleave -multiclass sse12_unpack_interleave<bits<8> opc, PatFrag OpNode, ValueType vt, +multiclass sse12_unpack_interleave<bits<8> opc, SDNode OpNode, ValueType vt, PatFrag mem_frag, RegisterClass RC, X86MemOperand x86memop, string asm, Domain d> { def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), 
asm, [(set RC:$dst, - (vt (OpNode RC:$src1, RC:$src2)))], d>; + (vt (OpNode RC:$src1, RC:$src2)))], + IIC_SSE_UNPCK, d>; def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2), asm, [(set RC:$dst, (vt (OpNode RC:$src1, - (mem_frag addr:$src2))))], d>; -} - -let AddedComplexity = 10 in { - defm VUNPCKHPS: sse12_unpack_interleave<0x15, unpckh, v4f32, memopv4f32, - VR128, f128mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}", - SSEPackedSingle>, TB, VEX_4V; - defm VUNPCKHPD: sse12_unpack_interleave<0x15, unpckh, v2f64, memopv2f64, - VR128, f128mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", - SSEPackedDouble>, TB, OpSize, VEX_4V; - defm VUNPCKLPS: sse12_unpack_interleave<0x14, unpckl, v4f32, memopv4f32, - VR128, f128mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}", - SSEPackedSingle>, TB, VEX_4V; - defm VUNPCKLPD: sse12_unpack_interleave<0x14, unpckl, v2f64, memopv2f64, - VR128, f128mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", - SSEPackedDouble>, TB, OpSize, VEX_4V; - - defm VUNPCKHPSY: sse12_unpack_interleave<0x15, unpckh, v8f32, memopv8f32, - VR256, f256mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}", - SSEPackedSingle>, TB, VEX_4V; - defm VUNPCKHPDY: sse12_unpack_interleave<0x15, unpckh, v4f64, memopv4f64, - VR256, f256mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", - SSEPackedDouble>, TB, OpSize, VEX_4V; - defm VUNPCKLPSY: sse12_unpack_interleave<0x14, unpckl, v8f32, memopv8f32, - VR256, f256mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}", - SSEPackedSingle>, TB, VEX_4V; - defm VUNPCKLPDY: sse12_unpack_interleave<0x14, unpckl, v4f64, memopv4f64, - VR256, f256mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", - SSEPackedDouble>, TB, OpSize, VEX_4V; - - let Constraints = "$src1 = $dst" in { - defm UNPCKHPS: sse12_unpack_interleave<0x15, unpckh, v4f32, memopv4f32, - VR128, f128mem, "unpckhps\t{$src2, $dst|$dst, $src2}", - SSEPackedSingle>, TB; - defm UNPCKHPD: sse12_unpack_interleave<0x15, unpckh, v2f64, memopv2f64, - VR128, f128mem, "unpckhpd\t{$src2, $dst|$dst, $src2}", - SSEPackedDouble>, TB, OpSize; - defm UNPCKLPS: sse12_unpack_interleave<0x14, unpckl, v4f32, memopv4f32, - VR128, f128mem, "unpcklps\t{$src2, $dst|$dst, $src2}", - SSEPackedSingle>, TB; - defm UNPCKLPD: sse12_unpack_interleave<0x14, unpckl, v2f64, memopv2f64, - VR128, f128mem, "unpcklpd\t{$src2, $dst|$dst, $src2}", - SSEPackedDouble>, TB, OpSize; - } // Constraints = "$src1 = $dst" -} // AddedComplexity + (mem_frag addr:$src2))))], + IIC_SSE_UNPCK, d>; +} + +defm VUNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, memopv4f32, + VR128, f128mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}", + SSEPackedSingle>, TB, VEX_4V; +defm VUNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, memopv2f64, + VR128, f128mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", + SSEPackedDouble>, TB, OpSize, VEX_4V; +defm VUNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, memopv4f32, + VR128, f128mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}", + SSEPackedSingle>, TB, VEX_4V; +defm VUNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, memopv2f64, + VR128, f128mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", + SSEPackedDouble>, TB, OpSize, VEX_4V; + +defm VUNPCKHPSY: sse12_unpack_interleave<0x15, X86Unpckh, v8f32, memopv8f32, + VR256, f256mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}", + SSEPackedSingle>, TB, VEX_4V; +defm VUNPCKHPDY: sse12_unpack_interleave<0x15, X86Unpckh, v4f64, 
memopv4f64, + VR256, f256mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", + SSEPackedDouble>, TB, OpSize, VEX_4V; +defm VUNPCKLPSY: sse12_unpack_interleave<0x14, X86Unpckl, v8f32, memopv8f32, + VR256, f256mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}", + SSEPackedSingle>, TB, VEX_4V; +defm VUNPCKLPDY: sse12_unpack_interleave<0x14, X86Unpckl, v4f64, memopv4f64, + VR256, f256mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", + SSEPackedDouble>, TB, OpSize, VEX_4V; -let Predicates = [HasSSE1] in { - def : Pat<(v4f32 (X86Unpcklps VR128:$src1, (memopv4f32 addr:$src2))), - (UNPCKLPSrm VR128:$src1, addr:$src2)>; - def : Pat<(v4f32 (X86Unpcklps VR128:$src1, VR128:$src2)), - (UNPCKLPSrr VR128:$src1, VR128:$src2)>; - def : Pat<(v4f32 (X86Unpckhps VR128:$src1, (memopv4f32 addr:$src2))), - (UNPCKHPSrm VR128:$src1, addr:$src2)>; - def : Pat<(v4f32 (X86Unpckhps VR128:$src1, VR128:$src2)), - (UNPCKHPSrr VR128:$src1, VR128:$src2)>; -} +let Constraints = "$src1 = $dst" in { + defm UNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, memopv4f32, + VR128, f128mem, "unpckhps\t{$src2, $dst|$dst, $src2}", + SSEPackedSingle>, TB; + defm UNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, memopv2f64, + VR128, f128mem, "unpckhpd\t{$src2, $dst|$dst, $src2}", + SSEPackedDouble>, TB, OpSize; + defm UNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, memopv4f32, + VR128, f128mem, "unpcklps\t{$src2, $dst|$dst, $src2}", + SSEPackedSingle>, TB; + defm UNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, memopv2f64, + VR128, f128mem, "unpcklpd\t{$src2, $dst|$dst, $src2}", + SSEPackedDouble>, TB, OpSize; +} // Constraints = "$src1 = $dst" -let Predicates = [HasSSE2] in { - def : Pat<(v2f64 (X86Unpcklpd VR128:$src1, (memopv2f64 addr:$src2))), - (UNPCKLPDrm VR128:$src1, addr:$src2)>; - def : Pat<(v2f64 (X86Unpcklpd VR128:$src1, VR128:$src2)), - (UNPCKLPDrr VR128:$src1, VR128:$src2)>; - def : Pat<(v2f64 (X86Unpckhpd VR128:$src1, (memopv2f64 addr:$src2))), - (UNPCKHPDrm VR128:$src1, addr:$src2)>; - def : Pat<(v2f64 (X86Unpckhpd VR128:$src1, VR128:$src2)), - (UNPCKHPDrr VR128:$src1, VR128:$src2)>; - - // FIXME: Instead of X86Movddup, there should be a X86Unpcklpd here, the +let Predicates = [HasAVX], AddedComplexity = 1 in { + // FIXME: Instead of X86Movddup, there should be a X86Unpckl here, the // problem is during lowering, where it's not possible to recognize the load // fold cause it has two uses through a bitcast. One use disappears at isel // time and the fold opportunity reappears. 
def : Pat<(v2f64 (X86Movddup VR128:$src)), - (UNPCKLPDrr VR128:$src, VR128:$src)>; - - let AddedComplexity = 10 in - def : Pat<(splat_lo (v2f64 VR128:$src), (undef)), - (UNPCKLPDrr VR128:$src, VR128:$src)>; + (VUNPCKLPDrr VR128:$src, VR128:$src)>; } -let Predicates = [HasAVX] in { - def : Pat<(v4f32 (X86Unpcklps VR128:$src1, (memopv4f32 addr:$src2))), - (VUNPCKLPSrm VR128:$src1, addr:$src2)>; - def : Pat<(v4f32 (X86Unpcklps VR128:$src1, VR128:$src2)), - (VUNPCKLPSrr VR128:$src1, VR128:$src2)>; - def : Pat<(v4f32 (X86Unpckhps VR128:$src1, (memopv4f32 addr:$src2))), - (VUNPCKHPSrm VR128:$src1, addr:$src2)>; - def : Pat<(v4f32 (X86Unpckhps VR128:$src1, VR128:$src2)), - (VUNPCKHPSrr VR128:$src1, VR128:$src2)>; - - def : Pat<(v8f32 (X86Unpcklpsy VR256:$src1, (memopv8f32 addr:$src2))), - (VUNPCKLPSYrm VR256:$src1, addr:$src2)>; - def : Pat<(v8f32 (X86Unpcklpsy VR256:$src1, VR256:$src2)), - (VUNPCKLPSYrr VR256:$src1, VR256:$src2)>; - def : Pat<(v8i32 (X86Unpcklpsy VR256:$src1, VR256:$src2)), - (VUNPCKLPSYrr VR256:$src1, VR256:$src2)>; - def : Pat<(v8i32 (X86Unpcklpsy VR256:$src1, (memopv8i32 addr:$src2))), - (VUNPCKLPSYrm VR256:$src1, addr:$src2)>; - def : Pat<(v8f32 (X86Unpckhpsy VR256:$src1, (memopv8f32 addr:$src2))), - (VUNPCKHPSYrm VR256:$src1, addr:$src2)>; - def : Pat<(v8f32 (X86Unpckhpsy VR256:$src1, VR256:$src2)), - (VUNPCKHPSYrr VR256:$src1, VR256:$src2)>; - def : Pat<(v8i32 (X86Unpckhpsy VR256:$src1, (memopv8i32 addr:$src2))), - (VUNPCKHPSYrm VR256:$src1, addr:$src2)>; - def : Pat<(v8i32 (X86Unpckhpsy VR256:$src1, VR256:$src2)), - (VUNPCKHPSYrr VR256:$src1, VR256:$src2)>; - - def : Pat<(v2f64 (X86Unpcklpd VR128:$src1, (memopv2f64 addr:$src2))), - (VUNPCKLPDrm VR128:$src1, addr:$src2)>; - def : Pat<(v2f64 (X86Unpcklpd VR128:$src1, VR128:$src2)), - (VUNPCKLPDrr VR128:$src1, VR128:$src2)>; - def : Pat<(v2f64 (X86Unpckhpd VR128:$src1, (memopv2f64 addr:$src2))), - (VUNPCKHPDrm VR128:$src1, addr:$src2)>; - def : Pat<(v2f64 (X86Unpckhpd VR128:$src1, VR128:$src2)), - (VUNPCKHPDrr VR128:$src1, VR128:$src2)>; - - def : Pat<(v4f64 (X86Unpcklpdy VR256:$src1, (memopv4f64 addr:$src2))), - (VUNPCKLPDYrm VR256:$src1, addr:$src2)>; - def : Pat<(v4f64 (X86Unpcklpdy VR256:$src1, VR256:$src2)), - (VUNPCKLPDYrr VR256:$src1, VR256:$src2)>; - def : Pat<(v4i64 (X86Unpcklpdy VR256:$src1, (memopv4i64 addr:$src2))), - (VUNPCKLPDYrm VR256:$src1, addr:$src2)>; - def : Pat<(v4i64 (X86Unpcklpdy VR256:$src1, VR256:$src2)), - (VUNPCKLPDYrr VR256:$src1, VR256:$src2)>; - def : Pat<(v4f64 (X86Unpckhpdy VR256:$src1, (memopv4f64 addr:$src2))), - (VUNPCKHPDYrm VR256:$src1, addr:$src2)>; - def : Pat<(v4f64 (X86Unpckhpdy VR256:$src1, VR256:$src2)), - (VUNPCKHPDYrr VR256:$src1, VR256:$src2)>; - def : Pat<(v4i64 (X86Unpckhpdy VR256:$src1, (memopv4i64 addr:$src2))), - (VUNPCKHPDYrm VR256:$src1, addr:$src2)>; - def : Pat<(v4i64 (X86Unpckhpdy VR256:$src1, VR256:$src2)), - (VUNPCKHPDYrr VR256:$src1, VR256:$src2)>; - - // FIXME: Instead of X86Movddup, there should be a X86Unpcklpd here, the +let Predicates = [HasSSE2] in { + // FIXME: Instead of X86Movddup, there should be a X86Unpckl here, the // problem is during lowering, where it's not possible to recognize the load // fold cause it has two uses through a bitcast. One use disappears at isel // time and the fold opportunity reappears. 
def : Pat<(v2f64 (X86Movddup VR128:$src)), - (VUNPCKLPDrr VR128:$src, VR128:$src)>; - let AddedComplexity = 10 in - def : Pat<(splat_lo (v2f64 VR128:$src), (undef)), - (VUNPCKLPDrr VR128:$src, VR128:$src)>; + (UNPCKLPDrr VR128:$src, VR128:$src)>; } //===----------------------------------------------------------------------===// @@ -2518,29 +2568,12 @@ multiclass sse12_extr_sign_mask<RegisterClass RC, Intrinsic Int, string asm, Domain d> { def rr32 : PI<0x50, MRMSrcReg, (outs GR32:$dst), (ins RC:$src), !strconcat(asm, "\t{$src, $dst|$dst, $src}"), - [(set GR32:$dst, (Int RC:$src))], d>; + [(set GR32:$dst, (Int RC:$src))], IIC_SSE_MOVMSK, d>; def rr64 : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins RC:$src), - !strconcat(asm, "\t{$src, $dst|$dst, $src}"), [], d>, REX_W; + !strconcat(asm, "\t{$src, $dst|$dst, $src}"), [], + IIC_SSE_MOVMSK, d>, REX_W; } -defm MOVMSKPS : sse12_extr_sign_mask<VR128, int_x86_sse_movmsk_ps, "movmskps", - SSEPackedSingle>, TB; -defm MOVMSKPD : sse12_extr_sign_mask<VR128, int_x86_sse2_movmsk_pd, "movmskpd", - SSEPackedDouble>, TB, OpSize; - -def : Pat<(i32 (X86fgetsign FR32:$src)), - (MOVMSKPSrr32 (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FR32:$src, - sub_ss))>, Requires<[HasSSE1]>; -def : Pat<(i64 (X86fgetsign FR32:$src)), - (MOVMSKPSrr64 (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FR32:$src, - sub_ss))>, Requires<[HasSSE1]>; -def : Pat<(i32 (X86fgetsign FR64:$src)), - (MOVMSKPDrr32 (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FR64:$src, - sub_sd))>, Requires<[HasSSE2]>; -def : Pat<(i64 (X86fgetsign FR64:$src)), - (MOVMSKPDrr64 (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FR64:$src, - sub_sd))>, Requires<[HasSSE2]>; - let Predicates = [HasAVX] in { defm VMOVMSKPS : sse12_extr_sign_mask<VR128, int_x86_sse_movmsk_ps, "movmskps", SSEPackedSingle>, TB, VEX; @@ -2568,17 +2601,105 @@ let Predicates = [HasAVX] in { // Assembler Only def VMOVMSKPSr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR128:$src), - "movmskps\t{$src, $dst|$dst, $src}", [], SSEPackedSingle>, TB, VEX; + "movmskps\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVMSK, + SSEPackedSingle>, TB, VEX; def VMOVMSKPDr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR128:$src), - "movmskpd\t{$src, $dst|$dst, $src}", [], SSEPackedDouble>, TB, + "movmskpd\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVMSK, + SSEPackedDouble>, TB, OpSize, VEX; def VMOVMSKPSYr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR256:$src), - "movmskps\t{$src, $dst|$dst, $src}", [], SSEPackedSingle>, TB, VEX; + "movmskps\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVMSK, + SSEPackedSingle>, TB, VEX; def VMOVMSKPDYr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR256:$src), - "movmskpd\t{$src, $dst|$dst, $src}", [], SSEPackedDouble>, TB, + "movmskpd\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVMSK, + SSEPackedDouble>, TB, OpSize, VEX; } +defm MOVMSKPS : sse12_extr_sign_mask<VR128, int_x86_sse_movmsk_ps, "movmskps", + SSEPackedSingle>, TB; +defm MOVMSKPD : sse12_extr_sign_mask<VR128, int_x86_sse2_movmsk_pd, "movmskpd", + SSEPackedDouble>, TB, OpSize; + +def : Pat<(i32 (X86fgetsign FR32:$src)), + (MOVMSKPSrr32 (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FR32:$src, + sub_ss))>, Requires<[HasSSE1]>; +def : Pat<(i64 (X86fgetsign FR32:$src)), + (MOVMSKPSrr64 (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FR32:$src, + sub_ss))>, Requires<[HasSSE1]>; +def : Pat<(i32 (X86fgetsign FR64:$src)), + (MOVMSKPDrr32 (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FR64:$src, + sub_sd))>, Requires<[HasSSE2]>; +def : Pat<(i64 (X86fgetsign FR64:$src)), + (MOVMSKPDrr64 (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), 
FR64:$src, + sub_sd))>, Requires<[HasSSE2]>; + +//===---------------------------------------------------------------------===// +// SSE2 - Packed Integer Logical Instructions +//===---------------------------------------------------------------------===// + +let ExeDomain = SSEPackedInt in { // SSE integer instructions + +/// PDI_binop_rm - Simple SSE2 binary operator. +multiclass PDI_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, + ValueType OpVT, RegisterClass RC, PatFrag memop_frag, + X86MemOperand x86memop, + OpndItins itins, + bit IsCommutable = 0, + bit Is2Addr = 1> { + let isCommutable = IsCommutable in + def rr : PDI<opc, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))], itins.rr>; + def rm : PDI<opc, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, x86memop:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (OpVT (OpNode RC:$src1, + (bitconvert (memop_frag addr:$src2)))))], + itins.rm>; +} +} // ExeDomain = SSEPackedInt + +// These are ordered here for pattern ordering requirements with the fp versions + +let Predicates = [HasAVX] in { +defm VPAND : PDI_binop_rm<0xDB, "vpand", and, v2i64, VR128, memopv2i64, + i128mem, SSE_BIT_ITINS_P, 1, 0>, VEX_4V; +defm VPOR : PDI_binop_rm<0xEB, "vpor" , or, v2i64, VR128, memopv2i64, + i128mem, SSE_BIT_ITINS_P, 1, 0>, VEX_4V; +defm VPXOR : PDI_binop_rm<0xEF, "vpxor", xor, v2i64, VR128, memopv2i64, + i128mem, SSE_BIT_ITINS_P, 1, 0>, VEX_4V; +defm VPANDN : PDI_binop_rm<0xDF, "vpandn", X86andnp, v2i64, VR128, memopv2i64, + i128mem, SSE_BIT_ITINS_P, 0, 0>, VEX_4V; +} + +let Constraints = "$src1 = $dst" in { +defm PAND : PDI_binop_rm<0xDB, "pand", and, v2i64, VR128, memopv2i64, + i128mem, SSE_BIT_ITINS_P, 1>; +defm POR : PDI_binop_rm<0xEB, "por" , or, v2i64, VR128, memopv2i64, + i128mem, SSE_BIT_ITINS_P, 1>; +defm PXOR : PDI_binop_rm<0xEF, "pxor", xor, v2i64, VR128, memopv2i64, + i128mem, SSE_BIT_ITINS_P, 1>; +defm PANDN : PDI_binop_rm<0xDF, "pandn", X86andnp, v2i64, VR128, memopv2i64, + i128mem, SSE_BIT_ITINS_P, 0>; +} // Constraints = "$src1 = $dst" + +let Predicates = [HasAVX2] in { +defm VPANDY : PDI_binop_rm<0xDB, "vpand", and, v4i64, VR256, memopv4i64, + i256mem, SSE_BIT_ITINS_P, 1, 0>, VEX_4V; +defm VPORY : PDI_binop_rm<0xEB, "vpor", or, v4i64, VR256, memopv4i64, + i256mem, SSE_BIT_ITINS_P, 1, 0>, VEX_4V; +defm VPXORY : PDI_binop_rm<0xEF, "vpxor", xor, v4i64, VR256, memopv4i64, + i256mem, SSE_BIT_ITINS_P, 1, 0>, VEX_4V; +defm VPANDNY : PDI_binop_rm<0xDF, "vpandn", X86andnp, v4i64, VR256, memopv4i64, + i256mem, SSE_BIT_ITINS_P, 0, 0>, VEX_4V; +} + //===----------------------------------------------------------------------===// // SSE 1 & 2 - Logical Instructions //===----------------------------------------------------------------------===// @@ -2586,31 +2707,39 @@ let Predicates = [HasAVX] in { /// sse12_fp_alias_pack_logical - SSE 1 & 2 aliased packed FP logical ops /// multiclass sse12_fp_alias_pack_logical<bits<8> opc, string OpcodeStr, - SDNode OpNode> { + SDNode OpNode, OpndItins itins> { defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, - FR32, f32, f128mem, memopfsf32, SSEPackedSingle, 0>, TB, VEX_4V; + FR32, f32, f128mem, memopfsf32, SSEPackedSingle, itins, 0>, + TB, VEX_4V; defm V#NAME#PD : 
sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, - FR64, f64, f128mem, memopfsf64, SSEPackedDouble, 0>, TB, OpSize, VEX_4V; + FR64, f64, f128mem, memopfsf64, SSEPackedDouble, itins, 0>, + TB, OpSize, VEX_4V; let Constraints = "$src1 = $dst" in { defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, FR32, - f32, f128mem, memopfsf32, SSEPackedSingle>, TB; + f32, f128mem, memopfsf32, SSEPackedSingle, itins>, + TB; defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, FR64, - f64, f128mem, memopfsf64, SSEPackedDouble>, TB, OpSize; + f64, f128mem, memopfsf64, SSEPackedDouble, itins>, + TB, OpSize; } } // Alias bitwise logical operations using SSE logical ops on packed FP values. let mayLoad = 0 in { - defm FsAND : sse12_fp_alias_pack_logical<0x54, "and", X86fand>; - defm FsOR : sse12_fp_alias_pack_logical<0x56, "or", X86for>; - defm FsXOR : sse12_fp_alias_pack_logical<0x57, "xor", X86fxor>; + defm FsAND : sse12_fp_alias_pack_logical<0x54, "and", X86fand, + SSE_BIT_ITINS_P>; + defm FsOR : sse12_fp_alias_pack_logical<0x56, "or", X86for, + SSE_BIT_ITINS_P>; + defm FsXOR : sse12_fp_alias_pack_logical<0x57, "xor", X86fxor, + SSE_BIT_ITINS_P>; } let neverHasSideEffects = 1, Pattern = []<dag>, isCommutable = 0 in - defm FsANDN : sse12_fp_alias_pack_logical<0x55, "andn", undef>; + defm FsANDN : sse12_fp_alias_pack_logical<0x55, "andn", undef, + SSE_BIT_ITINS_P>; /// sse12_fp_packed_logical - SSE 1 & 2 packed FP logical ops /// @@ -2623,7 +2752,7 @@ multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr, defm V#NAME#PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle, !strconcat(OpcodeStr, "ps"), f128mem, [], [(set VR128:$dst, (OpNode (bc_v2i64 (v4f32 VR128:$src1)), - (memopv2i64 addr:$src2)))], 0>, TB, VEX_4V; + (memopv2i64 addr:$src2)))], 0, 1>, TB, VEX_4V; defm V#NAME#PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble, !strconcat(OpcodeStr, "pd"), f128mem, @@ -2697,118 +2826,145 @@ let isCommutable = 0 in /// FIXME: once all 256-bit intrinsics are matched, cleanup and refactor those /// classes below multiclass basic_sse12_fp_binop_s<bits<8> opc, string OpcodeStr, SDNode OpNode, + SizeItins itins, bit Is2Addr = 1> { defm SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"), - OpNode, FR32, f32mem, Is2Addr>, XS; + OpNode, FR32, f32mem, + itins.s, Is2Addr>, XS; defm SD : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"), - OpNode, FR64, f64mem, Is2Addr>, XD; + OpNode, FR64, f64mem, + itins.d, Is2Addr>, XD; } multiclass basic_sse12_fp_binop_p<bits<8> opc, string OpcodeStr, SDNode OpNode, + SizeItins itins, bit Is2Addr = 1> { let mayLoad = 0 in { defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR128, - v4f32, f128mem, memopv4f32, SSEPackedSingle, Is2Addr>, TB; + v4f32, f128mem, memopv4f32, SSEPackedSingle, itins.s, Is2Addr>, + TB; defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR128, - v2f64, f128mem, memopv2f64, SSEPackedDouble, Is2Addr>, TB, OpSize; + v2f64, f128mem, memopv2f64, SSEPackedDouble, itins.d, Is2Addr>, + TB, OpSize; } } multiclass basic_sse12_fp_binop_p_y<bits<8> opc, string OpcodeStr, - SDNode OpNode> { + SDNode OpNode, + SizeItins itins> { let mayLoad = 0 in { defm PSY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR256, - v8f32, f256mem, memopv8f32, SSEPackedSingle, 0>, TB; + v8f32, f256mem, memopv8f32, SSEPackedSingle, itins.s, 0>, + TB; defm PDY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR256, - v4f64, f256mem, memopv4f64, SSEPackedDouble, 0>, TB, 
OpSize; + v4f64, f256mem, memopv4f64, SSEPackedDouble, itins.d, 0>, + TB, OpSize; } } multiclass basic_sse12_fp_binop_s_int<bits<8> opc, string OpcodeStr, + SizeItins itins, bit Is2Addr = 1> { defm SS : sse12_fp_scalar_int<opc, OpcodeStr, VR128, - !strconcat(OpcodeStr, "ss"), "", "_ss", ssmem, sse_load_f32, Is2Addr>, XS; + !strconcat(OpcodeStr, "ss"), "", "_ss", ssmem, sse_load_f32, + itins.s, Is2Addr>, XS; defm SD : sse12_fp_scalar_int<opc, OpcodeStr, VR128, - !strconcat(OpcodeStr, "sd"), "2", "_sd", sdmem, sse_load_f64, Is2Addr>, XD; + !strconcat(OpcodeStr, "sd"), "2", "_sd", sdmem, sse_load_f64, + itins.d, Is2Addr>, XD; } multiclass basic_sse12_fp_binop_p_int<bits<8> opc, string OpcodeStr, + SizeItins itins, bit Is2Addr = 1> { defm PS : sse12_fp_packed_int<opc, OpcodeStr, VR128, !strconcat(OpcodeStr, "ps"), "sse", "_ps", f128mem, memopv4f32, - SSEPackedSingle, Is2Addr>, TB; + SSEPackedSingle, itins.s, Is2Addr>, + TB; defm PD : sse12_fp_packed_int<opc, OpcodeStr, VR128, !strconcat(OpcodeStr, "pd"), "sse2", "_pd", f128mem, memopv2f64, - SSEPackedDouble, Is2Addr>, TB, OpSize; + SSEPackedDouble, itins.d, Is2Addr>, + TB, OpSize; } -multiclass basic_sse12_fp_binop_p_y_int<bits<8> opc, string OpcodeStr> { +multiclass basic_sse12_fp_binop_p_y_int<bits<8> opc, string OpcodeStr, + SizeItins itins> { defm PSY : sse12_fp_packed_int<opc, OpcodeStr, VR256, !strconcat(OpcodeStr, "ps"), "avx", "_ps_256", f256mem, memopv8f32, - SSEPackedSingle, 0>, TB; + SSEPackedSingle, itins.s, 0>, TB; defm PDY : sse12_fp_packed_int<opc, OpcodeStr, VR256, !strconcat(OpcodeStr, "pd"), "avx", "_pd_256", f256mem, memopv4f64, - SSEPackedDouble, 0>, TB, OpSize; + SSEPackedDouble, itins.d, 0>, TB, OpSize; } // Binary Arithmetic instructions -defm VADD : basic_sse12_fp_binop_s<0x58, "add", fadd, 0>, - basic_sse12_fp_binop_s_int<0x58, "add", 0>, VEX_4V, VEX_LIG; -defm VADD : basic_sse12_fp_binop_p<0x58, "add", fadd, 0>, - basic_sse12_fp_binop_p_y<0x58, "add", fadd>, VEX_4V; -defm VMUL : basic_sse12_fp_binop_s<0x59, "mul", fmul, 0>, - basic_sse12_fp_binop_s_int<0x59, "mul", 0>, VEX_4V, VEX_LIG; -defm VMUL : basic_sse12_fp_binop_p<0x59, "mul", fmul, 0>, - basic_sse12_fp_binop_p_y<0x59, "mul", fmul>, VEX_4V; +defm VADD : basic_sse12_fp_binop_s<0x58, "add", fadd, SSE_ALU_ITINS_S, 0>, + basic_sse12_fp_binop_s_int<0x58, "add", SSE_ALU_ITINS_S, 0>, + VEX_4V, VEX_LIG; +defm VADD : basic_sse12_fp_binop_p<0x58, "add", fadd, SSE_ALU_ITINS_P, 0>, + basic_sse12_fp_binop_p_y<0x58, "add", fadd, SSE_ALU_ITINS_P>, + VEX_4V; +defm VMUL : basic_sse12_fp_binop_s<0x59, "mul", fmul, SSE_MUL_ITINS_S, 0>, + basic_sse12_fp_binop_s_int<0x59, "mul", SSE_MUL_ITINS_S, 0>, + VEX_4V, VEX_LIG; +defm VMUL : basic_sse12_fp_binop_p<0x59, "mul", fmul, SSE_MUL_ITINS_P, 0>, + basic_sse12_fp_binop_p_y<0x59, "mul", fmul, SSE_MUL_ITINS_P>, + VEX_4V; let isCommutable = 0 in { - defm VSUB : basic_sse12_fp_binop_s<0x5C, "sub", fsub, 0>, - basic_sse12_fp_binop_s_int<0x5C, "sub", 0>, VEX_4V, VEX_LIG; - defm VSUB : basic_sse12_fp_binop_p<0x5C, "sub", fsub, 0>, - basic_sse12_fp_binop_p_y<0x5C, "sub", fsub>, VEX_4V; - defm VDIV : basic_sse12_fp_binop_s<0x5E, "div", fdiv, 0>, - basic_sse12_fp_binop_s_int<0x5E, "div", 0>, VEX_4V, VEX_LIG; - defm VDIV : basic_sse12_fp_binop_p<0x5E, "div", fdiv, 0>, - basic_sse12_fp_binop_p_y<0x5E, "div", fdiv>, VEX_4V; - defm VMAX : basic_sse12_fp_binop_s<0x5F, "max", X86fmax, 0>, - basic_sse12_fp_binop_s_int<0x5F, "max", 0>, VEX_4V, VEX_LIG; - defm VMAX : basic_sse12_fp_binop_p<0x5F, "max", X86fmax, 0>, - basic_sse12_fp_binop_p_int<0x5F, 
"max", 0>, - basic_sse12_fp_binop_p_y<0x5F, "max", X86fmax>, - basic_sse12_fp_binop_p_y_int<0x5F, "max">, VEX_4V; - defm VMIN : basic_sse12_fp_binop_s<0x5D, "min", X86fmin, 0>, - basic_sse12_fp_binop_s_int<0x5D, "min", 0>, VEX_4V, VEX_LIG; - defm VMIN : basic_sse12_fp_binop_p<0x5D, "min", X86fmin, 0>, - basic_sse12_fp_binop_p_int<0x5D, "min", 0>, - basic_sse12_fp_binop_p_y_int<0x5D, "min">, - basic_sse12_fp_binop_p_y<0x5D, "min", X86fmin>, VEX_4V; + defm VSUB : basic_sse12_fp_binop_s<0x5C, "sub", fsub, SSE_ALU_ITINS_S, 0>, + basic_sse12_fp_binop_s_int<0x5C, "sub", SSE_ALU_ITINS_S, 0>, + VEX_4V, VEX_LIG; + defm VSUB : basic_sse12_fp_binop_p<0x5C, "sub", fsub, SSE_ALU_ITINS_P, 0>, + basic_sse12_fp_binop_p_y<0x5C, "sub", fsub, SSE_ALU_ITINS_P>, VEX_4V; + defm VDIV : basic_sse12_fp_binop_s<0x5E, "div", fdiv, SSE_DIV_ITINS_S, 0>, + basic_sse12_fp_binop_s_int<0x5E, "div", SSE_DIV_ITINS_S, 0>, + VEX_4V, VEX_LIG; + defm VDIV : basic_sse12_fp_binop_p<0x5E, "div", fdiv, SSE_ALU_ITINS_P, 0>, + basic_sse12_fp_binop_p_y<0x5E, "div", fdiv, SSE_DIV_ITINS_P>, + VEX_4V; + defm VMAX : basic_sse12_fp_binop_s<0x5F, "max", X86fmax, SSE_ALU_ITINS_S, 0>, + basic_sse12_fp_binop_s_int<0x5F, "max", SSE_ALU_ITINS_S, 0>, + VEX_4V, VEX_LIG; + defm VMAX : basic_sse12_fp_binop_p<0x5F, "max", X86fmax, SSE_ALU_ITINS_P, 0>, + basic_sse12_fp_binop_p_int<0x5F, "max", SSE_ALU_ITINS_P, 0>, + basic_sse12_fp_binop_p_y<0x5F, "max", X86fmax, SSE_ALU_ITINS_P>, + basic_sse12_fp_binop_p_y_int<0x5F, "max", SSE_ALU_ITINS_P>, + VEX_4V; + defm VMIN : basic_sse12_fp_binop_s<0x5D, "min", X86fmin, SSE_ALU_ITINS_S, 0>, + basic_sse12_fp_binop_s_int<0x5D, "min", SSE_ALU_ITINS_S, 0>, + VEX_4V, VEX_LIG; + defm VMIN : basic_sse12_fp_binop_p<0x5D, "min", X86fmin, SSE_ALU_ITINS_P, 0>, + basic_sse12_fp_binop_p_int<0x5D, "min", SSE_ALU_ITINS_P, 0>, + basic_sse12_fp_binop_p_y_int<0x5D, "min", SSE_ALU_ITINS_P>, + basic_sse12_fp_binop_p_y<0x5D, "min", X86fmin, SSE_ALU_ITINS_P>, + VEX_4V; } let Constraints = "$src1 = $dst" in { - defm ADD : basic_sse12_fp_binop_s<0x58, "add", fadd>, - basic_sse12_fp_binop_p<0x58, "add", fadd>, - basic_sse12_fp_binop_s_int<0x58, "add">; - defm MUL : basic_sse12_fp_binop_s<0x59, "mul", fmul>, - basic_sse12_fp_binop_p<0x59, "mul", fmul>, - basic_sse12_fp_binop_s_int<0x59, "mul">; + defm ADD : basic_sse12_fp_binop_s<0x58, "add", fadd, SSE_ALU_ITINS_S>, + basic_sse12_fp_binop_p<0x58, "add", fadd, SSE_ALU_ITINS_P>, + basic_sse12_fp_binop_s_int<0x58, "add", SSE_ALU_ITINS_S>; + defm MUL : basic_sse12_fp_binop_s<0x59, "mul", fmul, SSE_MUL_ITINS_S>, + basic_sse12_fp_binop_p<0x59, "mul", fmul, SSE_MUL_ITINS_P>, + basic_sse12_fp_binop_s_int<0x59, "mul", SSE_MUL_ITINS_S>; let isCommutable = 0 in { - defm SUB : basic_sse12_fp_binop_s<0x5C, "sub", fsub>, - basic_sse12_fp_binop_p<0x5C, "sub", fsub>, - basic_sse12_fp_binop_s_int<0x5C, "sub">; - defm DIV : basic_sse12_fp_binop_s<0x5E, "div", fdiv>, - basic_sse12_fp_binop_p<0x5E, "div", fdiv>, - basic_sse12_fp_binop_s_int<0x5E, "div">; - defm MAX : basic_sse12_fp_binop_s<0x5F, "max", X86fmax>, - basic_sse12_fp_binop_p<0x5F, "max", X86fmax>, - basic_sse12_fp_binop_s_int<0x5F, "max">, - basic_sse12_fp_binop_p_int<0x5F, "max">; - defm MIN : basic_sse12_fp_binop_s<0x5D, "min", X86fmin>, - basic_sse12_fp_binop_p<0x5D, "min", X86fmin>, - basic_sse12_fp_binop_s_int<0x5D, "min">, - basic_sse12_fp_binop_p_int<0x5D, "min">; + defm SUB : basic_sse12_fp_binop_s<0x5C, "sub", fsub, SSE_ALU_ITINS_S>, + basic_sse12_fp_binop_p<0x5C, "sub", fsub, SSE_ALU_ITINS_P>, + basic_sse12_fp_binop_s_int<0x5C, "sub", 
SSE_ALU_ITINS_S>; + defm DIV : basic_sse12_fp_binop_s<0x5E, "div", fdiv, SSE_DIV_ITINS_S>, + basic_sse12_fp_binop_p<0x5E, "div", fdiv, SSE_DIV_ITINS_P>, + basic_sse12_fp_binop_s_int<0x5E, "div", SSE_DIV_ITINS_S>; + defm MAX : basic_sse12_fp_binop_s<0x5F, "max", X86fmax, SSE_ALU_ITINS_S>, + basic_sse12_fp_binop_p<0x5F, "max", X86fmax, SSE_ALU_ITINS_P>, + basic_sse12_fp_binop_s_int<0x5F, "max", SSE_ALU_ITINS_S>, + basic_sse12_fp_binop_p_int<0x5F, "max", SSE_ALU_ITINS_P>; + defm MIN : basic_sse12_fp_binop_s<0x5D, "min", X86fmin, SSE_ALU_ITINS_S>, + basic_sse12_fp_binop_p<0x5D, "min", X86fmin, SSE_ALU_ITINS_P>, + basic_sse12_fp_binop_s_int<0x5D, "min", SSE_ALU_ITINS_S>, + basic_sse12_fp_binop_p_int<0x5D, "min", SSE_ALU_ITINS_P>; } } @@ -2820,9 +2976,25 @@ let Constraints = "$src1 = $dst" in { /// /// And, we have a special variant form for a full-vector intrinsic form. +def SSE_SQRTP : OpndItins< + IIC_SSE_SQRTP_RR, IIC_SSE_SQRTP_RM +>; + +def SSE_SQRTS : OpndItins< + IIC_SSE_SQRTS_RR, IIC_SSE_SQRTS_RM +>; + +def SSE_RCPP : OpndItins< + IIC_SSE_RCPP_RR, IIC_SSE_RCPP_RM +>; + +def SSE_RCPS : OpndItins< + IIC_SSE_RCPS_RR, IIC_SSE_RCPS_RM +>; + /// sse1_fp_unop_s - SSE1 unops in scalar form. multiclass sse1_fp_unop_s<bits<8> opc, string OpcodeStr, - SDNode OpNode, Intrinsic F32Int> { + SDNode OpNode, Intrinsic F32Int, OpndItins itins> { def SSr : SSI<opc, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src), !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"), [(set FR32:$dst, (OpNode FR32:$src))]>; @@ -2832,14 +3004,14 @@ multiclass sse1_fp_unop_s<bits<8> opc, string OpcodeStr, // partial register update condition. def SSm : I<opc, MRMSrcMem, (outs FR32:$dst), (ins f32mem:$src), !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"), - [(set FR32:$dst, (OpNode (load addr:$src)))]>, XS, + [(set FR32:$dst, (OpNode (load addr:$src)))], itins.rm>, XS, Requires<[HasSSE1, OptForSize]>; def SSr_Int : SSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"), - [(set VR128:$dst, (F32Int VR128:$src))]>; + [(set VR128:$dst, (F32Int VR128:$src))], itins.rr>; def SSm_Int : SSI<opc, MRMSrcMem, (outs VR128:$dst), (ins ssmem:$src), !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"), - [(set VR128:$dst, (F32Int sse_load_f32:$src))]>; + [(set VR128:$dst, (F32Int sse_load_f32:$src))], itins.rm>; } /// sse1_fp_unop_s_avx - AVX SSE1 unops in scalar form. @@ -2852,80 +3024,91 @@ multiclass sse1_fp_unop_s_avx<bits<8> opc, string OpcodeStr> { !strconcat(OpcodeStr, "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>; def SSm_Int : SSI<opc, MRMSrcMem, (outs VR128:$dst), - (ins ssmem:$src1, VR128:$src2), + (ins VR128:$src1, ssmem:$src2), !strconcat(OpcodeStr, "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>; } /// sse1_fp_unop_p - SSE1 unops in packed form. -multiclass sse1_fp_unop_p<bits<8> opc, string OpcodeStr, SDNode OpNode> { +multiclass sse1_fp_unop_p<bits<8> opc, string OpcodeStr, SDNode OpNode, + OpndItins itins> { def PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"), - [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))]>; + [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))], itins.rr>; def PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"), - [(set VR128:$dst, (OpNode (memopv4f32 addr:$src)))]>; + [(set VR128:$dst, (OpNode (memopv4f32 addr:$src)))], itins.rm>; } /// sse1_fp_unop_p_y - AVX 256-bit SSE1 unops in packed form. 
-multiclass sse1_fp_unop_p_y<bits<8> opc, string OpcodeStr, SDNode OpNode> { +multiclass sse1_fp_unop_p_y<bits<8> opc, string OpcodeStr, SDNode OpNode, + OpndItins itins> { def PSYr : PSI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"), - [(set VR256:$dst, (v8f32 (OpNode VR256:$src)))]>; + [(set VR256:$dst, (v8f32 (OpNode VR256:$src)))], + itins.rr>; def PSYm : PSI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"), - [(set VR256:$dst, (OpNode (memopv8f32 addr:$src)))]>; + [(set VR256:$dst, (OpNode (memopv8f32 addr:$src)))], + itins.rm>; } /// sse1_fp_unop_p_int - SSE1 intrinsics unops in packed forms. multiclass sse1_fp_unop_p_int<bits<8> opc, string OpcodeStr, - Intrinsic V4F32Int> { + Intrinsic V4F32Int, OpndItins itins> { def PSr_Int : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"), - [(set VR128:$dst, (V4F32Int VR128:$src))]>; + [(set VR128:$dst, (V4F32Int VR128:$src))], + itins.rr>; def PSm_Int : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"), - [(set VR128:$dst, (V4F32Int (memopv4f32 addr:$src)))]>; + [(set VR128:$dst, (V4F32Int (memopv4f32 addr:$src)))], + itins.rm>; } /// sse1_fp_unop_p_y_int - AVX 256-bit intrinsics unops in packed forms. multiclass sse1_fp_unop_p_y_int<bits<8> opc, string OpcodeStr, - Intrinsic V4F32Int> { + Intrinsic V4F32Int, OpndItins itins> { def PSYr_Int : PSI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"), - [(set VR256:$dst, (V4F32Int VR256:$src))]>; + [(set VR256:$dst, (V4F32Int VR256:$src))], + itins.rr>; def PSYm_Int : PSI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"), - [(set VR256:$dst, (V4F32Int (memopv8f32 addr:$src)))]>; + [(set VR256:$dst, (V4F32Int (memopv8f32 addr:$src)))], + itins.rm>; } /// sse2_fp_unop_s - SSE2 unops in scalar form. multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr, - SDNode OpNode, Intrinsic F64Int> { + SDNode OpNode, Intrinsic F64Int, OpndItins itins> { def SDr : SDI<opc, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src), !strconcat(OpcodeStr, "sd\t{$src, $dst|$dst, $src}"), - [(set FR64:$dst, (OpNode FR64:$src))]>; + [(set FR64:$dst, (OpNode FR64:$src))], itins.rr>; // See the comments in sse1_fp_unop_s for why this is OptForSize. def SDm : I<opc, MRMSrcMem, (outs FR64:$dst), (ins f64mem:$src), !strconcat(OpcodeStr, "sd\t{$src, $dst|$dst, $src}"), - [(set FR64:$dst, (OpNode (load addr:$src)))]>, XD, + [(set FR64:$dst, (OpNode (load addr:$src)))], itins.rm>, XD, Requires<[HasSSE2, OptForSize]>; def SDr_Int : SDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), !strconcat(OpcodeStr, "sd\t{$src, $dst|$dst, $src}"), - [(set VR128:$dst, (F64Int VR128:$src))]>; + [(set VR128:$dst, (F64Int VR128:$src))], itins.rr>; def SDm_Int : SDI<opc, MRMSrcMem, (outs VR128:$dst), (ins sdmem:$src), !strconcat(OpcodeStr, "sd\t{$src, $dst|$dst, $src}"), - [(set VR128:$dst, (F64Int sse_load_f64:$src))]>; + [(set VR128:$dst, (F64Int sse_load_f64:$src))], itins.rm>; } /// sse2_fp_unop_s_avx - AVX SSE2 unops in scalar form. 
multiclass sse2_fp_unop_s_avx<bits<8> opc, string OpcodeStr> { + let neverHasSideEffects = 1 in { def SDr : SDI<opc, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src1, FR64:$src2), !strconcat(OpcodeStr, "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>; + let mayLoad = 1 in def SDm : SDI<opc, MRMSrcMem, (outs FR64:$dst), (ins FR64:$src1,f64mem:$src2), !strconcat(OpcodeStr, "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>; + } def SDm_Int : SDI<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2), !strconcat(OpcodeStr, @@ -2934,45 +3117,52 @@ multiclass sse2_fp_unop_s_avx<bits<8> opc, string OpcodeStr> { /// sse2_fp_unop_p - SSE2 unops in vector forms. multiclass sse2_fp_unop_p<bits<8> opc, string OpcodeStr, - SDNode OpNode> { + SDNode OpNode, OpndItins itins> { def PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"), - [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))]>; + [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))], itins.rr>; def PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"), - [(set VR128:$dst, (OpNode (memopv2f64 addr:$src)))]>; + [(set VR128:$dst, (OpNode (memopv2f64 addr:$src)))], itins.rm>; } /// sse2_fp_unop_p_y - AVX SSE2 256-bit unops in vector forms. -multiclass sse2_fp_unop_p_y<bits<8> opc, string OpcodeStr, SDNode OpNode> { +multiclass sse2_fp_unop_p_y<bits<8> opc, string OpcodeStr, SDNode OpNode, + OpndItins itins> { def PDYr : PDI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"), - [(set VR256:$dst, (v4f64 (OpNode VR256:$src)))]>; + [(set VR256:$dst, (v4f64 (OpNode VR256:$src)))], + itins.rr>; def PDYm : PDI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"), - [(set VR256:$dst, (OpNode (memopv4f64 addr:$src)))]>; + [(set VR256:$dst, (OpNode (memopv4f64 addr:$src)))], + itins.rm>; } /// sse2_fp_unop_p_int - SSE2 intrinsic unops in vector forms. multiclass sse2_fp_unop_p_int<bits<8> opc, string OpcodeStr, - Intrinsic V2F64Int> { + Intrinsic V2F64Int, OpndItins itins> { def PDr_Int : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"), - [(set VR128:$dst, (V2F64Int VR128:$src))]>; + [(set VR128:$dst, (V2F64Int VR128:$src))], + itins.rr>; def PDm_Int : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"), - [(set VR128:$dst, (V2F64Int (memopv2f64 addr:$src)))]>; + [(set VR128:$dst, (V2F64Int (memopv2f64 addr:$src)))], + itins.rm>; } /// sse2_fp_unop_p_y_int - AVX 256-bit intrinsic unops in vector forms. 
multiclass sse2_fp_unop_p_y_int<bits<8> opc, string OpcodeStr, - Intrinsic V2F64Int> { + Intrinsic V2F64Int, OpndItins itins> { def PDYr_Int : PDI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"), - [(set VR256:$dst, (V2F64Int VR256:$src))]>; + [(set VR256:$dst, (V2F64Int VR256:$src))], + itins.rr>; def PDYm_Int : PDI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"), - [(set VR256:$dst, (V2F64Int (memopv4f64 addr:$src)))]>; + [(set VR256:$dst, (V2F64Int (memopv4f64 addr:$src)))], + itins.rm>; } let Predicates = [HasAVX] in { @@ -2980,31 +3170,40 @@ let Predicates = [HasAVX] in { defm VSQRT : sse1_fp_unop_s_avx<0x51, "vsqrt">, sse2_fp_unop_s_avx<0x51, "vsqrt">, VEX_4V, VEX_LIG; - defm VSQRT : sse1_fp_unop_p<0x51, "vsqrt", fsqrt>, - sse2_fp_unop_p<0x51, "vsqrt", fsqrt>, - sse1_fp_unop_p_y<0x51, "vsqrt", fsqrt>, - sse2_fp_unop_p_y<0x51, "vsqrt", fsqrt>, - sse1_fp_unop_p_int<0x51, "vsqrt", int_x86_sse_sqrt_ps>, - sse2_fp_unop_p_int<0x51, "vsqrt", int_x86_sse2_sqrt_pd>, - sse1_fp_unop_p_y_int<0x51, "vsqrt", int_x86_avx_sqrt_ps_256>, - sse2_fp_unop_p_y_int<0x51, "vsqrt", int_x86_avx_sqrt_pd_256>, + defm VSQRT : sse1_fp_unop_p<0x51, "vsqrt", fsqrt, SSE_SQRTP>, + sse2_fp_unop_p<0x51, "vsqrt", fsqrt, SSE_SQRTP>, + sse1_fp_unop_p_y<0x51, "vsqrt", fsqrt, SSE_SQRTP>, + sse2_fp_unop_p_y<0x51, "vsqrt", fsqrt, SSE_SQRTP>, + sse1_fp_unop_p_int<0x51, "vsqrt", int_x86_sse_sqrt_ps, + SSE_SQRTP>, + sse2_fp_unop_p_int<0x51, "vsqrt", int_x86_sse2_sqrt_pd, + SSE_SQRTP>, + sse1_fp_unop_p_y_int<0x51, "vsqrt", int_x86_avx_sqrt_ps_256, + SSE_SQRTP>, + sse2_fp_unop_p_y_int<0x51, "vsqrt", int_x86_avx_sqrt_pd_256, + SSE_SQRTP>, VEX; // Reciprocal approximations. Note that these typically require refinement // in order to obtain suitable precision. 
defm VRSQRT : sse1_fp_unop_s_avx<0x52, "vrsqrt">, VEX_4V, VEX_LIG; - defm VRSQRT : sse1_fp_unop_p<0x52, "vrsqrt", X86frsqrt>, - sse1_fp_unop_p_y<0x52, "vrsqrt", X86frsqrt>, - sse1_fp_unop_p_y_int<0x52, "vrsqrt", int_x86_avx_rsqrt_ps_256>, - sse1_fp_unop_p_int<0x52, "vrsqrt", int_x86_sse_rsqrt_ps>, VEX; + defm VRSQRT : sse1_fp_unop_p<0x52, "vrsqrt", X86frsqrt, SSE_SQRTP>, + sse1_fp_unop_p_y<0x52, "vrsqrt", X86frsqrt, SSE_SQRTP>, + sse1_fp_unop_p_y_int<0x52, "vrsqrt", int_x86_avx_rsqrt_ps_256, + SSE_SQRTP>, + sse1_fp_unop_p_int<0x52, "vrsqrt", int_x86_sse_rsqrt_ps, + SSE_SQRTP>, VEX; defm VRCP : sse1_fp_unop_s_avx<0x53, "vrcp">, VEX_4V, VEX_LIG; - defm VRCP : sse1_fp_unop_p<0x53, "vrcp", X86frcp>, - sse1_fp_unop_p_y<0x53, "vrcp", X86frcp>, - sse1_fp_unop_p_y_int<0x53, "vrcp", int_x86_avx_rcp_ps_256>, - sse1_fp_unop_p_int<0x53, "vrcp", int_x86_sse_rcp_ps>, VEX; + defm VRCP : sse1_fp_unop_p<0x53, "vrcp", X86frcp, SSE_RCPP>, + sse1_fp_unop_p_y<0x53, "vrcp", X86frcp, SSE_RCPP>, + sse1_fp_unop_p_y_int<0x53, "vrcp", int_x86_avx_rcp_ps_256, + SSE_RCPP>, + sse1_fp_unop_p_int<0x53, "vrcp", int_x86_sse_rcp_ps, + SSE_RCPP>, VEX; } +let AddedComplexity = 1 in { def : Pat<(f32 (fsqrt FR32:$src)), (VSQRTSSr (f32 (IMPLICIT_DEF)), FR32:$src)>, Requires<[HasAVX]>; def : Pat<(f32 (fsqrt (load addr:$src))), @@ -3027,8 +3226,9 @@ def : Pat<(f32 (X86frcp FR32:$src)), def : Pat<(f32 (X86frcp (load addr:$src))), (VRCPSSm (f32 (IMPLICIT_DEF)), addr:$src)>, Requires<[HasAVX, OptForSize]>; +} -let Predicates = [HasAVX] in { +let Predicates = [HasAVX], AddedComplexity = 1 in { def : Pat<(int_x86_sse_sqrt_ss VR128:$src), (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), (VSQRTSSr (f32 (IMPLICIT_DEF)), @@ -3063,21 +3263,26 @@ let Predicates = [HasAVX] in { } // Square root. -defm SQRT : sse1_fp_unop_s<0x51, "sqrt", fsqrt, int_x86_sse_sqrt_ss>, - sse1_fp_unop_p<0x51, "sqrt", fsqrt>, - sse1_fp_unop_p_int<0x51, "sqrt", int_x86_sse_sqrt_ps>, - sse2_fp_unop_s<0x51, "sqrt", fsqrt, int_x86_sse2_sqrt_sd>, - sse2_fp_unop_p<0x51, "sqrt", fsqrt>, - sse2_fp_unop_p_int<0x51, "sqrt", int_x86_sse2_sqrt_pd>; +defm SQRT : sse1_fp_unop_s<0x51, "sqrt", fsqrt, int_x86_sse_sqrt_ss, + SSE_SQRTS>, + sse1_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTS>, + sse1_fp_unop_p_int<0x51, "sqrt", int_x86_sse_sqrt_ps, SSE_SQRTS>, + sse2_fp_unop_s<0x51, "sqrt", fsqrt, int_x86_sse2_sqrt_sd, + SSE_SQRTS>, + sse2_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTS>, + sse2_fp_unop_p_int<0x51, "sqrt", int_x86_sse2_sqrt_pd, SSE_SQRTS>; // Reciprocal approximations. Note that these typically require refinement // in order to obtain suitable precision. -defm RSQRT : sse1_fp_unop_s<0x52, "rsqrt", X86frsqrt, int_x86_sse_rsqrt_ss>, - sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt>, - sse1_fp_unop_p_int<0x52, "rsqrt", int_x86_sse_rsqrt_ps>; -defm RCP : sse1_fp_unop_s<0x53, "rcp", X86frcp, int_x86_sse_rcp_ss>, - sse1_fp_unop_p<0x53, "rcp", X86frcp>, - sse1_fp_unop_p_int<0x53, "rcp", int_x86_sse_rcp_ps>; +defm RSQRT : sse1_fp_unop_s<0x52, "rsqrt", X86frsqrt, int_x86_sse_rsqrt_ss, + SSE_SQRTS>, + sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SSE_SQRTS>, + sse1_fp_unop_p_int<0x52, "rsqrt", int_x86_sse_rsqrt_ps, + SSE_SQRTS>; +defm RCP : sse1_fp_unop_s<0x53, "rcp", X86frcp, int_x86_sse_rcp_ss, + SSE_RCPS>, + sse1_fp_unop_p<0x53, "rcp", X86frcp, SSE_RCPS>, + sse1_fp_unop_p_int<0x53, "rcp", int_x86_sse_rcp_ps, SSE_RCPS>; // There is no f64 version of the reciprocal approximation instructions. 
@@ -3090,24 +3295,22 @@ let AddedComplexity = 400 in { // Prefer non-temporal versions (ins f128mem:$dst, VR128:$src), "movntps\t{$src, $dst|$dst, $src}", [(alignednontemporalstore (v4f32 VR128:$src), - addr:$dst)]>, VEX; + addr:$dst)], + IIC_SSE_MOVNT>, VEX; def VMOVNTPDmr : VPDI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), "movntpd\t{$src, $dst|$dst, $src}", [(alignednontemporalstore (v2f64 VR128:$src), - addr:$dst)]>, VEX; - def VMOVNTDQ_64mr : VPDI<0xE7, MRMDestMem, (outs), - (ins f128mem:$dst, VR128:$src), - "movntdq\t{$src, $dst|$dst, $src}", - [(alignednontemporalstore (v2f64 VR128:$src), - addr:$dst)]>, VEX; + addr:$dst)], + IIC_SSE_MOVNT>, VEX; let ExeDomain = SSEPackedInt in def VMOVNTDQmr : VPDI<0xE7, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), "movntdq\t{$src, $dst|$dst, $src}", - [(alignednontemporalstore (v4f32 VR128:$src), - addr:$dst)]>, VEX; + [(alignednontemporalstore (v2i64 VR128:$src), + addr:$dst)], + IIC_SSE_MOVNT>, VEX; def : Pat<(alignednontemporalstore (v2i64 VR128:$src), addr:$dst), (VMOVNTDQmr addr:$dst, VR128:$src)>, Requires<[HasAVX]>; @@ -3116,23 +3319,21 @@ let AddedComplexity = 400 in { // Prefer non-temporal versions (ins f256mem:$dst, VR256:$src), "movntps\t{$src, $dst|$dst, $src}", [(alignednontemporalstore (v8f32 VR256:$src), - addr:$dst)]>, VEX; + addr:$dst)], + IIC_SSE_MOVNT>, VEX; def VMOVNTPDYmr : VPDI<0x2B, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src), "movntpd\t{$src, $dst|$dst, $src}", [(alignednontemporalstore (v4f64 VR256:$src), - addr:$dst)]>, VEX; - def VMOVNTDQY_64mr : VPDI<0xE7, MRMDestMem, (outs), - (ins f256mem:$dst, VR256:$src), - "movntdq\t{$src, $dst|$dst, $src}", - [(alignednontemporalstore (v4f64 VR256:$src), - addr:$dst)]>, VEX; + addr:$dst)], + IIC_SSE_MOVNT>, VEX; let ExeDomain = SSEPackedInt in def VMOVNTDQYmr : VPDI<0xE7, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src), "movntdq\t{$src, $dst|$dst, $src}", - [(alignednontemporalstore (v8f32 VR256:$src), - addr:$dst)]>, VEX; + [(alignednontemporalstore (v4i64 VR256:$src), + addr:$dst)], + IIC_SSE_MOVNT>, VEX; } def : Pat<(int_x86_avx_movnt_dq_256 addr:$dst, VR256:$src), @@ -3145,19 +3346,18 @@ def : Pat<(int_x86_avx_movnt_ps_256 addr:$dst, VR256:$src), let AddedComplexity = 400 in { // Prefer non-temporal versions def MOVNTPSmr : PSI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), "movntps\t{$src, $dst|$dst, $src}", - [(alignednontemporalstore (v4f32 VR128:$src), addr:$dst)]>; + [(alignednontemporalstore (v4f32 VR128:$src), addr:$dst)], + IIC_SSE_MOVNT>; def MOVNTPDmr : PDI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), "movntpd\t{$src, $dst|$dst, $src}", - [(alignednontemporalstore(v2f64 VR128:$src), addr:$dst)]>; - -def MOVNTDQ_64mr : PDI<0xE7, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), - "movntdq\t{$src, $dst|$dst, $src}", - [(alignednontemporalstore (v2f64 VR128:$src), addr:$dst)]>; + [(alignednontemporalstore(v2f64 VR128:$src), addr:$dst)], + IIC_SSE_MOVNT>; let ExeDomain = SSEPackedInt in def MOVNTDQmr : PDI<0xE7, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), "movntdq\t{$src, $dst|$dst, $src}", - [(alignednontemporalstore (v4f32 VR128:$src), addr:$dst)]>; + [(alignednontemporalstore (v2i64 VR128:$src), addr:$dst)], + IIC_SSE_MOVNT>; def : Pat<(alignednontemporalstore (v2i64 VR128:$src), addr:$dst), (MOVNTDQmr addr:$dst, VR128:$src)>, Requires<[HasSSE2]>; @@ -3165,11 +3365,13 @@ def : Pat<(alignednontemporalstore (v2i64 VR128:$src), addr:$dst), // There is no AVX form for instructions below this point def 
MOVNTImr : I<0xC3, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src), "movnti{l}\t{$src, $dst|$dst, $src}", - [(nontemporalstore (i32 GR32:$src), addr:$dst)]>, + [(nontemporalstore (i32 GR32:$src), addr:$dst)], + IIC_SSE_MOVNT>, TB, Requires<[HasSSE2]>; def MOVNTI_64mr : RI<0xC3, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src), "movnti{q}\t{$src, $dst|$dst, $src}", - [(nontemporalstore (i64 GR64:$src), addr:$dst)]>, + [(nontemporalstore (i64 GR64:$src), addr:$dst)], + IIC_SSE_MOVNT>, TB, Requires<[HasSSE2]>; } @@ -3178,31 +3380,40 @@ def MOVNTI_64mr : RI<0xC3, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src), //===----------------------------------------------------------------------===// // Prefetch intrinsic. -def PREFETCHT0 : PSI<0x18, MRM1m, (outs), (ins i8mem:$src), - "prefetcht0\t$src", [(prefetch addr:$src, imm, (i32 3), (i32 1))]>; -def PREFETCHT1 : PSI<0x18, MRM2m, (outs), (ins i8mem:$src), - "prefetcht1\t$src", [(prefetch addr:$src, imm, (i32 2), (i32 1))]>; -def PREFETCHT2 : PSI<0x18, MRM3m, (outs), (ins i8mem:$src), - "prefetcht2\t$src", [(prefetch addr:$src, imm, (i32 1), (i32 1))]>; -def PREFETCHNTA : PSI<0x18, MRM0m, (outs), (ins i8mem:$src), - "prefetchnta\t$src", [(prefetch addr:$src, imm, (i32 0), (i32 1))]>; +let Predicates = [HasSSE1] in { +def PREFETCHT0 : I<0x18, MRM1m, (outs), (ins i8mem:$src), + "prefetcht0\t$src", [(prefetch addr:$src, imm, (i32 3), (i32 1))], + IIC_SSE_PREFETCH>, TB; +def PREFETCHT1 : I<0x18, MRM2m, (outs), (ins i8mem:$src), + "prefetcht1\t$src", [(prefetch addr:$src, imm, (i32 2), (i32 1))], + IIC_SSE_PREFETCH>, TB; +def PREFETCHT2 : I<0x18, MRM3m, (outs), (ins i8mem:$src), + "prefetcht2\t$src", [(prefetch addr:$src, imm, (i32 1), (i32 1))], + IIC_SSE_PREFETCH>, TB; +def PREFETCHNTA : I<0x18, MRM0m, (outs), (ins i8mem:$src), + "prefetchnta\t$src", [(prefetch addr:$src, imm, (i32 0), (i32 1))], + IIC_SSE_PREFETCH>, TB; +} // Flush cache def CLFLUSH : I<0xAE, MRM7m, (outs), (ins i8mem:$src), - "clflush\t$src", [(int_x86_sse2_clflush addr:$src)]>, - TB, Requires<[HasSSE2]>; + "clflush\t$src", [(int_x86_sse2_clflush addr:$src)], + IIC_SSE_PREFETCH>, TB, Requires<[HasSSE2]>; // Pause. This "instruction" is encoded as "rep; nop", so even though it // was introduced with SSE2, it's backward compatible. 
-def PAUSE : I<0x90, RawFrm, (outs), (ins), "pause", []>, REP; +def PAUSE : I<0x90, RawFrm, (outs), (ins), "pause", [], IIC_SSE_PAUSE>, REP; // Load, store, and memory fence def SFENCE : I<0xAE, MRM_F8, (outs), (ins), - "sfence", [(int_x86_sse_sfence)]>, TB, Requires<[HasSSE1]>; + "sfence", [(int_x86_sse_sfence)], IIC_SSE_SFENCE>, + TB, Requires<[HasSSE1]>; def LFENCE : I<0xAE, MRM_E8, (outs), (ins), - "lfence", [(int_x86_sse2_lfence)]>, TB, Requires<[HasSSE2]>; + "lfence", [(int_x86_sse2_lfence)], IIC_SSE_LFENCE>, + TB, Requires<[HasSSE2]>; def MFENCE : I<0xAE, MRM_F0, (outs), (ins), - "mfence", [(int_x86_sse2_mfence)]>, TB, Requires<[HasSSE2]>; + "mfence", [(int_x86_sse2_mfence)], IIC_SSE_MFENCE>, + TB, Requires<[HasSSE2]>; def : Pat<(X86SFence), (SFENCE)>; def : Pat<(X86LFence), (LFENCE)>; @@ -3213,14 +3424,18 @@ def : Pat<(X86MFence), (MFENCE)>; //===----------------------------------------------------------------------===// def VLDMXCSR : VPSI<0xAE, MRM2m, (outs), (ins i32mem:$src), - "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)]>, VEX; + "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)], + IIC_SSE_LDMXCSR>, VEX; def VSTMXCSR : VPSI<0xAE, MRM3m, (outs), (ins i32mem:$dst), - "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)]>, VEX; + "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)], + IIC_SSE_STMXCSR>, VEX; def LDMXCSR : PSI<0xAE, MRM2m, (outs), (ins i32mem:$src), - "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)]>; + "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)], + IIC_SSE_LDMXCSR>; def STMXCSR : PSI<0xAE, MRM3m, (outs), (ins i32mem:$dst), - "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)]>; + "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)], + IIC_SSE_STMXCSR>; //===---------------------------------------------------------------------===// // SSE2 - Move Aligned/Unaligned Packed Integer Instructions @@ -3230,108 +3445,134 @@ let ExeDomain = SSEPackedInt in { // SSE integer instructions let neverHasSideEffects = 1 in { def VMOVDQArr : VPDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), - "movdqa\t{$src, $dst|$dst, $src}", []>, VEX; + "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RR>, + VEX; def VMOVDQAYrr : VPDI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), - "movdqa\t{$src, $dst|$dst, $src}", []>, VEX; + "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RR>, + VEX; } def VMOVDQUrr : VSSI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), - "movdqu\t{$src, $dst|$dst, $src}", []>, VEX; + "movdqu\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVU_P_RR>, + VEX; def VMOVDQUYrr : VSSI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), - "movdqu\t{$src, $dst|$dst, $src}", []>, VEX; + "movdqu\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVU_P_RR>, + VEX; // For Disassembler let isCodeGenOnly = 1 in { def VMOVDQArr_REV : VPDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), - "movdqa\t{$src, $dst|$dst, $src}", []>, VEX; + "movdqa\t{$src, $dst|$dst, $src}", [], + IIC_SSE_MOVA_P_RR>, + VEX; def VMOVDQAYrr_REV : VPDI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src), - "movdqa\t{$src, $dst|$dst, $src}", []>, VEX; + "movdqa\t{$src, $dst|$dst, $src}", [], + IIC_SSE_MOVA_P_RR>, + VEX; def VMOVDQUrr_REV : VSSI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), - "movdqu\t{$src, $dst|$dst, $src}", []>, VEX; + "movdqu\t{$src, $dst|$dst, $src}", [], + IIC_SSE_MOVU_P_RR>, + VEX; def VMOVDQUYrr_REV : VSSI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src), - "movdqu\t{$src, $dst|$dst, $src}", []>, VEX; + "movdqu\t{$src, $dst|$dst, $src}", [], + 
IIC_SSE_MOVU_P_RR>, + VEX; } let canFoldAsLoad = 1, mayLoad = 1 in { def VMOVDQArm : VPDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), - "movdqa\t{$src, $dst|$dst, $src}", []>, VEX; + "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RM>, + VEX; def VMOVDQAYrm : VPDI<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src), - "movdqa\t{$src, $dst|$dst, $src}", []>, VEX; + "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RM>, + VEX; let Predicates = [HasAVX] in { def VMOVDQUrm : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), - "vmovdqu\t{$src, $dst|$dst, $src}",[]>, XS, VEX; + "vmovdqu\t{$src, $dst|$dst, $src}",[], IIC_SSE_MOVU_P_RM>, + XS, VEX; def VMOVDQUYrm : I<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src), - "vmovdqu\t{$src, $dst|$dst, $src}",[]>, XS, VEX; + "vmovdqu\t{$src, $dst|$dst, $src}",[], IIC_SSE_MOVU_P_RM>, + XS, VEX; } } let mayStore = 1 in { def VMOVDQAmr : VPDI<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), - "movdqa\t{$src, $dst|$dst, $src}", []>, VEX; + "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_MR>, + VEX; def VMOVDQAYmr : VPDI<0x7F, MRMDestMem, (outs), (ins i256mem:$dst, VR256:$src), - "movdqa\t{$src, $dst|$dst, $src}", []>, VEX; + "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_MR>, + VEX; let Predicates = [HasAVX] in { def VMOVDQUmr : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), - "vmovdqu\t{$src, $dst|$dst, $src}",[]>, XS, VEX; + "vmovdqu\t{$src, $dst|$dst, $src}",[], IIC_SSE_MOVU_P_MR>, + XS, VEX; def VMOVDQUYmr : I<0x7F, MRMDestMem, (outs), (ins i256mem:$dst, VR256:$src), - "vmovdqu\t{$src, $dst|$dst, $src}",[]>, XS, VEX; + "vmovdqu\t{$src, $dst|$dst, $src}",[], IIC_SSE_MOVU_P_MR>, + XS, VEX; } } let neverHasSideEffects = 1 in def MOVDQArr : PDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), - "movdqa\t{$src, $dst|$dst, $src}", []>; + "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RR>; def MOVDQUrr : I<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "movdqu\t{$src, $dst|$dst, $src}", - []>, XS, Requires<[HasSSE2]>; + [], IIC_SSE_MOVU_P_RR>, XS, Requires<[HasSSE2]>; // For Disassembler let isCodeGenOnly = 1 in { def MOVDQArr_REV : PDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), - "movdqa\t{$src, $dst|$dst, $src}", []>; + "movdqa\t{$src, $dst|$dst, $src}", [], + IIC_SSE_MOVA_P_RR>; def MOVDQUrr_REV : I<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), "movdqu\t{$src, $dst|$dst, $src}", - []>, XS, Requires<[HasSSE2]>; + [], IIC_SSE_MOVU_P_RR>, XS, Requires<[HasSSE2]>; } let canFoldAsLoad = 1, mayLoad = 1 in { def MOVDQArm : PDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), "movdqa\t{$src, $dst|$dst, $src}", - [/*(set VR128:$dst, (alignedloadv2i64 addr:$src))*/]>; + [/*(set VR128:$dst, (alignedloadv2i64 addr:$src))*/], + IIC_SSE_MOVA_P_RM>; def MOVDQUrm : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), "movdqu\t{$src, $dst|$dst, $src}", - [/*(set VR128:$dst, (loadv2i64 addr:$src))*/]>, + [/*(set VR128:$dst, (loadv2i64 addr:$src))*/], + IIC_SSE_MOVU_P_RM>, XS, Requires<[HasSSE2]>; } let mayStore = 1 in { def MOVDQAmr : PDI<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), "movdqa\t{$src, $dst|$dst, $src}", - [/*(alignedstore (v2i64 VR128:$src), addr:$dst)*/]>; + [/*(alignedstore (v2i64 VR128:$src), addr:$dst)*/], + IIC_SSE_MOVA_P_MR>; def MOVDQUmr : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), "movdqu\t{$src, $dst|$dst, $src}", - [/*(store (v2i64 VR128:$src), addr:$dst)*/]>, + [/*(store (v2i64 VR128:$src), 
addr:$dst)*/], + IIC_SSE_MOVU_P_MR>, XS, Requires<[HasSSE2]>; } // Intrinsic forms of MOVDQU load and store def VMOVDQUmr_Int : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), "vmovdqu\t{$src, $dst|$dst, $src}", - [(int_x86_sse2_storeu_dq addr:$dst, VR128:$src)]>, + [(int_x86_sse2_storeu_dq addr:$dst, VR128:$src)], + IIC_SSE_MOVU_P_MR>, XS, VEX, Requires<[HasAVX]>; def MOVDQUmr_Int : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), "movdqu\t{$src, $dst|$dst, $src}", - [(int_x86_sse2_storeu_dq addr:$dst, VR128:$src)]>, + [(int_x86_sse2_storeu_dq addr:$dst, VR128:$src)], + IIC_SSE_MOVU_P_MR>, XS, Requires<[HasSSE2]>; } // ExeDomain = SSEPackedInt let Predicates = [HasAVX] in { - def : Pat<(int_x86_avx_loadu_dq_256 addr:$src), (VMOVDQUYrm addr:$src)>; def : Pat<(int_x86_avx_storeu_dq_256 addr:$dst, VR256:$src), (VMOVDQUYmr addr:$dst, VR256:$src)>; } @@ -3340,178 +3581,326 @@ let Predicates = [HasAVX] in { // SSE2 - Packed Integer Arithmetic Instructions //===---------------------------------------------------------------------===// +def SSE_PMADD : OpndItins< + IIC_SSE_PMADD, IIC_SSE_PMADD +>; + let ExeDomain = SSEPackedInt in { // SSE integer instructions multiclass PDI_binop_rm_int<bits<8> opc, string OpcodeStr, Intrinsic IntId, - bit IsCommutable = 0, bit Is2Addr = 1> { + RegisterClass RC, PatFrag memop_frag, + X86MemOperand x86memop, + OpndItins itins, + bit IsCommutable = 0, + bit Is2Addr = 1> { let isCommutable = IsCommutable in - def rr : PDI<opc, MRMSrcReg, (outs VR128:$dst), - (ins VR128:$src1, VR128:$src2), + def rr : PDI<opc, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2), !if(Is2Addr, !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), - [(set VR128:$dst, (IntId VR128:$src1, VR128:$src2))]>; - def rm : PDI<opc, MRMSrcMem, (outs VR128:$dst), - (ins VR128:$src1, i128mem:$src2), + [(set RC:$dst, (IntId RC:$src1, RC:$src2))], itins.rr>; + def rm : PDI<opc, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, x86memop:$src2), !if(Is2Addr, !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), - [(set VR128:$dst, (IntId VR128:$src1, - (bitconvert (memopv2i64 addr:$src2))))]>; -} - -multiclass PDI_binop_rmi_int<bits<8> opc, bits<8> opc2, Format ImmForm, - string OpcodeStr, Intrinsic IntId, - Intrinsic IntId2, bit Is2Addr = 1> { - def rr : PDI<opc, MRMSrcReg, (outs VR128:$dst), - (ins VR128:$src1, VR128:$src2), + [(set RC:$dst, (IntId RC:$src1, (bitconvert (memop_frag addr:$src2))))], + itins.rm>; +} + +multiclass PDI_binop_rmi<bits<8> opc, bits<8> opc2, Format ImmForm, + string OpcodeStr, SDNode OpNode, + SDNode OpNode2, RegisterClass RC, + ValueType DstVT, ValueType SrcVT, PatFrag bc_frag, + ShiftOpndItins itins, + bit Is2Addr = 1> { + // src2 is always 128-bit + def rr : PDI<opc, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, VR128:$src2), !if(Is2Addr, !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), - [(set VR128:$dst, (IntId VR128:$src1, VR128:$src2))]>; - def rm : PDI<opc, MRMSrcMem, (outs VR128:$dst), - (ins VR128:$src1, i128mem:$src2), - !if(Is2Addr, - !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), - !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), - [(set VR128:$dst, (IntId VR128:$src1, - (bitconvert (memopv2i64 addr:$src2))))]>; - def ri : PDIi8<opc2, ImmForm, (outs VR128:$dst), - (ins VR128:$src1, i32i8imm:$src2), - 
!if(Is2Addr, - !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), - !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), - [(set VR128:$dst, (IntId2 VR128:$src1, (i32 imm:$src2)))]>; -} - -/// PDI_binop_rm - Simple SSE2 binary operator. -multiclass PDI_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, - ValueType OpVT, bit IsCommutable = 0, bit Is2Addr = 1> { - let isCommutable = IsCommutable in - def rr : PDI<opc, MRMSrcReg, (outs VR128:$dst), - (ins VR128:$src1, VR128:$src2), + [(set RC:$dst, (DstVT (OpNode RC:$src1, (SrcVT VR128:$src2))))], + itins.rr>; + def rm : PDI<opc, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, i128mem:$src2), !if(Is2Addr, !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), - [(set VR128:$dst, (OpVT (OpNode VR128:$src1, VR128:$src2)))]>; - def rm : PDI<opc, MRMSrcMem, (outs VR128:$dst), - (ins VR128:$src1, i128mem:$src2), + [(set RC:$dst, (DstVT (OpNode RC:$src1, + (bc_frag (memopv2i64 addr:$src2)))))], itins.rm>; + def ri : PDIi8<opc2, ImmForm, (outs RC:$dst), + (ins RC:$src1, i32i8imm:$src2), !if(Is2Addr, !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), - [(set VR128:$dst, (OpVT (OpNode VR128:$src1, - (bitconvert (memopv2i64 addr:$src2)))))]>; + [(set RC:$dst, (DstVT (OpNode2 RC:$src1, (i32 imm:$src2))))], itins.ri>; } -/// PDI_binop_rm_v2i64 - Simple SSE2 binary operator whose type is v2i64. -/// -/// FIXME: we could eliminate this and use PDI_binop_rm instead if tblgen knew -/// to collapse (bitconvert VT to VT) into its operand. -/// -multiclass PDI_binop_rm_v2i64<bits<8> opc, string OpcodeStr, SDNode OpNode, - bit IsCommutable = 0, bit Is2Addr = 1> { +/// PDI_binop_rm - Simple SSE2 binary operator with different src and dst types +multiclass PDI_binop_rm2<bits<8> opc, string OpcodeStr, SDNode OpNode, + ValueType DstVT, ValueType SrcVT, RegisterClass RC, + PatFrag memop_frag, X86MemOperand x86memop, + OpndItins itins, + bit IsCommutable = 0, bit Is2Addr = 1> { let isCommutable = IsCommutable in - def rr : PDI<opc, MRMSrcReg, (outs VR128:$dst), - (ins VR128:$src1, VR128:$src2), + def rr : PDI<opc, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2), !if(Is2Addr, !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), - [(set VR128:$dst, (v2i64 (OpNode VR128:$src1, VR128:$src2)))]>; - def rm : PDI<opc, MRMSrcMem, (outs VR128:$dst), - (ins VR128:$src1, i128mem:$src2), + [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1), RC:$src2)))]>; + def rm : PDI<opc, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, x86memop:$src2), !if(Is2Addr, !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), - [(set VR128:$dst, (OpNode VR128:$src1, (memopv2i64 addr:$src2)))]>; + [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1), + (bitconvert (memop_frag addr:$src2)))))]>; } - } // ExeDomain = SSEPackedInt // 128-bit Integer Arithmetic let Predicates = [HasAVX] in { -defm VPADDB : PDI_binop_rm<0xFC, "vpaddb", add, v16i8, 1, 0 /*3addr*/>, VEX_4V; -defm VPADDW : PDI_binop_rm<0xFD, "vpaddw", add, v8i16, 1, 0>, VEX_4V; -defm VPADDD : PDI_binop_rm<0xFE, "vpaddd", add, v4i32, 1, 0>, VEX_4V; -defm VPADDQ : PDI_binop_rm_v2i64<0xD4, "vpaddq", add, 1, 0>, VEX_4V; -defm VPMULLW : PDI_binop_rm<0xD5, "vpmullw", mul, v8i16, 1, 0>, VEX_4V; -defm VPSUBB : PDI_binop_rm<0xF8, "vpsubb", sub, v16i8, 0, 0>, VEX_4V; 
-defm VPSUBW : PDI_binop_rm<0xF9, "vpsubw", sub, v8i16, 0, 0>, VEX_4V; -defm VPSUBD : PDI_binop_rm<0xFA, "vpsubd", sub, v4i32, 0, 0>, VEX_4V; -defm VPSUBQ : PDI_binop_rm_v2i64<0xFB, "vpsubq", sub, 0, 0>, VEX_4V; +defm VPADDB : PDI_binop_rm<0xFC, "vpaddb", add, v16i8, VR128, memopv2i64, + i128mem, SSE_INTALU_ITINS_P, 1, 0 /*3addr*/>, + VEX_4V; +defm VPADDW : PDI_binop_rm<0xFD, "vpaddw", add, v8i16, VR128, memopv2i64, + i128mem, SSE_INTALU_ITINS_P, 1, 0>, VEX_4V; +defm VPADDD : PDI_binop_rm<0xFE, "vpaddd", add, v4i32, VR128, memopv2i64, + i128mem, SSE_INTALU_ITINS_P, 1, 0>, VEX_4V; +defm VPADDQ : PDI_binop_rm<0xD4, "vpaddq", add, v2i64, VR128, memopv2i64, + i128mem, SSE_INTALUQ_ITINS_P, 1, 0>, VEX_4V; +defm VPMULLW : PDI_binop_rm<0xD5, "vpmullw", mul, v8i16, VR128, memopv2i64, + i128mem, SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V; +defm VPSUBB : PDI_binop_rm<0xF8, "vpsubb", sub, v16i8, VR128, memopv2i64, + i128mem, SSE_INTALU_ITINS_P, 0, 0>, VEX_4V; +defm VPSUBW : PDI_binop_rm<0xF9, "vpsubw", sub, v8i16, VR128, memopv2i64, + i128mem, SSE_INTALU_ITINS_P, 0, 0>, VEX_4V; +defm VPSUBD : PDI_binop_rm<0xFA, "vpsubd", sub, v4i32, VR128, memopv2i64, + i128mem, SSE_INTALU_ITINS_P, 0, 0>, VEX_4V; +defm VPSUBQ : PDI_binop_rm<0xFB, "vpsubq", sub, v2i64, VR128, memopv2i64, + i128mem, SSE_INTALUQ_ITINS_P, 0, 0>, VEX_4V; +defm VPMULUDQ : PDI_binop_rm2<0xF4, "vpmuludq", X86pmuludq, v2i64, v4i32, VR128, + memopv2i64, i128mem, SSE_INTMUL_ITINS_P, 1, 0>, + VEX_4V; + +// Intrinsic forms +defm VPSUBSB : PDI_binop_rm_int<0xE8, "vpsubsb" , int_x86_sse2_psubs_b, + VR128, memopv2i64, i128mem, + SSE_INTALU_ITINS_P, 0, 0>, VEX_4V; +defm VPSUBSW : PDI_binop_rm_int<0xE9, "vpsubsw" , int_x86_sse2_psubs_w, + VR128, memopv2i64, i128mem, + SSE_INTALU_ITINS_P, 0, 0>, VEX_4V; +defm VPSUBUSB : PDI_binop_rm_int<0xD8, "vpsubusb", int_x86_sse2_psubus_b, + VR128, memopv2i64, i128mem, + SSE_INTALU_ITINS_P, 0, 0>, VEX_4V; +defm VPSUBUSW : PDI_binop_rm_int<0xD9, "vpsubusw", int_x86_sse2_psubus_w, + VR128, memopv2i64, i128mem, + SSE_INTALU_ITINS_P, 0, 0>, VEX_4V; +defm VPADDSB : PDI_binop_rm_int<0xEC, "vpaddsb" , int_x86_sse2_padds_b, + VR128, memopv2i64, i128mem, + SSE_INTALU_ITINS_P, 1, 0>, VEX_4V; +defm VPADDSW : PDI_binop_rm_int<0xED, "vpaddsw" , int_x86_sse2_padds_w, + VR128, memopv2i64, i128mem, + SSE_INTALU_ITINS_P, 1, 0>, VEX_4V; +defm VPADDUSB : PDI_binop_rm_int<0xDC, "vpaddusb", int_x86_sse2_paddus_b, + VR128, memopv2i64, i128mem, + SSE_INTALU_ITINS_P, 1, 0>, VEX_4V; +defm VPADDUSW : PDI_binop_rm_int<0xDD, "vpaddusw", int_x86_sse2_paddus_w, + VR128, memopv2i64, i128mem, + SSE_INTALU_ITINS_P, 1, 0>, VEX_4V; +defm VPMULHUW : PDI_binop_rm_int<0xE4, "vpmulhuw", int_x86_sse2_pmulhu_w, + VR128, memopv2i64, i128mem, + SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V; +defm VPMULHW : PDI_binop_rm_int<0xE5, "vpmulhw" , int_x86_sse2_pmulh_w, + VR128, memopv2i64, i128mem, + SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V; +defm VPMADDWD : PDI_binop_rm_int<0xF5, "vpmaddwd", int_x86_sse2_pmadd_wd, + VR128, memopv2i64, i128mem, + SSE_PMADD, 1, 0>, VEX_4V; +defm VPAVGB : PDI_binop_rm_int<0xE0, "vpavgb", int_x86_sse2_pavg_b, + VR128, memopv2i64, i128mem, + SSE_INTALU_ITINS_P, 1, 0>, VEX_4V; +defm VPAVGW : PDI_binop_rm_int<0xE3, "vpavgw", int_x86_sse2_pavg_w, + VR128, memopv2i64, i128mem, + SSE_INTALU_ITINS_P, 1, 0>, VEX_4V; +defm VPMINUB : PDI_binop_rm_int<0xDA, "vpminub", int_x86_sse2_pminu_b, + VR128, memopv2i64, i128mem, + SSE_INTALU_ITINS_P, 1, 0>, VEX_4V; +defm VPMINSW : PDI_binop_rm_int<0xEA, "vpminsw", int_x86_sse2_pmins_w, + VR128, memopv2i64, i128mem, + 
SSE_INTALU_ITINS_P, 1, 0>, VEX_4V; +defm VPMAXUB : PDI_binop_rm_int<0xDE, "vpmaxub", int_x86_sse2_pmaxu_b, + VR128, memopv2i64, i128mem, + SSE_INTALU_ITINS_P, 1, 0>, VEX_4V; +defm VPMAXSW : PDI_binop_rm_int<0xEE, "vpmaxsw", int_x86_sse2_pmaxs_w, + VR128, memopv2i64, i128mem, + SSE_INTALU_ITINS_P, 1, 0>, VEX_4V; +defm VPSADBW : PDI_binop_rm_int<0xF6, "vpsadbw", int_x86_sse2_psad_bw, + VR128, memopv2i64, i128mem, + SSE_INTALU_ITINS_P, 1, 0>, VEX_4V; +} + +let Predicates = [HasAVX2] in { +defm VPADDBY : PDI_binop_rm<0xFC, "vpaddb", add, v32i8, VR256, memopv4i64, + i256mem, SSE_INTALU_ITINS_P, 1, 0>, VEX_4V; +defm VPADDWY : PDI_binop_rm<0xFD, "vpaddw", add, v16i16, VR256, memopv4i64, + i256mem, SSE_INTALU_ITINS_P, 1, 0>, VEX_4V; +defm VPADDDY : PDI_binop_rm<0xFE, "vpaddd", add, v8i32, VR256, memopv4i64, + i256mem, SSE_INTALU_ITINS_P, 1, 0>, VEX_4V; +defm VPADDQY : PDI_binop_rm<0xD4, "vpaddq", add, v4i64, VR256, memopv4i64, + i256mem, SSE_INTALUQ_ITINS_P, 1, 0>, VEX_4V; +defm VPMULLWY : PDI_binop_rm<0xD5, "vpmullw", mul, v16i16, VR256, memopv4i64, + i256mem, SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V; +defm VPSUBBY : PDI_binop_rm<0xF8, "vpsubb", sub, v32i8, VR256, memopv4i64, + i256mem, SSE_INTALU_ITINS_P, 0, 0>, VEX_4V; +defm VPSUBWY : PDI_binop_rm<0xF9, "vpsubw", sub, v16i16,VR256, memopv4i64, + i256mem, SSE_INTALU_ITINS_P, 0, 0>, VEX_4V; +defm VPSUBDY : PDI_binop_rm<0xFA, "vpsubd", sub, v8i32, VR256, memopv4i64, + i256mem, SSE_INTALU_ITINS_P, 0, 0>, VEX_4V; +defm VPSUBQY : PDI_binop_rm<0xFB, "vpsubq", sub, v4i64, VR256, memopv4i64, + i256mem, SSE_INTALUQ_ITINS_P, 0, 0>, VEX_4V; +defm VPMULUDQY : PDI_binop_rm2<0xF4, "vpmuludq", X86pmuludq, v4i64, v8i32, + VR256, memopv4i64, i256mem, + SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V; // Intrinsic forms -defm VPSUBSB : PDI_binop_rm_int<0xE8, "vpsubsb" , int_x86_sse2_psubs_b, 0, 0>, - VEX_4V; -defm VPSUBSW : PDI_binop_rm_int<0xE9, "vpsubsw" , int_x86_sse2_psubs_w, 0, 0>, - VEX_4V; -defm VPSUBUSB : PDI_binop_rm_int<0xD8, "vpsubusb", int_x86_sse2_psubus_b, 0, 0>, - VEX_4V; -defm VPSUBUSW : PDI_binop_rm_int<0xD9, "vpsubusw", int_x86_sse2_psubus_w, 0, 0>, - VEX_4V; -defm VPADDSB : PDI_binop_rm_int<0xEC, "vpaddsb" , int_x86_sse2_padds_b, 1, 0>, - VEX_4V; -defm VPADDSW : PDI_binop_rm_int<0xED, "vpaddsw" , int_x86_sse2_padds_w, 1, 0>, - VEX_4V; -defm VPADDUSB : PDI_binop_rm_int<0xDC, "vpaddusb", int_x86_sse2_paddus_b, 1, 0>, - VEX_4V; -defm VPADDUSW : PDI_binop_rm_int<0xDD, "vpaddusw", int_x86_sse2_paddus_w, 1, 0>, - VEX_4V; -defm VPMULHUW : PDI_binop_rm_int<0xE4, "vpmulhuw", int_x86_sse2_pmulhu_w, 1, 0>, - VEX_4V; -defm VPMULHW : PDI_binop_rm_int<0xE5, "vpmulhw" , int_x86_sse2_pmulh_w, 1, 0>, - VEX_4V; -defm VPMULUDQ : PDI_binop_rm_int<0xF4, "vpmuludq", int_x86_sse2_pmulu_dq, 1, 0>, - VEX_4V; -defm VPMADDWD : PDI_binop_rm_int<0xF5, "vpmaddwd", int_x86_sse2_pmadd_wd, 1, 0>, - VEX_4V; -defm VPAVGB : PDI_binop_rm_int<0xE0, "vpavgb", int_x86_sse2_pavg_b, 1, 0>, - VEX_4V; -defm VPAVGW : PDI_binop_rm_int<0xE3, "vpavgw", int_x86_sse2_pavg_w, 1, 0>, - VEX_4V; -defm VPMINUB : PDI_binop_rm_int<0xDA, "vpminub", int_x86_sse2_pminu_b, 1, 0>, - VEX_4V; -defm VPMINSW : PDI_binop_rm_int<0xEA, "vpminsw", int_x86_sse2_pmins_w, 1, 0>, - VEX_4V; -defm VPMAXUB : PDI_binop_rm_int<0xDE, "vpmaxub", int_x86_sse2_pmaxu_b, 1, 0>, - VEX_4V; -defm VPMAXSW : PDI_binop_rm_int<0xEE, "vpmaxsw", int_x86_sse2_pmaxs_w, 1, 0>, - VEX_4V; -defm VPSADBW : PDI_binop_rm_int<0xF6, "vpsadbw", int_x86_sse2_psad_bw, 1, 0>, - VEX_4V; +defm VPSUBSBY : PDI_binop_rm_int<0xE8, "vpsubsb" , int_x86_avx2_psubs_b, + VR256, 
memopv4i64, i256mem, + SSE_INTALU_ITINS_P, 0, 0>, VEX_4V; +defm VPSUBSWY : PDI_binop_rm_int<0xE9, "vpsubsw" , int_x86_avx2_psubs_w, + VR256, memopv4i64, i256mem, + SSE_INTALU_ITINS_P, 0, 0>, VEX_4V; +defm VPSUBUSBY : PDI_binop_rm_int<0xD8, "vpsubusb", int_x86_avx2_psubus_b, + VR256, memopv4i64, i256mem, + SSE_INTALU_ITINS_P, 0, 0>, VEX_4V; +defm VPSUBUSWY : PDI_binop_rm_int<0xD9, "vpsubusw", int_x86_avx2_psubus_w, + VR256, memopv4i64, i256mem, + SSE_INTALU_ITINS_P, 0, 0>, VEX_4V; +defm VPADDSBY : PDI_binop_rm_int<0xEC, "vpaddsb" , int_x86_avx2_padds_b, + VR256, memopv4i64, i256mem, + SSE_INTALU_ITINS_P, 1, 0>, VEX_4V; +defm VPADDSWY : PDI_binop_rm_int<0xED, "vpaddsw" , int_x86_avx2_padds_w, + VR256, memopv4i64, i256mem, + SSE_INTALU_ITINS_P, 1, 0>, VEX_4V; +defm VPADDUSBY : PDI_binop_rm_int<0xDC, "vpaddusb", int_x86_avx2_paddus_b, + VR256, memopv4i64, i256mem, + SSE_INTALU_ITINS_P, 1, 0>, VEX_4V; +defm VPADDUSWY : PDI_binop_rm_int<0xDD, "vpaddusw", int_x86_avx2_paddus_w, + VR256, memopv4i64, i256mem, + SSE_INTALU_ITINS_P, 1, 0>, VEX_4V; +defm VPMULHUWY : PDI_binop_rm_int<0xE4, "vpmulhuw", int_x86_avx2_pmulhu_w, + VR256, memopv4i64, i256mem, + SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V; +defm VPMULHWY : PDI_binop_rm_int<0xE5, "vpmulhw" , int_x86_avx2_pmulh_w, + VR256, memopv4i64, i256mem, + SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V; +defm VPMADDWDY : PDI_binop_rm_int<0xF5, "vpmaddwd", int_x86_avx2_pmadd_wd, + VR256, memopv4i64, i256mem, + SSE_PMADD, 1, 0>, VEX_4V; +defm VPAVGBY : PDI_binop_rm_int<0xE0, "vpavgb", int_x86_avx2_pavg_b, + VR256, memopv4i64, i256mem, + SSE_INTALU_ITINS_P, 1, 0>, VEX_4V; +defm VPAVGWY : PDI_binop_rm_int<0xE3, "vpavgw", int_x86_avx2_pavg_w, + VR256, memopv4i64, i256mem, + SSE_INTALU_ITINS_P, 1, 0>, VEX_4V; +defm VPMINUBY : PDI_binop_rm_int<0xDA, "vpminub", int_x86_avx2_pminu_b, + VR256, memopv4i64, i256mem, + SSE_INTALU_ITINS_P, 1, 0>, VEX_4V; +defm VPMINSWY : PDI_binop_rm_int<0xEA, "vpminsw", int_x86_avx2_pmins_w, + VR256, memopv4i64, i256mem, + SSE_INTALU_ITINS_P, 1, 0>, VEX_4V; +defm VPMAXUBY : PDI_binop_rm_int<0xDE, "vpmaxub", int_x86_avx2_pmaxu_b, + VR256, memopv4i64, i256mem, + SSE_INTALU_ITINS_P, 1, 0>, VEX_4V; +defm VPMAXSWY : PDI_binop_rm_int<0xEE, "vpmaxsw", int_x86_avx2_pmaxs_w, + VR256, memopv4i64, i256mem, + SSE_INTALU_ITINS_P, 1, 0>, VEX_4V; +defm VPSADBWY : PDI_binop_rm_int<0xF6, "vpsadbw", int_x86_avx2_psad_bw, + VR256, memopv4i64, i256mem, + SSE_INTALU_ITINS_P, 1, 0>, VEX_4V; } let Constraints = "$src1 = $dst" in { -defm PADDB : PDI_binop_rm<0xFC, "paddb", add, v16i8, 1>; -defm PADDW : PDI_binop_rm<0xFD, "paddw", add, v8i16, 1>; -defm PADDD : PDI_binop_rm<0xFE, "paddd", add, v4i32, 1>; -defm PADDQ : PDI_binop_rm_v2i64<0xD4, "paddq", add, 1>; -defm PMULLW : PDI_binop_rm<0xD5, "pmullw", mul, v8i16, 1>; -defm PSUBB : PDI_binop_rm<0xF8, "psubb", sub, v16i8>; -defm PSUBW : PDI_binop_rm<0xF9, "psubw", sub, v8i16>; -defm PSUBD : PDI_binop_rm<0xFA, "psubd", sub, v4i32>; -defm PSUBQ : PDI_binop_rm_v2i64<0xFB, "psubq", sub>; +defm PADDB : PDI_binop_rm<0xFC, "paddb", add, v16i8, VR128, memopv2i64, + i128mem, SSE_INTALU_ITINS_P, 1>; +defm PADDW : PDI_binop_rm<0xFD, "paddw", add, v8i16, VR128, memopv2i64, + i128mem, SSE_INTALU_ITINS_P, 1>; +defm PADDD : PDI_binop_rm<0xFE, "paddd", add, v4i32, VR128, memopv2i64, + i128mem, SSE_INTALU_ITINS_P, 1>; +defm PADDQ : PDI_binop_rm<0xD4, "paddq", add, v2i64, VR128, memopv2i64, + i128mem, SSE_INTALUQ_ITINS_P, 1>; +defm PMULLW : PDI_binop_rm<0xD5, "pmullw", mul, v8i16, VR128, memopv2i64, + i128mem, SSE_INTMUL_ITINS_P, 1>; +defm PSUBB : 
PDI_binop_rm<0xF8, "psubb", sub, v16i8, VR128, memopv2i64, + i128mem, SSE_INTALU_ITINS_P>; +defm PSUBW : PDI_binop_rm<0xF9, "psubw", sub, v8i16, VR128, memopv2i64, + i128mem, SSE_INTALU_ITINS_P>; +defm PSUBD : PDI_binop_rm<0xFA, "psubd", sub, v4i32, VR128, memopv2i64, + i128mem, SSE_INTALU_ITINS_P>; +defm PSUBQ : PDI_binop_rm<0xFB, "psubq", sub, v2i64, VR128, memopv2i64, + i128mem, SSE_INTALUQ_ITINS_P>; +defm PMULUDQ : PDI_binop_rm2<0xF4, "pmuludq", X86pmuludq, v2i64, v4i32, VR128, + memopv2i64, i128mem, SSE_INTMUL_ITINS_P, 1>; // Intrinsic forms -defm PSUBSB : PDI_binop_rm_int<0xE8, "psubsb" , int_x86_sse2_psubs_b>; -defm PSUBSW : PDI_binop_rm_int<0xE9, "psubsw" , int_x86_sse2_psubs_w>; -defm PSUBUSB : PDI_binop_rm_int<0xD8, "psubusb", int_x86_sse2_psubus_b>; -defm PSUBUSW : PDI_binop_rm_int<0xD9, "psubusw", int_x86_sse2_psubus_w>; -defm PADDSB : PDI_binop_rm_int<0xEC, "paddsb" , int_x86_sse2_padds_b, 1>; -defm PADDSW : PDI_binop_rm_int<0xED, "paddsw" , int_x86_sse2_padds_w, 1>; -defm PADDUSB : PDI_binop_rm_int<0xDC, "paddusb", int_x86_sse2_paddus_b, 1>; -defm PADDUSW : PDI_binop_rm_int<0xDD, "paddusw", int_x86_sse2_paddus_w, 1>; -defm PMULHUW : PDI_binop_rm_int<0xE4, "pmulhuw", int_x86_sse2_pmulhu_w, 1>; -defm PMULHW : PDI_binop_rm_int<0xE5, "pmulhw" , int_x86_sse2_pmulh_w, 1>; -defm PMULUDQ : PDI_binop_rm_int<0xF4, "pmuludq", int_x86_sse2_pmulu_dq, 1>; -defm PMADDWD : PDI_binop_rm_int<0xF5, "pmaddwd", int_x86_sse2_pmadd_wd, 1>; -defm PAVGB : PDI_binop_rm_int<0xE0, "pavgb", int_x86_sse2_pavg_b, 1>; -defm PAVGW : PDI_binop_rm_int<0xE3, "pavgw", int_x86_sse2_pavg_w, 1>; -defm PMINUB : PDI_binop_rm_int<0xDA, "pminub", int_x86_sse2_pminu_b, 1>; -defm PMINSW : PDI_binop_rm_int<0xEA, "pminsw", int_x86_sse2_pmins_w, 1>; -defm PMAXUB : PDI_binop_rm_int<0xDE, "pmaxub", int_x86_sse2_pmaxu_b, 1>; -defm PMAXSW : PDI_binop_rm_int<0xEE, "pmaxsw", int_x86_sse2_pmaxs_w, 1>; -defm PSADBW : PDI_binop_rm_int<0xF6, "psadbw", int_x86_sse2_psad_bw, 1>; +defm PSUBSB : PDI_binop_rm_int<0xE8, "psubsb" , int_x86_sse2_psubs_b, + VR128, memopv2i64, i128mem, + SSE_INTALU_ITINS_P>; +defm PSUBSW : PDI_binop_rm_int<0xE9, "psubsw" , int_x86_sse2_psubs_w, + VR128, memopv2i64, i128mem, + SSE_INTALU_ITINS_P>; +defm PSUBUSB : PDI_binop_rm_int<0xD8, "psubusb", int_x86_sse2_psubus_b, + VR128, memopv2i64, i128mem, + SSE_INTALU_ITINS_P>; +defm PSUBUSW : PDI_binop_rm_int<0xD9, "psubusw", int_x86_sse2_psubus_w, + VR128, memopv2i64, i128mem, + SSE_INTALU_ITINS_P>; +defm PADDSB : PDI_binop_rm_int<0xEC, "paddsb" , int_x86_sse2_padds_b, + VR128, memopv2i64, i128mem, + SSE_INTALU_ITINS_P, 1>; +defm PADDSW : PDI_binop_rm_int<0xED, "paddsw" , int_x86_sse2_padds_w, + VR128, memopv2i64, i128mem, + SSE_INTALU_ITINS_P, 1>; +defm PADDUSB : PDI_binop_rm_int<0xDC, "paddusb", int_x86_sse2_paddus_b, + VR128, memopv2i64, i128mem, + SSE_INTALU_ITINS_P, 1>; +defm PADDUSW : PDI_binop_rm_int<0xDD, "paddusw", int_x86_sse2_paddus_w, + VR128, memopv2i64, i128mem, + SSE_INTALU_ITINS_P, 1>; +defm PMULHUW : PDI_binop_rm_int<0xE4, "pmulhuw", int_x86_sse2_pmulhu_w, + VR128, memopv2i64, i128mem, + SSE_INTMUL_ITINS_P, 1>; +defm PMULHW : PDI_binop_rm_int<0xE5, "pmulhw" , int_x86_sse2_pmulh_w, + VR128, memopv2i64, i128mem, + SSE_INTMUL_ITINS_P, 1>; +defm PMADDWD : PDI_binop_rm_int<0xF5, "pmaddwd", int_x86_sse2_pmadd_wd, + VR128, memopv2i64, i128mem, + SSE_PMADD, 1>; +defm PAVGB : PDI_binop_rm_int<0xE0, "pavgb", int_x86_sse2_pavg_b, + VR128, memopv2i64, i128mem, + SSE_INTALU_ITINS_P, 1>; +defm PAVGW : PDI_binop_rm_int<0xE3, "pavgw", int_x86_sse2_pavg_w, + VR128, 
memopv2i64, i128mem, + SSE_INTALU_ITINS_P, 1>; +defm PMINUB : PDI_binop_rm_int<0xDA, "pminub", int_x86_sse2_pminu_b, + VR128, memopv2i64, i128mem, + SSE_INTALU_ITINS_P, 1>; +defm PMINSW : PDI_binop_rm_int<0xEA, "pminsw", int_x86_sse2_pmins_w, + VR128, memopv2i64, i128mem, + SSE_INTALU_ITINS_P, 1>; +defm PMAXUB : PDI_binop_rm_int<0xDE, "pmaxub", int_x86_sse2_pmaxu_b, + VR128, memopv2i64, i128mem, + SSE_INTALU_ITINS_P, 1>; +defm PMAXSW : PDI_binop_rm_int<0xEE, "pmaxsw", int_x86_sse2_pmaxs_w, + VR128, memopv2i64, i128mem, + SSE_INTALU_ITINS_P, 1>; +defm PSADBW : PDI_binop_rm_int<0xF6, "psadbw", int_x86_sse2_psad_bw, + VR128, memopv2i64, i128mem, + SSE_INTALU_ITINS_P, 1>; } // Constraints = "$src1 = $dst" @@ -3520,145 +3909,176 @@ defm PSADBW : PDI_binop_rm_int<0xF6, "psadbw", int_x86_sse2_psad_bw, 1>; //===---------------------------------------------------------------------===// let Predicates = [HasAVX] in { -defm VPSLLW : PDI_binop_rmi_int<0xF1, 0x71, MRM6r, "vpsllw", - int_x86_sse2_psll_w, int_x86_sse2_pslli_w, 0>, - VEX_4V; -defm VPSLLD : PDI_binop_rmi_int<0xF2, 0x72, MRM6r, "vpslld", - int_x86_sse2_psll_d, int_x86_sse2_pslli_d, 0>, - VEX_4V; -defm VPSLLQ : PDI_binop_rmi_int<0xF3, 0x73, MRM6r, "vpsllq", - int_x86_sse2_psll_q, int_x86_sse2_pslli_q, 0>, - VEX_4V; - -defm VPSRLW : PDI_binop_rmi_int<0xD1, 0x71, MRM2r, "vpsrlw", - int_x86_sse2_psrl_w, int_x86_sse2_psrli_w, 0>, - VEX_4V; -defm VPSRLD : PDI_binop_rmi_int<0xD2, 0x72, MRM2r, "vpsrld", - int_x86_sse2_psrl_d, int_x86_sse2_psrli_d, 0>, - VEX_4V; -defm VPSRLQ : PDI_binop_rmi_int<0xD3, 0x73, MRM2r, "vpsrlq", - int_x86_sse2_psrl_q, int_x86_sse2_psrli_q, 0>, - VEX_4V; - -defm VPSRAW : PDI_binop_rmi_int<0xE1, 0x71, MRM4r, "vpsraw", - int_x86_sse2_psra_w, int_x86_sse2_psrai_w, 0>, - VEX_4V; -defm VPSRAD : PDI_binop_rmi_int<0xE2, 0x72, MRM4r, "vpsrad", - int_x86_sse2_psra_d, int_x86_sse2_psrai_d, 0>, - VEX_4V; - -defm VPAND : PDI_binop_rm_v2i64<0xDB, "vpand", and, 1, 0>, VEX_4V; -defm VPOR : PDI_binop_rm_v2i64<0xEB, "vpor" , or, 1, 0>, VEX_4V; -defm VPXOR : PDI_binop_rm_v2i64<0xEF, "vpxor", xor, 1, 0>, VEX_4V; +defm VPSLLW : PDI_binop_rmi<0xF1, 0x71, MRM6r, "vpsllw", X86vshl, X86vshli, + VR128, v8i16, v8i16, bc_v8i16, + SSE_INTSHIFT_ITINS_P, 0>, VEX_4V; +defm VPSLLD : PDI_binop_rmi<0xF2, 0x72, MRM6r, "vpslld", X86vshl, X86vshli, + VR128, v4i32, v4i32, bc_v4i32, + SSE_INTSHIFT_ITINS_P, 0>, VEX_4V; +defm VPSLLQ : PDI_binop_rmi<0xF3, 0x73, MRM6r, "vpsllq", X86vshl, X86vshli, + VR128, v2i64, v2i64, bc_v2i64, + SSE_INTSHIFT_ITINS_P, 0>, VEX_4V; + +defm VPSRLW : PDI_binop_rmi<0xD1, 0x71, MRM2r, "vpsrlw", X86vsrl, X86vsrli, + VR128, v8i16, v8i16, bc_v8i16, + SSE_INTSHIFT_ITINS_P, 0>, VEX_4V; +defm VPSRLD : PDI_binop_rmi<0xD2, 0x72, MRM2r, "vpsrld", X86vsrl, X86vsrli, + VR128, v4i32, v4i32, bc_v4i32, + SSE_INTSHIFT_ITINS_P, 0>, VEX_4V; +defm VPSRLQ : PDI_binop_rmi<0xD3, 0x73, MRM2r, "vpsrlq", X86vsrl, X86vsrli, + VR128, v2i64, v2i64, bc_v2i64, + SSE_INTSHIFT_ITINS_P, 0>, VEX_4V; + +defm VPSRAW : PDI_binop_rmi<0xE1, 0x71, MRM4r, "vpsraw", X86vsra, X86vsrai, + VR128, v8i16, v8i16, bc_v8i16, + SSE_INTSHIFT_ITINS_P, 0>, VEX_4V; +defm VPSRAD : PDI_binop_rmi<0xE2, 0x72, MRM4r, "vpsrad", X86vsra, X86vsrai, + VR128, v4i32, v4i32, bc_v4i32, + SSE_INTSHIFT_ITINS_P, 0>, VEX_4V; let ExeDomain = SSEPackedInt in { - let neverHasSideEffects = 1 in { - // 128-bit logical shifts. 
- def VPSLLDQri : PDIi8<0x73, MRM7r, - (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2), - "vpslldq\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, - VEX_4V; - def VPSRLDQri : PDIi8<0x73, MRM3r, - (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2), - "vpsrldq\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, - VEX_4V; - // PSRADQri doesn't exist in SSE[1-3]. - } - def VPANDNrr : PDI<0xDF, MRMSrcReg, - (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), - "vpandn\t{$src2, $src1, $dst|$dst, $src1, $src2}", + // 128-bit logical shifts. + def VPSLLDQri : PDIi8<0x73, MRM7r, + (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2), + "vpslldq\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set VR128:$dst, + (int_x86_sse2_psll_dq_bs VR128:$src1, imm:$src2))]>, + VEX_4V; + def VPSRLDQri : PDIi8<0x73, MRM3r, + (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2), + "vpsrldq\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set VR128:$dst, - (v2i64 (X86andnp VR128:$src1, VR128:$src2)))]>,VEX_4V; + (int_x86_sse2_psrl_dq_bs VR128:$src1, imm:$src2))]>, + VEX_4V; + // PSRADQri doesn't exist in SSE[1-3]. +} +} // Predicates = [HasAVX] + +let Predicates = [HasAVX2] in { +defm VPSLLWY : PDI_binop_rmi<0xF1, 0x71, MRM6r, "vpsllw", X86vshl, X86vshli, + VR256, v16i16, v8i16, bc_v8i16, + SSE_INTSHIFT_ITINS_P, 0>, VEX_4V; +defm VPSLLDY : PDI_binop_rmi<0xF2, 0x72, MRM6r, "vpslld", X86vshl, X86vshli, + VR256, v8i32, v4i32, bc_v4i32, + SSE_INTSHIFT_ITINS_P, 0>, VEX_4V; +defm VPSLLQY : PDI_binop_rmi<0xF3, 0x73, MRM6r, "vpsllq", X86vshl, X86vshli, + VR256, v4i64, v2i64, bc_v2i64, + SSE_INTSHIFT_ITINS_P, 0>, VEX_4V; + +defm VPSRLWY : PDI_binop_rmi<0xD1, 0x71, MRM2r, "vpsrlw", X86vsrl, X86vsrli, + VR256, v16i16, v8i16, bc_v8i16, + SSE_INTSHIFT_ITINS_P, 0>, VEX_4V; +defm VPSRLDY : PDI_binop_rmi<0xD2, 0x72, MRM2r, "vpsrld", X86vsrl, X86vsrli, + VR256, v8i32, v4i32, bc_v4i32, + SSE_INTSHIFT_ITINS_P, 0>, VEX_4V; +defm VPSRLQY : PDI_binop_rmi<0xD3, 0x73, MRM2r, "vpsrlq", X86vsrl, X86vsrli, + VR256, v4i64, v2i64, bc_v2i64, + SSE_INTSHIFT_ITINS_P, 0>, VEX_4V; + +defm VPSRAWY : PDI_binop_rmi<0xE1, 0x71, MRM4r, "vpsraw", X86vsra, X86vsrai, + VR256, v16i16, v8i16, bc_v8i16, + SSE_INTSHIFT_ITINS_P, 0>, VEX_4V; +defm VPSRADY : PDI_binop_rmi<0xE2, 0x72, MRM4r, "vpsrad", X86vsra, X86vsrai, + VR256, v8i32, v4i32, bc_v4i32, + SSE_INTSHIFT_ITINS_P, 0>, VEX_4V; - def VPANDNrm : PDI<0xDF, MRMSrcMem, - (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2), - "vpandn\t{$src2, $src1, $dst|$dst, $src1, $src2}", - [(set VR128:$dst, (X86andnp VR128:$src1, - (memopv2i64 addr:$src2)))]>, VEX_4V; -} -} +let ExeDomain = SSEPackedInt in { + // 256-bit logical shifts. + def VPSLLDQYri : PDIi8<0x73, MRM7r, + (outs VR256:$dst), (ins VR256:$src1, i32i8imm:$src2), + "vpslldq\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set VR256:$dst, + (int_x86_avx2_psll_dq_bs VR256:$src1, imm:$src2))]>, + VEX_4V; + def VPSRLDQYri : PDIi8<0x73, MRM3r, + (outs VR256:$dst), (ins VR256:$src1, i32i8imm:$src2), + "vpsrldq\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set VR256:$dst, + (int_x86_avx2_psrl_dq_bs VR256:$src1, imm:$src2))]>, + VEX_4V; + // PSRADQYri doesn't exist in SSE[1-3]. 
+} +} // Predicates = [HasAVX2] let Constraints = "$src1 = $dst" in { -defm PSLLW : PDI_binop_rmi_int<0xF1, 0x71, MRM6r, "psllw", - int_x86_sse2_psll_w, int_x86_sse2_pslli_w>; -defm PSLLD : PDI_binop_rmi_int<0xF2, 0x72, MRM6r, "pslld", - int_x86_sse2_psll_d, int_x86_sse2_pslli_d>; -defm PSLLQ : PDI_binop_rmi_int<0xF3, 0x73, MRM6r, "psllq", - int_x86_sse2_psll_q, int_x86_sse2_pslli_q>; - -defm PSRLW : PDI_binop_rmi_int<0xD1, 0x71, MRM2r, "psrlw", - int_x86_sse2_psrl_w, int_x86_sse2_psrli_w>; -defm PSRLD : PDI_binop_rmi_int<0xD2, 0x72, MRM2r, "psrld", - int_x86_sse2_psrl_d, int_x86_sse2_psrli_d>; -defm PSRLQ : PDI_binop_rmi_int<0xD3, 0x73, MRM2r, "psrlq", - int_x86_sse2_psrl_q, int_x86_sse2_psrli_q>; - -defm PSRAW : PDI_binop_rmi_int<0xE1, 0x71, MRM4r, "psraw", - int_x86_sse2_psra_w, int_x86_sse2_psrai_w>; -defm PSRAD : PDI_binop_rmi_int<0xE2, 0x72, MRM4r, "psrad", - int_x86_sse2_psra_d, int_x86_sse2_psrai_d>; - -defm PAND : PDI_binop_rm_v2i64<0xDB, "pand", and, 1>; -defm POR : PDI_binop_rm_v2i64<0xEB, "por" , or, 1>; -defm PXOR : PDI_binop_rm_v2i64<0xEF, "pxor", xor, 1>; +defm PSLLW : PDI_binop_rmi<0xF1, 0x71, MRM6r, "psllw", X86vshl, X86vshli, + VR128, v8i16, v8i16, bc_v8i16, + SSE_INTSHIFT_ITINS_P>; +defm PSLLD : PDI_binop_rmi<0xF2, 0x72, MRM6r, "pslld", X86vshl, X86vshli, + VR128, v4i32, v4i32, bc_v4i32, + SSE_INTSHIFT_ITINS_P>; +defm PSLLQ : PDI_binop_rmi<0xF3, 0x73, MRM6r, "psllq", X86vshl, X86vshli, + VR128, v2i64, v2i64, bc_v2i64, + SSE_INTSHIFT_ITINS_P>; + +defm PSRLW : PDI_binop_rmi<0xD1, 0x71, MRM2r, "psrlw", X86vsrl, X86vsrli, + VR128, v8i16, v8i16, bc_v8i16, + SSE_INTSHIFT_ITINS_P>; +defm PSRLD : PDI_binop_rmi<0xD2, 0x72, MRM2r, "psrld", X86vsrl, X86vsrli, + VR128, v4i32, v4i32, bc_v4i32, + SSE_INTSHIFT_ITINS_P>; +defm PSRLQ : PDI_binop_rmi<0xD3, 0x73, MRM2r, "psrlq", X86vsrl, X86vsrli, + VR128, v2i64, v2i64, bc_v2i64, + SSE_INTSHIFT_ITINS_P>; + +defm PSRAW : PDI_binop_rmi<0xE1, 0x71, MRM4r, "psraw", X86vsra, X86vsrai, + VR128, v8i16, v8i16, bc_v8i16, + SSE_INTSHIFT_ITINS_P>; +defm PSRAD : PDI_binop_rmi<0xE2, 0x72, MRM4r, "psrad", X86vsra, X86vsrai, + VR128, v4i32, v4i32, bc_v4i32, + SSE_INTSHIFT_ITINS_P>; let ExeDomain = SSEPackedInt in { - let neverHasSideEffects = 1 in { - // 128-bit logical shifts. - def PSLLDQri : PDIi8<0x73, MRM7r, - (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2), - "pslldq\t{$src2, $dst|$dst, $src2}", []>; - def PSRLDQri : PDIi8<0x73, MRM3r, - (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2), - "psrldq\t{$src2, $dst|$dst, $src2}", []>; - // PSRADQri doesn't exist in SSE[1-3]. - } - def PANDNrr : PDI<0xDF, MRMSrcReg, - (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), - "pandn\t{$src2, $dst|$dst, $src2}", []>; - - def PANDNrm : PDI<0xDF, MRMSrcMem, - (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2), - "pandn\t{$src2, $dst|$dst, $src2}", []>; + // 128-bit logical shifts. + def PSLLDQri : PDIi8<0x73, MRM7r, + (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2), + "pslldq\t{$src2, $dst|$dst, $src2}", + [(set VR128:$dst, + (int_x86_sse2_psll_dq_bs VR128:$src1, imm:$src2))]>; + def PSRLDQri : PDIi8<0x73, MRM3r, + (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2), + "psrldq\t{$src2, $dst|$dst, $src2}", + [(set VR128:$dst, + (int_x86_sse2_psrl_dq_bs VR128:$src1, imm:$src2))]>; + // PSRADQri doesn't exist in SSE[1-3]. 
} } // Constraints = "$src1 = $dst" let Predicates = [HasAVX] in { def : Pat<(int_x86_sse2_psll_dq VR128:$src1, imm:$src2), - (v2i64 (VPSLLDQri VR128:$src1, (BYTE_imm imm:$src2)))>; + (VPSLLDQri VR128:$src1, (BYTE_imm imm:$src2))>; def : Pat<(int_x86_sse2_psrl_dq VR128:$src1, imm:$src2), - (v2i64 (VPSRLDQri VR128:$src1, (BYTE_imm imm:$src2)))>; - def : Pat<(int_x86_sse2_psll_dq_bs VR128:$src1, imm:$src2), - (v2i64 (VPSLLDQri VR128:$src1, imm:$src2))>; - def : Pat<(int_x86_sse2_psrl_dq_bs VR128:$src1, imm:$src2), - (v2i64 (VPSRLDQri VR128:$src1, imm:$src2))>; + (VPSRLDQri VR128:$src1, (BYTE_imm imm:$src2))>; def : Pat<(v2f64 (X86fsrl VR128:$src1, i32immSExt8:$src2)), - (v2f64 (VPSRLDQri VR128:$src1, (BYTE_imm imm:$src2)))>; + (VPSRLDQri VR128:$src1, (BYTE_imm imm:$src2))>; // Shift up / down and insert zero's. - def : Pat<(v2i64 (X86vshl VR128:$src, (i8 imm:$amt))), - (v2i64 (VPSLLDQri VR128:$src, (BYTE_imm imm:$amt)))>; - def : Pat<(v2i64 (X86vshr VR128:$src, (i8 imm:$amt))), - (v2i64 (VPSRLDQri VR128:$src, (BYTE_imm imm:$amt)))>; + def : Pat<(v2i64 (X86vshldq VR128:$src, (i8 imm:$amt))), + (VPSLLDQri VR128:$src, (BYTE_imm imm:$amt))>; + def : Pat<(v2i64 (X86vshrdq VR128:$src, (i8 imm:$amt))), + (VPSRLDQri VR128:$src, (BYTE_imm imm:$amt))>; +} + +let Predicates = [HasAVX2] in { + def : Pat<(int_x86_avx2_psll_dq VR256:$src1, imm:$src2), + (VPSLLDQYri VR256:$src1, (BYTE_imm imm:$src2))>; + def : Pat<(int_x86_avx2_psrl_dq VR256:$src1, imm:$src2), + (VPSRLDQYri VR256:$src1, (BYTE_imm imm:$src2))>; } let Predicates = [HasSSE2] in { def : Pat<(int_x86_sse2_psll_dq VR128:$src1, imm:$src2), - (v2i64 (PSLLDQri VR128:$src1, (BYTE_imm imm:$src2)))>; + (PSLLDQri VR128:$src1, (BYTE_imm imm:$src2))>; def : Pat<(int_x86_sse2_psrl_dq VR128:$src1, imm:$src2), - (v2i64 (PSRLDQri VR128:$src1, (BYTE_imm imm:$src2)))>; - def : Pat<(int_x86_sse2_psll_dq_bs VR128:$src1, imm:$src2), - (v2i64 (PSLLDQri VR128:$src1, imm:$src2))>; - def : Pat<(int_x86_sse2_psrl_dq_bs VR128:$src1, imm:$src2), - (v2i64 (PSRLDQri VR128:$src1, imm:$src2))>; + (PSRLDQri VR128:$src1, (BYTE_imm imm:$src2))>; def : Pat<(v2f64 (X86fsrl VR128:$src1, i32immSExt8:$src2)), - (v2f64 (PSRLDQri VR128:$src1, (BYTE_imm imm:$src2)))>; + (PSRLDQri VR128:$src1, (BYTE_imm imm:$src2))>; // Shift up / down and insert zero's. 
- def : Pat<(v2i64 (X86vshl VR128:$src, (i8 imm:$amt))), - (v2i64 (PSLLDQri VR128:$src, (BYTE_imm imm:$amt)))>; - def : Pat<(v2i64 (X86vshr VR128:$src, (i8 imm:$amt))), - (v2i64 (PSRLDQri VR128:$src, (BYTE_imm imm:$amt)))>; + def : Pat<(v2i64 (X86vshldq VR128:$src, (i8 imm:$amt))), + (PSLLDQri VR128:$src, (BYTE_imm imm:$amt))>; + def : Pat<(v2i64 (X86vshrdq VR128:$src, (i8 imm:$amt))), + (PSRLDQri VR128:$src, (BYTE_imm imm:$amt))>; } //===---------------------------------------------------------------------===// @@ -3666,100 +4086,106 @@ let Predicates = [HasSSE2] in { //===---------------------------------------------------------------------===// let Predicates = [HasAVX] in { - defm VPCMPEQB : PDI_binop_rm_int<0x74, "vpcmpeqb", int_x86_sse2_pcmpeq_b, 1, - 0>, VEX_4V; - defm VPCMPEQW : PDI_binop_rm_int<0x75, "vpcmpeqw", int_x86_sse2_pcmpeq_w, 1, - 0>, VEX_4V; - defm VPCMPEQD : PDI_binop_rm_int<0x76, "vpcmpeqd", int_x86_sse2_pcmpeq_d, 1, - 0>, VEX_4V; - defm VPCMPGTB : PDI_binop_rm_int<0x64, "vpcmpgtb", int_x86_sse2_pcmpgt_b, 0, - 0>, VEX_4V; - defm VPCMPGTW : PDI_binop_rm_int<0x65, "vpcmpgtw", int_x86_sse2_pcmpgt_w, 0, - 0>, VEX_4V; - defm VPCMPGTD : PDI_binop_rm_int<0x66, "vpcmpgtd", int_x86_sse2_pcmpgt_d, 0, - 0>, VEX_4V; - - def : Pat<(v16i8 (X86pcmpeqb VR128:$src1, VR128:$src2)), - (VPCMPEQBrr VR128:$src1, VR128:$src2)>; - def : Pat<(v16i8 (X86pcmpeqb VR128:$src1, (memop addr:$src2))), - (VPCMPEQBrm VR128:$src1, addr:$src2)>; - def : Pat<(v8i16 (X86pcmpeqw VR128:$src1, VR128:$src2)), - (VPCMPEQWrr VR128:$src1, VR128:$src2)>; - def : Pat<(v8i16 (X86pcmpeqw VR128:$src1, (memop addr:$src2))), - (VPCMPEQWrm VR128:$src1, addr:$src2)>; - def : Pat<(v4i32 (X86pcmpeqd VR128:$src1, VR128:$src2)), - (VPCMPEQDrr VR128:$src1, VR128:$src2)>; - def : Pat<(v4i32 (X86pcmpeqd VR128:$src1, (memop addr:$src2))), - (VPCMPEQDrm VR128:$src1, addr:$src2)>; - - def : Pat<(v16i8 (X86pcmpgtb VR128:$src1, VR128:$src2)), - (VPCMPGTBrr VR128:$src1, VR128:$src2)>; - def : Pat<(v16i8 (X86pcmpgtb VR128:$src1, (memop addr:$src2))), - (VPCMPGTBrm VR128:$src1, addr:$src2)>; - def : Pat<(v8i16 (X86pcmpgtw VR128:$src1, VR128:$src2)), - (VPCMPGTWrr VR128:$src1, VR128:$src2)>; - def : Pat<(v8i16 (X86pcmpgtw VR128:$src1, (memop addr:$src2))), - (VPCMPGTWrm VR128:$src1, addr:$src2)>; - def : Pat<(v4i32 (X86pcmpgtd VR128:$src1, VR128:$src2)), - (VPCMPGTDrr VR128:$src1, VR128:$src2)>; - def : Pat<(v4i32 (X86pcmpgtd VR128:$src1, (memop addr:$src2))), - (VPCMPGTDrm VR128:$src1, addr:$src2)>; + defm VPCMPEQB : PDI_binop_rm<0x74, "vpcmpeqb", X86pcmpeq, v16i8, + VR128, memopv2i64, i128mem, + SSE_INTALU_ITINS_P, 1, 0>, VEX_4V; + defm VPCMPEQW : PDI_binop_rm<0x75, "vpcmpeqw", X86pcmpeq, v8i16, + VR128, memopv2i64, i128mem, + SSE_INTALU_ITINS_P, 1, 0>, VEX_4V; + defm VPCMPEQD : PDI_binop_rm<0x76, "vpcmpeqd", X86pcmpeq, v4i32, + VR128, memopv2i64, i128mem, + SSE_INTALU_ITINS_P, 1, 0>, VEX_4V; + defm VPCMPGTB : PDI_binop_rm<0x64, "vpcmpgtb", X86pcmpgt, v16i8, + VR128, memopv2i64, i128mem, + SSE_INTALU_ITINS_P, 0, 0>, VEX_4V; + defm VPCMPGTW : PDI_binop_rm<0x65, "vpcmpgtw", X86pcmpgt, v8i16, + VR128, memopv2i64, i128mem, + SSE_INTALU_ITINS_P, 0, 0>, VEX_4V; + defm VPCMPGTD : PDI_binop_rm<0x66, "vpcmpgtd", X86pcmpgt, v4i32, + VR128, memopv2i64, i128mem, + SSE_INTALU_ITINS_P, 0, 0>, VEX_4V; +} + +let Predicates = [HasAVX2] in { + defm VPCMPEQBY : PDI_binop_rm<0x74, "vpcmpeqb", X86pcmpeq, v32i8, + VR256, memopv4i64, i256mem, + SSE_INTALU_ITINS_P, 1, 0>, VEX_4V; + defm VPCMPEQWY : PDI_binop_rm<0x75, "vpcmpeqw", X86pcmpeq, v16i16, + VR256, 
memopv4i64, i256mem, + SSE_INTALU_ITINS_P, 1, 0>, VEX_4V; + defm VPCMPEQDY : PDI_binop_rm<0x76, "vpcmpeqd", X86pcmpeq, v8i32, + VR256, memopv4i64, i256mem, + SSE_INTALU_ITINS_P, 1, 0>, VEX_4V; + defm VPCMPGTBY : PDI_binop_rm<0x64, "vpcmpgtb", X86pcmpgt, v32i8, + VR256, memopv4i64, i256mem, + SSE_INTALU_ITINS_P, 0, 0>, VEX_4V; + defm VPCMPGTWY : PDI_binop_rm<0x65, "vpcmpgtw", X86pcmpgt, v16i16, + VR256, memopv4i64, i256mem, + SSE_INTALU_ITINS_P, 0, 0>, VEX_4V; + defm VPCMPGTDY : PDI_binop_rm<0x66, "vpcmpgtd", X86pcmpgt, v8i32, + VR256, memopv4i64, i256mem, + SSE_INTALU_ITINS_P, 0, 0>, VEX_4V; } let Constraints = "$src1 = $dst" in { - defm PCMPEQB : PDI_binop_rm_int<0x74, "pcmpeqb", int_x86_sse2_pcmpeq_b, 1>; - defm PCMPEQW : PDI_binop_rm_int<0x75, "pcmpeqw", int_x86_sse2_pcmpeq_w, 1>; - defm PCMPEQD : PDI_binop_rm_int<0x76, "pcmpeqd", int_x86_sse2_pcmpeq_d, 1>; - defm PCMPGTB : PDI_binop_rm_int<0x64, "pcmpgtb", int_x86_sse2_pcmpgt_b>; - defm PCMPGTW : PDI_binop_rm_int<0x65, "pcmpgtw", int_x86_sse2_pcmpgt_w>; - defm PCMPGTD : PDI_binop_rm_int<0x66, "pcmpgtd", int_x86_sse2_pcmpgt_d>; + defm PCMPEQB : PDI_binop_rm<0x74, "pcmpeqb", X86pcmpeq, v16i8, + VR128, memopv2i64, i128mem, + SSE_INTALU_ITINS_P, 1>; + defm PCMPEQW : PDI_binop_rm<0x75, "pcmpeqw", X86pcmpeq, v8i16, + VR128, memopv2i64, i128mem, + SSE_INTALU_ITINS_P, 1>; + defm PCMPEQD : PDI_binop_rm<0x76, "pcmpeqd", X86pcmpeq, v4i32, + VR128, memopv2i64, i128mem, + SSE_INTALU_ITINS_P, 1>; + defm PCMPGTB : PDI_binop_rm<0x64, "pcmpgtb", X86pcmpgt, v16i8, + VR128, memopv2i64, i128mem, + SSE_INTALU_ITINS_P>; + defm PCMPGTW : PDI_binop_rm<0x65, "pcmpgtw", X86pcmpgt, v8i16, + VR128, memopv2i64, i128mem, + SSE_INTALU_ITINS_P>; + defm PCMPGTD : PDI_binop_rm<0x66, "pcmpgtd", X86pcmpgt, v4i32, + VR128, memopv2i64, i128mem, + SSE_INTALU_ITINS_P>; } // Constraints = "$src1 = $dst" -let Predicates = [HasSSE2] in { - def : Pat<(v16i8 (X86pcmpeqb VR128:$src1, VR128:$src2)), - (PCMPEQBrr VR128:$src1, VR128:$src2)>; - def : Pat<(v16i8 (X86pcmpeqb VR128:$src1, (memop addr:$src2))), - (PCMPEQBrm VR128:$src1, addr:$src2)>; - def : Pat<(v8i16 (X86pcmpeqw VR128:$src1, VR128:$src2)), - (PCMPEQWrr VR128:$src1, VR128:$src2)>; - def : Pat<(v8i16 (X86pcmpeqw VR128:$src1, (memop addr:$src2))), - (PCMPEQWrm VR128:$src1, addr:$src2)>; - def : Pat<(v4i32 (X86pcmpeqd VR128:$src1, VR128:$src2)), - (PCMPEQDrr VR128:$src1, VR128:$src2)>; - def : Pat<(v4i32 (X86pcmpeqd VR128:$src1, (memop addr:$src2))), - (PCMPEQDrm VR128:$src1, addr:$src2)>; - - def : Pat<(v16i8 (X86pcmpgtb VR128:$src1, VR128:$src2)), - (PCMPGTBrr VR128:$src1, VR128:$src2)>; - def : Pat<(v16i8 (X86pcmpgtb VR128:$src1, (memop addr:$src2))), - (PCMPGTBrm VR128:$src1, addr:$src2)>; - def : Pat<(v8i16 (X86pcmpgtw VR128:$src1, VR128:$src2)), - (PCMPGTWrr VR128:$src1, VR128:$src2)>; - def : Pat<(v8i16 (X86pcmpgtw VR128:$src1, (memop addr:$src2))), - (PCMPGTWrm VR128:$src1, addr:$src2)>; - def : Pat<(v4i32 (X86pcmpgtd VR128:$src1, VR128:$src2)), - (PCMPGTDrr VR128:$src1, VR128:$src2)>; - def : Pat<(v4i32 (X86pcmpgtd VR128:$src1, (memop addr:$src2))), - (PCMPGTDrm VR128:$src1, addr:$src2)>; -} - //===---------------------------------------------------------------------===// // SSE2 - Packed Integer Pack Instructions //===---------------------------------------------------------------------===// let Predicates = [HasAVX] in { defm VPACKSSWB : PDI_binop_rm_int<0x63, "vpacksswb", int_x86_sse2_packsswb_128, - 0, 0>, VEX_4V; + VR128, memopv2i64, i128mem, + SSE_INTALU_ITINS_P, 0, 0>, VEX_4V; defm VPACKSSDW : 
PDI_binop_rm_int<0x6B, "vpackssdw", int_x86_sse2_packssdw_128, - 0, 0>, VEX_4V; + VR128, memopv2i64, i128mem, + SSE_INTALU_ITINS_P, 0, 0>, VEX_4V; defm VPACKUSWB : PDI_binop_rm_int<0x67, "vpackuswb", int_x86_sse2_packuswb_128, - 0, 0>, VEX_4V; + VR128, memopv2i64, i128mem, + SSE_INTALU_ITINS_P, 0, 0>, VEX_4V; +} + +let Predicates = [HasAVX2] in { +defm VPACKSSWBY : PDI_binop_rm_int<0x63, "vpacksswb", int_x86_avx2_packsswb, + VR256, memopv4i64, i256mem, + SSE_INTALU_ITINS_P, 0, 0>, VEX_4V; +defm VPACKSSDWY : PDI_binop_rm_int<0x6B, "vpackssdw", int_x86_avx2_packssdw, + VR256, memopv4i64, i256mem, + SSE_INTALU_ITINS_P, 0, 0>, VEX_4V; +defm VPACKUSWBY : PDI_binop_rm_int<0x67, "vpackuswb", int_x86_avx2_packuswb, + VR256, memopv4i64, i256mem, + SSE_INTALU_ITINS_P, 0, 0>, VEX_4V; } let Constraints = "$src1 = $dst" in { -defm PACKSSWB : PDI_binop_rm_int<0x63, "packsswb", int_x86_sse2_packsswb_128>; -defm PACKSSDW : PDI_binop_rm_int<0x6B, "packssdw", int_x86_sse2_packssdw_128>; -defm PACKUSWB : PDI_binop_rm_int<0x67, "packuswb", int_x86_sse2_packuswb_128>; +defm PACKSSWB : PDI_binop_rm_int<0x63, "packsswb", int_x86_sse2_packsswb_128, + VR128, memopv2i64, i128mem, + SSE_INTALU_ITINS_P>; +defm PACKSSDW : PDI_binop_rm_int<0x6B, "packssdw", int_x86_sse2_packssdw_128, + VR128, memopv2i64, i128mem, + SSE_INTALU_ITINS_P>; +defm PACKUSWB : PDI_binop_rm_int<0x67, "packuswb", int_x86_sse2_packuswb_128, + VR128, memopv2i64, i128mem, + SSE_INTALU_ITINS_P>; } // Constraints = "$src1 = $dst" //===---------------------------------------------------------------------===// @@ -3767,103 +4193,75 @@ defm PACKUSWB : PDI_binop_rm_int<0x67, "packuswb", int_x86_sse2_packuswb_128>; //===---------------------------------------------------------------------===// let ExeDomain = SSEPackedInt in { -multiclass sse2_pshuffle<string OpcodeStr, ValueType vt, PatFrag pshuf_frag, - PatFrag bc_frag> { +multiclass sse2_pshuffle<string OpcodeStr, ValueType vt, SDNode OpNode> { def ri : Ii8<0x70, MRMSrcReg, - (outs VR128:$dst), (ins VR128:$src1, i8imm:$src2), + (outs VR128:$dst), (ins VR128:$src1, i8imm:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR128:$dst, (vt (OpNode VR128:$src1, (i8 imm:$src2))))], + IIC_SSE_PSHUF>; +def mi : Ii8<0x70, MRMSrcMem, + (outs VR128:$dst), (ins i128mem:$src1, i8imm:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR128:$dst, + (vt (OpNode (bitconvert (memopv2i64 addr:$src1)), + (i8 imm:$src2))))], + IIC_SSE_PSHUF>; +} + +multiclass sse2_pshuffle_y<string OpcodeStr, ValueType vt, SDNode OpNode> { +def Yri : Ii8<0x70, MRMSrcReg, + (outs VR256:$dst), (ins VR256:$src1, i8imm:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set VR128:$dst, (vt (pshuf_frag:$src2 VR128:$src1, - (undef))))]>; -def mi : Ii8<0x70, MRMSrcMem, - (outs VR128:$dst), (ins i128mem:$src1, i8imm:$src2), + [(set VR256:$dst, (vt (OpNode VR256:$src1, (i8 imm:$src2))))]>; +def Ymi : Ii8<0x70, MRMSrcMem, + (outs VR256:$dst), (ins i256mem:$src1, i8imm:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set VR128:$dst, (vt (pshuf_frag:$src2 - (bc_frag (memopv2i64 addr:$src1)), - (undef))))]>; + [(set VR256:$dst, + (vt (OpNode (bitconvert (memopv4i64 addr:$src1)), + (i8 imm:$src2))))]>; } } // ExeDomain = SSEPackedInt let Predicates = [HasAVX] in { - let AddedComplexity = 5 in - defm VPSHUFD : sse2_pshuffle<"vpshufd", v4i32, pshufd, bc_v4i32>, TB, OpSize, - VEX; - - // SSE2 with ImmT == Imm8 and XS prefix. 
- defm VPSHUFHW : sse2_pshuffle<"vpshufhw", v8i16, pshufhw, bc_v8i16>, XS, - VEX; - - // SSE2 with ImmT == Imm8 and XD prefix. - defm VPSHUFLW : sse2_pshuffle<"vpshuflw", v8i16, pshuflw, bc_v8i16>, XD, - VEX; - - let AddedComplexity = 5 in - def : Pat<(v4f32 (pshufd:$src2 VR128:$src1, (undef))), - (VPSHUFDri VR128:$src1, (SHUFFLE_get_shuf_imm VR128:$src2))>; - // Unary v4f32 shuffle with VPSHUF* in order to fold a load. - def : Pat<(pshufd:$src2 (bc_v4i32 (memopv4f32 addr:$src1)), (undef)), - (VPSHUFDmi addr:$src1, (SHUFFLE_get_shuf_imm VR128:$src2))>; - - def : Pat<(v4i32 (X86PShufd (bc_v4i32 (memopv2i64 addr:$src1)), - (i8 imm:$imm))), - (VPSHUFDmi addr:$src1, imm:$imm)>; - def : Pat<(v4i32 (X86PShufd (bc_v4i32 (memopv4f32 addr:$src1)), - (i8 imm:$imm))), - (VPSHUFDmi addr:$src1, imm:$imm)>; - def : Pat<(v4f32 (X86PShufd VR128:$src1, (i8 imm:$imm))), - (VPSHUFDri VR128:$src1, imm:$imm)>; - def : Pat<(v4i32 (X86PShufd VR128:$src1, (i8 imm:$imm))), - (VPSHUFDri VR128:$src1, imm:$imm)>; - def : Pat<(v8i16 (X86PShufhw VR128:$src, (i8 imm:$imm))), - (VPSHUFHWri VR128:$src, imm:$imm)>; - def : Pat<(v8i16 (X86PShufhw (bc_v8i16 (memopv2i64 addr:$src)), - (i8 imm:$imm))), - (VPSHUFHWmi addr:$src, imm:$imm)>; - def : Pat<(v8i16 (X86PShuflw VR128:$src, (i8 imm:$imm))), - (VPSHUFLWri VR128:$src, imm:$imm)>; - def : Pat<(v8i16 (X86PShuflw (bc_v8i16 (memopv2i64 addr:$src)), - (i8 imm:$imm))), - (VPSHUFLWmi addr:$src, imm:$imm)>; + let AddedComplexity = 5 in + defm VPSHUFD : sse2_pshuffle<"vpshufd", v4i32, X86PShufd>, TB, OpSize, VEX; + + // SSE2 with ImmT == Imm8 and XS prefix. + defm VPSHUFHW : sse2_pshuffle<"vpshufhw", v8i16, X86PShufhw>, XS, VEX; + + // SSE2 with ImmT == Imm8 and XD prefix. + defm VPSHUFLW : sse2_pshuffle<"vpshuflw", v8i16, X86PShuflw>, XD, VEX; + + def : Pat<(v4f32 (X86PShufd (memopv4f32 addr:$src1), (i8 imm:$imm))), + (VPSHUFDmi addr:$src1, imm:$imm)>; + def : Pat<(v4f32 (X86PShufd VR128:$src1, (i8 imm:$imm))), + (VPSHUFDri VR128:$src1, imm:$imm)>; +} + +let Predicates = [HasAVX2] in { + defm VPSHUFD : sse2_pshuffle_y<"vpshufd", v8i32, X86PShufd>, TB, OpSize, VEX; + defm VPSHUFHW : sse2_pshuffle_y<"vpshufhw", v16i16, X86PShufhw>, XS, VEX; + defm VPSHUFLW : sse2_pshuffle_y<"vpshuflw", v16i16, X86PShuflw>, XD, VEX; } let Predicates = [HasSSE2] in { - let AddedComplexity = 5 in - defm PSHUFD : sse2_pshuffle<"pshufd", v4i32, pshufd, bc_v4i32>, TB, OpSize; - - // SSE2 with ImmT == Imm8 and XS prefix. - defm PSHUFHW : sse2_pshuffle<"pshufhw", v8i16, pshufhw, bc_v8i16>, XS; - - // SSE2 with ImmT == Imm8 and XD prefix. - defm PSHUFLW : sse2_pshuffle<"pshuflw", v8i16, pshuflw, bc_v8i16>, XD; - - let AddedComplexity = 5 in - def : Pat<(v4f32 (pshufd:$src2 VR128:$src1, (undef))), - (PSHUFDri VR128:$src1, (SHUFFLE_get_shuf_imm VR128:$src2))>; - // Unary v4f32 shuffle with PSHUF* in order to fold a load. 
- def : Pat<(pshufd:$src2 (bc_v4i32 (memopv4f32 addr:$src1)), (undef)), - (PSHUFDmi addr:$src1, (SHUFFLE_get_shuf_imm VR128:$src2))>; - - def : Pat<(v4i32 (X86PShufd (bc_v4i32 (memopv2i64 addr:$src1)), - (i8 imm:$imm))), - (PSHUFDmi addr:$src1, imm:$imm)>; - def : Pat<(v4i32 (X86PShufd (bc_v4i32 (memopv4f32 addr:$src1)), - (i8 imm:$imm))), - (PSHUFDmi addr:$src1, imm:$imm)>; - def : Pat<(v4f32 (X86PShufd VR128:$src1, (i8 imm:$imm))), - (PSHUFDri VR128:$src1, imm:$imm)>; - def : Pat<(v4i32 (X86PShufd VR128:$src1, (i8 imm:$imm))), - (PSHUFDri VR128:$src1, imm:$imm)>; - def : Pat<(v8i16 (X86PShufhw VR128:$src, (i8 imm:$imm))), - (PSHUFHWri VR128:$src, imm:$imm)>; - def : Pat<(v8i16 (X86PShufhw (bc_v8i16 (memopv2i64 addr:$src)), - (i8 imm:$imm))), - (PSHUFHWmi addr:$src, imm:$imm)>; - def : Pat<(v8i16 (X86PShuflw VR128:$src, (i8 imm:$imm))), - (PSHUFLWri VR128:$src, imm:$imm)>; - def : Pat<(v8i16 (X86PShuflw (bc_v8i16 (memopv2i64 addr:$src)), - (i8 imm:$imm))), - (PSHUFLWmi addr:$src, imm:$imm)>; + let AddedComplexity = 5 in + defm PSHUFD : sse2_pshuffle<"pshufd", v4i32, X86PShufd>, TB, OpSize; + + // SSE2 with ImmT == Imm8 and XS prefix. + defm PSHUFHW : sse2_pshuffle<"pshufhw", v8i16, X86PShufhw>, XS; + + // SSE2 with ImmT == Imm8 and XD prefix. + defm PSHUFLW : sse2_pshuffle<"pshuflw", v8i16, X86PShuflw>, XD; + + def : Pat<(v4f32 (X86PShufd (memopv4f32 addr:$src1), (i8 imm:$imm))), + (PSHUFDmi addr:$src1, imm:$imm)>; + def : Pat<(v4f32 (X86PShufd VR128:$src1, (i8 imm:$imm))), + (PSHUFDri VR128:$src1, imm:$imm)>; } //===---------------------------------------------------------------------===// @@ -3878,7 +4276,8 @@ multiclass sse2_unpack<bits<8> opc, string OpcodeStr, ValueType vt, !if(Is2Addr, !strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"), !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")), - [(set VR128:$dst, (vt (OpNode VR128:$src1, VR128:$src2)))]>; + [(set VR128:$dst, (vt (OpNode VR128:$src1, VR128:$src2)))], + IIC_SSE_UNPCK>; def rm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2), !if(Is2Addr, @@ -3886,96 +4285,104 @@ multiclass sse2_unpack<bits<8> opc, string OpcodeStr, ValueType vt, !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")), [(set VR128:$dst, (OpNode VR128:$src1, (bc_frag (memopv2i64 - addr:$src2))))]>; + addr:$src2))))], + IIC_SSE_UNPCK>; +} + +multiclass sse2_unpack_y<bits<8> opc, string OpcodeStr, ValueType vt, + SDNode OpNode, PatFrag bc_frag> { + def Yrr : PDI<opc, MRMSrcReg, + (outs VR256:$dst), (ins VR256:$src1, VR256:$src2), + !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR256:$dst, (vt (OpNode VR256:$src1, VR256:$src2)))]>; + def Yrm : PDI<opc, MRMSrcMem, + (outs VR256:$dst), (ins VR256:$src1, i256mem:$src2), + !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR256:$dst, (OpNode VR256:$src1, + (bc_frag (memopv4i64 addr:$src2))))]>; } let Predicates = [HasAVX] in { - defm VPUNPCKLBW : sse2_unpack<0x60, "vpunpcklbw", v16i8, X86Punpcklbw, + defm VPUNPCKLBW : sse2_unpack<0x60, "vpunpcklbw", v16i8, X86Unpckl, bc_v16i8, 0>, VEX_4V; - defm VPUNPCKLWD : sse2_unpack<0x61, "vpunpcklwd", v8i16, X86Punpcklwd, + defm VPUNPCKLWD : sse2_unpack<0x61, "vpunpcklwd", v8i16, X86Unpckl, bc_v8i16, 0>, VEX_4V; - defm VPUNPCKLDQ : sse2_unpack<0x62, "vpunpckldq", v4i32, X86Punpckldq, + defm VPUNPCKLDQ : sse2_unpack<0x62, "vpunpckldq", v4i32, X86Unpckl, bc_v4i32, 0>, VEX_4V; + defm VPUNPCKLQDQ : sse2_unpack<0x6C, "vpunpcklqdq", v2i64, X86Unpckl, + bc_v2i64, 0>, VEX_4V; - 
/// FIXME: we could eliminate this and use sse2_unpack instead if tblgen - /// knew to collapse (bitconvert VT to VT) into its operand. - def VPUNPCKLQDQrr : PDI<0x6C, MRMSrcReg, - (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), - "vpunpcklqdq\t{$src2, $src1, $dst|$dst, $src1, $src2}", - [(set VR128:$dst, (v2i64 (X86Punpcklqdq VR128:$src1, - VR128:$src2)))]>, VEX_4V; - def VPUNPCKLQDQrm : PDI<0x6C, MRMSrcMem, - (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2), - "vpunpcklqdq\t{$src2, $src1, $dst|$dst, $src1, $src2}", - [(set VR128:$dst, (v2i64 (X86Punpcklqdq VR128:$src1, - (memopv2i64 addr:$src2))))]>, VEX_4V; - - defm VPUNPCKHBW : sse2_unpack<0x68, "vpunpckhbw", v16i8, X86Punpckhbw, + defm VPUNPCKHBW : sse2_unpack<0x68, "vpunpckhbw", v16i8, X86Unpckh, bc_v16i8, 0>, VEX_4V; - defm VPUNPCKHWD : sse2_unpack<0x69, "vpunpckhwd", v8i16, X86Punpckhwd, + defm VPUNPCKHWD : sse2_unpack<0x69, "vpunpckhwd", v8i16, X86Unpckh, bc_v8i16, 0>, VEX_4V; - defm VPUNPCKHDQ : sse2_unpack<0x6A, "vpunpckhdq", v4i32, X86Punpckhdq, + defm VPUNPCKHDQ : sse2_unpack<0x6A, "vpunpckhdq", v4i32, X86Unpckh, bc_v4i32, 0>, VEX_4V; - - /// FIXME: we could eliminate this and use sse2_unpack instead if tblgen - /// knew to collapse (bitconvert VT to VT) into its operand. - def VPUNPCKHQDQrr : PDI<0x6D, MRMSrcReg, - (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), - "vpunpckhqdq\t{$src2, $src1, $dst|$dst, $src1, $src2}", - [(set VR128:$dst, (v2i64 (X86Punpckhqdq VR128:$src1, - VR128:$src2)))]>, VEX_4V; - def VPUNPCKHQDQrm : PDI<0x6D, MRMSrcMem, - (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2), - "vpunpckhqdq\t{$src2, $src1, $dst|$dst, $src1, $src2}", - [(set VR128:$dst, (v2i64 (X86Punpckhqdq VR128:$src1, - (memopv2i64 addr:$src2))))]>, VEX_4V; + defm VPUNPCKHQDQ : sse2_unpack<0x6D, "vpunpckhqdq", v2i64, X86Unpckh, + bc_v2i64, 0>, VEX_4V; +} + +let Predicates = [HasAVX2] in { + defm VPUNPCKLBW : sse2_unpack_y<0x60, "vpunpcklbw", v32i8, X86Unpckl, + bc_v32i8>, VEX_4V; + defm VPUNPCKLWD : sse2_unpack_y<0x61, "vpunpcklwd", v16i16, X86Unpckl, + bc_v16i16>, VEX_4V; + defm VPUNPCKLDQ : sse2_unpack_y<0x62, "vpunpckldq", v8i32, X86Unpckl, + bc_v8i32>, VEX_4V; + defm VPUNPCKLQDQ : sse2_unpack_y<0x6C, "vpunpcklqdq", v4i64, X86Unpckl, + bc_v4i64>, VEX_4V; + + defm VPUNPCKHBW : sse2_unpack_y<0x68, "vpunpckhbw", v32i8, X86Unpckh, + bc_v32i8>, VEX_4V; + defm VPUNPCKHWD : sse2_unpack_y<0x69, "vpunpckhwd", v16i16, X86Unpckh, + bc_v16i16>, VEX_4V; + defm VPUNPCKHDQ : sse2_unpack_y<0x6A, "vpunpckhdq", v8i32, X86Unpckh, + bc_v8i32>, VEX_4V; + defm VPUNPCKHQDQ : sse2_unpack_y<0x6D, "vpunpckhqdq", v4i64, X86Unpckh, + bc_v4i64>, VEX_4V; } let Constraints = "$src1 = $dst" in { - defm PUNPCKLBW : sse2_unpack<0x60, "punpcklbw", v16i8, X86Punpcklbw, bc_v16i8>; - defm PUNPCKLWD : sse2_unpack<0x61, "punpcklwd", v8i16, X86Punpcklwd, bc_v8i16>; - defm PUNPCKLDQ : sse2_unpack<0x62, "punpckldq", v4i32, X86Punpckldq, bc_v4i32>; - - /// FIXME: we could eliminate this and use sse2_unpack instead if tblgen - /// knew to collapse (bitconvert VT to VT) into its operand. 
- def PUNPCKLQDQrr : PDI<0x6C, MRMSrcReg, - (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), - "punpcklqdq\t{$src2, $dst|$dst, $src2}", - [(set VR128:$dst, - (v2i64 (X86Punpcklqdq VR128:$src1, VR128:$src2)))]>; - def PUNPCKLQDQrm : PDI<0x6C, MRMSrcMem, - (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2), - "punpcklqdq\t{$src2, $dst|$dst, $src2}", - [(set VR128:$dst, - (v2i64 (X86Punpcklqdq VR128:$src1, - (memopv2i64 addr:$src2))))]>; - - defm PUNPCKHBW : sse2_unpack<0x68, "punpckhbw", v16i8, X86Punpckhbw, bc_v16i8>; - defm PUNPCKHWD : sse2_unpack<0x69, "punpckhwd", v8i16, X86Punpckhwd, bc_v8i16>; - defm PUNPCKHDQ : sse2_unpack<0x6A, "punpckhdq", v4i32, X86Punpckhdq, bc_v4i32>; - - /// FIXME: we could eliminate this and use sse2_unpack instead if tblgen - /// knew to collapse (bitconvert VT to VT) into its operand. - def PUNPCKHQDQrr : PDI<0x6D, MRMSrcReg, - (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), - "punpckhqdq\t{$src2, $dst|$dst, $src2}", - [(set VR128:$dst, - (v2i64 (X86Punpckhqdq VR128:$src1, VR128:$src2)))]>; - def PUNPCKHQDQrm : PDI<0x6D, MRMSrcMem, - (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2), - "punpckhqdq\t{$src2, $dst|$dst, $src2}", - [(set VR128:$dst, - (v2i64 (X86Punpckhqdq VR128:$src1, - (memopv2i64 addr:$src2))))]>; + defm PUNPCKLBW : sse2_unpack<0x60, "punpcklbw", v16i8, X86Unpckl, + bc_v16i8>; + defm PUNPCKLWD : sse2_unpack<0x61, "punpcklwd", v8i16, X86Unpckl, + bc_v8i16>; + defm PUNPCKLDQ : sse2_unpack<0x62, "punpckldq", v4i32, X86Unpckl, + bc_v4i32>; + defm PUNPCKLQDQ : sse2_unpack<0x6C, "punpcklqdq", v2i64, X86Unpckl, + bc_v2i64>; + + defm PUNPCKHBW : sse2_unpack<0x68, "punpckhbw", v16i8, X86Unpckh, + bc_v16i8>; + defm PUNPCKHWD : sse2_unpack<0x69, "punpckhwd", v8i16, X86Unpckh, + bc_v8i16>; + defm PUNPCKHDQ : sse2_unpack<0x6A, "punpckhdq", v4i32, X86Unpckh, + bc_v4i32>; + defm PUNPCKHQDQ : sse2_unpack<0x6D, "punpckhqdq", v2i64, X86Unpckh, + bc_v2i64>; } } // ExeDomain = SSEPackedInt -// Splat v2f64 / v2i64 -let AddedComplexity = 10 in { - def : Pat<(splat_lo (v2i64 VR128:$src), (undef)), - (PUNPCKLQDQrr VR128:$src, VR128:$src)>, Requires<[HasSSE2]>; - def : Pat<(splat_lo (v2i64 VR128:$src), (undef)), - (VPUNPCKLQDQrr VR128:$src, VR128:$src)>, Requires<[HasAVX]>; +// Patterns for using AVX1 instructions with integer vectors +// Here to give AVX2 priority +let Predicates = [HasAVX] in { + def : Pat<(v8i32 (X86Unpckl VR256:$src1, (bc_v8i32 (memopv4i64 addr:$src2)))), + (VUNPCKLPSYrm VR256:$src1, addr:$src2)>; + def : Pat<(v8i32 (X86Unpckl VR256:$src1, VR256:$src2)), + (VUNPCKLPSYrr VR256:$src1, VR256:$src2)>; + def : Pat<(v8i32 (X86Unpckh VR256:$src1, (bc_v8i32 (memopv4i64 addr:$src2)))), + (VUNPCKHPSYrm VR256:$src1, addr:$src2)>; + def : Pat<(v8i32 (X86Unpckh VR256:$src1, VR256:$src2)), + (VUNPCKHPSYrr VR256:$src1, VR256:$src2)>; + + def : Pat<(v4i64 (X86Unpckl VR256:$src1, (memopv4i64 addr:$src2))), + (VUNPCKLPDYrm VR256:$src1, addr:$src2)>; + def : Pat<(v4i64 (X86Unpckl VR256:$src1, VR256:$src2)), + (VUNPCKLPDYrr VR256:$src1, VR256:$src2)>; + def : Pat<(v4i64 (X86Unpckh VR256:$src1, (memopv4i64 addr:$src2))), + (VUNPCKHPDYrm VR256:$src1, addr:$src2)>; + def : Pat<(v4i64 (X86Unpckh VR256:$src1, VR256:$src2)), + (VUNPCKHPDYrr VR256:$src1, VR256:$src2)>; } //===---------------------------------------------------------------------===// @@ -3991,7 +4398,7 @@ multiclass sse2_pinsrw<bit Is2Addr = 1> { "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}", "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set VR128:$dst, - (X86pinsrw 
VR128:$src1, GR32:$src2, imm:$src3))]>; + (X86pinsrw VR128:$src1, GR32:$src2, imm:$src3))], IIC_SSE_PINSRW>; def rmi : Ii8<0xC4, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, i16mem:$src2, i32i8imm:$src3), @@ -4000,7 +4407,7 @@ multiclass sse2_pinsrw<bit Is2Addr = 1> { "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set VR128:$dst, (X86pinsrw VR128:$src1, (extloadi16 addr:$src2), - imm:$src3))]>; + imm:$src3))], IIC_SSE_PINSRW>; } // Extract @@ -4014,7 +4421,7 @@ def PEXTRWri : PDIi8<0xC5, MRMSrcReg, (outs GR32:$dst), (ins VR128:$src1, i32i8imm:$src2), "pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set GR32:$dst, (X86pextrw (v8i16 VR128:$src1), - imm:$src2))]>; + imm:$src2))], IIC_SSE_PEXTRW>; // Insert let Predicates = [HasAVX] in { @@ -4038,12 +4445,23 @@ let ExeDomain = SSEPackedInt in { def VPMOVMSKBrr : VPDI<0xD7, MRMSrcReg, (outs GR32:$dst), (ins VR128:$src), "pmovmskb\t{$src, $dst|$dst, $src}", - [(set GR32:$dst, (int_x86_sse2_pmovmskb_128 VR128:$src))]>, VEX; + [(set GR32:$dst, (int_x86_sse2_pmovmskb_128 VR128:$src))], + IIC_SSE_MOVMSK>, VEX; def VPMOVMSKBr64r : VPDI<0xD7, MRMSrcReg, (outs GR64:$dst), (ins VR128:$src), + "pmovmskb\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVMSK>, VEX; + +let Predicates = [HasAVX2] in { +def VPMOVMSKBYrr : VPDI<0xD7, MRMSrcReg, (outs GR32:$dst), (ins VR256:$src), + "pmovmskb\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (int_x86_avx2_pmovmskb VR256:$src))]>, VEX; +def VPMOVMSKBYr64r : VPDI<0xD7, MRMSrcReg, (outs GR64:$dst), (ins VR256:$src), "pmovmskb\t{$src, $dst|$dst, $src}", []>, VEX; +} + def PMOVMSKBrr : PDI<0xD7, MRMSrcReg, (outs GR32:$dst), (ins VR128:$src), "pmovmskb\t{$src, $dst|$dst, $src}", - [(set GR32:$dst, (int_x86_sse2_pmovmskb_128 VR128:$src))]>; + [(set GR32:$dst, (int_x86_sse2_pmovmskb_128 VR128:$src))], + IIC_SSE_MOVMSK>; } // ExeDomain = SSEPackedInt @@ -4057,21 +4475,25 @@ let Uses = [EDI] in def VMASKMOVDQU : VPDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask), "maskmovdqu\t{$mask, $src|$src, $mask}", - [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)]>, VEX; + [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)], + IIC_SSE_MASKMOV>, VEX; let Uses = [RDI] in def VMASKMOVDQU64 : VPDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask), "maskmovdqu\t{$mask, $src|$src, $mask}", - [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)]>, VEX; + [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)], + IIC_SSE_MASKMOV>, VEX; let Uses = [EDI] in def MASKMOVDQU : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask), "maskmovdqu\t{$mask, $src|$src, $mask}", - [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)]>; + [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)], + IIC_SSE_MASKMOV>; let Uses = [RDI] in def MASKMOVDQU64 : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask), "maskmovdqu\t{$mask, $src|$src, $mask}", - [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)]>; + [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)], + IIC_SSE_MASKMOV>; } // ExeDomain = SSEPackedInt @@ -4085,54 +4507,65 @@ def MASKMOVDQU64 : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask), def VMOVDI2PDIrr : VPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src), "movd\t{$src, $dst|$dst, $src}", [(set VR128:$dst, - (v4i32 (scalar_to_vector GR32:$src)))]>, VEX; + (v4i32 (scalar_to_vector GR32:$src)))], IIC_SSE_MOVDQ>, + VEX; def VMOVDI2PDIrm : VPDI<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src), "movd\t{$src, $dst|$dst, $src}", [(set VR128:$dst, - 
(v4i32 (scalar_to_vector (loadi32 addr:$src))))]>, + (v4i32 (scalar_to_vector (loadi32 addr:$src))))], + IIC_SSE_MOVDQ>, VEX; def VMOV64toPQIrr : VRPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src), "mov{d|q}\t{$src, $dst|$dst, $src}", [(set VR128:$dst, - (v2i64 (scalar_to_vector GR64:$src)))]>, VEX; + (v2i64 (scalar_to_vector GR64:$src)))], + IIC_SSE_MOVDQ>, VEX; def VMOV64toSDrr : VRPDI<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src), "mov{d|q}\t{$src, $dst|$dst, $src}", - [(set FR64:$dst, (bitconvert GR64:$src))]>, VEX; + [(set FR64:$dst, (bitconvert GR64:$src))], + IIC_SSE_MOVDQ>, VEX; def MOVDI2PDIrr : PDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src), "movd\t{$src, $dst|$dst, $src}", [(set VR128:$dst, - (v4i32 (scalar_to_vector GR32:$src)))]>; + (v4i32 (scalar_to_vector GR32:$src)))], IIC_SSE_MOVDQ>; def MOVDI2PDIrm : PDI<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src), "movd\t{$src, $dst|$dst, $src}", [(set VR128:$dst, - (v4i32 (scalar_to_vector (loadi32 addr:$src))))]>; + (v4i32 (scalar_to_vector (loadi32 addr:$src))))], + IIC_SSE_MOVDQ>; def MOV64toPQIrr : RPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src), "mov{d|q}\t{$src, $dst|$dst, $src}", [(set VR128:$dst, - (v2i64 (scalar_to_vector GR64:$src)))]>; + (v2i64 (scalar_to_vector GR64:$src)))], + IIC_SSE_MOVDQ>; def MOV64toSDrr : RPDI<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src), "mov{d|q}\t{$src, $dst|$dst, $src}", - [(set FR64:$dst, (bitconvert GR64:$src))]>; + [(set FR64:$dst, (bitconvert GR64:$src))], + IIC_SSE_MOVDQ>; //===---------------------------------------------------------------------===// // Move Int Doubleword to Single Scalar // def VMOVDI2SSrr : VPDI<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src), "movd\t{$src, $dst|$dst, $src}", - [(set FR32:$dst, (bitconvert GR32:$src))]>, VEX; + [(set FR32:$dst, (bitconvert GR32:$src))], + IIC_SSE_MOVDQ>, VEX; def VMOVDI2SSrm : VPDI<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src), "movd\t{$src, $dst|$dst, $src}", - [(set FR32:$dst, (bitconvert (loadi32 addr:$src)))]>, + [(set FR32:$dst, (bitconvert (loadi32 addr:$src)))], + IIC_SSE_MOVDQ>, VEX; def MOVDI2SSrr : PDI<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src), "movd\t{$src, $dst|$dst, $src}", - [(set FR32:$dst, (bitconvert GR32:$src))]>; + [(set FR32:$dst, (bitconvert GR32:$src))], + IIC_SSE_MOVDQ>; def MOVDI2SSrm : PDI<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src), "movd\t{$src, $dst|$dst, $src}", - [(set FR32:$dst, (bitconvert (loadi32 addr:$src)))]>; + [(set FR32:$dst, (bitconvert (loadi32 addr:$src)))], + IIC_SSE_MOVDQ>; //===---------------------------------------------------------------------===// // Move Packed Doubleword Int to Packed Double Int @@ -4140,20 +4573,22 @@ def MOVDI2SSrm : PDI<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src), def VMOVPDI2DIrr : VPDI<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src), "movd\t{$src, $dst|$dst, $src}", [(set GR32:$dst, (vector_extract (v4i32 VR128:$src), - (iPTR 0)))]>, VEX; + (iPTR 0)))], IIC_SSE_MOVD_ToGP>, VEX; def VMOVPDI2DImr : VPDI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR128:$src), "movd\t{$src, $dst|$dst, $src}", [(store (i32 (vector_extract (v4i32 VR128:$src), - (iPTR 0))), addr:$dst)]>, VEX; + (iPTR 0))), addr:$dst)], IIC_SSE_MOVDQ>, + VEX; def MOVPDI2DIrr : PDI<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src), "movd\t{$src, $dst|$dst, $src}", [(set GR32:$dst, (vector_extract (v4i32 VR128:$src), - (iPTR 0)))]>; + (iPTR 0)))], IIC_SSE_MOVD_ToGP>; def MOVPDI2DImr : PDI<0x7E, MRMDestMem, (outs), (ins 
i32mem:$dst, VR128:$src), "movd\t{$src, $dst|$dst, $src}", [(store (i32 (vector_extract (v4i32 VR128:$src), - (iPTR 0))), addr:$dst)]>; + (iPTR 0))), addr:$dst)], + IIC_SSE_MOVDQ>; //===---------------------------------------------------------------------===// // Move Packed Doubleword Int first element to Doubleword Int @@ -4161,13 +4596,15 @@ def MOVPDI2DImr : PDI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR128:$src), def VMOVPQIto64rr : I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src), "mov{d|q}\t{$src, $dst|$dst, $src}", [(set GR64:$dst, (vector_extract (v2i64 VR128:$src), - (iPTR 0)))]>, + (iPTR 0)))], + IIC_SSE_MOVD_ToGP>, TB, OpSize, VEX, VEX_W, Requires<[HasAVX, In64BitMode]>; def MOVPQIto64rr : RPDI<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src), "mov{d|q}\t{$src, $dst|$dst, $src}", [(set GR64:$dst, (vector_extract (v2i64 VR128:$src), - (iPTR 0)))]>; + (iPTR 0)))], + IIC_SSE_MOVD_ToGP>; //===---------------------------------------------------------------------===// // Bitcast FR64 <-> GR64 @@ -4179,36 +4616,45 @@ def VMOV64toSDrm : S3SI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src), VEX; def VMOVSDto64rr : VRPDI<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src), "mov{d|q}\t{$src, $dst|$dst, $src}", - [(set GR64:$dst, (bitconvert FR64:$src))]>; + [(set GR64:$dst, (bitconvert FR64:$src))], + IIC_SSE_MOVDQ>, VEX; def VMOVSDto64mr : VRPDI<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src), "movq\t{$src, $dst|$dst, $src}", - [(store (i64 (bitconvert FR64:$src)), addr:$dst)]>; + [(store (i64 (bitconvert FR64:$src)), addr:$dst)], + IIC_SSE_MOVDQ>, VEX; def MOV64toSDrm : S3SI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src), "movq\t{$src, $dst|$dst, $src}", - [(set FR64:$dst, (bitconvert (loadi64 addr:$src)))]>; + [(set FR64:$dst, (bitconvert (loadi64 addr:$src)))], + IIC_SSE_MOVDQ>; def MOVSDto64rr : RPDI<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src), "mov{d|q}\t{$src, $dst|$dst, $src}", - [(set GR64:$dst, (bitconvert FR64:$src))]>; + [(set GR64:$dst, (bitconvert FR64:$src))], + IIC_SSE_MOVD_ToGP>; def MOVSDto64mr : RPDI<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src), "movq\t{$src, $dst|$dst, $src}", - [(store (i64 (bitconvert FR64:$src)), addr:$dst)]>; + [(store (i64 (bitconvert FR64:$src)), addr:$dst)], + IIC_SSE_MOVDQ>; //===---------------------------------------------------------------------===// // Move Scalar Single to Double Int // def VMOVSS2DIrr : VPDI<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src), "movd\t{$src, $dst|$dst, $src}", - [(set GR32:$dst, (bitconvert FR32:$src))]>, VEX; + [(set GR32:$dst, (bitconvert FR32:$src))], + IIC_SSE_MOVD_ToGP>, VEX; def VMOVSS2DImr : VPDI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32:$src), "movd\t{$src, $dst|$dst, $src}", - [(store (i32 (bitconvert FR32:$src)), addr:$dst)]>, VEX; + [(store (i32 (bitconvert FR32:$src)), addr:$dst)], + IIC_SSE_MOVDQ>, VEX; def MOVSS2DIrr : PDI<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src), "movd\t{$src, $dst|$dst, $src}", - [(set GR32:$dst, (bitconvert FR32:$src))]>; + [(set GR32:$dst, (bitconvert FR32:$src))], + IIC_SSE_MOVD_ToGP>; def MOVSS2DImr : PDI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32:$src), "movd\t{$src, $dst|$dst, $src}", - [(store (i32 (bitconvert FR32:$src)), addr:$dst)]>; + [(store (i32 (bitconvert FR32:$src)), addr:$dst)], + IIC_SSE_MOVDQ>; //===---------------------------------------------------------------------===// // Patterns and instructions to describe movd/movq to XMM register zero-extends @@ -4217,23 +4663,26 @@ let 
AddedComplexity = 15 in { def VMOVZDI2PDIrr : VPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src), "movd\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (v4i32 (X86vzmovl - (v4i32 (scalar_to_vector GR32:$src)))))]>, - VEX; + (v4i32 (scalar_to_vector GR32:$src)))))], + IIC_SSE_MOVDQ>, VEX; def VMOVZQI2PQIrr : VPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src), "mov{d|q}\t{$src, $dst|$dst, $src}", // X86-64 only [(set VR128:$dst, (v2i64 (X86vzmovl - (v2i64 (scalar_to_vector GR64:$src)))))]>, + (v2i64 (scalar_to_vector GR64:$src)))))], + IIC_SSE_MOVDQ>, VEX, VEX_W; } let AddedComplexity = 15 in { def MOVZDI2PDIrr : PDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src), "movd\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (v4i32 (X86vzmovl - (v4i32 (scalar_to_vector GR32:$src)))))]>; + (v4i32 (scalar_to_vector GR32:$src)))))], + IIC_SSE_MOVDQ>; def MOVZQI2PQIrr : RPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src), "mov{d|q}\t{$src, $dst|$dst, $src}", // X86-64 only [(set VR128:$dst, (v2i64 (X86vzmovl - (v2i64 (scalar_to_vector GR64:$src)))))]>; + (v2i64 (scalar_to_vector GR64:$src)))))], + IIC_SSE_MOVDQ>; } let AddedComplexity = 20 in { @@ -4241,29 +4690,19 @@ def VMOVZDI2PDIrm : VPDI<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src), "movd\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (v4i32 (X86vzmovl (v4i32 (scalar_to_vector - (loadi32 addr:$src))))))]>, - VEX; + (loadi32 addr:$src))))))], + IIC_SSE_MOVDQ>, VEX; def MOVZDI2PDIrm : PDI<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src), "movd\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (v4i32 (X86vzmovl (v4i32 (scalar_to_vector - (loadi32 addr:$src))))))]>; -} - -let Predicates = [HasSSE2], AddedComplexity = 20 in { - def : Pat<(v4i32 (X86vzmovl (loadv4i32 addr:$src))), - (MOVZDI2PDIrm addr:$src)>; - def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv4f32 addr:$src)))), - (MOVZDI2PDIrm addr:$src)>; - def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))), - (MOVZDI2PDIrm addr:$src)>; + (loadi32 addr:$src))))))], + IIC_SSE_MOVDQ>; } let Predicates = [HasAVX] in { // AVX 128-bit movd/movq instruction write zeros in the high 128-bit part. let AddedComplexity = 20 in { - def : Pat<(v4i32 (X86vzmovl (loadv4i32 addr:$src))), - (VMOVZDI2PDIrm addr:$src)>; def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv4f32 addr:$src)))), (VMOVZDI2PDIrm addr:$src)>; def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))), @@ -4278,6 +4717,13 @@ let Predicates = [HasAVX] in { (SUBREG_TO_REG (i64 0), (VMOVZQI2PQIrr GR64:$src), sub_xmm)>; } +let Predicates = [HasSSE2], AddedComplexity = 20 in { + def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv4f32 addr:$src)))), + (MOVZDI2PDIrm addr:$src)>; + def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))), + (MOVZDI2PDIrm addr:$src)>; +} + // These are the correct encodings of the instructions so that we know how to // read correct assembly, even though we continue to emit the wrong ones for // compatibility with Darwin's buggy assembler. 
@@ -4309,7 +4755,8 @@ def VMOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), def MOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), "movq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, - (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>, XS, + (v2i64 (scalar_to_vector (loadi64 addr:$src))))], + IIC_SSE_MOVDQ>, XS, Requires<[HasSSE2]>; // SSE2 instruction with XS Prefix //===---------------------------------------------------------------------===// @@ -4318,11 +4765,13 @@ def MOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), def VMOVPQI2QImr : VPDI<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src), "movq\t{$src, $dst|$dst, $src}", [(store (i64 (vector_extract (v2i64 VR128:$src), - (iPTR 0))), addr:$dst)]>, VEX; + (iPTR 0))), addr:$dst)], + IIC_SSE_MOVDQ>, VEX; def MOVPQI2QImr : PDI<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src), "movq\t{$src, $dst|$dst, $src}", [(store (i64 (vector_extract (v2i64 VR128:$src), - (iPTR 0))), addr:$dst)]>; + (iPTR 0))), addr:$dst)], + IIC_SSE_MOVDQ>; //===---------------------------------------------------------------------===// // Store / copy lower 64-bits of a XMM register. @@ -4332,14 +4781,16 @@ def VMOVLQ128mr : VPDI<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src), [(int_x86_sse2_storel_dq addr:$dst, VR128:$src)]>, VEX; def MOVLQ128mr : PDI<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src), "movq\t{$src, $dst|$dst, $src}", - [(int_x86_sse2_storel_dq addr:$dst, VR128:$src)]>; + [(int_x86_sse2_storel_dq addr:$dst, VR128:$src)], + IIC_SSE_MOVDQ>; let AddedComplexity = 20 in def VMOVZQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), "vmovq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 (scalar_to_vector - (loadi64 addr:$src))))))]>, + (loadi64 addr:$src))))))], + IIC_SSE_MOVDQ>, XS, VEX, Requires<[HasAVX]>; let AddedComplexity = 20 in @@ -4347,9 +4798,19 @@ def MOVZQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), "movq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 (scalar_to_vector - (loadi64 addr:$src))))))]>, + (loadi64 addr:$src))))))], + IIC_SSE_MOVDQ>, XS, Requires<[HasSSE2]>; +let Predicates = [HasAVX], AddedComplexity = 20 in { + def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))), + (VMOVZQI2PQIrm addr:$src)>; + def : Pat<(v2i64 (X86vzmovl (bc_v2i64 (loadv4f32 addr:$src)))), + (VMOVZQI2PQIrm addr:$src)>; + def : Pat<(v2i64 (X86vzload addr:$src)), + (VMOVZQI2PQIrm addr:$src)>; +} + let Predicates = [HasSSE2], AddedComplexity = 20 in { def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))), (MOVZQI2PQIrm addr:$src)>; @@ -4358,13 +4819,11 @@ let Predicates = [HasSSE2], AddedComplexity = 20 in { def : Pat<(v2i64 (X86vzload addr:$src)), (MOVZQI2PQIrm addr:$src)>; } -let Predicates = [HasAVX], AddedComplexity = 20 in { - def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))), - (VMOVZQI2PQIrm addr:$src)>; - def : Pat<(v2i64 (X86vzmovl (bc_v2i64 (loadv4f32 addr:$src)))), - (VMOVZQI2PQIrm addr:$src)>; - def : Pat<(v2i64 (X86vzload addr:$src)), - (VMOVZQI2PQIrm addr:$src)>; +let Predicates = [HasAVX] in { +def : Pat<(v4i64 (alignedX86vzload addr:$src)), + (SUBREG_TO_REG (i32 0), (VMOVAPSrm addr:$src), sub_xmm)>; +def : Pat<(v4i64 (X86vzload addr:$src)), + (SUBREG_TO_REG (i32 0), (VMOVUPSrm addr:$src), sub_xmm)>; } //===---------------------------------------------------------------------===// @@ -4374,51 +4833,58 @@ let Predicates = [HasAVX], AddedComplexity = 20 in { let AddedComplexity = 15 in 
def VMOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "vmovq\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))]>, + [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))], + IIC_SSE_MOVQ_RR>, XS, VEX, Requires<[HasAVX]>; let AddedComplexity = 15 in def MOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "movq\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))]>, + [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))], + IIC_SSE_MOVQ_RR>, XS, Requires<[HasSSE2]>; let AddedComplexity = 20 in def VMOVZPQILo2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), "vmovq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (v2i64 (X86vzmovl - (loadv2i64 addr:$src))))]>, + (loadv2i64 addr:$src))))], + IIC_SSE_MOVDQ>, XS, VEX, Requires<[HasAVX]>; let AddedComplexity = 20 in { def MOVZPQILo2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), "movq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (v2i64 (X86vzmovl - (loadv2i64 addr:$src))))]>, + (loadv2i64 addr:$src))))], + IIC_SSE_MOVDQ>, XS, Requires<[HasSSE2]>; } let AddedComplexity = 20 in { - let Predicates = [HasSSE2] in { - def : Pat<(v2i64 (X86vzmovl (bc_v2i64 (loadv4i32 addr:$src)))), - (MOVZPQILo2PQIrm addr:$src)>; - def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))), - (MOVZPQILo2PQIrr VR128:$src)>; - } let Predicates = [HasAVX] in { - def : Pat<(v2i64 (X86vzmovl (bc_v2i64 (loadv4i32 addr:$src)))), + def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))), (VMOVZPQILo2PQIrm addr:$src)>; def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))), (VMOVZPQILo2PQIrr VR128:$src)>; } + let Predicates = [HasSSE2] in { + def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))), + (MOVZPQILo2PQIrm addr:$src)>; + def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))), + (MOVZPQILo2PQIrr VR128:$src)>; + } } // Instructions to match in the assembler def VMOVQs64rr : VPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src), - "movq\t{$src, $dst|$dst, $src}", []>, VEX, VEX_W; + "movq\t{$src, $dst|$dst, $src}", [], + IIC_SSE_MOVDQ>, VEX, VEX_W; def VMOVQd64rr : VPDI<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src), - "movq\t{$src, $dst|$dst, $src}", []>, VEX, VEX_W; + "movq\t{$src, $dst|$dst, $src}", [], + IIC_SSE_MOVDQ>, VEX, VEX_W; // Recognize "movd" with GR64 destination, but encode as a "movq" def VMOVQd64rr_alt : VPDI<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src), - "movd\t{$src, $dst|$dst, $src}", []>, VEX, VEX_W; + "movd\t{$src, $dst|$dst, $src}", [], + IIC_SSE_MOVDQ>, VEX, VEX_W; // Instructions for the disassembler // xr = XMM register @@ -4428,7 +4894,7 @@ let Predicates = [HasAVX] in def VMOVQxrxr: I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "vmovq\t{$src, $dst|$dst, $src}", []>, VEX, XS; def MOVQxrxr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), - "movq\t{$src, $dst|$dst, $src}", []>, XS; + "movq\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVQ_RR>, XS; //===---------------------------------------------------------------------===// // SSE3 - Conversion Instructions @@ -4458,14 +4924,16 @@ def VCVTPD2DQYrm : S3DI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src), } def CVTPD2DQrm : S3DI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), - "cvtpd2dq\t{$src, $dst|$dst, $src}", []>; + "cvtpd2dq\t{$src, $dst|$dst, $src}", [], + IIC_SSE_CVT_PD_RM>; def CVTPD2DQrr : S3DI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), - "cvtpd2dq\t{$src, $dst|$dst, $src}", []>; + "cvtpd2dq\t{$src, 
$dst|$dst, $src}", [], + IIC_SSE_CVT_PD_RR>; def : Pat<(v4i32 (fp_to_sint (v4f64 VR256:$src))), - (VCVTPD2DQYrr VR256:$src)>; + (VCVTTPD2DQYrr VR256:$src)>; def : Pat<(v4i32 (fp_to_sint (memopv4f64 addr:$src))), - (VCVTPD2DQYrm addr:$src)>; + (VCVTTPD2DQYrm addr:$src)>; // Convert Packed DW Integers to Packed Double FP let Predicates = [HasAVX] in { @@ -4480,14 +4948,16 @@ def VCVTDQ2PDYrr : S3SI<0xE6, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src), } def CVTDQ2PDrm : S3SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), - "cvtdq2pd\t{$src, $dst|$dst, $src}", []>; + "cvtdq2pd\t{$src, $dst|$dst, $src}", [], + IIC_SSE_CVT_PD_RR>; def CVTDQ2PDrr : S3SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), - "cvtdq2pd\t{$src, $dst|$dst, $src}", []>; + "cvtdq2pd\t{$src, $dst|$dst, $src}", [], + IIC_SSE_CVT_PD_RM>; // AVX 256-bit register conversion intrinsics def : Pat<(int_x86_avx_cvtdq2_pd_256 VR128:$src), (VCVTDQ2PDYrr VR128:$src)>; -def : Pat<(int_x86_avx_cvtdq2_pd_256 (memopv4i32 addr:$src)), +def : Pat<(int_x86_avx_cvtdq2_pd_256 (bitconvert (memopv2i64 addr:$src))), (VCVTDQ2PDYrm addr:$src)>; def : Pat<(int_x86_avx_cvt_pd2dq_256 VR256:$src), @@ -4497,7 +4967,7 @@ def : Pat<(int_x86_avx_cvt_pd2dq_256 (memopv4f64 addr:$src)), def : Pat<(v4f64 (sint_to_fp (v4i32 VR128:$src))), (VCVTDQ2PDYrr VR128:$src)>; -def : Pat<(v4f64 (sint_to_fp (memopv4i32 addr:$src))), +def : Pat<(v4f64 (sint_to_fp (bc_v4i32 (memopv2i64 addr:$src)))), (VCVTDQ2PDYrm addr:$src)>; //===---------------------------------------------------------------------===// @@ -4508,10 +4978,12 @@ multiclass sse3_replicate_sfp<bits<8> op, SDNode OpNode, string OpcodeStr, X86MemOperand x86memop> { def rr : S3SI<op, MRMSrcReg, (outs RC:$dst), (ins RC:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set RC:$dst, (vt (OpNode RC:$src)))]>; + [(set RC:$dst, (vt (OpNode RC:$src)))], + IIC_SSE_MOV_LH>; def rm : S3SI<op, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set RC:$dst, (OpNode (mem_frag addr:$src)))]>; + [(set RC:$dst, (OpNode (mem_frag addr:$src)))], + IIC_SSE_MOV_LH>; } let Predicates = [HasAVX] in { @@ -4529,17 +5001,6 @@ defm MOVSHDUP : sse3_replicate_sfp<0x16, X86Movshdup, "movshdup", v4f32, VR128, defm MOVSLDUP : sse3_replicate_sfp<0x12, X86Movsldup, "movsldup", v4f32, VR128, memopv4f32, f128mem>; -let Predicates = [HasSSE3] in { - def : Pat<(v4i32 (X86Movshdup VR128:$src)), - (MOVSHDUPrr VR128:$src)>; - def : Pat<(v4i32 (X86Movshdup (bc_v4i32 (memopv2i64 addr:$src)))), - (MOVSHDUPrm addr:$src)>; - def : Pat<(v4i32 (X86Movsldup VR128:$src)), - (MOVSLDUPrr VR128:$src)>; - def : Pat<(v4i32 (X86Movsldup (bc_v4i32 (memopv2i64 addr:$src)))), - (MOVSLDUPrm addr:$src)>; -} - let Predicates = [HasAVX] in { def : Pat<(v4i32 (X86Movshdup VR128:$src)), (VMOVSHDUPrr VR128:$src)>; @@ -4559,82 +5020,60 @@ let Predicates = [HasAVX] in { (VMOVSLDUPYrm addr:$src)>; } +let Predicates = [HasSSE3] in { + def : Pat<(v4i32 (X86Movshdup VR128:$src)), + (MOVSHDUPrr VR128:$src)>; + def : Pat<(v4i32 (X86Movshdup (bc_v4i32 (memopv2i64 addr:$src)))), + (MOVSHDUPrm addr:$src)>; + def : Pat<(v4i32 (X86Movsldup VR128:$src)), + (MOVSLDUPrr VR128:$src)>; + def : Pat<(v4i32 (X86Movsldup (bc_v4i32 (memopv2i64 addr:$src)))), + (MOVSLDUPrm addr:$src)>; +} + //===---------------------------------------------------------------------===// // SSE3 - Replicate Double FP - MOVDDUP //===---------------------------------------------------------------------===// multiclass 
sse3_replicate_dfp<string OpcodeStr> { +let neverHasSideEffects = 1 in def rr : S3DI<0x12, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set VR128:$dst,(v2f64 (movddup VR128:$src, (undef))))]>; + [], IIC_SSE_MOV_LH>; def rm : S3DI<0x12, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set VR128:$dst, - (v2f64 (movddup (scalar_to_vector (loadf64 addr:$src)), - (undef))))]>; + (v2f64 (X86Movddup + (scalar_to_vector (loadf64 addr:$src)))))], + IIC_SSE_MOV_LH>; } // FIXME: Merge with above classe when there're patterns for the ymm version multiclass sse3_replicate_dfp_y<string OpcodeStr> { +def rr : S3DI<0x12, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR256:$dst, (v4f64 (X86Movddup VR256:$src)))]>; +def rm : S3DI<0x12, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR256:$dst, + (v4f64 (X86Movddup + (scalar_to_vector (loadf64 addr:$src)))))]>; +} + let Predicates = [HasAVX] in { - def rr : S3DI<0x12, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), - !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - []>; - def rm : S3DI<0x12, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), - !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - []>; - } + defm VMOVDDUP : sse3_replicate_dfp<"vmovddup">, VEX; + defm VMOVDDUPY : sse3_replicate_dfp_y<"vmovddup">, VEX; } defm MOVDDUP : sse3_replicate_dfp<"movddup">; -defm VMOVDDUP : sse3_replicate_dfp<"vmovddup">, VEX; -defm VMOVDDUPY : sse3_replicate_dfp_y<"vmovddup">, VEX; - -let Predicates = [HasSSE3] in { - def : Pat<(movddup (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src)))), - (undef)), - (MOVDDUPrm addr:$src)>; - let AddedComplexity = 5 in { - def : Pat<(movddup (memopv2f64 addr:$src), (undef)), (MOVDDUPrm addr:$src)>; - def : Pat<(movddup (bc_v4f32 (memopv2f64 addr:$src)), (undef)), - (MOVDDUPrm addr:$src)>; - def : Pat<(movddup (memopv2i64 addr:$src), (undef)), (MOVDDUPrm addr:$src)>; - def : Pat<(movddup (bc_v4i32 (memopv2i64 addr:$src)), (undef)), - (MOVDDUPrm addr:$src)>; - } - def : Pat<(X86Movddup (memopv2f64 addr:$src)), - (MOVDDUPrm addr:$src)>; - def : Pat<(X86Movddup (bc_v2f64 (memopv4f32 addr:$src))), - (MOVDDUPrm addr:$src)>; - def : Pat<(X86Movddup (bc_v2f64 (memopv2i64 addr:$src))), - (MOVDDUPrm addr:$src)>; - def : Pat<(X86Movddup (v2f64 (scalar_to_vector (loadf64 addr:$src)))), - (MOVDDUPrm addr:$src)>; - def : Pat<(X86Movddup (bc_v2f64 - (v2i64 (scalar_to_vector (loadi64 addr:$src))))), - (MOVDDUPrm addr:$src)>; -} let Predicates = [HasAVX] in { - def : Pat<(movddup (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src)))), - (undef)), - (VMOVDDUPrm addr:$src)>; - let AddedComplexity = 5 in { - def : Pat<(movddup (memopv2f64 addr:$src), (undef)), (VMOVDDUPrm addr:$src)>; - def : Pat<(movddup (bc_v4f32 (memopv2f64 addr:$src)), (undef)), - (VMOVDDUPrm addr:$src)>; - def : Pat<(movddup (memopv2i64 addr:$src), (undef)), (VMOVDDUPrm addr:$src)>; - def : Pat<(movddup (bc_v4i32 (memopv2i64 addr:$src)), (undef)), - (VMOVDDUPrm addr:$src)>; - } def : Pat<(X86Movddup (memopv2f64 addr:$src)), (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>; def : Pat<(X86Movddup (bc_v2f64 (memopv4f32 addr:$src))), (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>; def : Pat<(X86Movddup (bc_v2f64 (memopv2i64 addr:$src))), (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>; - def : Pat<(X86Movddup (v2f64 (scalar_to_vector (loadf64 
addr:$src)))), - (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>; def : Pat<(X86Movddup (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src))))), (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>; @@ -4644,16 +5083,24 @@ let Predicates = [HasAVX] in { (VMOVDDUPYrm addr:$src)>; def : Pat<(X86Movddup (memopv4i64 addr:$src)), (VMOVDDUPYrm addr:$src)>; - def : Pat<(X86Movddup (v4f64 (scalar_to_vector (loadf64 addr:$src)))), - (VMOVDDUPYrm addr:$src)>; def : Pat<(X86Movddup (v4i64 (scalar_to_vector (loadi64 addr:$src)))), (VMOVDDUPYrm addr:$src)>; - def : Pat<(X86Movddup (v4f64 VR256:$src)), - (VMOVDDUPYrr VR256:$src)>; def : Pat<(X86Movddup (v4i64 VR256:$src)), (VMOVDDUPYrr VR256:$src)>; } +let Predicates = [HasSSE3] in { + def : Pat<(X86Movddup (memopv2f64 addr:$src)), + (MOVDDUPrm addr:$src)>; + def : Pat<(X86Movddup (bc_v2f64 (memopv4f32 addr:$src))), + (MOVDDUPrm addr:$src)>; + def : Pat<(X86Movddup (bc_v2f64 (memopv2i64 addr:$src))), + (MOVDDUPrm addr:$src)>; + def : Pat<(X86Movddup (bc_v2f64 + (v2i64 (scalar_to_vector (loadi64 addr:$src))))), + (MOVDDUPrm addr:$src)>; +} + //===---------------------------------------------------------------------===// // SSE3 - Move Unaligned Integer //===---------------------------------------------------------------------===// @@ -4668,45 +5115,51 @@ let Predicates = [HasAVX] in { } def LDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), "lddqu\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>; + [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))], + IIC_SSE_LDDQU>; //===---------------------------------------------------------------------===// // SSE3 - Arithmetic //===---------------------------------------------------------------------===// multiclass sse3_addsub<Intrinsic Int, string OpcodeStr, RegisterClass RC, - X86MemOperand x86memop, bit Is2Addr = 1> { + X86MemOperand x86memop, OpndItins itins, + bit Is2Addr = 1> { def rr : I<0xD0, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), !if(Is2Addr, !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), - [(set RC:$dst, (Int RC:$src1, RC:$src2))]>; + [(set RC:$dst, (Int RC:$src1, RC:$src2))], itins.rr>; def rm : I<0xD0, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2), !if(Is2Addr, !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), - [(set RC:$dst, (Int RC:$src1, (memop addr:$src2)))]>; + [(set RC:$dst, (Int RC:$src1, (memop addr:$src2)))], itins.rr>; } -let Predicates = [HasAVX], - ExeDomain = SSEPackedDouble in { - defm VADDSUBPS : sse3_addsub<int_x86_sse3_addsub_ps, "vaddsubps", VR128, - f128mem, 0>, TB, XD, VEX_4V; - defm VADDSUBPD : sse3_addsub<int_x86_sse3_addsub_pd, "vaddsubpd", VR128, - f128mem, 0>, TB, OpSize, VEX_4V; - defm VADDSUBPSY : sse3_addsub<int_x86_avx_addsub_ps_256, "vaddsubps", VR256, - f256mem, 0>, TB, XD, VEX_4V; - defm VADDSUBPDY : sse3_addsub<int_x86_avx_addsub_pd_256, "vaddsubpd", VR256, - f256mem, 0>, TB, OpSize, VEX_4V; -} -let Constraints = "$src1 = $dst", Predicates = [HasSSE3], - ExeDomain = SSEPackedDouble in { +let Predicates = [HasAVX] in { + let ExeDomain = SSEPackedSingle in { + defm VADDSUBPS : sse3_addsub<int_x86_sse3_addsub_ps, "vaddsubps", VR128, + f128mem, SSE_ALU_F32P, 0>, TB, XD, VEX_4V; + defm VADDSUBPSY : sse3_addsub<int_x86_avx_addsub_ps_256, "vaddsubps", VR256, + f256mem, SSE_ALU_F32P, 0>, TB, XD, VEX_4V; + } + let ExeDomain = SSEPackedDouble in { + defm VADDSUBPD 
: sse3_addsub<int_x86_sse3_addsub_pd, "vaddsubpd", VR128, + f128mem, SSE_ALU_F64P, 0>, TB, OpSize, VEX_4V; + defm VADDSUBPDY : sse3_addsub<int_x86_avx_addsub_pd_256, "vaddsubpd", VR256, + f256mem, SSE_ALU_F64P, 0>, TB, OpSize, VEX_4V; + } +} +let Constraints = "$src1 = $dst", Predicates = [HasSSE3] in { + let ExeDomain = SSEPackedSingle in defm ADDSUBPS : sse3_addsub<int_x86_sse3_addsub_ps, "addsubps", VR128, - f128mem>, TB, XD; + f128mem, SSE_ALU_F32P>, TB, XD; + let ExeDomain = SSEPackedDouble in defm ADDSUBPD : sse3_addsub<int_x86_sse3_addsub_pd, "addsubpd", VR128, - f128mem>, TB, OpSize; + f128mem, SSE_ALU_F64P>, TB, OpSize; } //===---------------------------------------------------------------------===// @@ -4720,13 +5173,14 @@ multiclass S3D_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC, !if(Is2Addr, !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), - [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))]>; + [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], IIC_SSE_HADDSUB_RR>; def rm : S3DI<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2), !if(Is2Addr, !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), - [(set RC:$dst, (vt (OpNode RC:$src1, (memop addr:$src2))))]>; + [(set RC:$dst, (vt (OpNode RC:$src1, (memop addr:$src2))))], + IIC_SSE_HADDSUB_RM>; } multiclass S3_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC, X86MemOperand x86memop, SDNode OpNode, bit Is2Addr = 1> { @@ -4734,39 +5188,48 @@ multiclass S3_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC, !if(Is2Addr, !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), - [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))]>; + [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], IIC_SSE_HADDSUB_RR>; def rm : S3I<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2), !if(Is2Addr, !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), - [(set RC:$dst, (vt (OpNode RC:$src1, (memop addr:$src2))))]>; + [(set RC:$dst, (vt (OpNode RC:$src1, (memop addr:$src2))))], + IIC_SSE_HADDSUB_RM>; } let Predicates = [HasAVX] in { - defm VHADDPS : S3D_Int<0x7C, "vhaddps", v4f32, VR128, f128mem, - X86fhadd, 0>, VEX_4V; - defm VHADDPD : S3_Int <0x7C, "vhaddpd", v2f64, VR128, f128mem, - X86fhadd, 0>, VEX_4V; - defm VHSUBPS : S3D_Int<0x7D, "vhsubps", v4f32, VR128, f128mem, - X86fhsub, 0>, VEX_4V; - defm VHSUBPD : S3_Int <0x7D, "vhsubpd", v2f64, VR128, f128mem, - X86fhsub, 0>, VEX_4V; - defm VHADDPSY : S3D_Int<0x7C, "vhaddps", v8f32, VR256, f256mem, - X86fhadd, 0>, VEX_4V; - defm VHADDPDY : S3_Int <0x7C, "vhaddpd", v4f64, VR256, f256mem, - X86fhadd, 0>, VEX_4V; - defm VHSUBPSY : S3D_Int<0x7D, "vhsubps", v8f32, VR256, f256mem, - X86fhsub, 0>, VEX_4V; - defm VHSUBPDY : S3_Int <0x7D, "vhsubpd", v4f64, VR256, f256mem, - X86fhsub, 0>, VEX_4V; + let ExeDomain = SSEPackedSingle in { + defm VHADDPS : S3D_Int<0x7C, "vhaddps", v4f32, VR128, f128mem, + X86fhadd, 0>, VEX_4V; + defm VHSUBPS : S3D_Int<0x7D, "vhsubps", v4f32, VR128, f128mem, + X86fhsub, 0>, VEX_4V; + defm VHADDPSY : S3D_Int<0x7C, "vhaddps", v8f32, VR256, f256mem, + X86fhadd, 0>, VEX_4V; + defm VHSUBPSY : S3D_Int<0x7D, "vhsubps", v8f32, VR256, f256mem, + X86fhsub, 0>, VEX_4V; + } + let ExeDomain = SSEPackedDouble in { + defm VHADDPD : S3_Int <0x7C, "vhaddpd", v2f64, 
VR128, f128mem, + X86fhadd, 0>, VEX_4V; + defm VHSUBPD : S3_Int <0x7D, "vhsubpd", v2f64, VR128, f128mem, + X86fhsub, 0>, VEX_4V; + defm VHADDPDY : S3_Int <0x7C, "vhaddpd", v4f64, VR256, f256mem, + X86fhadd, 0>, VEX_4V; + defm VHSUBPDY : S3_Int <0x7D, "vhsubpd", v4f64, VR256, f256mem, + X86fhsub, 0>, VEX_4V; + } } let Constraints = "$src1 = $dst" in { - defm HADDPS : S3D_Int<0x7C, "haddps", v4f32, VR128, f128mem, X86fhadd>; - defm HADDPD : S3_Int<0x7C, "haddpd", v2f64, VR128, f128mem, X86fhadd>; - defm HSUBPS : S3D_Int<0x7D, "hsubps", v4f32, VR128, f128mem, X86fhsub>; - defm HSUBPD : S3_Int<0x7D, "hsubpd", v2f64, VR128, f128mem, X86fhsub>; + let ExeDomain = SSEPackedSingle in { + defm HADDPS : S3D_Int<0x7C, "haddps", v4f32, VR128, f128mem, X86fhadd>; + defm HSUBPS : S3D_Int<0x7D, "hsubps", v4f32, VR128, f128mem, X86fhsub>; + } + let ExeDomain = SSEPackedDouble in { + defm HADDPD : S3_Int<0x7C, "haddpd", v2f64, VR128, f128mem, X86fhadd>; + defm HSUBPD : S3_Int<0x7D, "hsubpd", v2f64, VR128, f128mem, X86fhsub>; + } } //===---------------------------------------------------------------------===// @@ -4776,11 +5239,11 @@ let Constraints = "$src1 = $dst" in { /// SS3I_unop_rm_int - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}. multiclass SS3I_unop_rm_int<bits<8> opc, string OpcodeStr, - PatFrag mem_frag128, Intrinsic IntId128> { + Intrinsic IntId128> { def rr128 : SS38I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set VR128:$dst, (IntId128 VR128:$src))]>, + [(set VR128:$dst, (IntId128 VR128:$src))], IIC_SSE_PABS_RR>, OpSize; def rm128 : SS38I<opc, MRMSrcMem, (outs VR128:$dst), @@ -4788,32 +5251,101 @@ multiclass SS3I_unop_rm_int<bits<8> opc, string OpcodeStr, !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set VR128:$dst, (IntId128 - (bitconvert (mem_frag128 addr:$src))))]>, OpSize; + (bitconvert (memopv2i64 addr:$src))))], IIC_SSE_PABS_RM>, + OpSize; +} + +/// SS3I_unop_rm_int_y - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}. 
+multiclass SS3I_unop_rm_int_y<bits<8> opc, string OpcodeStr, + Intrinsic IntId256> { + def rr256 : SS38I<opc, MRMSrcReg, (outs VR256:$dst), + (ins VR256:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR256:$dst, (IntId256 VR256:$src))]>, + OpSize; + + def rm256 : SS38I<opc, MRMSrcMem, (outs VR256:$dst), + (ins i256mem:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR256:$dst, + (IntId256 + (bitconvert (memopv4i64 addr:$src))))]>, OpSize; } let Predicates = [HasAVX] in { - defm VPABSB : SS3I_unop_rm_int<0x1C, "vpabsb", memopv16i8, + defm VPABSB : SS3I_unop_rm_int<0x1C, "vpabsb", int_x86_ssse3_pabs_b_128>, VEX; - defm VPABSW : SS3I_unop_rm_int<0x1D, "vpabsw", memopv8i16, + defm VPABSW : SS3I_unop_rm_int<0x1D, "vpabsw", int_x86_ssse3_pabs_w_128>, VEX; - defm VPABSD : SS3I_unop_rm_int<0x1E, "vpabsd", memopv4i32, + defm VPABSD : SS3I_unop_rm_int<0x1E, "vpabsd", int_x86_ssse3_pabs_d_128>, VEX; } -defm PABSB : SS3I_unop_rm_int<0x1C, "pabsb", memopv16i8, +let Predicates = [HasAVX2] in { + defm VPABSB : SS3I_unop_rm_int_y<0x1C, "vpabsb", + int_x86_avx2_pabs_b>, VEX; + defm VPABSW : SS3I_unop_rm_int_y<0x1D, "vpabsw", + int_x86_avx2_pabs_w>, VEX; + defm VPABSD : SS3I_unop_rm_int_y<0x1E, "vpabsd", + int_x86_avx2_pabs_d>, VEX; +} + +defm PABSB : SS3I_unop_rm_int<0x1C, "pabsb", int_x86_ssse3_pabs_b_128>; -defm PABSW : SS3I_unop_rm_int<0x1D, "pabsw", memopv8i16, +defm PABSW : SS3I_unop_rm_int<0x1D, "pabsw", int_x86_ssse3_pabs_w_128>; -defm PABSD : SS3I_unop_rm_int<0x1E, "pabsd", memopv4i32, +defm PABSD : SS3I_unop_rm_int<0x1E, "pabsd", int_x86_ssse3_pabs_d_128>; //===---------------------------------------------------------------------===// // SSSE3 - Packed Binary Operator Instructions //===---------------------------------------------------------------------===// +def SSE_PHADDSUBD : OpndItins< + IIC_SSE_PHADDSUBD_RR, IIC_SSE_PHADDSUBD_RM +>; +def SSE_PHADDSUBSW : OpndItins< + IIC_SSE_PHADDSUBSW_RR, IIC_SSE_PHADDSUBSW_RM +>; +def SSE_PHADDSUBW : OpndItins< + IIC_SSE_PHADDSUBW_RR, IIC_SSE_PHADDSUBW_RM +>; +def SSE_PSHUFB : OpndItins< + IIC_SSE_PSHUFB_RR, IIC_SSE_PSHUFB_RM +>; +def SSE_PSIGN : OpndItins< + IIC_SSE_PSIGN_RR, IIC_SSE_PSIGN_RM +>; +def SSE_PMULHRSW : OpndItins< + IIC_SSE_PMULHRSW, IIC_SSE_PMULHRSW +>; + +/// SS3I_binop_rm - Simple SSSE3 bin op +multiclass SS3I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, + ValueType OpVT, RegisterClass RC, PatFrag memop_frag, + X86MemOperand x86memop, OpndItins itins, + bit Is2Addr = 1> { + let isCommutable = 1 in + def rr : SS38I<opc, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))], itins.rr>, + OpSize; + def rm : SS38I<opc, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, x86memop:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, + (OpVT (OpNode RC:$src1, + (bitconvert (memop_frag addr:$src2)))))], itins.rm>, OpSize; +} + /// SS3I_binop_rm_int - Simple SSSE3 bin op whose type can be v*{i8,i16,i32}. 
multiclass SS3I_binop_rm_int<bits<8> opc, string OpcodeStr, - PatFrag mem_frag128, Intrinsic IntId128, + Intrinsic IntId128, OpndItins itins, bit Is2Addr = 1> { let isCommutable = 1 in def rr128 : SS38I<opc, MRMSrcReg, (outs VR128:$dst), @@ -4830,94 +5362,134 @@ multiclass SS3I_binop_rm_int<bits<8> opc, string OpcodeStr, !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), [(set VR128:$dst, (IntId128 VR128:$src1, - (bitconvert (memopv16i8 addr:$src2))))]>, OpSize; + (bitconvert (memopv2i64 addr:$src2))))]>, OpSize; +} + +multiclass SS3I_binop_rm_int_y<bits<8> opc, string OpcodeStr, + Intrinsic IntId256> { + let isCommutable = 1 in + def rr256 : SS38I<opc, MRMSrcReg, (outs VR256:$dst), + (ins VR256:$src1, VR256:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR256:$dst, (IntId256 VR256:$src1, VR256:$src2))]>, + OpSize; + def rm256 : SS38I<opc, MRMSrcMem, (outs VR256:$dst), + (ins VR256:$src1, i256mem:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR256:$dst, + (IntId256 VR256:$src1, + (bitconvert (memopv4i64 addr:$src2))))]>, OpSize; } let ImmT = NoImm, Predicates = [HasAVX] in { let isCommutable = 0 in { - defm VPHADDW : SS3I_binop_rm_int<0x01, "vphaddw", memopv8i16, - int_x86_ssse3_phadd_w_128, 0>, VEX_4V; - defm VPHADDD : SS3I_binop_rm_int<0x02, "vphaddd", memopv4i32, - int_x86_ssse3_phadd_d_128, 0>, VEX_4V; - defm VPHADDSW : SS3I_binop_rm_int<0x03, "vphaddsw", memopv8i16, - int_x86_ssse3_phadd_sw_128, 0>, VEX_4V; - defm VPHSUBW : SS3I_binop_rm_int<0x05, "vphsubw", memopv8i16, - int_x86_ssse3_phsub_w_128, 0>, VEX_4V; - defm VPHSUBD : SS3I_binop_rm_int<0x06, "vphsubd", memopv4i32, - int_x86_ssse3_phsub_d_128, 0>, VEX_4V; - defm VPHSUBSW : SS3I_binop_rm_int<0x07, "vphsubsw", memopv8i16, - int_x86_ssse3_phsub_sw_128, 0>, VEX_4V; - defm VPMADDUBSW : SS3I_binop_rm_int<0x04, "vpmaddubsw", memopv16i8, - int_x86_ssse3_pmadd_ub_sw_128, 0>, VEX_4V; - defm VPSHUFB : SS3I_binop_rm_int<0x00, "vpshufb", memopv16i8, - int_x86_ssse3_pshuf_b_128, 0>, VEX_4V; - defm VPSIGNB : SS3I_binop_rm_int<0x08, "vpsignb", memopv16i8, - int_x86_ssse3_psign_b_128, 0>, VEX_4V; - defm VPSIGNW : SS3I_binop_rm_int<0x09, "vpsignw", memopv8i16, - int_x86_ssse3_psign_w_128, 0>, VEX_4V; - defm VPSIGND : SS3I_binop_rm_int<0x0A, "vpsignd", memopv4i32, - int_x86_ssse3_psign_d_128, 0>, VEX_4V; -} -defm VPMULHRSW : SS3I_binop_rm_int<0x0B, "vpmulhrsw", memopv8i16, - int_x86_ssse3_pmul_hr_sw_128, 0>, VEX_4V; + defm VPHADDW : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v8i16, VR128, + memopv2i64, i128mem, + SSE_PHADDSUBW, 0>, VEX_4V; + defm VPHADDD : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v4i32, VR128, + memopv2i64, i128mem, + SSE_PHADDSUBD, 0>, VEX_4V; + defm VPHSUBW : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v8i16, VR128, + memopv2i64, i128mem, + SSE_PHADDSUBW, 0>, VEX_4V; + defm VPHSUBD : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v4i32, VR128, + memopv2i64, i128mem, + SSE_PHADDSUBD, 0>, VEX_4V; + defm VPSIGNB : SS3I_binop_rm<0x08, "vpsignb", X86psign, v16i8, VR128, + memopv2i64, i128mem, + SSE_PSIGN, 0>, VEX_4V; + defm VPSIGNW : SS3I_binop_rm<0x09, "vpsignw", X86psign, v8i16, VR128, + memopv2i64, i128mem, + SSE_PSIGN, 0>, VEX_4V; + defm VPSIGND : SS3I_binop_rm<0x0A, "vpsignd", X86psign, v4i32, VR128, + memopv2i64, i128mem, + SSE_PSIGN, 0>, VEX_4V; + defm VPSHUFB : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v16i8, VR128, + memopv2i64, i128mem, + SSE_PSHUFB, 0>, VEX_4V; + defm VPHADDSW : SS3I_binop_rm_int<0x03, "vphaddsw", + 
int_x86_ssse3_phadd_sw_128, + SSE_PHADDSUBSW, 0>, VEX_4V; + defm VPHSUBSW : SS3I_binop_rm_int<0x07, "vphsubsw", + int_x86_ssse3_phsub_sw_128, + SSE_PHADDSUBSW, 0>, VEX_4V; + defm VPMADDUBSW : SS3I_binop_rm_int<0x04, "vpmaddubsw", + int_x86_ssse3_pmadd_ub_sw_128, + SSE_PMADD, 0>, VEX_4V; +} +defm VPMULHRSW : SS3I_binop_rm_int<0x0B, "vpmulhrsw", + int_x86_ssse3_pmul_hr_sw_128, + SSE_PMULHRSW, 0>, VEX_4V; +} + +let ImmT = NoImm, Predicates = [HasAVX2] in { +let isCommutable = 0 in { + defm VPHADDWY : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v16i16, VR256, + memopv4i64, i256mem, + SSE_PHADDSUBW, 0>, VEX_4V; + defm VPHADDDY : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v8i32, VR256, + memopv4i64, i256mem, + SSE_PHADDSUBW, 0>, VEX_4V; + defm VPHSUBWY : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v16i16, VR256, + memopv4i64, i256mem, + SSE_PHADDSUBW, 0>, VEX_4V; + defm VPHSUBDY : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v8i32, VR256, + memopv4i64, i256mem, + SSE_PHADDSUBW, 0>, VEX_4V; + defm VPSIGNBY : SS3I_binop_rm<0x08, "vpsignb", X86psign, v32i8, VR256, + memopv4i64, i256mem, + SSE_PHADDSUBW, 0>, VEX_4V; + defm VPSIGNWY : SS3I_binop_rm<0x09, "vpsignw", X86psign, v16i16, VR256, + memopv4i64, i256mem, + SSE_PHADDSUBW, 0>, VEX_4V; + defm VPSIGNDY : SS3I_binop_rm<0x0A, "vpsignd", X86psign, v8i32, VR256, + memopv4i64, i256mem, + SSE_PHADDSUBW, 0>, VEX_4V; + defm VPSHUFBY : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v32i8, VR256, + memopv4i64, i256mem, + SSE_PHADDSUBW, 0>, VEX_4V; + defm VPHADDSW : SS3I_binop_rm_int_y<0x03, "vphaddsw", + int_x86_avx2_phadd_sw>, VEX_4V; + defm VPHSUBSW : SS3I_binop_rm_int_y<0x07, "vphsubsw", + int_x86_avx2_phsub_sw>, VEX_4V; + defm VPMADDUBSW : SS3I_binop_rm_int_y<0x04, "vpmaddubsw", + int_x86_avx2_pmadd_ub_sw>, VEX_4V; +} +defm VPMULHRSW : SS3I_binop_rm_int_y<0x0B, "vpmulhrsw", + int_x86_avx2_pmul_hr_sw>, VEX_4V; } // None of these have i8 immediate fields. 
let ImmT = NoImm, Constraints = "$src1 = $dst" in { let isCommutable = 0 in { - defm PHADDW : SS3I_binop_rm_int<0x01, "phaddw", memopv8i16, - int_x86_ssse3_phadd_w_128>; - defm PHADDD : SS3I_binop_rm_int<0x02, "phaddd", memopv4i32, - int_x86_ssse3_phadd_d_128>; - defm PHADDSW : SS3I_binop_rm_int<0x03, "phaddsw", memopv8i16, - int_x86_ssse3_phadd_sw_128>; - defm PHSUBW : SS3I_binop_rm_int<0x05, "phsubw", memopv8i16, - int_x86_ssse3_phsub_w_128>; - defm PHSUBD : SS3I_binop_rm_int<0x06, "phsubd", memopv4i32, - int_x86_ssse3_phsub_d_128>; - defm PHSUBSW : SS3I_binop_rm_int<0x07, "phsubsw", memopv8i16, - int_x86_ssse3_phsub_sw_128>; - defm PMADDUBSW : SS3I_binop_rm_int<0x04, "pmaddubsw", memopv16i8, - int_x86_ssse3_pmadd_ub_sw_128>; - defm PSHUFB : SS3I_binop_rm_int<0x00, "pshufb", memopv16i8, - int_x86_ssse3_pshuf_b_128>; - defm PSIGNB : SS3I_binop_rm_int<0x08, "psignb", memopv16i8, - int_x86_ssse3_psign_b_128>; - defm PSIGNW : SS3I_binop_rm_int<0x09, "psignw", memopv8i16, - int_x86_ssse3_psign_w_128>; - defm PSIGND : SS3I_binop_rm_int<0x0A, "psignd", memopv4i32, - int_x86_ssse3_psign_d_128>; -} -defm PMULHRSW : SS3I_binop_rm_int<0x0B, "pmulhrsw", memopv8i16, - int_x86_ssse3_pmul_hr_sw_128>; -} - -let Predicates = [HasSSSE3] in { - def : Pat<(X86pshufb VR128:$src, VR128:$mask), - (PSHUFBrr128 VR128:$src, VR128:$mask)>; - def : Pat<(X86pshufb VR128:$src, (bc_v16i8 (memopv2i64 addr:$mask))), - (PSHUFBrm128 VR128:$src, addr:$mask)>; - - def : Pat<(X86psignb VR128:$src1, VR128:$src2), - (PSIGNBrr128 VR128:$src1, VR128:$src2)>; - def : Pat<(X86psignw VR128:$src1, VR128:$src2), - (PSIGNWrr128 VR128:$src1, VR128:$src2)>; - def : Pat<(X86psignd VR128:$src1, VR128:$src2), - (PSIGNDrr128 VR128:$src1, VR128:$src2)>; -} - -let Predicates = [HasAVX] in { - def : Pat<(X86pshufb VR128:$src, VR128:$mask), - (VPSHUFBrr128 VR128:$src, VR128:$mask)>; - def : Pat<(X86pshufb VR128:$src, (bc_v16i8 (memopv2i64 addr:$mask))), - (VPSHUFBrm128 VR128:$src, addr:$mask)>; - - def : Pat<(X86psignb VR128:$src1, VR128:$src2), - (VPSIGNBrr128 VR128:$src1, VR128:$src2)>; - def : Pat<(X86psignw VR128:$src1, VR128:$src2), - (VPSIGNWrr128 VR128:$src1, VR128:$src2)>; - def : Pat<(X86psignd VR128:$src1, VR128:$src2), - (VPSIGNDrr128 VR128:$src1, VR128:$src2)>; + defm PHADDW : SS3I_binop_rm<0x01, "phaddw", X86hadd, v8i16, VR128, + memopv2i64, i128mem, SSE_PHADDSUBW>; + defm PHADDD : SS3I_binop_rm<0x02, "phaddd", X86hadd, v4i32, VR128, + memopv2i64, i128mem, SSE_PHADDSUBD>; + defm PHSUBW : SS3I_binop_rm<0x05, "phsubw", X86hsub, v8i16, VR128, + memopv2i64, i128mem, SSE_PHADDSUBW>; + defm PHSUBD : SS3I_binop_rm<0x06, "phsubd", X86hsub, v4i32, VR128, + memopv2i64, i128mem, SSE_PHADDSUBD>; + defm PSIGNB : SS3I_binop_rm<0x08, "psignb", X86psign, v16i8, VR128, + memopv2i64, i128mem, SSE_PSIGN>; + defm PSIGNW : SS3I_binop_rm<0x09, "psignw", X86psign, v8i16, VR128, + memopv2i64, i128mem, SSE_PSIGN>; + defm PSIGND : SS3I_binop_rm<0x0A, "psignd", X86psign, v4i32, VR128, + memopv2i64, i128mem, SSE_PSIGN>; + defm PSHUFB : SS3I_binop_rm<0x00, "pshufb", X86pshufb, v16i8, VR128, + memopv2i64, i128mem, SSE_PSHUFB>; + defm PHADDSW : SS3I_binop_rm_int<0x03, "phaddsw", + int_x86_ssse3_phadd_sw_128, + SSE_PHADDSUBSW>; + defm PHSUBSW : SS3I_binop_rm_int<0x07, "phsubsw", + int_x86_ssse3_phsub_sw_128, + SSE_PHADDSUBSW>; + defm PMADDUBSW : SS3I_binop_rm_int<0x04, "pmaddubsw", + int_x86_ssse3_pmadd_ub_sw_128, SSE_PMADD>; +} +defm PMULHRSW : SS3I_binop_rm_int<0x0B, "pmulhrsw", + int_x86_ssse3_pmul_hr_sw_128, + SSE_PMULHRSW>; } 
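As a reading aid for the SSSE3 binary-op block above: the SS3I_binop_rm instances wire PSHUFB, PSIGN and the horizontal add/sub opcodes to their selection-DAG nodes, but the .td text never spells out what those operations compute. The following is a minimal scalar model in C, written purely as an illustration (the helper names pshufb128 and phaddw128 are invented here and are not part of LLVM); the AVX2 VR256 variants added above apply the same rule independently within each 128-bit lane.

#include <stdint.h>

/* PSHUFB: destination byte i is src[mask[i] & 0x0F], or 0 when the mask
 * byte has its most-significant bit set. */
static void pshufb128(uint8_t dst[16], const uint8_t src[16],
                      const uint8_t mask[16]) {
  for (int i = 0; i < 16; ++i)
    dst[i] = (mask[i] & 0x80) ? 0 : src[mask[i] & 0x0F];
}

/* PHADDW: add adjacent signed 16-bit pairs; the low four results come from
 * the first source, the high four from the second. */
static void phaddw128(int16_t dst[8], const int16_t a[8], const int16_t b[8]) {
  for (int i = 0; i < 4; ++i) {
    dst[i]     = (int16_t)(a[2 * i] + a[2 * i + 1]);
    dst[i + 4] = (int16_t)(b[2 * i] + b[2 * i + 1]);
  }
}

PHSUBW follows the same pairing but subtracts the odd element from the even one, and PSIGNB/W/D keep, negate, or zero each element of the first source according to the sign of the corresponding element of the second.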
//===---------------------------------------------------------------------===// @@ -4925,36 +5497,57 @@ let Predicates = [HasAVX] in { //===---------------------------------------------------------------------===// multiclass ssse3_palign<string asm, bit Is2Addr = 1> { + let neverHasSideEffects = 1 in { def R128rr : SS3AI<0x0F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i8imm:$src3), !if(Is2Addr, !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), !strconcat(asm, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), - []>, OpSize; + [], IIC_SSE_PALIGNR>, OpSize; + let mayLoad = 1 in def R128rm : SS3AI<0x0F, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2, i8imm:$src3), !if(Is2Addr, !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), !strconcat(asm, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), + [], IIC_SSE_PALIGNR>, OpSize; + } +} + +multiclass ssse3_palign_y<string asm, bit Is2Addr = 1> { + let neverHasSideEffects = 1 in { + def R256rr : SS3AI<0x0F, MRMSrcReg, (outs VR256:$dst), + (ins VR256:$src1, VR256:$src2, i8imm:$src3), + !strconcat(asm, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + []>, OpSize; + let mayLoad = 1 in + def R256rm : SS3AI<0x0F, MRMSrcMem, (outs VR256:$dst), + (ins VR256:$src1, i256mem:$src2, i8imm:$src3), + !strconcat(asm, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), []>, OpSize; + } } let Predicates = [HasAVX] in defm VPALIGN : ssse3_palign<"vpalignr", 0>, VEX_4V; +let Predicates = [HasAVX2] in + defm VPALIGN : ssse3_palign_y<"vpalignr", 0>, VEX_4V; let Constraints = "$src1 = $dst", Predicates = [HasSSSE3] in defm PALIGN : ssse3_palign<"palignr">; -let Predicates = [HasSSSE3] in { -def : Pat<(v4i32 (X86PAlign VR128:$src1, VR128:$src2, (i8 imm:$imm))), - (PALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>; -def : Pat<(v4f32 (X86PAlign VR128:$src1, VR128:$src2, (i8 imm:$imm))), - (PALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>; -def : Pat<(v8i16 (X86PAlign VR128:$src1, VR128:$src2, (i8 imm:$imm))), - (PALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>; -def : Pat<(v16i8 (X86PAlign VR128:$src1, VR128:$src2, (i8 imm:$imm))), - (PALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>; +let Predicates = [HasAVX2] in { +def : Pat<(v8i32 (X86PAlign VR256:$src1, VR256:$src2, (i8 imm:$imm))), + (VPALIGNR256rr VR256:$src2, VR256:$src1, imm:$imm)>; +def : Pat<(v8f32 (X86PAlign VR256:$src1, VR256:$src2, (i8 imm:$imm))), + (VPALIGNR256rr VR256:$src2, VR256:$src1, imm:$imm)>; +def : Pat<(v16i16 (X86PAlign VR256:$src1, VR256:$src2, (i8 imm:$imm))), + (VPALIGNR256rr VR256:$src2, VR256:$src1, imm:$imm)>; +def : Pat<(v32i8 (X86PAlign VR256:$src1, VR256:$src2, (i8 imm:$imm))), + (VPALIGNR256rr VR256:$src2, VR256:$src1, imm:$imm)>; } let Predicates = [HasAVX] in { @@ -4968,23 +5561,36 @@ def : Pat<(v16i8 (X86PAlign VR128:$src1, VR128:$src2, (i8 imm:$imm))), (VPALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>; } +let Predicates = [HasSSSE3] in { +def : Pat<(v4i32 (X86PAlign VR128:$src1, VR128:$src2, (i8 imm:$imm))), + (PALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>; +def : Pat<(v4f32 (X86PAlign VR128:$src1, VR128:$src2, (i8 imm:$imm))), + (PALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>; +def : Pat<(v8i16 (X86PAlign VR128:$src1, VR128:$src2, (i8 imm:$imm))), + (PALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>; +def : Pat<(v16i8 (X86PAlign VR128:$src1, VR128:$src2, (i8 imm:$imm))), + (PALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>; +} + 
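The PALIGNR patterns just above pass the X86PAlign node's operands to the instruction in the opposite order, which is easiest to follow from the instruction's semantics: the two sources are concatenated with the first operand in the high half, the combined value is shifted right by the immediate number of bytes, and the low 16 bytes are kept. A small C sketch of that behavior, assuming nothing beyond the ISA definition (palignr128 is an invented name used only for illustration):

#include <stdint.h>
#include <string.h>

/* PALIGNR dst, src, imm8: form the 32-byte value dst:src (dst in the high
 * half), shift it right by imm8 bytes, and keep the low 16 bytes; bytes
 * shifted in from above the concatenation are zero. */
static void palignr128(uint8_t out[16], const uint8_t hi[16],
                       const uint8_t lo[16], unsigned imm) {
  uint8_t cat[32];
  memcpy(cat, lo, 16);       /* bytes  0..15: second (low) operand  */
  memcpy(cat + 16, hi, 16);  /* bytes 16..31: first (high) operand  */
  for (unsigned i = 0; i < 16; ++i)
    out[i] = (imm + i < 32) ? cat[imm + i] : 0;
}

The 256-bit VPALIGNR added for AVX2 repeats this independently in each 128-bit lane rather than shifting a single 32-byte value, so the new ssse3_palign_y multiclass can mirror the 128-bit form directly.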
//===---------------------------------------------------------------------===// // SSSE3 - Thread synchronization //===---------------------------------------------------------------------===// let usesCustomInserter = 1 in { def MONITOR : PseudoI<(outs), (ins i32mem:$src1, GR32:$src2, GR32:$src3), - [(int_x86_sse3_monitor addr:$src1, GR32:$src2, GR32:$src3)]>; + [(int_x86_sse3_monitor addr:$src1, GR32:$src2, GR32:$src3)]>, + Requires<[HasSSE3]>; def MWAIT : PseudoI<(outs), (ins GR32:$src1, GR32:$src2), - [(int_x86_sse3_mwait GR32:$src1, GR32:$src2)]>; + [(int_x86_sse3_mwait GR32:$src1, GR32:$src2)]>, + Requires<[HasSSE3]>; } let Uses = [EAX, ECX, EDX] in -def MONITORrrr : I<0x01, MRM_C8, (outs), (ins), "monitor", []>, TB, - Requires<[HasSSE3]>; +def MONITORrrr : I<0x01, MRM_C8, (outs), (ins), "monitor", [], IIC_SSE_MONITOR>, + TB, Requires<[HasSSE3]>; let Uses = [ECX, EAX] in -def MWAITrr : I<0x01, MRM_C9, (outs), (ins), "mwait", []>, TB, - Requires<[HasSSE3]>; +def MWAITrr : I<0x01, MRM_C9, (outs), (ins), "mwait", [], IIC_SSE_MWAIT>, + TB, Requires<[HasSSE3]>; def : InstAlias<"mwait %eax, %ecx", (MWAITrr)>, Requires<[In32BitMode]>; def : InstAlias<"mwait %rax, %rcx", (MWAITrr)>, Requires<[In64BitMode]>; @@ -5010,6 +5616,17 @@ multiclass SS41I_binop_rm_int8<bits<8> opc, string OpcodeStr, Intrinsic IntId> { OpSize; } +multiclass SS41I_binop_rm_int16_y<bits<8> opc, string OpcodeStr, + Intrinsic IntId> { + def Yrr : SS48I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR256:$dst, (IntId VR128:$src))]>, OpSize; + + def Yrm : SS48I<opc, MRMSrcMem, (outs VR256:$dst), (ins i128mem:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR256:$dst, (IntId (load addr:$src)))]>, OpSize; +} + let Predicates = [HasAVX] in { defm VPMOVSXBW : SS41I_binop_rm_int8<0x20, "vpmovsxbw", int_x86_sse41_pmovsxbw>, VEX; @@ -5025,6 +5642,21 @@ defm VPMOVZXDQ : SS41I_binop_rm_int8<0x35, "vpmovzxdq", int_x86_sse41_pmovzxdq>, VEX; } +let Predicates = [HasAVX2] in { +defm VPMOVSXBW : SS41I_binop_rm_int16_y<0x20, "vpmovsxbw", + int_x86_avx2_pmovsxbw>, VEX; +defm VPMOVSXWD : SS41I_binop_rm_int16_y<0x23, "vpmovsxwd", + int_x86_avx2_pmovsxwd>, VEX; +defm VPMOVSXDQ : SS41I_binop_rm_int16_y<0x25, "vpmovsxdq", + int_x86_avx2_pmovsxdq>, VEX; +defm VPMOVZXBW : SS41I_binop_rm_int16_y<0x30, "vpmovzxbw", + int_x86_avx2_pmovzxbw>, VEX; +defm VPMOVZXWD : SS41I_binop_rm_int16_y<0x33, "vpmovzxwd", + int_x86_avx2_pmovzxwd>, VEX; +defm VPMOVZXDQ : SS41I_binop_rm_int16_y<0x35, "vpmovzxdq", + int_x86_avx2_pmovzxdq>, VEX; +} + defm PMOVSXBW : SS41I_binop_rm_int8<0x20, "pmovsxbw", int_x86_sse41_pmovsxbw>; defm PMOVSXWD : SS41I_binop_rm_int8<0x23, "pmovsxwd", int_x86_sse41_pmovsxwd>; defm PMOVSXDQ : SS41I_binop_rm_int8<0x25, "pmovsxdq", int_x86_sse41_pmovsxdq>; @@ -5032,70 +5664,80 @@ defm PMOVZXBW : SS41I_binop_rm_int8<0x30, "pmovzxbw", int_x86_sse41_pmovzxbw>; defm PMOVZXWD : SS41I_binop_rm_int8<0x33, "pmovzxwd", int_x86_sse41_pmovzxwd>; defm PMOVZXDQ : SS41I_binop_rm_int8<0x35, "pmovzxdq", int_x86_sse41_pmovzxdq>; -let Predicates = [HasSSE41] in { +let Predicates = [HasAVX] in { // Common patterns involving scalar load. 
def : Pat<(int_x86_sse41_pmovsxbw (vzmovl_v2i64 addr:$src)), - (PMOVSXBWrm addr:$src)>; + (VPMOVSXBWrm addr:$src)>; def : Pat<(int_x86_sse41_pmovsxbw (vzload_v2i64 addr:$src)), - (PMOVSXBWrm addr:$src)>; + (VPMOVSXBWrm addr:$src)>; def : Pat<(int_x86_sse41_pmovsxwd (vzmovl_v2i64 addr:$src)), - (PMOVSXWDrm addr:$src)>; + (VPMOVSXWDrm addr:$src)>; def : Pat<(int_x86_sse41_pmovsxwd (vzload_v2i64 addr:$src)), - (PMOVSXWDrm addr:$src)>; + (VPMOVSXWDrm addr:$src)>; def : Pat<(int_x86_sse41_pmovsxdq (vzmovl_v2i64 addr:$src)), - (PMOVSXDQrm addr:$src)>; + (VPMOVSXDQrm addr:$src)>; def : Pat<(int_x86_sse41_pmovsxdq (vzload_v2i64 addr:$src)), - (PMOVSXDQrm addr:$src)>; + (VPMOVSXDQrm addr:$src)>; def : Pat<(int_x86_sse41_pmovzxbw (vzmovl_v2i64 addr:$src)), - (PMOVZXBWrm addr:$src)>; + (VPMOVZXBWrm addr:$src)>; def : Pat<(int_x86_sse41_pmovzxbw (vzload_v2i64 addr:$src)), - (PMOVZXBWrm addr:$src)>; + (VPMOVZXBWrm addr:$src)>; def : Pat<(int_x86_sse41_pmovzxwd (vzmovl_v2i64 addr:$src)), - (PMOVZXWDrm addr:$src)>; + (VPMOVZXWDrm addr:$src)>; def : Pat<(int_x86_sse41_pmovzxwd (vzload_v2i64 addr:$src)), - (PMOVZXWDrm addr:$src)>; + (VPMOVZXWDrm addr:$src)>; def : Pat<(int_x86_sse41_pmovzxdq (vzmovl_v2i64 addr:$src)), - (PMOVZXDQrm addr:$src)>; + (VPMOVZXDQrm addr:$src)>; def : Pat<(int_x86_sse41_pmovzxdq (vzload_v2i64 addr:$src)), - (PMOVZXDQrm addr:$src)>; + (VPMOVZXDQrm addr:$src)>; } -let Predicates = [HasAVX] in { +let Predicates = [HasSSE41] in { // Common patterns involving scalar load. def : Pat<(int_x86_sse41_pmovsxbw (vzmovl_v2i64 addr:$src)), - (VPMOVSXBWrm addr:$src)>; + (PMOVSXBWrm addr:$src)>; def : Pat<(int_x86_sse41_pmovsxbw (vzload_v2i64 addr:$src)), - (VPMOVSXBWrm addr:$src)>; + (PMOVSXBWrm addr:$src)>; def : Pat<(int_x86_sse41_pmovsxwd (vzmovl_v2i64 addr:$src)), - (VPMOVSXWDrm addr:$src)>; + (PMOVSXWDrm addr:$src)>; def : Pat<(int_x86_sse41_pmovsxwd (vzload_v2i64 addr:$src)), - (VPMOVSXWDrm addr:$src)>; + (PMOVSXWDrm addr:$src)>; def : Pat<(int_x86_sse41_pmovsxdq (vzmovl_v2i64 addr:$src)), - (VPMOVSXDQrm addr:$src)>; + (PMOVSXDQrm addr:$src)>; def : Pat<(int_x86_sse41_pmovsxdq (vzload_v2i64 addr:$src)), - (VPMOVSXDQrm addr:$src)>; + (PMOVSXDQrm addr:$src)>; def : Pat<(int_x86_sse41_pmovzxbw (vzmovl_v2i64 addr:$src)), - (VPMOVZXBWrm addr:$src)>; + (PMOVZXBWrm addr:$src)>; def : Pat<(int_x86_sse41_pmovzxbw (vzload_v2i64 addr:$src)), - (VPMOVZXBWrm addr:$src)>; + (PMOVZXBWrm addr:$src)>; def : Pat<(int_x86_sse41_pmovzxwd (vzmovl_v2i64 addr:$src)), - (VPMOVZXWDrm addr:$src)>; + (PMOVZXWDrm addr:$src)>; def : Pat<(int_x86_sse41_pmovzxwd (vzload_v2i64 addr:$src)), - (VPMOVZXWDrm addr:$src)>; + (PMOVZXWDrm addr:$src)>; def : Pat<(int_x86_sse41_pmovzxdq (vzmovl_v2i64 addr:$src)), - (VPMOVZXDQrm addr:$src)>; + (PMOVZXDQrm addr:$src)>; def : Pat<(int_x86_sse41_pmovzxdq (vzload_v2i64 addr:$src)), - (VPMOVZXDQrm addr:$src)>; + (PMOVZXDQrm addr:$src)>; +} + +let Predicates = [HasAVX] in { +def : Pat<(v2i64 (X86vsmovl (v4i32 VR128:$src))), (VPMOVSXDQrr VR128:$src)>; +def : Pat<(v4i32 (X86vsmovl (v8i16 VR128:$src))), (VPMOVSXWDrr VR128:$src)>; +} + +let Predicates = [HasSSE41] in { +def : Pat<(v2i64 (X86vsmovl (v4i32 VR128:$src))), (PMOVSXDQrr VR128:$src)>; +def : Pat<(v4i32 (X86vsmovl (v8i16 VR128:$src))), (PMOVSXWDrr VR128:$src)>; } @@ -5111,6 +5753,19 @@ multiclass SS41I_binop_rm_int4<bits<8> opc, string OpcodeStr, Intrinsic IntId> { OpSize; } +multiclass SS41I_binop_rm_int8_y<bits<8> opc, string OpcodeStr, + Intrinsic IntId> { + def Yrr : SS48I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src), + 
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR256:$dst, (IntId VR128:$src))]>, OpSize; + + def Yrm : SS48I<opc, MRMSrcMem, (outs VR256:$dst), (ins i32mem:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR256:$dst, + (IntId (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))]>, + OpSize; +} + let Predicates = [HasAVX] in { defm VPMOVSXBD : SS41I_binop_rm_int4<0x21, "vpmovsxbd", int_x86_sse41_pmovsxbd>, VEX; @@ -5122,35 +5777,46 @@ defm VPMOVZXWQ : SS41I_binop_rm_int4<0x34, "vpmovzxwq", int_x86_sse41_pmovzxwq>, VEX; } +let Predicates = [HasAVX2] in { +defm VPMOVSXBD : SS41I_binop_rm_int8_y<0x21, "vpmovsxbd", + int_x86_avx2_pmovsxbd>, VEX; +defm VPMOVSXWQ : SS41I_binop_rm_int8_y<0x24, "vpmovsxwq", + int_x86_avx2_pmovsxwq>, VEX; +defm VPMOVZXBD : SS41I_binop_rm_int8_y<0x31, "vpmovzxbd", + int_x86_avx2_pmovzxbd>, VEX; +defm VPMOVZXWQ : SS41I_binop_rm_int8_y<0x34, "vpmovzxwq", + int_x86_avx2_pmovzxwq>, VEX; +} + defm PMOVSXBD : SS41I_binop_rm_int4<0x21, "pmovsxbd", int_x86_sse41_pmovsxbd>; defm PMOVSXWQ : SS41I_binop_rm_int4<0x24, "pmovsxwq", int_x86_sse41_pmovsxwq>; defm PMOVZXBD : SS41I_binop_rm_int4<0x31, "pmovzxbd", int_x86_sse41_pmovzxbd>; defm PMOVZXWQ : SS41I_binop_rm_int4<0x34, "pmovzxwq", int_x86_sse41_pmovzxwq>; -let Predicates = [HasSSE41] in { +let Predicates = [HasAVX] in { // Common patterns involving scalar load def : Pat<(int_x86_sse41_pmovsxbd (vzmovl_v4i32 addr:$src)), - (PMOVSXBDrm addr:$src)>; + (VPMOVSXBDrm addr:$src)>; def : Pat<(int_x86_sse41_pmovsxwq (vzmovl_v4i32 addr:$src)), - (PMOVSXWQrm addr:$src)>; + (VPMOVSXWQrm addr:$src)>; def : Pat<(int_x86_sse41_pmovzxbd (vzmovl_v4i32 addr:$src)), - (PMOVZXBDrm addr:$src)>; + (VPMOVZXBDrm addr:$src)>; def : Pat<(int_x86_sse41_pmovzxwq (vzmovl_v4i32 addr:$src)), - (PMOVZXWQrm addr:$src)>; + (VPMOVZXWQrm addr:$src)>; } -let Predicates = [HasAVX] in { +let Predicates = [HasSSE41] in { // Common patterns involving scalar load def : Pat<(int_x86_sse41_pmovsxbd (vzmovl_v4i32 addr:$src)), - (VPMOVSXBDrm addr:$src)>; + (PMOVSXBDrm addr:$src)>; def : Pat<(int_x86_sse41_pmovsxwq (vzmovl_v4i32 addr:$src)), - (VPMOVSXWQrm addr:$src)>; + (PMOVSXWQrm addr:$src)>; def : Pat<(int_x86_sse41_pmovzxbd (vzmovl_v4i32 addr:$src)), - (VPMOVZXBDrm addr:$src)>; + (PMOVZXBDrm addr:$src)>; def : Pat<(int_x86_sse41_pmovzxwq (vzmovl_v4i32 addr:$src)), - (VPMOVZXWQrm addr:$src)>; + (PMOVZXWQrm addr:$src)>; } multiclass SS41I_binop_rm_int2<bits<8> opc, string OpcodeStr, Intrinsic IntId> { @@ -5166,39 +5832,59 @@ multiclass SS41I_binop_rm_int2<bits<8> opc, string OpcodeStr, Intrinsic IntId> { OpSize; } +multiclass SS41I_binop_rm_int4_y<bits<8> opc, string OpcodeStr, + Intrinsic IntId> { + def Yrr : SS48I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR256:$dst, (IntId VR128:$src))]>, OpSize; + + // Expecting a i16 load any extended to i32 value. 
+ def Yrm : SS48I<opc, MRMSrcMem, (outs VR256:$dst), (ins i16mem:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR256:$dst, (IntId (bitconvert + (v4i32 (scalar_to_vector (loadi32 addr:$src))))))]>, + OpSize; +} + let Predicates = [HasAVX] in { defm VPMOVSXBQ : SS41I_binop_rm_int2<0x22, "vpmovsxbq", int_x86_sse41_pmovsxbq>, VEX; defm VPMOVZXBQ : SS41I_binop_rm_int2<0x32, "vpmovzxbq", int_x86_sse41_pmovzxbq>, VEX; } +let Predicates = [HasAVX2] in { +defm VPMOVSXBQ : SS41I_binop_rm_int4_y<0x22, "vpmovsxbq", + int_x86_avx2_pmovsxbq>, VEX; +defm VPMOVZXBQ : SS41I_binop_rm_int4_y<0x32, "vpmovzxbq", + int_x86_avx2_pmovzxbq>, VEX; +} defm PMOVSXBQ : SS41I_binop_rm_int2<0x22, "pmovsxbq", int_x86_sse41_pmovsxbq>; defm PMOVZXBQ : SS41I_binop_rm_int2<0x32, "pmovzxbq", int_x86_sse41_pmovzxbq>; -let Predicates = [HasSSE41] in { +let Predicates = [HasAVX] in { // Common patterns involving scalar load def : Pat<(int_x86_sse41_pmovsxbq (bitconvert (v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))))), - (PMOVSXBQrm addr:$src)>; + (VPMOVSXBQrm addr:$src)>; def : Pat<(int_x86_sse41_pmovzxbq (bitconvert (v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))))), - (PMOVZXBQrm addr:$src)>; + (VPMOVZXBQrm addr:$src)>; } -let Predicates = [HasAVX] in { +let Predicates = [HasSSE41] in { // Common patterns involving scalar load def : Pat<(int_x86_sse41_pmovsxbq (bitconvert (v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))))), - (VPMOVSXBQrm addr:$src)>; + (PMOVSXBQrm addr:$src)>; def : Pat<(int_x86_sse41_pmovzxbq (bitconvert (v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))))), - (VPMOVZXBQrm addr:$src)>; + (PMOVZXBQrm addr:$src)>; } //===----------------------------------------------------------------------===// @@ -5213,6 +5899,7 @@ multiclass SS41I_extract8<bits<8> opc, string OpcodeStr> { "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set GR32:$dst, (X86pextrb (v16i8 VR128:$src1), imm:$src2))]>, OpSize; + let neverHasSideEffects = 1, mayStore = 1 in def mr : SS4AIi8<opc, MRMDestMem, (outs), (ins i8mem:$dst, VR128:$src1, i32i8imm:$src2), !strconcat(OpcodeStr, @@ -5235,6 +5922,7 @@ defm PEXTRB : SS41I_extract8<0x14, "pextrb">; /// SS41I_extract16 - SSE 4.1 extract 16 bits to memory destination multiclass SS41I_extract16<bits<8> opc, string OpcodeStr> { + let neverHasSideEffects = 1, mayStore = 1 in def mr : SS4AIi8<opc, MRMDestMem, (outs), (ins i16mem:$dst, VR128:$src1, i32i8imm:$src2), !strconcat(OpcodeStr, @@ -5311,26 +5999,28 @@ multiclass SS41I_extractf32<bits<8> opc, string OpcodeStr> { addr:$dst)]>, OpSize; } -let Predicates = [HasAVX] in { - defm VEXTRACTPS : SS41I_extractf32<0x17, "vextractps">, VEX; - def VEXTRACTPSrr64 : SS4AIi8<0x17, MRMDestReg, (outs GR64:$dst), - (ins VR128:$src1, i32i8imm:$src2), - "vextractps \t{$src2, $src1, $dst|$dst, $src1, $src2}", - []>, OpSize, VEX; +let ExeDomain = SSEPackedSingle in { + let Predicates = [HasAVX] in { + defm VEXTRACTPS : SS41I_extractf32<0x17, "vextractps">, VEX; + def VEXTRACTPSrr64 : SS4AIi8<0x17, MRMDestReg, (outs GR64:$dst), + (ins VR128:$src1, i32i8imm:$src2), + "vextractps \t{$src2, $src1, $dst|$dst, $src1, $src2}", + []>, OpSize, VEX; + } + defm EXTRACTPS : SS41I_extractf32<0x17, "extractps">; } -defm EXTRACTPS : SS41I_extractf32<0x17, "extractps">; // Also match an EXTRACTPS store when the store is done as f32 instead of i32. 
def : Pat<(store (f32 (bitconvert (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2))), addr:$dst), - (EXTRACTPSmr addr:$dst, VR128:$src1, imm:$src2)>, - Requires<[HasSSE41]>; + (VEXTRACTPSmr addr:$dst, VR128:$src1, imm:$src2)>, + Requires<[HasAVX]>; def : Pat<(store (f32 (bitconvert (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2))), addr:$dst), - (VEXTRACTPSmr addr:$dst, VR128:$src1, imm:$src2)>, - Requires<[HasAVX]>; + (EXTRACTPSmr addr:$dst, VR128:$src1, imm:$src2)>, + Requires<[HasSSE41]>; //===----------------------------------------------------------------------===// // SSE4.1 - Insert Instructions @@ -5439,17 +6129,12 @@ multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1> { imm:$src3))]>, OpSize; } -let Constraints = "$src1 = $dst" in - defm INSERTPS : SS41I_insertf32<0x21, "insertps">; -let Predicates = [HasAVX] in - defm VINSERTPS : SS41I_insertf32<0x21, "vinsertps", 0>, VEX_4V; - -def : Pat<(int_x86_sse41_insertps VR128:$src1, VR128:$src2, imm:$src3), - (VINSERTPSrr VR128:$src1, VR128:$src2, imm:$src3)>, - Requires<[HasAVX]>; -def : Pat<(int_x86_sse41_insertps VR128:$src1, VR128:$src2, imm:$src3), - (INSERTPSrr VR128:$src1, VR128:$src2, imm:$src3)>, - Requires<[HasSSE41]>; +let ExeDomain = SSEPackedSingle in { + let Predicates = [HasAVX] in + defm VINSERTPS : SS41I_insertf32<0x21, "vinsertps", 0>, VEX_4V; + let Constraints = "$src1 = $dst" in + defm INSERTPS : SS41I_insertf32<0x21, "insertps">; +} //===----------------------------------------------------------------------===// // SSE4.1 - Round Instructions @@ -5459,6 +6144,7 @@ multiclass sse41_fp_unop_rm<bits<8> opcps, bits<8> opcpd, string OpcodeStr, X86MemOperand x86memop, RegisterClass RC, PatFrag mem_frag32, PatFrag mem_frag64, Intrinsic V4F32Int, Intrinsic V2F64Int> { +let ExeDomain = SSEPackedSingle in { // Intrinsic operation, reg. // Vector intrinsic operation, reg def PSr : SS4AIi8<opcps, MRMSrcReg, @@ -5469,15 +6155,16 @@ multiclass sse41_fp_unop_rm<bits<8> opcps, bits<8> opcpd, string OpcodeStr, OpSize; // Vector intrinsic operation, mem - def PSm : Ii8<opcps, MRMSrcMem, + def PSm : SS4AIi8<opcps, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src1, i32i8imm:$src2), !strconcat(OpcodeStr, "ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set RC:$dst, (V4F32Int (mem_frag32 addr:$src1),imm:$src2))]>, - TA, OpSize, - Requires<[HasSSE41]>; + OpSize; +} // ExeDomain = SSEPackedSingle +let ExeDomain = SSEPackedDouble in { // Vector intrinsic operation, reg def PDr : SS4AIi8<opcpd, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, i32i8imm:$src2), @@ -5494,46 +6181,26 @@ multiclass sse41_fp_unop_rm<bits<8> opcps, bits<8> opcpd, string OpcodeStr, [(set RC:$dst, (V2F64Int (mem_frag64 addr:$src1),imm:$src2))]>, OpSize; -} - -multiclass sse41_fp_unop_rm_avx_p<bits<8> opcps, bits<8> opcpd, - RegisterClass RC, X86MemOperand x86memop, string OpcodeStr> { - // Intrinsic operation, reg. 
- // Vector intrinsic operation, reg - def PSr_AVX : SS4AIi8<opcps, MRMSrcReg, - (outs RC:$dst), (ins RC:$src1, i32i8imm:$src2), - !strconcat(OpcodeStr, - "ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - []>, OpSize; - - // Vector intrinsic operation, mem - def PSm_AVX : Ii8<opcps, MRMSrcMem, - (outs RC:$dst), (ins x86memop:$src1, i32i8imm:$src2), - !strconcat(OpcodeStr, - "ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - []>, TA, OpSize, Requires<[HasSSE41]>; - - // Vector intrinsic operation, reg - def PDr_AVX : SS4AIi8<opcpd, MRMSrcReg, - (outs RC:$dst), (ins RC:$src1, i32i8imm:$src2), - !strconcat(OpcodeStr, - "pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - []>, OpSize; - - // Vector intrinsic operation, mem - def PDm_AVX : SS4AIi8<opcpd, MRMSrcMem, - (outs RC:$dst), (ins x86memop:$src1, i32i8imm:$src2), - !strconcat(OpcodeStr, - "pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - []>, OpSize; +} // ExeDomain = SSEPackedDouble } multiclass sse41_fp_binop_rm<bits<8> opcss, bits<8> opcsd, string OpcodeStr, Intrinsic F32Int, Intrinsic F64Int, bit Is2Addr = 1> { - // Intrinsic operation, reg. +let ExeDomain = GenericDomain in { + // Operation, reg. def SSr : SS4AIi8<opcss, MRMSrcReg, + (outs FR32:$dst), (ins FR32:$src1, FR32:$src2, i32i8imm:$src3), + !if(Is2Addr, + !strconcat(OpcodeStr, + "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + !strconcat(OpcodeStr, + "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), + []>, OpSize; + + // Intrinsic operation, reg. + def SSr_Int : SS4AIi8<opcss, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32i8imm:$src3), !if(Is2Addr, !strconcat(OpcodeStr, @@ -5555,8 +6222,18 @@ multiclass sse41_fp_binop_rm<bits<8> opcss, bits<8> opcsd, (F32Int VR128:$src1, sse_load_f32:$src2, imm:$src3))]>, OpSize; - // Intrinsic operation, reg. + // Operation, reg. def SDr : SS4AIi8<opcsd, MRMSrcReg, + (outs FR64:$dst), (ins FR64:$src1, FR64:$src2, i32i8imm:$src3), + !if(Is2Addr, + !strconcat(OpcodeStr, + "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + !strconcat(OpcodeStr, + "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), + []>, OpSize; + + // Intrinsic operation, reg. + def SDr_Int : SS4AIi8<opcsd, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32i8imm:$src3), !if(Is2Addr, !strconcat(OpcodeStr, @@ -5577,37 +6254,7 @@ multiclass sse41_fp_binop_rm<bits<8> opcss, bits<8> opcsd, [(set VR128:$dst, (F64Int VR128:$src1, sse_load_f64:$src2, imm:$src3))]>, OpSize; -} - -multiclass sse41_fp_binop_rm_avx_s<bits<8> opcss, bits<8> opcsd, - string OpcodeStr> { - // Intrinsic operation, reg. - def SSr_AVX : SS4AIi8<opcss, MRMSrcReg, - (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32i8imm:$src3), - !strconcat(OpcodeStr, - "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), - []>, OpSize; - - // Intrinsic operation, mem. - def SSm_AVX : SS4AIi8<opcss, MRMSrcMem, - (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2, i32i8imm:$src3), - !strconcat(OpcodeStr, - "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), - []>, OpSize; - - // Intrinsic operation, reg. - def SDr_AVX : SS4AIi8<opcsd, MRMSrcReg, - (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32i8imm:$src3), - !strconcat(OpcodeStr, - "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), - []>, OpSize; - - // Intrinsic operation, mem. 
- def SDm_AVX : SS4AIi8<opcsd, MRMSrcMem, - (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2, i32i8imm:$src3), - !strconcat(OpcodeStr, - "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), - []>, OpSize; +} // ExeDomain = GenericDomain } // FP round - roundss, roundps, roundsd, roundpd @@ -5625,12 +6272,26 @@ let Predicates = [HasAVX] in { int_x86_sse41_round_ss, int_x86_sse41_round_sd, 0>, VEX_4V, VEX_LIG; - // Instructions for the assembler - defm VROUND : sse41_fp_unop_rm_avx_p<0x08, 0x09, VR128, f128mem, "vround">, - VEX; - defm VROUNDY : sse41_fp_unop_rm_avx_p<0x08, 0x09, VR256, f256mem, "vround">, - VEX; - defm VROUND : sse41_fp_binop_rm_avx_s<0x0A, 0x0B, "vround">, VEX_4V, VEX_LIG; + def : Pat<(ffloor FR32:$src), + (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x1))>; + def : Pat<(f64 (ffloor FR64:$src)), + (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x1))>; + def : Pat<(f32 (fnearbyint FR32:$src)), + (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xC))>; + def : Pat<(f64 (fnearbyint FR64:$src)), + (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xC))>; + def : Pat<(f32 (fceil FR32:$src)), + (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x2))>; + def : Pat<(f64 (fceil FR64:$src)), + (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x2))>; + def : Pat<(f32 (frint FR32:$src)), + (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x4))>; + def : Pat<(f64 (frint FR64:$src)), + (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x4))>; + def : Pat<(f32 (ftrunc FR32:$src)), + (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x3))>; + def : Pat<(f64 (ftrunc FR64:$src)), + (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x3))>; } defm ROUND : sse41_fp_unop_rm<0x08, 0x09, "round", f128mem, VR128, @@ -5640,6 +6301,27 @@ let Constraints = "$src1 = $dst" in defm ROUND : sse41_fp_binop_rm<0x0A, 0x0B, "round", int_x86_sse41_round_ss, int_x86_sse41_round_sd>; +def : Pat<(ffloor FR32:$src), + (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x1))>; +def : Pat<(f64 (ffloor FR64:$src)), + (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x1))>; +def : Pat<(f32 (fnearbyint FR32:$src)), + (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xC))>; +def : Pat<(f64 (fnearbyint FR64:$src)), + (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xC))>; +def : Pat<(f32 (fceil FR32:$src)), + (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x2))>; +def : Pat<(f64 (fceil FR64:$src)), + (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x2))>; +def : Pat<(f32 (frint FR32:$src)), + (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x4))>; +def : Pat<(f64 (frint FR64:$src)), + (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x4))>; +def : Pat<(f32 (ftrunc FR32:$src)), + (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x3))>; +def : Pat<(f64 (ftrunc FR64:$src)), + (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x3))>; + //===----------------------------------------------------------------------===// // SSE4.1 - Packed Bit Test //===----------------------------------------------------------------------===// @@ -5649,11 +6331,11 @@ defm ROUND : sse41_fp_binop_rm<0x0A, 0x0B, "round", let Defs = [EFLAGS], Predicates = [HasAVX] in { def VPTESTrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2), "vptest\t{$src2, $src1|$src1, $src2}", - [(set EFLAGS, (X86ptest VR128:$src1, (v4f32 VR128:$src2)))]>, + [(set EFLAGS, (X86ptest VR128:$src1, (v2i64 VR128:$src2)))]>, OpSize, VEX; def VPTESTrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2), "vptest\t{$src2, $src1|$src1, $src2}", - [(set 
EFLAGS,(X86ptest VR128:$src1, (memopv4f32 addr:$src2)))]>, + [(set EFLAGS,(X86ptest VR128:$src1, (memopv2i64 addr:$src2)))]>, OpSize, VEX; def VPTESTYrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR256:$src1, VR256:$src2), @@ -5668,12 +6350,12 @@ def VPTESTYrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR256:$src1, i256mem:$src2), let Defs = [EFLAGS] in { def PTESTrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2), - "ptest \t{$src2, $src1|$src1, $src2}", - [(set EFLAGS, (X86ptest VR128:$src1, (v4f32 VR128:$src2)))]>, + "ptest\t{$src2, $src1|$src1, $src2}", + [(set EFLAGS, (X86ptest VR128:$src1, (v2i64 VR128:$src2)))]>, OpSize; def PTESTrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2), - "ptest \t{$src2, $src1|$src1, $src2}", - [(set EFLAGS, (X86ptest VR128:$src1, (memopv4f32 addr:$src2)))]>, + "ptest\t{$src2, $src1|$src1, $src2}", + [(set EFLAGS, (X86ptest VR128:$src1, (memopv2i64 addr:$src2)))]>, OpSize; } @@ -5690,11 +6372,15 @@ multiclass avx_bittest<bits<8> opc, string OpcodeStr, RegisterClass RC, } let Defs = [EFLAGS], Predicates = [HasAVX] in { +let ExeDomain = SSEPackedSingle in { defm VTESTPS : avx_bittest<0x0E, "vtestps", VR128, f128mem, memopv4f32, v4f32>; defm VTESTPSY : avx_bittest<0x0E, "vtestps", VR256, f256mem, memopv8f32, v8f32>; +} +let ExeDomain = SSEPackedDouble in { defm VTESTPD : avx_bittest<0x0F, "vtestpd", VR128, f128mem, memopv2f64, v2f64>; defm VTESTPDY : avx_bittest<0x0F, "vtestpd", VR256, f256mem, memopv4f64, v4f64>; } +} //===----------------------------------------------------------------------===// // SSE4.1 - Misc Instructions @@ -5743,7 +6429,7 @@ multiclass SS41I_unop_rm_int_v16<bits<8> opc, string OpcodeStr, !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set VR128:$dst, (IntId128 - (bitconvert (memopv8i16 addr:$src))))]>, OpSize; + (bitconvert (memopv2i64 addr:$src))))]>, OpSize; } let Predicates = [HasAVX] in @@ -5769,15 +6455,29 @@ multiclass SS41I_binop_rm_int<bits<8> opc, string OpcodeStr, !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), [(set VR128:$dst, (IntId128 VR128:$src1, - (bitconvert (memopv16i8 addr:$src2))))]>, OpSize; + (bitconvert (memopv2i64 addr:$src2))))]>, OpSize; +} + +/// SS41I_binop_rm_int - Simple SSE 4.1 binary operator +multiclass SS41I_binop_rm_int_y<bits<8> opc, string OpcodeStr, + Intrinsic IntId256> { + let isCommutable = 1 in + def Yrr : SS48I<opc, MRMSrcReg, (outs VR256:$dst), + (ins VR256:$src1, VR256:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR256:$dst, (IntId256 VR256:$src1, VR256:$src2))]>, OpSize; + def Yrm : SS48I<opc, MRMSrcMem, (outs VR256:$dst), + (ins VR256:$src1, i256mem:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR256:$dst, + (IntId256 VR256:$src1, + (bitconvert (memopv4i64 addr:$src2))))]>, OpSize; } let Predicates = [HasAVX] in { let isCommutable = 0 in defm VPACKUSDW : SS41I_binop_rm_int<0x2B, "vpackusdw", int_x86_sse41_packusdw, 0>, VEX_4V; - defm VPCMPEQQ : SS41I_binop_rm_int<0x29, "vpcmpeqq", int_x86_sse41_pcmpeqq, - 0>, VEX_4V; defm VPMINSB : SS41I_binop_rm_int<0x38, "vpminsb", int_x86_sse41_pminsb, 0>, VEX_4V; defm VPMINSD : SS41I_binop_rm_int<0x39, "vpminsd", int_x86_sse41_pminsd, @@ -5796,17 +6496,35 @@ let Predicates = [HasAVX] in { 0>, VEX_4V; defm VPMULDQ : SS41I_binop_rm_int<0x28, "vpmuldq", int_x86_sse41_pmuldq, 0>, VEX_4V; +} - def : Pat<(v2i64 (X86pcmpeqq VR128:$src1, VR128:$src2)), - (VPCMPEQQrr VR128:$src1, VR128:$src2)>; - def : Pat<(v2i64 (X86pcmpeqq 
VR128:$src1, (memop addr:$src2))), - (VPCMPEQQrm VR128:$src1, addr:$src2)>; +let Predicates = [HasAVX2] in { + let isCommutable = 0 in + defm VPACKUSDW : SS41I_binop_rm_int_y<0x2B, "vpackusdw", + int_x86_avx2_packusdw>, VEX_4V; + defm VPMINSB : SS41I_binop_rm_int_y<0x38, "vpminsb", + int_x86_avx2_pmins_b>, VEX_4V; + defm VPMINSD : SS41I_binop_rm_int_y<0x39, "vpminsd", + int_x86_avx2_pmins_d>, VEX_4V; + defm VPMINUD : SS41I_binop_rm_int_y<0x3B, "vpminud", + int_x86_avx2_pminu_d>, VEX_4V; + defm VPMINUW : SS41I_binop_rm_int_y<0x3A, "vpminuw", + int_x86_avx2_pminu_w>, VEX_4V; + defm VPMAXSB : SS41I_binop_rm_int_y<0x3C, "vpmaxsb", + int_x86_avx2_pmaxs_b>, VEX_4V; + defm VPMAXSD : SS41I_binop_rm_int_y<0x3D, "vpmaxsd", + int_x86_avx2_pmaxs_d>, VEX_4V; + defm VPMAXUD : SS41I_binop_rm_int_y<0x3F, "vpmaxud", + int_x86_avx2_pmaxu_d>, VEX_4V; + defm VPMAXUW : SS41I_binop_rm_int_y<0x3E, "vpmaxuw", + int_x86_avx2_pmaxu_w>, VEX_4V; + defm VPMULDQ : SS41I_binop_rm_int_y<0x28, "vpmuldq", + int_x86_avx2_pmul_dq>, VEX_4V; } let Constraints = "$src1 = $dst" in { let isCommutable = 0 in defm PACKUSDW : SS41I_binop_rm_int<0x2B, "packusdw", int_x86_sse41_packusdw>; - defm PCMPEQQ : SS41I_binop_rm_int<0x29, "pcmpeqq", int_x86_sse41_pcmpeqq>; defm PMINSB : SS41I_binop_rm_int<0x38, "pminsb", int_x86_sse41_pminsb>; defm PMINSD : SS41I_binop_rm_int<0x39, "pminsd", int_x86_sse41_pminsd>; defm PMINUD : SS41I_binop_rm_int<0x3B, "pminud", int_x86_sse41_pminud>; @@ -5818,36 +6536,46 @@ let Constraints = "$src1 = $dst" in { defm PMULDQ : SS41I_binop_rm_int<0x28, "pmuldq", int_x86_sse41_pmuldq>; } -def : Pat<(v2i64 (X86pcmpeqq VR128:$src1, VR128:$src2)), - (PCMPEQQrr VR128:$src1, VR128:$src2)>; -def : Pat<(v2i64 (X86pcmpeqq VR128:$src1, (memop addr:$src2))), - (PCMPEQQrm VR128:$src1, addr:$src2)>; - /// SS48I_binop_rm - Simple SSE41 binary operator. 
multiclass SS48I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, - ValueType OpVT, bit Is2Addr = 1> { + ValueType OpVT, RegisterClass RC, PatFrag memop_frag, + X86MemOperand x86memop, bit Is2Addr = 1> { let isCommutable = 1 in - def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst), - (ins VR128:$src1, VR128:$src2), + def rr : SS48I<opc, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2), !if(Is2Addr, !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), - [(set VR128:$dst, (OpVT (OpNode VR128:$src1, VR128:$src2)))]>, - OpSize; - def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst), - (ins VR128:$src1, i128mem:$src2), + [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>, OpSize; + def rm : SS48I<opc, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, x86memop:$src2), !if(Is2Addr, !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), - [(set VR128:$dst, (OpNode VR128:$src1, - (bc_v4i32 (memopv2i64 addr:$src2))))]>, - OpSize; + [(set RC:$dst, + (OpVT (OpNode RC:$src1, + (bitconvert (memop_frag addr:$src2)))))]>, OpSize; } -let Predicates = [HasAVX] in - defm VPMULLD : SS48I_binop_rm<0x40, "vpmulld", mul, v4i32, 0>, VEX_4V; -let Constraints = "$src1 = $dst" in - defm PMULLD : SS48I_binop_rm<0x40, "pmulld", mul, v4i32>; +let Predicates = [HasAVX] in { + defm VPMULLD : SS48I_binop_rm<0x40, "vpmulld", mul, v4i32, VR128, + memopv2i64, i128mem, 0>, VEX_4V; + defm VPCMPEQQ : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v2i64, VR128, + memopv2i64, i128mem, 0>, VEX_4V; +} +let Predicates = [HasAVX2] in { + defm VPMULLDY : SS48I_binop_rm<0x40, "vpmulld", mul, v8i32, VR256, + memopv4i64, i256mem, 0>, VEX_4V; + defm VPCMPEQQY : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v4i64, VR256, + memopv4i64, i256mem, 0>, VEX_4V; +} + +let Constraints = "$src1 = $dst" in { + defm PMULLD : SS48I_binop_rm<0x40, "pmulld", mul, v4i32, VR128, + memopv2i64, i128mem>; + defm PCMPEQQ : SS48I_binop_rm<0x29, "pcmpeqq", X86pcmpeq, v2i64, VR128, + memopv2i64, i128mem>; +} /// SS41I_binop_rmi_int - SSE 4.1 binary operator with 8-bit immediate multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr, @@ -5878,77 +6606,106 @@ multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr, let Predicates = [HasAVX] in { let isCommutable = 0 in { - defm VBLENDPS : SS41I_binop_rmi_int<0x0C, "vblendps", int_x86_sse41_blendps, - VR128, memopv16i8, i128mem, 0>, VEX_4V; - defm VBLENDPD : SS41I_binop_rmi_int<0x0D, "vblendpd", int_x86_sse41_blendpd, - VR128, memopv16i8, i128mem, 0>, VEX_4V; - defm VBLENDPSY : SS41I_binop_rmi_int<0x0C, "vblendps", - int_x86_avx_blend_ps_256, VR256, memopv32i8, i256mem, 0>, VEX_4V; - defm VBLENDPDY : SS41I_binop_rmi_int<0x0D, "vblendpd", - int_x86_avx_blend_pd_256, VR256, memopv32i8, i256mem, 0>, VEX_4V; + let ExeDomain = SSEPackedSingle in { + defm VBLENDPS : SS41I_binop_rmi_int<0x0C, "vblendps", int_x86_sse41_blendps, + VR128, memopv4f32, i128mem, 0>, VEX_4V; + defm VBLENDPSY : SS41I_binop_rmi_int<0x0C, "vblendps", + int_x86_avx_blend_ps_256, VR256, memopv8f32, i256mem, 0>, VEX_4V; + } + let ExeDomain = SSEPackedDouble in { + defm VBLENDPD : SS41I_binop_rmi_int<0x0D, "vblendpd", int_x86_sse41_blendpd, + VR128, memopv2f64, i128mem, 0>, VEX_4V; + defm VBLENDPDY : SS41I_binop_rmi_int<0x0D, "vblendpd", + int_x86_avx_blend_pd_256, VR256, memopv4f64, i256mem, 0>, VEX_4V; + } defm VPBLENDW : SS41I_binop_rmi_int<0x0E, "vpblendw", int_x86_sse41_pblendw, - VR128, memopv16i8, 
i128mem, 0>, VEX_4V; + VR128, memopv2i64, i128mem, 0>, VEX_4V; defm VMPSADBW : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_sse41_mpsadbw, - VR128, memopv16i8, i128mem, 0>, VEX_4V; + VR128, memopv2i64, i128mem, 0>, VEX_4V; } + let ExeDomain = SSEPackedSingle in defm VDPPS : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_sse41_dpps, - VR128, memopv16i8, i128mem, 0>, VEX_4V; + VR128, memopv4f32, i128mem, 0>, VEX_4V; + let ExeDomain = SSEPackedDouble in defm VDPPD : SS41I_binop_rmi_int<0x41, "vdppd", int_x86_sse41_dppd, - VR128, memopv16i8, i128mem, 0>, VEX_4V; + VR128, memopv2f64, i128mem, 0>, VEX_4V; + let ExeDomain = SSEPackedSingle in defm VDPPSY : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_avx_dp_ps_256, - VR256, memopv32i8, i256mem, 0>, VEX_4V; + VR256, memopv8f32, i256mem, 0>, VEX_4V; +} + +let Predicates = [HasAVX2] in { + let isCommutable = 0 in { + defm VPBLENDWY : SS41I_binop_rmi_int<0x0E, "vpblendw", int_x86_avx2_pblendw, + VR256, memopv4i64, i256mem, 0>, VEX_4V; + defm VMPSADBWY : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_avx2_mpsadbw, + VR256, memopv4i64, i256mem, 0>, VEX_4V; + } } let Constraints = "$src1 = $dst" in { let isCommutable = 0 in { + let ExeDomain = SSEPackedSingle in defm BLENDPS : SS41I_binop_rmi_int<0x0C, "blendps", int_x86_sse41_blendps, - VR128, memopv16i8, i128mem>; + VR128, memopv4f32, i128mem>; + let ExeDomain = SSEPackedDouble in defm BLENDPD : SS41I_binop_rmi_int<0x0D, "blendpd", int_x86_sse41_blendpd, - VR128, memopv16i8, i128mem>; + VR128, memopv2f64, i128mem>; defm PBLENDW : SS41I_binop_rmi_int<0x0E, "pblendw", int_x86_sse41_pblendw, - VR128, memopv16i8, i128mem>; + VR128, memopv2i64, i128mem>; defm MPSADBW : SS41I_binop_rmi_int<0x42, "mpsadbw", int_x86_sse41_mpsadbw, - VR128, memopv16i8, i128mem>; + VR128, memopv2i64, i128mem>; } + let ExeDomain = SSEPackedSingle in defm DPPS : SS41I_binop_rmi_int<0x40, "dpps", int_x86_sse41_dpps, - VR128, memopv16i8, i128mem>; + VR128, memopv4f32, i128mem>; + let ExeDomain = SSEPackedDouble in defm DPPD : SS41I_binop_rmi_int<0x41, "dppd", int_x86_sse41_dppd, - VR128, memopv16i8, i128mem>; + VR128, memopv2f64, i128mem>; } /// SS41I_quaternary_int_avx - AVX SSE 4.1 with 4 operators -let Predicates = [HasAVX] in { multiclass SS41I_quaternary_int_avx<bits<8> opc, string OpcodeStr, RegisterClass RC, X86MemOperand x86memop, PatFrag mem_frag, Intrinsic IntId> { - def rr : I<opc, MRMSrcReg, (outs RC:$dst), + def rr : Ii8<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2, RC:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set RC:$dst, (IntId RC:$src1, RC:$src2, RC:$src3))], - SSEPackedInt>, OpSize, TA, VEX_4V, VEX_I8IMM; + IIC_DEFAULT, SSEPackedInt>, OpSize, TA, VEX_4V, VEX_I8IMM; - def rm : I<opc, MRMSrcMem, (outs RC:$dst), + def rm : Ii8<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2, RC:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set RC:$dst, (IntId RC:$src1, (bitconvert (mem_frag addr:$src2)), RC:$src3))], - SSEPackedInt>, OpSize, TA, VEX_4V, VEX_I8IMM; -} + IIC_DEFAULT, SSEPackedInt>, OpSize, TA, VEX_4V, VEX_I8IMM; } +let Predicates = [HasAVX] in { +let ExeDomain = SSEPackedDouble in { defm VBLENDVPD : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR128, i128mem, - memopv16i8, int_x86_sse41_blendvpd>; -defm VBLENDVPS : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR128, i128mem, - memopv16i8, int_x86_sse41_blendvps>; -defm VPBLENDVB : SS41I_quaternary_int_avx<0x4C, "vpblendvb", VR128, i128mem, - memopv16i8, 
int_x86_sse41_pblendvb>; + memopv2f64, int_x86_sse41_blendvpd>; defm VBLENDVPDY : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR256, i256mem, - memopv32i8, int_x86_avx_blendv_pd_256>; + memopv4f64, int_x86_avx_blendv_pd_256>; +} // ExeDomain = SSEPackedDouble +let ExeDomain = SSEPackedSingle in { +defm VBLENDVPS : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR128, i128mem, + memopv4f32, int_x86_sse41_blendvps>; defm VBLENDVPSY : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR256, i256mem, - memopv32i8, int_x86_avx_blendv_ps_256>; + memopv8f32, int_x86_avx_blendv_ps_256>; +} // ExeDomain = SSEPackedSingle +defm VPBLENDVB : SS41I_quaternary_int_avx<0x4C, "vpblendvb", VR128, i128mem, + memopv2i64, int_x86_sse41_pblendvb>; +} + +let Predicates = [HasAVX2] in { +defm VPBLENDVBY : SS41I_quaternary_int_avx<0x4C, "vpblendvb", VR256, i256mem, + memopv4i64, int_x86_avx2_pblendvb>; +} let Predicates = [HasAVX] in { def : Pat<(v16i8 (vselect (v16i8 VR128:$mask), (v16i8 VR128:$src1), @@ -5978,11 +6735,28 @@ let Predicates = [HasAVX] in { def : Pat<(v4f64 (vselect (v4i64 VR256:$mask), (v4f64 VR256:$src1), (v4f64 VR256:$src2))), (VBLENDVPDYrr VR256:$src2, VR256:$src1, VR256:$mask)>; + + def : Pat<(v8f32 (X86Blendps (v8f32 VR256:$src1), (v8f32 VR256:$src2), + (imm:$mask))), + (VBLENDPSYrri VR256:$src2, VR256:$src1, imm:$mask)>; + def : Pat<(v4f64 (X86Blendpd (v4f64 VR256:$src1), (v4f64 VR256:$src2), + (imm:$mask))), + (VBLENDPDYrri VR256:$src2, VR256:$src1, imm:$mask)>; +} + +let Predicates = [HasAVX2] in { + def : Pat<(v32i8 (vselect (v32i8 VR256:$mask), (v32i8 VR256:$src1), + (v32i8 VR256:$src2))), + (VPBLENDVBYrr VR256:$src2, VR256:$src1, VR256:$mask)>; + def : Pat<(v16i16 (X86Blendpw (v16i16 VR256:$src1), (v16i16 VR256:$src2), + (imm:$mask))), + (VPBLENDWYrri VR256:$src2, VR256:$src1, imm:$mask)>; } /// SS41I_ternary_int - SSE 4.1 ternary operator let Uses = [XMM0], Constraints = "$src1 = $dst" in { - multiclass SS41I_ternary_int<bits<8> opc, string OpcodeStr, Intrinsic IntId> { + multiclass SS41I_ternary_int<bits<8> opc, string OpcodeStr, PatFrag mem_frag, + Intrinsic IntId> { def rr0 : SS48I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), !strconcat(OpcodeStr, @@ -5996,13 +6770,18 @@ let Uses = [XMM0], Constraints = "$src1 = $dst" in { "\t{$src2, $dst|$dst, $src2}"), [(set VR128:$dst, (IntId VR128:$src1, - (bitconvert (memopv16i8 addr:$src2)), XMM0))]>, OpSize; + (bitconvert (mem_frag addr:$src2)), XMM0))]>, OpSize; } } -defm BLENDVPD : SS41I_ternary_int<0x15, "blendvpd", int_x86_sse41_blendvpd>; -defm BLENDVPS : SS41I_ternary_int<0x14, "blendvps", int_x86_sse41_blendvps>; -defm PBLENDVB : SS41I_ternary_int<0x10, "pblendvb", int_x86_sse41_pblendvb>; +let ExeDomain = SSEPackedDouble in +defm BLENDVPD : SS41I_ternary_int<0x15, "blendvpd", memopv2f64, + int_x86_sse41_blendvpd>; +let ExeDomain = SSEPackedSingle in +defm BLENDVPS : SS41I_ternary_int<0x14, "blendvps", memopv4f32, + int_x86_sse41_blendvps>; +defm PBLENDVB : SS41I_ternary_int<0x10, "pblendvb", memopv2i64, + int_x86_sse41_pblendvb>; let Predicates = [HasSSE41] in { def : Pat<(v16i8 (vselect (v16i8 XMM0), (v16i8 VR128:$src1), @@ -6020,6 +6799,17 @@ let Predicates = [HasSSE41] in { def : Pat<(v2f64 (vselect (v2i64 XMM0), (v2f64 VR128:$src1), (v2f64 VR128:$src2))), (BLENDVPDrr0 VR128:$src2, VR128:$src1)>; + + def : Pat<(v8i16 (X86Blendpw (v8i16 VR128:$src1), (v8i16 VR128:$src2), + (imm:$mask))), + (VPBLENDWrri VR128:$src2, VR128:$src1, imm:$mask)>; + def : Pat<(v4f32 (X86Blendps (v4f32 VR128:$src1), (v4f32 VR128:$src2), + 
(imm:$mask))), + (VBLENDPSrri VR128:$src2, VR128:$src1, imm:$mask)>; + def : Pat<(v2f64 (X86Blendpd (v2f64 VR128:$src1), (v2f64 VR128:$src2), + (imm:$mask))), + (VBLENDPDrri VR128:$src2, VR128:$src1, imm:$mask)>; + } let Predicates = [HasAVX] in @@ -6027,6 +6817,11 @@ def VMOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), "vmovntdqa\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse41_movntdqa addr:$src))]>, OpSize, VEX; +let Predicates = [HasAVX2] in +def VMOVNTDQAYrm : SS48I<0x2A, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src), + "vmovntdqa\t{$src, $dst|$dst, $src}", + [(set VR256:$dst, (int_x86_avx2_movntdqa addr:$src))]>, + OpSize, VEX; def MOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), "movntdqa\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse41_movntdqa addr:$src))]>, @@ -6036,43 +6831,37 @@ def MOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), // SSE4.2 - Compare Instructions //===----------------------------------------------------------------------===// -/// SS42I_binop_rm_int - Simple SSE 4.2 binary operator -multiclass SS42I_binop_rm_int<bits<8> opc, string OpcodeStr, - Intrinsic IntId128, bit Is2Addr = 1> { - def rr : SS428I<opc, MRMSrcReg, (outs VR128:$dst), - (ins VR128:$src1, VR128:$src2), +/// SS42I_binop_rm - Simple SSE 4.2 binary operator +multiclass SS42I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, + ValueType OpVT, RegisterClass RC, PatFrag memop_frag, + X86MemOperand x86memop, bit Is2Addr = 1> { + def rr : SS428I<opc, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2), !if(Is2Addr, !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), - [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>, + [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>, OpSize; - def rm : SS428I<opc, MRMSrcMem, (outs VR128:$dst), - (ins VR128:$src1, i128mem:$src2), + def rm : SS428I<opc, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, x86memop:$src2), !if(Is2Addr, !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), - [(set VR128:$dst, - (IntId128 VR128:$src1, - (bitconvert (memopv16i8 addr:$src2))))]>, OpSize; + [(set RC:$dst, + (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>, OpSize; } -let Predicates = [HasAVX] in { - defm VPCMPGTQ : SS42I_binop_rm_int<0x37, "vpcmpgtq", int_x86_sse42_pcmpgtq, - 0>, VEX_4V; +let Predicates = [HasAVX] in + defm VPCMPGTQ : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v2i64, VR128, + memopv2i64, i128mem, 0>, VEX_4V; - def : Pat<(v2i64 (X86pcmpgtq VR128:$src1, VR128:$src2)), - (VPCMPGTQrr VR128:$src1, VR128:$src2)>; - def : Pat<(v2i64 (X86pcmpgtq VR128:$src1, (memop addr:$src2))), - (VPCMPGTQrm VR128:$src1, addr:$src2)>; -} +let Predicates = [HasAVX2] in + defm VPCMPGTQY : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v4i64, VR256, + memopv4i64, i256mem, 0>, VEX_4V; let Constraints = "$src1 = $dst" in - defm PCMPGTQ : SS42I_binop_rm_int<0x37, "pcmpgtq", int_x86_sse42_pcmpgtq>; - -def : Pat<(v2i64 (X86pcmpgtq VR128:$src1, VR128:$src2)), - (PCMPGTQrr VR128:$src1, VR128:$src2)>; -def : Pat<(v2i64 (X86pcmpgtq VR128:$src1, (memop addr:$src2))), - (PCMPGTQrm VR128:$src1, addr:$src2)>; + defm PCMPGTQ : SS42I_binop_rm<0x37, "pcmpgtq", X86pcmpgt, v2i64, VR128, + memopv2i64, i128mem>; //===----------------------------------------------------------------------===// // SSE4.2 - String/text Processing Instructions @@ -6091,23 
+6880,26 @@ multiclass pseudo_pcmpistrm<string asm> { } let Defs = [EFLAGS], usesCustomInserter = 1 in { + let AddedComplexity = 1 in + defm VPCMPISTRM128 : pseudo_pcmpistrm<"#VPCMPISTRM128">, Requires<[HasAVX]>; defm PCMPISTRM128 : pseudo_pcmpistrm<"#PCMPISTRM128">, Requires<[HasSSE42]>; - defm VPCMPISTRM128 : pseudo_pcmpistrm<"#VPCMPISTRM128">, Requires<[HasAVX]>; } -let Defs = [XMM0, EFLAGS], Predicates = [HasAVX] in { +let Defs = [XMM0, EFLAGS], neverHasSideEffects = 1, Predicates = [HasAVX] in { def VPCMPISTRM128rr : SS42AI<0x62, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2, i8imm:$src3), "vpcmpistrm\t{$src3, $src2, $src1|$src1, $src2, $src3}", []>, OpSize, VEX; + let mayLoad = 1 in def VPCMPISTRM128rm : SS42AI<0x62, MRMSrcMem, (outs), (ins VR128:$src1, i128mem:$src2, i8imm:$src3), "vpcmpistrm\t{$src3, $src2, $src1|$src1, $src2, $src3}", []>, OpSize, VEX; } -let Defs = [XMM0, EFLAGS] in { +let Defs = [XMM0, EFLAGS], neverHasSideEffects = 1 in { def PCMPISTRM128rr : SS42AI<0x62, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2, i8imm:$src3), "pcmpistrm\t{$src3, $src2, $src1|$src1, $src2, $src3}", []>, OpSize; + let mayLoad = 1 in def PCMPISTRM128rm : SS42AI<0x62, MRMSrcMem, (outs), (ins VR128:$src1, i128mem:$src2, i8imm:$src3), "pcmpistrm\t{$src3, $src2, $src1|$src1, $src2, $src3}", []>, OpSize; @@ -6126,24 +6918,27 @@ multiclass pseudo_pcmpestrm<string asm> { } let Defs = [EFLAGS], Uses = [EAX, EDX], usesCustomInserter = 1 in { + let AddedComplexity = 1 in + defm VPCMPESTRM128 : pseudo_pcmpestrm<"#VPCMPESTRM128">, Requires<[HasAVX]>; defm PCMPESTRM128 : pseudo_pcmpestrm<"#PCMPESTRM128">, Requires<[HasSSE42]>; - defm VPCMPESTRM128 : pseudo_pcmpestrm<"#VPCMPESTRM128">, Requires<[HasAVX]>; } let Predicates = [HasAVX], - Defs = [XMM0, EFLAGS], Uses = [EAX, EDX] in { + Defs = [XMM0, EFLAGS], Uses = [EAX, EDX], neverHasSideEffects = 1 in { def VPCMPESTRM128rr : SS42AI<0x60, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src3, i8imm:$src5), "vpcmpestrm\t{$src5, $src3, $src1|$src1, $src3, $src5}", []>, OpSize, VEX; + let mayLoad = 1 in def VPCMPESTRM128rm : SS42AI<0x60, MRMSrcMem, (outs), (ins VR128:$src1, i128mem:$src3, i8imm:$src5), "vpcmpestrm\t{$src5, $src3, $src1|$src1, $src3, $src5}", []>, OpSize, VEX; } -let Defs = [XMM0, EFLAGS], Uses = [EAX, EDX] in { +let Defs = [XMM0, EFLAGS], Uses = [EAX, EDX], neverHasSideEffects = 1 in { def PCMPESTRM128rr : SS42AI<0x60, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src3, i8imm:$src5), "pcmpestrm\t{$src5, $src3, $src1|$src1, $src3, $src5}", []>, OpSize; + let mayLoad = 1 in def PCMPESTRM128rm : SS42AI<0x60, MRMSrcMem, (outs), (ins VR128:$src1, i128mem:$src3, i8imm:$src5), "pcmpestrm\t{$src5, $src3, $src1|$src1, $src3, $src5}", []>, OpSize; @@ -6318,8 +7113,7 @@ multiclass AESI_binop_rm_int<bits<8> opc, string OpcodeStr, !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), [(set VR128:$dst, - (IntId128 VR128:$src1, - (bitconvert (memopv16i8 addr:$src2))))]>, OpSize; + (IntId128 VR128:$src1, (memopv2i64 addr:$src2)))]>, OpSize; } // Perform One Round of an AES Encryption/Decryption Flow @@ -6345,44 +7139,6 @@ let Constraints = "$src1 = $dst" in { int_x86_aesni_aesdeclast>; } -let Predicates = [HasAES] in { - def : Pat<(v2i64 (int_x86_aesni_aesenc VR128:$src1, VR128:$src2)), - (AESENCrr VR128:$src1, VR128:$src2)>; - def : Pat<(v2i64 (int_x86_aesni_aesenc VR128:$src1, (memop addr:$src2))), - (AESENCrm VR128:$src1, addr:$src2)>; - def : Pat<(v2i64 (int_x86_aesni_aesenclast 
VR128:$src1, VR128:$src2)), - (AESENCLASTrr VR128:$src1, VR128:$src2)>; - def : Pat<(v2i64 (int_x86_aesni_aesenclast VR128:$src1, (memop addr:$src2))), - (AESENCLASTrm VR128:$src1, addr:$src2)>; - def : Pat<(v2i64 (int_x86_aesni_aesdec VR128:$src1, VR128:$src2)), - (AESDECrr VR128:$src1, VR128:$src2)>; - def : Pat<(v2i64 (int_x86_aesni_aesdec VR128:$src1, (memop addr:$src2))), - (AESDECrm VR128:$src1, addr:$src2)>; - def : Pat<(v2i64 (int_x86_aesni_aesdeclast VR128:$src1, VR128:$src2)), - (AESDECLASTrr VR128:$src1, VR128:$src2)>; - def : Pat<(v2i64 (int_x86_aesni_aesdeclast VR128:$src1, (memop addr:$src2))), - (AESDECLASTrm VR128:$src1, addr:$src2)>; -} - -let Predicates = [HasAVX, HasAES], AddedComplexity = 20 in { - def : Pat<(v2i64 (int_x86_aesni_aesenc VR128:$src1, VR128:$src2)), - (VAESENCrr VR128:$src1, VR128:$src2)>; - def : Pat<(v2i64 (int_x86_aesni_aesenc VR128:$src1, (memop addr:$src2))), - (VAESENCrm VR128:$src1, addr:$src2)>; - def : Pat<(v2i64 (int_x86_aesni_aesenclast VR128:$src1, VR128:$src2)), - (VAESENCLASTrr VR128:$src1, VR128:$src2)>; - def : Pat<(v2i64 (int_x86_aesni_aesenclast VR128:$src1, (memop addr:$src2))), - (VAESENCLASTrm VR128:$src1, addr:$src2)>; - def : Pat<(v2i64 (int_x86_aesni_aesdec VR128:$src1, VR128:$src2)), - (VAESDECrr VR128:$src1, VR128:$src2)>; - def : Pat<(v2i64 (int_x86_aesni_aesdec VR128:$src1, (memop addr:$src2))), - (VAESDECrm VR128:$src1, addr:$src2)>; - def : Pat<(v2i64 (int_x86_aesni_aesdeclast VR128:$src1, VR128:$src2)), - (VAESDECLASTrr VR128:$src1, VR128:$src2)>; - def : Pat<(v2i64 (int_x86_aesni_aesdeclast VR128:$src1, (memop addr:$src2))), - (VAESDECLASTrm VR128:$src1, addr:$src2)>; -} - // Perform the AES InvMixColumn Transformation let Predicates = [HasAVX, HasAES] in { def VAESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst), @@ -6394,8 +7150,7 @@ let Predicates = [HasAVX, HasAES] in { def VAESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src1), "vaesimc\t{$src1, $dst|$dst, $src1}", - [(set VR128:$dst, - (int_x86_aesni_aesimc (bitconvert (memopv2i64 addr:$src1))))]>, + [(set VR128:$dst, (int_x86_aesni_aesimc (memopv2i64 addr:$src1)))]>, OpSize, VEX; } def AESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst), @@ -6407,8 +7162,7 @@ def AESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst), def AESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src1), "aesimc\t{$src1, $dst|$dst, $src1}", - [(set VR128:$dst, - (int_x86_aesni_aesimc (bitconvert (memopv2i64 addr:$src1))))]>, + [(set VR128:$dst, (int_x86_aesni_aesimc (memopv2i64 addr:$src1)))]>, OpSize; // AES Round Key Generation Assist @@ -6423,8 +7177,7 @@ let Predicates = [HasAVX, HasAES] in { (ins i128mem:$src1, i8imm:$src2), "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set VR128:$dst, - (int_x86_aesni_aeskeygenassist (bitconvert (memopv2i64 addr:$src1)), - imm:$src2))]>, + (int_x86_aesni_aeskeygenassist (memopv2i64 addr:$src1), imm:$src2))]>, OpSize, VEX; } def AESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst), @@ -6437,8 +7190,7 @@ def AESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src1, i8imm:$src2), "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set VR128:$dst, - (int_x86_aesni_aeskeygenassist (bitconvert (memopv2i64 addr:$src1)), - imm:$src2))]>, + (int_x86_aesni_aeskeygenassist (memopv2i64 addr:$src1), imm:$src2))]>, OpSize; //===----------------------------------------------------------------------===// @@ -6446,28 +7198,32 @@ def AESKEYGENASSIST128rm : AESAI<0xDF, 
MRMSrcMem, (outs VR128:$dst), //===----------------------------------------------------------------------===// // Carry-less Multiplication instructions -let Constraints = "$src1 = $dst" in { -def PCLMULQDQrr : CLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst), +let neverHasSideEffects = 1 in { +// AVX carry-less Multiplication instructions +def VPCLMULQDQrr : AVXCLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i8imm:$src3), - "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}", + "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", []>; -def PCLMULQDQrm : CLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst), +let mayLoad = 1 in +def VPCLMULQDQrm : AVXCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2, i8imm:$src3), - "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}", + "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", []>; -} -// AVX carry-less Multiplication instructions -def VPCLMULQDQrr : AVXCLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst), +let Constraints = "$src1 = $dst" in { +def PCLMULQDQrr : CLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i8imm:$src3), - "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}", []>; -def VPCLMULQDQrm : AVXCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst), +let mayLoad = 1 in +def PCLMULQDQrm : CLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2, i8imm:$src3), - "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}", []>; +} // Constraints = "$src1 = $dst" +} // neverHasSideEffects = 1 multiclass pclmul_alias<string asm, int immop> { @@ -6506,51 +7262,60 @@ class avx_broadcast<bits<8> opc, string OpcodeStr, RegisterClass RC, !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set RC:$dst, (Int addr:$src))]>, VEX; -def VBROADCASTSS : avx_broadcast<0x18, "vbroadcastss", VR128, f32mem, - int_x86_avx_vbroadcastss>; -def VBROADCASTSSY : avx_broadcast<0x18, "vbroadcastss", VR256, f32mem, - int_x86_avx_vbroadcastss_256>; -def VBROADCASTSD : avx_broadcast<0x19, "vbroadcastsd", VR256, f64mem, - int_x86_avx_vbroadcast_sd_256>; +// AVX2 adds register forms +class avx2_broadcast_reg<bits<8> opc, string OpcodeStr, RegisterClass RC, + Intrinsic Int> : + AVX28I<opc, MRMSrcReg, (outs RC:$dst), (ins VR128:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set RC:$dst, (Int VR128:$src))]>, VEX; + +let ExeDomain = SSEPackedSingle in { + def VBROADCASTSSrm : avx_broadcast<0x18, "vbroadcastss", VR128, f32mem, + int_x86_avx_vbroadcast_ss>; + def VBROADCASTSSYrm : avx_broadcast<0x18, "vbroadcastss", VR256, f32mem, + int_x86_avx_vbroadcast_ss_256>; +} +let ExeDomain = SSEPackedDouble in +def VBROADCASTSDrm : avx_broadcast<0x19, "vbroadcastsd", VR256, f64mem, + int_x86_avx_vbroadcast_sd_256>; def VBROADCASTF128 : avx_broadcast<0x1A, "vbroadcastf128", VR256, f128mem, int_x86_avx_vbroadcastf128_pd_256>; +let ExeDomain = SSEPackedSingle in { + def VBROADCASTSSrr : avx2_broadcast_reg<0x18, "vbroadcastss", VR128, + int_x86_avx2_vbroadcast_ss_ps>; + def VBROADCASTSSYrr : avx2_broadcast_reg<0x18, "vbroadcastss", VR256, + int_x86_avx2_vbroadcast_ss_ps_256>; +} +let ExeDomain = SSEPackedDouble in +def VBROADCASTSDrr : avx2_broadcast_reg<0x19, "vbroadcastsd", VR256, + int_x86_avx2_vbroadcast_sd_pd_256>; + +let Predicates = [HasAVX2] in +def VBROADCASTI128 : avx_broadcast<0x5A, "vbroadcasti128", VR256, i128mem, + 
int_x86_avx2_vbroadcasti128>; + +let Predicates = [HasAVX] in def : Pat<(int_x86_avx_vbroadcastf128_ps_256 addr:$src), (VBROADCASTF128 addr:$src)>; -def : Pat<(v8i32 (X86VBroadcast (loadi32 addr:$src))), - (VBROADCASTSSY addr:$src)>; -def : Pat<(v4i64 (X86VBroadcast (loadi64 addr:$src))), - (VBROADCASTSD addr:$src)>; -def : Pat<(v8f32 (X86VBroadcast (loadf32 addr:$src))), - (VBROADCASTSSY addr:$src)>; -def : Pat<(v4f64 (X86VBroadcast (loadf64 addr:$src))), - (VBROADCASTSD addr:$src)>; - -def : Pat<(v4f32 (X86VBroadcast (loadf32 addr:$src))), - (VBROADCASTSS addr:$src)>; -def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))), - (VBROADCASTSS addr:$src)>; //===----------------------------------------------------------------------===// // VINSERTF128 - Insert packed floating-point values // +let neverHasSideEffects = 1, ExeDomain = SSEPackedSingle in { def VINSERTF128rr : AVXAIi8<0x18, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src1, VR128:$src2, i8imm:$src3), "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", []>, VEX_4V; +let mayLoad = 1 in def VINSERTF128rm : AVXAIi8<0x18, MRMSrcMem, (outs VR256:$dst), (ins VR256:$src1, f128mem:$src2, i8imm:$src3), "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", []>, VEX_4V; +} -def : Pat<(int_x86_avx_vinsertf128_pd_256 VR256:$src1, VR128:$src2, imm:$src3), - (VINSERTF128rr VR256:$src1, VR128:$src2, imm:$src3)>; -def : Pat<(int_x86_avx_vinsertf128_ps_256 VR256:$src1, VR128:$src2, imm:$src3), - (VINSERTF128rr VR256:$src1, VR128:$src2, imm:$src3)>; -def : Pat<(int_x86_avx_vinsertf128_si_256 VR256:$src1, VR128:$src2, imm:$src3), - (VINSERTF128rr VR256:$src1, VR128:$src2, imm:$src3)>; - +let Predicates = [HasAVX] in { def : Pat<(vinsertf128_insert:$ins (v8f32 VR256:$src1), (v4f32 VR128:$src2), (i32 imm)), (VINSERTF128rr VR256:$src1, VR128:$src2, @@ -6559,11 +7324,11 @@ def : Pat<(vinsertf128_insert:$ins (v4f64 VR256:$src1), (v2f64 VR128:$src2), (i32 imm)), (VINSERTF128rr VR256:$src1, VR128:$src2, (INSERT_get_vinsertf128_imm VR256:$ins))>; -def : Pat<(vinsertf128_insert:$ins (v8i32 VR256:$src1), (v4i32 VR128:$src2), +def : Pat<(vinsertf128_insert:$ins (v4i64 VR256:$src1), (v2i64 VR128:$src2), (i32 imm)), (VINSERTF128rr VR256:$src1, VR128:$src2, (INSERT_get_vinsertf128_imm VR256:$ins))>; -def : Pat<(vinsertf128_insert:$ins (v4i64 VR256:$src1), (v2i64 VR128:$src2), +def : Pat<(vinsertf128_insert:$ins (v8i32 VR256:$src1), (v4i32 VR128:$src2), (i32 imm)), (VINSERTF128rr VR256:$src1, VR128:$src2, (INSERT_get_vinsertf128_imm VR256:$ins))>; @@ -6576,18 +7341,54 @@ def : Pat<(vinsertf128_insert:$ins (v16i16 VR256:$src1), (v8i16 VR128:$src2), (VINSERTF128rr VR256:$src1, VR128:$src2, (INSERT_get_vinsertf128_imm VR256:$ins))>; +def : Pat<(vinsertf128_insert:$ins (v8f32 VR256:$src1), (loadv4f32 addr:$src2), + (i32 imm)), + (VINSERTF128rm VR256:$src1, addr:$src2, + (INSERT_get_vinsertf128_imm VR256:$ins))>; +def : Pat<(vinsertf128_insert:$ins (v4f64 VR256:$src1), (loadv2f64 addr:$src2), + (i32 imm)), + (VINSERTF128rm VR256:$src1, addr:$src2, + (INSERT_get_vinsertf128_imm VR256:$ins))>; +def : Pat<(vinsertf128_insert:$ins (v4i64 VR256:$src1), (loadv2i64 addr:$src2), + (i32 imm)), + (VINSERTF128rm VR256:$src1, addr:$src2, + (INSERT_get_vinsertf128_imm VR256:$ins))>; +} + //===----------------------------------------------------------------------===// // VEXTRACTF128 - Extract packed floating-point values // +let neverHasSideEffects = 1, ExeDomain = SSEPackedSingle in { def VEXTRACTF128rr : AVXAIi8<0x19, MRMDestReg, (outs VR128:$dst), 
(ins VR256:$src1, i8imm:$src2), "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, VEX; +let mayStore = 1 in def VEXTRACTF128mr : AVXAIi8<0x19, MRMDestMem, (outs), (ins f128mem:$dst, VR256:$src1, i8imm:$src2), "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, VEX; +} +// Extract and store. +let Predicates = [HasAVX] in { + def : Pat<(alignedstore (int_x86_avx_vextractf128_ps_256 VR256:$src1, imm:$src2), addr:$dst), + (VEXTRACTF128mr addr:$dst, VR256:$src1, imm:$src2)>; + def : Pat<(alignedstore (int_x86_avx_vextractf128_pd_256 VR256:$src1, imm:$src2), addr:$dst), + (VEXTRACTF128mr addr:$dst, VR256:$src1, imm:$src2)>; + def : Pat<(alignedstore (int_x86_avx_vextractf128_si_256 VR256:$src1, imm:$src2), addr:$dst), + (VEXTRACTF128mr addr:$dst, VR256:$src1, imm:$src2)>; + + def : Pat<(int_x86_sse_storeu_ps addr:$dst, (int_x86_avx_vextractf128_ps_256 VR256:$src1, imm:$src2)), + (VEXTRACTF128mr addr:$dst, VR256:$src1, imm:$src2)>; + def : Pat<(int_x86_sse2_storeu_pd addr:$dst, (int_x86_avx_vextractf128_pd_256 VR256:$src1, imm:$src2)), + (VEXTRACTF128mr addr:$dst, VR256:$src1, imm:$src2)>; + def : Pat<(int_x86_sse2_storeu_dq addr:$dst, (bc_v16i8 (int_x86_avx_vextractf128_si_256 VR256:$src1, imm:$src2))), + (VEXTRACTF128mr addr:$dst, VR256:$src1, imm:$src2)>; +} + +// AVX1 patterns +let Predicates = [HasAVX] in { def : Pat<(int_x86_avx_vextractf128_pd_256 VR256:$src1, imm:$src2), (VEXTRACTF128rr VR256:$src1, imm:$src2)>; def : Pat<(int_x86_avx_vextractf128_ps_256 VR256:$src1, imm:$src2), @@ -6604,14 +7405,14 @@ def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)), (v4f64 VR256:$src1), (EXTRACT_get_vextractf128_imm VR128:$ext)))>; def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)), - (v4i32 (VEXTRACTF128rr - (v8i32 VR256:$src1), - (EXTRACT_get_vextractf128_imm VR128:$ext)))>; -def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)), (v2i64 (VEXTRACTF128rr (v4i64 VR256:$src1), (EXTRACT_get_vextractf128_imm VR128:$ext)))>; def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)), + (v4i32 (VEXTRACTF128rr + (v8i32 VR256:$src1), + (EXTRACT_get_vextractf128_imm VR128:$ext)))>; +def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)), (v8i16 (VEXTRACTF128rr (v16i16 VR256:$src1), (EXTRACT_get_vextractf128_imm VR128:$ext)))>; @@ -6619,14 +7420,14 @@ def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)), (v16i8 (VEXTRACTF128rr (v32i8 VR256:$src1), (EXTRACT_get_vextractf128_imm VR128:$ext)))>; +} //===----------------------------------------------------------------------===// // VMASKMOV - Conditional SIMD Packed Loads and Stores // multiclass avx_movmask_rm<bits<8> opc_rm, bits<8> opc_mr, string OpcodeStr, Intrinsic IntLd, Intrinsic IntLd256, - Intrinsic IntSt, Intrinsic IntSt256, - PatFrag pf128, PatFrag pf256> { + Intrinsic IntSt, Intrinsic IntSt256> { def rm : AVX8I<opc_rm, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, f128mem:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), @@ -6647,26 +7448,26 @@ multiclass avx_movmask_rm<bits<8> opc_rm, bits<8> opc_mr, string OpcodeStr, [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>, VEX_4V; } +let ExeDomain = SSEPackedSingle in defm VMASKMOVPS : avx_movmask_rm<0x2C, 0x2E, "vmaskmovps", int_x86_avx_maskload_ps, int_x86_avx_maskload_ps_256, int_x86_avx_maskstore_ps, - int_x86_avx_maskstore_ps_256, - memopv4f32, memopv8f32>; + int_x86_avx_maskstore_ps_256>; +let ExeDomain = SSEPackedDouble in defm VMASKMOVPD : avx_movmask_rm<0x2D, 0x2F, "vmaskmovpd", int_x86_avx_maskload_pd, 
int_x86_avx_maskload_pd_256, int_x86_avx_maskstore_pd, - int_x86_avx_maskstore_pd_256, - memopv2f64, memopv4f64>; + int_x86_avx_maskstore_pd_256>; //===----------------------------------------------------------------------===// // VPERMIL - Permute Single and Double Floating-Point Values // multiclass avx_permil<bits<8> opc_rm, bits<8> opc_rmi, string OpcodeStr, RegisterClass RC, X86MemOperand x86memop_f, - X86MemOperand x86memop_i, PatFrag f_frag, PatFrag i_frag, - Intrinsic IntVar, Intrinsic IntImm> { + X86MemOperand x86memop_i, PatFrag i_frag, + Intrinsic IntVar, ValueType vt> { def rr : AVX8I<opc_rm, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), @@ -6674,86 +7475,98 @@ multiclass avx_permil<bits<8> opc_rm, bits<8> opc_rmi, string OpcodeStr, def rm : AVX8I<opc_rm, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop_i:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set RC:$dst, (IntVar RC:$src1, (i_frag addr:$src2)))]>, VEX_4V; + [(set RC:$dst, (IntVar RC:$src1, + (bitconvert (i_frag addr:$src2))))]>, VEX_4V; def ri : AVXAIi8<opc_rmi, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, i8imm:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set RC:$dst, (IntImm RC:$src1, imm:$src2))]>, VEX; + [(set RC:$dst, (vt (X86VPermilp RC:$src1, (i8 imm:$src2))))]>, VEX; def mi : AVXAIi8<opc_rmi, MRMSrcMem, (outs RC:$dst), (ins x86memop_f:$src1, i8imm:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set RC:$dst, (IntImm (f_frag addr:$src1), imm:$src2))]>, VEX; -} - -defm VPERMILPS : avx_permil<0x0C, 0x04, "vpermilps", VR128, f128mem, i128mem, - memopv4f32, memopv4i32, - int_x86_avx_vpermilvar_ps, - int_x86_avx_vpermil_ps>; -defm VPERMILPSY : avx_permil<0x0C, 0x04, "vpermilps", VR256, f256mem, i256mem, - memopv8f32, memopv8i32, - int_x86_avx_vpermilvar_ps_256, - int_x86_avx_vpermil_ps_256>; -defm VPERMILPD : avx_permil<0x0D, 0x05, "vpermilpd", VR128, f128mem, i128mem, - memopv2f64, memopv2i64, - int_x86_avx_vpermilvar_pd, - int_x86_avx_vpermil_pd>; -defm VPERMILPDY : avx_permil<0x0D, 0x05, "vpermilpd", VR256, f256mem, i256mem, - memopv4f64, memopv4i64, - int_x86_avx_vpermilvar_pd_256, - int_x86_avx_vpermil_pd_256>; - -def : Pat<(v8f32 (X86VPermilpsy VR256:$src1, (i8 imm:$imm))), - (VPERMILPSYri VR256:$src1, imm:$imm)>; -def : Pat<(v4f64 (X86VPermilpdy VR256:$src1, (i8 imm:$imm))), - (VPERMILPDYri VR256:$src1, imm:$imm)>; -def : Pat<(v8i32 (X86VPermilpsy VR256:$src1, (i8 imm:$imm))), + [(set RC:$dst, + (vt (X86VPermilp (memop addr:$src1), (i8 imm:$src2))))]>, VEX; +} + +let ExeDomain = SSEPackedSingle in { + defm VPERMILPS : avx_permil<0x0C, 0x04, "vpermilps", VR128, f128mem, i128mem, + memopv2i64, int_x86_avx_vpermilvar_ps, v4f32>; + defm VPERMILPSY : avx_permil<0x0C, 0x04, "vpermilps", VR256, f256mem, i256mem, + memopv4i64, int_x86_avx_vpermilvar_ps_256, v8f32>; +} +let ExeDomain = SSEPackedDouble in { + defm VPERMILPD : avx_permil<0x0D, 0x05, "vpermilpd", VR128, f128mem, i128mem, + memopv2i64, int_x86_avx_vpermilvar_pd, v2f64>; + defm VPERMILPDY : avx_permil<0x0D, 0x05, "vpermilpd", VR256, f256mem, i256mem, + memopv4i64, int_x86_avx_vpermilvar_pd_256, v4f64>; +} + +let Predicates = [HasAVX] in { +def : Pat<(v8i32 (X86VPermilp VR256:$src1, (i8 imm:$imm))), (VPERMILPSYri VR256:$src1, imm:$imm)>; -def : Pat<(v4i64 (X86VPermilpdy VR256:$src1, (i8 imm:$imm))), +def : Pat<(v4i64 (X86VPermilp VR256:$src1, (i8 imm:$imm))), (VPERMILPDYri VR256:$src1, 
imm:$imm)>; +def : Pat<(v8i32 (X86VPermilp (bc_v8i32 (memopv4i64 addr:$src1)), + (i8 imm:$imm))), + (VPERMILPSYmi addr:$src1, imm:$imm)>; +def : Pat<(v4i64 (X86VPermilp (memopv4i64 addr:$src1), (i8 imm:$imm))), + (VPERMILPDYmi addr:$src1, imm:$imm)>; + +def : Pat<(v2i64 (X86VPermilp VR128:$src1, (i8 imm:$imm))), + (VPERMILPDri VR128:$src1, imm:$imm)>; +def : Pat<(v2i64 (X86VPermilp (memopv2i64 addr:$src1), (i8 imm:$imm))), + (VPERMILPDmi addr:$src1, imm:$imm)>; +} //===----------------------------------------------------------------------===// // VPERM2F128 - Permute Floating-Point Values in 128-bit chunks // +let ExeDomain = SSEPackedSingle in { def VPERM2F128rr : AVXAIi8<0x06, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src1, VR256:$src2, i8imm:$src3), "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", - []>, VEX_4V; + [(set VR256:$dst, (v8f32 (X86VPerm2x128 VR256:$src1, VR256:$src2, + (i8 imm:$src3))))]>, VEX_4V; def VPERM2F128rm : AVXAIi8<0x06, MRMSrcMem, (outs VR256:$dst), (ins VR256:$src1, f256mem:$src2, i8imm:$src3), "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", - []>, VEX_4V; + [(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (memopv8f32 addr:$src2), + (i8 imm:$src3)))]>, VEX_4V; +} -def : Pat<(int_x86_avx_vperm2f128_ps_256 VR256:$src1, VR256:$src2, imm:$src3), - (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$src3)>; -def : Pat<(int_x86_avx_vperm2f128_pd_256 VR256:$src1, VR256:$src2, imm:$src3), - (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$src3)>; -def : Pat<(int_x86_avx_vperm2f128_si_256 VR256:$src1, VR256:$src2, imm:$src3), - (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$src3)>; - -def : Pat<(int_x86_avx_vperm2f128_ps_256 - VR256:$src1, (memopv8f32 addr:$src2), imm:$src3), - (VPERM2F128rm VR256:$src1, addr:$src2, imm:$src3)>; -def : Pat<(int_x86_avx_vperm2f128_pd_256 - VR256:$src1, (memopv4f64 addr:$src2), imm:$src3), - (VPERM2F128rm VR256:$src1, addr:$src2, imm:$src3)>; -def : Pat<(int_x86_avx_vperm2f128_si_256 - VR256:$src1, (memopv8i32 addr:$src2), imm:$src3), - (VPERM2F128rm VR256:$src1, addr:$src2, imm:$src3)>; - -def : Pat<(v8f32 (X86VPerm2f128 VR256:$src1, VR256:$src2, (i8 imm:$imm))), - (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>; -def : Pat<(v8i32 (X86VPerm2f128 VR256:$src1, VR256:$src2, (i8 imm:$imm))), +let Predicates = [HasAVX] in { +def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))), (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>; -def : Pat<(v4i64 (X86VPerm2f128 VR256:$src1, VR256:$src2, (i8 imm:$imm))), +def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))), (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>; -def : Pat<(v4f64 (X86VPerm2f128 VR256:$src1, VR256:$src2, (i8 imm:$imm))), +def : Pat<(v4f64 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))), (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>; -def : Pat<(v32i8 (X86VPerm2f128 VR256:$src1, VR256:$src2, (i8 imm:$imm))), +def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))), (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>; -def : Pat<(v16i16 (X86VPerm2f128 VR256:$src1, VR256:$src2, (i8 imm:$imm))), +def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))), (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>; +def : Pat<(v8f32 (X86VPerm2x128 VR256:$src1, + (memopv8f32 addr:$src2), (i8 imm:$imm))), + (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>; +def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1, + (bc_v8i32 (memopv4i64 addr:$src2)), (i8 imm:$imm))), + (VPERM2F128rm 
VR256:$src1, addr:$src2, imm:$imm)>; +def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1, + (memopv4i64 addr:$src2), (i8 imm:$imm))), + (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>; +def : Pat<(v4f64 (X86VPerm2x128 VR256:$src1, + (memopv4f64 addr:$src2), (i8 imm:$imm))), + (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>; +def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1, + (bc_v32i8 (memopv4i64 addr:$src2)), (i8 imm:$imm))), + (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>; +def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1, + (bc_v16i16 (memopv4i64 addr:$src2)), (i8 imm:$imm))), + (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>; +} + //===----------------------------------------------------------------------===// // VZERO - Zero YMM registers // @@ -6770,30 +7583,362 @@ let Defs = [YMM0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7, //===----------------------------------------------------------------------===// // Half precision conversion instructions -// +//===----------------------------------------------------------------------===// +multiclass f16c_ph2ps<RegisterClass RC, X86MemOperand x86memop, Intrinsic Int> { let Predicates = [HasAVX, HasF16C] in { - def VCVTPH2PSrm : I<0x13, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src), - "vcvtph2ps\t{$src, $dst|$dst, $src}", []>, T8, OpSize, VEX; - def VCVTPH2PSrr : I<0x13, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), - "vcvtph2ps\t{$src, $dst|$dst, $src}", []>, T8, OpSize, VEX; - def VCVTPH2PSYrm : I<0x13, MRMSrcMem, (outs VR256:$dst), (ins f128mem:$src), - "vcvtph2ps\t{$src, $dst|$dst, $src}", []>, T8, OpSize, VEX; - def VCVTPH2PSYrr : I<0x13, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src), - "vcvtph2ps\t{$src, $dst|$dst, $src}", []>, T8, OpSize, VEX; - def VCVTPS2PHmr : Ii8<0x1D, MRMDestMem, (outs f64mem:$dst), - (ins VR128:$src1, i32i8imm:$src2), - "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, - TA, OpSize, VEX; - def VCVTPS2PHrr : Ii8<0x1D, MRMDestReg, (outs VR128:$dst), - (ins VR128:$src1, i32i8imm:$src2), - "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, - TA, OpSize, VEX; - def VCVTPS2PHYmr : Ii8<0x1D, MRMDestMem, (outs f128mem:$dst), - (ins VR256:$src1, i32i8imm:$src2), - "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, - TA, OpSize, VEX; - def VCVTPS2PHYrr : Ii8<0x1D, MRMDestReg, (outs VR128:$dst), - (ins VR256:$src1, i32i8imm:$src2), - "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, - TA, OpSize, VEX; + def rr : I<0x13, MRMSrcReg, (outs RC:$dst), (ins VR128:$src), + "vcvtph2ps\t{$src, $dst|$dst, $src}", + [(set RC:$dst, (Int VR128:$src))]>, + T8, OpSize, VEX; + let neverHasSideEffects = 1, mayLoad = 1 in + def rm : I<0x13, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), + "vcvtph2ps\t{$src, $dst|$dst, $src}", []>, T8, OpSize, VEX; +} +} + +multiclass f16c_ps2ph<RegisterClass RC, X86MemOperand x86memop, Intrinsic Int> { +let Predicates = [HasAVX, HasF16C] in { + def rr : Ii8<0x1D, MRMDestReg, (outs VR128:$dst), + (ins RC:$src1, i32i8imm:$src2), + "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set VR128:$dst, (Int RC:$src1, imm:$src2))]>, + TA, OpSize, VEX; + let neverHasSideEffects = 1, mayLoad = 1 in + def mr : Ii8<0x1D, MRMDestMem, (outs x86memop:$dst), + (ins RC:$src1, i32i8imm:$src2), + "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, + TA, OpSize, VEX; +} +} + +defm VCVTPH2PS : f16c_ph2ps<VR128, f64mem, int_x86_vcvtph2ps_128>; +defm VCVTPH2PSY : f16c_ph2ps<VR256, f128mem, int_x86_vcvtph2ps_256>; +defm VCVTPS2PH : f16c_ps2ph<VR128, f64mem, int_x86_vcvtps2ph_128>; +defm 
VCVTPS2PHY : f16c_ps2ph<VR256, f128mem, int_x86_vcvtps2ph_256>; + +//===----------------------------------------------------------------------===// +// AVX2 Instructions +//===----------------------------------------------------------------------===// + +/// AVX2_binop_rmi_int - AVX2 binary operator with 8-bit immediate +multiclass AVX2_binop_rmi_int<bits<8> opc, string OpcodeStr, + Intrinsic IntId, RegisterClass RC, PatFrag memop_frag, + X86MemOperand x86memop> { + let isCommutable = 1 in + def rri : AVX2AIi8<opc, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2, u32u8imm:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set RC:$dst, (IntId RC:$src1, RC:$src2, imm:$src3))]>, + VEX_4V; + def rmi : AVX2AIi8<opc, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, x86memop:$src2, u32u8imm:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set RC:$dst, + (IntId RC:$src1, + (bitconvert (memop_frag addr:$src2)), imm:$src3))]>, + VEX_4V; } + +let isCommutable = 0 in { +defm VPBLENDD : AVX2_binop_rmi_int<0x02, "vpblendd", int_x86_avx2_pblendd_128, + VR128, memopv2i64, i128mem>; +defm VPBLENDDY : AVX2_binop_rmi_int<0x02, "vpblendd", int_x86_avx2_pblendd_256, + VR256, memopv4i64, i256mem>; +} + +//===----------------------------------------------------------------------===// +// VPBROADCAST - Load from memory and broadcast to all elements of the +// destination operand +// +multiclass avx2_broadcast<bits<8> opc, string OpcodeStr, + X86MemOperand x86memop, PatFrag ld_frag, + Intrinsic Int128, Intrinsic Int256> { + def rr : AVX28I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (Int128 VR128:$src))]>, VEX; + def rm : AVX28I<opc, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, + (Int128 (scalar_to_vector (ld_frag addr:$src))))]>, VEX; + def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR256:$dst, (Int256 VR128:$src))]>, VEX; + def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst), (ins x86memop:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR256:$dst, + (Int256 (scalar_to_vector (ld_frag addr:$src))))]>, VEX; +} + +defm VPBROADCASTB : avx2_broadcast<0x78, "vpbroadcastb", i8mem, loadi8, + int_x86_avx2_pbroadcastb_128, + int_x86_avx2_pbroadcastb_256>; +defm VPBROADCASTW : avx2_broadcast<0x79, "vpbroadcastw", i16mem, loadi16, + int_x86_avx2_pbroadcastw_128, + int_x86_avx2_pbroadcastw_256>; +defm VPBROADCASTD : avx2_broadcast<0x58, "vpbroadcastd", i32mem, loadi32, + int_x86_avx2_pbroadcastd_128, + int_x86_avx2_pbroadcastd_256>; +defm VPBROADCASTQ : avx2_broadcast<0x59, "vpbroadcastq", i64mem, loadi64, + int_x86_avx2_pbroadcastq_128, + int_x86_avx2_pbroadcastq_256>; + +let Predicates = [HasAVX2] in { + def : Pat<(v16i8 (X86VBroadcast (loadi8 addr:$src))), + (VPBROADCASTBrm addr:$src)>; + def : Pat<(v32i8 (X86VBroadcast (loadi8 addr:$src))), + (VPBROADCASTBYrm addr:$src)>; + def : Pat<(v8i16 (X86VBroadcast (loadi16 addr:$src))), + (VPBROADCASTWrm addr:$src)>; + def : Pat<(v16i16 (X86VBroadcast (loadi16 addr:$src))), + (VPBROADCASTWYrm addr:$src)>; + def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))), + (VPBROADCASTDrm addr:$src)>; + def : Pat<(v8i32 (X86VBroadcast (loadi32 addr:$src))), + (VPBROADCASTDYrm addr:$src)>; + def : Pat<(v2i64 (X86VBroadcast (loadi64 
addr:$src))), + (VPBROADCASTQrm addr:$src)>; + def : Pat<(v4i64 (X86VBroadcast (loadi64 addr:$src))), + (VPBROADCASTQYrm addr:$src)>; +} + +// AVX1 broadcast patterns +let Predicates = [HasAVX] in { +def : Pat<(v8i32 (X86VBroadcast (loadi32 addr:$src))), + (VBROADCASTSSYrm addr:$src)>; +def : Pat<(v4i64 (X86VBroadcast (loadi64 addr:$src))), + (VBROADCASTSDrm addr:$src)>; +def : Pat<(v8f32 (X86VBroadcast (loadf32 addr:$src))), + (VBROADCASTSSYrm addr:$src)>; +def : Pat<(v4f64 (X86VBroadcast (loadf64 addr:$src))), + (VBROADCASTSDrm addr:$src)>; + +def : Pat<(v4f32 (X86VBroadcast (loadf32 addr:$src))), + (VBROADCASTSSrm addr:$src)>; +def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))), + (VBROADCASTSSrm addr:$src)>; +} + +//===----------------------------------------------------------------------===// +// VPERM - Permute instructions +// + +multiclass avx2_perm<bits<8> opc, string OpcodeStr, PatFrag mem_frag, + Intrinsic Int> { + def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst), + (ins VR256:$src1, VR256:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR256:$dst, (Int VR256:$src1, VR256:$src2))]>, VEX_4V; + def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst), + (ins VR256:$src1, i256mem:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR256:$dst, (Int VR256:$src1, + (bitconvert (mem_frag addr:$src2))))]>, + VEX_4V; +} + +defm VPERMD : avx2_perm<0x36, "vpermd", memopv4i64, int_x86_avx2_permd>; +let ExeDomain = SSEPackedSingle in +defm VPERMPS : avx2_perm<0x16, "vpermps", memopv8f32, int_x86_avx2_permps>; + +multiclass avx2_perm_imm<bits<8> opc, string OpcodeStr, PatFrag mem_frag, + Intrinsic Int> { + def Yrr : AVX2AIi8<opc, MRMSrcReg, (outs VR256:$dst), + (ins VR256:$src1, i8imm:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR256:$dst, (Int VR256:$src1, imm:$src2))]>, VEX; + def Yrm : AVX2AIi8<opc, MRMSrcMem, (outs VR256:$dst), + (ins i256mem:$src1, i8imm:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR256:$dst, (Int (mem_frag addr:$src1), imm:$src2))]>, + VEX; +} + +defm VPERMQ : avx2_perm_imm<0x00, "vpermq", memopv4i64, int_x86_avx2_permq>, + VEX_W; +let ExeDomain = SSEPackedDouble in +defm VPERMPD : avx2_perm_imm<0x01, "vpermpd", memopv4f64, int_x86_avx2_permpd>, + VEX_W; + +//===----------------------------------------------------------------------===// +// VPERM2I128 - Permute Floating-Point Values in 128-bit chunks +// +let AddedComplexity = 1 in { +def VPERM2I128rr : AVX2AIi8<0x46, MRMSrcReg, (outs VR256:$dst), + (ins VR256:$src1, VR256:$src2, i8imm:$src3), + "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + [(set VR256:$dst, (v4i64 (X86VPerm2x128 VR256:$src1, VR256:$src2, + (i8 imm:$src3))))]>, VEX_4V; +def VPERM2I128rm : AVX2AIi8<0x46, MRMSrcMem, (outs VR256:$dst), + (ins VR256:$src1, f256mem:$src2, i8imm:$src3), + "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + [(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (memopv4i64 addr:$src2), + (i8 imm:$src3)))]>, VEX_4V; +} + +let Predicates = [HasAVX2], AddedComplexity = 1 in { +def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))), + (VPERM2I128rr VR256:$src1, VR256:$src2, imm:$imm)>; +def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))), + (VPERM2I128rr VR256:$src1, VR256:$src2, imm:$imm)>; +def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))), + (VPERM2I128rr VR256:$src1, 
VR256:$src2, imm:$imm)>; + +def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1, (bc_v32i8 (memopv4i64 addr:$src2)), + (i8 imm:$imm))), + (VPERM2I128rm VR256:$src1, addr:$src2, imm:$imm)>; +def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1, + (bc_v16i16 (memopv4i64 addr:$src2)), (i8 imm:$imm))), + (VPERM2I128rm VR256:$src1, addr:$src2, imm:$imm)>; +def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1, (bc_v8i32 (memopv4i64 addr:$src2)), + (i8 imm:$imm))), + (VPERM2I128rm VR256:$src1, addr:$src2, imm:$imm)>; +} + + +//===----------------------------------------------------------------------===// +// VINSERTI128 - Insert packed integer values +// +let neverHasSideEffects = 1 in { +def VINSERTI128rr : AVX2AIi8<0x38, MRMSrcReg, (outs VR256:$dst), + (ins VR256:$src1, VR128:$src2, i8imm:$src3), + "vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + []>, + VEX_4V; +def VINSERTI128rm : AVX2AIi8<0x38, MRMSrcMem, (outs VR256:$dst), + (ins VR256:$src1, i128mem:$src2, i8imm:$src3), + "vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + []>, VEX_4V; +} + +let Predicates = [HasAVX2], AddedComplexity = 1 in { +def : Pat<(vinsertf128_insert:$ins (v4i64 VR256:$src1), (v2i64 VR128:$src2), + (i32 imm)), + (VINSERTI128rr VR256:$src1, VR128:$src2, + (INSERT_get_vinsertf128_imm VR256:$ins))>; +def : Pat<(vinsertf128_insert:$ins (v8i32 VR256:$src1), (v4i32 VR128:$src2), + (i32 imm)), + (VINSERTI128rr VR256:$src1, VR128:$src2, + (INSERT_get_vinsertf128_imm VR256:$ins))>; +def : Pat<(vinsertf128_insert:$ins (v32i8 VR256:$src1), (v16i8 VR128:$src2), + (i32 imm)), + (VINSERTI128rr VR256:$src1, VR128:$src2, + (INSERT_get_vinsertf128_imm VR256:$ins))>; +def : Pat<(vinsertf128_insert:$ins (v16i16 VR256:$src1), (v8i16 VR128:$src2), + (i32 imm)), + (VINSERTI128rr VR256:$src1, VR128:$src2, + (INSERT_get_vinsertf128_imm VR256:$ins))>; +} + +//===----------------------------------------------------------------------===// +// VEXTRACTI128 - Extract packed integer values +// +def VEXTRACTI128rr : AVX2AIi8<0x39, MRMDestReg, (outs VR128:$dst), + (ins VR256:$src1, i8imm:$src2), + "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set VR128:$dst, + (int_x86_avx2_vextracti128 VR256:$src1, imm:$src2))]>, + VEX; +let neverHasSideEffects = 1, mayStore = 1 in +def VEXTRACTI128mr : AVX2AIi8<0x39, MRMDestMem, (outs), + (ins i128mem:$dst, VR256:$src1, i8imm:$src2), + "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, VEX; + +let Predicates = [HasAVX2], AddedComplexity = 1 in { +def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)), + (v2i64 (VEXTRACTI128rr + (v4i64 VR256:$src1), + (EXTRACT_get_vextractf128_imm VR128:$ext)))>; +def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)), + (v4i32 (VEXTRACTI128rr + (v8i32 VR256:$src1), + (EXTRACT_get_vextractf128_imm VR128:$ext)))>; +def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)), + (v8i16 (VEXTRACTI128rr + (v16i16 VR256:$src1), + (EXTRACT_get_vextractf128_imm VR128:$ext)))>; +def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)), + (v16i8 (VEXTRACTI128rr + (v32i8 VR256:$src1), + (EXTRACT_get_vextractf128_imm VR128:$ext)))>; +} + +//===----------------------------------------------------------------------===// +// VPMASKMOV - Conditional SIMD Integer Packed Loads and Stores +// +multiclass avx2_pmovmask<string OpcodeStr, + Intrinsic IntLd128, Intrinsic IntLd256, + Intrinsic IntSt128, Intrinsic IntSt256> { + def rm : AVX28I<0x8c, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, i128mem:$src2), + !strconcat(OpcodeStr, 
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR128:$dst, (IntLd128 addr:$src2, VR128:$src1))]>, VEX_4V; + def Yrm : AVX28I<0x8c, MRMSrcMem, (outs VR256:$dst), + (ins VR256:$src1, i256mem:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>, VEX_4V; + def mr : AVX28I<0x8e, MRMDestMem, (outs), + (ins i128mem:$dst, VR128:$src1, VR128:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(IntSt128 addr:$dst, VR128:$src1, VR128:$src2)]>, VEX_4V; + def Ymr : AVX28I<0x8e, MRMDestMem, (outs), + (ins i256mem:$dst, VR256:$src1, VR256:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>, VEX_4V; +} + +defm VPMASKMOVD : avx2_pmovmask<"vpmaskmovd", + int_x86_avx2_maskload_d, + int_x86_avx2_maskload_d_256, + int_x86_avx2_maskstore_d, + int_x86_avx2_maskstore_d_256>; +defm VPMASKMOVQ : avx2_pmovmask<"vpmaskmovq", + int_x86_avx2_maskload_q, + int_x86_avx2_maskload_q_256, + int_x86_avx2_maskstore_q, + int_x86_avx2_maskstore_q_256>, VEX_W; + + +//===----------------------------------------------------------------------===// +// Variable Bit Shifts +// +multiclass avx2_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode, + ValueType vt128, ValueType vt256> { + def rr : AVX28I<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR128:$dst, + (vt128 (OpNode VR128:$src1, (vt128 VR128:$src2))))]>, + VEX_4V; + def rm : AVX28I<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, i128mem:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR128:$dst, + (vt128 (OpNode VR128:$src1, + (vt128 (bitconvert (memopv2i64 addr:$src2))))))]>, + VEX_4V; + def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst), + (ins VR256:$src1, VR256:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR256:$dst, + (vt256 (OpNode VR256:$src1, (vt256 VR256:$src2))))]>, + VEX_4V; + def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst), + (ins VR256:$src1, i256mem:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR256:$dst, + (vt256 (OpNode VR256:$src1, + (vt256 (bitconvert (memopv4i64 addr:$src2))))))]>, + VEX_4V; +} + +defm VPSLLVD : avx2_var_shift<0x47, "vpsllvd", shl, v4i32, v8i32>; +defm VPSLLVQ : avx2_var_shift<0x47, "vpsllvq", shl, v2i64, v4i64>, VEX_W; +defm VPSRLVD : avx2_var_shift<0x45, "vpsrlvd", srl, v4i32, v8i32>; +defm VPSRLVQ : avx2_var_shift<0x45, "vpsrlvq", srl, v2i64, v4i64>, VEX_W; +defm VPSRAVD : avx2_var_shift<0x46, "vpsravd", sra, v4i32, v8i32>; diff --git a/contrib/llvm/lib/Target/X86/X86InstrSVM.td b/contrib/llvm/lib/Target/X86/X86InstrSVM.td new file mode 100644 index 0000000..757dcd0 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86InstrSVM.td @@ -0,0 +1,62 @@ +//===-- X86InstrSVM.td - SVM Instruction Set Extension -----*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the instructions that make up the AMD SVM instruction +// set. 
+// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// SVM instructions + +// 0F 01 D9 +def VMMCALL : I<0x01, MRM_D9, (outs), (ins), "vmmcall", []>, TB; + +// 0F 01 DC +def STGI : I<0x01, MRM_DC, (outs), (ins), "stgi", []>, TB; + +// 0F 01 DD +def CLGI : I<0x01, MRM_DD, (outs), (ins), "clgi", []>, TB; + +// 0F 01 DE +let Uses = [EAX] in +def SKINIT : I<0x01, MRM_DE, (outs), (ins), "skinit\t{%eax|EAX}", []>, TB; + +// 0F 01 D8 +let Uses = [EAX] in +def VMRUN32 : I<0x01, MRM_D8, (outs), (ins), + "vmrun\t{%eax|EAX}", []>, TB, Requires<[In32BitMode]>; +let Uses = [RAX] in +def VMRUN64 : I<0x01, MRM_D8, (outs), (ins), + "vmrun\t{%rax|RAX}", []>, TB, Requires<[In64BitMode]>; + +// 0F 01 DA +let Uses = [EAX] in +def VMLOAD32 : I<0x01, MRM_DA, (outs), (ins), + "vmload\t{%eax|EAX}", []>, TB, Requires<[In32BitMode]>; +let Uses = [RAX] in +def VMLOAD64 : I<0x01, MRM_DA, (outs), (ins), + "vmload\t{%rax|RAX}", []>, TB, Requires<[In64BitMode]>; + +// 0F 01 DB +let Uses = [EAX] in +def VMSAVE32 : I<0x01, MRM_DB, (outs), (ins), + "vmsave\t{%eax|EAX}", []>, TB, Requires<[In32BitMode]>; +let Uses = [RAX] in +def VMSAVE64 : I<0x01, MRM_DB, (outs), (ins), + "vmsave\t{%rax|RAX}", []>, TB, Requires<[In64BitMode]>; + +// 0F 01 DF +let Uses = [EAX, ECX] in +def INVLPGA32 : I<0x01, MRM_DF, (outs), (ins), + "invlpga\t{%ecx, %eax|EAX, ECX}", []>, TB, Requires<[In32BitMode]>; +let Uses = [RAX, ECX] in +def INVLPGA64 : I<0x01, MRM_DF, (outs), (ins), + "invlpga\t{%ecx, %rax|RAX, ECX}", []>, TB, Requires<[In64BitMode]>; + diff --git a/contrib/llvm/lib/Target/X86/X86InstrShiftRotate.td b/contrib/llvm/lib/Target/X86/X86InstrShiftRotate.td index 8278568..bdeb63f 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrShiftRotate.td +++ b/contrib/llvm/lib/Target/X86/X86InstrShiftRotate.td @@ -1,10 +1,10 @@ -//===- X86InstrShiftRotate.td - Shift and Rotate Instrs ----*- tablegen -*-===// -// +//===-- X86InstrShiftRotate.td - Shift and Rotate Instrs ---*- tablegen -*-===// +// // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. -// +// //===----------------------------------------------------------------------===// // // This file describes the shift and rotate instructions. 
@@ -19,44 +19,46 @@ let Constraints = "$src1 = $dst" in { let Uses = [CL] in { def SHL8rCL : I<0xD2, MRM4r, (outs GR8 :$dst), (ins GR8 :$src1), "shl{b}\t{%cl, $dst|$dst, CL}", - [(set GR8:$dst, (shl GR8:$src1, CL))]>; + [(set GR8:$dst, (shl GR8:$src1, CL))], IIC_SR>; def SHL16rCL : I<0xD3, MRM4r, (outs GR16:$dst), (ins GR16:$src1), "shl{w}\t{%cl, $dst|$dst, CL}", - [(set GR16:$dst, (shl GR16:$src1, CL))]>, OpSize; + [(set GR16:$dst, (shl GR16:$src1, CL))], IIC_SR>, OpSize; def SHL32rCL : I<0xD3, MRM4r, (outs GR32:$dst), (ins GR32:$src1), "shl{l}\t{%cl, $dst|$dst, CL}", - [(set GR32:$dst, (shl GR32:$src1, CL))]>; + [(set GR32:$dst, (shl GR32:$src1, CL))], IIC_SR>; def SHL64rCL : RI<0xD3, MRM4r, (outs GR64:$dst), (ins GR64:$src1), - "shl{q}\t{%cl, $dst|$dst, %CL}", - [(set GR64:$dst, (shl GR64:$src1, CL))]>; + "shl{q}\t{%cl, $dst|$dst, CL}", + [(set GR64:$dst, (shl GR64:$src1, CL))], IIC_SR>; } // Uses = [CL] def SHL8ri : Ii8<0xC0, MRM4r, (outs GR8 :$dst), (ins GR8 :$src1, i8imm:$src2), "shl{b}\t{$src2, $dst|$dst, $src2}", - [(set GR8:$dst, (shl GR8:$src1, (i8 imm:$src2)))]>; + [(set GR8:$dst, (shl GR8:$src1, (i8 imm:$src2)))], IIC_SR>; let isConvertibleToThreeAddress = 1 in { // Can transform into LEA. def SHL16ri : Ii8<0xC1, MRM4r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$src2), "shl{w}\t{$src2, $dst|$dst, $src2}", - [(set GR16:$dst, (shl GR16:$src1, (i8 imm:$src2)))]>, OpSize; + [(set GR16:$dst, (shl GR16:$src1, (i8 imm:$src2)))], IIC_SR>, + OpSize; def SHL32ri : Ii8<0xC1, MRM4r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$src2), "shl{l}\t{$src2, $dst|$dst, $src2}", - [(set GR32:$dst, (shl GR32:$src1, (i8 imm:$src2)))]>; + [(set GR32:$dst, (shl GR32:$src1, (i8 imm:$src2)))], IIC_SR>; def SHL64ri : RIi8<0xC1, MRM4r, (outs GR64:$dst), (ins GR64:$src1, i8imm:$src2), "shl{q}\t{$src2, $dst|$dst, $src2}", - [(set GR64:$dst, (shl GR64:$src1, (i8 imm:$src2)))]>; + [(set GR64:$dst, (shl GR64:$src1, (i8 imm:$src2)))], + IIC_SR>; // NOTE: We don't include patterns for shifts of a register by one, because // 'add reg,reg' is cheaper (and we have a Pat pattern for shift-by-one). 
def SHL8r1 : I<0xD0, MRM4r, (outs GR8:$dst), (ins GR8:$src1), - "shl{b}\t$dst", []>; + "shl{b}\t$dst", [], IIC_SR>; def SHL16r1 : I<0xD1, MRM4r, (outs GR16:$dst), (ins GR16:$src1), - "shl{w}\t$dst", []>, OpSize; + "shl{w}\t$dst", [], IIC_SR>, OpSize; def SHL32r1 : I<0xD1, MRM4r, (outs GR32:$dst), (ins GR32:$src1), - "shl{l}\t$dst", []>; + "shl{l}\t$dst", [], IIC_SR>; def SHL64r1 : RI<0xD1, MRM4r, (outs GR64:$dst), (ins GR64:$src1), - "shl{q}\t$dst", []>; + "shl{q}\t$dst", [], IIC_SR>; } // isConvertibleToThreeAddress = 1 } // Constraints = "$src = $dst" @@ -66,223 +68,266 @@ def SHL64r1 : RI<0xD1, MRM4r, (outs GR64:$dst), (ins GR64:$src1), let Uses = [CL] in { def SHL8mCL : I<0xD2, MRM4m, (outs), (ins i8mem :$dst), "shl{b}\t{%cl, $dst|$dst, CL}", - [(store (shl (loadi8 addr:$dst), CL), addr:$dst)]>; + [(store (shl (loadi8 addr:$dst), CL), addr:$dst)], IIC_SR>; def SHL16mCL : I<0xD3, MRM4m, (outs), (ins i16mem:$dst), "shl{w}\t{%cl, $dst|$dst, CL}", - [(store (shl (loadi16 addr:$dst), CL), addr:$dst)]>, OpSize; + [(store (shl (loadi16 addr:$dst), CL), addr:$dst)], IIC_SR>, + OpSize; def SHL32mCL : I<0xD3, MRM4m, (outs), (ins i32mem:$dst), "shl{l}\t{%cl, $dst|$dst, CL}", - [(store (shl (loadi32 addr:$dst), CL), addr:$dst)]>; + [(store (shl (loadi32 addr:$dst), CL), addr:$dst)], IIC_SR>; def SHL64mCL : RI<0xD3, MRM4m, (outs), (ins i64mem:$dst), - "shl{q}\t{%cl, $dst|$dst, %CL}", - [(store (shl (loadi64 addr:$dst), CL), addr:$dst)]>; + "shl{q}\t{%cl, $dst|$dst, CL}", + [(store (shl (loadi64 addr:$dst), CL), addr:$dst)], IIC_SR>; } def SHL8mi : Ii8<0xC0, MRM4m, (outs), (ins i8mem :$dst, i8imm:$src), "shl{b}\t{$src, $dst|$dst, $src}", - [(store (shl (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)]>; + [(store (shl (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)], + IIC_SR>; def SHL16mi : Ii8<0xC1, MRM4m, (outs), (ins i16mem:$dst, i8imm:$src), "shl{w}\t{$src, $dst|$dst, $src}", - [(store (shl (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)]>, + [(store (shl (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)], + IIC_SR>, OpSize; def SHL32mi : Ii8<0xC1, MRM4m, (outs), (ins i32mem:$dst, i8imm:$src), "shl{l}\t{$src, $dst|$dst, $src}", - [(store (shl (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)]>; + [(store (shl (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)], + IIC_SR>; def SHL64mi : RIi8<0xC1, MRM4m, (outs), (ins i64mem:$dst, i8imm:$src), "shl{q}\t{$src, $dst|$dst, $src}", - [(store (shl (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)]>; + [(store (shl (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)], + IIC_SR>; // Shift by 1 def SHL8m1 : I<0xD0, MRM4m, (outs), (ins i8mem :$dst), "shl{b}\t$dst", - [(store (shl (loadi8 addr:$dst), (i8 1)), addr:$dst)]>; + [(store (shl (loadi8 addr:$dst), (i8 1)), addr:$dst)], + IIC_SR>; def SHL16m1 : I<0xD1, MRM4m, (outs), (ins i16mem:$dst), "shl{w}\t$dst", - [(store (shl (loadi16 addr:$dst), (i8 1)), addr:$dst)]>, + [(store (shl (loadi16 addr:$dst), (i8 1)), addr:$dst)], + IIC_SR>, OpSize; def SHL32m1 : I<0xD1, MRM4m, (outs), (ins i32mem:$dst), "shl{l}\t$dst", - [(store (shl (loadi32 addr:$dst), (i8 1)), addr:$dst)]>; + [(store (shl (loadi32 addr:$dst), (i8 1)), addr:$dst)], + IIC_SR>; def SHL64m1 : RI<0xD1, MRM4m, (outs), (ins i64mem:$dst), "shl{q}\t$dst", - [(store (shl (loadi64 addr:$dst), (i8 1)), addr:$dst)]>; + [(store (shl (loadi64 addr:$dst), (i8 1)), addr:$dst)], + IIC_SR>; let Constraints = "$src1 = $dst" in { let Uses = [CL] in { def SHR8rCL : I<0xD2, MRM5r, (outs GR8 :$dst), (ins GR8 :$src1), "shr{b}\t{%cl, $dst|$dst, CL}", - [(set GR8:$dst, (srl 
GR8:$src1, CL))]>; + [(set GR8:$dst, (srl GR8:$src1, CL))], IIC_SR>; def SHR16rCL : I<0xD3, MRM5r, (outs GR16:$dst), (ins GR16:$src1), "shr{w}\t{%cl, $dst|$dst, CL}", - [(set GR16:$dst, (srl GR16:$src1, CL))]>, OpSize; + [(set GR16:$dst, (srl GR16:$src1, CL))], IIC_SR>, OpSize; def SHR32rCL : I<0xD3, MRM5r, (outs GR32:$dst), (ins GR32:$src1), "shr{l}\t{%cl, $dst|$dst, CL}", - [(set GR32:$dst, (srl GR32:$src1, CL))]>; + [(set GR32:$dst, (srl GR32:$src1, CL))], IIC_SR>; def SHR64rCL : RI<0xD3, MRM5r, (outs GR64:$dst), (ins GR64:$src1), - "shr{q}\t{%cl, $dst|$dst, %CL}", - [(set GR64:$dst, (srl GR64:$src1, CL))]>; + "shr{q}\t{%cl, $dst|$dst, CL}", + [(set GR64:$dst, (srl GR64:$src1, CL))], IIC_SR>; } def SHR8ri : Ii8<0xC0, MRM5r, (outs GR8:$dst), (ins GR8:$src1, i8imm:$src2), "shr{b}\t{$src2, $dst|$dst, $src2}", - [(set GR8:$dst, (srl GR8:$src1, (i8 imm:$src2)))]>; + [(set GR8:$dst, (srl GR8:$src1, (i8 imm:$src2)))], IIC_SR>; def SHR16ri : Ii8<0xC1, MRM5r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$src2), "shr{w}\t{$src2, $dst|$dst, $src2}", - [(set GR16:$dst, (srl GR16:$src1, (i8 imm:$src2)))]>, OpSize; + [(set GR16:$dst, (srl GR16:$src1, (i8 imm:$src2)))], + IIC_SR>, OpSize; def SHR32ri : Ii8<0xC1, MRM5r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$src2), "shr{l}\t{$src2, $dst|$dst, $src2}", - [(set GR32:$dst, (srl GR32:$src1, (i8 imm:$src2)))]>; + [(set GR32:$dst, (srl GR32:$src1, (i8 imm:$src2)))], + IIC_SR>; def SHR64ri : RIi8<0xC1, MRM5r, (outs GR64:$dst), (ins GR64:$src1, i8imm:$src2), "shr{q}\t{$src2, $dst|$dst, $src2}", - [(set GR64:$dst, (srl GR64:$src1, (i8 imm:$src2)))]>; + [(set GR64:$dst, (srl GR64:$src1, (i8 imm:$src2)))], IIC_SR>; // Shift right by 1 def SHR8r1 : I<0xD0, MRM5r, (outs GR8:$dst), (ins GR8:$src1), "shr{b}\t$dst", - [(set GR8:$dst, (srl GR8:$src1, (i8 1)))]>; + [(set GR8:$dst, (srl GR8:$src1, (i8 1)))], IIC_SR>; def SHR16r1 : I<0xD1, MRM5r, (outs GR16:$dst), (ins GR16:$src1), "shr{w}\t$dst", - [(set GR16:$dst, (srl GR16:$src1, (i8 1)))]>, OpSize; + [(set GR16:$dst, (srl GR16:$src1, (i8 1)))], IIC_SR>, OpSize; def SHR32r1 : I<0xD1, MRM5r, (outs GR32:$dst), (ins GR32:$src1), "shr{l}\t$dst", - [(set GR32:$dst, (srl GR32:$src1, (i8 1)))]>; + [(set GR32:$dst, (srl GR32:$src1, (i8 1)))], IIC_SR>; def SHR64r1 : RI<0xD1, MRM5r, (outs GR64:$dst), (ins GR64:$src1), "shr{q}\t$dst", - [(set GR64:$dst, (srl GR64:$src1, (i8 1)))]>; + [(set GR64:$dst, (srl GR64:$src1, (i8 1)))], IIC_SR>; } // Constraints = "$src = $dst" let Uses = [CL] in { def SHR8mCL : I<0xD2, MRM5m, (outs), (ins i8mem :$dst), "shr{b}\t{%cl, $dst|$dst, CL}", - [(store (srl (loadi8 addr:$dst), CL), addr:$dst)]>; + [(store (srl (loadi8 addr:$dst), CL), addr:$dst)], IIC_SR>; def SHR16mCL : I<0xD3, MRM5m, (outs), (ins i16mem:$dst), "shr{w}\t{%cl, $dst|$dst, CL}", - [(store (srl (loadi16 addr:$dst), CL), addr:$dst)]>, + [(store (srl (loadi16 addr:$dst), CL), addr:$dst)], IIC_SR>, OpSize; def SHR32mCL : I<0xD3, MRM5m, (outs), (ins i32mem:$dst), "shr{l}\t{%cl, $dst|$dst, CL}", - [(store (srl (loadi32 addr:$dst), CL), addr:$dst)]>; + [(store (srl (loadi32 addr:$dst), CL), addr:$dst)], IIC_SR>; def SHR64mCL : RI<0xD3, MRM5m, (outs), (ins i64mem:$dst), - "shr{q}\t{%cl, $dst|$dst, %CL}", - [(store (srl (loadi64 addr:$dst), CL), addr:$dst)]>; + "shr{q}\t{%cl, $dst|$dst, CL}", + [(store (srl (loadi64 addr:$dst), CL), addr:$dst)], IIC_SR>; } def SHR8mi : Ii8<0xC0, MRM5m, (outs), (ins i8mem :$dst, i8imm:$src), "shr{b}\t{$src, $dst|$dst, $src}", - [(store (srl (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)]>; + [(store (srl 
(loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)], + IIC_SR>; def SHR16mi : Ii8<0xC1, MRM5m, (outs), (ins i16mem:$dst, i8imm:$src), "shr{w}\t{$src, $dst|$dst, $src}", - [(store (srl (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)]>, + [(store (srl (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)], + IIC_SR>, OpSize; def SHR32mi : Ii8<0xC1, MRM5m, (outs), (ins i32mem:$dst, i8imm:$src), "shr{l}\t{$src, $dst|$dst, $src}", - [(store (srl (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)]>; + [(store (srl (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)], + IIC_SR>; def SHR64mi : RIi8<0xC1, MRM5m, (outs), (ins i64mem:$dst, i8imm:$src), "shr{q}\t{$src, $dst|$dst, $src}", - [(store (srl (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)]>; + [(store (srl (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)], + IIC_SR>; // Shift by 1 def SHR8m1 : I<0xD0, MRM5m, (outs), (ins i8mem :$dst), "shr{b}\t$dst", - [(store (srl (loadi8 addr:$dst), (i8 1)), addr:$dst)]>; + [(store (srl (loadi8 addr:$dst), (i8 1)), addr:$dst)], + IIC_SR>; def SHR16m1 : I<0xD1, MRM5m, (outs), (ins i16mem:$dst), "shr{w}\t$dst", - [(store (srl (loadi16 addr:$dst), (i8 1)), addr:$dst)]>,OpSize; + [(store (srl (loadi16 addr:$dst), (i8 1)), addr:$dst)], + IIC_SR>,OpSize; def SHR32m1 : I<0xD1, MRM5m, (outs), (ins i32mem:$dst), "shr{l}\t$dst", - [(store (srl (loadi32 addr:$dst), (i8 1)), addr:$dst)]>; + [(store (srl (loadi32 addr:$dst), (i8 1)), addr:$dst)], + IIC_SR>; def SHR64m1 : RI<0xD1, MRM5m, (outs), (ins i64mem:$dst), "shr{q}\t$dst", - [(store (srl (loadi64 addr:$dst), (i8 1)), addr:$dst)]>; + [(store (srl (loadi64 addr:$dst), (i8 1)), addr:$dst)], + IIC_SR>; let Constraints = "$src1 = $dst" in { let Uses = [CL] in { def SAR8rCL : I<0xD2, MRM7r, (outs GR8 :$dst), (ins GR8 :$src1), "sar{b}\t{%cl, $dst|$dst, CL}", - [(set GR8:$dst, (sra GR8:$src1, CL))]>; + [(set GR8:$dst, (sra GR8:$src1, CL))], + IIC_SR>; def SAR16rCL : I<0xD3, MRM7r, (outs GR16:$dst), (ins GR16:$src1), "sar{w}\t{%cl, $dst|$dst, CL}", - [(set GR16:$dst, (sra GR16:$src1, CL))]>, OpSize; + [(set GR16:$dst, (sra GR16:$src1, CL))], + IIC_SR>, OpSize; def SAR32rCL : I<0xD3, MRM7r, (outs GR32:$dst), (ins GR32:$src1), "sar{l}\t{%cl, $dst|$dst, CL}", - [(set GR32:$dst, (sra GR32:$src1, CL))]>; + [(set GR32:$dst, (sra GR32:$src1, CL))], + IIC_SR>; def SAR64rCL : RI<0xD3, MRM7r, (outs GR64:$dst), (ins GR64:$src1), - "sar{q}\t{%cl, $dst|$dst, %CL}", - [(set GR64:$dst, (sra GR64:$src1, CL))]>; + "sar{q}\t{%cl, $dst|$dst, CL}", + [(set GR64:$dst, (sra GR64:$src1, CL))], + IIC_SR>; } def SAR8ri : Ii8<0xC0, MRM7r, (outs GR8 :$dst), (ins GR8 :$src1, i8imm:$src2), "sar{b}\t{$src2, $dst|$dst, $src2}", - [(set GR8:$dst, (sra GR8:$src1, (i8 imm:$src2)))]>; + [(set GR8:$dst, (sra GR8:$src1, (i8 imm:$src2)))], + IIC_SR>; def SAR16ri : Ii8<0xC1, MRM7r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$src2), "sar{w}\t{$src2, $dst|$dst, $src2}", - [(set GR16:$dst, (sra GR16:$src1, (i8 imm:$src2)))]>, + [(set GR16:$dst, (sra GR16:$src1, (i8 imm:$src2)))], + IIC_SR>, OpSize; def SAR32ri : Ii8<0xC1, MRM7r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$src2), "sar{l}\t{$src2, $dst|$dst, $src2}", - [(set GR32:$dst, (sra GR32:$src1, (i8 imm:$src2)))]>; + [(set GR32:$dst, (sra GR32:$src1, (i8 imm:$src2)))], + IIC_SR>; def SAR64ri : RIi8<0xC1, MRM7r, (outs GR64:$dst), (ins GR64:$src1, i8imm:$src2), "sar{q}\t{$src2, $dst|$dst, $src2}", - [(set GR64:$dst, (sra GR64:$src1, (i8 imm:$src2)))]>; + [(set GR64:$dst, (sra GR64:$src1, (i8 imm:$src2)))], + IIC_SR>; // Shift by 1 def SAR8r1 : I<0xD0, MRM7r, (outs GR8 :$dst), (ins 
GR8 :$src1), "sar{b}\t$dst", - [(set GR8:$dst, (sra GR8:$src1, (i8 1)))]>; + [(set GR8:$dst, (sra GR8:$src1, (i8 1)))], + IIC_SR>; def SAR16r1 : I<0xD1, MRM7r, (outs GR16:$dst), (ins GR16:$src1), "sar{w}\t$dst", - [(set GR16:$dst, (sra GR16:$src1, (i8 1)))]>, OpSize; + [(set GR16:$dst, (sra GR16:$src1, (i8 1)))], + IIC_SR>, OpSize; def SAR32r1 : I<0xD1, MRM7r, (outs GR32:$dst), (ins GR32:$src1), "sar{l}\t$dst", - [(set GR32:$dst, (sra GR32:$src1, (i8 1)))]>; + [(set GR32:$dst, (sra GR32:$src1, (i8 1)))], + IIC_SR>; def SAR64r1 : RI<0xD1, MRM7r, (outs GR64:$dst), (ins GR64:$src1), "sar{q}\t$dst", - [(set GR64:$dst, (sra GR64:$src1, (i8 1)))]>; + [(set GR64:$dst, (sra GR64:$src1, (i8 1)))], + IIC_SR>; } // Constraints = "$src = $dst" let Uses = [CL] in { def SAR8mCL : I<0xD2, MRM7m, (outs), (ins i8mem :$dst), "sar{b}\t{%cl, $dst|$dst, CL}", - [(store (sra (loadi8 addr:$dst), CL), addr:$dst)]>; + [(store (sra (loadi8 addr:$dst), CL), addr:$dst)], + IIC_SR>; def SAR16mCL : I<0xD3, MRM7m, (outs), (ins i16mem:$dst), "sar{w}\t{%cl, $dst|$dst, CL}", - [(store (sra (loadi16 addr:$dst), CL), addr:$dst)]>, OpSize; + [(store (sra (loadi16 addr:$dst), CL), addr:$dst)], + IIC_SR>, OpSize; def SAR32mCL : I<0xD3, MRM7m, (outs), (ins i32mem:$dst), "sar{l}\t{%cl, $dst|$dst, CL}", - [(store (sra (loadi32 addr:$dst), CL), addr:$dst)]>; + [(store (sra (loadi32 addr:$dst), CL), addr:$dst)], + IIC_SR>; def SAR64mCL : RI<0xD3, MRM7m, (outs), (ins i64mem:$dst), - "sar{q}\t{%cl, $dst|$dst, %CL}", - [(store (sra (loadi64 addr:$dst), CL), addr:$dst)]>; + "sar{q}\t{%cl, $dst|$dst, CL}", + [(store (sra (loadi64 addr:$dst), CL), addr:$dst)], + IIC_SR>; } def SAR8mi : Ii8<0xC0, MRM7m, (outs), (ins i8mem :$dst, i8imm:$src), "sar{b}\t{$src, $dst|$dst, $src}", - [(store (sra (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)]>; + [(store (sra (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)], + IIC_SR>; def SAR16mi : Ii8<0xC1, MRM7m, (outs), (ins i16mem:$dst, i8imm:$src), "sar{w}\t{$src, $dst|$dst, $src}", - [(store (sra (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)]>, + [(store (sra (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)], + IIC_SR>, OpSize; def SAR32mi : Ii8<0xC1, MRM7m, (outs), (ins i32mem:$dst, i8imm:$src), "sar{l}\t{$src, $dst|$dst, $src}", - [(store (sra (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)]>; + [(store (sra (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)], + IIC_SR>; def SAR64mi : RIi8<0xC1, MRM7m, (outs), (ins i64mem:$dst, i8imm:$src), "sar{q}\t{$src, $dst|$dst, $src}", - [(store (sra (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)]>; + [(store (sra (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)], + IIC_SR>; // Shift by 1 def SAR8m1 : I<0xD0, MRM7m, (outs), (ins i8mem :$dst), "sar{b}\t$dst", - [(store (sra (loadi8 addr:$dst), (i8 1)), addr:$dst)]>; + [(store (sra (loadi8 addr:$dst), (i8 1)), addr:$dst)], + IIC_SR>; def SAR16m1 : I<0xD1, MRM7m, (outs), (ins i16mem:$dst), "sar{w}\t$dst", - [(store (sra (loadi16 addr:$dst), (i8 1)), addr:$dst)]>, + [(store (sra (loadi16 addr:$dst), (i8 1)), addr:$dst)], + IIC_SR>, OpSize; def SAR32m1 : I<0xD1, MRM7m, (outs), (ins i32mem:$dst), "sar{l}\t$dst", - [(store (sra (loadi32 addr:$dst), (i8 1)), addr:$dst)]>; + [(store (sra (loadi32 addr:$dst), (i8 1)), addr:$dst)], + IIC_SR>; def SAR64m1 : RI<0xD1, MRM7m, (outs), (ins i64mem:$dst), "sar{q}\t$dst", - [(store (sra (loadi64 addr:$dst), (i8 1)), addr:$dst)]>; + [(store (sra (loadi64 addr:$dst), (i8 1)), addr:$dst)], + IIC_SR>; //===----------------------------------------------------------------------===// // 
Rotate instructions @@ -290,125 +335,125 @@ def SAR64m1 : RI<0xD1, MRM7m, (outs), (ins i64mem:$dst), let Constraints = "$src1 = $dst" in { def RCL8r1 : I<0xD0, MRM2r, (outs GR8:$dst), (ins GR8:$src1), - "rcl{b}\t$dst", []>; + "rcl{b}\t$dst", [], IIC_SR>; def RCL8ri : Ii8<0xC0, MRM2r, (outs GR8:$dst), (ins GR8:$src1, i8imm:$cnt), - "rcl{b}\t{$cnt, $dst|$dst, $cnt}", []>; + "rcl{b}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>; let Uses = [CL] in def RCL8rCL : I<0xD2, MRM2r, (outs GR8:$dst), (ins GR8:$src1), - "rcl{b}\t{%cl, $dst|$dst, CL}", []>; + "rcl{b}\t{%cl, $dst|$dst, CL}", [], IIC_SR>; def RCL16r1 : I<0xD1, MRM2r, (outs GR16:$dst), (ins GR16:$src1), - "rcl{w}\t$dst", []>, OpSize; + "rcl{w}\t$dst", [], IIC_SR>, OpSize; def RCL16ri : Ii8<0xC1, MRM2r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$cnt), - "rcl{w}\t{$cnt, $dst|$dst, $cnt}", []>, OpSize; + "rcl{w}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize; let Uses = [CL] in def RCL16rCL : I<0xD3, MRM2r, (outs GR16:$dst), (ins GR16:$src1), - "rcl{w}\t{%cl, $dst|$dst, CL}", []>, OpSize; + "rcl{w}\t{%cl, $dst|$dst, CL}", [], IIC_SR>, OpSize; def RCL32r1 : I<0xD1, MRM2r, (outs GR32:$dst), (ins GR32:$src1), - "rcl{l}\t$dst", []>; + "rcl{l}\t$dst", [], IIC_SR>; def RCL32ri : Ii8<0xC1, MRM2r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$cnt), - "rcl{l}\t{$cnt, $dst|$dst, $cnt}", []>; + "rcl{l}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>; let Uses = [CL] in def RCL32rCL : I<0xD3, MRM2r, (outs GR32:$dst), (ins GR32:$src1), - "rcl{l}\t{%cl, $dst|$dst, CL}", []>; + "rcl{l}\t{%cl, $dst|$dst, CL}", [], IIC_SR>; def RCL64r1 : RI<0xD1, MRM2r, (outs GR64:$dst), (ins GR64:$src1), - "rcl{q}\t$dst", []>; + "rcl{q}\t$dst", [], IIC_SR>; def RCL64ri : RIi8<0xC1, MRM2r, (outs GR64:$dst), (ins GR64:$src1, i8imm:$cnt), - "rcl{q}\t{$cnt, $dst|$dst, $cnt}", []>; + "rcl{q}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>; let Uses = [CL] in def RCL64rCL : RI<0xD3, MRM2r, (outs GR64:$dst), (ins GR64:$src1), - "rcl{q}\t{%cl, $dst|$dst, CL}", []>; + "rcl{q}\t{%cl, $dst|$dst, CL}", [], IIC_SR>; def RCR8r1 : I<0xD0, MRM3r, (outs GR8:$dst), (ins GR8:$src1), - "rcr{b}\t$dst", []>; + "rcr{b}\t$dst", [], IIC_SR>; def RCR8ri : Ii8<0xC0, MRM3r, (outs GR8:$dst), (ins GR8:$src1, i8imm:$cnt), - "rcr{b}\t{$cnt, $dst|$dst, $cnt}", []>; + "rcr{b}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>; let Uses = [CL] in def RCR8rCL : I<0xD2, MRM3r, (outs GR8:$dst), (ins GR8:$src1), - "rcr{b}\t{%cl, $dst|$dst, CL}", []>; + "rcr{b}\t{%cl, $dst|$dst, CL}", [], IIC_SR>; def RCR16r1 : I<0xD1, MRM3r, (outs GR16:$dst), (ins GR16:$src1), - "rcr{w}\t$dst", []>, OpSize; + "rcr{w}\t$dst", [], IIC_SR>, OpSize; def RCR16ri : Ii8<0xC1, MRM3r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$cnt), - "rcr{w}\t{$cnt, $dst|$dst, $cnt}", []>, OpSize; + "rcr{w}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize; let Uses = [CL] in def RCR16rCL : I<0xD3, MRM3r, (outs GR16:$dst), (ins GR16:$src1), - "rcr{w}\t{%cl, $dst|$dst, CL}", []>, OpSize; + "rcr{w}\t{%cl, $dst|$dst, CL}", [], IIC_SR>, OpSize; def RCR32r1 : I<0xD1, MRM3r, (outs GR32:$dst), (ins GR32:$src1), - "rcr{l}\t$dst", []>; + "rcr{l}\t$dst", [], IIC_SR>; def RCR32ri : Ii8<0xC1, MRM3r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$cnt), - "rcr{l}\t{$cnt, $dst|$dst, $cnt}", []>; + "rcr{l}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>; let Uses = [CL] in def RCR32rCL : I<0xD3, MRM3r, (outs GR32:$dst), (ins GR32:$src1), - "rcr{l}\t{%cl, $dst|$dst, CL}", []>; + "rcr{l}\t{%cl, $dst|$dst, CL}", [], IIC_SR>; def RCR64r1 : RI<0xD1, MRM3r, (outs GR64:$dst), (ins GR64:$src1), - "rcr{q}\t$dst", []>; + "rcr{q}\t$dst", 
[], IIC_SR>; def RCR64ri : RIi8<0xC1, MRM3r, (outs GR64:$dst), (ins GR64:$src1, i8imm:$cnt), - "rcr{q}\t{$cnt, $dst|$dst, $cnt}", []>; + "rcr{q}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>; let Uses = [CL] in def RCR64rCL : RI<0xD3, MRM3r, (outs GR64:$dst), (ins GR64:$src1), - "rcr{q}\t{%cl, $dst|$dst, CL}", []>; + "rcr{q}\t{%cl, $dst|$dst, CL}", [], IIC_SR>; } // Constraints = "$src = $dst" def RCL8m1 : I<0xD0, MRM2m, (outs), (ins i8mem:$dst), - "rcl{b}\t$dst", []>; + "rcl{b}\t$dst", [], IIC_SR>; def RCL8mi : Ii8<0xC0, MRM2m, (outs), (ins i8mem:$dst, i8imm:$cnt), - "rcl{b}\t{$cnt, $dst|$dst, $cnt}", []>; + "rcl{b}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>; def RCL16m1 : I<0xD1, MRM2m, (outs), (ins i16mem:$dst), - "rcl{w}\t$dst", []>, OpSize; + "rcl{w}\t$dst", [], IIC_SR>, OpSize; def RCL16mi : Ii8<0xC1, MRM2m, (outs), (ins i16mem:$dst, i8imm:$cnt), - "rcl{w}\t{$cnt, $dst|$dst, $cnt}", []>, OpSize; + "rcl{w}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize; def RCL32m1 : I<0xD1, MRM2m, (outs), (ins i32mem:$dst), - "rcl{l}\t$dst", []>; + "rcl{l}\t$dst", [], IIC_SR>; def RCL32mi : Ii8<0xC1, MRM2m, (outs), (ins i32mem:$dst, i8imm:$cnt), - "rcl{l}\t{$cnt, $dst|$dst, $cnt}", []>; + "rcl{l}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>; def RCL64m1 : RI<0xD1, MRM2m, (outs), (ins i64mem:$dst), - "rcl{q}\t$dst", []>; + "rcl{q}\t$dst", [], IIC_SR>; def RCL64mi : RIi8<0xC1, MRM2m, (outs), (ins i64mem:$dst, i8imm:$cnt), - "rcl{q}\t{$cnt, $dst|$dst, $cnt}", []>; + "rcl{q}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>; def RCR8m1 : I<0xD0, MRM3m, (outs), (ins i8mem:$dst), - "rcr{b}\t$dst", []>; + "rcr{b}\t$dst", [], IIC_SR>; def RCR8mi : Ii8<0xC0, MRM3m, (outs), (ins i8mem:$dst, i8imm:$cnt), - "rcr{b}\t{$cnt, $dst|$dst, $cnt}", []>; + "rcr{b}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>; def RCR16m1 : I<0xD1, MRM3m, (outs), (ins i16mem:$dst), - "rcr{w}\t$dst", []>, OpSize; + "rcr{w}\t$dst", [], IIC_SR>, OpSize; def RCR16mi : Ii8<0xC1, MRM3m, (outs), (ins i16mem:$dst, i8imm:$cnt), - "rcr{w}\t{$cnt, $dst|$dst, $cnt}", []>, OpSize; + "rcr{w}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize; def RCR32m1 : I<0xD1, MRM3m, (outs), (ins i32mem:$dst), - "rcr{l}\t$dst", []>; + "rcr{l}\t$dst", [], IIC_SR>; def RCR32mi : Ii8<0xC1, MRM3m, (outs), (ins i32mem:$dst, i8imm:$cnt), - "rcr{l}\t{$cnt, $dst|$dst, $cnt}", []>; + "rcr{l}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>; def RCR64m1 : RI<0xD1, MRM3m, (outs), (ins i64mem:$dst), - "rcr{q}\t$dst", []>; + "rcr{q}\t$dst", [], IIC_SR>; def RCR64mi : RIi8<0xC1, MRM3m, (outs), (ins i64mem:$dst, i8imm:$cnt), - "rcr{q}\t{$cnt, $dst|$dst, $cnt}", []>; + "rcr{q}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>; let Uses = [CL] in { def RCL8mCL : I<0xD2, MRM2m, (outs), (ins i8mem:$dst), - "rcl{b}\t{%cl, $dst|$dst, CL}", []>; + "rcl{b}\t{%cl, $dst|$dst, CL}", [], IIC_SR>; def RCL16mCL : I<0xD3, MRM2m, (outs), (ins i16mem:$dst), - "rcl{w}\t{%cl, $dst|$dst, CL}", []>, OpSize; + "rcl{w}\t{%cl, $dst|$dst, CL}", [], IIC_SR>, OpSize; def RCL32mCL : I<0xD3, MRM2m, (outs), (ins i32mem:$dst), - "rcl{l}\t{%cl, $dst|$dst, CL}", []>; + "rcl{l}\t{%cl, $dst|$dst, CL}", [], IIC_SR>; def RCL64mCL : RI<0xD3, MRM2m, (outs), (ins i64mem:$dst), - "rcl{q}\t{%cl, $dst|$dst, CL}", []>; + "rcl{q}\t{%cl, $dst|$dst, CL}", [], IIC_SR>; def RCR8mCL : I<0xD2, MRM3m, (outs), (ins i8mem:$dst), - "rcr{b}\t{%cl, $dst|$dst, CL}", []>; + "rcr{b}\t{%cl, $dst|$dst, CL}", [], IIC_SR>; def RCR16mCL : I<0xD3, MRM3m, (outs), (ins i16mem:$dst), - "rcr{w}\t{%cl, $dst|$dst, CL}", []>, OpSize; + "rcr{w}\t{%cl, $dst|$dst, CL}", [], IIC_SR>, OpSize; def RCR32mCL : 
I<0xD3, MRM3m, (outs), (ins i32mem:$dst), - "rcr{l}\t{%cl, $dst|$dst, CL}", []>; + "rcr{l}\t{%cl, $dst|$dst, CL}", [], IIC_SR>; def RCR64mCL : RI<0xD3, MRM3m, (outs), (ins i64mem:$dst), - "rcr{q}\t{%cl, $dst|$dst, CL}", []>; + "rcr{q}\t{%cl, $dst|$dst, CL}", [], IIC_SR>; } let Constraints = "$src1 = $dst" in { @@ -416,179 +461,217 @@ let Constraints = "$src1 = $dst" in { let Uses = [CL] in { def ROL8rCL : I<0xD2, MRM0r, (outs GR8 :$dst), (ins GR8 :$src1), "rol{b}\t{%cl, $dst|$dst, CL}", - [(set GR8:$dst, (rotl GR8:$src1, CL))]>; + [(set GR8:$dst, (rotl GR8:$src1, CL))], IIC_SR>; def ROL16rCL : I<0xD3, MRM0r, (outs GR16:$dst), (ins GR16:$src1), "rol{w}\t{%cl, $dst|$dst, CL}", - [(set GR16:$dst, (rotl GR16:$src1, CL))]>, OpSize; + [(set GR16:$dst, (rotl GR16:$src1, CL))], IIC_SR>, OpSize; def ROL32rCL : I<0xD3, MRM0r, (outs GR32:$dst), (ins GR32:$src1), "rol{l}\t{%cl, $dst|$dst, CL}", - [(set GR32:$dst, (rotl GR32:$src1, CL))]>; + [(set GR32:$dst, (rotl GR32:$src1, CL))], IIC_SR>; def ROL64rCL : RI<0xD3, MRM0r, (outs GR64:$dst), (ins GR64:$src1), - "rol{q}\t{%cl, $dst|$dst, %CL}", - [(set GR64:$dst, (rotl GR64:$src1, CL))]>; + "rol{q}\t{%cl, $dst|$dst, CL}", + [(set GR64:$dst, (rotl GR64:$src1, CL))], IIC_SR>; } def ROL8ri : Ii8<0xC0, MRM0r, (outs GR8 :$dst), (ins GR8 :$src1, i8imm:$src2), "rol{b}\t{$src2, $dst|$dst, $src2}", - [(set GR8:$dst, (rotl GR8:$src1, (i8 imm:$src2)))]>; + [(set GR8:$dst, (rotl GR8:$src1, (i8 imm:$src2)))], IIC_SR>; def ROL16ri : Ii8<0xC1, MRM0r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$src2), "rol{w}\t{$src2, $dst|$dst, $src2}", - [(set GR16:$dst, (rotl GR16:$src1, (i8 imm:$src2)))]>, + [(set GR16:$dst, (rotl GR16:$src1, (i8 imm:$src2)))], + IIC_SR>, OpSize; def ROL32ri : Ii8<0xC1, MRM0r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$src2), "rol{l}\t{$src2, $dst|$dst, $src2}", - [(set GR32:$dst, (rotl GR32:$src1, (i8 imm:$src2)))]>; + [(set GR32:$dst, (rotl GR32:$src1, (i8 imm:$src2)))], + IIC_SR>; def ROL64ri : RIi8<0xC1, MRM0r, (outs GR64:$dst), (ins GR64:$src1, i8imm:$src2), "rol{q}\t{$src2, $dst|$dst, $src2}", - [(set GR64:$dst, (rotl GR64:$src1, (i8 imm:$src2)))]>; + [(set GR64:$dst, (rotl GR64:$src1, (i8 imm:$src2)))], + IIC_SR>; // Rotate by 1 def ROL8r1 : I<0xD0, MRM0r, (outs GR8 :$dst), (ins GR8 :$src1), "rol{b}\t$dst", - [(set GR8:$dst, (rotl GR8:$src1, (i8 1)))]>; + [(set GR8:$dst, (rotl GR8:$src1, (i8 1)))], + IIC_SR>; def ROL16r1 : I<0xD1, MRM0r, (outs GR16:$dst), (ins GR16:$src1), "rol{w}\t$dst", - [(set GR16:$dst, (rotl GR16:$src1, (i8 1)))]>, OpSize; + [(set GR16:$dst, (rotl GR16:$src1, (i8 1)))], + IIC_SR>, OpSize; def ROL32r1 : I<0xD1, MRM0r, (outs GR32:$dst), (ins GR32:$src1), "rol{l}\t$dst", - [(set GR32:$dst, (rotl GR32:$src1, (i8 1)))]>; + [(set GR32:$dst, (rotl GR32:$src1, (i8 1)))], + IIC_SR>; def ROL64r1 : RI<0xD1, MRM0r, (outs GR64:$dst), (ins GR64:$src1), "rol{q}\t$dst", - [(set GR64:$dst, (rotl GR64:$src1, (i8 1)))]>; + [(set GR64:$dst, (rotl GR64:$src1, (i8 1)))], + IIC_SR>; } // Constraints = "$src = $dst" let Uses = [CL] in { def ROL8mCL : I<0xD2, MRM0m, (outs), (ins i8mem :$dst), "rol{b}\t{%cl, $dst|$dst, CL}", - [(store (rotl (loadi8 addr:$dst), CL), addr:$dst)]>; + [(store (rotl (loadi8 addr:$dst), CL), addr:$dst)], + IIC_SR>; def ROL16mCL : I<0xD3, MRM0m, (outs), (ins i16mem:$dst), "rol{w}\t{%cl, $dst|$dst, CL}", - [(store (rotl (loadi16 addr:$dst), CL), addr:$dst)]>, OpSize; + [(store (rotl (loadi16 addr:$dst), CL), addr:$dst)], + IIC_SR>, OpSize; def ROL32mCL : I<0xD3, MRM0m, (outs), (ins i32mem:$dst), "rol{l}\t{%cl, $dst|$dst, 
CL}", - [(store (rotl (loadi32 addr:$dst), CL), addr:$dst)]>; + [(store (rotl (loadi32 addr:$dst), CL), addr:$dst)], + IIC_SR>; def ROL64mCL : RI<0xD3, MRM0m, (outs), (ins i64mem:$dst), - "rol{q}\t{%cl, $dst|$dst, %CL}", - [(store (rotl (loadi64 addr:$dst), CL), addr:$dst)]>; + "rol{q}\t{%cl, $dst|$dst, %cl}", + [(store (rotl (loadi64 addr:$dst), CL), addr:$dst)], + IIC_SR>; } def ROL8mi : Ii8<0xC0, MRM0m, (outs), (ins i8mem :$dst, i8imm:$src1), "rol{b}\t{$src1, $dst|$dst, $src1}", - [(store (rotl (loadi8 addr:$dst), (i8 imm:$src1)), addr:$dst)]>; + [(store (rotl (loadi8 addr:$dst), (i8 imm:$src1)), addr:$dst)], + IIC_SR>; def ROL16mi : Ii8<0xC1, MRM0m, (outs), (ins i16mem:$dst, i8imm:$src1), "rol{w}\t{$src1, $dst|$dst, $src1}", - [(store (rotl (loadi16 addr:$dst), (i8 imm:$src1)), addr:$dst)]>, + [(store (rotl (loadi16 addr:$dst), (i8 imm:$src1)), addr:$dst)], + IIC_SR>, OpSize; def ROL32mi : Ii8<0xC1, MRM0m, (outs), (ins i32mem:$dst, i8imm:$src1), "rol{l}\t{$src1, $dst|$dst, $src1}", - [(store (rotl (loadi32 addr:$dst), (i8 imm:$src1)), addr:$dst)]>; + [(store (rotl (loadi32 addr:$dst), (i8 imm:$src1)), addr:$dst)], + IIC_SR>; def ROL64mi : RIi8<0xC1, MRM0m, (outs), (ins i64mem:$dst, i8imm:$src1), "rol{q}\t{$src1, $dst|$dst, $src1}", - [(store (rotl (loadi64 addr:$dst), (i8 imm:$src1)), addr:$dst)]>; + [(store (rotl (loadi64 addr:$dst), (i8 imm:$src1)), addr:$dst)], + IIC_SR>; // Rotate by 1 def ROL8m1 : I<0xD0, MRM0m, (outs), (ins i8mem :$dst), "rol{b}\t$dst", - [(store (rotl (loadi8 addr:$dst), (i8 1)), addr:$dst)]>; + [(store (rotl (loadi8 addr:$dst), (i8 1)), addr:$dst)], + IIC_SR>; def ROL16m1 : I<0xD1, MRM0m, (outs), (ins i16mem:$dst), "rol{w}\t$dst", - [(store (rotl (loadi16 addr:$dst), (i8 1)), addr:$dst)]>, + [(store (rotl (loadi16 addr:$dst), (i8 1)), addr:$dst)], + IIC_SR>, OpSize; def ROL32m1 : I<0xD1, MRM0m, (outs), (ins i32mem:$dst), "rol{l}\t$dst", - [(store (rotl (loadi32 addr:$dst), (i8 1)), addr:$dst)]>; + [(store (rotl (loadi32 addr:$dst), (i8 1)), addr:$dst)], + IIC_SR>; def ROL64m1 : RI<0xD1, MRM0m, (outs), (ins i64mem:$dst), "rol{q}\t$dst", - [(store (rotl (loadi64 addr:$dst), (i8 1)), addr:$dst)]>; + [(store (rotl (loadi64 addr:$dst), (i8 1)), addr:$dst)], + IIC_SR>; let Constraints = "$src1 = $dst" in { let Uses = [CL] in { def ROR8rCL : I<0xD2, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1), "ror{b}\t{%cl, $dst|$dst, CL}", - [(set GR8:$dst, (rotr GR8:$src1, CL))]>; + [(set GR8:$dst, (rotr GR8:$src1, CL))], IIC_SR>; def ROR16rCL : I<0xD3, MRM1r, (outs GR16:$dst), (ins GR16:$src1), "ror{w}\t{%cl, $dst|$dst, CL}", - [(set GR16:$dst, (rotr GR16:$src1, CL))]>, OpSize; + [(set GR16:$dst, (rotr GR16:$src1, CL))], IIC_SR>, OpSize; def ROR32rCL : I<0xD3, MRM1r, (outs GR32:$dst), (ins GR32:$src1), "ror{l}\t{%cl, $dst|$dst, CL}", - [(set GR32:$dst, (rotr GR32:$src1, CL))]>; + [(set GR32:$dst, (rotr GR32:$src1, CL))], IIC_SR>; def ROR64rCL : RI<0xD3, MRM1r, (outs GR64:$dst), (ins GR64:$src1), - "ror{q}\t{%cl, $dst|$dst, %CL}", - [(set GR64:$dst, (rotr GR64:$src1, CL))]>; + "ror{q}\t{%cl, $dst|$dst, CL}", + [(set GR64:$dst, (rotr GR64:$src1, CL))], IIC_SR>; } def ROR8ri : Ii8<0xC0, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1, i8imm:$src2), "ror{b}\t{$src2, $dst|$dst, $src2}", - [(set GR8:$dst, (rotr GR8:$src1, (i8 imm:$src2)))]>; + [(set GR8:$dst, (rotr GR8:$src1, (i8 imm:$src2)))], IIC_SR>; def ROR16ri : Ii8<0xC1, MRM1r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$src2), "ror{w}\t{$src2, $dst|$dst, $src2}", - [(set GR16:$dst, (rotr GR16:$src1, (i8 imm:$src2)))]>, + [(set GR16:$dst, 
(rotr GR16:$src1, (i8 imm:$src2)))], + IIC_SR>, OpSize; def ROR32ri : Ii8<0xC1, MRM1r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$src2), "ror{l}\t{$src2, $dst|$dst, $src2}", - [(set GR32:$dst, (rotr GR32:$src1, (i8 imm:$src2)))]>; + [(set GR32:$dst, (rotr GR32:$src1, (i8 imm:$src2)))], + IIC_SR>; def ROR64ri : RIi8<0xC1, MRM1r, (outs GR64:$dst), (ins GR64:$src1, i8imm:$src2), "ror{q}\t{$src2, $dst|$dst, $src2}", - [(set GR64:$dst, (rotr GR64:$src1, (i8 imm:$src2)))]>; + [(set GR64:$dst, (rotr GR64:$src1, (i8 imm:$src2)))], + IIC_SR>; // Rotate by 1 def ROR8r1 : I<0xD0, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1), "ror{b}\t$dst", - [(set GR8:$dst, (rotr GR8:$src1, (i8 1)))]>; + [(set GR8:$dst, (rotr GR8:$src1, (i8 1)))], + IIC_SR>; def ROR16r1 : I<0xD1, MRM1r, (outs GR16:$dst), (ins GR16:$src1), "ror{w}\t$dst", - [(set GR16:$dst, (rotr GR16:$src1, (i8 1)))]>, OpSize; + [(set GR16:$dst, (rotr GR16:$src1, (i8 1)))], + IIC_SR>, OpSize; def ROR32r1 : I<0xD1, MRM1r, (outs GR32:$dst), (ins GR32:$src1), "ror{l}\t$dst", - [(set GR32:$dst, (rotr GR32:$src1, (i8 1)))]>; + [(set GR32:$dst, (rotr GR32:$src1, (i8 1)))], + IIC_SR>; def ROR64r1 : RI<0xD1, MRM1r, (outs GR64:$dst), (ins GR64:$src1), "ror{q}\t$dst", - [(set GR64:$dst, (rotr GR64:$src1, (i8 1)))]>; + [(set GR64:$dst, (rotr GR64:$src1, (i8 1)))], + IIC_SR>; } // Constraints = "$src = $dst" let Uses = [CL] in { def ROR8mCL : I<0xD2, MRM1m, (outs), (ins i8mem :$dst), "ror{b}\t{%cl, $dst|$dst, CL}", - [(store (rotr (loadi8 addr:$dst), CL), addr:$dst)]>; + [(store (rotr (loadi8 addr:$dst), CL), addr:$dst)], + IIC_SR>; def ROR16mCL : I<0xD3, MRM1m, (outs), (ins i16mem:$dst), "ror{w}\t{%cl, $dst|$dst, CL}", - [(store (rotr (loadi16 addr:$dst), CL), addr:$dst)]>, OpSize; + [(store (rotr (loadi16 addr:$dst), CL), addr:$dst)], + IIC_SR>, OpSize; def ROR32mCL : I<0xD3, MRM1m, (outs), (ins i32mem:$dst), "ror{l}\t{%cl, $dst|$dst, CL}", - [(store (rotr (loadi32 addr:$dst), CL), addr:$dst)]>; + [(store (rotr (loadi32 addr:$dst), CL), addr:$dst)], + IIC_SR>; def ROR64mCL : RI<0xD3, MRM1m, (outs), (ins i64mem:$dst), - "ror{q}\t{%cl, $dst|$dst, %CL}", - [(store (rotr (loadi64 addr:$dst), CL), addr:$dst)]>; + "ror{q}\t{%cl, $dst|$dst, CL}", + [(store (rotr (loadi64 addr:$dst), CL), addr:$dst)], + IIC_SR>; } def ROR8mi : Ii8<0xC0, MRM1m, (outs), (ins i8mem :$dst, i8imm:$src), "ror{b}\t{$src, $dst|$dst, $src}", - [(store (rotr (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)]>; + [(store (rotr (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)], + IIC_SR>; def ROR16mi : Ii8<0xC1, MRM1m, (outs), (ins i16mem:$dst, i8imm:$src), "ror{w}\t{$src, $dst|$dst, $src}", - [(store (rotr (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)]>, + [(store (rotr (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)], + IIC_SR>, OpSize; def ROR32mi : Ii8<0xC1, MRM1m, (outs), (ins i32mem:$dst, i8imm:$src), "ror{l}\t{$src, $dst|$dst, $src}", - [(store (rotr (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)]>; + [(store (rotr (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)], + IIC_SR>; def ROR64mi : RIi8<0xC1, MRM1m, (outs), (ins i64mem:$dst, i8imm:$src), "ror{q}\t{$src, $dst|$dst, $src}", - [(store (rotr (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)]>; + [(store (rotr (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)], + IIC_SR>; // Rotate by 1 def ROR8m1 : I<0xD0, MRM1m, (outs), (ins i8mem :$dst), "ror{b}\t$dst", - [(store (rotr (loadi8 addr:$dst), (i8 1)), addr:$dst)]>; + [(store (rotr (loadi8 addr:$dst), (i8 1)), addr:$dst)], + IIC_SR>; def ROR16m1 : I<0xD1, MRM1m, (outs), (ins i16mem:$dst), 
"ror{w}\t$dst", - [(store (rotr (loadi16 addr:$dst), (i8 1)), addr:$dst)]>, + [(store (rotr (loadi16 addr:$dst), (i8 1)), addr:$dst)], + IIC_SR>, OpSize; def ROR32m1 : I<0xD1, MRM1m, (outs), (ins i32mem:$dst), "ror{l}\t$dst", - [(store (rotr (loadi32 addr:$dst), (i8 1)), addr:$dst)]>; + [(store (rotr (loadi32 addr:$dst), (i8 1)), addr:$dst)], + IIC_SR>; def ROR64m1 : RI<0xD1, MRM1m, (outs), (ins i64mem:$dst), "ror{q}\t$dst", - [(store (rotr (loadi64 addr:$dst), (i8 1)), addr:$dst)]>; + [(store (rotr (loadi64 addr:$dst), (i8 1)), addr:$dst)], + IIC_SR>; //===----------------------------------------------------------------------===// @@ -601,30 +684,36 @@ let Uses = [CL] in { def SHLD16rrCL : I<0xA5, MRMDestReg, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2), "shld{w}\t{%cl, $src2, $dst|$dst, $src2, CL}", - [(set GR16:$dst, (X86shld GR16:$src1, GR16:$src2, CL))]>, + [(set GR16:$dst, (X86shld GR16:$src1, GR16:$src2, CL))], + IIC_SHD16_REG_CL>, TB, OpSize; def SHRD16rrCL : I<0xAD, MRMDestReg, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2), "shrd{w}\t{%cl, $src2, $dst|$dst, $src2, CL}", - [(set GR16:$dst, (X86shrd GR16:$src1, GR16:$src2, CL))]>, + [(set GR16:$dst, (X86shrd GR16:$src1, GR16:$src2, CL))], + IIC_SHD16_REG_CL>, TB, OpSize; def SHLD32rrCL : I<0xA5, MRMDestReg, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2), "shld{l}\t{%cl, $src2, $dst|$dst, $src2, CL}", - [(set GR32:$dst, (X86shld GR32:$src1, GR32:$src2, CL))]>, TB; + [(set GR32:$dst, (X86shld GR32:$src1, GR32:$src2, CL))], + IIC_SHD32_REG_CL>, TB; def SHRD32rrCL : I<0xAD, MRMDestReg, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2), "shrd{l}\t{%cl, $src2, $dst|$dst, $src2, CL}", - [(set GR32:$dst, (X86shrd GR32:$src1, GR32:$src2, CL))]>, TB; + [(set GR32:$dst, (X86shrd GR32:$src1, GR32:$src2, CL))], + IIC_SHD32_REG_CL>, TB; def SHLD64rrCL : RI<0xA5, MRMDestReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2), - "shld{q}\t{%cl, $src2, $dst|$dst, $src2, %CL}", - [(set GR64:$dst, (X86shld GR64:$src1, GR64:$src2, CL))]>, + "shld{q}\t{%cl, $src2, $dst|$dst, $src2, CL}", + [(set GR64:$dst, (X86shld GR64:$src1, GR64:$src2, CL))], + IIC_SHD64_REG_CL>, TB; def SHRD64rrCL : RI<0xAD, MRMDestReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2), - "shrd{q}\t{%cl, $src2, $dst|$dst, $src2, %CL}", - [(set GR64:$dst, (X86shrd GR64:$src1, GR64:$src2, CL))]>, + "shrd{q}\t{%cl, $src2, $dst|$dst, $src2, CL}", + [(set GR64:$dst, (X86shrd GR64:$src1, GR64:$src2, CL))], + IIC_SHD64_REG_CL>, TB; } @@ -634,42 +723,42 @@ def SHLD16rri8 : Ii8<0xA4, MRMDestReg, (ins GR16:$src1, GR16:$src2, i8imm:$src3), "shld{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(set GR16:$dst, (X86shld GR16:$src1, GR16:$src2, - (i8 imm:$src3)))]>, + (i8 imm:$src3)))], IIC_SHD16_REG_IM>, TB, OpSize; def SHRD16rri8 : Ii8<0xAC, MRMDestReg, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2, i8imm:$src3), "shrd{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(set GR16:$dst, (X86shrd GR16:$src1, GR16:$src2, - (i8 imm:$src3)))]>, + (i8 imm:$src3)))], IIC_SHD16_REG_IM>, TB, OpSize; def SHLD32rri8 : Ii8<0xA4, MRMDestReg, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2, i8imm:$src3), "shld{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(set GR32:$dst, (X86shld GR32:$src1, GR32:$src2, - (i8 imm:$src3)))]>, + (i8 imm:$src3)))], IIC_SHD32_REG_IM>, TB; def SHRD32rri8 : Ii8<0xAC, MRMDestReg, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2, i8imm:$src3), "shrd{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(set GR32:$dst, (X86shrd GR32:$src1, GR32:$src2, - (i8 imm:$src3)))]>, + (i8 imm:$src3)))], 
IIC_SHD32_REG_IM>, TB; def SHLD64rri8 : RIi8<0xA4, MRMDestReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2, i8imm:$src3), "shld{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(set GR64:$dst, (X86shld GR64:$src1, GR64:$src2, - (i8 imm:$src3)))]>, + (i8 imm:$src3)))], IIC_SHD64_REG_IM>, TB; def SHRD64rri8 : RIi8<0xAC, MRMDestReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2, i8imm:$src3), "shrd{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(set GR64:$dst, (X86shrd GR64:$src1, GR64:$src2, - (i8 imm:$src3)))]>, + (i8 imm:$src3)))], IIC_SHD64_REG_IM>, TB; } } // Constraints = "$src = $dst" @@ -678,69 +767,110 @@ let Uses = [CL] in { def SHLD16mrCL : I<0xA5, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2), "shld{w}\t{%cl, $src2, $dst|$dst, $src2, CL}", [(store (X86shld (loadi16 addr:$dst), GR16:$src2, CL), - addr:$dst)]>, TB, OpSize; + addr:$dst)], IIC_SHD16_MEM_CL>, TB, OpSize; def SHRD16mrCL : I<0xAD, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2), "shrd{w}\t{%cl, $src2, $dst|$dst, $src2, CL}", [(store (X86shrd (loadi16 addr:$dst), GR16:$src2, CL), - addr:$dst)]>, TB, OpSize; + addr:$dst)], IIC_SHD16_MEM_CL>, TB, OpSize; def SHLD32mrCL : I<0xA5, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2), "shld{l}\t{%cl, $src2, $dst|$dst, $src2, CL}", [(store (X86shld (loadi32 addr:$dst), GR32:$src2, CL), - addr:$dst)]>, TB; + addr:$dst)], IIC_SHD32_MEM_CL>, TB; def SHRD32mrCL : I<0xAD, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2), "shrd{l}\t{%cl, $src2, $dst|$dst, $src2, CL}", [(store (X86shrd (loadi32 addr:$dst), GR32:$src2, CL), - addr:$dst)]>, TB; + addr:$dst)], IIC_SHD32_MEM_CL>, TB; def SHLD64mrCL : RI<0xA5, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2), - "shld{q}\t{%cl, $src2, $dst|$dst, $src2, %CL}", + "shld{q}\t{%cl, $src2, $dst|$dst, $src2, CL}", [(store (X86shld (loadi64 addr:$dst), GR64:$src2, CL), - addr:$dst)]>, TB; + addr:$dst)], IIC_SHD64_MEM_CL>, TB; def SHRD64mrCL : RI<0xAD, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2), - "shrd{q}\t{%cl, $src2, $dst|$dst, $src2, %CL}", + "shrd{q}\t{%cl, $src2, $dst|$dst, $src2, CL}", [(store (X86shrd (loadi64 addr:$dst), GR64:$src2, CL), - addr:$dst)]>, TB; + addr:$dst)], IIC_SHD64_MEM_CL>, TB; } def SHLD16mri8 : Ii8<0xA4, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2, i8imm:$src3), "shld{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(store (X86shld (loadi16 addr:$dst), GR16:$src2, - (i8 imm:$src3)), addr:$dst)]>, + (i8 imm:$src3)), addr:$dst)], + IIC_SHD16_MEM_IM>, TB, OpSize; def SHRD16mri8 : Ii8<0xAC, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2, i8imm:$src3), "shrd{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(store (X86shrd (loadi16 addr:$dst), GR16:$src2, - (i8 imm:$src3)), addr:$dst)]>, + (i8 imm:$src3)), addr:$dst)], + IIC_SHD16_MEM_IM>, TB, OpSize; def SHLD32mri8 : Ii8<0xA4, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2, i8imm:$src3), "shld{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(store (X86shld (loadi32 addr:$dst), GR32:$src2, - (i8 imm:$src3)), addr:$dst)]>, + (i8 imm:$src3)), addr:$dst)], + IIC_SHD32_MEM_IM>, TB; def SHRD32mri8 : Ii8<0xAC, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2, i8imm:$src3), "shrd{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(store (X86shrd (loadi32 addr:$dst), GR32:$src2, - (i8 imm:$src3)), addr:$dst)]>, + (i8 imm:$src3)), addr:$dst)], + IIC_SHD32_MEM_IM>, TB; def SHLD64mri8 : RIi8<0xA4, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2, i8imm:$src3), "shld{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(store (X86shld (loadi64 addr:$dst), GR64:$src2, - (i8 
imm:$src3)), addr:$dst)]>, + (i8 imm:$src3)), addr:$dst)], + IIC_SHD64_MEM_IM>, TB; def SHRD64mri8 : RIi8<0xAC, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2, i8imm:$src3), "shrd{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(store (X86shrd (loadi64 addr:$dst), GR64:$src2, - (i8 imm:$src3)), addr:$dst)]>, + (i8 imm:$src3)), addr:$dst)], + IIC_SHD64_MEM_IM>, TB; } // Defs = [EFLAGS] +multiclass bmi_rotate<string asm, RegisterClass RC, X86MemOperand x86memop> { +let neverHasSideEffects = 1 in { + def ri : Ii8<0xF0, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, i8imm:$src2), + !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + []>, TAXD, VEX; + let mayLoad = 1 in + def mi : Ii8<0xF0, MRMSrcMem, (outs RC:$dst), + (ins x86memop:$src1, i8imm:$src2), + !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + []>, TAXD, VEX; +} +} + +multiclass bmi_shift<string asm, RegisterClass RC, X86MemOperand x86memop> { +let neverHasSideEffects = 1 in { + def rr : I<0xF7, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), + !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, + VEX_4VOp3; + let mayLoad = 1 in + def rm : I<0xF7, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src1, RC:$src2), + !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, + VEX_4VOp3; +} +} + +let Predicates = [HasBMI2] in { + defm RORX32 : bmi_rotate<"rorx{l}", GR32, i32mem>; + defm RORX64 : bmi_rotate<"rorx{q}", GR64, i64mem>, VEX_W; + defm SARX32 : bmi_shift<"sarx{l}", GR32, i32mem>, T8XS; + defm SARX64 : bmi_shift<"sarx{q}", GR64, i64mem>, T8XS, VEX_W; + defm SHRX32 : bmi_shift<"shrx{l}", GR32, i32mem>, T8XD; + defm SHRX64 : bmi_shift<"shrx{q}", GR64, i64mem>, T8XD, VEX_W; + defm SHLX32 : bmi_shift<"shlx{l}", GR32, i32mem>, T8, OpSize; + defm SHLX64 : bmi_shift<"shlx{q}", GR64, i64mem>, T8, OpSize, VEX_W; +} diff --git a/contrib/llvm/lib/Target/X86/X86InstrSystem.td b/contrib/llvm/lib/Target/X86/X86InstrSystem.td index 05a5b36..bddba6c 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrSystem.td +++ b/contrib/llvm/lib/Target/X86/X86InstrSystem.td @@ -1,10 +1,10 @@ -//===- X86InstrSystem.td - System Instructions -------------*- tablegen -*-===// -// +//===-- X86InstrSystem.td - System Instructions ------------*- tablegen -*-===// +// // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. 
-// +// //===----------------------------------------------------------------------===// // // This file describes the X86 instructions that are generally used in @@ -45,18 +45,17 @@ def INT : Ii8<0xcd, RawFrm, (outs), (ins i8imm:$trap), "int\t$trap", def SYSCALL : I<0x05, RawFrm, (outs), (ins), "syscall", []>, TB; -def SYSRETL : I<0x07, RawFrm, (outs), (ins), "sysretl", []>, TB; -def SYSRETQ :RI<0x07, RawFrm, (outs), (ins), "sysretq", []>, TB, +def SYSRET : I<0x07, RawFrm, (outs), (ins), "sysret{l}", []>, TB; +def SYSRET64 :RI<0x07, RawFrm, (outs), (ins), "sysret{q}", []>, TB, Requires<[In64BitMode]>; def SYSENTER : I<0x34, RawFrm, (outs), (ins), "sysenter", []>, TB; -def SYSEXIT : I<0x35, RawFrm, (outs), (ins), "sysexit", []>, TB, - Requires<[In32BitMode]>; -def SYSEXIT64 :RI<0x35, RawFrm, (outs), (ins), "sysexit", []>, TB, +def SYSEXIT : I<0x35, RawFrm, (outs), (ins), "sysexit{l}", []>, TB; +def SYSEXIT64 :RI<0x35, RawFrm, (outs), (ins), "sysexit{q}", []>, TB, Requires<[In64BitMode]>; -def IRET16 : I<0xcf, RawFrm, (outs), (ins), "iretw", []>, OpSize; +def IRET16 : I<0xcf, RawFrm, (outs), (ins), "iret{w}", []>, OpSize; def IRET32 : I<0xcf, RawFrm, (outs), (ins), "iret{l|d}", []>; def IRET64 : RI<0xcf, RawFrm, (outs), (ins), "iretq", []>, Requires<[In64BitMode]>; @@ -215,18 +214,18 @@ def LSL64rr : RI<0x03, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src), def INVLPG : I<0x01, MRM7m, (outs), (ins i8mem:$addr), "invlpg\t$addr", []>, TB; def STR16r : I<0x00, MRM1r, (outs GR16:$dst), (ins), - "str{w}\t{$dst}", []>, TB, OpSize; + "str{w}\t$dst", []>, TB, OpSize; def STR32r : I<0x00, MRM1r, (outs GR32:$dst), (ins), - "str{l}\t{$dst}", []>, TB; + "str{l}\t$dst", []>, TB; def STR64r : RI<0x00, MRM1r, (outs GR64:$dst), (ins), - "str{q}\t{$dst}", []>, TB; + "str{q}\t$dst", []>, TB; def STRm : I<0x00, MRM1m, (outs i16mem:$dst), (ins), - "str{w}\t{$dst}", []>, TB; + "str{w}\t$dst", []>, TB; def LTRr : I<0x00, MRM3r, (outs), (ins GR16:$src), - "ltr{w}\t{$src}", []>, TB; + "ltr{w}\t$src", []>, TB; def LTRm : I<0x00, MRM3m, (outs), (ins i16mem:$src), - "ltr{w}\t{$src}", []>, TB; + "ltr{w}\t$src", []>, TB; def PUSHCS16 : I<0x0E, RawFrm, (outs), (ins), "push{w}\t{%cs|CS}", []>, Requires<[In32BitMode]>, OpSize; @@ -447,21 +446,38 @@ let Defs = [RAX, RDX, RSI], Uses = [RAX, RSI] in //===----------------------------------------------------------------------===// // FS/GS Base Instructions -let Predicates = [In64BitMode] in { +let Predicates = [HasFSGSBase, In64BitMode] in { def RDFSBASE : I<0xAE, MRM0r, (outs GR32:$dst), (ins), - "rdfsbase{l}\t$dst", []>, TB, XS; + "rdfsbase{l}\t$dst", + [(set GR32:$dst, (int_x86_rdfsbase_32))]>, TB, XS; def RDFSBASE64 : RI<0xAE, MRM0r, (outs GR64:$dst), (ins), - "rdfsbase{q}\t$dst", []>, TB, XS; + "rdfsbase{q}\t$dst", + [(set GR64:$dst, (int_x86_rdfsbase_64))]>, TB, XS; def RDGSBASE : I<0xAE, MRM1r, (outs GR32:$dst), (ins), - "rdgsbase{l}\t$dst", []>, TB, XS; + "rdgsbase{l}\t$dst", + [(set GR32:$dst, (int_x86_rdgsbase_32))]>, TB, XS; def RDGSBASE64 : RI<0xAE, MRM1r, (outs GR64:$dst), (ins), - "rdgsbase{q}\t$dst", []>, TB, XS; - def WRFSBASE : I<0xAE, MRM2r, (outs), (ins GR32:$dst), - "wrfsbase{l}\t$dst", []>, TB, XS; - def WRFSBASE64 : RI<0xAE, MRM2r, (outs), (ins GR64:$dst), - "wrfsbase{q}\t$dst", []>, TB, XS; - def WRGSBASE : I<0xAE, MRM3r, (outs), (ins GR32:$dst), - "wrgsbase{l}\t$dst", []>, TB, XS; - def WRGSBASE64 : RI<0xAE, MRM3r, (outs), (ins GR64:$dst), - "wrgsbase{q}\t$dst", []>, TB, XS; + "rdgsbase{q}\t$dst", + [(set GR64:$dst, (int_x86_rdgsbase_64))]>, TB, XS; + def 
WRFSBASE : I<0xAE, MRM2r, (outs), (ins GR32:$src), + "wrfsbase{l}\t$src", + [(int_x86_wrfsbase_32 GR32:$src)]>, TB, XS; + def WRFSBASE64 : RI<0xAE, MRM2r, (outs), (ins GR64:$src), + "wrfsbase{q}\t$src", + [(int_x86_wrfsbase_64 GR64:$src)]>, TB, XS; + def WRGSBASE : I<0xAE, MRM3r, (outs), (ins GR32:$src), + "wrgsbase{l}\t$src", + [(int_x86_wrgsbase_32 GR32:$src)]>, TB, XS; + def WRGSBASE64 : RI<0xAE, MRM3r, (outs), (ins GR64:$src), + "wrgsbase{q}\t$src", + [(int_x86_wrgsbase_64 GR64:$src)]>, TB, XS; } + +//===----------------------------------------------------------------------===// +// INVPCID Instruction +def INVPCID32 : I<0x82, MRMSrcMem, (outs), (ins GR32:$src1, i128mem:$src2), + "invpcid {$src2, $src1|$src1, $src2}", []>, OpSize, T8, + Requires<[In32BitMode]>; +def INVPCID64 : I<0x82, MRMSrcMem, (outs), (ins GR64:$src1, i128mem:$src2), + "invpcid {$src2, $src1|$src1, $src2}", []>, OpSize, T8, + Requires<[In64BitMode]>; diff --git a/contrib/llvm/lib/Target/X86/X86InstrVMX.td b/contrib/llvm/lib/Target/X86/X86InstrVMX.td index 09a7a7d0c..6a8f0c8 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrVMX.td +++ b/contrib/llvm/lib/Target/X86/X86InstrVMX.td @@ -1,10 +1,10 @@ -//===- X86InstrVMX.td - VMX Instruction Set Extension ------*- tablegen -*-===// -// +//===-- X86InstrVMX.td - VMX Instruction Set Extension -----*- tablegen -*-===// +// // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. -// +// //===----------------------------------------------------------------------===// // // This file describes the instructions that make up the Intel VMX instruction @@ -17,18 +17,24 @@ // 66 0F 38 80 def INVEPT32 : I<0x80, MRMSrcMem, (outs), (ins GR32:$src1, i128mem:$src2), - "invept {$src2, $src1|$src1, $src2}", []>, OpSize, T8; + "invept {$src2, $src1|$src1, $src2}", []>, OpSize, T8, + Requires<[In32BitMode]>; def INVEPT64 : I<0x80, MRMSrcMem, (outs), (ins GR64:$src1, i128mem:$src2), - "invept {$src2, $src1|$src1, $src2}", []>, OpSize, T8; + "invept {$src2, $src1|$src1, $src2}", []>, OpSize, T8, + Requires<[In64BitMode]>; // 66 0F 38 81 def INVVPID32 : I<0x81, MRMSrcMem, (outs), (ins GR32:$src1, i128mem:$src2), - "invvpid {$src2, $src1|$src1, $src2}", []>, OpSize, T8; + "invvpid {$src2, $src1|$src1, $src2}", []>, OpSize, T8, + Requires<[In32BitMode]>; def INVVPID64 : I<0x81, MRMSrcMem, (outs), (ins GR64:$src1, i128mem:$src2), - "invvpid {$src2, $src1|$src1, $src2}", []>, OpSize, T8; + "invvpid {$src2, $src1|$src1, $src2}", []>, OpSize, T8, + Requires<[In64BitMode]>; // 0F 01 C1 def VMCALL : I<0x01, MRM_C1, (outs), (ins), "vmcall", []>, TB; def VMCLEARm : I<0xC7, MRM6m, (outs), (ins i64mem:$vmcs), "vmclear\t$vmcs", []>, OpSize, TB; +// OF 01 D4 +def VMFUNC : I<0x01, MRM_D4, (outs), (ins), "vmfunc", []>, TB; // 0F 01 C2 def VMLAUNCH : I<0x01, MRM_C2, (outs), (ins), "vmlaunch", []>, TB; // 0F 01 C3 @@ -38,23 +44,23 @@ def VMPTRLDm : I<0xC7, MRM6m, (outs), (ins i64mem:$vmcs), def VMPTRSTm : I<0xC7, MRM7m, (outs i64mem:$vmcs), (ins), "vmptrst\t$vmcs", []>, TB; def VMREAD64rm : I<0x78, MRMDestMem, (outs i64mem:$dst), (ins GR64:$src), - "vmread{q}\t{$src, $dst|$dst, $src}", []>, TB; + "vmread{q}\t{$src, $dst|$dst, $src}", []>, TB, Requires<[In64BitMode]>; def VMREAD64rr : I<0x78, MRMDestReg, (outs GR64:$dst), (ins GR64:$src), - "vmread{q}\t{$src, $dst|$dst, $src}", []>, TB; + "vmread{q}\t{$src, $dst|$dst, $src}", []>, TB, Requires<[In64BitMode]>; def VMREAD32rm : I<0x78, MRMDestMem, (outs i32mem:$dst), 
(ins GR32:$src), - "vmread{l}\t{$src, $dst|$dst, $src}", []>, TB; + "vmread{l}\t{$src, $dst|$dst, $src}", []>, TB, Requires<[In32BitMode]>; def VMREAD32rr : I<0x78, MRMDestReg, (outs GR32:$dst), (ins GR32:$src), - "vmread{l}\t{$src, $dst|$dst, $src}", []>, TB; + "vmread{l}\t{$src, $dst|$dst, $src}", []>, TB, Requires<[In32BitMode]>; def VMWRITE64rm : I<0x79, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), - "vmwrite{q}\t{$src, $dst|$dst, $src}", []>, TB; + "vmwrite{q}\t{$src, $dst|$dst, $src}", []>, TB, Requires<[In64BitMode]>; def VMWRITE64rr : I<0x79, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src), - "vmwrite{q}\t{$src, $dst|$dst, $src}", []>, TB; + "vmwrite{q}\t{$src, $dst|$dst, $src}", []>, TB, Requires<[In64BitMode]>; def VMWRITE32rm : I<0x79, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), - "vmwrite{l}\t{$src, $dst|$dst, $src}", []>, TB; + "vmwrite{l}\t{$src, $dst|$dst, $src}", []>, TB, Requires<[In32BitMode]>; def VMWRITE32rr : I<0x79, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), - "vmwrite{l}\t{$src, $dst|$dst, $src}", []>, TB; + "vmwrite{l}\t{$src, $dst|$dst, $src}", []>, TB, Requires<[In32BitMode]>; // 0F 01 C4 def VMXOFF : I<0x01, MRM_C4, (outs), (ins), "vmxoff", []>, TB; def VMXON : I<0xC7, MRM6m, (outs), (ins i64mem:$vmxon), - "vmxon\t{$vmxon}", []>, XS; + "vmxon\t$vmxon", []>, XS; diff --git a/contrib/llvm/lib/Target/X86/X86InstrXOP.td b/contrib/llvm/lib/Target/X86/X86InstrXOP.td new file mode 100644 index 0000000..65bbcb5 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86InstrXOP.td @@ -0,0 +1,307 @@ +//===-- X86InstrXOP.td - XOP Instruction Set ---------------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// This file describes XOP (eXtended OPerations) +// +//===----------------------------------------------------------------------===// + +multiclass xop2op<bits<8> opc, string OpcodeStr, Intrinsic Int, PatFrag memop> { + def rr : IXOP<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (Int VR128:$src))]>, VEX; + def rm : IXOP<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (Int (bitconvert (memop addr:$src))))]>, VEX; +} + +let isAsmParserOnly = 1 in { + defm VPHSUBWD : xop2op<0xE2, "vphsubwd", int_x86_xop_vphsubwd, memopv2i64>; + defm VPHSUBDQ : xop2op<0xE3, "vphsubdq", int_x86_xop_vphsubdq, memopv2i64>; + defm VPHSUBBW : xop2op<0xE1, "vphsubbw", int_x86_xop_vphsubbw, memopv2i64>; + defm VPHADDWQ : xop2op<0xC7, "vphaddwq", int_x86_xop_vphaddwq, memopv2i64>; + defm VPHADDWD : xop2op<0xC6, "vphaddwd", int_x86_xop_vphaddwd, memopv2i64>; + defm VPHADDUWQ : xop2op<0xD7, "vphadduwq", int_x86_xop_vphadduwq, memopv2i64>; + defm VPHADDUWD : xop2op<0xD6, "vphadduwd", int_x86_xop_vphadduwd, memopv2i64>; + defm VPHADDUDQ : xop2op<0xDB, "vphaddudq", int_x86_xop_vphaddudq, memopv2i64>; + defm VPHADDUBW : xop2op<0xD1, "vphaddubw", int_x86_xop_vphaddubw, memopv2i64>; + defm VPHADDUBQ : xop2op<0xD3, "vphaddubq", int_x86_xop_vphaddubq, memopv2i64>; + defm VPHADDUBD : xop2op<0xD2, "vphaddubd", int_x86_xop_vphaddubd, memopv2i64>; + defm VPHADDDQ : xop2op<0xCB, "vphadddq", int_x86_xop_vphadddq, memopv2i64>; + defm VPHADDBW : xop2op<0xC1, "vphaddbw", int_x86_xop_vphaddbw, memopv2i64>; + defm VPHADDBQ : xop2op<0xC3, "vphaddbq", int_x86_xop_vphaddbq, memopv2i64>; + defm VPHADDBD : xop2op<0xC2, "vphaddbd", int_x86_xop_vphaddbd, memopv2i64>; + defm VFRCZPS : xop2op<0x80, "vfrczps", int_x86_xop_vfrcz_ps, memopv4f32>; + defm VFRCZPD : xop2op<0x81, "vfrczpd", int_x86_xop_vfrcz_pd, memopv2f64>; +} + +// Scalar load 2 addr operand instructions +let Constraints = "$src1 = $dst" in { +multiclass xop2opsld<bits<8> opc, string OpcodeStr, Intrinsic Int, + Operand memop, ComplexPattern mem_cpat> { + def rr : IXOP<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, + VR128:$src2), + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + [(set VR128:$dst, (Int VR128:$src1, VR128:$src2))]>, VEX; + def rm : IXOP<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, + memop:$src2), + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + [(set VR128:$dst, (Int VR128:$src1, + (bitconvert mem_cpat:$src2)))]>, VEX; +} + +} // Constraints = "$src1 = $dst" + +let isAsmParserOnly = 1 in { + defm VFRCZSS : xop2opsld<0x82, "vfrczss", int_x86_xop_vfrcz_ss, + ssmem, sse_load_f32>; + defm VFRCZSD : xop2opsld<0x83, "vfrczsd", int_x86_xop_vfrcz_sd, + sdmem, sse_load_f64>; +} + + +multiclass xop2op256<bits<8> opc, string OpcodeStr, Intrinsic Int, + PatFrag memop> { + def rrY : IXOP<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR256:$dst, (Int VR256:$src))]>, VEX, VEX_L; + def rmY : IXOP<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR256:$dst, (Int (bitconvert (memop addr:$src))))]>, VEX; +} + +let isAsmParserOnly = 1 in { + defm VFRCZPS : xop2op256<0x80, "vfrczps", int_x86_xop_vfrcz_ps_256, + memopv8f32>; + defm VFRCZPD : xop2op256<0x81, "vfrczpd", 
int_x86_xop_vfrcz_pd_256, + memopv4f64>; +} + +multiclass xop3op<bits<8> opc, string OpcodeStr, Intrinsic Int> { + def rr : IXOP<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR128:$dst, (Int VR128:$src1, VR128:$src2))]>, VEX_4VOp3; + def rm : IXOP<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, f128mem:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR128:$dst, + (Int VR128:$src1, (bitconvert (memopv2i64 addr:$src2))))]>, + VEX_4V, VEX_W; + def mr : IXOP<opc, MRMSrcMem, (outs VR128:$dst), + (ins f128mem:$src1, VR128:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR128:$dst, + (Int (bitconvert (memopv2i64 addr:$src1)), VR128:$src2))]>, + VEX_4VOp3; +} + +let isAsmParserOnly = 1 in { + defm VPSHLW : xop3op<0x95, "vpshlw", int_x86_xop_vpshlw>; + defm VPSHLQ : xop3op<0x97, "vpshlq", int_x86_xop_vpshlq>; + defm VPSHLD : xop3op<0x96, "vpshld", int_x86_xop_vpshld>; + defm VPSHLB : xop3op<0x94, "vpshlb", int_x86_xop_vpshlb>; + defm VPSHAW : xop3op<0x99, "vpshaw", int_x86_xop_vpshaw>; + defm VPSHAQ : xop3op<0x9B, "vpshaq", int_x86_xop_vpshaq>; + defm VPSHAD : xop3op<0x9A, "vpshad", int_x86_xop_vpshad>; + defm VPSHAB : xop3op<0x98, "vpshab", int_x86_xop_vpshab>; + defm VPROTW : xop3op<0x91, "vprotw", int_x86_xop_vprotw>; + defm VPROTQ : xop3op<0x93, "vprotq", int_x86_xop_vprotq>; + defm VPROTD : xop3op<0x92, "vprotd", int_x86_xop_vprotd>; + defm VPROTB : xop3op<0x90, "vprotb", int_x86_xop_vprotb>; +} + +multiclass xop3opimm<bits<8> opc, string OpcodeStr> { + let neverHasSideEffects = 1 in { + def ri : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, i8imm:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + []>, VEX; + let mayLoad = 1 in + def mi : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst), + (ins f128mem:$src1, i8imm:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + []>, VEX; + } +} + +let isAsmParserOnly = 1 in { + defm VPROTW : xop3opimm<0xC1, "vprotw">; + defm VPROTQ : xop3opimm<0xC3, "vprotq">; + defm VPROTD : xop3opimm<0xC2, "vprotd">; + defm VPROTB : xop3opimm<0xC0, "vprotb">; +} + +// Instruction where second source can be memory, but third must be register +multiclass xop4opm2<bits<8> opc, string OpcodeStr, Intrinsic Int> { + def rr : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, VR128:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set VR128:$dst, + (Int VR128:$src1, VR128:$src2, VR128:$src3))]>, VEX_4V, VEX_I8IMM; + def rm : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, f128mem:$src2, VR128:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set VR128:$dst, + (Int VR128:$src1, (bitconvert (memopv2i64 addr:$src2)), + VR128:$src3))]>, VEX_4V, VEX_I8IMM; +} + +let isAsmParserOnly = 1 in { + defm VPMADCSWD : xop4opm2<0xB6, "vpmadcswd", int_x86_xop_vpmadcswd>; + defm VPMADCSSWD : xop4opm2<0xA6, "vpmadcsswd", int_x86_xop_vpmadcsswd>; + defm VPMACSWW : xop4opm2<0x95, "vpmacsww", int_x86_xop_vpmacsww>; + defm VPMACSWD : xop4opm2<0x96, "vpmacswd", int_x86_xop_vpmacswd>; + defm VPMACSSWW : xop4opm2<0x85, "vpmacssww", int_x86_xop_vpmacssww>; + defm VPMACSSWD : xop4opm2<0x86, "vpmacsswd", int_x86_xop_vpmacsswd>; + defm VPMACSSDQL : xop4opm2<0x87, "vpmacssdql", int_x86_xop_vpmacssdql>; + defm VPMACSSDQH : 
xop4opm2<0x8F, "vpmacssdqh", int_x86_xop_vpmacssdqh>; + defm VPMACSSDD : xop4opm2<0x8E, "vpmacssdd", int_x86_xop_vpmacssdd>; + defm VPMACSDQL : xop4opm2<0x97, "vpmacsdql", int_x86_xop_vpmacsdql>; + defm VPMACSDQH : xop4opm2<0x9F, "vpmacsdqh", int_x86_xop_vpmacsdqh>; + defm VPMACSDD : xop4opm2<0x9E, "vpmacsdd", int_x86_xop_vpmacsdd>; +} + +// Instruction where second source can be memory, third must be imm8 +multiclass xop4opimm<bits<8> opc, string OpcodeStr, SDNode OpNode, + ValueType VT> { + def ri : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, i8imm:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set VR128:$dst, + (VT (OpNode VR128:$src1, VR128:$src2, imm:$src3)))]>, VEX_4V; + def mi : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, f128mem:$src2, i8imm:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set VR128:$dst, + (VT (OpNode VR128:$src1, (bitconvert (memopv2i64 addr:$src2)), + imm:$src3)))]>, VEX_4V; +} + +let isAsmParserOnly = 1 in { + defm VPCOMB : xop4opimm<0xCC, "vpcomb", X86vpcom, v16i8>; + defm VPCOMW : xop4opimm<0xCD, "vpcomw", X86vpcom, v8i16>; + defm VPCOMD : xop4opimm<0xCE, "vpcomd", X86vpcom, v4i32>; + defm VPCOMQ : xop4opimm<0xCF, "vpcomq", X86vpcom, v2i64>; + defm VPCOMUB : xop4opimm<0xEC, "vpcomub", X86vpcomu, v16i8>; + defm VPCOMUW : xop4opimm<0xED, "vpcomuw", X86vpcomu, v8i16>; + defm VPCOMUD : xop4opimm<0xEE, "vpcomud", X86vpcomu, v4i32>; + defm VPCOMUQ : xop4opimm<0xEF, "vpcomuq", X86vpcomu, v2i64>; +} + +// Instruction where either second or third source can be memory +multiclass xop4op<bits<8> opc, string OpcodeStr, Intrinsic Int> { + def rr : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, VR128:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set VR128:$dst, (Int VR128:$src1, VR128:$src2, VR128:$src3))]>, + VEX_4V, VEX_I8IMM; + def rm : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, f128mem:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set VR128:$dst, + (Int VR128:$src1, VR128:$src2, + (bitconvert (memopv2i64 addr:$src3))))]>, + VEX_4V, VEX_I8IMM, VEX_W, MemOp4; + def mr : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, f128mem:$src2, VR128:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set VR128:$dst, + (Int VR128:$src1, (bitconvert (memopv2i64 addr:$src2)), + VR128:$src3))]>, + VEX_4V, VEX_I8IMM; +} + +let isAsmParserOnly = 1 in { + defm VPPERM : xop4op<0xA3, "vpperm", int_x86_xop_vpperm>; + defm VPCMOV : xop4op<0xA2, "vpcmov", int_x86_xop_vpcmov>; +} + +multiclass xop4op256<bits<8> opc, string OpcodeStr, Intrinsic Int> { + def rrY : IXOPi8<opc, MRMSrcReg, (outs VR256:$dst), + (ins VR256:$src1, VR256:$src2, VR256:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set VR256:$dst, (Int VR256:$src1, VR256:$src2, VR256:$src3))]>, + VEX_4V, VEX_I8IMM; + def rmY : IXOPi8<opc, MRMSrcMem, (outs VR256:$dst), + (ins VR256:$src1, VR256:$src2, f256mem:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set VR256:$dst, + (Int VR256:$src1, VR256:$src2, + (bitconvert (memopv4i64 addr:$src3))))]>, + VEX_4V, VEX_I8IMM, VEX_W, MemOp4; + def mrY : IXOPi8<opc, MRMSrcMem, (outs VR256:$dst), + (ins VR256:$src1, f256mem:$src2, 
VR256:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set VR256:$dst, + (Int VR256:$src1, (bitconvert (memopv4i64 addr:$src2)), + VR256:$src3))]>, + VEX_4V, VEX_I8IMM; +} + +let isAsmParserOnly = 1 in { + defm VPCMOV : xop4op256<0xA2, "vpcmov", int_x86_xop_vpcmov_256>; +} + +multiclass xop5op<bits<8> opc, string OpcodeStr, Intrinsic Int128, + Intrinsic Int256, PatFrag ld_128, PatFrag ld_256> { + def rr : IXOP5<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, VR128:$src3, i8imm:$src4), + !strconcat(OpcodeStr, + "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"), + [(set VR128:$dst, + (Int128 VR128:$src1, VR128:$src2, VR128:$src3, imm:$src4))]>; + def rm : IXOP5<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, f128mem:$src3, i8imm:$src4), + !strconcat(OpcodeStr, + "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"), + [(set VR128:$dst, + (Int128 VR128:$src1, VR128:$src2, (ld_128 addr:$src3), imm:$src4))]>, + VEX_W, MemOp4; + def mr : IXOP5<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, f128mem:$src2, VR128:$src3, i8imm:$src4), + !strconcat(OpcodeStr, + "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"), + [(set VR128:$dst, + (Int128 VR128:$src1, (ld_128 addr:$src2), VR128:$src3, imm:$src4))]>; + def rrY : IXOP5<opc, MRMSrcReg, (outs VR256:$dst), + (ins VR256:$src1, VR256:$src2, VR256:$src3, i8imm:$src4), + !strconcat(OpcodeStr, + "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"), + [(set VR256:$dst, + (Int256 VR256:$src1, VR256:$src2, VR256:$src3, imm:$src4))]>; + def rmY : IXOP5<opc, MRMSrcMem, (outs VR256:$dst), + (ins VR256:$src1, VR256:$src2, f256mem:$src3, i8imm:$src4), + !strconcat(OpcodeStr, + "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"), + [(set VR256:$dst, + (Int256 VR256:$src1, VR256:$src2, (ld_256 addr:$src3), imm:$src4))]>, + VEX_W, MemOp4; + def mrY : IXOP5<opc, MRMSrcMem, (outs VR256:$dst), + (ins VR256:$src1, f256mem:$src2, VR256:$src3, i8imm:$src4), + !strconcat(OpcodeStr, + "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"), + [(set VR256:$dst, + (Int256 VR256:$src1, (ld_256 addr:$src2), VR256:$src3, imm:$src4))]>; +} + +defm VPERMIL2PD : xop5op<0x49, "vpermil2pd", int_x86_xop_vpermil2pd, + int_x86_xop_vpermil2pd_256, memopv2f64, memopv4f64>; +defm VPERMIL2PS : xop5op<0x48, "vpermil2ps", int_x86_xop_vpermil2ps, + int_x86_xop_vpermil2ps_256, memopv4f32, memopv8f32>; + diff --git a/contrib/llvm/lib/Target/X86/X86JITInfo.cpp b/contrib/llvm/lib/Target/X86/X86JITInfo.cpp index 3f88fa6..0168d12 100644 --- a/contrib/llvm/lib/Target/X86/X86JITInfo.cpp +++ b/contrib/llvm/lib/Target/X86/X86JITInfo.cpp @@ -300,7 +300,10 @@ extern "C" { SIZE(X86CompilationCallback_SSE) ); # else - void X86CompilationCallback2(intptr_t *StackPtr, intptr_t RetAddr); + // the following function is called only from this translation unit, + // unless we are under 64bit Windows with MSC, where there is + // no support for inline assembly + static void X86CompilationCallback2(intptr_t *StackPtr, intptr_t RetAddr); _declspec(naked) void X86CompilationCallback(void) { __asm { @@ -424,7 +427,9 @@ X86CompilationCallback2(intptr_t *StackPtr, intptr_t RetAddr) { TargetJITInfo::LazyResolverFn X86JITInfo::getLazyResolverFunction(JITCompilerFn F) { + TsanIgnoreWritesBegin(); JITCompilerFunction = F; + TsanIgnoreWritesEnd(); #if defined (X86_32_JIT) && !defined (_MSC_VER) if 
(Subtarget->hasSSE1()) @@ -569,6 +574,5 @@ char* X86JITInfo::allocateThreadLocalMemory(size_t size) { return TLSOffset; #else llvm_unreachable("Cannot allocate thread local storage on this arch!"); - return 0; #endif } diff --git a/contrib/llvm/lib/Target/X86/X86JITInfo.h b/contrib/llvm/lib/Target/X86/X86JITInfo.h index 238420c..c76d3cc 100644 --- a/contrib/llvm/lib/Target/X86/X86JITInfo.h +++ b/contrib/llvm/lib/Target/X86/X86JITInfo.h @@ -1,4 +1,4 @@ -//===- X86JITInfo.h - X86 implementation of the JIT interface --*- C++ -*-===// +//===-- X86JITInfo.h - X86 implementation of the JIT interface --*- C++ -*-===// // // The LLVM Compiler Infrastructure // diff --git a/contrib/llvm/lib/Target/X86/X86MCInstLower.cpp b/contrib/llvm/lib/Target/X86/X86MCInstLower.cpp index 50bc14d..b578e8d 100644 --- a/contrib/llvm/lib/Target/X86/X86MCInstLower.cpp +++ b/contrib/llvm/lib/Target/X86/X86MCInstLower.cpp @@ -12,10 +12,11 @@ // //===----------------------------------------------------------------------===// -#include "InstPrinter/X86ATTInstPrinter.h" #include "X86MCInstLower.h" #include "X86AsmPrinter.h" #include "X86COFFMachineModuleInfo.h" +#include "InstPrinter/X86ATTInstPrinter.h" +#include "llvm/Type.h" #include "llvm/CodeGen/MachineModuleInfoImpls.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" @@ -26,7 +27,6 @@ #include "llvm/Target/Mangler.h" #include "llvm/Support/FormattedStream.h" #include "llvm/ADT/SmallString.h" -#include "llvm/Type.h" using namespace llvm; X86MCInstLower::X86MCInstLower(Mangler *mang, const MachineFunction &mf, @@ -154,6 +154,7 @@ MCOperand X86MCInstLower::LowerSymbolOperand(const MachineOperand &MO, Ctx), Ctx); break; + case X86II::MO_SECREL: RefKind = MCSymbolRefExpr::VK_SECREL; break; case X86II::MO_TLSGD: RefKind = MCSymbolRefExpr::VK_TLSGD; break; case X86II::MO_GOTTPOFF: RefKind = MCSymbolRefExpr::VK_GOTTPOFF; break; case X86II::MO_INDNTPOFF: RefKind = MCSymbolRefExpr::VK_INDNTPOFF; break; @@ -230,7 +231,8 @@ static void LowerUnaryToTwoAddr(MCInst &OutMI, unsigned NewOpc) { /// a short fixed-register form. static void SimplifyShortImmForm(MCInst &Inst, unsigned Opcode) { unsigned ImmOp = Inst.getNumOperands() - 1; - assert(Inst.getOperand(0).isReg() && Inst.getOperand(ImmOp).isImm() && + assert(Inst.getOperand(0).isReg() && + (Inst.getOperand(ImmOp).isImm() || Inst.getOperand(ImmOp).isExpr()) && ((Inst.getNumOperands() == 3 && Inst.getOperand(1).isReg() && Inst.getOperand(0).getReg() == Inst.getOperand(1).getReg()) || Inst.getNumOperands() == 2) && "Unexpected instruction!"); @@ -335,6 +337,9 @@ void X86MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const { MCOp = LowerSymbolOperand(MO, AsmPrinter.GetBlockAddressSymbol(MO.getBlockAddress())); break; + case MachineOperand::MO_RegisterMask: + // Ignore call clobbers. 
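The hunk above teaches X86MCInstLower::Lower to accept MachineOperand::MO_RegisterMask operands (the call-clobber masks attached to call instructions) and simply drop them, since they have no MC encoding. A minimal standalone sketch of that filtering idea, using illustrative stand-in types rather than LLVM's real MachineOperand/MCInst classes:

  #include <vector>
  #include <cstdio>

  // Illustrative stand-ins; the real operand kinds live in LLVM.
  enum class OpKind { Register, Immediate, RegisterMask };
  struct Operand { OpKind Kind; int Value; };

  // Copy operands into the lowered form, dropping register-mask operands,
  // which only describe call-clobbered registers.
  std::vector<Operand> lower(const std::vector<Operand> &In) {
    std::vector<Operand> Out;
    for (const Operand &Op : In) {
      if (Op.Kind == OpKind::RegisterMask)
        continue;                      // ignore call clobbers
      Out.push_back(Op);
    }
    return Out;
  }

  int main() {
    std::vector<Operand> Call = {{OpKind::Register, 1},
                                 {OpKind::RegisterMask, 0},
                                 {OpKind::Immediate, 42}};
    std::printf("%zu operands survive lowering\n", lower(Call).size());
  }
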
+ continue; } OutMI.addOperand(MCOp); @@ -368,14 +373,12 @@ ReSimplify: case X86::SETB_C64r: LowerUnaryToTwoAddr(OutMI, X86::SBB64rr); break; case X86::MOV8r0: LowerUnaryToTwoAddr(OutMI, X86::XOR8rr); break; case X86::MOV32r0: LowerUnaryToTwoAddr(OutMI, X86::XOR32rr); break; - case X86::FsFLD0SS: LowerUnaryToTwoAddr(OutMI, X86::PXORrr); break; - case X86::FsFLD0SD: LowerUnaryToTwoAddr(OutMI, X86::PXORrr); break; - case X86::VFsFLD0SS: LowerUnaryToTwoAddr(OutMI, X86::VPXORrr); break; - case X86::VFsFLD0SD: LowerUnaryToTwoAddr(OutMI, X86::VPXORrr); break; case X86::V_SETALLONES: LowerUnaryToTwoAddr(OutMI, X86::PCMPEQDrr); break; case X86::AVX_SET0PSY: LowerUnaryToTwoAddr(OutMI, X86::VXORPSYrr); break; case X86::AVX_SET0PDY: LowerUnaryToTwoAddr(OutMI, X86::VXORPDYrr); break; case X86::AVX_SETALLONES: LowerUnaryToTwoAddr(OutMI, X86::VPCMPEQDrr); break; + case X86::AVX2_SETALLONES: LowerUnaryToTwoAddr(OutMI, X86::VPCMPEQDYrr);break; + case X86::AVX2_SET0: LowerUnaryToTwoAddr(OutMI, X86::VPXORYrr); break; case X86::MOV16r0: LowerSubReg32_Op0(OutMI, X86::MOV32r0); // MOV16r0 -> MOV32r0 @@ -386,14 +389,12 @@ ReSimplify: LowerUnaryToTwoAddr(OutMI, X86::XOR32rr); // MOV32r0 -> XOR32rr break; - // TAILJMPr64, [WIN]CALL64r, [WIN]CALL64pcrel32 - These instructions have - // register inputs modeled as normal uses instead of implicit uses. As such, - // truncate off all but the first operand (the callee). FIXME: Change isel. + // TAILJMPr64, CALL64r, CALL64pcrel32 - These instructions have register + // inputs modeled as normal uses instead of implicit uses. As such, truncate + // off all but the first operand (the callee). FIXME: Change isel. case X86::TAILJMPr64: case X86::CALL64r: - case X86::CALL64pcrel32: - case X86::WINCALL64r: - case X86::WINCALL64pcrel32: { + case X86::CALL64pcrel32: { unsigned Opcode = OutMI.getOpcode(); MCOperand Saved = OutMI.getOperand(0); OutMI = MCInst(); @@ -415,7 +416,7 @@ ReSimplify: case X86::TAILJMPd64: { unsigned Opcode; switch (OutMI.getOpcode()) { - default: assert(0 && "Invalid opcode"); + default: llvm_unreachable("Invalid opcode"); case X86::TAILJMPr: Opcode = X86::JMP32r; break; case X86::TAILJMPd: case X86::TAILJMPd64: Opcode = X86::JMP_1; break; @@ -527,6 +528,22 @@ ReSimplify: case X86::XOR16ri: SimplifyShortImmForm(OutMI, X86::XOR16i16); break; case X86::XOR32ri: SimplifyShortImmForm(OutMI, X86::XOR32i32); break; case X86::XOR64ri32: SimplifyShortImmForm(OutMI, X86::XOR64i32); break; + + case X86::MORESTACK_RET: + OutMI.setOpcode(X86::RET); + break; + + case X86::MORESTACK_RET_RESTORE_R10: { + MCInst retInst; + + OutMI.setOpcode(X86::MOV64rr); + OutMI.addOperand(MCOperand::CreateReg(X86::R10)); + OutMI.addOperand(MCOperand::CreateReg(X86::RAX)); + + retInst.setOpcode(X86::RET); + AsmPrinter.OutStreamer.EmitInstruction(retInst); + break; + } } } diff --git a/contrib/llvm/lib/Target/X86/X86MCInstLower.h b/contrib/llvm/lib/Target/X86/X86MCInstLower.h index 0210072..40df3db 100644 --- a/contrib/llvm/lib/Target/X86/X86MCInstLower.h +++ b/contrib/llvm/lib/Target/X86/X86MCInstLower.h @@ -1,4 +1,4 @@ -//===-- X86MCInstLower.h - Lower MachineInstr to MCInst -------------------===// +//===-- X86MCInstLower.h - Lower MachineInstr to MCInst ---------*- C++ -*-===// // // The LLVM Compiler Infrastructure // diff --git a/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.cpp b/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.cpp new file mode 100644 index 0000000..568dc22 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.cpp @@ -0,0 +1,14 @@ 
+//===-- X86MachineFuctionInfo.cpp - X86 machine function info -------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "X86MachineFunctionInfo.h" + +using namespace llvm; + +void X86MachineFunctionInfo::anchor() { } diff --git a/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.h b/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.h index b0bb313..c747109 100644 --- a/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.h +++ b/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.h @@ -1,10 +1,10 @@ -//====- X86MachineFuctionInfo.h - X86 machine function info -----*- C++ -*-===// -// +//===-- X86MachineFuctionInfo.h - X86 machine function info -----*- C++ -*-===// +// // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. -// +// //===----------------------------------------------------------------------===// // // This file declares X86-specific per-machine-function information. @@ -21,6 +21,8 @@ namespace llvm { /// X86MachineFunctionInfo - This class is derived from MachineFunction and /// contains private X86 target-specific information for each MachineFunction. class X86MachineFunctionInfo : public MachineFunctionInfo { + virtual void anchor(); + /// ForceFramePointer - True if the function is required to use of frame /// pointer for reasons other than it containing dynamic allocation or /// that FP eliminatation is turned off. For example, Cygwin main function diff --git a/contrib/llvm/lib/Target/X86/X86RegisterInfo.cpp b/contrib/llvm/lib/Target/X86/X86RegisterInfo.cpp index c1ac9f3..b56025f 100644 --- a/contrib/llvm/lib/Target/X86/X86RegisterInfo.cpp +++ b/contrib/llvm/lib/Target/X86/X86RegisterInfo.cpp @@ -1,4 +1,4 @@ -//===- X86RegisterInfo.cpp - X86 Register Information -----------*- C++ -*-===// +//===-- X86RegisterInfo.cpp - X86 Register Information --------------------===// // // The LLVM Compiler Infrastructure // @@ -13,8 +13,8 @@ // //===----------------------------------------------------------------------===// -#include "X86.h" #include "X86RegisterInfo.h" +#include "X86.h" #include "X86InstrBuilder.h" #include "X86MachineFunctionInfo.h" #include "X86Subtarget.h" @@ -127,121 +127,13 @@ const TargetRegisterClass * X86RegisterInfo::getMatchingSuperRegClass(const TargetRegisterClass *A, const TargetRegisterClass *B, unsigned SubIdx) const { - switch (SubIdx) { - default: return 0; - case X86::sub_8bit: - if (B == &X86::GR8RegClass) { - if (A->getSize() == 2 || A->getSize() == 4 || A->getSize() == 8) - return A; - } else if (B == &X86::GR8_ABCD_LRegClass || B == &X86::GR8_ABCD_HRegClass) { - if (A == &X86::GR64RegClass || A == &X86::GR64_ABCDRegClass || - A == &X86::GR64_NOREXRegClass || - A == &X86::GR64_NOSPRegClass || - A == &X86::GR64_NOREX_NOSPRegClass) - return &X86::GR64_ABCDRegClass; - else if (A == &X86::GR32RegClass || A == &X86::GR32_ABCDRegClass || - A == &X86::GR32_NOREXRegClass || - A == &X86::GR32_NOSPRegClass) - return &X86::GR32_ABCDRegClass; - else if (A == &X86::GR16RegClass || A == &X86::GR16_ABCDRegClass || - A == &X86::GR16_NOREXRegClass) - return &X86::GR16_ABCDRegClass; - } else if (B == &X86::GR8_NOREXRegClass) { - if (A == &X86::GR64RegClass || A == &X86::GR64_NOREXRegClass || - A == &X86::GR64_NOSPRegClass || A == 
&X86::GR64_NOREX_NOSPRegClass) - return &X86::GR64_NOREXRegClass; - else if (A == &X86::GR64_ABCDRegClass) - return &X86::GR64_ABCDRegClass; - else if (A == &X86::GR32RegClass || A == &X86::GR32_NOREXRegClass || - A == &X86::GR32_NOSPRegClass) - return &X86::GR32_NOREXRegClass; - else if (A == &X86::GR32_ABCDRegClass) - return &X86::GR32_ABCDRegClass; - else if (A == &X86::GR16RegClass || A == &X86::GR16_NOREXRegClass) - return &X86::GR16_NOREXRegClass; - else if (A == &X86::GR16_ABCDRegClass) - return &X86::GR16_ABCDRegClass; - } - break; - case X86::sub_8bit_hi: - if (B->hasSubClassEq(&X86::GR8_ABCD_HRegClass)) - switch (A->getSize()) { - case 2: return getCommonSubClass(A, &X86::GR16_ABCDRegClass); - case 4: return getCommonSubClass(A, &X86::GR32_ABCDRegClass); - case 8: return getCommonSubClass(A, &X86::GR64_ABCDRegClass); - default: return 0; - } - break; - case X86::sub_16bit: - if (B == &X86::GR16RegClass) { - if (A->getSize() == 4 || A->getSize() == 8) - return A; - } else if (B == &X86::GR16_ABCDRegClass) { - if (A == &X86::GR64RegClass || A == &X86::GR64_ABCDRegClass || - A == &X86::GR64_NOREXRegClass || - A == &X86::GR64_NOSPRegClass || - A == &X86::GR64_NOREX_NOSPRegClass) - return &X86::GR64_ABCDRegClass; - else if (A == &X86::GR32RegClass || A == &X86::GR32_ABCDRegClass || - A == &X86::GR32_NOREXRegClass || A == &X86::GR32_NOSPRegClass) - return &X86::GR32_ABCDRegClass; - } else if (B == &X86::GR16_NOREXRegClass) { - if (A == &X86::GR64RegClass || A == &X86::GR64_NOREXRegClass || - A == &X86::GR64_NOSPRegClass || A == &X86::GR64_NOREX_NOSPRegClass) - return &X86::GR64_NOREXRegClass; - else if (A == &X86::GR64_ABCDRegClass) - return &X86::GR64_ABCDRegClass; - else if (A == &X86::GR32RegClass || A == &X86::GR32_NOREXRegClass || - A == &X86::GR32_NOSPRegClass) - return &X86::GR32_NOREXRegClass; - else if (A == &X86::GR32_ABCDRegClass) - return &X86::GR64_ABCDRegClass; - } - break; - case X86::sub_32bit: - if (B == &X86::GR32RegClass) { - if (A->getSize() == 8) - return A; - } else if (B == &X86::GR32_NOSPRegClass) { - if (A == &X86::GR64RegClass || A == &X86::GR64_NOSPRegClass) - return &X86::GR64_NOSPRegClass; - if (A->getSize() == 8) - return getCommonSubClass(A, &X86::GR64_NOSPRegClass); - } else if (B == &X86::GR32_ABCDRegClass) { - if (A == &X86::GR64RegClass || A == &X86::GR64_ABCDRegClass || - A == &X86::GR64_NOREXRegClass || - A == &X86::GR64_NOSPRegClass || - A == &X86::GR64_NOREX_NOSPRegClass) - return &X86::GR64_ABCDRegClass; - } else if (B == &X86::GR32_NOREXRegClass) { - if (A == &X86::GR64RegClass || A == &X86::GR64_NOREXRegClass) - return &X86::GR64_NOREXRegClass; - else if (A == &X86::GR64_NOSPRegClass || A == &X86::GR64_NOREX_NOSPRegClass) - return &X86::GR64_NOREX_NOSPRegClass; - else if (A == &X86::GR64_ABCDRegClass) - return &X86::GR64_ABCDRegClass; - } else if (B == &X86::GR32_NOREX_NOSPRegClass) { - if (A == &X86::GR64RegClass || A == &X86::GR64_NOREXRegClass || - A == &X86::GR64_NOSPRegClass || A == &X86::GR64_NOREX_NOSPRegClass) - return &X86::GR64_NOREX_NOSPRegClass; - else if (A == &X86::GR64_ABCDRegClass) - return &X86::GR64_ABCDRegClass; - } - break; - case X86::sub_ss: - if (B == &X86::FR32RegClass) - return A; - break; - case X86::sub_sd: - if (B == &X86::FR64RegClass) - return A; - break; - case X86::sub_xmm: - if (B == &X86::VR128RegClass) - return A; - break; + // The sub_8bit sub-register index is more constrained in 32-bit mode. 
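The rewrite of getMatchingSuperRegClass above drops the long hand-written switch in favour of the TableGen-generated mapping; the one special case kept, spelled out just below, narrows a class feeding sub_8bit in 32-bit mode to registers that also expose sub_8bit_hi, because only EAX/EBX/ECX/EDX have addressable byte sub-registers without a REX prefix. A rough standalone illustration of that narrowing (the struct and function names are made up for the sketch):

  #include <cstdio>
  #include <string>
  #include <vector>

  // Only the ABCD registers have AH/BH/CH/DH high-byte parts.
  struct Reg { std::string Name; bool HasHighByte; };

  // Keep only registers whose low byte is encodable without REX in
  // 32-bit mode, i.e. those that also have a high-byte sub-register.
  std::vector<Reg> narrowForSub8(const std::vector<Reg> &Class, bool Is64Bit) {
    if (Is64Bit)
      return Class;                    // no restriction in 64-bit mode
    std::vector<Reg> Out;
    for (const Reg &R : Class)
      if (R.HasHighByte)
        Out.push_back(R);
    return Out;
  }

  int main() {
    std::vector<Reg> GR32 = {{"EAX", true}, {"ECX", true}, {"EDX", true},
                             {"EBX", true}, {"ESI", false}, {"EDI", false},
                             {"EBP", false}, {"ESP", false}};
    std::printf("32-bit sub_8bit candidates: %zu of %zu\n",
                narrowForSub8(GR32, false).size(), GR32.size());
  }
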
+ if (!Is64Bit && SubIdx == X86::sub_8bit) { + A = X86GenRegisterInfo::getSubClassWithSubReg(A, X86::sub_8bit_hi); + if (!A) + return 0; } - return 0; + return X86GenRegisterInfo::getMatchingSuperRegClass(A, B, SubIdx); } const TargetRegisterClass* @@ -334,7 +226,7 @@ X86RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, } } -const unsigned * +const uint16_t * X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { bool callsEHReturn = false; bool ghcCall = false; @@ -345,45 +237,29 @@ X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { ghcCall = (F ? F->getCallingConv() == CallingConv::GHC : false); } - static const unsigned GhcCalleeSavedRegs[] = { - 0 - }; - - static const unsigned CalleeSavedRegs32Bit[] = { - X86::ESI, X86::EDI, X86::EBX, X86::EBP, 0 - }; - - static const unsigned CalleeSavedRegs32EHRet[] = { - X86::EAX, X86::EDX, X86::ESI, X86::EDI, X86::EBX, X86::EBP, 0 - }; - - static const unsigned CalleeSavedRegs64Bit[] = { - X86::RBX, X86::R12, X86::R13, X86::R14, X86::R15, X86::RBP, 0 - }; - - static const unsigned CalleeSavedRegs64EHRet[] = { - X86::RAX, X86::RDX, X86::RBX, X86::R12, - X86::R13, X86::R14, X86::R15, X86::RBP, 0 - }; - - static const unsigned CalleeSavedRegsWin64[] = { - X86::RBX, X86::RBP, X86::RDI, X86::RSI, - X86::R12, X86::R13, X86::R14, X86::R15, - X86::XMM6, X86::XMM7, X86::XMM8, X86::XMM9, - X86::XMM10, X86::XMM11, X86::XMM12, X86::XMM13, - X86::XMM14, X86::XMM15, 0 - }; - - if (ghcCall) { - return GhcCalleeSavedRegs; - } else if (Is64Bit) { + if (ghcCall) + return CSR_Ghc_SaveList; + if (Is64Bit) { if (IsWin64) - return CalleeSavedRegsWin64; - else - return (callsEHReturn ? CalleeSavedRegs64EHRet : CalleeSavedRegs64Bit); - } else { - return (callsEHReturn ? CalleeSavedRegs32EHRet : CalleeSavedRegs32Bit); + return CSR_Win64_SaveList; + if (callsEHReturn) + return CSR_64EHRet_SaveList; + return CSR_64_SaveList; } + if (callsEHReturn) + return CSR_32EHRet_SaveList; + return CSR_32_SaveList; +} + +const uint32_t* +X86RegisterInfo::getCallPreservedMask(CallingConv::ID CC) const { + if (CC == CallingConv::GHC) + return CSR_Ghc_RegMask; + if (!Is64Bit) + return CSR_32_RegMask; + if (IsWin64) + return CSR_Win64_RegMask; + return CSR_64_RegMask; } BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const { @@ -428,16 +304,16 @@ BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const { for (unsigned n = 0; n != 8; ++n) { // R8, R9, ... - const unsigned GPR64[] = { + static const uint16_t GPR64[] = { X86::R8, X86::R9, X86::R10, X86::R11, X86::R12, X86::R13, X86::R14, X86::R15 }; - for (const unsigned *AI = getOverlaps(GPR64[n]); unsigned Reg = *AI; ++AI) + for (const uint16_t *AI = getOverlaps(GPR64[n]); unsigned Reg = *AI; ++AI) Reserved.set(Reg); // XMM8, XMM9, ... 
assert(X86::XMM15 == X86::XMM8+7); - for (const unsigned *AI = getOverlaps(X86::XMM8 + n); unsigned Reg = *AI; + for (const uint16_t *AI = getOverlaps(X86::XMM8 + n); unsigned Reg = *AI; ++AI) Reserved.set(Reg); } @@ -452,7 +328,7 @@ BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const { bool X86RegisterInfo::canRealignStack(const MachineFunction &MF) const { const MachineFrameInfo *MFI = MF.getFrameInfo(); - return (RealignStack && + return (MF.getTarget().Options.RealignStack && !MFI->hasVarSizedObjects()); } @@ -583,7 +459,7 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, // sure we restore the stack pointer immediately after the call, there may // be spill code inserted between the CALL and ADJCALLSTACKUP instructions. MachineBasicBlock::iterator B = MBB.begin(); - while (I != B && !llvm::prior(I)->getDesc().isCall()) + while (I != B && !llvm::prior(I)->isCall()) --I; MBB.insert(I, New); } @@ -650,12 +526,10 @@ unsigned X86RegisterInfo::getFrameRegister(const MachineFunction &MF) const { unsigned X86RegisterInfo::getEHExceptionRegister() const { llvm_unreachable("What is the exception register"); - return 0; } unsigned X86RegisterInfo::getEHHandlerRegister() const { llvm_unreachable("What is the exception handler register"); - return 0; } namespace llvm { @@ -665,7 +539,7 @@ unsigned getX86SubSuperRegister(unsigned Reg, EVT VT, bool High) { case MVT::i8: if (High) { switch (Reg) { - default: return 0; + default: return getX86SubSuperRegister(Reg, MVT::i64, High); case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX: return X86::AH; case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX: @@ -785,6 +659,22 @@ unsigned getX86SubSuperRegister(unsigned Reg, EVT VT, bool High) { return X86::R15D; } case MVT::i64: + // For 64-bit mode if we've requested a "high" register and the + // Q or r constraints we want one of these high registers or + // just the register name otherwise. + if (High) { + switch (Reg) { + case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI: + return X86::SI; + case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI: + return X86::DI; + case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP: + return X86::BP; + case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP: + return X86::SP; + // Fallthrough. + } + } switch (Reg) { default: return Reg; case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX: @@ -821,8 +711,6 @@ unsigned getX86SubSuperRegister(unsigned Reg, EVT VT, bool High) { return X86::R15; } } - - return Reg; } } diff --git a/contrib/llvm/lib/Target/X86/X86RegisterInfo.h b/contrib/llvm/lib/Target/X86/X86RegisterInfo.h index 7d39c68..bee0393 100644 --- a/contrib/llvm/lib/Target/X86/X86RegisterInfo.h +++ b/contrib/llvm/lib/Target/X86/X86RegisterInfo.h @@ -1,4 +1,4 @@ -//===- X86RegisterInfo.h - X86 Register Information Impl --------*- C++ -*-===// +//===-- X86RegisterInfo.h - X86 Register Information Impl -------*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -95,7 +95,8 @@ public: /// getCalleeSavedRegs - Return a null-terminated list of all of the /// callee-save registers on this target. 
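In the .cpp hunk above the hand-written callee-saved arrays are replaced by TableGen-generated CSR_*_SaveList tables, and the new getCallPreservedMask hook (declared just below) returns the matching CSR_*_RegMask. A small self-contained sketch of the same selection logic, using placeholder register numbers instead of the generated tables:

  #include <cstdint>
  #include <cstdio>

  // Placeholder null-terminated save lists; the real CSR_* arrays are
  // generated by TableGen from the calling-convention definitions.
  static const uint16_t CSR_Ghc[]     = {0};
  static const uint16_t CSR_32[]      = {/*ESI*/6, /*EDI*/7, /*EBX*/3, /*EBP*/5, 0};
  static const uint16_t CSR_32EHRet[] = {/*EAX*/1, /*EDX*/2, 6, 7, 3, 5, 0};
  static const uint16_t CSR_64[]      = {/*RBX*/3, 12, 13, 14, 15, /*RBP*/5, 0};
  static const uint16_t CSR_64EHRet[] = {1, 2, 3, 12, 13, 14, 15, 5, 0};
  static const uint16_t CSR_Win64[]   = {3, 5, 7, 6, 12, 13, 14, 15, 0};

  const uint16_t *getCalleeSavedRegs(bool IsGhc, bool Is64Bit, bool IsWin64,
                                     bool CallsEHReturn) {
    if (IsGhc)
      return CSR_Ghc;                  // GHC convention saves nothing
    if (Is64Bit) {
      if (IsWin64)
        return CSR_Win64;
      return CallsEHReturn ? CSR_64EHRet : CSR_64;
    }
    return CallsEHReturn ? CSR_32EHRet : CSR_32;
  }

  int main() {
    const uint16_t *L = getCalleeSavedRegs(false, true, false, false);
    unsigned N = 0;
    while (L[N]) ++N;
    std::printf("64-bit SysV callee-saved count: %u\n", N);  // prints 6
  }
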
- const unsigned *getCalleeSavedRegs(const MachineFunction* MF = 0) const; + const uint16_t *getCalleeSavedRegs(const MachineFunction* MF = 0) const; + const uint32_t *getCallPreservedMask(CallingConv::ID) const; /// getReservedRegs - Returns a bitset indexed by physical register number /// indicating if a register is a special register that has particular uses and diff --git a/contrib/llvm/lib/Target/X86/X86RegisterInfo.td b/contrib/llvm/lib/Target/X86/X86RegisterInfo.td index 9a7db36..5263a49 100644 --- a/contrib/llvm/lib/Target/X86/X86RegisterInfo.td +++ b/contrib/llvm/lib/Target/X86/X86RegisterInfo.td @@ -70,7 +70,7 @@ let Namespace = "X86" in { def BH : Register<"bh">; // 16-bit registers - let SubRegIndices = [sub_8bit, sub_8bit_hi] in { + let SubRegIndices = [sub_8bit, sub_8bit_hi], CoveredBySubRegs = 1 in { def AX : RegisterWithSubRegs<"ax", [AL,AH]>; def DX : RegisterWithSubRegs<"dx", [DL,DH]>; def CX : RegisterWithSubRegs<"cx", [CL,CH]>; diff --git a/contrib/llvm/lib/Target/X86/X86Relocations.h b/contrib/llvm/lib/Target/X86/X86Relocations.h index 990962d..857becf 100644 --- a/contrib/llvm/lib/Target/X86/X86Relocations.h +++ b/contrib/llvm/lib/Target/X86/X86Relocations.h @@ -1,4 +1,4 @@ -//===- X86Relocations.h - X86 Code Relocations ------------------*- C++ -*-===// +//===-- X86Relocations.h - X86 Code Relocations -----------------*- C++ -*-===// // // The LLVM Compiler Infrastructure // diff --git a/contrib/llvm/lib/Target/X86/X86Schedule.td b/contrib/llvm/lib/Target/X86/X86Schedule.td new file mode 100644 index 0000000..17f4efd --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86Schedule.td @@ -0,0 +1,273 @@ +//===-- X86Schedule.td - X86 Scheduling Definitions --------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Instruction Itinerary classes used for X86 +def IIC_DEFAULT : InstrItinClass; +def IIC_ALU_MEM : InstrItinClass; +def IIC_ALU_NONMEM : InstrItinClass; +def IIC_LEA : InstrItinClass; +def IIC_LEA_16 : InstrItinClass; +def IIC_MUL8 : InstrItinClass; +def IIC_MUL16_MEM : InstrItinClass; +def IIC_MUL16_REG : InstrItinClass; +def IIC_MUL32_MEM : InstrItinClass; +def IIC_MUL32_REG : InstrItinClass; +def IIC_MUL64 : InstrItinClass; +// imul by al, ax, eax, tax +def IIC_IMUL8 : InstrItinClass; +def IIC_IMUL16_MEM : InstrItinClass; +def IIC_IMUL16_REG : InstrItinClass; +def IIC_IMUL32_MEM : InstrItinClass; +def IIC_IMUL32_REG : InstrItinClass; +def IIC_IMUL64 : InstrItinClass; +// imul reg by reg|mem +def IIC_IMUL16_RM : InstrItinClass; +def IIC_IMUL16_RR : InstrItinClass; +def IIC_IMUL32_RM : InstrItinClass; +def IIC_IMUL32_RR : InstrItinClass; +def IIC_IMUL64_RM : InstrItinClass; +def IIC_IMUL64_RR : InstrItinClass; +// imul reg = reg/mem * imm +def IIC_IMUL16_RMI : InstrItinClass; +def IIC_IMUL16_RRI : InstrItinClass; +def IIC_IMUL32_RMI : InstrItinClass; +def IIC_IMUL32_RRI : InstrItinClass; +def IIC_IMUL64_RMI : InstrItinClass; +def IIC_IMUL64_RRI : InstrItinClass; +// div +def IIC_DIV8_MEM : InstrItinClass; +def IIC_DIV8_REG : InstrItinClass; +def IIC_DIV16 : InstrItinClass; +def IIC_DIV32 : InstrItinClass; +def IIC_DIV64 : InstrItinClass; +// idiv +def IIC_IDIV8 : InstrItinClass; +def IIC_IDIV16 : InstrItinClass; +def IIC_IDIV32 : InstrItinClass; +def IIC_IDIV64 : InstrItinClass; +// neg/not/inc/dec +def IIC_UNARY_REG : InstrItinClass; +def IIC_UNARY_MEM : InstrItinClass; +// add/sub/and/or/xor/adc/sbc/cmp/test +def IIC_BIN_MEM : InstrItinClass; +def IIC_BIN_NONMEM : InstrItinClass; +// shift/rotate +def IIC_SR : InstrItinClass; +// shift double +def IIC_SHD16_REG_IM : InstrItinClass; +def IIC_SHD16_REG_CL : InstrItinClass; +def IIC_SHD16_MEM_IM : InstrItinClass; +def IIC_SHD16_MEM_CL : InstrItinClass; +def IIC_SHD32_REG_IM : InstrItinClass; +def IIC_SHD32_REG_CL : InstrItinClass; +def IIC_SHD32_MEM_IM : InstrItinClass; +def IIC_SHD32_MEM_CL : InstrItinClass; +def IIC_SHD64_REG_IM : InstrItinClass; +def IIC_SHD64_REG_CL : InstrItinClass; +def IIC_SHD64_MEM_IM : InstrItinClass; +def IIC_SHD64_MEM_CL : InstrItinClass; +// cmov +def IIC_CMOV16_RM : InstrItinClass; +def IIC_CMOV16_RR : InstrItinClass; +def IIC_CMOV32_RM : InstrItinClass; +def IIC_CMOV32_RR : InstrItinClass; +def IIC_CMOV64_RM : InstrItinClass; +def IIC_CMOV64_RR : InstrItinClass; +// set +def IIC_SET_R : InstrItinClass; +def IIC_SET_M : InstrItinClass; +// jmp/jcc/jcxz +def IIC_Jcc : InstrItinClass; +def IIC_JCXZ : InstrItinClass; +def IIC_JMP_REL : InstrItinClass; +def IIC_JMP_REG : InstrItinClass; +def IIC_JMP_MEM : InstrItinClass; +def IIC_JMP_FAR_MEM : InstrItinClass; +def IIC_JMP_FAR_PTR : InstrItinClass; +// loop +def IIC_LOOP : InstrItinClass; +def IIC_LOOPE : InstrItinClass; +def IIC_LOOPNE : InstrItinClass; +// call +def IIC_CALL_RI : InstrItinClass; +def IIC_CALL_MEM : InstrItinClass; +def IIC_CALL_FAR_MEM : InstrItinClass; +def IIC_CALL_FAR_PTR : InstrItinClass; +// ret +def IIC_RET : InstrItinClass; +def IIC_RET_IMM : InstrItinClass; +//sign extension movs +def IIC_MOVSX : InstrItinClass; +def IIC_MOVSX_R16_R8 : InstrItinClass; +def IIC_MOVSX_R16_M8 : InstrItinClass; +def IIC_MOVSX_R16_R16 : InstrItinClass; +def IIC_MOVSX_R32_R32 : 
InstrItinClass; +//zero extension movs +def IIC_MOVZX : InstrItinClass; +def IIC_MOVZX_R16_R8 : InstrItinClass; +def IIC_MOVZX_R16_M8 : InstrItinClass; + +def IIC_REP_MOVS : InstrItinClass; +def IIC_REP_STOS : InstrItinClass; + +// SSE scalar/parallel binary operations +def IIC_SSE_ALU_F32S_RR : InstrItinClass; +def IIC_SSE_ALU_F32S_RM : InstrItinClass; +def IIC_SSE_ALU_F64S_RR : InstrItinClass; +def IIC_SSE_ALU_F64S_RM : InstrItinClass; +def IIC_SSE_MUL_F32S_RR : InstrItinClass; +def IIC_SSE_MUL_F32S_RM : InstrItinClass; +def IIC_SSE_MUL_F64S_RR : InstrItinClass; +def IIC_SSE_MUL_F64S_RM : InstrItinClass; +def IIC_SSE_DIV_F32S_RR : InstrItinClass; +def IIC_SSE_DIV_F32S_RM : InstrItinClass; +def IIC_SSE_DIV_F64S_RR : InstrItinClass; +def IIC_SSE_DIV_F64S_RM : InstrItinClass; +def IIC_SSE_ALU_F32P_RR : InstrItinClass; +def IIC_SSE_ALU_F32P_RM : InstrItinClass; +def IIC_SSE_ALU_F64P_RR : InstrItinClass; +def IIC_SSE_ALU_F64P_RM : InstrItinClass; +def IIC_SSE_MUL_F32P_RR : InstrItinClass; +def IIC_SSE_MUL_F32P_RM : InstrItinClass; +def IIC_SSE_MUL_F64P_RR : InstrItinClass; +def IIC_SSE_MUL_F64P_RM : InstrItinClass; +def IIC_SSE_DIV_F32P_RR : InstrItinClass; +def IIC_SSE_DIV_F32P_RM : InstrItinClass; +def IIC_SSE_DIV_F64P_RR : InstrItinClass; +def IIC_SSE_DIV_F64P_RM : InstrItinClass; + +def IIC_SSE_COMIS_RR : InstrItinClass; +def IIC_SSE_COMIS_RM : InstrItinClass; + +def IIC_SSE_HADDSUB_RR : InstrItinClass; +def IIC_SSE_HADDSUB_RM : InstrItinClass; + +def IIC_SSE_BIT_P_RR : InstrItinClass; +def IIC_SSE_BIT_P_RM : InstrItinClass; + +def IIC_SSE_INTALU_P_RR : InstrItinClass; +def IIC_SSE_INTALU_P_RM : InstrItinClass; +def IIC_SSE_INTALUQ_P_RR : InstrItinClass; +def IIC_SSE_INTALUQ_P_RM : InstrItinClass; + +def IIC_SSE_INTMUL_P_RR : InstrItinClass; +def IIC_SSE_INTMUL_P_RM : InstrItinClass; + +def IIC_SSE_INTSH_P_RR : InstrItinClass; +def IIC_SSE_INTSH_P_RM : InstrItinClass; +def IIC_SSE_INTSH_P_RI : InstrItinClass; + +def IIC_SSE_CMPP_RR : InstrItinClass; +def IIC_SSE_CMPP_RM : InstrItinClass; + +def IIC_SSE_SHUFP : InstrItinClass; +def IIC_SSE_PSHUF : InstrItinClass; + +def IIC_SSE_UNPCK : InstrItinClass; + +def IIC_SSE_MOVMSK : InstrItinClass; +def IIC_SSE_MASKMOV : InstrItinClass; + +def IIC_SSE_PEXTRW : InstrItinClass; +def IIC_SSE_PINSRW : InstrItinClass; + +def IIC_SSE_PABS_RR : InstrItinClass; +def IIC_SSE_PABS_RM : InstrItinClass; + +def IIC_SSE_SQRTP_RR : InstrItinClass; +def IIC_SSE_SQRTP_RM : InstrItinClass; +def IIC_SSE_SQRTS_RR : InstrItinClass; +def IIC_SSE_SQRTS_RM : InstrItinClass; + +def IIC_SSE_RCPP_RR : InstrItinClass; +def IIC_SSE_RCPP_RM : InstrItinClass; +def IIC_SSE_RCPS_RR : InstrItinClass; +def IIC_SSE_RCPS_RM : InstrItinClass; + +def IIC_SSE_MOV_S_RR : InstrItinClass; +def IIC_SSE_MOV_S_RM : InstrItinClass; +def IIC_SSE_MOV_S_MR : InstrItinClass; + +def IIC_SSE_MOVA_P_RR : InstrItinClass; +def IIC_SSE_MOVA_P_RM : InstrItinClass; +def IIC_SSE_MOVA_P_MR : InstrItinClass; + +def IIC_SSE_MOVU_P_RR : InstrItinClass; +def IIC_SSE_MOVU_P_RM : InstrItinClass; +def IIC_SSE_MOVU_P_MR : InstrItinClass; + +def IIC_SSE_MOVDQ : InstrItinClass; +def IIC_SSE_MOVD_ToGP : InstrItinClass; +def IIC_SSE_MOVQ_RR : InstrItinClass; + +def IIC_SSE_MOV_LH : InstrItinClass; + +def IIC_SSE_LDDQU : InstrItinClass; + +def IIC_SSE_MOVNT : InstrItinClass; + +def IIC_SSE_PHADDSUBD_RR : InstrItinClass; +def IIC_SSE_PHADDSUBD_RM : InstrItinClass; +def IIC_SSE_PHADDSUBSW_RR : InstrItinClass; +def IIC_SSE_PHADDSUBSW_RM : InstrItinClass; +def IIC_SSE_PHADDSUBW_RR : InstrItinClass; +def IIC_SSE_PHADDSUBW_RM 
: InstrItinClass; +def IIC_SSE_PSHUFB_RR : InstrItinClass; +def IIC_SSE_PSHUFB_RM : InstrItinClass; +def IIC_SSE_PSIGN_RR : InstrItinClass; +def IIC_SSE_PSIGN_RM : InstrItinClass; + +def IIC_SSE_PMADD : InstrItinClass; +def IIC_SSE_PMULHRSW : InstrItinClass; +def IIC_SSE_PALIGNR : InstrItinClass; +def IIC_SSE_MWAIT : InstrItinClass; +def IIC_SSE_MONITOR : InstrItinClass; + +def IIC_SSE_PREFETCH : InstrItinClass; +def IIC_SSE_PAUSE : InstrItinClass; +def IIC_SSE_LFENCE : InstrItinClass; +def IIC_SSE_MFENCE : InstrItinClass; +def IIC_SSE_SFENCE : InstrItinClass; +def IIC_SSE_LDMXCSR : InstrItinClass; +def IIC_SSE_STMXCSR : InstrItinClass; + +def IIC_SSE_CVT_PD_RR : InstrItinClass; +def IIC_SSE_CVT_PD_RM : InstrItinClass; +def IIC_SSE_CVT_PS_RR : InstrItinClass; +def IIC_SSE_CVT_PS_RM : InstrItinClass; +def IIC_SSE_CVT_PI2PS_RR : InstrItinClass; +def IIC_SSE_CVT_PI2PS_RM : InstrItinClass; +def IIC_SSE_CVT_Scalar_RR : InstrItinClass; +def IIC_SSE_CVT_Scalar_RM : InstrItinClass; +def IIC_SSE_CVT_SS2SI32_RM : InstrItinClass; +def IIC_SSE_CVT_SS2SI32_RR : InstrItinClass; +def IIC_SSE_CVT_SS2SI64_RM : InstrItinClass; +def IIC_SSE_CVT_SS2SI64_RR : InstrItinClass; +def IIC_SSE_CVT_SD2SI_RM : InstrItinClass; +def IIC_SSE_CVT_SD2SI_RR : InstrItinClass; + +def IIC_CMPX_LOCK : InstrItinClass; +def IIC_CMPX_LOCK_8 : InstrItinClass; +def IIC_CMPX_LOCK_8B : InstrItinClass; +def IIC_CMPX_LOCK_16B : InstrItinClass; + +def IIC_XADD_LOCK_MEM : InstrItinClass; +def IIC_XADD_LOCK_MEM8 : InstrItinClass; + + +//===----------------------------------------------------------------------===// +// Processor instruction itineraries. + +def GenericItineraries : ProcessorItineraries<[], [], []>; + +include "X86ScheduleAtom.td" + + + diff --git a/contrib/llvm/lib/Target/X86/X86ScheduleAtom.td b/contrib/llvm/lib/Target/X86/X86ScheduleAtom.td new file mode 100644 index 0000000..77d4e56 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86ScheduleAtom.td @@ -0,0 +1,305 @@ +//===- X86ScheduleAtom.td - X86 Atom Scheduling Definitions -*- tablegen -*-==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the itinerary class data for the Intel Atom (Bonnell) +// processors. +// +//===----------------------------------------------------------------------===// + +// +// Scheduling information derived from the "Intel 64 and IA32 Architectures +// Optimization Reference Manual", Chapter 13, Section 4. 
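The itinerary entries that follow bind each IIC_* class to one or both of Atom's two issue ports together with a cycle count (for example IIC_MUL64 occupies both ports for 12 cycles). A tiny sketch of how such a table could be consumed, with a few sample entries copied from the data below:

  #include <cstdio>
  #include <map>
  #include <string>

  // Which Atom ports an itinerary class may use, plus its cycle count.
  struct Itin { bool Port0, Port1; unsigned Cycles; };

  // Sample entries mirroring the AtomItineraries table below.
  static const std::map<std::string, Itin> AtomItins = {
    {"IIC_ALU_NONMEM", {true,  true,  1}},   // either port, 1 cycle
    {"IIC_LEA",        {false, true,  1}},   // Port1 only
    {"IIC_MUL64",      {true,  true, 12}},   // both ports, 12 cycles
  };

  unsigned latency(const std::string &Class) {
    auto It = AtomItins.find(Class);
    return It == AtomItins.end() ? 1 /* IIC_DEFAULT */ : It->second.Cycles;
  }

  int main() {
    std::printf("64-bit multiply latency on Atom: %u cycles\n",
                latency("IIC_MUL64"));
  }
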
+// Functional Units +// Port 0 +def Port0 : FuncUnit; // ALU: ALU0, shift/rotate, load/store + // SIMD/FP: SIMD ALU, Shuffle,SIMD/FP multiply, divide +def Port1 : FuncUnit; // ALU: ALU1, bit processing, jump, and LEA + // SIMD/FP: SIMD ALU, FP Adder + +def AtomItineraries : ProcessorItineraries< + [ Port0, Port1 ], + [], [ + // P0 only + // InstrItinData<class, [InstrStage<N, [P0]>] >, + // P0 or P1 + // InstrItinData<class, [InstrStage<N, [P0, P1]>] >, + // P0 and P1 + // InstrItinData<class, [InstrStage<N, [P0], 0>, InstrStage<N, [P1]>] >, + // + // Default is 1 cycle, port0 or port1 + InstrItinData<IIC_DEFAULT, [InstrStage<1, [Port0, Port1]>] >, + InstrItinData<IIC_ALU_MEM, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_ALU_NONMEM, [InstrStage<1, [Port0, Port1]>] >, + InstrItinData<IIC_LEA, [InstrStage<1, [Port1]>] >, + InstrItinData<IIC_LEA_16, [InstrStage<2, [Port0, Port1]>] >, + // mul + InstrItinData<IIC_MUL8, [InstrStage<7, [Port0, Port1]>] >, + InstrItinData<IIC_MUL16_MEM, [InstrStage<8, [Port0, Port1]>] >, + InstrItinData<IIC_MUL16_REG, [InstrStage<7, [Port0, Port1]>] >, + InstrItinData<IIC_MUL32_MEM, [InstrStage<7, [Port0, Port1]>] >, + InstrItinData<IIC_MUL32_REG, [InstrStage<6, [Port0, Port1]>] >, + InstrItinData<IIC_MUL64, [InstrStage<12, [Port0, Port1]>] >, + // imul by al, ax, eax, rax + InstrItinData<IIC_IMUL8, [InstrStage<7, [Port0, Port1]>] >, + InstrItinData<IIC_IMUL16_MEM, [InstrStage<8, [Port0, Port1]>] >, + InstrItinData<IIC_IMUL16_REG, [InstrStage<7, [Port0, Port1]>] >, + InstrItinData<IIC_IMUL32_MEM, [InstrStage<7, [Port0, Port1]>] >, + InstrItinData<IIC_IMUL32_REG, [InstrStage<6, [Port0, Port1]>] >, + InstrItinData<IIC_IMUL64, [InstrStage<12, [Port0, Port1]>] >, + // imul reg by reg|mem + InstrItinData<IIC_IMUL16_RM, [InstrStage<7, [Port0, Port1]>] >, + InstrItinData<IIC_IMUL16_RR, [InstrStage<6, [Port0, Port1]>] >, + InstrItinData<IIC_IMUL32_RM, [InstrStage<5, [Port0]>] >, + InstrItinData<IIC_IMUL32_RR, [InstrStage<5, [Port0]>] >, + InstrItinData<IIC_IMUL64_RM, [InstrStage<12, [Port0, Port1]>] >, + InstrItinData<IIC_IMUL64_RR, [InstrStage<12, [Port0, Port1]>] >, + // imul reg = reg/mem * imm + InstrItinData<IIC_IMUL16_RRI, [InstrStage<6, [Port0, Port1]>] >, + InstrItinData<IIC_IMUL32_RRI, [InstrStage<5, [Port0]>] >, + InstrItinData<IIC_IMUL64_RRI, [InstrStage<14, [Port0, Port1]>] >, + InstrItinData<IIC_IMUL16_RMI, [InstrStage<7, [Port0, Port1]>] >, + InstrItinData<IIC_IMUL32_RMI, [InstrStage<5, [Port0]>] >, + InstrItinData<IIC_IMUL64_RMI, [InstrStage<14, [Port0, Port1]>] >, + // idiv + InstrItinData<IIC_IDIV8, [InstrStage<62, [Port0, Port1]>] >, + InstrItinData<IIC_IDIV16, [InstrStage<62, [Port0, Port1]>] >, + InstrItinData<IIC_IDIV32, [InstrStage<62, [Port0, Port1]>] >, + InstrItinData<IIC_IDIV64, [InstrStage<130, [Port0, Port1]>] >, + // div + InstrItinData<IIC_DIV8_REG, [InstrStage<50, [Port0, Port1]>] >, + InstrItinData<IIC_DIV8_MEM, [InstrStage<68, [Port0, Port1]>] >, + InstrItinData<IIC_DIV16, [InstrStage<50, [Port0, Port1]>] >, + InstrItinData<IIC_DIV32, [InstrStage<50, [Port0, Port1]>] >, + InstrItinData<IIC_DIV64, [InstrStage<130, [Port0, Port1]>] >, + // neg/not/inc/dec + InstrItinData<IIC_UNARY_REG, [InstrStage<1, [Port0, Port1]>] >, + InstrItinData<IIC_UNARY_MEM, [InstrStage<1, [Port0]>] >, + // add/sub/and/or/xor/adc/sbc/cmp/test + InstrItinData<IIC_BIN_NONMEM, [InstrStage<1, [Port0, Port1]>] >, + InstrItinData<IIC_BIN_MEM, [InstrStage<1, [Port0]>] >, + // shift/rotate + InstrItinData<IIC_SR, [InstrStage<1, [Port0]>] >, + // shift double + 
InstrItinData<IIC_SHD16_REG_IM, [InstrStage<6, [Port0, Port1]>] >, + InstrItinData<IIC_SHD16_REG_CL, [InstrStage<6, [Port0, Port1]>] >, + InstrItinData<IIC_SHD16_MEM_IM, [InstrStage<6, [Port0, Port1]>] >, + InstrItinData<IIC_SHD16_MEM_CL, [InstrStage<6, [Port0, Port1]>] >, + InstrItinData<IIC_SHD32_REG_IM, [InstrStage<2, [Port0, Port1]>] >, + InstrItinData<IIC_SHD32_REG_CL, [InstrStage<2, [Port0, Port1]>] >, + InstrItinData<IIC_SHD32_MEM_IM, [InstrStage<4, [Port0, Port1]>] >, + InstrItinData<IIC_SHD32_MEM_CL, [InstrStage<4, [Port0, Port1]>] >, + InstrItinData<IIC_SHD64_REG_IM, [InstrStage<9, [Port0, Port1]>] >, + InstrItinData<IIC_SHD64_REG_CL, [InstrStage<8, [Port0, Port1]>] >, + InstrItinData<IIC_SHD64_MEM_IM, [InstrStage<9, [Port0, Port1]>] >, + InstrItinData<IIC_SHD64_MEM_CL, [InstrStage<9, [Port0, Port1]>] >, + // cmov + InstrItinData<IIC_CMOV16_RM, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_CMOV16_RR, [InstrStage<1, [Port0, Port1]>] >, + InstrItinData<IIC_CMOV32_RM, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_CMOV32_RR, [InstrStage<1, [Port0, Port1]>] >, + InstrItinData<IIC_CMOV64_RM, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_CMOV64_RR, [InstrStage<1, [Port0, Port1]>] >, + // set + InstrItinData<IIC_SET_M, [InstrStage<2, [Port0, Port1]>] >, + InstrItinData<IIC_SET_R, [InstrStage<1, [Port0, Port1]>] >, + // jcc + InstrItinData<IIC_Jcc, [InstrStage<1, [Port1]>] >, + // jcxz/jecxz/jrcxz + InstrItinData<IIC_JCXZ, [InstrStage<4, [Port0, Port1]>] >, + // jmp rel + InstrItinData<IIC_JMP_REL, [InstrStage<1, [Port1]>] >, + // jmp indirect + InstrItinData<IIC_JMP_REG, [InstrStage<1, [Port1]>] >, + InstrItinData<IIC_JMP_MEM, [InstrStage<2, [Port0, Port1]>] >, + // jmp far + InstrItinData<IIC_JMP_FAR_MEM, [InstrStage<32, [Port0, Port1]>] >, + InstrItinData<IIC_JMP_FAR_PTR, [InstrStage<31, [Port0, Port1]>] >, + // loop/loope/loopne + InstrItinData<IIC_LOOP, [InstrStage<18, [Port0, Port1]>] >, + InstrItinData<IIC_LOOPE, [InstrStage<8, [Port0, Port1]>] >, + InstrItinData<IIC_LOOPNE, [InstrStage<17, [Port0, Port1]>] >, + // call - all but reg/imm + InstrItinData<IIC_CALL_RI, [InstrStage<1, [Port0], 0>, + InstrStage<1, [Port1]>] >, + InstrItinData<IIC_CALL_MEM, [InstrStage<15, [Port0, Port1]>] >, + InstrItinData<IIC_CALL_FAR_MEM, [InstrStage<40, [Port0, Port1]>] >, + InstrItinData<IIC_CALL_FAR_PTR, [InstrStage<39, [Port0, Port1]>] >, + //ret + InstrItinData<IIC_RET, [InstrStage<79, [Port0, Port1]>] >, + InstrItinData<IIC_RET_IMM, [InstrStage<1, [Port0], 0>, InstrStage<1, [Port1]>] >, + //sign extension movs + InstrItinData<IIC_MOVSX,[InstrStage<1, [Port0] >] >, + InstrItinData<IIC_MOVSX_R16_R8, [InstrStage<2, [Port0, Port1]>] >, + InstrItinData<IIC_MOVSX_R16_M8, [InstrStage<3, [Port0, Port1]>] >, + InstrItinData<IIC_MOVSX_R16_R16, [InstrStage<1, [Port0, Port1]>] >, + InstrItinData<IIC_MOVSX_R32_R32, [InstrStage<1, [Port0, Port1]>] >, + //zero extension movs + InstrItinData<IIC_MOVZX,[InstrStage<1, [Port0]>] >, + InstrItinData<IIC_MOVZX_R16_R8, [InstrStage<2, [Port0, Port1]>] >, + InstrItinData<IIC_MOVZX_R16_M8, [InstrStage<3, [Port0, Port1]>] >, + + InstrItinData<IIC_REP_MOVS, [InstrStage<75, [Port0, Port1]>] >, + InstrItinData<IIC_REP_STOS, [InstrStage<74, [Port0, Port1]>] >, + + // SSE binary operations + // arithmetic fp scalar + InstrItinData<IIC_SSE_ALU_F32S_RR, [InstrStage<5, [Port1]>] >, + InstrItinData<IIC_SSE_ALU_F32S_RM, [InstrStage<5, [Port0], 0>, + InstrStage<5, [Port1]>] >, + InstrItinData<IIC_SSE_ALU_F64S_RR, [InstrStage<5, [Port1]>] >, + 
InstrItinData<IIC_SSE_ALU_F64S_RM, [InstrStage<5, [Port0], 0>, + InstrStage<5, [Port1]>] >, + InstrItinData<IIC_SSE_MUL_F32S_RR, [InstrStage<4, [Port0]>] >, + InstrItinData<IIC_SSE_MUL_F32S_RM, [InstrStage<4, [Port0]>] >, + InstrItinData<IIC_SSE_MUL_F64S_RR, [InstrStage<5, [Port0]>] >, + InstrItinData<IIC_SSE_MUL_F64S_RM, [InstrStage<5, [Port0]>] >, + InstrItinData<IIC_SSE_DIV_F32S_RR, [InstrStage<34, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_DIV_F32S_RM, [InstrStage<34, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_DIV_F64S_RR, [InstrStage<62, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_DIV_F64S_RM, [InstrStage<62, [Port0, Port1]>] >, + + InstrItinData<IIC_SSE_COMIS_RR, [InstrStage<9, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_COMIS_RM, [InstrStage<10, [Port0, Port1]>] >, + + InstrItinData<IIC_SSE_HADDSUB_RR, [InstrStage<8, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_HADDSUB_RM, [InstrStage<9, [Port0, Port1]>] >, + + // arithmetic fp parallel + InstrItinData<IIC_SSE_ALU_F32P_RR, [InstrStage<5, [Port1]>] >, + InstrItinData<IIC_SSE_ALU_F32P_RM, [InstrStage<5, [Port0], 0>, + InstrStage<5, [Port1]>] >, + InstrItinData<IIC_SSE_ALU_F64P_RR, [InstrStage<6, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_ALU_F64P_RM, [InstrStage<7, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_MUL_F32P_RR, [InstrStage<5, [Port0]>] >, + InstrItinData<IIC_SSE_MUL_F32P_RM, [InstrStage<5, [Port0]>] >, + InstrItinData<IIC_SSE_MUL_F64P_RR, [InstrStage<9, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_MUL_F64P_RM, [InstrStage<10, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_DIV_F32P_RR, [InstrStage<70, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_DIV_F32P_RM, [InstrStage<70, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_DIV_F64P_RR, [InstrStage<125, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_DIV_F64P_RM, [InstrStage<125, [Port0, Port1]>] >, + + // bitwise parallel + InstrItinData<IIC_SSE_BIT_P_RR, [InstrStage<1, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_BIT_P_RM, [InstrStage<1, [Port0]>] >, + + // arithmetic int parallel + InstrItinData<IIC_SSE_INTALU_P_RR, [InstrStage<1, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_INTALU_P_RM, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_SSE_INTALUQ_P_RR, [InstrStage<2, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_INTALUQ_P_RM, [InstrStage<3, [Port0, Port1]>] >, + + // multiply int parallel + InstrItinData<IIC_SSE_INTMUL_P_RR, [InstrStage<5, [Port0]>] >, + InstrItinData<IIC_SSE_INTMUL_P_RM, [InstrStage<5, [Port0]>] >, + + // shift parallel + InstrItinData<IIC_SSE_INTSH_P_RR, [InstrStage<2, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_INTSH_P_RM, [InstrStage<3, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_INTSH_P_RI, [InstrStage<1, [Port0, Port1]>] >, + + InstrItinData<IIC_SSE_CMPP_RR, [InstrStage<6, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_CMPP_RM, [InstrStage<7, [Port0, Port1]>] >, + + InstrItinData<IIC_SSE_SHUFP, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_SSE_PSHUF, [InstrStage<1, [Port0]>] >, + + InstrItinData<IIC_SSE_UNPCK, [InstrStage<1, [Port0]>] >, + + InstrItinData<IIC_SSE_SQRTP_RR, [InstrStage<13, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_SQRTP_RM, [InstrStage<14, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_SQRTS_RR, [InstrStage<11, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_SQRTS_RM, [InstrStage<12, [Port0, Port1]>] >, + + InstrItinData<IIC_SSE_RCPP_RR, [InstrStage<9, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_RCPP_RM, [InstrStage<10, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_RCPS_RR, [InstrStage<4, [Port0]>] >, + InstrItinData<IIC_SSE_RCPS_RM, [InstrStage<4, [Port0]>] 
>, + + InstrItinData<IIC_SSE_MOVMSK, [InstrStage<3, [Port0]>] >, + InstrItinData<IIC_SSE_MASKMOV, [InstrStage<2, [Port0, Port1]>] >, + + InstrItinData<IIC_SSE_PEXTRW, [InstrStage<4, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_PINSRW, [InstrStage<1, [Port0]>] >, + + InstrItinData<IIC_SSE_PABS_RR, [InstrStage<1, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_PABS_RM, [InstrStage<1, [Port0]>] >, + + InstrItinData<IIC_SSE_MOV_S_RR, [InstrStage<1, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_MOV_S_RM, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_SSE_MOV_S_MR, [InstrStage<1, [Port0]>] >, + + InstrItinData<IIC_SSE_MOVA_P_RR, [InstrStage<1, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_MOVA_P_RM, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_SSE_MOVA_P_MR, [InstrStage<1, [Port0]>] >, + + InstrItinData<IIC_SSE_MOVU_P_RR, [InstrStage<1, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_MOVU_P_RM, [InstrStage<3, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_MOVU_P_MR, [InstrStage<2, [Port0, Port1]>] >, + + InstrItinData<IIC_SSE_MOV_LH, [InstrStage<1, [Port0]>] >, + + InstrItinData<IIC_SSE_LDDQU, [InstrStage<3, [Port0, Port1]>] >, + + InstrItinData<IIC_SSE_MOVDQ, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_SSE_MOVD_ToGP, [InstrStage<3, [Port0]>] >, + InstrItinData<IIC_SSE_MOVQ_RR, [InstrStage<1, [Port0, Port1]>] >, + + InstrItinData<IIC_SSE_MOVNT, [InstrStage<1, [Port0]>] >, + + InstrItinData<IIC_SSE_PREFETCH, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_SSE_PAUSE, [InstrStage<17, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_LFENCE, [InstrStage<1, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_MFENCE, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_SSE_SFENCE, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_SSE_LDMXCSR, [InstrStage<5, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_STMXCSR, [InstrStage<15, [Port0, Port1]>] >, + + InstrItinData<IIC_SSE_PHADDSUBD_RR, [InstrStage<3, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_PHADDSUBD_RM, [InstrStage<4, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_PHADDSUBSW_RR, [InstrStage<7, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_PHADDSUBSW_RM, [InstrStage<8, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_PHADDSUBW_RR, [InstrStage<7, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_PHADDSUBW_RM, [InstrStage<8, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_PSHUFB_RR, [InstrStage<4, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_PSHUFB_RM, [InstrStage<5, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_PSIGN_RR, [InstrStage<1, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_PSIGN_RM, [InstrStage<1, [Port0]>] >, + + InstrItinData<IIC_SSE_PMADD, [InstrStage<5, [Port0]>] >, + InstrItinData<IIC_SSE_PMULHRSW, [InstrStage<5, [Port0]>] >, + InstrItinData<IIC_SSE_PALIGNR, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_SSE_MWAIT, [InstrStage<46, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_MONITOR, [InstrStage<45, [Port0, Port1]>] >, + + // conversions + // to/from PD ... 
+ InstrItinData<IIC_SSE_CVT_PD_RR, [InstrStage<7, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_CVT_PD_RM, [InstrStage<8, [Port0, Port1]>] >, + // to/from PS except to/from PD and PS2PI + InstrItinData<IIC_SSE_CVT_PS_RR, [InstrStage<6, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_CVT_PS_RM, [InstrStage<7, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_CVT_Scalar_RR, [InstrStage<6, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_CVT_Scalar_RM, [InstrStage<7, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_CVT_SS2SI32_RR, [InstrStage<8, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_CVT_SS2SI32_RM, [InstrStage<9, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_CVT_SS2SI64_RR, [InstrStage<9, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_CVT_SS2SI64_RM, [InstrStage<10, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_CVT_SD2SI_RR, [InstrStage<8, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_CVT_SD2SI_RM, [InstrStage<9, [Port0, Port1]>] >, + + InstrItinData<IIC_CMPX_LOCK, [InstrStage<14, [Port0, Port1]>] >, + InstrItinData<IIC_CMPX_LOCK_8, [InstrStage<6, [Port0, Port1]>] >, + InstrItinData<IIC_CMPX_LOCK_8B, [InstrStage<18, [Port0, Port1]>] >, + InstrItinData<IIC_CMPX_LOCK_16B, [InstrStage<22, [Port0, Port1]>] >, + + InstrItinData<IIC_XADD_LOCK_MEM, [InstrStage<2, [Port0, Port1]>] >, + InstrItinData<IIC_XADD_LOCK_MEM, [InstrStage<3, [Port0, Port1]>] > + ]>; + diff --git a/contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp b/contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp index 6406bce..9a04e35 100644 --- a/contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp +++ b/contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp @@ -65,7 +65,8 @@ X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, DebugLoc dl, std::pair<SDValue,SDValue> CallResult = TLI.LowerCallTo(Chain, Type::getVoidTy(*DAG.getContext()), false, false, false, false, - 0, CallingConv::C, false, /*isReturnValueUsed=*/false, + 0, CallingConv::C, /*isTailCall=*/false, + /*doesNotRet=*/false, /*isReturnValueUsed=*/false, DAG.getExternalSymbol(bzeroEntry, IntPtr), Args, DAG, dl); return CallResult.second; diff --git a/contrib/llvm/lib/Target/X86/X86Subtarget.cpp b/contrib/llvm/lib/Target/X86/X86Subtarget.cpp index 7064dd0..452dd7e 100644 --- a/contrib/llvm/lib/Target/X86/X86Subtarget.cpp +++ b/contrib/llvm/lib/Target/X86/X86Subtarget.cpp @@ -21,7 +21,6 @@ #include "llvm/Support/Host.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" -#include "llvm/ADT/SmallVector.h" #define GET_SUBTARGETINFO_TARGET_DESC #define GET_SUBTARGETINFO_CTOR @@ -177,16 +176,18 @@ unsigned X86Subtarget::getSpecialAddressLatency() const { void X86Subtarget::AutoDetectSubtargetFeatures() { unsigned EAX = 0, EBX = 0, ECX = 0, EDX = 0; + unsigned MaxLevel; union { unsigned u[3]; char c[12]; } text; - - if (X86_MC::GetCpuIDAndInfo(0, &EAX, text.u+0, text.u+2, text.u+1)) + + if (X86_MC::GetCpuIDAndInfo(0, &MaxLevel, text.u+0, text.u+2, text.u+1) || + MaxLevel < 1) return; X86_MC::GetCpuIDAndInfo(0x1, &EAX, &EBX, &ECX, &EDX); - + if ((EDX >> 15) & 1) { HasCMov = true; ToggleFeature(X86::FeatureCMOV); } if ((EDX >> 23) & 1) { X86SSELevel = MMX; ToggleFeature(X86::FeatureMMX); } if ((EDX >> 25) & 1) { X86SSELevel = SSE1; ToggleFeature(X86::FeatureSSE1); } @@ -196,7 +197,7 @@ void X86Subtarget::AutoDetectSubtargetFeatures() { if ((ECX >> 19) & 1) { X86SSELevel = SSE41; ToggleFeature(X86::FeatureSSE41);} if ((ECX >> 20) & 1) { X86SSELevel = SSE42; ToggleFeature(X86::FeatureSSE42);} // FIXME: AVX codegen support is not ready. 
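The detection code above reads CPUID leaf 1 and raises the SSE level for each feature bit that is set (EDX bit 23 = MMX, bit 25 = SSE, ECX bits 19/20 = SSE4.1/4.2, ECX bit 28 = AVX, which the change just below keeps commented out). A compact standalone version of the same bit testing, fed with hard-coded sample register values instead of a real CPUID call:

  #include <cstdint>
  #include <cstdio>

  enum SSELevel { NoSSE, MMX, SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42, AVX };

  // Decode the subset of CPUID(1) feature bits used above.
  SSELevel detect(uint32_t EDX, uint32_t ECX) {
    SSELevel L = NoSSE;
    if ((EDX >> 23) & 1) L = MMX;    // MMX
    if ((EDX >> 25) & 1) L = SSE1;   // SSE
    if ((EDX >> 26) & 1) L = SSE2;   // SSE2
    if ((ECX >>  0) & 1) L = SSE3;   // SSE3
    if ((ECX >>  9) & 1) L = SSSE3;  // SSSE3
    if ((ECX >> 19) & 1) L = SSE41;  // SSE4.1
    if ((ECX >> 20) & 1) L = SSE42;  // SSE4.2
    if ((ECX >> 28) & 1) L = AVX;    // AVX (codegen still gated off above)
    return L;
  }

  int main() {
    // Sample values with the SSE4.2 and AVX bits set.
    std::printf("detected level: %d\n", detect(0x07ffffffu, 0x10980209u));
  }
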
- //if ((ECX >> 28) & 1) { HasAVX = true; ToggleFeature(X86::FeatureAVX); } + //if ((ECX >> 28) & 1) { X86SSELevel = AVX; ToggleFeature(X86::FeatureAVX); } bool IsIntel = memcmp(text.c, "GenuineIntel", 12) == 0; bool IsAMD = !IsIntel && memcmp(text.c, "AuthenticAMD", 12) == 0; @@ -244,28 +245,69 @@ void X86Subtarget::AutoDetectSubtargetFeatures() { IsBTMemSlow = true; ToggleFeature(X86::FeatureSlowBTMem); } + // If it's Nehalem, unaligned memory access is fast. + // FIXME: Nehalem is family 6. Also include Westmere and later processors? if (Family == 15 && Model == 26) { IsUAMemFast = true; ToggleFeature(X86::FeatureFastUAMem); } - X86_MC::GetCpuIDAndInfo(0x80000001, &EAX, &EBX, &ECX, &EDX); - if ((EDX >> 29) & 0x1) { - HasX86_64 = true; - ToggleFeature(X86::Feature64Bit); - } - if ((ECX >> 5) & 0x1) { - HasLZCNT = true; - ToggleFeature(X86::FeatureLZCNT); + // Set processor type. Currently only Atom is detected. + if (Family == 6 && Model == 28) { + X86ProcFamily = IntelAtom; + ToggleFeature(X86::FeatureLeaForSP); } - if (IsAMD && ((ECX >> 6) & 0x1)) { - HasSSE4A = true; - ToggleFeature(X86::FeatureSSE4A); + + unsigned MaxExtLevel; + X86_MC::GetCpuIDAndInfo(0x80000000, &MaxExtLevel, &EBX, &ECX, &EDX); + + if (MaxExtLevel >= 0x80000001) { + X86_MC::GetCpuIDAndInfo(0x80000001, &EAX, &EBX, &ECX, &EDX); + if ((EDX >> 29) & 0x1) { + HasX86_64 = true; + ToggleFeature(X86::Feature64Bit); + } + if ((ECX >> 5) & 0x1) { + HasLZCNT = true; + ToggleFeature(X86::FeatureLZCNT); + } + if (IsAMD) { + if ((ECX >> 6) & 0x1) { + HasSSE4A = true; + ToggleFeature(X86::FeatureSSE4A); + } + if ((ECX >> 11) & 0x1) { + HasXOP = true; + ToggleFeature(X86::FeatureXOP); + } + if ((ECX >> 16) & 0x1) { + HasFMA4 = true; + ToggleFeature(X86::FeatureFMA4); + } + } } - if (IsAMD && ((ECX >> 16) & 0x1)) { - HasFMA4 = true; - ToggleFeature(X86::FeatureFMA4); + } + + if (IsIntel && MaxLevel >= 7) { + if (!X86_MC::GetCpuIDAndInfoEx(0x7, 0x0, &EAX, &EBX, &ECX, &EDX)) { + if (EBX & 0x1) { + HasFSGSBase = true; + ToggleFeature(X86::FeatureFSGSBase); + } + if ((EBX >> 3) & 0x1) { + HasBMI = true; + ToggleFeature(X86::FeatureBMI); + } + // FIXME: AVX2 codegen support is not ready. + //if ((EBX >> 5) & 0x1) { + // X86SSELevel = AVX2; + // ToggleFeature(X86::FeatureAVX2); + //} + if ((EBX >> 8) & 0x1) { + HasBMI2 = true; + ToggleFeature(X86::FeatureBMI2); + } } } } @@ -274,6 +316,7 @@ X86Subtarget::X86Subtarget(const std::string &TT, const std::string &CPU, const std::string &FS, unsigned StackAlignOverride, bool is64Bit) : X86GenSubtargetInfo(TT, CPU, FS) + , X86ProcFamily(Others) , PICStyle(PICStyles::None) , X86SSELevel(NoMMXSSE) , X863DNowLevel(NoThreeDNow) @@ -281,31 +324,35 @@ X86Subtarget::X86Subtarget(const std::string &TT, const std::string &CPU, , HasX86_64(false) , HasPOPCNT(false) , HasSSE4A(false) - , HasAVX(false) , HasAES(false) , HasCLMUL(false) , HasFMA3(false) , HasFMA4(false) + , HasXOP(false) , HasMOVBE(false) , HasRDRAND(false) , HasF16C(false) + , HasFSGSBase(false) , HasLZCNT(false) , HasBMI(false) + , HasBMI2(false) , IsBTMemSlow(false) , IsUAMemFast(false) , HasVectorUAMem(false) , HasCmpxchg16b(false) - , stackAlignment(8) + , UseLeaForSP(false) + , PostRAScheduler(false) + , stackAlignment(4) // FIXME: this is a known good value for Yonah. How about others? 
, MaxInlineSizeThreshold(128) , TargetTriple(TT) - , In64BitMode(is64Bit) - , InNaClMode(false) { + , In64BitMode(is64Bit) { // Determine default and user specified characteristics + std::string CPUName = CPU; if (!FS.empty() || !CPU.empty()) { - std::string CPUName = CPU; if (CPUName.empty()) { -#if defined (__x86_64__) || defined(__i386__) +#if defined(i386) || defined(__i386__) || defined(__x86__) || defined(_M_IX86)\ + || defined(__x86_64__) || defined(_M_AMD64) || defined (_M_X64) CPUName = sys::getHostCPUName(); #else CPUName = "generic"; @@ -325,6 +372,13 @@ X86Subtarget::X86Subtarget(const std::string &TT, const std::string &CPU, // If feature string is not empty, parse features string. ParseSubtargetFeatures(CPUName, FullFS); } else { + if (CPUName.empty()) { +#if defined (__x86_64__) || defined(__i386__) + CPUName = sys::getHostCPUName(); +#else + CPUName = "generic"; +#endif + } // Otherwise, use CPUID to auto-detect feature set. AutoDetectSubtargetFeatures(); @@ -333,7 +387,7 @@ X86Subtarget::X86Subtarget(const std::string &TT, const std::string &CPU, HasX86_64 = true; ToggleFeature(X86::Feature64Bit); HasCMov = true; ToggleFeature(X86::FeatureCMOV); - if (!HasAVX && X86SSELevel < SSE2) { + if (X86SSELevel < SSE2) { X86SSELevel = SSE2; ToggleFeature(X86::FeatureSSE1); ToggleFeature(X86::FeatureSSE2); @@ -341,28 +395,22 @@ X86Subtarget::X86Subtarget(const std::string &TT, const std::string &CPU, } } + if (X86ProcFamily == IntelAtom) { + PostRAScheduler = true; + InstrItins = getInstrItineraryForCPU(CPUName); + } + // It's important to keep the MCSubtargetInfo feature bits in sync with // target data structure which is shared with MC code emitter, etc. if (In64BitMode) ToggleFeature(X86::Mode64Bit); - if (isTargetNaCl()) { - InNaClMode = true; - ToggleFeature(X86::ModeNaCl); - } - - if (HasAVX) - X86SSELevel = NoMMXSSE; - DEBUG(dbgs() << "Subtarget features: SSELevel " << X86SSELevel << ", 3DNowLevel " << X863DNowLevel << ", 64bit " << HasX86_64 << "\n"); assert((!In64BitMode || HasX86_64) && "64-bit code requested on a subtarget that doesn't support it!"); - if(EnableSegmentedStacks && !isTargetELF()) - report_fatal_error("Segmented stacks are only implemented on ELF."); - // Stack alignment is 16 bytes on Darwin, FreeBSD, Linux and Solaris (both // 32 and 64 bit) and for all 64-bit targets. 
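With the default initializer lowered to 4 above, the constructor code just below raises the minimum stack alignment to 16 for the OSes named in the comment, or to an explicit override. A small sketch of that policy as a pure function, with an assumed OS enum standing in for the target-triple queries:

  #include <cstdio>

  enum class OS { Darwin, FreeBSD, Linux, Solaris, Windows, Other };

  // Mirror of the selection below: explicit override wins, otherwise
  // 16 bytes on the listed OSes or in 64-bit mode, else 4.
  unsigned stackAlignment(unsigned Override, OS TheOS, bool In64BitMode) {
    if (Override)
      return Override;
    switch (TheOS) {
    case OS::Darwin: case OS::FreeBSD: case OS::Linux: case OS::Solaris:
      return 16;
    default:
      return In64BitMode ? 16 : 4;
    }
  }

  int main() {
    std::printf("win32: %u, linux32: %u, win64: %u\n",
                stackAlignment(0, OS::Windows, false),
                stackAlignment(0, OS::Linux, false),
                stackAlignment(0, OS::Windows, true));  // 4, 16, 16
  }
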
if (StackAlignOverride) @@ -371,3 +419,12 @@ X86Subtarget::X86Subtarget(const std::string &TT, const std::string &CPU, isTargetSolaris() || In64BitMode) stackAlignment = 16; } + +bool X86Subtarget::enablePostRAScheduler( + CodeGenOpt::Level OptLevel, + TargetSubtargetInfo::AntiDepBreakMode& Mode, + RegClassVector& CriticalPathRCs) const { + Mode = TargetSubtargetInfo::ANTIDEP_CRITICAL; + CriticalPathRCs.clear(); + return PostRAScheduler && OptLevel >= CodeGenOpt::Default; +} diff --git a/contrib/llvm/lib/Target/X86/X86Subtarget.h b/contrib/llvm/lib/Target/X86/X86Subtarget.h index 3258d3d..7fd832b 100644 --- a/contrib/llvm/lib/Target/X86/X86Subtarget.h +++ b/contrib/llvm/lib/Target/X86/X86Subtarget.h @@ -1,4 +1,4 @@ -//=====---- X86Subtarget.h - Define Subtarget for the X86 -----*- C++ -*--====// +//===-- X86Subtarget.h - Define Subtarget for the X86 ----------*- C++ -*--===// // // The LLVM Compiler Infrastructure // @@ -14,9 +14,9 @@ #ifndef X86SUBTARGET_H #define X86SUBTARGET_H +#include "llvm/CallingConv.h" #include "llvm/ADT/Triple.h" #include "llvm/Target/TargetSubtargetInfo.h" -#include "llvm/CallingConv.h" #include <string> #define GET_SUBTARGETINFO_HEADER @@ -42,13 +42,20 @@ enum Style { class X86Subtarget : public X86GenSubtargetInfo { protected: enum X86SSEEnum { - NoMMXSSE, MMX, SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42 + NoMMXSSE, MMX, SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42, AVX, AVX2 }; enum X863DNowEnum { NoThreeDNow, ThreeDNow, ThreeDNowA }; + enum X86ProcFamilyEnum { + Others, IntelAtom + }; + + /// X86ProcFamily - X86 processor family: Intel Atom, and others + X86ProcFamilyEnum X86ProcFamily; + /// PICStyle - Which PIC style to use /// PICStyles::Style PICStyle; @@ -75,9 +82,6 @@ protected: /// HasSSE4A - True if the processor supports SSE4A instructions. bool HasSSE4A; - /// HasAVX - Target has AVX instructions - bool HasAVX; - /// HasAES - Target has AES instructions bool HasAES; @@ -90,6 +94,9 @@ protected: /// HasFMA4 - Target has 4-operand fused multiply-add bool HasFMA4; + /// HasXOP - Target has XOP instructions + bool HasXOP; + /// HasMOVBE - True if the processor has the MOVBE instruction. bool HasMOVBE; @@ -99,12 +106,18 @@ protected: /// HasF16C - Processor has 16-bit floating point conversion instructions. bool HasF16C; + /// HasFSGSBase - Processor has FS/GS base insturctions. + bool HasFSGSBase; + /// HasLZCNT - Processor has LZCNT instruction. bool HasLZCNT; /// HasBMI - Processor has BMI1 instructions. bool HasBMI; + /// HasBMI2 - Processor has BMI2 instructions. + bool HasBMI2; + /// IsBTMemSlow - True if BT (bit test) of memory instructions are slow. bool IsBTMemSlow; @@ -119,6 +132,13 @@ protected: /// this is true for most x86-64 chips, but not the first AMD chips. bool HasCmpxchg16b; + /// UseLeaForSP - True if the LEA instruction should be used for adjusting + /// the stack pointer. This is an optimization for Intel Atom processors. + bool UseLeaForSP; + + /// PostRAScheduler - True if using post-register-allocation scheduler. + bool PostRAScheduler; + /// stackAlignment - The minimum alignment known to hold of the stack frame on /// entry to the function and which must be maintained by every function. unsigned stackAlignment; @@ -129,14 +149,14 @@ protected: /// TargetTriple - What processor and OS we're targeting. Triple TargetTriple; + + /// Instruction itineraries for scheduling + InstrItineraryData InstrItins; private: /// In64BitMode - True if compiling for 64-bit, false for 32-bit. 
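Taken together with the stackAlignment(4) default earlier in the constructor, the tail above implements a small policy: an explicit override wins, the listed OSes and all 64-bit targets get 16 bytes, and everything else keeps 4. A restatement of that policy as a free function, with an invented name, for illustration only:

// Not an LLVM API; mirrors the constructor logic above.
static unsigned pickStackAlignment(unsigned Override, bool Is64Bit,
                                   bool OSRequires16ByteStack) {
  if (Override)
    return Override;               // explicit override always wins
  if (OSRequires16ByteStack || Is64Bit)
    return 16;                     // Darwin, FreeBSD, Linux, Solaris, all 64-bit
  return 4;                        // known-good default for 32-bit Yonah-class parts
}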
bool In64BitMode; - /// InNaClMode - True if compiling for Native Client target. - bool InNaClMode; - public: /// This constructor initializes the data members to match that @@ -176,26 +196,31 @@ public: bool hasSSSE3() const { return X86SSELevel >= SSSE3; } bool hasSSE41() const { return X86SSELevel >= SSE41; } bool hasSSE42() const { return X86SSELevel >= SSE42; } + bool hasAVX() const { return X86SSELevel >= AVX; } + bool hasAVX2() const { return X86SSELevel >= AVX2; } bool hasSSE4A() const { return HasSSE4A; } bool has3DNow() const { return X863DNowLevel >= ThreeDNow; } bool has3DNowA() const { return X863DNowLevel >= ThreeDNowA; } bool hasPOPCNT() const { return HasPOPCNT; } - bool hasAVX() const { return HasAVX; } - bool hasXMM() const { return hasSSE1() || hasAVX(); } - bool hasXMMInt() const { return hasSSE2() || hasAVX(); } bool hasAES() const { return HasAES; } bool hasCLMUL() const { return HasCLMUL; } bool hasFMA3() const { return HasFMA3; } bool hasFMA4() const { return HasFMA4; } + bool hasXOP() const { return HasXOP; } bool hasMOVBE() const { return HasMOVBE; } bool hasRDRAND() const { return HasRDRAND; } bool hasF16C() const { return HasF16C; } + bool hasFSGSBase() const { return HasFSGSBase; } bool hasLZCNT() const { return HasLZCNT; } bool hasBMI() const { return HasBMI; } + bool hasBMI2() const { return HasBMI2; } bool isBTMemSlow() const { return IsBTMemSlow; } bool isUnalignedMemAccessFast() const { return IsUAMemFast; } bool hasVectorUAMem() const { return HasVectorUAMem; } bool hasCmpxchg16b() const { return HasCmpxchg16b; } + bool useLeaForSP() const { return UseLeaForSP; } + + bool isAtom() const { return X86ProcFamily == IntelAtom; } const Triple &getTargetTriple() const { return TargetTriple; } @@ -209,38 +234,28 @@ public: // ELF is a reasonably sane default and the only other X86 targets we // support are Darwin and Windows. Just use "not those". - bool isTargetELF() const { - return !isTargetDarwin() && !isTargetWindows() && !isTargetCygMing(); - } + bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); } bool isTargetLinux() const { return TargetTriple.getOS() == Triple::Linux; } bool isTargetNaCl() const { return TargetTriple.getOS() == Triple::NativeClient; } bool isTargetNaCl32() const { return isTargetNaCl() && !is64Bit(); } bool isTargetNaCl64() const { return isTargetNaCl() && is64Bit(); } - bool isTargetWindows() const { return TargetTriple.getOS() == Triple::Win32; } bool isTargetMingw() const { return TargetTriple.getOS() == Triple::MinGW32; } bool isTargetCygwin() const { return TargetTriple.getOS() == Triple::Cygwin; } - bool isTargetCygMing() const { - return isTargetMingw() || isTargetCygwin(); - } - - /// isTargetCOFF - Return true if this is any COFF/Windows target variant. - bool isTargetCOFF() const { - return isTargetMingw() || isTargetCygwin() || isTargetWindows(); - } + bool isTargetCygMing() const { return TargetTriple.isOSCygMing(); } + bool isTargetCOFF() const { return TargetTriple.isOSBinFormatCOFF(); } + bool isTargetEnvMacho() const { return TargetTriple.isEnvironmentMachO(); } bool isTargetWin64() const { // FIXME: x86_64-cygwin has not been released yet. - return In64BitMode && (isTargetCygMing() || isTargetWindows()); - } - - bool isTargetEnvMacho() const { - return isTargetDarwin() || (TargetTriple.getEnvironment() == Triple::MachO); + return In64BitMode && TargetTriple.isOSWindows(); } bool isTargetWin32() const { + // FIXME: Cygwin is included for isTargetWin64 -- should it be included + // here too? 
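With the standalone HasAVX flag gone, AVX and AVX2 are simply the top of the ordered X86SSEEnum, so hasAVX() and hasAVX2() above reduce to comparisons and the old hasXMM()/hasXMMInt() shims disappear. A self-contained sketch of that ordering idiom, with simplified names that are not the real class:

#include <cassert>

enum SSELevel { NoSSE, SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42, AVX, AVX2 };

struct FeatureLevel {
  SSELevel Level;
  // "Has at least X" is a single comparison because the enum is ordered.
  bool hasSSE2() const { return Level >= SSE2; }
  bool hasAVX()  const { return Level >= AVX; }
  bool hasAVX2() const { return Level >= AVX2; }
};

int main() {
  FeatureLevel AVXChip = { AVX };
  assert(AVXChip.hasSSE2() && AVXChip.hasAVX() && !AVXChip.hasAVX2());
  return 0;
}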
return !In64BitMode && (isTargetMingw() || isTargetWindows()); } @@ -286,6 +301,15 @@ public: /// indicating the number of scheduling cycles of backscheduling that /// should be attempted. unsigned getSpecialAddressLatency() const; + + /// enablePostRAScheduler - run for Atom optimization. + bool enablePostRAScheduler(CodeGenOpt::Level OptLevel, + TargetSubtargetInfo::AntiDepBreakMode& Mode, + RegClassVector& CriticalPathRCs) const; + + /// getInstrItins = Return the instruction itineraries based on the + /// subtarget selection. + const InstrItineraryData &getInstrItineraryData() const { return InstrItins; } }; } // End llvm namespace diff --git a/contrib/llvm/lib/Target/X86/X86TargetMachine.cpp b/contrib/llvm/lib/Target/X86/X86TargetMachine.cpp index 15c6c4e..f4b7a62 100644 --- a/contrib/llvm/lib/Target/X86/X86TargetMachine.cpp +++ b/contrib/llvm/lib/Target/X86/X86TargetMachine.cpp @@ -28,11 +28,14 @@ extern "C" void LLVMInitializeX86Target() { RegisterTargetMachine<X86_64TargetMachine> Y(TheX86_64Target); } +void X86_32TargetMachine::anchor() { } X86_32TargetMachine::X86_32TargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, - Reloc::Model RM, CodeModel::Model CM) - : X86TargetMachine(T, TT, CPU, FS, RM, CM, false), + const TargetOptions &Options, + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL) + : X86TargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false), DataLayout(getSubtargetImpl()->isTargetDarwin() ? "e-p:32:32-f64:32:64-i64:32:64-f80:128:128-f128:128:128-" "n8:16:32-S128" : @@ -48,11 +51,14 @@ X86_32TargetMachine::X86_32TargetMachine(const Target &T, StringRef TT, JITInfo(*this) { } +void X86_64TargetMachine::anchor() { } X86_64TargetMachine::X86_64TargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, - Reloc::Model RM, CodeModel::Model CM) - : X86TargetMachine(T, TT, CPU, FS, RM, CM, true), + const TargetOptions &Options, + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL) + : X86TargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true), DataLayout("e-p:64:64-s:64-f64:64:64-i64:64:64-f80:128:128-f128:128:128-" "n8:16:32:64-S128"), InstrInfo(*this), @@ -65,12 +71,15 @@ X86_64TargetMachine::X86_64TargetMachine(const Target &T, StringRef TT, /// X86TargetMachine::X86TargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, + const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL, bool is64Bit) - : LLVMTargetMachine(T, TT, CPU, FS, RM, CM), - Subtarget(TT, CPU, FS, StackAlignmentOverride, is64Bit), + : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL), + Subtarget(TT, CPU, FS, Options.StackAlignmentOverride, is64Bit), FrameLowering(*this, Subtarget), - ELFWriterInfo(is64Bit, true) { + ELFWriterInfo(is64Bit, true), + InstrItins(Subtarget.getInstrItineraryData()){ // Determine the PICStyle based on the target selected. if (getRelocationModel() == Reloc::Static) { // Unless we're in PIC or DynamicNoPIC mode, set the PIC style to None. 
@@ -92,8 +101,8 @@ X86TargetMachine::X86TargetMachine(const Target &T, StringRef TT, } // default to hard float ABI - if (FloatABIType == FloatABI::Default) - FloatABIType = FloatABI::Hard; + if (Options.FloatABIType == FloatABI::Default) + this->Options.FloatABIType = FloatABI::Hard; } //===----------------------------------------------------------------------===// @@ -102,46 +111,67 @@ X86TargetMachine::X86TargetMachine(const Target &T, StringRef TT, static cl::opt<bool> UseVZeroUpper("x86-use-vzeroupper", cl::desc("Minimize AVX to SSE transition penalty"), - cl::init(false)); + cl::init(true)); //===----------------------------------------------------------------------===// // Pass Pipeline Configuration //===----------------------------------------------------------------------===// -bool X86TargetMachine::addInstSelector(PassManagerBase &PM, - CodeGenOpt::Level OptLevel) { +namespace { +/// X86 Code Generator Pass Configuration Options. +class X86PassConfig : public TargetPassConfig { +public: + X86PassConfig(X86TargetMachine *TM, PassManagerBase &PM) + : TargetPassConfig(TM, PM) {} + + X86TargetMachine &getX86TargetMachine() const { + return getTM<X86TargetMachine>(); + } + + const X86Subtarget &getX86Subtarget() const { + return *getX86TargetMachine().getSubtargetImpl(); + } + + virtual bool addInstSelector(); + virtual bool addPreRegAlloc(); + virtual bool addPostRegAlloc(); + virtual bool addPreEmitPass(); +}; +} // namespace + +TargetPassConfig *X86TargetMachine::createPassConfig(PassManagerBase &PM) { + return new X86PassConfig(this, PM); +} + +bool X86PassConfig::addInstSelector() { // Install an instruction selector. - PM.add(createX86ISelDag(*this, OptLevel)); + PM.add(createX86ISelDag(getX86TargetMachine(), getOptLevel())); // For 32-bit, prepend instructions to set the "global base reg" for PIC. - if (!Subtarget.is64Bit()) + if (!getX86Subtarget().is64Bit()) PM.add(createGlobalBaseRegPass()); return false; } -bool X86TargetMachine::addPreRegAlloc(PassManagerBase &PM, - CodeGenOpt::Level OptLevel) { +bool X86PassConfig::addPreRegAlloc() { PM.add(createX86MaxStackAlignmentHeuristicPass()); return false; // -print-machineinstr shouldn't print after this. } -bool X86TargetMachine::addPostRegAlloc(PassManagerBase &PM, - CodeGenOpt::Level OptLevel) { +bool X86PassConfig::addPostRegAlloc() { PM.add(createX86FloatingPointStackifierPass()); return true; // -print-machineinstr should print after this. 
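The UseVZeroUpper option above flips to on by default, so the AVX-to-SSE transition pass now runs unless -x86-use-vzeroupper=false is passed. For reference, a minimal standalone use of the same cl::opt machinery; cl::opt, cl::desc, cl::init and cl::ParseCommandLineOptions are real LLVM APIs, while DemoFlag and its flag string are invented for this sketch:

#include "llvm/Support/CommandLine.h"
using namespace llvm;

static cl::opt<bool>
DemoFlag("demo-flag",
         cl::desc("Example boolean option, on by default"),
         cl::init(true));

int main(int argc, char **argv) {
  cl::ParseCommandLineOptions(argc, argv);
  // cl::opt<bool> converts to bool; -demo-flag=false turns it off.
  return DemoFlag ? 0 : 1;
}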
} -bool X86TargetMachine::addPreEmitPass(PassManagerBase &PM, - CodeGenOpt::Level OptLevel) { +bool X86PassConfig::addPreEmitPass() { bool ShouldPrint = false; - if (OptLevel != CodeGenOpt::None && - (Subtarget.hasSSE2() || Subtarget.hasAVX())) { + if (getOptLevel() != CodeGenOpt::None && getX86Subtarget().hasSSE2()) { PM.add(createExecutionDependencyFixPass(&X86::VR128RegClass)); ShouldPrint = true; } - if (Subtarget.hasAVX() && UseVZeroUpper) { + if (getX86Subtarget().hasAVX() && UseVZeroUpper) { PM.add(createX86IssueVZeroUpperPass()); ShouldPrint = true; } @@ -150,7 +180,6 @@ bool X86TargetMachine::addPreEmitPass(PassManagerBase &PM, } bool X86TargetMachine::addCodeEmitter(PassManagerBase &PM, - CodeGenOpt::Level OptLevel, JITCodeEmitter &JCE) { PM.add(createX86JITCodeEmitterPass(*this, JCE)); diff --git a/contrib/llvm/lib/Target/X86/X86TargetMachine.h b/contrib/llvm/lib/Target/X86/X86TargetMachine.h index d1569aa..8e935af 100644 --- a/contrib/llvm/lib/Target/X86/X86TargetMachine.h +++ b/contrib/llvm/lib/Target/X86/X86TargetMachine.h @@ -27,19 +27,20 @@ #include "llvm/Target/TargetFrameLowering.h" namespace llvm { - -class formatted_raw_ostream; + class StringRef; class X86TargetMachine : public LLVMTargetMachine { - X86Subtarget Subtarget; - X86FrameLowering FrameLowering; - X86ELFWriterInfo ELFWriterInfo; + X86Subtarget Subtarget; + X86FrameLowering FrameLowering; + X86ELFWriterInfo ELFWriterInfo; + InstrItineraryData InstrItins; public: - X86TargetMachine(const Target &T, StringRef TT, - StringRef CPU, StringRef FS, + X86TargetMachine(const Target &T, StringRef TT, + StringRef CPU, StringRef FS, const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL, bool is64Bit); virtual const X86InstrInfo *getInstrInfo() const { @@ -55,7 +56,7 @@ public: virtual const X86TargetLowering *getTargetLowering() const { llvm_unreachable("getTargetLowering not implemented"); } - virtual const X86SelectionDAGInfo *getSelectionDAGInfo() const { + virtual const X86SelectionDAGInfo *getSelectionDAGInfo() const { llvm_unreachable("getSelectionDAGInfo not implemented"); } virtual const X86RegisterInfo *getRegisterInfo() const { @@ -64,19 +65,21 @@ public: virtual const X86ELFWriterInfo *getELFWriterInfo() const { return Subtarget.isTargetELF() ? &ELFWriterInfo : 0; } + virtual const InstrItineraryData *getInstrItineraryData() const { + return &InstrItins; + } // Set up the pass pipeline. - virtual bool addInstSelector(PassManagerBase &PM, CodeGenOpt::Level OptLevel); - virtual bool addPreRegAlloc(PassManagerBase &PM, CodeGenOpt::Level OptLevel); - virtual bool addPostRegAlloc(PassManagerBase &PM, CodeGenOpt::Level OptLevel); - virtual bool addPreEmitPass(PassManagerBase &PM, CodeGenOpt::Level OptLevel); - virtual bool addCodeEmitter(PassManagerBase &PM, CodeGenOpt::Level OptLevel, + virtual TargetPassConfig *createPassConfig(PassManagerBase &PM); + + virtual bool addCodeEmitter(PassManagerBase &PM, JITCodeEmitter &JCE); }; /// X86_32TargetMachine - X86 32-bit target machine. 
/// class X86_32TargetMachine : public X86TargetMachine { + virtual void anchor(); const TargetData DataLayout; // Calculates type size & alignment X86InstrInfo InstrInfo; X86SelectionDAGInfo TSInfo; @@ -84,13 +87,14 @@ class X86_32TargetMachine : public X86TargetMachine { X86JITInfo JITInfo; public: X86_32TargetMachine(const Target &T, StringRef TT, - StringRef CPU, StringRef FS, - Reloc::Model RM, CodeModel::Model CM); + StringRef CPU, StringRef FS, const TargetOptions &Options, + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL); virtual const TargetData *getTargetData() const { return &DataLayout; } virtual const X86TargetLowering *getTargetLowering() const { return &TLInfo; } - virtual const X86SelectionDAGInfo *getSelectionDAGInfo() const { + virtual const X86SelectionDAGInfo *getSelectionDAGInfo() const { return &TSInfo; } virtual const X86InstrInfo *getInstrInfo() const { @@ -104,6 +108,7 @@ public: /// X86_64TargetMachine - X86 64-bit target machine. /// class X86_64TargetMachine : public X86TargetMachine { + virtual void anchor(); const TargetData DataLayout; // Calculates type size & alignment X86InstrInfo InstrInfo; X86SelectionDAGInfo TSInfo; @@ -111,13 +116,14 @@ class X86_64TargetMachine : public X86TargetMachine { X86JITInfo JITInfo; public: X86_64TargetMachine(const Target &T, StringRef TT, - StringRef CPU, StringRef FS, - Reloc::Model RM, CodeModel::Model CM); + StringRef CPU, StringRef FS, const TargetOptions &Options, + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL); virtual const TargetData *getTargetData() const { return &DataLayout; } virtual const X86TargetLowering *getTargetLowering() const { return &TLInfo; } - virtual const X86SelectionDAGInfo *getSelectionDAGInfo() const { + virtual const X86SelectionDAGInfo *getSelectionDAGInfo() const { return &TSInfo; } virtual const X86InstrInfo *getInstrInfo() const { diff --git a/contrib/llvm/lib/Target/X86/X86TargetObjectFile.cpp b/contrib/llvm/lib/Target/X86/X86TargetObjectFile.cpp index 991f322..718f35e 100644 --- a/contrib/llvm/lib/Target/X86/X86TargetObjectFile.cpp +++ b/contrib/llvm/lib/Target/X86/X86TargetObjectFile.cpp @@ -1,4 +1,4 @@ -//===-- llvm/Target/X86/X86TargetObjectFile.cpp - X86 Object Info ---------===// +//===-- X86TargetObjectFile.cpp - X86 Object Info -------------------------===// // // The LLVM Compiler Infrastructure // @@ -14,7 +14,6 @@ #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCSectionMachO.h" #include "llvm/Target/Mangler.h" -#include "llvm/ADT/SmallString.h" #include "llvm/Support/Dwarf.h" using namespace llvm; using namespace dwarf; diff --git a/contrib/llvm/lib/Target/X86/X86TargetObjectFile.h b/contrib/llvm/lib/Target/X86/X86TargetObjectFile.h index d7adf27..a02a368 100644 --- a/contrib/llvm/lib/Target/X86/X86TargetObjectFile.h +++ b/contrib/llvm/lib/Target/X86/X86TargetObjectFile.h @@ -1,4 +1,4 @@ -//===-- llvm/Target/X86/X86TargetObjectFile.h - X86 Object Info -*- C++ -*-===// +//===-- X86TargetObjectFile.h - X86 Object Info -----------------*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -15,7 +15,6 @@ #include "llvm/Target/TargetLoweringObjectFile.h" namespace llvm { - class X86TargetMachine; /// X8664_MachoTargetObjectFile - This TLOF implementation is used for Darwin /// x86-64. 
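The virtual void anchor() members added to the two target-machine classes here, and defined out of line in X86TargetMachine.cpp earlier in this diff, exist only to give each class a key function: the vtable and type info are then emitted once in that .cpp instead of weakly in every translation unit, which also quiets -Wweak-vtables. A tiny illustration of the idiom with invented names:

// Widget.h: the only out-of-line virtual member is anchor()
struct Widget {
  virtual void anchor();                  // never called; just pins the vtable
  virtual int size() const { return 1; }  // inline virtuals stay in the header
  virtual ~Widget() {}
};

// Widget.cpp: defining anchor() here makes this TU the home of the vtable
void Widget::anchor() {}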
diff --git a/contrib/llvm/lib/Target/X86/X86VZeroUpper.cpp b/contrib/llvm/lib/Target/X86/X86VZeroUpper.cpp index 3958494..2fd78a7 100644 --- a/contrib/llvm/lib/Target/X86/X86VZeroUpper.cpp +++ b/contrib/llvm/lib/Target/X86/X86VZeroUpper.cpp @@ -14,14 +14,16 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "x86-codegen" +#define DEBUG_TYPE "x86-vzeroupper" #include "X86.h" #include "X86InstrInfo.h" #include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/Passes.h" -#include "llvm/GlobalValue.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetInstrInfo.h" using namespace llvm; @@ -41,6 +43,60 @@ namespace { private: const TargetInstrInfo *TII; // Machine instruction info. MachineBasicBlock *MBB; // Current basic block + + // Any YMM register live-in to this function? + bool FnHasLiveInYmm; + + // BBState - Contains the state of each MBB: unknown, clean, dirty + SmallVector<uint8_t, 8> BBState; + + // BBSolved - Keep track of all MBB which had been already analyzed + // and there is no further processing required. + BitVector BBSolved; + + // Machine Basic Blocks are classified according this pass: + // + // ST_UNKNOWN - The MBB state is unknown, meaning from the entry state + // until the MBB exit there isn't a instruction using YMM to change + // the state to dirty, or one of the incoming predecessors is unknown + // and there's not a dirty predecessor between them. + // + // ST_CLEAN - No YMM usage in the end of the MBB. A MBB could have + // instructions using YMM and be marked ST_CLEAN, as long as the state + // is cleaned by a vzeroupper before any call. + // + // ST_DIRTY - Any MBB ending with a YMM usage not cleaned up by a + // vzeroupper instruction. + // + // ST_INIT - Placeholder for an empty state set + // + enum { + ST_UNKNOWN = 0, + ST_CLEAN = 1, + ST_DIRTY = 2, + ST_INIT = 3 + }; + + // computeState - Given two states, compute the resulting state, in + // the following way + // + // 1) One dirty state yields another dirty state + // 2) All states must be clean for the result to be clean + // 3) If none above and one unknown, the result state is also unknown + // + unsigned computeState(unsigned PrevState, unsigned CurState) { + if (PrevState == ST_INIT) + return CurState; + + if (PrevState == ST_DIRTY || CurState == ST_DIRTY) + return ST_DIRTY; + + if (PrevState == ST_CLEAN && CurState == ST_CLEAN) + return ST_CLEAN; + + return ST_UNKNOWN; + } + }; char VZeroUpperInserter::ID = 0; } @@ -49,37 +105,82 @@ FunctionPass *llvm::createX86IssueVZeroUpperPass() { return new VZeroUpperInserter(); } +static bool isYmmReg(unsigned Reg) { + if (Reg >= X86::YMM0 && Reg <= X86::YMM15) + return true; + + return false; +} + +static bool checkFnHasLiveInYmm(MachineRegisterInfo &MRI) { + for (MachineRegisterInfo::livein_iterator I = MRI.livein_begin(), + E = MRI.livein_end(); I != E; ++I) + if (isYmmReg(I->first)) + return true; + + return false; +} + +static bool hasYmmReg(MachineInstr *MI) { + for (int i = 0, e = MI->getNumOperands(); i != e; ++i) { + const MachineOperand &MO = MI->getOperand(i); + if (!MO.isReg()) + continue; + if (MO.isDebug()) + continue; + if (isYmmReg(MO.getReg())) + return true; + } + return false; +} + /// runOnMachineFunction - Loop over all of the basic blocks, inserting /// vzero upper instructions before function calls. 
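computeState above is a meet over a tiny three-point lattice: dirty absorbs everything, clean results only when both inputs are clean, and any other mix stays unknown, with ST_INIT acting as the identity for an empty predecessor set. A standalone restatement with a few sanity checks; the enumerator names are copied from the hunk, the main() scaffolding is invented:

#include <cassert>

enum { ST_UNKNOWN = 0, ST_CLEAN = 1, ST_DIRTY = 2, ST_INIT = 3 };

static unsigned computeState(unsigned PrevState, unsigned CurState) {
  if (PrevState == ST_INIT)                           // identity element
    return CurState;
  if (PrevState == ST_DIRTY || CurState == ST_DIRTY)  // dirty absorbs
    return ST_DIRTY;
  if (PrevState == ST_CLEAN && CurState == ST_CLEAN)  // both clean => clean
    return ST_CLEAN;
  return ST_UNKNOWN;                                  // mixed clean/unknown
}

int main() {
  assert(computeState(ST_INIT, ST_UNKNOWN) == ST_UNKNOWN);
  assert(computeState(ST_CLEAN, ST_DIRTY) == ST_DIRTY);
  assert(computeState(ST_CLEAN, ST_CLEAN) == ST_CLEAN);
  assert(computeState(ST_CLEAN, ST_UNKNOWN) == ST_UNKNOWN);
  return 0;
}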
bool VZeroUpperInserter::runOnMachineFunction(MachineFunction &MF) { TII = MF.getTarget().getInstrInfo(); - bool Changed = false; - - // Process any unreachable blocks in arbitrary order now. - for (MachineFunction::iterator BB = MF.begin(), E = MF.end(); BB != E; ++BB) - Changed |= processBasicBlock(MF, *BB); + MachineRegisterInfo &MRI = MF.getRegInfo(); + bool EverMadeChange = false; - return Changed; -} + // Fast check: if the function doesn't use any ymm registers, we don't need + // to insert any VZEROUPPER instructions. This is constant-time, so it is + // cheap in the common case of no ymm use. + bool YMMUsed = false; + const TargetRegisterClass *RC = X86::VR256RegisterClass; + for (TargetRegisterClass::iterator i = RC->begin(), e = RC->end(); + i != e; i++) { + if (MRI.isPhysRegUsed(*i)) { + YMMUsed = true; + break; + } + } + if (!YMMUsed) + return EverMadeChange; -static bool isCallToModuleFn(const MachineInstr *MI) { - assert(MI->getDesc().isCall() && "Isn't a call instruction"); + // Pre-compute the existence of any live-in YMM registers to this function + FnHasLiveInYmm = checkFnHasLiveInYmm(MRI); - for (int i = 0, e = MI->getNumOperands(); i != e; ++i) { - const MachineOperand &MO = MI->getOperand(i); + assert(BBState.empty()); + BBState.resize(MF.getNumBlockIDs(), 0); + BBSolved.resize(MF.getNumBlockIDs(), 0); - if (!MO.isGlobal()) - continue; + // Each BB state depends on all predecessors, loop over until everything + // converges. (Once we converge, we can implicitly mark everything that is + // still ST_UNKNOWN as ST_CLEAN.) + while (1) { + bool MadeChange = false; - const GlobalValue *GV = MO.getGlobal(); - GlobalValue::LinkageTypes LT = GV->getLinkage(); - if (GV->isInternalLinkage(LT) || GV->isPrivateLinkage(LT) || - (GV->isExternalLinkage(LT) && !GV->isDeclaration())) - return true; + // Process all basic blocks. + for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I) + MadeChange |= processBasicBlock(MF, *I); - return false; + // If this iteration over the code changed anything, keep iterating. + if (!MadeChange) break; + EverMadeChange = true; } - return false; + + BBState.clear(); + BBSolved.clear(); + return EverMadeChange; } /// processBasicBlock - Loop over all of the instructions in the basic block, @@ -87,19 +188,98 @@ static bool isCallToModuleFn(const MachineInstr *MI) { bool VZeroUpperInserter::processBasicBlock(MachineFunction &MF, MachineBasicBlock &BB) { bool Changed = false; + unsigned BBNum = BB.getNumber(); MBB = &BB; + // Don't process already solved BBs + if (BBSolved[BBNum]) + return false; // No changes + + // Check the state of all predecessors + unsigned EntryState = ST_INIT; + for (MachineBasicBlock::const_pred_iterator PI = BB.pred_begin(), + PE = BB.pred_end(); PI != PE; ++PI) { + EntryState = computeState(EntryState, BBState[(*PI)->getNumber()]); + if (EntryState == ST_DIRTY) + break; + } + + + // The entry MBB for the function may set the inital state to dirty if + // the function receives any YMM incoming arguments + if (MBB == MF.begin()) { + EntryState = ST_CLEAN; + if (FnHasLiveInYmm) + EntryState = ST_DIRTY; + } + + // The current state is initialized according to the predecessors + unsigned CurState = EntryState; + bool BBHasCall = false; + for (MachineBasicBlock::iterator I = BB.begin(); I != BB.end(); ++I) { MachineInstr *MI = I; DebugLoc dl = I->getDebugLoc(); + bool isControlFlow = MI->isCall() || MI->isReturn(); + + // Shortcut: don't need to check regular instructions in dirty state. 
+ if (!isControlFlow && CurState == ST_DIRTY) + continue; + + if (hasYmmReg(MI)) { + // We found a ymm-using instruction; this could be an AVX instruction, + // or it could be control flow. + CurState = ST_DIRTY; + continue; + } - // Insert a vzeroupper instruction before each control transfer - // to functions outside this module - if (MI->getDesc().isCall() && !isCallToModuleFn(MI)) { - BuildMI(*MBB, I, dl, TII->get(X86::VZEROUPPER)); - ++NumVZU; + // Check for control-flow out of the current function (which might + // indirectly execute SSE instructions). + if (!isControlFlow) + continue; + + BBHasCall = true; + + // The VZEROUPPER instruction resets the upper 128 bits of all Intel AVX + // registers. This instruction has zero latency. In addition, the processor + // changes back to Clean state, after which execution of Intel SSE + // instructions or Intel AVX instructions has no transition penalty. Add + // the VZEROUPPER instruction before any function call/return that might + // execute SSE code. + // FIXME: In some cases, we may want to move the VZEROUPPER into a + // predecessor block. + if (CurState == ST_DIRTY) { + // Only insert the VZEROUPPER in case the entry state isn't unknown. + // When unknown, only compute the information within the block to have + // it available in the exit if possible, but don't change the block. + if (EntryState != ST_UNKNOWN) { + BuildMI(*MBB, I, dl, TII->get(X86::VZEROUPPER)); + ++NumVZU; + } + + // After the inserted VZEROUPPER the state becomes clean again, but + // other YMM may appear before other subsequent calls or even before + // the end of the BB. + CurState = ST_CLEAN; } } + DEBUG(dbgs() << "MBB #" << BBNum + << ", current state: " << CurState << '\n'); + + // A BB can only be considered solved when we both have done all the + // necessary transformations, and have computed the exit state. This happens + // in two cases: + // 1) We know the entry state: this immediately implies the exit state and + // all the necessary transformations. + // 2) There are no calls, and and a non-call instruction marks this block: + // no transformations are necessary, and we know the exit state. + if (EntryState != ST_UNKNOWN || (!BBHasCall && CurState != ST_UNKNOWN)) + BBSolved[BBNum] = true; + + if (CurState != BBState[BBNum]) + Changed = true; + + BBState[BBNum] = CurState; return Changed; } |
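Putting runOnMachineFunction and processBasicBlock together: block exit states are recomputed from predecessor states until nothing changes, and a block is only transformed once its entry state is known. Below is a compact standalone model of that outer fixed-point loop on a toy CFG, reusing the same meet rules; block contents are reduced to a single "uses YMM" flag, so this models the iteration order, not the real pass or its VZEROUPPER insertion.

#include <cassert>
#include <vector>

enum { ST_UNKNOWN = 0, ST_CLEAN = 1, ST_DIRTY = 2, ST_INIT = 3 };

static unsigned meet(unsigned A, unsigned B) {
  if (A == ST_INIT) return B;
  if (A == ST_DIRTY || B == ST_DIRTY) return ST_DIRTY;
  if (A == ST_CLEAN && B == ST_CLEAN) return ST_CLEAN;
  return ST_UNKNOWN;
}

// Toy block: its exit is DIRTY if it touches YMM, otherwise it inherits its
// entry state (a real block would also go CLEAN after an inserted VZEROUPPER).
struct Block {
  std::vector<int> Preds;
  bool UsesYmm;
};

int main() {
  // 0 -> 1 -> 2, where only block 1 uses a YMM register.
  std::vector<Block> CFG = { {{}, false}, {{0}, true}, {{1}, false} };
  std::vector<unsigned> State(CFG.size(), ST_UNKNOWN);

  bool Changed = true;
  while (Changed) {                        // iterate to a fixed point
    Changed = false;
    for (size_t I = 0; I < CFG.size(); ++I) {
      unsigned Entry = ST_INIT;
      for (size_t P = 0; P < CFG[I].Preds.size(); ++P)
        Entry = meet(Entry, State[CFG[I].Preds[P]]);
      if (I == 0)
        Entry = ST_CLEAN;                  // function entry, no live-in YMM
      unsigned Exit = CFG[I].UsesYmm ? ST_DIRTY : Entry;
      if (Exit != State[I]) { State[I] = Exit; Changed = true; }
    }
  }
  assert(State[0] == ST_CLEAN && State[1] == ST_DIRTY && State[2] == ST_DIRTY);
  return 0;
}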