Diffstat (limited to 'contrib/llvm/lib/Target/X86'): 73 files changed, 12983 insertions, 4253 deletions
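A pattern that recurs throughout the AsmParser hunks below: the Intel-syntax helpers (ParseIntelExpression, ParseIntelIdentifier, ParseIntelDotOperator) now report failure by returning bool instead of a sentinel X86Operand*, and their callers simply propagate that flag (or return a null operand). The stand-alone sketch that follows only illustrates that calling convention; the names parseInner and parseOuter are invented for the example and do not appear in the patch.

    #include <cstdio>

    // true means "error", false means "success" -- the convention the
    // rewritten ParseIntel* helpers in the patch below adopt.
    static bool parseInner(int Tok) {
      return Tok < 0; // fail on an obviously bad token
    }

    static bool parseOuter(int Tok) {
      if (parseInner(Tok))
        return true; // propagate the error flag instead of a sentinel pointer
      std::printf("parsed token %d\n", Tok);
      return false;
    }

    int main() {
      return parseOuter(42) ? 1 : 0; // exit 0 on success
    }

Returning bool also lets the helpers hand back Error(...) directly from a return statement, which is exactly what several of the rewritten bodies in the patch do.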
diff --git a/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp index 09e316c..bc8f367 100644 --- a/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp +++ b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp @@ -9,6 +9,7 @@ #include "MCTargetDesc/X86BaseInfo.h" #include "llvm/ADT/APFloat.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringSwitch.h" @@ -79,7 +80,7 @@ private: PostfixStack.push_back(std::make_pair(Op, Val)); } - void popOperator() { InfixOperatorStack.pop_back_val(); } + void popOperator() { InfixOperatorStack.pop_back(); } void pushOperator(InfixCalculatorTok Op) { // Push the new operator if the stack is empty. if (InfixOperatorStack.empty()) { @@ -117,12 +118,12 @@ private: if (StackOp == IC_RPAREN) { ++ParenCount; - InfixOperatorStack.pop_back_val(); + InfixOperatorStack.pop_back(); } else if (StackOp == IC_LPAREN) { --ParenCount; - InfixOperatorStack.pop_back_val(); + InfixOperatorStack.pop_back(); } else { - InfixOperatorStack.pop_back_val(); + InfixOperatorStack.pop_back(); PostfixStack.push_back(std::make_pair(StackOp, 0)); } } @@ -219,7 +220,9 @@ private: const MCExpr *getSym() { return Sym; } StringRef getSymName() { return SymName; } int64_t getImm() { return Imm + IC.execute(); } - bool isValidEndState() { return State == IES_RBRAC; } + bool isValidEndState() { + return State == IES_RBRAC || State == IES_INTEGER; + } bool getStopOnLBrac() { return StopOnLBrac; } bool getAddImmPrefix() { return AddImmPrefix; } bool hadError() { return State == IES_ERROR; } @@ -492,16 +495,17 @@ private: X86Operand *ParseATTOperand(); X86Operand *ParseIntelOperand(); X86Operand *ParseIntelOffsetOfOperator(); - X86Operand *ParseIntelDotOperator(const MCExpr *Disp, const MCExpr *&NewDisp); + bool ParseIntelDotOperator(const MCExpr *Disp, const MCExpr *&NewDisp); X86Operand *ParseIntelOperator(unsigned OpKind); - X86Operand *ParseIntelMemOperand(unsigned SegReg, int64_t ImmDisp, - SMLoc StartLoc); - X86Operand *ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End); + X86Operand *ParseIntelSegmentOverride(unsigned SegReg, SMLoc Start, unsigned Size); + X86Operand *ParseIntelMemOperand(int64_t ImmDisp, SMLoc StartLoc, + unsigned Size); + bool ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End); X86Operand *ParseIntelBracExpression(unsigned SegReg, SMLoc Start, int64_t ImmDisp, unsigned Size); - X86Operand *ParseIntelIdentifier(const MCExpr *&Val, StringRef &Identifier, - InlineAsmIdentifierInfo &Info, - bool IsUnevaluatedOperand, SMLoc &End); + bool ParseIntelIdentifier(const MCExpr *&Val, StringRef &Identifier, + InlineAsmIdentifierInfo &Info, + bool IsUnevaluatedOperand, SMLoc &End); X86Operand *ParseMemOperand(unsigned SegReg, SMLoc StartLoc); @@ -552,8 +556,9 @@ private: /// } public: - X86AsmParser(MCSubtargetInfo &sti, MCAsmParser &parser) - : MCTargetAsmParser(), STI(sti), Parser(parser), InstInfo(0) { + X86AsmParser(MCSubtargetInfo &sti, MCAsmParser &parser, + const MCInstrInfo &MII) + : MCTargetAsmParser(), STI(sti), Parser(parser), InstInfo(0) { // Initialize the set of available features. 
setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits())); @@ -811,6 +816,9 @@ struct X86Operand : public MCParsedAsmOperand { bool isMem256() const { return Kind == Memory && (!Mem.Size || Mem.Size == 256); } + bool isMem512() const { + return Kind == Memory && (!Mem.Size || Mem.Size == 512); + } bool isMemVX32() const { return Kind == Memory && (!Mem.Size || Mem.Size == 32) && @@ -828,14 +836,45 @@ struct X86Operand : public MCParsedAsmOperand { return Kind == Memory && (!Mem.Size || Mem.Size == 64) && getMemIndexReg() >= X86::YMM0 && getMemIndexReg() <= X86::YMM15; } + bool isMemVZ32() const { + return Kind == Memory && (!Mem.Size || Mem.Size == 32) && + getMemIndexReg() >= X86::ZMM0 && getMemIndexReg() <= X86::ZMM31; + } + bool isMemVZ64() const { + return Kind == Memory && (!Mem.Size || Mem.Size == 64) && + getMemIndexReg() >= X86::ZMM0 && getMemIndexReg() <= X86::ZMM31; + } bool isAbsMem() const { return Kind == Memory && !getMemSegReg() && !getMemBaseReg() && !getMemIndexReg() && getMemScale() == 1; } + bool isMemOffs8() const { + return Kind == Memory && !getMemSegReg() && !getMemBaseReg() && + !getMemIndexReg() && getMemScale() == 1 && (!Mem.Size || Mem.Size == 8); + } + bool isMemOffs16() const { + return Kind == Memory && !getMemSegReg() && !getMemBaseReg() && + !getMemIndexReg() && getMemScale() == 1 && (!Mem.Size || Mem.Size == 16); + } + bool isMemOffs32() const { + return Kind == Memory && !getMemSegReg() && !getMemBaseReg() && + !getMemIndexReg() && getMemScale() == 1 && (!Mem.Size || Mem.Size == 32); + } + bool isMemOffs64() const { + return Kind == Memory && !getMemSegReg() && !getMemBaseReg() && + !getMemIndexReg() && getMemScale() == 1 && (!Mem.Size || Mem.Size == 64); + } + bool isReg() const { return Kind == Register; } + bool isGR32orGR64() const { + return Kind == Register && + (X86MCRegisterClasses[X86::GR32RegClassID].contains(getReg()) || + X86MCRegisterClasses[X86::GR64RegClassID].contains(getReg())); + } + void addExpr(MCInst &Inst, const MCExpr *Expr) const { // Add as immediates when possible. 
if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Expr)) @@ -849,43 +888,40 @@ struct X86Operand : public MCParsedAsmOperand { Inst.addOperand(MCOperand::CreateReg(getReg())); } - void addImmOperands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - addExpr(Inst, getImm()); + static unsigned getGR32FromGR64(unsigned RegNo) { + switch (RegNo) { + default: llvm_unreachable("Unexpected register"); + case X86::RAX: return X86::EAX; + case X86::RCX: return X86::ECX; + case X86::RDX: return X86::EDX; + case X86::RBX: return X86::EBX; + case X86::RBP: return X86::EBP; + case X86::RSP: return X86::ESP; + case X86::RSI: return X86::ESI; + case X86::RDI: return X86::EDI; + case X86::R8: return X86::R8D; + case X86::R9: return X86::R9D; + case X86::R10: return X86::R10D; + case X86::R11: return X86::R11D; + case X86::R12: return X86::R12D; + case X86::R13: return X86::R13D; + case X86::R14: return X86::R14D; + case X86::R15: return X86::R15D; + case X86::RIP: return X86::EIP; + } } - void addMem8Operands(MCInst &Inst, unsigned N) const { - addMemOperands(Inst, N); - } - void addMem16Operands(MCInst &Inst, unsigned N) const { - addMemOperands(Inst, N); - } - void addMem32Operands(MCInst &Inst, unsigned N) const { - addMemOperands(Inst, N); - } - void addMem64Operands(MCInst &Inst, unsigned N) const { - addMemOperands(Inst, N); - } - void addMem80Operands(MCInst &Inst, unsigned N) const { - addMemOperands(Inst, N); - } - void addMem128Operands(MCInst &Inst, unsigned N) const { - addMemOperands(Inst, N); - } - void addMem256Operands(MCInst &Inst, unsigned N) const { - addMemOperands(Inst, N); - } - void addMemVX32Operands(MCInst &Inst, unsigned N) const { - addMemOperands(Inst, N); - } - void addMemVY32Operands(MCInst &Inst, unsigned N) const { - addMemOperands(Inst, N); - } - void addMemVX64Operands(MCInst &Inst, unsigned N) const { - addMemOperands(Inst, N); + void addGR32orGR64Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + unsigned RegNo = getReg(); + if (X86MCRegisterClasses[X86::GR64RegClassID].contains(RegNo)) + RegNo = getGR32FromGR64(RegNo); + Inst.addOperand(MCOperand::CreateReg(RegNo)); } - void addMemVY64Operands(MCInst &Inst, unsigned N) const { - addMemOperands(Inst, N); + + void addImmOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + addExpr(Inst, getImm()); } void addMemOperands(MCInst &Inst, unsigned N) const { @@ -906,6 +942,15 @@ struct X86Operand : public MCParsedAsmOperand { Inst.addOperand(MCOperand::CreateExpr(getMemDisp())); } + void addMemOffsOperands(MCInst &Inst, unsigned N) const { + assert((N == 1) && "Invalid number of operands!"); + // Add as immediates when possible. + if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getMemDisp())) + Inst.addOperand(MCOperand::CreateImm(CE->getValue())); + else + Inst.addOperand(MCOperand::CreateExpr(getMemDisp())); + } + static X86Operand *CreateToken(StringRef Str, SMLoc Loc) { SMLoc EndLoc = SMLoc::getFromPointer(Loc.getPointer() + Str.size()); X86Operand *Res = new X86Operand(Token, Loc, EndLoc); @@ -1194,6 +1239,7 @@ RewriteIntelBracExpression(SmallVectorImpl<AsmRewrite> *AsmRewrites, } } assert (Found && "Unable to rewrite ImmDisp."); + (void)Found; } else { // We have a symbolic and an immediate displacement, but no displacement // before the bracketed expression. 
Put the immediate displacement @@ -1223,8 +1269,7 @@ RewriteIntelBracExpression(SmallVectorImpl<AsmRewrite> *AsmRewrites, } } -X86Operand * -X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) { +bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) { const AsmToken &Tok = Parser.getTok(); bool Done = false; @@ -1246,7 +1291,7 @@ X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) { Done = true; break; } - return ErrorOperand(Tok.getLoc(), "Unexpected token!"); + return Error(Tok.getLoc(), "unknown token in expression"); } case AsmToken::EndOfStatement: { Done = true; @@ -1265,18 +1310,18 @@ X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) { } else { if (!isParsingInlineAsm()) { if (getParser().parsePrimaryExpr(Val, End)) - return ErrorOperand(Tok.getLoc(), "Unexpected identifier!"); + return Error(Tok.getLoc(), "Unexpected identifier!"); } else { InlineAsmIdentifierInfo &Info = SM.getIdentifierInfo(); - if (X86Operand *Err = ParseIntelIdentifier(Val, Identifier, Info, - /*Unevaluated*/ false, End)) - return Err; + if (ParseIntelIdentifier(Val, Identifier, Info, + /*Unevaluated=*/false, End)) + return true; } SM.onIdentifierExpr(Val, Identifier); UpdateLocLex = false; break; } - return ErrorOperand(Tok.getLoc(), "Unexpected identifier!"); + return Error(Tok.getLoc(), "Unexpected identifier!"); } case AsmToken::Integer: if (isParsingInlineAsm() && SM.getAddImmPrefix()) @@ -1294,14 +1339,14 @@ X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) { case AsmToken::RParen: SM.onRParen(); break; } if (SM.hadError()) - return ErrorOperand(Tok.getLoc(), "Unexpected token!"); + return Error(Tok.getLoc(), "unknown token in expression"); if (!Done && UpdateLocLex) { End = Tok.getLoc(); Parser.Lex(); // Consume the token. } } - return 0; + return false; } X86Operand *X86AsmParser::ParseIntelBracExpression(unsigned SegReg, SMLoc Start, @@ -1318,8 +1363,8 @@ X86Operand *X86AsmParser::ParseIntelBracExpression(unsigned SegReg, SMLoc Start, // may have already parsed an immediate displacement before the bracketed // expression. IntelExprStateMachine SM(ImmDisp, /*StopOnLBrac=*/false, /*AddImmPrefix=*/true); - if (X86Operand *Err = ParseIntelExpression(SM, End)) - return Err; + if (ParseIntelExpression(SM, End)) + return 0; const MCExpr *Disp; if (const MCExpr *Sym = SM.getSym()) { @@ -1337,8 +1382,8 @@ X86Operand *X86AsmParser::ParseIntelBracExpression(unsigned SegReg, SMLoc Start, // Parse the dot operator (e.g., [ebx].foo.bar). if (Tok.getString().startswith(".")) { const MCExpr *NewDisp; - if (X86Operand *Err = ParseIntelDotOperator(Disp, NewDisp)) - return Err; + if (ParseIntelDotOperator(Disp, NewDisp)) + return 0; End = Tok.getEndLoc(); Parser.Lex(); // Eat the field. @@ -1366,11 +1411,10 @@ X86Operand *X86AsmParser::ParseIntelBracExpression(unsigned SegReg, SMLoc Start, } // Inline assembly may use variable names with namespace alias qualifiers. 
-X86Operand *X86AsmParser::ParseIntelIdentifier(const MCExpr *&Val, - StringRef &Identifier, - InlineAsmIdentifierInfo &Info, - bool IsUnevaluatedOperand, - SMLoc &End) { +bool X86AsmParser::ParseIntelIdentifier(const MCExpr *&Val, + StringRef &Identifier, + InlineAsmIdentifierInfo &Info, + bool IsUnevaluatedOperand, SMLoc &End) { assert (isParsingInlineAsm() && "Expected to be parsing inline assembly."); Val = 0; @@ -1395,68 +1439,89 @@ X86Operand *X86AsmParser::ParseIntelIdentifier(const MCExpr *&Val, MCSymbol *Sym = getContext().GetOrCreateSymbol(Identifier); MCSymbolRefExpr::VariantKind Variant = MCSymbolRefExpr::VK_None; Val = MCSymbolRefExpr::Create(Sym, Variant, getParser().getContext()); - return 0; + return false; } -/// ParseIntelMemOperand - Parse intel style memory operand. -X86Operand *X86AsmParser::ParseIntelMemOperand(unsigned SegReg, - int64_t ImmDisp, - SMLoc Start) { - const AsmToken &Tok = Parser.getTok(); - SMLoc End; - - unsigned Size = getIntelMemOperandSize(Tok.getString()); - if (Size) { - Parser.Lex(); // Eat operand size (e.g., byte, word). - if (Tok.getString() != "PTR" && Tok.getString() != "ptr") - return ErrorOperand(Start, "Expected 'PTR' or 'ptr' token!"); - Parser.Lex(); // Eat ptr. - } - - // Parse ImmDisp [ BaseReg + Scale*IndexReg + Disp ]. +/// \brief Parse intel style segment override. +X86Operand *X86AsmParser::ParseIntelSegmentOverride(unsigned SegReg, + SMLoc Start, + unsigned Size) { + assert(SegReg != 0 && "Tried to parse a segment override without a segment!"); + const AsmToken &Tok = Parser.getTok(); // Eat colon. + if (Tok.isNot(AsmToken::Colon)) + return ErrorOperand(Tok.getLoc(), "Expected ':' token!"); + Parser.Lex(); // Eat ':' + + int64_t ImmDisp = 0; if (getLexer().is(AsmToken::Integer)) { + ImmDisp = Tok.getIntVal(); + AsmToken ImmDispToken = Parser.Lex(); // Eat the integer. + if (isParsingInlineAsm()) - InstInfo->AsmRewrites->push_back(AsmRewrite(AOK_ImmPrefix, - Tok.getLoc())); - int64_t ImmDisp = Tok.getIntVal(); - Parser.Lex(); // Eat the integer. - if (getLexer().isNot(AsmToken::LBrac)) - return ErrorOperand(Start, "Expected '[' token!"); - return ParseIntelBracExpression(SegReg, Start, ImmDisp, Size); + InstInfo->AsmRewrites->push_back( + AsmRewrite(AOK_ImmPrefix, ImmDispToken.getLoc())); + + if (getLexer().isNot(AsmToken::LBrac)) { + // An immediate following a 'segment register', 'colon' token sequence can + // be followed by a bracketed expression. If it isn't we know we have our + // final segment override. + const MCExpr *Disp = MCConstantExpr::Create(ImmDisp, getContext()); + return X86Operand::CreateMem(SegReg, Disp, /*BaseReg=*/0, /*IndexReg=*/0, + /*Scale=*/1, Start, ImmDispToken.getEndLoc(), + Size); + } } if (getLexer().is(AsmToken::LBrac)) return ParseIntelBracExpression(SegReg, Start, ImmDisp, Size); - if (!ParseRegister(SegReg, Start, End)) { - // Handel SegReg : [ ... 
] - if (getLexer().isNot(AsmToken::Colon)) - return ErrorOperand(Start, "Expected ':' token!"); - Parser.Lex(); // Eat : - if (getLexer().isNot(AsmToken::LBrac)) - return ErrorOperand(Start, "Expected '[' token!"); - return ParseIntelBracExpression(SegReg, Start, ImmDisp, Size); + const MCExpr *Val; + SMLoc End; + if (!isParsingInlineAsm()) { + if (getParser().parsePrimaryExpr(Val, End)) + return ErrorOperand(Tok.getLoc(), "unknown token in expression"); + + return X86Operand::CreateMem(Val, Start, End, Size); } + InlineAsmIdentifierInfo Info; + StringRef Identifier = Tok.getString(); + if (ParseIntelIdentifier(Val, Identifier, Info, + /*Unevaluated=*/false, End)) + return 0; + return CreateMemForInlineAsm(/*SegReg=*/0, Val, /*BaseReg=*/0,/*IndexReg=*/0, + /*Scale=*/1, Start, End, Size, Identifier, Info); +} + +/// ParseIntelMemOperand - Parse intel style memory operand. +X86Operand *X86AsmParser::ParseIntelMemOperand(int64_t ImmDisp, SMLoc Start, + unsigned Size) { + const AsmToken &Tok = Parser.getTok(); + SMLoc End; + + // Parse ImmDisp [ BaseReg + Scale*IndexReg + Disp ]. + if (getLexer().is(AsmToken::LBrac)) + return ParseIntelBracExpression(/*SegReg=*/0, Start, ImmDisp, Size); + const MCExpr *Val; if (!isParsingInlineAsm()) { if (getParser().parsePrimaryExpr(Val, End)) - return ErrorOperand(Tok.getLoc(), "Unexpected token!"); + return ErrorOperand(Tok.getLoc(), "unknown token in expression"); return X86Operand::CreateMem(Val, Start, End, Size); } InlineAsmIdentifierInfo Info; StringRef Identifier = Tok.getString(); - if (X86Operand *Err = ParseIntelIdentifier(Val, Identifier, Info, - /*Unevaluated*/ false, End)) - return Err; - return CreateMemForInlineAsm(/*SegReg=*/0, Val, /*BaseReg=*/0,/*IndexReg=*/0, + if (ParseIntelIdentifier(Val, Identifier, Info, + /*Unevaluated=*/false, End)) + return 0; + return CreateMemForInlineAsm(/*SegReg=*/0, Val, /*BaseReg=*/0, /*IndexReg=*/0, /*Scale=*/1, Start, End, Size, Identifier, Info); } /// Parse the '.' operator. -X86Operand *X86AsmParser::ParseIntelDotOperator(const MCExpr *Disp, +bool X86AsmParser::ParseIntelDotOperator(const MCExpr *Disp, const MCExpr *&NewDisp) { const AsmToken &Tok = Parser.getTok(); int64_t OrigDispVal, DotDispVal; @@ -1465,7 +1530,7 @@ X86Operand *X86AsmParser::ParseIntelDotOperator(const MCExpr *Disp, if (const MCConstantExpr *OrigDisp = dyn_cast<MCConstantExpr>(Disp)) OrigDispVal = OrigDisp->getValue(); else - return ErrorOperand(Tok.getLoc(), "Non-constant offsets are not supported!"); + return Error(Tok.getLoc(), "Non-constant offsets are not supported!"); // Drop the '.'. StringRef DotDispStr = Tok.getString().drop_front(1); @@ -1480,10 +1545,10 @@ X86Operand *X86AsmParser::ParseIntelDotOperator(const MCExpr *Disp, std::pair<StringRef, StringRef> BaseMember = DotDispStr.split('.'); if (SemaCallback->LookupInlineAsmField(BaseMember.first, BaseMember.second, DotDisp)) - return ErrorOperand(Tok.getLoc(), "Unable to lookup field reference!"); + return Error(Tok.getLoc(), "Unable to lookup field reference!"); DotDispVal = DotDisp; } else - return ErrorOperand(Tok.getLoc(), "Unexpected token type!"); + return Error(Tok.getLoc(), "Unexpected token type!"); if (isParsingInlineAsm() && Tok.is(AsmToken::Identifier)) { SMLoc Loc = SMLoc::getFromPointer(DotDispStr.data()); @@ -1494,7 +1559,7 @@ X86Operand *X86AsmParser::ParseIntelDotOperator(const MCExpr *Disp, } NewDisp = MCConstantExpr::Create(OrigDispVal + DotDispVal, getContext()); - return 0; + return false; } /// Parse the 'offset' operator. 
This operator is used to specify the @@ -1508,9 +1573,9 @@ X86Operand *X86AsmParser::ParseIntelOffsetOfOperator() { InlineAsmIdentifierInfo Info; SMLoc Start = Tok.getLoc(), End; StringRef Identifier = Tok.getString(); - if (X86Operand *Err = ParseIntelIdentifier(Val, Identifier, Info, - /*Unevaluated*/ false, End)) - return Err; + if (ParseIntelIdentifier(Val, Identifier, Info, + /*Unevaluated=*/false, End)) + return 0; // Don't emit the offset operator. InstInfo->AsmRewrites->push_back(AsmRewrite(AOK_Skip, OffsetOfLoc, 7)); @@ -1544,9 +1609,12 @@ X86Operand *X86AsmParser::ParseIntelOperator(unsigned OpKind) { InlineAsmIdentifierInfo Info; SMLoc Start = Tok.getLoc(), End; StringRef Identifier = Tok.getString(); - if (X86Operand *Err = ParseIntelIdentifier(Val, Identifier, Info, - /*Unevaluated*/ true, End)) - return Err; + if (ParseIntelIdentifier(Val, Identifier, Info, + /*Unevaluated=*/true, End)) + return 0; + + if (!Info.OpDecl) + return ErrorOperand(Start, "unable to lookup expression"); unsigned CVal = 0; switch(OpKind) { @@ -1567,7 +1635,7 @@ X86Operand *X86AsmParser::ParseIntelOperator(unsigned OpKind) { X86Operand *X86AsmParser::ParseIntelOperand() { const AsmToken &Tok = Parser.getTok(); - SMLoc Start = Tok.getLoc(), End; + SMLoc Start, End; // Offset, length, type and size operators. if (isParsingInlineAsm()) { @@ -1582,14 +1650,23 @@ X86Operand *X86AsmParser::ParseIntelOperand() { return ParseIntelOperator(IOK_TYPE); } + unsigned Size = getIntelMemOperandSize(Tok.getString()); + if (Size) { + Parser.Lex(); // Eat operand size (e.g., byte, word). + if (Tok.getString() != "PTR" && Tok.getString() != "ptr") + return ErrorOperand(Start, "Expected 'PTR' or 'ptr' token!"); + Parser.Lex(); // Eat ptr. + } + Start = Tok.getLoc(); + // Immediate. if (getLexer().is(AsmToken::Integer) || getLexer().is(AsmToken::Minus) || getLexer().is(AsmToken::LParen)) { AsmToken StartTok = Tok; IntelExprStateMachine SM(/*Imm=*/0, /*StopOnLBrac=*/true, /*AddImmPrefix=*/false); - if (X86Operand *Err = ParseIntelExpression(SM, End)) - return Err; + if (ParseIntelExpression(SM, End)) + return 0; int64_t Imm = SM.getImm(); if (isParsingInlineAsm()) { @@ -1613,23 +1690,22 @@ X86Operand *X86AsmParser::ParseIntelOperand() { "before bracketed expr."); // Parse ImmDisp [ BaseReg + Scale*IndexReg + Disp ]. - return ParseIntelMemOperand(/*SegReg=*/0, Imm, Start); + return ParseIntelMemOperand(Imm, Start, Size); } // Register. unsigned RegNo = 0; if (!ParseRegister(RegNo, Start, End)) { // If this is a segment register followed by a ':', then this is the start - // of a memory reference, otherwise this is a normal register reference. + // of a segment override, otherwise this is a normal register reference. if (getLexer().isNot(AsmToken::Colon)) return X86Operand::CreateReg(RegNo, Start, End); - getParser().Lex(); // Eat the colon. - return ParseIntelMemOperand(/*SegReg=*/RegNo, /*Disp=*/0, Start); + return ParseIntelSegmentOverride(/*SegReg=*/RegNo, Start, Size); } // Memory operand. 
- return ParseIntelMemOperand(/*SegReg=*/0, /*Disp=*/0, Start); + return ParseIntelMemOperand(/*Disp=*/0, Start, Size); } X86Operand *X86AsmParser::ParseATTOperand() { @@ -1941,6 +2017,47 @@ ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc, } } + if (STI.getFeatureBits() & X86::FeatureAVX512) { + // Parse mask register {%k1} + if (getLexer().is(AsmToken::LCurly)) { + SMLoc Loc = Parser.getTok().getLoc(); + Operands.push_back(X86Operand::CreateToken("{", Loc)); + Parser.Lex(); // Eat the { + if (X86Operand *Op = ParseOperand()) { + Operands.push_back(Op); + if (!getLexer().is(AsmToken::RCurly)) { + SMLoc Loc = getLexer().getLoc(); + Parser.eatToEndOfStatement(); + return Error(Loc, "Expected } at this point"); + } + Loc = Parser.getTok().getLoc(); + Operands.push_back(X86Operand::CreateToken("}", Loc)); + Parser.Lex(); // Eat the } + } else { + Parser.eatToEndOfStatement(); + return true; + } + } + // Parse "zeroing non-masked" semantic {z} + if (getLexer().is(AsmToken::LCurly)) { + SMLoc Loc = Parser.getTok().getLoc(); + Operands.push_back(X86Operand::CreateToken("{z}", Loc)); + Parser.Lex(); // Eat the { + if (!getLexer().is(AsmToken::Identifier) || getLexer().getTok().getIdentifier() != "z") { + SMLoc Loc = getLexer().getLoc(); + Parser.eatToEndOfStatement(); + return Error(Loc, "Expected z at this point"); + } + Parser.Lex(); // Eat the z + if (!getLexer().is(AsmToken::RCurly)) { + SMLoc Loc = getLexer().getLoc(); + Parser.eatToEndOfStatement(); + return Error(Loc, "Expected } at this point"); + } + Parser.Lex(); // Eat the } + } + } + if (getLexer().isNot(AsmToken::EndOfStatement)) { SMLoc Loc = getLexer().getLoc(); Parser.eatToEndOfStatement(); @@ -2192,6 +2309,55 @@ processInstruction(MCInst &Inst, case X86::SBB16i16: return convert16i16to16ri8(Inst, X86::SBB16ri8); case X86::SBB32i32: return convert32i32to32ri8(Inst, X86::SBB32ri8); case X86::SBB64i32: return convert64i32to64ri8(Inst, X86::SBB64ri8); + case X86::VMOVAPDrr: + case X86::VMOVAPDYrr: + case X86::VMOVAPSrr: + case X86::VMOVAPSYrr: + case X86::VMOVDQArr: + case X86::VMOVDQAYrr: + case X86::VMOVDQUrr: + case X86::VMOVDQUYrr: + case X86::VMOVUPDrr: + case X86::VMOVUPDYrr: + case X86::VMOVUPSrr: + case X86::VMOVUPSYrr: { + if (X86II::isX86_64ExtendedReg(Inst.getOperand(0).getReg()) || + !X86II::isX86_64ExtendedReg(Inst.getOperand(1).getReg())) + return false; + + unsigned NewOpc; + switch (Inst.getOpcode()) { + default: llvm_unreachable("Invalid opcode"); + case X86::VMOVAPDrr: NewOpc = X86::VMOVAPDrr_REV; break; + case X86::VMOVAPDYrr: NewOpc = X86::VMOVAPDYrr_REV; break; + case X86::VMOVAPSrr: NewOpc = X86::VMOVAPSrr_REV; break; + case X86::VMOVAPSYrr: NewOpc = X86::VMOVAPSYrr_REV; break; + case X86::VMOVDQArr: NewOpc = X86::VMOVDQArr_REV; break; + case X86::VMOVDQAYrr: NewOpc = X86::VMOVDQAYrr_REV; break; + case X86::VMOVDQUrr: NewOpc = X86::VMOVDQUrr_REV; break; + case X86::VMOVDQUYrr: NewOpc = X86::VMOVDQUYrr_REV; break; + case X86::VMOVUPDrr: NewOpc = X86::VMOVUPDrr_REV; break; + case X86::VMOVUPDYrr: NewOpc = X86::VMOVUPDYrr_REV; break; + case X86::VMOVUPSrr: NewOpc = X86::VMOVUPSrr_REV; break; + case X86::VMOVUPSYrr: NewOpc = X86::VMOVUPSYrr_REV; break; + } + Inst.setOpcode(NewOpc); + return true; + } + case X86::VMOVSDrr: + case X86::VMOVSSrr: { + if (X86II::isX86_64ExtendedReg(Inst.getOperand(0).getReg()) || + !X86II::isX86_64ExtendedReg(Inst.getOperand(2).getReg())) + return false; + unsigned NewOpc; + switch (Inst.getOpcode()) { + default: llvm_unreachable("Invalid opcode"); + case 
X86::VMOVSDrr: NewOpc = X86::VMOVSDrr_REV; break; + case X86::VMOVSSrr: NewOpc = X86::VMOVSSrr_REV; break; + } + Inst.setOpcode(NewOpc); + return true; + } } } diff --git a/contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp b/contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp index ca6f80c..903e36c 100644 --- a/contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp +++ b/contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp @@ -190,94 +190,8 @@ static bool tryAddingSymbolicOperand(int64_t Value, bool isBranch, uint64_t Address, uint64_t Offset, uint64_t Width, MCInst &MI, const MCDisassembler *Dis) { - LLVMOpInfoCallback getOpInfo = Dis->getLLVMOpInfoCallback(); - struct LLVMOpInfo1 SymbolicOp; - memset(&SymbolicOp, '\0', sizeof(struct LLVMOpInfo1)); - SymbolicOp.Value = Value; - void *DisInfo = Dis->getDisInfoBlock(); - - if (!getOpInfo || - !getOpInfo(DisInfo, Address, Offset, Width, 1, &SymbolicOp)) { - // Clear SymbolicOp.Value from above and also all other fields. - memset(&SymbolicOp, '\0', sizeof(struct LLVMOpInfo1)); - LLVMSymbolLookupCallback SymbolLookUp = Dis->getLLVMSymbolLookupCallback(); - if (!SymbolLookUp) - return false; - uint64_t ReferenceType; - if (isBranch) - ReferenceType = LLVMDisassembler_ReferenceType_In_Branch; - else - ReferenceType = LLVMDisassembler_ReferenceType_InOut_None; - const char *ReferenceName; - const char *Name = SymbolLookUp(DisInfo, Value, &ReferenceType, Address, - &ReferenceName); - if (Name) { - SymbolicOp.AddSymbol.Name = Name; - SymbolicOp.AddSymbol.Present = true; - } - // For branches always create an MCExpr so it gets printed as hex address. - else if (isBranch) { - SymbolicOp.Value = Value; - } - if(ReferenceType == LLVMDisassembler_ReferenceType_Out_SymbolStub) - (*Dis->CommentStream) << "symbol stub for: " << ReferenceName; - if (!Name && !isBranch) - return false; - } - - MCContext *Ctx = Dis->getMCContext(); - const MCExpr *Add = NULL; - if (SymbolicOp.AddSymbol.Present) { - if (SymbolicOp.AddSymbol.Name) { - StringRef Name(SymbolicOp.AddSymbol.Name); - MCSymbol *Sym = Ctx->GetOrCreateSymbol(Name); - Add = MCSymbolRefExpr::Create(Sym, *Ctx); - } else { - Add = MCConstantExpr::Create((int)SymbolicOp.AddSymbol.Value, *Ctx); - } - } - - const MCExpr *Sub = NULL; - if (SymbolicOp.SubtractSymbol.Present) { - if (SymbolicOp.SubtractSymbol.Name) { - StringRef Name(SymbolicOp.SubtractSymbol.Name); - MCSymbol *Sym = Ctx->GetOrCreateSymbol(Name); - Sub = MCSymbolRefExpr::Create(Sym, *Ctx); - } else { - Sub = MCConstantExpr::Create((int)SymbolicOp.SubtractSymbol.Value, *Ctx); - } - } - - const MCExpr *Off = NULL; - if (SymbolicOp.Value != 0) - Off = MCConstantExpr::Create(SymbolicOp.Value, *Ctx); - - const MCExpr *Expr; - if (Sub) { - const MCExpr *LHS; - if (Add) - LHS = MCBinaryExpr::CreateSub(Add, Sub, *Ctx); - else - LHS = MCUnaryExpr::CreateMinus(Sub, *Ctx); - if (Off != 0) - Expr = MCBinaryExpr::CreateAdd(LHS, Off, *Ctx); - else - Expr = LHS; - } else if (Add) { - if (Off != 0) - Expr = MCBinaryExpr::CreateAdd(Add, Off, *Ctx); - else - Expr = Add; - } else { - if (Off != 0) - Expr = Off; - else - Expr = MCConstantExpr::Create(0, *Ctx); - } - - MI.addOperand(MCOperand::CreateExpr(Expr)); - - return true; + return Dis->tryAddingSymbolicOperand(MI, Value, Address, isBranch, + Offset, Width); } /// tryAddingPcLoadReferenceComment - trys to add a comment as to what is being @@ -290,15 +204,7 @@ static bool tryAddingSymbolicOperand(int64_t Value, bool isBranch, static void tryAddingPcLoadReferenceComment(uint64_t 
Address, uint64_t Value, const void *Decoder) { const MCDisassembler *Dis = static_cast<const MCDisassembler*>(Decoder); - LLVMSymbolLookupCallback SymbolLookUp = Dis->getLLVMSymbolLookupCallback(); - if (SymbolLookUp) { - void *DisInfo = Dis->getDisInfoBlock(); - uint64_t ReferenceType = LLVMDisassembler_ReferenceType_In_PCrel_Load; - const char *ReferenceName; - (void)SymbolLookUp(DisInfo, Value, &ReferenceType, Address, &ReferenceName); - if(ReferenceType == LLVMDisassembler_ReferenceType_Out_LitPool_CstrAddr) - (*Dis->CommentStream) << "literal pool for: " << ReferenceName; - } + Dis->tryAddingPcLoadReferenceComment(Value, Address); } /// translateImmediate - Appends an immediate operand to an MCInst. @@ -325,16 +231,18 @@ static void translateImmediate(MCInst &mcInst, uint64_t immediate, default: break; case 1: - type = TYPE_MOFFS8; + if(immediate & 0x80) + immediate |= ~(0xffull); break; case 2: - type = TYPE_MOFFS16; + if(immediate & 0x8000) + immediate |= ~(0xffffull); break; case 4: - type = TYPE_MOFFS32; + if(immediate & 0x80000000) + immediate |= ~(0xffffffffull); break; case 8: - type = TYPE_MOFFS64; break; } } @@ -357,16 +265,18 @@ static void translateImmediate(MCInst &mcInst, uint64_t immediate, Opcode != X86::VMPSADBWrri && Opcode != X86::VDPPSYrri && Opcode != X86::VDPPSYrmi && Opcode != X86::VDPPDrri && Opcode != X86::VINSERTPSrr) - type = TYPE_MOFFS8; + if(immediate & 0x80) + immediate |= ~(0xffull); break; case ENCODING_IW: - type = TYPE_MOFFS16; + if(immediate & 0x8000) + immediate |= ~(0xffffull); break; case ENCODING_ID: - type = TYPE_MOFFS32; + if(immediate & 0x80000000) + immediate |= ~(0xffffffffull); break; case ENCODING_IO: - type = TYPE_MOFFS64; break; } } @@ -380,33 +290,27 @@ static void translateImmediate(MCInst &mcInst, uint64_t immediate, case TYPE_XMM256: mcInst.addOperand(MCOperand::CreateReg(X86::YMM0 + (immediate >> 4))); return; + case TYPE_XMM512: + mcInst.addOperand(MCOperand::CreateReg(X86::ZMM0 + (immediate >> 4))); + return; case TYPE_REL8: isBranch = true; pcrel = insn.startLocation + insn.immediateOffset + insn.immediateSize; - // fall through to sign extend the immediate if needed. - case TYPE_MOFFS8: if(immediate & 0x80) immediate |= ~(0xffull); break; - case TYPE_MOFFS16: - if(immediate & 0x8000) - immediate |= ~(0xffffull); - break; case TYPE_REL32: case TYPE_REL64: isBranch = true; pcrel = insn.startLocation + insn.immediateOffset + insn.immediateSize; - // fall through to sign extend the immediate if needed. - case TYPE_MOFFS32: if(immediate & 0x80000000) immediate |= ~(0xffffffffull); break; - case TYPE_MOFFS64: default: // operand is 64 bits wide. Do nothing. break; } - + if(!tryAddingSymbolicOperand(immediate + pcrel, isBranch, insn.startLocation, insn.immediateOffset, insn.immediateSize, mcInst, Dis)) @@ -537,6 +441,7 @@ static bool translateRMMemory(MCInst &mcInst, InternalInstruction &insn, EA_BASES_64BIT REGS_XMM REGS_YMM + REGS_ZMM #undef ENTRY } } else { @@ -659,6 +564,7 @@ static bool translateRM(MCInst &mcInst, const OperandSpecifier &operand, case TYPE_XMM64: case TYPE_XMM128: case TYPE_XMM256: + case TYPE_XMM512: case TYPE_DEBUGREG: case TYPE_CONTROLREG: return translateRMRegister(mcInst, insn); @@ -777,6 +683,15 @@ static bool translateInstruction(MCInst &mcInst, } mcInst.setOpcode(insn.instructionID); + // If when reading the prefix bytes we determined the overlapping 0xf2 or 0xf3 + // prefix bytes should be disassembled as xrelease and xacquire then set the + // opcode to those instead of the rep and repne opcodes. 
+ if (insn.xAcquireRelease) { + if(mcInst.getOpcode() == X86::REP_PREFIX) + mcInst.setOpcode(X86::XRELEASE_PREFIX); + else if(mcInst.getOpcode() == X86::REPNE_PREFIX) + mcInst.setOpcode(X86::XACQUIRE_PREFIX); + } int index; diff --git a/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c b/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c index e40edba..c81a857 100644 --- a/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c +++ b/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c @@ -25,8 +25,6 @@ #define TRUE 1 #define FALSE 0 -typedef int8_t bool; - #ifndef NDEBUG #define debug(s) do { x86DisassemblerDebug(__FILE__, __LINE__, s); } while (0) #else @@ -81,6 +79,15 @@ static int modRMRequired(OpcodeType type, case THREEBYTE_A7: decision = &THREEBYTEA7_SYM; break; + case XOP8_MAP: + decision = &XOP8_MAP_SYM; + break; + case XOP9_MAP: + decision = &XOP9_MAP_SYM; + break; + case XOPA_MAP: + decision = &XOPA_MAP_SYM; + break; } return decision->opcodeDecisions[insnContext].modRMDecisions[opcode]. @@ -122,6 +129,15 @@ static InstrUID decode(OpcodeType type, case THREEBYTE_A7: dec = &THREEBYTEA7_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode]; break; + case XOP8_MAP: + dec = &XOP8_MAP_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode]; + break; + case XOP9_MAP: + dec = &XOP9_MAP_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode]; + break; + case XOPA_MAP: + dec = &XOPA_MAP_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode]; + break; } switch (dec->modrm_type) { @@ -305,6 +321,7 @@ static int readPrefixes(struct InternalInstruction* insn) { BOOL prefixGroups[4] = { FALSE }; uint64_t prefixLocation; uint8_t byte = 0; + uint8_t nextByte; BOOL hasAdSize = FALSE; BOOL hasOpSize = FALSE; @@ -314,20 +331,42 @@ static int readPrefixes(struct InternalInstruction* insn) { while (isPrefix) { prefixLocation = insn->readerCursor; + /* If we fail reading prefixes, just stop here and let the opcode reader deal with it */ if (consumeByte(insn, &byte)) - return -1; + break; /* * If the byte is a LOCK/REP/REPNE prefix and not a part of the opcode, then * break and let it be disassembled as a normal "instruction". */ + if (insn->readerCursor - 1 == insn->startLocation && byte == 0xf0) + break; + if (insn->readerCursor - 1 == insn->startLocation - && (byte == 0xf0 || byte == 0xf2 || byte == 0xf3)) { - uint8_t nextByte; - if (byte == 0xf0) - break; - if (lookAtByte(insn, &nextByte)) - return -1; + && (byte == 0xf2 || byte == 0xf3) + && !lookAtByte(insn, &nextByte)) + { + /* + * If the byte is 0xf2 or 0xf3, and any of the following conditions are + * met: + * - it is followed by a LOCK (0xf0) prefix + * - it is followed by an xchg instruction + * then it should be disassembled as a xacquire/xrelease not repne/rep. + */ + if ((byte == 0xf2 || byte == 0xf3) && + ((nextByte == 0xf0) | + ((nextByte & 0xfe) == 0x86 || (nextByte & 0xf8) == 0x90))) + insn->xAcquireRelease = TRUE; + /* + * Also if the byte is 0xf3, and the following condition is met: + * - it is followed by a "mov mem, reg" (opcode 0x88/0x89) or + * "mov mem, imm" (opcode 0xc6/0xc7) instructions. + * then it should be disassembled as an xrelease not rep. 
+ */ + if (byte == 0xf3 && + (nextByte == 0x88 || nextByte == 0x89 || + nextByte == 0xc6 || nextByte == 0xc7)) + insn->xAcquireRelease = TRUE; if (insn->mode == MODE_64BIT && (nextByte & 0xf0) == 0x40) { if (consumeByte(insn, &nextByte)) return -1; @@ -405,7 +444,7 @@ static int readPrefixes(struct InternalInstruction* insn) { dbgprintf(insn, "Found prefix 0x%hhx", byte); } - insn->vexSize = 0; + insn->vexXopType = TYPE_NO_VEX_XOP; if (byte == 0xc4) { uint8_t byte1; @@ -416,7 +455,7 @@ static int readPrefixes(struct InternalInstruction* insn) { } if (insn->mode == MODE_64BIT || (byte1 & 0xc0) == 0xc0) { - insn->vexSize = 3; + insn->vexXopType = TYPE_VEX_3B; insn->necessaryPrefixLocation = insn->readerCursor - 1; } else { @@ -424,22 +463,22 @@ static int readPrefixes(struct InternalInstruction* insn) { insn->necessaryPrefixLocation = insn->readerCursor - 1; } - if (insn->vexSize == 3) { - insn->vexPrefix[0] = byte; - consumeByte(insn, &insn->vexPrefix[1]); - consumeByte(insn, &insn->vexPrefix[2]); + if (insn->vexXopType == TYPE_VEX_3B) { + insn->vexXopPrefix[0] = byte; + consumeByte(insn, &insn->vexXopPrefix[1]); + consumeByte(insn, &insn->vexXopPrefix[2]); /* We simulate the REX prefix for simplicity's sake */ if (insn->mode == MODE_64BIT) { insn->rexPrefix = 0x40 - | (wFromVEX3of3(insn->vexPrefix[2]) << 3) - | (rFromVEX2of3(insn->vexPrefix[1]) << 2) - | (xFromVEX2of3(insn->vexPrefix[1]) << 1) - | (bFromVEX2of3(insn->vexPrefix[1]) << 0); + | (wFromVEX3of3(insn->vexXopPrefix[2]) << 3) + | (rFromVEX2of3(insn->vexXopPrefix[1]) << 2) + | (xFromVEX2of3(insn->vexXopPrefix[1]) << 1) + | (bFromVEX2of3(insn->vexXopPrefix[1]) << 0); } - switch (ppFromVEX3of3(insn->vexPrefix[2])) + switch (ppFromVEX3of3(insn->vexXopPrefix[2])) { default: break; @@ -448,7 +487,9 @@ static int readPrefixes(struct InternalInstruction* insn) { break; } - dbgprintf(insn, "Found VEX prefix 0x%hhx 0x%hhx 0x%hhx", insn->vexPrefix[0], insn->vexPrefix[1], insn->vexPrefix[2]); + dbgprintf(insn, "Found VEX prefix 0x%hhx 0x%hhx 0x%hhx", + insn->vexXopPrefix[0], insn->vexXopPrefix[1], + insn->vexXopPrefix[2]); } } else if (byte == 0xc5) { @@ -460,22 +501,22 @@ static int readPrefixes(struct InternalInstruction* insn) { } if (insn->mode == MODE_64BIT || (byte1 & 0xc0) == 0xc0) { - insn->vexSize = 2; + insn->vexXopType = TYPE_VEX_2B; } else { unconsumeByte(insn); } - if (insn->vexSize == 2) { - insn->vexPrefix[0] = byte; - consumeByte(insn, &insn->vexPrefix[1]); + if (insn->vexXopType == TYPE_VEX_2B) { + insn->vexXopPrefix[0] = byte; + consumeByte(insn, &insn->vexXopPrefix[1]); if (insn->mode == MODE_64BIT) { insn->rexPrefix = 0x40 - | (rFromVEX2of2(insn->vexPrefix[1]) << 2); + | (rFromVEX2of2(insn->vexXopPrefix[1]) << 2); } - switch (ppFromVEX2of2(insn->vexPrefix[1])) + switch (ppFromVEX2of2(insn->vexXopPrefix[1])) { default: break; @@ -484,7 +525,53 @@ static int readPrefixes(struct InternalInstruction* insn) { break; } - dbgprintf(insn, "Found VEX prefix 0x%hhx 0x%hhx", insn->vexPrefix[0], insn->vexPrefix[1]); + dbgprintf(insn, "Found VEX prefix 0x%hhx 0x%hhx", insn->vexXopPrefix[0], insn->vexXopPrefix[1]); + } + } + else if (byte == 0x8f) { + uint8_t byte1; + + if (lookAtByte(insn, &byte1)) { + dbgprintf(insn, "Couldn't read second byte of XOP"); + return -1; + } + + if ((byte1 & 0x38) != 0x0) { /* 0 in these 3 bits is a POP instruction. 
*/ + insn->vexXopType = TYPE_XOP; + insn->necessaryPrefixLocation = insn->readerCursor - 1; + } + else { + unconsumeByte(insn); + insn->necessaryPrefixLocation = insn->readerCursor - 1; + } + + if (insn->vexXopType == TYPE_XOP) { + insn->vexXopPrefix[0] = byte; + consumeByte(insn, &insn->vexXopPrefix[1]); + consumeByte(insn, &insn->vexXopPrefix[2]); + + /* We simulate the REX prefix for simplicity's sake */ + + if (insn->mode == MODE_64BIT) { + insn->rexPrefix = 0x40 + | (wFromXOP3of3(insn->vexXopPrefix[2]) << 3) + | (rFromXOP2of3(insn->vexXopPrefix[1]) << 2) + | (xFromXOP2of3(insn->vexXopPrefix[1]) << 1) + | (bFromXOP2of3(insn->vexXopPrefix[1]) << 0); + } + + switch (ppFromXOP3of3(insn->vexXopPrefix[2])) + { + default: + break; + case VEX_PREFIX_66: + hasOpSize = TRUE; + break; + } + + dbgprintf(insn, "Found XOP prefix 0x%hhx 0x%hhx 0x%hhx", + insn->vexXopPrefix[0], insn->vexXopPrefix[1], + insn->vexXopPrefix[2]); } } else { @@ -559,37 +646,49 @@ static int readOpcode(struct InternalInstruction* insn) { insn->opcodeType = ONEBYTE; - if (insn->vexSize == 3) + if (insn->vexXopType == TYPE_VEX_3B) { - switch (mmmmmFromVEX2of3(insn->vexPrefix[1])) + switch (mmmmmFromVEX2of3(insn->vexXopPrefix[1])) { default: - dbgprintf(insn, "Unhandled m-mmmm field for instruction (0x%hhx)", mmmmmFromVEX2of3(insn->vexPrefix[1])); + dbgprintf(insn, "Unhandled m-mmmm field for instruction (0x%hhx)", + mmmmmFromVEX2of3(insn->vexXopPrefix[1])); return -1; - case 0: - break; case VEX_LOB_0F: - insn->twoByteEscape = 0x0f; insn->opcodeType = TWOBYTE; return consumeByte(insn, &insn->opcode); case VEX_LOB_0F38: - insn->twoByteEscape = 0x0f; - insn->threeByteEscape = 0x38; insn->opcodeType = THREEBYTE_38; return consumeByte(insn, &insn->opcode); case VEX_LOB_0F3A: - insn->twoByteEscape = 0x0f; - insn->threeByteEscape = 0x3a; insn->opcodeType = THREEBYTE_3A; return consumeByte(insn, &insn->opcode); } } - else if (insn->vexSize == 2) + else if (insn->vexXopType == TYPE_VEX_2B) { - insn->twoByteEscape = 0x0f; insn->opcodeType = TWOBYTE; return consumeByte(insn, &insn->opcode); } + else if (insn->vexXopType == TYPE_XOP) + { + switch (mmmmmFromXOP2of3(insn->vexXopPrefix[1])) + { + default: + dbgprintf(insn, "Unhandled m-mmmm field for instruction (0x%hhx)", + mmmmmFromVEX2of3(insn->vexXopPrefix[1])); + return -1; + case XOP_MAP_SELECT_8: + insn->opcodeType = XOP8_MAP; + return consumeByte(insn, &insn->opcode); + case XOP_MAP_SELECT_9: + insn->opcodeType = XOP9_MAP; + return consumeByte(insn, &insn->opcode); + case XOP_MAP_SELECT_A: + insn->opcodeType = XOPA_MAP; + return consumeByte(insn, &insn->opcode); + } + } if (consumeByte(insn, &current)) return -1; @@ -597,16 +696,12 @@ static int readOpcode(struct InternalInstruction* insn) { if (current == 0x0f) { dbgprintf(insn, "Found a two-byte escape prefix (0x%hhx)", current); - insn->twoByteEscape = current; - if (consumeByte(insn, &current)) return -1; if (current == 0x38) { dbgprintf(insn, "Found a three-byte escape prefix (0x%hhx)", current); - insn->threeByteEscape = current; - if (consumeByte(insn, &current)) return -1; @@ -614,8 +709,6 @@ } else if (current == 0x3a) { dbgprintf(insn, "Found a three-byte escape prefix (0x%hhx)", current); - insn->threeByteEscape = current; - if (consumeByte(insn, &current)) return -1; @@ -623,8 +716,6 @@ } else if (current == 0xa6) { dbgprintf(insn, "Found a three-byte escape prefix (0x%hhx)", current); - insn->threeByteEscape = current; - if
(consumeByte(insn, &current)) return -1; @@ -632,8 +723,6 @@ } else if (current == 0xa7) { dbgprintf(insn, "Found a three-byte escape prefix (0x%hhx)", current); - insn->threeByteEscape = current; - if (consumeByte(insn, &current)) return -1; @@ -747,11 +836,27 @@ static int getID(struct InternalInstruction* insn, const void *miiArg) { if (insn->mode == MODE_64BIT) attrMask |= ATTR_64BIT; - if (insn->vexSize) { + if (insn->vexXopType != TYPE_NO_VEX_XOP) { attrMask |= ATTR_VEX; - if (insn->vexSize == 3) { - switch (ppFromVEX3of3(insn->vexPrefix[2])) { + if (insn->vexXopType == TYPE_VEX_3B) { + switch (ppFromVEX3of3(insn->vexXopPrefix[2])) { + case VEX_PREFIX_66: + attrMask |= ATTR_OPSIZE; + break; + case VEX_PREFIX_F3: + attrMask |= ATTR_XS; + break; + case VEX_PREFIX_F2: + attrMask |= ATTR_XD; + break; + } + + if (lFromVEX3of3(insn->vexXopPrefix[2])) + attrMask |= ATTR_VEXL; + } + else if (insn->vexXopType == TYPE_VEX_2B) { + switch (ppFromVEX2of2(insn->vexXopPrefix[1])) { case VEX_PREFIX_66: attrMask |= ATTR_OPSIZE; break; @@ -763,11 +868,11 @@ static int getID(struct InternalInstruction* insn, const void *miiArg) { break; } - if (lFromVEX3of3(insn->vexPrefix[2])) + if (lFromVEX2of2(insn->vexXopPrefix[1])) attrMask |= ATTR_VEXL; } - else if (insn->vexSize == 2) { - switch (ppFromVEX2of2(insn->vexPrefix[1])) { + else if (insn->vexXopType == TYPE_XOP) { + switch (ppFromXOP3of3(insn->vexXopPrefix[2])) { case VEX_PREFIX_66: attrMask |= ATTR_OPSIZE; break; @@ -779,7 +884,7 @@ static int getID(struct InternalInstruction* insn, const void *miiArg) { break; } - if (lFromVEX2of2(insn->vexPrefix[1])) + if (lFromXOP3of3(insn->vexXopPrefix[2])) attrMask |= ATTR_VEXL; } else { @@ -805,42 +910,6 @@ static int getID(struct InternalInstruction* insn, const void *miiArg) { /* The following clauses compensate for limitations of the tables. */ - if ((attrMask & ATTR_VEXL) && (attrMask & ATTR_REXW) && - !(attrMask & ATTR_OPSIZE)) { - /* - * Some VEX instructions ignore the L-bit, but use the W-bit. Normally L-bit - * has precedence since there are no L-bit with W-bit entries in the tables. - * So if the L-bit isn't significant we should use the W-bit instead. - * We only need to do this if the instruction doesn't specify OpSize since - * there is a VEX_L_W_OPSIZE table.
- */ - - const struct InstructionSpecifier *spec; - uint16_t instructionIDWithWBit; - const struct InstructionSpecifier *specWithWBit; - - spec = specifierForUID(instructionID); - - if (getIDWithAttrMask(&instructionIDWithWBit, - insn, - (attrMask & (~ATTR_VEXL)) | ATTR_REXW)) { - insn->instructionID = instructionID; - insn->spec = spec; - return 0; - } - - specWithWBit = specifierForUID(instructionIDWithWBit); - - if (instructionID != instructionIDWithWBit) { - insn->instructionID = instructionIDWithWBit; - insn->spec = specWithWBit; - } else { - insn->instructionID = instructionID; - insn->spec = spec; - } - return 0; - } - if (insn->prefixPresent[0x66] && !(attrMask & ATTR_OPSIZE)) { /* * The instruction tables make no distinction between instructions that @@ -1234,6 +1303,8 @@ static int readModRM(struct InternalInstruction* insn) { return prefix##_EAX + index; \ case TYPE_R64: \ return prefix##_RAX + index; \ + case TYPE_XMM512: \ + return prefix##_ZMM0 + index; \ case TYPE_XMM256: \ return prefix##_YMM0 + index; \ case TYPE_XMM128: \ @@ -1479,10 +1550,12 @@ static int readImmediate(struct InternalInstruction* insn, uint8_t size) { static int readVVVV(struct InternalInstruction* insn) { dbgprintf(insn, "readVVVV()"); - if (insn->vexSize == 3) - insn->vvvv = vvvvFromVEX3of3(insn->vexPrefix[2]); - else if (insn->vexSize == 2) - insn->vvvv = vvvvFromVEX2of2(insn->vexPrefix[1]); + if (insn->vexXopType == TYPE_VEX_3B) + insn->vvvv = vvvvFromVEX3of3(insn->vexXopPrefix[2]); + else if (insn->vexXopType == TYPE_VEX_2B) + insn->vvvv = vvvvFromVEX2of2(insn->vexXopPrefix[1]); + else if (insn->vexXopType == TYPE_XOP) + insn->vvvv = vvvvFromXOP3of3(insn->vexXopPrefix[2]); else return -1; diff --git a/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h b/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h index 407ead3..6d03d5c 100644 --- a/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h +++ b/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h @@ -59,6 +59,15 @@ extern "C" { #define lFromVEX2of2(vex) (((vex) & 0x4) >> 2) #define ppFromVEX2of2(vex) ((vex) & 0x3) +#define rFromXOP2of3(xop) (((~(xop)) & 0x80) >> 7) +#define xFromXOP2of3(xop) (((~(xop)) & 0x40) >> 6) +#define bFromXOP2of3(xop) (((~(xop)) & 0x20) >> 5) +#define mmmmmFromXOP2of3(xop) ((xop) & 0x1f) +#define wFromXOP3of3(xop) (((xop) & 0x80) >> 7) +#define vvvvFromXOP3of3(vex) (((~(vex)) & 0x78) >> 3) +#define lFromXOP3of3(xop) (((xop) & 0x4) >> 2) +#define ppFromXOP3of3(xop) ((xop) & 0x3) + /* * These enums represent Intel registers for use by the decoder. 
*/ @@ -219,7 +228,23 @@ extern "C" { ENTRY(XMM12) \ ENTRY(XMM13) \ ENTRY(XMM14) \ - ENTRY(XMM15) + ENTRY(XMM15) \ + ENTRY(XMM16) \ + ENTRY(XMM17) \ + ENTRY(XMM18) \ + ENTRY(XMM19) \ + ENTRY(XMM20) \ + ENTRY(XMM21) \ + ENTRY(XMM22) \ + ENTRY(XMM23) \ + ENTRY(XMM24) \ + ENTRY(XMM25) \ + ENTRY(XMM26) \ + ENTRY(XMM27) \ + ENTRY(XMM28) \ + ENTRY(XMM29) \ + ENTRY(XMM30) \ + ENTRY(XMM31) #define REGS_YMM \ ENTRY(YMM0) \ @@ -237,7 +262,57 @@ extern "C" { ENTRY(YMM12) \ ENTRY(YMM13) \ ENTRY(YMM14) \ - ENTRY(YMM15) + ENTRY(YMM15) \ + ENTRY(YMM16) \ + ENTRY(YMM17) \ + ENTRY(YMM18) \ + ENTRY(YMM19) \ + ENTRY(YMM20) \ + ENTRY(YMM21) \ + ENTRY(YMM22) \ + ENTRY(YMM23) \ + ENTRY(YMM24) \ + ENTRY(YMM25) \ + ENTRY(YMM26) \ + ENTRY(YMM27) \ + ENTRY(YMM28) \ + ENTRY(YMM29) \ + ENTRY(YMM30) \ + ENTRY(YMM31) + +#define REGS_ZMM \ + ENTRY(ZMM0) \ + ENTRY(ZMM1) \ + ENTRY(ZMM2) \ + ENTRY(ZMM3) \ + ENTRY(ZMM4) \ + ENTRY(ZMM5) \ + ENTRY(ZMM6) \ + ENTRY(ZMM7) \ + ENTRY(ZMM8) \ + ENTRY(ZMM9) \ + ENTRY(ZMM10) \ + ENTRY(ZMM11) \ + ENTRY(ZMM12) \ + ENTRY(ZMM13) \ + ENTRY(ZMM14) \ + ENTRY(ZMM15) \ + ENTRY(ZMM16) \ + ENTRY(ZMM17) \ + ENTRY(ZMM18) \ + ENTRY(ZMM19) \ + ENTRY(ZMM20) \ + ENTRY(ZMM21) \ + ENTRY(ZMM22) \ + ENTRY(ZMM23) \ + ENTRY(ZMM24) \ + ENTRY(ZMM25) \ + ENTRY(ZMM26) \ + ENTRY(ZMM27) \ + ENTRY(ZMM28) \ + ENTRY(ZMM29) \ + ENTRY(ZMM30) \ + ENTRY(ZMM31) #define REGS_SEGMENT \ ENTRY(ES) \ @@ -285,6 +360,7 @@ extern "C" { REGS_MMX \ REGS_XMM \ REGS_YMM \ + REGS_ZMM \ REGS_SEGMENT \ REGS_DEBUG \ REGS_CONTROL \ @@ -319,6 +395,7 @@ typedef enum { ALL_EA_BASES REGS_XMM REGS_YMM + REGS_ZMM #undef ENTRY SIB_INDEX_max } SIBIndex; @@ -379,6 +456,12 @@ typedef enum { VEX_LOB_0F3A = 0x3 } VEXLeadingOpcodeByte; +typedef enum { + XOP_MAP_SELECT_8 = 0x8, + XOP_MAP_SELECT_9 = 0x9, + XOP_MAP_SELECT_A = 0xA +} XOPMapSelect; + /* * VEXPrefixCode - Possible values for the VEX.pp field */ @@ -390,6 +473,13 @@ typedef enum { VEX_PREFIX_F2 = 0x3 } VEXPrefixCode; +typedef enum { + TYPE_NO_VEX_XOP = 0x0, + TYPE_VEX_2B = 0x1, + TYPE_VEX_3B = 0x2, + TYPE_XOP = 0x3 +} VEXXOPType; + typedef uint8_t BOOL; /* @@ -446,10 +536,10 @@ struct InternalInstruction { uint8_t prefixPresent[0x100]; /* contains the location (for use with the reader) of the prefix byte */ uint64_t prefixLocations[0x100]; - /* The value of the VEX prefix, if present */ - uint8_t vexPrefix[3]; + /* The value of the VEX/XOP prefix, if present */ + uint8_t vexXopPrefix[3]; /* The length of the VEX prefix (0 if not present) */ - uint8_t vexSize; + VEXXOPType vexXopType; /* The value of the REX prefix, if present */ uint8_t rexPrefix; /* The location where a mandatory prefix would have to be (i.e., right before @@ -457,6 +547,8 @@ struct InternalInstruction { uint64_t necessaryPrefixLocation; /* The segment override type */ SegmentOverride segmentOverride; + /* 1 if the prefix byte, 0xf2 or 0xf3 is xacquire or xrelease */ + BOOL xAcquireRelease; /* Sizes of various critical pieces of data, in bytes */ uint8_t registerSize; @@ -471,10 +563,6 @@ struct InternalInstruction { /* opcode state */ - /* The value of the two-byte escape prefix (usually 0x0f) */ - uint8_t twoByteEscape; - /* The value of the three-byte escape prefix (usually 0x38 or 0x3a) */ - uint8_t threeByteEscape; /* The last byte of the opcode, not counting any ModR/M extension */ uint8_t opcode; /* The ModR/M byte of the instruction, if it is an opcode extension */ diff --git a/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h 
b/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h index 23dfe4b..dd1719c 100644 --- a/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h +++ b/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h @@ -32,6 +32,9 @@ #define THREEBYTE3A_SYM x86DisassemblerThreeByte3AOpcodes #define THREEBYTEA6_SYM x86DisassemblerThreeByteA6Opcodes #define THREEBYTEA7_SYM x86DisassemblerThreeByteA7Opcodes +#define XOP8_MAP_SYM x86DisassemblerXOP8Opcodes +#define XOP9_MAP_SYM x86DisassemblerXOP9Opcodes +#define XOPA_MAP_SYM x86DisassemblerXOPAOpcodes #define INSTRUCTIONS_STR "x86DisassemblerInstrSpecifiers" #define CONTEXTS_STR "x86DisassemblerContexts" @@ -41,6 +44,9 @@ #define THREEBYTE3A_STR "x86DisassemblerThreeByte3AOpcodes" #define THREEBYTEA6_STR "x86DisassemblerThreeByteA6Opcodes" #define THREEBYTEA7_STR "x86DisassemblerThreeByteA7Opcodes" +#define XOP8_MAP_STR "x86DisassemblerXOP8Opcodes" +#define XOP9_MAP_STR "x86DisassemblerXOP9Opcodes" +#define XOPA_MAP_STR "x86DisassemblerXOPAOpcodes" /* * Attributes of an instruction that must be known before the opcode can be @@ -116,8 +122,154 @@ enum attributeBits { ENUM_ENTRY(IC_VEX_L_XS, 4, "requires VEX and the L and XS prefix")\ ENUM_ENTRY(IC_VEX_L_XD, 4, "requires VEX and the L and XD prefix")\ ENUM_ENTRY(IC_VEX_L_OPSIZE, 4, "requires VEX, L, and OpSize") \ - ENUM_ENTRY(IC_VEX_L_W_OPSIZE, 5, "requires VEX, L, W and OpSize") - + ENUM_ENTRY(IC_VEX_L_W, 4, "requires VEX, L and W") \ + ENUM_ENTRY(IC_VEX_L_W_XS, 5, "requires VEX, L, W and XS prefix") \ + ENUM_ENTRY(IC_VEX_L_W_XD, 5, "requires VEX, L, W and XD prefix") \ + ENUM_ENTRY(IC_VEX_L_W_OPSIZE, 5, "requires VEX, L, W and OpSize") \ + ENUM_ENTRY(IC_EVEX, 1, "requires an EVEX prefix") \ + ENUM_ENTRY(IC_EVEX_XS, 2, "requires EVEX and the XS prefix") \ + ENUM_ENTRY(IC_EVEX_XD, 2, "requires EVEX and the XD prefix") \ + ENUM_ENTRY(IC_EVEX_OPSIZE, 2, "requires EVEX and the OpSize prefix") \ + ENUM_ENTRY(IC_EVEX_W, 3, "requires EVEX and the W prefix") \ + ENUM_ENTRY(IC_EVEX_W_XS, 4, "requires EVEX, W, and XS prefix") \ + ENUM_ENTRY(IC_EVEX_W_XD, 4, "requires EVEX, W, and XD prefix") \ + ENUM_ENTRY(IC_EVEX_W_OPSIZE, 4, "requires EVEX, W, and OpSize") \ + ENUM_ENTRY(IC_EVEX_L, 3, "requires EVEX and the L prefix") \ + ENUM_ENTRY(IC_EVEX_L_XS, 4, "requires EVEX and the L and XS prefix")\ + ENUM_ENTRY(IC_EVEX_L_XD, 4, "requires EVEX and the L and XD prefix")\ + ENUM_ENTRY(IC_EVEX_L_OPSIZE, 4, "requires EVEX, L, and OpSize") \ + ENUM_ENTRY(IC_EVEX_L_W, 3, "requires EVEX, L and W") \ + ENUM_ENTRY(IC_EVEX_L_W_XS, 4, "requires EVEX, L, W and XS prefix") \ + ENUM_ENTRY(IC_EVEX_L_W_XD, 4, "requires EVEX, L, W and XD prefix") \ + ENUM_ENTRY(IC_EVEX_L_W_OPSIZE, 4, "requires EVEX, L, W and OpSize") \ + ENUM_ENTRY(IC_EVEX_L2, 3, "requires EVEX and the L2 prefix") \ + ENUM_ENTRY(IC_EVEX_L2_XS, 4, "requires EVEX and the L2 and XS prefix")\ + ENUM_ENTRY(IC_EVEX_L2_XD, 4, "requires EVEX and the L2 and XD prefix")\ + ENUM_ENTRY(IC_EVEX_L2_OPSIZE, 4, "requires EVEX, L2, and OpSize") \ + ENUM_ENTRY(IC_EVEX_L2_W, 3, "requires EVEX, L2 and W") \ + ENUM_ENTRY(IC_EVEX_L2_W_XS, 4, "requires EVEX, L2, W and XS prefix") \ + ENUM_ENTRY(IC_EVEX_L2_W_XD, 4, "requires EVEX, L2, W and XD prefix") \ + ENUM_ENTRY(IC_EVEX_L2_W_OPSIZE, 4, "requires EVEX, L2, W and OpSize") \ + ENUM_ENTRY(IC_EVEX_K, 1, "requires an EVEX_K prefix") \ + ENUM_ENTRY(IC_EVEX_XS_K, 2, "requires EVEX_K and the XS prefix") \ + ENUM_ENTRY(IC_EVEX_XD_K, 2, "requires EVEX_K and the XD prefix") \ + 
ENUM_ENTRY(IC_EVEX_OPSIZE_K, 2, "requires EVEX_K and the OpSize prefix") \ + ENUM_ENTRY(IC_EVEX_W_K, 3, "requires EVEX_K and the W prefix") \ + ENUM_ENTRY(IC_EVEX_W_XS_K, 4, "requires EVEX_K, W, and XS prefix") \ + ENUM_ENTRY(IC_EVEX_W_XD_K, 4, "requires EVEX_K, W, and XD prefix") \ + ENUM_ENTRY(IC_EVEX_W_OPSIZE_K, 4, "requires EVEX_K, W, and OpSize") \ + ENUM_ENTRY(IC_EVEX_L_K, 3, "requires EVEX_K and the L prefix") \ + ENUM_ENTRY(IC_EVEX_L_XS_K, 4, "requires EVEX_K and the L and XS prefix")\ + ENUM_ENTRY(IC_EVEX_L_XD_K, 4, "requires EVEX_K and the L and XD prefix")\ + ENUM_ENTRY(IC_EVEX_L_OPSIZE_K, 4, "requires EVEX_K, L, and OpSize") \ + ENUM_ENTRY(IC_EVEX_L_W_K, 3, "requires EVEX_K, L and W") \ + ENUM_ENTRY(IC_EVEX_L_W_XS_K, 4, "requires EVEX_K, L, W and XS prefix") \ + ENUM_ENTRY(IC_EVEX_L_W_XD_K, 4, "requires EVEX_K, L, W and XD prefix") \ + ENUM_ENTRY(IC_EVEX_L_W_OPSIZE_K, 4, "requires EVEX_K, L, W and OpSize") \ + ENUM_ENTRY(IC_EVEX_L2_K, 3, "requires EVEX_K and the L2 prefix") \ + ENUM_ENTRY(IC_EVEX_L2_XS_K, 4, "requires EVEX_K and the L2 and XS prefix")\ + ENUM_ENTRY(IC_EVEX_L2_XD_K, 4, "requires EVEX_K and the L2 and XD prefix")\ + ENUM_ENTRY(IC_EVEX_L2_OPSIZE_K, 4, "requires EVEX_K, L2, and OpSize") \ + ENUM_ENTRY(IC_EVEX_L2_W_K, 3, "requires EVEX_K, L2 and W") \ + ENUM_ENTRY(IC_EVEX_L2_W_XS_K, 4, "requires EVEX_K, L2, W and XS prefix") \ + ENUM_ENTRY(IC_EVEX_L2_W_XD_K, 4, "requires EVEX_K, L2, W and XD prefix") \ + ENUM_ENTRY(IC_EVEX_L2_W_OPSIZE_K, 4, "requires EVEX_K, L2, W and OpSize") \ + ENUM_ENTRY(IC_EVEX_B, 1, "requires an EVEX_B prefix") \ + ENUM_ENTRY(IC_EVEX_XS_B, 2, "requires EVEX_B and the XS prefix") \ + ENUM_ENTRY(IC_EVEX_XD_B, 2, "requires EVEX_B and the XD prefix") \ + ENUM_ENTRY(IC_EVEX_OPSIZE_B, 2, "requires EVEX_B and the OpSize prefix") \ + ENUM_ENTRY(IC_EVEX_W_B, 3, "requires EVEX_B and the W prefix") \ + ENUM_ENTRY(IC_EVEX_W_XS_B, 4, "requires EVEX_B, W, and XS prefix") \ + ENUM_ENTRY(IC_EVEX_W_XD_B, 4, "requires EVEX_B, W, and XD prefix") \ + ENUM_ENTRY(IC_EVEX_W_OPSIZE_B, 4, "requires EVEX_B, W, and OpSize") \ + ENUM_ENTRY(IC_EVEX_L_B, 3, "requires EVEX_B and the L prefix") \ + ENUM_ENTRY(IC_EVEX_L_XS_B, 4, "requires EVEX_B and the L and XS prefix")\ + ENUM_ENTRY(IC_EVEX_L_XD_B, 4, "requires EVEX_B and the L and XD prefix")\ + ENUM_ENTRY(IC_EVEX_L_OPSIZE_B, 4, "requires EVEX_B, L, and OpSize") \ + ENUM_ENTRY(IC_EVEX_L_W_B, 3, "requires EVEX_B, L and W") \ + ENUM_ENTRY(IC_EVEX_L_W_XS_B, 4, "requires EVEX_B, L, W and XS prefix") \ + ENUM_ENTRY(IC_EVEX_L_W_XD_B, 4, "requires EVEX_B, L, W and XD prefix") \ + ENUM_ENTRY(IC_EVEX_L_W_OPSIZE_B, 4, "requires EVEX_B, L, W and OpSize") \ + ENUM_ENTRY(IC_EVEX_L2_B, 3, "requires EVEX_B and the L2 prefix") \ + ENUM_ENTRY(IC_EVEX_L2_XS_B, 4, "requires EVEX_B and the L2 and XS prefix")\ + ENUM_ENTRY(IC_EVEX_L2_XD_B, 4, "requires EVEX_B and the L2 and XD prefix")\ + ENUM_ENTRY(IC_EVEX_L2_OPSIZE_B, 4, "requires EVEX_B, L2, and OpSize") \ + ENUM_ENTRY(IC_EVEX_L2_W_B, 3, "requires EVEX_B, L2 and W") \ + ENUM_ENTRY(IC_EVEX_L2_W_XS_B, 4, "requires EVEX_B, L2, W and XS prefix") \ + ENUM_ENTRY(IC_EVEX_L2_W_XD_B, 4, "requires EVEX_B, L2, W and XD prefix") \ + ENUM_ENTRY(IC_EVEX_L2_W_OPSIZE_B, 4, "requires EVEX_B, L2, W and OpSize") \ + ENUM_ENTRY(IC_EVEX_K_B, 1, "requires EVEX_B and EVEX_K prefix") \ + ENUM_ENTRY(IC_EVEX_XS_K_B, 2, "requires EVEX_B, EVEX_K and the XS prefix") \ + ENUM_ENTRY(IC_EVEX_XD_K_B, 2, "requires EVEX_B, EVEX_K and the XD prefix") \ + ENUM_ENTRY(IC_EVEX_OPSIZE_K_B, 2, "requires EVEX_B, EVEX_K and the 
OpSize prefix") \ + ENUM_ENTRY(IC_EVEX_W_K_B, 3, "requires EVEX_B, EVEX_K and the W prefix") \ + ENUM_ENTRY(IC_EVEX_W_XS_K_B, 4, "requires EVEX_B, EVEX_K, W, and XS prefix") \ + ENUM_ENTRY(IC_EVEX_W_XD_K_B, 4, "requires EVEX_B, EVEX_K, W, and XD prefix") \ + ENUM_ENTRY(IC_EVEX_W_OPSIZE_K_B, 4, "requires EVEX_B, EVEX_K, W, and OpSize") \ + ENUM_ENTRY(IC_EVEX_L_K_B, 3, "requires EVEX_B, EVEX_K and the L prefix") \ + ENUM_ENTRY(IC_EVEX_L_XS_K_B, 4, "requires EVEX_B, EVEX_K and the L and XS prefix")\ + ENUM_ENTRY(IC_EVEX_L_XD_K_B, 4, "requires EVEX_B, EVEX_K and the L and XD prefix")\ + ENUM_ENTRY(IC_EVEX_L_OPSIZE_K_B, 4, "requires EVEX_B, EVEX_K, L, and OpSize") \ + ENUM_ENTRY(IC_EVEX_L_W_K_B, 3, "requires EVEX_B, EVEX_K, L and W") \ + ENUM_ENTRY(IC_EVEX_L_W_XS_K_B, 4, "requires EVEX_B, EVEX_K, L, W and XS prefix") \ + ENUM_ENTRY(IC_EVEX_L_W_XD_K_B, 4, "requires EVEX_B, EVEX_K, L, W and XD prefix") \ + ENUM_ENTRY(IC_EVEX_L_W_OPSIZE_K_B, 4, "requires EVEX_B, EVEX_K, L, W and OpSize") \ + ENUM_ENTRY(IC_EVEX_L2_K_B, 3, "requires EVEX_B, EVEX_K and the L2 prefix") \ + ENUM_ENTRY(IC_EVEX_L2_XS_K_B, 4, "requires EVEX_B, EVEX_K and the L2 and XS prefix")\ + ENUM_ENTRY(IC_EVEX_L2_XD_K_B, 4, "requires EVEX_B, EVEX_K and the L2 and XD prefix")\ + ENUM_ENTRY(IC_EVEX_L2_OPSIZE_K_B, 4, "requires EVEX_B, EVEX_K, L2, and OpSize") \ + ENUM_ENTRY(IC_EVEX_L2_W_K_B, 3, "requires EVEX_B, EVEX_K, L2 and W") \ + ENUM_ENTRY(IC_EVEX_L2_W_XS_K_B, 4, "requires EVEX_B, EVEX_K, L2, W and XS prefix") \ + ENUM_ENTRY(IC_EVEX_L2_W_XD_K_B, 4, "requires EVEX_B, EVEX_K, L2, W and XD prefix") \ + ENUM_ENTRY(IC_EVEX_L2_W_OPSIZE_K_B, 4, "requires EVEX_B, EVEX_K, L2, W and OpSize") \ + ENUM_ENTRY(IC_EVEX_KZ_B, 1, "requires EVEX_B and EVEX_KZ prefix") \ + ENUM_ENTRY(IC_EVEX_XS_KZ_B, 2, "requires EVEX_B, EVEX_KZ and the XS prefix") \ + ENUM_ENTRY(IC_EVEX_XD_KZ_B, 2, "requires EVEX_B, EVEX_KZ and the XD prefix") \ + ENUM_ENTRY(IC_EVEX_OPSIZE_KZ_B, 2, "requires EVEX_B, EVEX_KZ and the OpSize prefix") \ + ENUM_ENTRY(IC_EVEX_W_KZ_B, 3, "requires EVEX_B, EVEX_KZ and the W prefix") \ + ENUM_ENTRY(IC_EVEX_W_XS_KZ_B, 4, "requires EVEX_B, EVEX_KZ, W, and XS prefix") \ + ENUM_ENTRY(IC_EVEX_W_XD_KZ_B, 4, "requires EVEX_B, EVEX_KZ, W, and XD prefix") \ + ENUM_ENTRY(IC_EVEX_W_OPSIZE_KZ_B, 4, "requires EVEX_B, EVEX_KZ, W, and OpSize") \ + ENUM_ENTRY(IC_EVEX_L_KZ_B, 3, "requires EVEX_B, EVEX_KZ and the L prefix") \ + ENUM_ENTRY(IC_EVEX_L_XS_KZ_B, 4, "requires EVEX_B, EVEX_KZ and the L and XS prefix")\ + ENUM_ENTRY(IC_EVEX_L_XD_KZ_B, 4, "requires EVEX_B, EVEX_KZ and the L and XD prefix")\ + ENUM_ENTRY(IC_EVEX_L_OPSIZE_KZ_B, 4, "requires EVEX_B, EVEX_KZ, L, and OpSize") \ + ENUM_ENTRY(IC_EVEX_L_W_KZ_B, 3, "requires EVEX_B, EVEX_KZ, L and W") \ + ENUM_ENTRY(IC_EVEX_L_W_XS_KZ_B, 4, "requires EVEX_B, EVEX_KZ, L, W and XS prefix") \ + ENUM_ENTRY(IC_EVEX_L_W_XD_KZ_B, 4, "requires EVEX_B, EVEX_KZ, L, W and XD prefix") \ + ENUM_ENTRY(IC_EVEX_L_W_OPSIZE_KZ_B, 4, "requires EVEX_B, EVEX_KZ, L, W and OpSize") \ + ENUM_ENTRY(IC_EVEX_L2_KZ_B, 3, "requires EVEX_B, EVEX_KZ and the L2 prefix") \ + ENUM_ENTRY(IC_EVEX_L2_XS_KZ_B, 4, "requires EVEX_B, EVEX_KZ and the L2 and XS prefix")\ + ENUM_ENTRY(IC_EVEX_L2_XD_KZ_B, 4, "requires EVEX_B, EVEX_KZ and the L2 and XD prefix")\ + ENUM_ENTRY(IC_EVEX_L2_OPSIZE_KZ_B, 4, "requires EVEX_B, EVEX_KZ, L2, and OpSize") \ + ENUM_ENTRY(IC_EVEX_L2_W_KZ_B, 3, "requires EVEX_B, EVEX_KZ, L2 and W") \ + ENUM_ENTRY(IC_EVEX_L2_W_XS_KZ_B, 4, "requires EVEX_B, EVEX_KZ, L2, W and XS prefix") \ + ENUM_ENTRY(IC_EVEX_L2_W_XD_KZ_B, 4, "requires 
EVEX_B, EVEX_KZ, L2, W and XD prefix") \ + ENUM_ENTRY(IC_EVEX_L2_W_OPSIZE_KZ_B, 4, "requires EVEX_B, EVEX_KZ, L2, W and OpSize") \ + ENUM_ENTRY(IC_EVEX_KZ, 1, "requires an EVEX_KZ prefix") \ + ENUM_ENTRY(IC_EVEX_XS_KZ, 2, "requires EVEX_KZ and the XS prefix") \ + ENUM_ENTRY(IC_EVEX_XD_KZ, 2, "requires EVEX_KZ and the XD prefix") \ + ENUM_ENTRY(IC_EVEX_OPSIZE_KZ, 2, "requires EVEX_KZ and the OpSize prefix") \ + ENUM_ENTRY(IC_EVEX_W_KZ, 3, "requires EVEX_KZ and the W prefix") \ + ENUM_ENTRY(IC_EVEX_W_XS_KZ, 4, "requires EVEX_KZ, W, and XS prefix") \ + ENUM_ENTRY(IC_EVEX_W_XD_KZ, 4, "requires EVEX_KZ, W, and XD prefix") \ + ENUM_ENTRY(IC_EVEX_W_OPSIZE_KZ, 4, "requires EVEX_KZ, W, and OpSize") \ + ENUM_ENTRY(IC_EVEX_L_KZ, 3, "requires EVEX_KZ and the L prefix") \ + ENUM_ENTRY(IC_EVEX_L_XS_KZ, 4, "requires EVEX_KZ and the L and XS prefix")\ + ENUM_ENTRY(IC_EVEX_L_XD_KZ, 4, "requires EVEX_KZ and the L and XD prefix")\ + ENUM_ENTRY(IC_EVEX_L_OPSIZE_KZ, 4, "requires EVEX_KZ, L, and OpSize") \ + ENUM_ENTRY(IC_EVEX_L_W_KZ, 3, "requires EVEX_KZ, L and W") \ + ENUM_ENTRY(IC_EVEX_L_W_XS_KZ, 4, "requires EVEX_KZ, L, W and XS prefix") \ + ENUM_ENTRY(IC_EVEX_L_W_XD_KZ, 4, "requires EVEX_KZ, L, W and XD prefix") \ + ENUM_ENTRY(IC_EVEX_L_W_OPSIZE_KZ, 4, "requires EVEX_KZ, L, W and OpSize") \ + ENUM_ENTRY(IC_EVEX_L2_KZ, 3, "requires EVEX_KZ and the L2 prefix") \ + ENUM_ENTRY(IC_EVEX_L2_XS_KZ, 4, "requires EVEX_KZ and the L2 and XS prefix")\ + ENUM_ENTRY(IC_EVEX_L2_XD_KZ, 4, "requires EVEX_KZ and the L2 and XD prefix")\ + ENUM_ENTRY(IC_EVEX_L2_OPSIZE_KZ, 4, "requires EVEX_KZ, L2, and OpSize") \ + ENUM_ENTRY(IC_EVEX_L2_W_KZ, 3, "requires EVEX_KZ, L2 and W") \ + ENUM_ENTRY(IC_EVEX_L2_W_XS_KZ, 4, "requires EVEX_KZ, L2, W and XS prefix") \ + ENUM_ENTRY(IC_EVEX_L2_W_XD_KZ, 4, "requires EVEX_KZ, L2, W and XD prefix") \ + ENUM_ENTRY(IC_EVEX_L2_W_OPSIZE_KZ, 4, "requires EVEX_KZ, L2, W and OpSize") #define ENUM_ENTRY(n, r, d) n, typedef enum { @@ -136,7 +288,10 @@ typedef enum { THREEBYTE_38 = 2, THREEBYTE_3A = 3, THREEBYTE_A6 = 4, - THREEBYTE_A7 = 5 + THREEBYTE_A7 = 5, + XOP8_MAP = 6, + XOP9_MAP = 7, + XOPA_MAP = 8 } OpcodeType; /* @@ -224,6 +379,7 @@ struct ContextDecision { ENUM_ENTRY(ENCODING_REG, "Register operand in ModR/M byte.") \ ENUM_ENTRY(ENCODING_RM, "R/M operand in ModR/M byte.") \ ENUM_ENTRY(ENCODING_VVVV, "Register operand in VEX.vvvv byte.") \ + ENUM_ENTRY(ENCODING_WRITEMASK, "Register operand in EVEX.aaa byte.") \ ENUM_ENTRY(ENCODING_CB, "1-byte code offset (possible new CS value)") \ ENUM_ENTRY(ENCODING_CW, "2-byte") \ ENUM_ENTRY(ENCODING_CD, "4-byte") \ @@ -321,6 +477,9 @@ struct ContextDecision { ENUM_ENTRY(TYPE_XMM64, "8-byte") \ ENUM_ENTRY(TYPE_XMM128, "16-byte") \ ENUM_ENTRY(TYPE_XMM256, "32-byte") \ + ENUM_ENTRY(TYPE_XMM512, "64-byte") \ + ENUM_ENTRY(TYPE_VK8, "8-bit") \ + ENUM_ENTRY(TYPE_VK16, "16-bit") \ ENUM_ENTRY(TYPE_XMM0, "Implicit use of XMM0") \ ENUM_ENTRY(TYPE_SEGMENTREG, "Segment register operand") \ ENUM_ENTRY(TYPE_DEBUGREG, "Debug register operand") \ diff --git a/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp b/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp index 8195b7e..4439311 100644 --- a/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp +++ b/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp @@ -50,7 +50,7 @@ void X86ATTInstPrinter::printInst(const MCInst *MI, raw_ostream &OS, // Try to print any aliases first. if (!printAliasInstr(MI, OS)) printInstruction(MI, OS); - + // Next always print the annotation. 
printAnnotation(OS, Annot); @@ -158,10 +158,10 @@ void X86ATTInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, O << markup("<imm:") << '$' << formatImm((int64_t)Op.getImm()) << markup(">"); - + if (CommentStream && (Op.getImm() > 255 || Op.getImm() < -256)) *CommentStream << format("imm = 0x%" PRIX64 "\n", (uint64_t)Op.getImm()); - + } else { assert(Op.isExpr() && "unknown operand kind in printOperand"); O << markup("<imm:") @@ -176,7 +176,7 @@ void X86ATTInstPrinter::printMemReference(const MCInst *MI, unsigned Op, const MCOperand &IndexReg = MI->getOperand(Op+2); const MCOperand &DispSpec = MI->getOperand(Op+3); const MCOperand &SegReg = MI->getOperand(Op+4); - + O << markup("<mem:"); // If this has a segment register, print it. @@ -184,7 +184,7 @@ void X86ATTInstPrinter::printMemReference(const MCInst *MI, unsigned Op, printOperand(MI, Op+4, O); O << ':'; } - + if (DispSpec.isImm()) { int64_t DispVal = DispSpec.getImm(); if (DispVal || (!IndexReg.getReg() && !BaseReg.getReg())) @@ -193,21 +193,21 @@ void X86ATTInstPrinter::printMemReference(const MCInst *MI, unsigned Op, assert(DispSpec.isExpr() && "non-immediate displacement for LEA?"); O << *DispSpec.getExpr(); } - + if (IndexReg.getReg() || BaseReg.getReg()) { O << '('; if (BaseReg.getReg()) printOperand(MI, Op, O); - + if (IndexReg.getReg()) { O << ','; printOperand(MI, Op+2, O); unsigned ScaleVal = MI->getOperand(Op+1).getImm(); if (ScaleVal != 1) { O << ',' - << markup("<imm:") + << markup("<imm:") << ScaleVal // never printed in hex. - << markup(">"); + << markup(">"); } } O << ')'; @@ -215,3 +215,19 @@ void X86ATTInstPrinter::printMemReference(const MCInst *MI, unsigned Op, O << markup(">"); } + +void X86ATTInstPrinter::printMemOffset(const MCInst *MI, unsigned Op, + raw_ostream &O) { + const MCOperand &DispSpec = MI->getOperand(Op); + + O << markup("<mem:"); + + if (DispSpec.isImm()) { + O << formatImm(DispSpec.getImm()); + } else { + assert(DispSpec.isExpr() && "non-immediate displacement?"); + O << *DispSpec.getExpr(); + } + + O << markup(">"); +} diff --git a/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h b/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h index 8e09183..a8fab72 100644 --- a/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h +++ b/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h @@ -42,7 +42,8 @@ public: void printSSECC(const MCInst *MI, unsigned Op, raw_ostream &OS); void printAVXCC(const MCInst *MI, unsigned Op, raw_ostream &OS); void printPCRelImm(const MCInst *MI, unsigned OpNo, raw_ostream &OS); - + void printMemOffset(const MCInst *MI, unsigned OpNo, raw_ostream &OS); + void printopaquemem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { printMemReference(MI, OpNo, O); } @@ -65,6 +66,9 @@ public: void printi256mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { printMemReference(MI, OpNo, O); } + void printi512mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + printMemReference(MI, OpNo, O); + } void printf32mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { printMemReference(MI, OpNo, O); } @@ -80,6 +84,22 @@ public: void printf256mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { printMemReference(MI, OpNo, O); } + void printf512mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + printMemReference(MI, OpNo, O); + } + + void printMemOffs8(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + printMemOffset(MI, OpNo, O); + } + void printMemOffs16(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + printMemOffset(MI, OpNo, O); + 
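// A minimal sketch (hypothetical helper names) of what the new printMemOffs*
// hooks produce for an absolute-offset ("moffs") operand: the ATT printer
// above streams only the displacement, wrapped in the <mem:...> markup, while
// the Intel printer in the later hunks adds a size keyword and brackets.
#include <cstdint>
#include <sstream>
#include <string>

// ATT flavor: just the displacement, e.g. "4660".
static std::string formatMemOffsATT(int64_t Disp) {
  std::ostringstream OS;
  OS << Disp;
  return OS.str();
}

// Intel flavor: size keyword plus brackets, e.g. "dword ptr [4660]".
static std::string formatMemOffsIntel(int64_t Disp, unsigned Bits) {
  const char *Kw = Bits == 8 ? "byte" : Bits == 16 ? "word"
                 : Bits == 32 ? "dword" : "qword";
  std::ostringstream OS;
  OS << Kw << " ptr [" << Disp << ']';
  return OS.str();
}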
} + void printMemOffs32(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + printMemOffset(MI, OpNo, O); + } + void printMemOffs64(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + printMemOffset(MI, OpNo, O); + } }; } diff --git a/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp b/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp index 2228e6c..e7e7b15 100644 --- a/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp +++ b/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp @@ -136,16 +136,11 @@ void X86IntelInstPrinter::printPCRelImm(const MCInst *MI, unsigned OpNo, } } -static void PrintRegName(raw_ostream &O, StringRef RegName) { - for (unsigned i = 0, e = RegName.size(); i != e; ++i) - O << (char)toupper(RegName[i]); -} - void X86IntelInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) { const MCOperand &Op = MI->getOperand(OpNo); if (Op.isReg()) { - PrintRegName(O, getRegisterName(Op.getReg())); + printRegName(O, Op.getReg()); } else if (Op.isImm()) { O << formatImm((int64_t)Op.getImm()); } else { @@ -205,3 +200,19 @@ void X86IntelInstPrinter::printMemReference(const MCInst *MI, unsigned Op, O << ']'; } + +void X86IntelInstPrinter::printMemOffset(const MCInst *MI, unsigned Op, + raw_ostream &O) { + const MCOperand &DispSpec = MI->getOperand(Op); + + O << '['; + + if (DispSpec.isImm()) { + O << formatImm(DispSpec.getImm()); + } else { + assert(DispSpec.isExpr() && "non-immediate displacement?"); + O << *DispSpec.getExpr(); + } + + O << ']'; +} diff --git a/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h b/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h index bb769eb..590bf68 100644 --- a/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h +++ b/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h @@ -39,56 +39,82 @@ public: void printSSECC(const MCInst *MI, unsigned Op, raw_ostream &O); void printAVXCC(const MCInst *MI, unsigned Op, raw_ostream &O); void printPCRelImm(const MCInst *MI, unsigned OpNo, raw_ostream &O); - + void printMemOffset(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printopaquemem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - O << "OPAQUE PTR "; + O << "opaque ptr "; printMemReference(MI, OpNo, O); } void printi8mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - O << "BYTE PTR "; + O << "byte ptr "; printMemReference(MI, OpNo, O); } void printi16mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - O << "WORD PTR "; + O << "word ptr "; printMemReference(MI, OpNo, O); } void printi32mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - O << "DWORD PTR "; + O << "dword ptr "; printMemReference(MI, OpNo, O); } void printi64mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - O << "QWORD PTR "; + O << "qword ptr "; printMemReference(MI, OpNo, O); } void printi128mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - O << "XMMWORD PTR "; + O << "xmmword ptr "; printMemReference(MI, OpNo, O); } void printi256mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - O << "YMMWORD PTR "; + O << "ymmword ptr "; + printMemReference(MI, OpNo, O); + } + void printi512mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + O << "zmmword ptr "; printMemReference(MI, OpNo, O); } void printf32mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - O << "DWORD PTR "; + O << "dword ptr "; printMemReference(MI, OpNo, O); } void printf64mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - O << "QWORD PTR "; + O << "qword 
ptr "; printMemReference(MI, OpNo, O); } void printf80mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - O << "XWORD PTR "; + O << "xword ptr "; printMemReference(MI, OpNo, O); } void printf128mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - O << "XMMWORD PTR "; + O << "xmmword ptr "; printMemReference(MI, OpNo, O); } void printf256mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - O << "YMMWORD PTR "; + O << "ymmword ptr "; + printMemReference(MI, OpNo, O); + } + void printf512mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + O << "zmmword ptr "; printMemReference(MI, OpNo, O); } + + void printMemOffs8(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + O << "byte ptr "; + printMemOffset(MI, OpNo, O); + } + void printMemOffs16(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + O << "word ptr "; + printMemOffset(MI, OpNo, O); + } + void printMemOffs32(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + O << "dword ptr "; + printMemOffset(MI, OpNo, O); + } + void printMemOffs64(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + O << "qword ptr "; + printMemOffset(MI, OpNo, O); + } }; } diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp index c995aad..f8e359b 100644 --- a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp @@ -9,6 +9,7 @@ #include "MCTargetDesc/X86BaseInfo.h" #include "MCTargetDesc/X86FixupKinds.h" +#include "llvm/ADT/StringSwitch.h" #include "llvm/MC/MCAsmBackend.h" #include "llvm/MC/MCAssembler.h" #include "llvm/MC/MCELFObjectWriter.h" @@ -19,10 +20,10 @@ #include "llvm/MC/MCSectionCOFF.h" #include "llvm/MC/MCSectionELF.h" #include "llvm/MC/MCSectionMachO.h" -#include "llvm/Object/MachOFormat.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/ELF.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MachO.h" #include "llvm/Support/TargetRegistry.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; @@ -67,9 +68,16 @@ public: class X86AsmBackend : public MCAsmBackend { StringRef CPU; + bool HasNopl; public: X86AsmBackend(const Target &T, StringRef _CPU) - : MCAsmBackend(), CPU(_CPU) {} + : MCAsmBackend(), CPU(_CPU) { + HasNopl = CPU != "generic" && CPU != "i386" && CPU != "i486" && + CPU != "i586" && CPU != "pentium" && CPU != "pentium-mmx" && + CPU != "i686" && CPU != "k6" && CPU != "k6-2" && CPU != "k6-3" && + CPU != "geode" && CPU != "winchip-c6" && CPU != "winchip2" && + CPU != "c3" && CPU != "c3-2"; + } unsigned getNumFixupKinds() const { return X86::NumTargetFixupKinds; @@ -309,11 +317,7 @@ bool X86AsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const { // This CPU doesnt support long nops. If needed add more. // FIXME: Can we get this from the subtarget somehow? // FIXME: We could generated something better than plain 0x90. 
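// The HasNopl flag introduced above only decides whether long NOPs may be
// used in writeNopData; the byte patterns themselves are outside this hunk.
// A rough sketch, assuming the Intel-recommended 0F 1F /0 multi-byte NOP
// forms; the vector-based helper is hypothetical (the real code writes
// through MCObjectWriter::Write8).
#include <algorithm>
#include <cstdint>
#include <vector>

static void writeNopPadding(uint64_t Count, std::vector<uint8_t> &Out) {
  // Recommended NOPs of length 1..10; longer runs are built from several.
  static const uint8_t Nops[10][10] = {
    {0x90},
    {0x66, 0x90},
    {0x0f, 0x1f, 0x00},
    {0x0f, 0x1f, 0x40, 0x00},
    {0x0f, 0x1f, 0x44, 0x00, 0x00},
    {0x66, 0x0f, 0x1f, 0x44, 0x00, 0x00},
    {0x0f, 0x1f, 0x80, 0x00, 0x00, 0x00, 0x00},
    {0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
    {0x66, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
    {0x66, 0x2e, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
  };
  while (Count) {
    uint64_t Len = std::min<uint64_t>(Count, 10);
    Out.insert(Out.end(), Nops[Len - 1], Nops[Len - 1] + Len);
    Count -= Len;
  }
}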
- if (CPU == "generic" || CPU == "i386" || CPU == "i486" || CPU == "i586" || - CPU == "pentium" || CPU == "pentium-mmx" || CPU == "i686" || - CPU == "k6" || CPU == "k6-2" || CPU == "k6-3" || CPU == "geode" || - CPU == "winchip-c6" || CPU == "winchip2" || CPU == "c3" || - CPU == "c3-2") { + if (!HasNopl) { for (uint64_t i = 0; i < Count; ++i) OW->Write8(0x90); return true; @@ -338,6 +342,7 @@ bool X86AsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const { /* *** */ namespace { + class ELFX86AsmBackend : public X86AsmBackend { public: uint8_t OSABI; @@ -386,35 +391,368 @@ public: } }; +namespace CU { + + /// Compact unwind encoding values. + enum CompactUnwindEncodings { + /// [RE]BP based frame where [RE]BP is pused on the stack immediately after + /// the return address, then [RE]SP is moved to [RE]BP. + UNWIND_MODE_BP_FRAME = 0x01000000, + + /// A frameless function with a small constant stack size. + UNWIND_MODE_STACK_IMMD = 0x02000000, + + /// A frameless function with a large constant stack size. + UNWIND_MODE_STACK_IND = 0x03000000, + + /// No compact unwind encoding is available. + UNWIND_MODE_DWARF = 0x04000000, + + /// Mask for encoding the frame registers. + UNWIND_BP_FRAME_REGISTERS = 0x00007FFF, + + /// Mask for encoding the frameless registers. + UNWIND_FRAMELESS_STACK_REG_PERMUTATION = 0x000003FF + }; + +} // end CU namespace + class DarwinX86AsmBackend : public X86AsmBackend { + const MCRegisterInfo &MRI; + + /// \brief Number of registers that can be saved in a compact unwind encoding. + enum { CU_NUM_SAVED_REGS = 6 }; + + mutable unsigned SavedRegs[CU_NUM_SAVED_REGS]; + bool Is64Bit; + + unsigned OffsetSize; ///< Offset of a "push" instruction. + unsigned PushInstrSize; ///< Size of a "push" instruction. + unsigned MoveInstrSize; ///< Size of a "move" instruction. + unsigned StackDivide; ///< Amount to adjust stack stize by. +protected: + /// \brief Implementation of algorithm to generate the compact unwind encoding + /// for the CFI instructions. + uint32_t + generateCompactUnwindEncodingImpl(ArrayRef<MCCFIInstruction> Instrs) const { + if (Instrs.empty()) return 0; + + // Reset the saved registers. + unsigned SavedRegIdx = 0; + memset(SavedRegs, 0, sizeof(SavedRegs)); + + bool HasFP = false; + + // Encode that we are using EBP/RBP as the frame pointer. + uint32_t CompactUnwindEncoding = 0; + + unsigned SubtractInstrIdx = Is64Bit ? 3 : 2; + unsigned InstrOffset = 0; + unsigned StackAdjust = 0; + unsigned StackSize = 0; + unsigned PrevStackSize = 0; + unsigned NumDefCFAOffsets = 0; + + for (unsigned i = 0, e = Instrs.size(); i != e; ++i) { + const MCCFIInstruction &Inst = Instrs[i]; + + switch (Inst.getOperation()) { + default: + // Any other CFI directives indicate a frame that we aren't prepared + // to represent via compact unwind, so just bail out. + return 0; + case MCCFIInstruction::OpDefCfaRegister: { + // Defines a frame pointer. E.g. + // + // movq %rsp, %rbp + // L0: + // .cfi_def_cfa_register %rbp + // + HasFP = true; + assert(MRI.getLLVMRegNum(Inst.getRegister(), true) == + (Is64Bit ? X86::RBP : X86::EBP) && "Invalid frame pointer!"); + + // Reset the counts. + memset(SavedRegs, 0, sizeof(SavedRegs)); + StackAdjust = 0; + SavedRegIdx = 0; + InstrOffset += MoveInstrSize; + break; + } + case MCCFIInstruction::OpDefCfaOffset: { + // Defines a new offset for the CFA. E.g. 
+ // + // With frame: + // + // pushq %rbp + // L0: + // .cfi_def_cfa_offset 16 + // + // Without frame: + // + // subq $72, %rsp + // L0: + // .cfi_def_cfa_offset 80 + // + PrevStackSize = StackSize; + StackSize = std::abs(Inst.getOffset()) / StackDivide; + ++NumDefCFAOffsets; + break; + } + case MCCFIInstruction::OpOffset: { + // Defines a "push" of a callee-saved register. E.g. + // + // pushq %r15 + // pushq %r14 + // pushq %rbx + // L0: + // subq $120, %rsp + // L1: + // .cfi_offset %rbx, -40 + // .cfi_offset %r14, -32 + // .cfi_offset %r15, -24 + // + if (SavedRegIdx == CU_NUM_SAVED_REGS) + // If there are too many saved registers, we cannot use a compact + // unwind encoding. + return CU::UNWIND_MODE_DWARF; + + unsigned Reg = MRI.getLLVMRegNum(Inst.getRegister(), true); + SavedRegs[SavedRegIdx++] = Reg; + StackAdjust += OffsetSize; + InstrOffset += PushInstrSize; + break; + } + } + } + + StackAdjust /= StackDivide; + + if (HasFP) { + if ((StackAdjust & 0xFF) != StackAdjust) + // Offset was too big for a compact unwind encoding. + return CU::UNWIND_MODE_DWARF; + + // Get the encoding of the saved registers when we have a frame pointer. + uint32_t RegEnc = encodeCompactUnwindRegistersWithFrame(); + if (RegEnc == ~0U) return CU::UNWIND_MODE_DWARF; + + CompactUnwindEncoding |= CU::UNWIND_MODE_BP_FRAME; + CompactUnwindEncoding |= (StackAdjust & 0xFF) << 16; + CompactUnwindEncoding |= RegEnc & CU::UNWIND_BP_FRAME_REGISTERS; + } else { + // If the amount of the stack allocation is the size of a register, then + // we "push" the RAX/EAX register onto the stack instead of adjusting the + // stack pointer with a SUB instruction. We don't support the push of the + // RAX/EAX register with compact unwind. So we check for that situation + // here. + if ((NumDefCFAOffsets == SavedRegIdx + 1 && + StackSize - PrevStackSize == 1) || + (Instrs.size() == 1 && NumDefCFAOffsets == 1 && StackSize == 2)) + return CU::UNWIND_MODE_DWARF; + + SubtractInstrIdx += InstrOffset; + ++StackAdjust; + + if ((StackSize & 0xFF) == StackSize) { + // Frameless stack with a small stack size. + CompactUnwindEncoding |= CU::UNWIND_MODE_STACK_IMMD; + + // Encode the stack size. + CompactUnwindEncoding |= (StackSize & 0xFF) << 16; + } else { + if ((StackAdjust & 0x7) != StackAdjust) + // The extra stack adjustments are too big for us to handle. + return CU::UNWIND_MODE_DWARF; + + // Frameless stack with an offset too large for us to encode compactly. + CompactUnwindEncoding |= CU::UNWIND_MODE_STACK_IND; + + // Encode the offset to the nnnnnn value in the 'subl $nnnnnn, ESP' + // instruction. + CompactUnwindEncoding |= (SubtractInstrIdx & 0xFF) << 16; + + // Encode any extra stack stack adjustments (done via push + // instructions). + CompactUnwindEncoding |= (StackAdjust & 0x7) << 13; + } + + // Encode the number of registers saved. (Reverse the list first.) + std::reverse(&SavedRegs[0], &SavedRegs[SavedRegIdx]); + CompactUnwindEncoding |= (SavedRegIdx & 0x7) << 10; + + // Get the encoding of the saved registers when we don't have a frame + // pointer. + uint32_t RegEnc = encodeCompactUnwindRegistersWithoutFrame(SavedRegIdx); + if (RegEnc == ~0U) return CU::UNWIND_MODE_DWARF; + + // Encode the register encoding. + CompactUnwindEncoding |= + RegEnc & CU::UNWIND_FRAMELESS_STACK_REG_PERMUTATION; + } + + return CompactUnwindEncoding; + } + +private: + /// \brief Get the compact unwind number for a given register. The number + /// corresponds to the enum lists in compact_unwind_encoding.h. 
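// Rough map of the 32-bit compact unwind word as packed above -- a sketch
// inferred from the shifts and masks in this file, not an authoritative
// description of the Darwin format:
//
//   bits 24..27  mode (the UNWIND_MODE_* values above)
//   BP-frame mode:
//     bits 16..23  stack adjustment, in OffsetSize units
//     bits  0..14  up to five saved registers, 3 bits each
//                  (UNWIND_BP_FRAME_REGISTERS)
//   frameless modes:
//     bits 16..23  stack size (STACK_IMMD) or offset of the sub immediate
//                  within the function (STACK_IND)
//     bits 13..15  extra stack adjustment (STACK_IND only)
//     bits 10..12  number of saved registers
//     bits  0..9   register permutation
//                  (UNWIND_FRAMELESS_STACK_REG_PERMUTATION)
//
// For instance, packing STACK_IMMD with a stack size of 9 (72 bytes on
// x86-64, where StackDivide is 8) and no saved registers gives:
constexpr unsigned ExampleFramelessWord =
    0x02000000u /* UNWIND_MODE_STACK_IMMD */ | (9u << 16);
static_assert(ExampleFramelessWord == 0x02090000u,
              "illustrative packing only");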
+ int getCompactUnwindRegNum(unsigned Reg) const { + static const uint16_t CU32BitRegs[7] = { + X86::EBX, X86::ECX, X86::EDX, X86::EDI, X86::ESI, X86::EBP, 0 + }; + static const uint16_t CU64BitRegs[] = { + X86::RBX, X86::R12, X86::R13, X86::R14, X86::R15, X86::RBP, 0 + }; + const uint16_t *CURegs = Is64Bit ? CU64BitRegs : CU32BitRegs; + for (int Idx = 1; *CURegs; ++CURegs, ++Idx) + if (*CURegs == Reg) + return Idx; + + return -1; + } + + /// \brief Return the registers encoded for a compact encoding with a frame + /// pointer. + uint32_t encodeCompactUnwindRegistersWithFrame() const { + // Encode the registers in the order they were saved --- 3-bits per + // register. The list of saved registers is assumed to be in reverse + // order. The registers are numbered from 1 to CU_NUM_SAVED_REGS. + uint32_t RegEnc = 0; + for (int i = 0, Idx = 0; i != CU_NUM_SAVED_REGS; ++i) { + unsigned Reg = SavedRegs[i]; + if (Reg == 0) break; + + int CURegNum = getCompactUnwindRegNum(Reg); + if (CURegNum == -1) return ~0U; + + // Encode the 3-bit register number in order, skipping over 3-bits for + // each register. + RegEnc |= (CURegNum & 0x7) << (Idx++ * 3); + } + + assert((RegEnc & 0x3FFFF) == RegEnc && + "Invalid compact register encoding!"); + return RegEnc; + } + + /// \brief Create the permutation encoding used with frameless stacks. It is + /// passed the number of registers to be saved and an array of the registers + /// saved. + uint32_t encodeCompactUnwindRegistersWithoutFrame(unsigned RegCount) const { + // The saved registers are numbered from 1 to 6. In order to encode the + // order in which they were saved, we re-number them according to their + // place in the register order. The re-numbering is relative to the last + // re-numbered register. E.g., if we have registers {6, 2, 4, 5} saved in + // that order: + // + // Orig Re-Num + // ---- ------ + // 6 6 + // 2 2 + // 4 3 + // 5 3 + // + for (unsigned i = 0; i != CU_NUM_SAVED_REGS; ++i) { + int CUReg = getCompactUnwindRegNum(SavedRegs[i]); + if (CUReg == -1) return ~0U; + SavedRegs[i] = CUReg; + } + + // Reverse the list. + std::reverse(&SavedRegs[0], &SavedRegs[CU_NUM_SAVED_REGS]); + + uint32_t RenumRegs[CU_NUM_SAVED_REGS]; + for (unsigned i = CU_NUM_SAVED_REGS - RegCount; i < CU_NUM_SAVED_REGS; ++i){ + unsigned Countless = 0; + for (unsigned j = CU_NUM_SAVED_REGS - RegCount; j < i; ++j) + if (SavedRegs[j] < SavedRegs[i]) + ++Countless; + + RenumRegs[i] = SavedRegs[i] - Countless - 1; + } + + // Take the renumbered values and encode them into a 10-bit number. 
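  // Why this fits in the 10-bit field: the switch below packs the renumbered
  // registers with weights 120, 24, 6, 2, 1 -- that is 5!, 4!, 3!, 2!, 1! --
  // so the result is essentially a factorial-base (Lehmer code) index of the
  // permutation.  With at most six saved registers there are 6! = 720
  // possibilities, and the largest packed value is
  // 120*5 + 24*4 + 6*3 + 2*2 + 1 = 719 < 1024, which is what the 0x3FF
  // assertion at the end of this function checks.  The smaller weight sets
  // used for fewer registers stay under the same bound.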
+ uint32_t permutationEncoding = 0; + switch (RegCount) { + case 6: + permutationEncoding |= 120 * RenumRegs[0] + 24 * RenumRegs[1] + + 6 * RenumRegs[2] + 2 * RenumRegs[3] + + RenumRegs[4]; + break; + case 5: + permutationEncoding |= 120 * RenumRegs[1] + 24 * RenumRegs[2] + + 6 * RenumRegs[3] + 2 * RenumRegs[4] + + RenumRegs[5]; + break; + case 4: + permutationEncoding |= 60 * RenumRegs[2] + 12 * RenumRegs[3] + + 3 * RenumRegs[4] + RenumRegs[5]; + break; + case 3: + permutationEncoding |= 20 * RenumRegs[3] + 4 * RenumRegs[4] + + RenumRegs[5]; + break; + case 2: + permutationEncoding |= 5 * RenumRegs[4] + RenumRegs[5]; + break; + case 1: + permutationEncoding |= RenumRegs[5]; + break; + } + + assert((permutationEncoding & 0x3FF) == permutationEncoding && + "Invalid compact register encoding!"); + return permutationEncoding; + } + public: - DarwinX86AsmBackend(const Target &T, StringRef CPU) - : X86AsmBackend(T, CPU) { } + DarwinX86AsmBackend(const Target &T, const MCRegisterInfo &MRI, StringRef CPU, + bool Is64Bit) + : X86AsmBackend(T, CPU), MRI(MRI), Is64Bit(Is64Bit) { + memset(SavedRegs, 0, sizeof(SavedRegs)); + OffsetSize = Is64Bit ? 8 : 4; + MoveInstrSize = Is64Bit ? 3 : 2; + StackDivide = Is64Bit ? 8 : 4; + PushInstrSize = 1; + } }; class DarwinX86_32AsmBackend : public DarwinX86AsmBackend { + bool SupportsCU; public: - DarwinX86_32AsmBackend(const Target &T, StringRef CPU) - : DarwinX86AsmBackend(T, CPU) {} + DarwinX86_32AsmBackend(const Target &T, const MCRegisterInfo &MRI, + StringRef CPU, bool SupportsCU) + : DarwinX86AsmBackend(T, MRI, CPU, false), SupportsCU(SupportsCU) {} MCObjectWriter *createObjectWriter(raw_ostream &OS) const { return createX86MachObjectWriter(OS, /*Is64Bit=*/false, - object::mach::CTM_i386, - object::mach::CSX86_ALL); + MachO::CPU_TYPE_I386, + MachO::CPU_SUBTYPE_I386_ALL); + } + + /// \brief Generate the compact unwind encoding for the CFI instructions. + virtual uint32_t + generateCompactUnwindEncoding(ArrayRef<MCCFIInstruction> Instrs) const { + return SupportsCU ? generateCompactUnwindEncodingImpl(Instrs) : 0; } }; class DarwinX86_64AsmBackend : public DarwinX86AsmBackend { + bool SupportsCU; + const MachO::CPUSubTypeX86 Subtype; public: - DarwinX86_64AsmBackend(const Target &T, StringRef CPU) - : DarwinX86AsmBackend(T, CPU) { + DarwinX86_64AsmBackend(const Target &T, const MCRegisterInfo &MRI, + StringRef CPU, bool SupportsCU, + MachO::CPUSubTypeX86 st) + : DarwinX86AsmBackend(T, MRI, CPU, true), SupportsCU(SupportsCU), + Subtype(st) { HasReliableSymbolDifference = true; } MCObjectWriter *createObjectWriter(raw_ostream &OS) const { return createX86MachObjectWriter(OS, /*Is64Bit=*/true, - object::mach::CTM_x86_64, - object::mach::CSX86_ALL); + MachO::CPU_TYPE_X86_64, Subtype); } virtual bool doesSectionRequireSymbols(const MCSection &Section) const { @@ -449,15 +787,26 @@ public: return false; } } + + /// \brief Generate the compact unwind encoding for the CFI instructions. + virtual uint32_t + generateCompactUnwindEncoding(ArrayRef<MCCFIInstruction> Instrs) const { + return SupportsCU ? 
generateCompactUnwindEncodingImpl(Instrs) : 0; + } }; } // end anonymous namespace -MCAsmBackend *llvm::createX86_32AsmBackend(const Target &T, StringRef TT, StringRef CPU) { +MCAsmBackend *llvm::createX86_32AsmBackend(const Target &T, + const MCRegisterInfo &MRI, + StringRef TT, + StringRef CPU) { Triple TheTriple(TT); if (TheTriple.isOSDarwin() || TheTriple.getEnvironment() == Triple::MachO) - return new DarwinX86_32AsmBackend(T, CPU); + return new DarwinX86_32AsmBackend(T, MRI, CPU, + TheTriple.isMacOSX() && + !TheTriple.isMacOSXVersionLT(10, 7)); if (TheTriple.isOSWindows() && TheTriple.getEnvironment() != Triple::ELF) return new WindowsX86AsmBackend(T, false, CPU); @@ -466,11 +815,21 @@ MCAsmBackend *llvm::createX86_32AsmBackend(const Target &T, StringRef TT, String return new ELFX86_32AsmBackend(T, OSABI, CPU); } -MCAsmBackend *llvm::createX86_64AsmBackend(const Target &T, StringRef TT, StringRef CPU) { +MCAsmBackend *llvm::createX86_64AsmBackend(const Target &T, + const MCRegisterInfo &MRI, + StringRef TT, + StringRef CPU) { Triple TheTriple(TT); - if (TheTriple.isOSDarwin() || TheTriple.getEnvironment() == Triple::MachO) - return new DarwinX86_64AsmBackend(T, CPU); + if (TheTriple.isOSDarwin() || TheTriple.getEnvironment() == Triple::MachO) { + MachO::CPUSubTypeX86 CS = + StringSwitch<MachO::CPUSubTypeX86>(TheTriple.getArchName()) + .Case("x86_64h", MachO::CPU_SUBTYPE_X86_64_H) + .Default(MachO::CPU_SUBTYPE_X86_64_ALL); + return new DarwinX86_64AsmBackend(T, MRI, CPU, + TheTriple.isMacOSX() && + !TheTriple.isMacOSXVersionLT(10, 7), CS); + } if (TheTriple.isOSWindows() && TheTriple.getEnvironment() != Triple::ELF) return new WindowsX86AsmBackend(T, true, CPU); diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h index d8f7278..1ef9814 100644 --- a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h @@ -354,6 +354,9 @@ namespace X86II { // XOP9 - Prefix to exclude use of imm byte. XOP9 = 21 << Op0Shift, + // XOPA - Prefix to encode 0xA in VEX.MMMM of XOP instructions. + XOPA = 22 << Op0Shift, + //===------------------------------------------------------------------===// // REX_W - REX prefixes are instruction prefixes used in 64-bit mode. // They are used to specify GPRs and SSE registers, 64-bit operand size, @@ -462,20 +465,54 @@ namespace X86II { // prefix. Usually used for scalar instructions. Needed by disassembler. VEX_LIG = 1U << 6, + // TODO: we should combine VEX_L and VEX_LIG together to form a 2-bit field + // with following encoding: + // - 00 V128 + // - 01 V256 + // - 10 V512 + // - 11 LIG (but, in insn encoding, leave VEX.L and EVEX.L in zeros. + // this will save 1 tsflag bit + + // VEX_EVEX - Specifies that this instruction use EVEX form which provides + // syntax support up to 32 512-bit register operands and up to 7 16-bit + // mask operands as well as source operand data swizzling/memory operand + // conversion, eviction hint, and rounding mode. + EVEX = 1U << 7, + + // EVEX_K - Set if this instruction requires masking + EVEX_K = 1U << 8, + + // EVEX_Z - Set if this instruction has EVEX.Z field set. + EVEX_Z = 1U << 9, + + // EVEX_L2 - Set if this instruction has EVEX.L' field set. + EVEX_L2 = 1U << 10, + + // EVEX_B - Set if this instruction has EVEX.B field set. 
+ EVEX_B = 1U << 11, + + // EVEX_CD8E - compressed disp8 form, element-size + EVEX_CD8EShift = VEXShift + 12, + EVEX_CD8EMask = 3, + + // EVEX_CD8V - compressed disp8 form, vector-width + EVEX_CD8VShift = EVEX_CD8EShift + 2, + EVEX_CD8VMask = 7, + /// Has3DNow0F0FOpcode - This flag indicates that the instruction uses the /// wacky 0x0F 0x0F prefix for 3DNow! instructions. The manual documents /// this as having a 0x0F prefix with a 0x0F opcode, and each instruction /// storing a classifier in the imm8 field. To simplify our implementation, /// we handle this by storeing the classifier in the opcode field and using /// this flag to indicate that the encoder should do the wacky 3DNow! thing. - Has3DNow0F0FOpcode = 1U << 7, + Has3DNow0F0FOpcode = 1U << 17, /// MemOp4 - Used to indicate swapping of operand 3 and 4 to be encoded in /// ModRM or I8IMM. This is used for FMA4 and XOP instructions. - MemOp4 = 1U << 8, + MemOp4 = 1U << 18, /// XOP - Opcode prefix used by XOP instructions. - XOP = 1U << 9 + XOP = 1U << 19 }; @@ -533,12 +570,19 @@ namespace X86II { unsigned CurOp = 0; if (NumOps > 1 && Desc.getOperandConstraint(1, MCOI::TIED_TO) == 0) ++CurOp; - else if (NumOps > 3 && Desc.getOperandConstraint(2, MCOI::TIED_TO) == 0) { - assert(Desc.getOperandConstraint(NumOps - 1, MCOI::TIED_TO) == 1); + else if (NumOps > 3 && Desc.getOperandConstraint(2, MCOI::TIED_TO) == 0 && + Desc.getOperandConstraint(3, MCOI::TIED_TO) == 1) + // Special case for AVX-512 GATHER with 2 TIED_TO operands + // Skip the first 2 operands: dst, mask_wb + CurOp += 2; + else if (NumOps > 3 && Desc.getOperandConstraint(2, MCOI::TIED_TO) == 0 && + Desc.getOperandConstraint(NumOps - 1, MCOI::TIED_TO) == 1) // Special case for GATHER with 2 TIED_TO operands // Skip the first 2 operands: dst, mask_wb CurOp += 2; - } + else if (NumOps > 2 && Desc.getOperandConstraint(NumOps - 2, MCOI::TIED_TO) == 0) + // SCATTER + ++CurOp; return CurOp; } @@ -569,12 +613,15 @@ namespace X86II { case X86II::MRMSrcMem: { bool HasVEX_4V = (TSFlags >> X86II::VEXShift) & X86II::VEX_4V; bool HasMemOp4 = (TSFlags >> X86II::VEXShift) & X86II::MemOp4; + bool HasEVEX = (TSFlags >> X86II::VEXShift) & X86II::EVEX; + bool HasEVEX_K = HasEVEX && ((TSFlags >> X86II::VEXShift) & X86II::EVEX_K); unsigned FirstMemOp = 1; if (HasVEX_4V) ++FirstMemOp;// Skip the register source (which is encoded in VEX_VVVV). if (HasMemOp4) ++FirstMemOp;// Skip the register source (which is encoded in I8IMM). - + if (HasEVEX_K) + ++FirstMemOp;// Skip the mask register // FIXME: Maybe lea should have its own form? This is a horrible hack. //if (Opcode == X86::LEA64r || Opcode == X86::LEA64_32r || // Opcode == X86::LEA16r || Opcode == X86::LEA32r) @@ -611,6 +658,14 @@ namespace X86II { /// isX86_64ExtendedReg - Is the MachineOperand a x86-64 extended (r8 or /// higher) register? e.g. r8, xmm8, xmm13, etc. 
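// How the extra register bits are consumed downstream (a sketch; the helper
// name is made up): with EVEX there are 32 SIMD registers, so a register's
// 5-bit encoding is split across three places.  The low three bits go into
// ModRM/SIB as usual, bit 3 is the classic REX/VEX extension bit (R, X or B),
// and bit 4 is the new EVEX bit (R' for ModRM.reg, V' for vvvv, or X reused
// when the ModRM.rm operand is a register).  isX86_64ExtendedReg() answers
// "is bit 3 set?" and is32ExtendedReg() answers "is bit 4 set?".
static inline void splitEVEXRegEncoding(unsigned Enc5, unsigned &ModRMBits,
                                        unsigned &ExtBit, unsigned &ExtBit2) {
  ModRMBits = Enc5 & 0x7;        // ModRM.reg / ModRM.rm / SIB field
  ExtBit    = (Enc5 >> 3) & 0x1; // VEX/EVEX R, X or B (stored inverted)
  ExtBit2   = (Enc5 >> 4) & 0x1; // EVEX R' / V' (also stored inverted)
}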
inline bool isX86_64ExtendedReg(unsigned RegNo) { + if ((RegNo > X86::XMM7 && RegNo <= X86::XMM15) || + (RegNo > X86::XMM23 && RegNo <= X86::XMM31) || + (RegNo > X86::YMM7 && RegNo <= X86::YMM15) || + (RegNo > X86::YMM23 && RegNo <= X86::YMM31) || + (RegNo > X86::ZMM7 && RegNo <= X86::ZMM15) || + (RegNo > X86::ZMM23 && RegNo <= X86::ZMM31)) + return true; + switch (RegNo) { default: break; case X86::R8: case X86::R9: case X86::R10: case X86::R11: @@ -621,16 +676,21 @@ namespace X86II { case X86::R12W: case X86::R13W: case X86::R14W: case X86::R15W: case X86::R8B: case X86::R9B: case X86::R10B: case X86::R11B: case X86::R12B: case X86::R13B: case X86::R14B: case X86::R15B: - case X86::XMM8: case X86::XMM9: case X86::XMM10: case X86::XMM11: - case X86::XMM12: case X86::XMM13: case X86::XMM14: case X86::XMM15: - case X86::YMM8: case X86::YMM9: case X86::YMM10: case X86::YMM11: - case X86::YMM12: case X86::YMM13: case X86::YMM14: case X86::YMM15: case X86::CR8: case X86::CR9: case X86::CR10: case X86::CR11: case X86::CR12: case X86::CR13: case X86::CR14: case X86::CR15: return true; } return false; } + + /// is32ExtendedReg - Is the MemoryOperand a 32 extended (zmm16 or higher) + /// registers? e.g. zmm21, etc. + static inline bool is32ExtendedReg(unsigned RegNo) { + return ((RegNo > X86::XMM15 && RegNo <= X86::XMM31) || + (RegNo > X86::YMM15 && RegNo <= X86::YMM31) || + (RegNo > X86::ZMM15 && RegNo <= X86::ZMM31)); + } + inline bool isX86_64NonExtLowByteReg(unsigned reg) { return (reg == X86::SPL || reg == X86::BPL || diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp index de80dd8..3ddd865 100644 --- a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp @@ -101,7 +101,27 @@ unsigned X86ELFObjectWriter::GetRelocType(const MCValue &Target, } else { switch ((unsigned)Fixup.getKind()) { default: llvm_unreachable("invalid fixup kind!"); - case FK_Data_8: Type = ELF::R_X86_64_64; break; + case FK_Data_8: + switch (Modifier) { + default: + llvm_unreachable("Unimplemented"); + case MCSymbolRefExpr::VK_None: + Type = ELF::R_X86_64_64; + break; + case MCSymbolRefExpr::VK_GOT: + Type = ELF::R_X86_64_GOT64; + break; + case MCSymbolRefExpr::VK_GOTOFF: + Type = ELF::R_X86_64_GOTOFF64; + break; + case MCSymbolRefExpr::VK_TPOFF: + Type = ELF::R_X86_64_TPOFF64; + break; + case MCSymbolRefExpr::VK_DTPOFF: + Type = ELF::R_X86_64_DTPOFF64; + break; + } + break; case X86::reloc_signed_4byte: switch (Modifier) { default: diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86ELFRelocationInfo.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86ELFRelocationInfo.cpp new file mode 100644 index 0000000..a3eb4fb --- /dev/null +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86ELFRelocationInfo.cpp @@ -0,0 +1,135 @@ +//===-- X86ELFRelocationInfo.cpp ----------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#include "MCTargetDesc/X86MCTargetDesc.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/MC/MCRelocationInfo.h" +#include "llvm/Object/ELFObjectFile.h" +#include "llvm/Support/ELF.h" + +using namespace llvm; +using namespace object; +using namespace ELF; + +namespace { +class X86_64ELFRelocationInfo : public MCRelocationInfo { +public: + X86_64ELFRelocationInfo(MCContext &Ctx) : MCRelocationInfo(Ctx) {} + + const MCExpr *createExprForRelocation(RelocationRef Rel) { + uint64_t RelType; Rel.getType(RelType); + symbol_iterator SymI = Rel.getSymbol(); + + StringRef SymName; SymI->getName(SymName); + uint64_t SymAddr; SymI->getAddress(SymAddr); + uint64_t SymSize; SymI->getSize(SymSize); + int64_t Addend; getELFRelocationAddend(Rel, Addend); + + MCSymbol *Sym = Ctx.GetOrCreateSymbol(SymName); + // FIXME: check that the value is actually the same. + if (Sym->isVariable() == false) + Sym->setVariableValue(MCConstantExpr::Create(SymAddr, Ctx)); + + const MCExpr *Expr = 0; + // If hasAddend is true, then we need to add Addend (r_addend) to Expr. + bool hasAddend = false; + + // The AMD64 SysV ABI says: + // A: the addend used to compute the value of the relocatable field. + // B: the base address at which a shared object has been loaded into memory + // during execution. Generally, a shared object is built with a 0 base + // virtual address, but the execution address will be different. + // G: the offset into the global offset table at which the relocation + // entry's symbol will reside during execution. + // GOT: the address of the global offset table. + // L: the place (section offset or address) of the Procedure Linkage Table + // entry for a symbol. + // P: the place (section offset or address) of the storage unit being + // relocated (computed using r_offset). + // S: the value of the symbol whose index resides in the relocation entry. + // Z: the size of the symbol whose index resides in the relocation entry. + + switch(RelType) { + case R_X86_64_NONE: + case R_X86_64_COPY: + // none + break; + case R_X86_64_64: + case R_X86_64_16: + case R_X86_64_8: + // S + A + case R_X86_64_32: + case R_X86_64_32S: + // S + A (We don't care about the result not fitting in 32 bits.) 
+ case R_X86_64_PC32: + case R_X86_64_PC16: + case R_X86_64_PC8: + case R_X86_64_PC64: + // S + A - P (P/pcrel is implicit) + hasAddend = true; + Expr = MCSymbolRefExpr::Create(Sym, Ctx); + break; + case R_X86_64_GOT32: + case R_X86_64_GOT64: + case R_X86_64_GOTPC32: + case R_X86_64_GOTPC64: + case R_X86_64_GOTPLT64: + // G + A + hasAddend = true; + Expr = MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_GOT, Ctx); + break; + case R_X86_64_PLT32: + // L + A - P -> S@PLT + A + hasAddend = true; + Expr = MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_PLT, Ctx); + break; + case R_X86_64_GLOB_DAT: + case R_X86_64_JUMP_SLOT: + // S + Expr = MCSymbolRefExpr::Create(Sym, Ctx); + break; + case R_X86_64_GOTPCREL: + case R_X86_64_GOTPCREL64: + // G + GOT + A - P -> S@GOTPCREL + A + hasAddend = true; + Expr = MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_GOTPCREL, Ctx); + break; + case R_X86_64_GOTOFF64: + // S + A - GOT + Expr = MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_GOTOFF, Ctx); + break; + case R_X86_64_PLTOFF64: + // L + A - GOT + break; + case R_X86_64_SIZE32: + case R_X86_64_SIZE64: + // Z + A + Expr = MCConstantExpr::Create(SymSize, Ctx); + break; + default: + Expr = MCSymbolRefExpr::Create(Sym, Ctx); + break; + } + if (Expr && hasAddend && Addend != 0) + Expr = MCBinaryExpr::CreateAdd(Expr, + MCConstantExpr::Create(Addend, Ctx), + Ctx); + return Expr; + } +}; +} // End unnamed namespace + +/// createX86ELFRelocationInfo - Construct an X86 Mach-O RelocationInfo. +MCRelocationInfo *llvm::createX86_64ELFRelocationInfo(MCContext &Ctx) { + // We only handle x86-64 for now. + return new X86_64ELFRelocationInfo(Ctx); +} diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp index 7815ae9..3861e1c 100644 --- a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp @@ -59,10 +59,8 @@ X86MCAsmInfoDarwin::X86MCAsmInfoDarwin(const Triple &T) { // for .S files on other systems. Perhaps this is because the file system // wasn't always case preserving or something. 
CommentString = "##"; - PCSymbol = "."; SupportsDebugInformation = true; - DwarfUsesInlineInfoSection = true; UseDataRegionDirectives = MarkedJTDataRegions; // Exceptions handling @@ -92,8 +90,6 @@ X86ELFMCAsmInfo::X86ELFMCAsmInfo(const Triple &T) { TextAlignFillValue = 0x90; PrivateGlobalPrefix = ".L"; - WeakRefDirective = "\t.weak\t"; - PCSymbol = "."; // Set up DWARF directives HasLEB128 = true; // Target asm supports leb128 directives (little-endian) @@ -139,6 +135,8 @@ X86MCAsmInfoMicrosoft::X86MCAsmInfoMicrosoft(const Triple &Triple) { AssemblerDialect = AsmWriterFlavor; TextAlignFillValue = 0x90; + + AllowAtInName = true; } void X86MCAsmInfoGNUCOFF::anchor() { } diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h index b6b70fd..80979dd 100644 --- a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h @@ -17,6 +17,7 @@ #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCAsmInfoCOFF.h" #include "llvm/MC/MCAsmInfoDarwin.h" +#include "llvm/MC/MCAsmInfoELF.h" namespace llvm { class Triple; @@ -35,7 +36,7 @@ namespace llvm { MCStreamer &Streamer) const; }; - class X86ELFMCAsmInfo : public MCAsmInfo { + class X86ELFMCAsmInfo : public MCAsmInfoELF { virtual void anchor(); public: explicit X86ELFMCAsmInfo(const Triple &Triple); diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp index 016af71..7952607 100644 --- a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp @@ -53,7 +53,7 @@ public: } unsigned GetX86RegNum(const MCOperand &MO) const { - return Ctx.getRegisterInfo().getEncodingValue(MO.getReg()) & 0x7; + return Ctx.getRegisterInfo()->getEncodingValue(MO.getReg()) & 0x7; } // On regular x86, both XMM0-XMM7 and XMM8-XMM15 are encoded in the range @@ -77,6 +77,14 @@ public: return (~SrcRegNum) & 0xf; } + unsigned char getWriteMaskRegisterEncoding(const MCInst &MI, + unsigned OpNum) const { + assert(X86::K0 != MI.getOperand(OpNum).getReg() && + "Invalid mask register as write-mask!"); + unsigned MaskRegNum = GetX86RegNum(MI.getOperand(OpNum)); + return MaskRegNum; + } + void EmitByte(unsigned char C, unsigned &CurByte, raw_ostream &OS) const { OS << (char)C; ++CurByte; @@ -152,6 +160,52 @@ static bool isDisp8(int Value) { return Value == (signed char)Value; } +/// isCDisp8 - Return true if this signed displacement fits in a 8-bit +/// compressed dispacement field. +static bool isCDisp8(uint64_t TSFlags, int Value, int& CValue) { + assert(((TSFlags >> X86II::VEXShift) & X86II::EVEX) && + "Compressed 8-bit displacement is only valid for EVEX inst."); + + unsigned CD8E = (TSFlags >> X86II::EVEX_CD8EShift) & X86II::EVEX_CD8EMask; + unsigned CD8V = (TSFlags >> X86II::EVEX_CD8VShift) & X86II::EVEX_CD8VMask; + + if (CD8V == 0 && CD8E == 0) { + CValue = Value; + return isDisp8(Value); + } + + unsigned MemObjSize = 1U << CD8E; + if (CD8V & 4) { + // Fixed vector length + MemObjSize *= 1U << (CD8V & 0x3); + } else { + // Modified vector length + bool EVEX_b = (TSFlags >> X86II::VEXShift) & X86II::EVEX_B; + if (!EVEX_b) { + unsigned EVEX_LL = ((TSFlags >> X86II::VEXShift) & X86II::VEX_L) ? 1 : 0; + EVEX_LL += ((TSFlags >> X86II::VEXShift) & X86II::EVEX_L2) ? 
2 : 0; + assert(EVEX_LL < 3 && ""); + + unsigned NumElems = (1U << (EVEX_LL + 4)) / MemObjSize; + NumElems /= 1U << (CD8V & 0x3); + + MemObjSize *= NumElems; + } + } + + unsigned MemObjMask = MemObjSize - 1; + assert((MemObjSize & MemObjMask) == 0 && "Invalid memory object size."); + + if (Value & MemObjMask) // Unaligned offset + return false; + Value /= MemObjSize; + bool Ret = (Value == (signed char)Value); + + if (Ret) + CValue = Value; + return Ret; +} + /// getImmFixupKind - Return the appropriate fixup kind to use for an immediate /// in an instruction with the specified TSFlags. static MCFixupKind getImmFixupKind(uint64_t TSFlags) { @@ -318,6 +372,7 @@ void X86MCCodeEmitter::EmitMemModRMByte(const MCInst &MI, unsigned Op, const MCOperand &Scale = MI.getOperand(Op+X86::AddrScaleAmt); const MCOperand &IndexReg = MI.getOperand(Op+X86::AddrIndexReg); unsigned BaseReg = Base.getReg(); + bool HasEVEX = (TSFlags >> X86II::VEXShift) & X86II::EVEX; // Handle %rip relative addressing. if (BaseReg == X86::RIP) { // [disp32+RIP] in X86-64 mode @@ -378,10 +433,21 @@ void X86MCCodeEmitter::EmitMemModRMByte(const MCInst &MI, unsigned Op, } // Otherwise, if the displacement fits in a byte, encode as [REG+disp8]. - if (Disp.isImm() && isDisp8(Disp.getImm())) { - EmitByte(ModRMByte(1, RegOpcodeField, BaseRegNo), CurByte, OS); - EmitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, CurByte, OS, Fixups); - return; + if (Disp.isImm()) { + if (!HasEVEX && isDisp8(Disp.getImm())) { + EmitByte(ModRMByte(1, RegOpcodeField, BaseRegNo), CurByte, OS); + EmitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, CurByte, OS, Fixups); + return; + } + // Try EVEX compressed 8-bit displacement first; if failed, fall back to + // 32-bit displacement. + int CDisp8 = 0; + if (HasEVEX && isCDisp8(TSFlags, Disp.getImm(), CDisp8)) { + EmitByte(ModRMByte(1, RegOpcodeField, BaseRegNo), CurByte, OS); + EmitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, CurByte, OS, Fixups, + CDisp8 - Disp.getImm()); + return; + } } // Otherwise, emit the most general non-SIB encoding: [REG+disp32] @@ -397,6 +463,8 @@ void X86MCCodeEmitter::EmitMemModRMByte(const MCInst &MI, unsigned Op, bool ForceDisp32 = false; bool ForceDisp8 = false; + int CDisp8 = 0; + int ImmOffset = 0; if (BaseReg == 0) { // If there is no base register, we emit the special case SIB byte with // MOD=0, BASE=5, to JUST get the index, scale, and displacement. @@ -412,10 +480,15 @@ void X86MCCodeEmitter::EmitMemModRMByte(const MCInst &MI, unsigned Op, BaseRegNo != N86::EBP) { // Emit no displacement ModR/M byte EmitByte(ModRMByte(0, RegOpcodeField, 4), CurByte, OS); - } else if (isDisp8(Disp.getImm())) { + } else if (!HasEVEX && isDisp8(Disp.getImm())) { // Emit the disp8 encoding. EmitByte(ModRMByte(1, RegOpcodeField, 4), CurByte, OS); ForceDisp8 = true; // Make sure to force 8 bit disp if Base=EBP + } else if (HasEVEX && isCDisp8(TSFlags, Disp.getImm(), CDisp8)) { + // Emit the disp8 encoding. + EmitByte(ModRMByte(1, RegOpcodeField, 4), CurByte, OS); + ForceDisp8 = true; // Make sure to force 8 bit disp if Base=EBP + ImmOffset = CDisp8 - Disp.getImm(); } else { // Emit the normal disp32 encoding. EmitByte(ModRMByte(2, RegOpcodeField, 4), CurByte, OS); @@ -445,7 +518,7 @@ void X86MCCodeEmitter::EmitMemModRMByte(const MCInst &MI, unsigned Op, // Do we need to output a displacement? 
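// Worked example of the compressed-disp8 rule implemented by isCDisp8 above
// (a sketch; MemObjSize stands for the value derived from CD8E/CD8V): the
// displacement must be a multiple of the memory object size and the quotient
// must fit in a signed byte.  That quotient is what gets written, and the
// call sites account for the difference by passing CDisp8 - Disp as an
// immediate offset.
#include <cstdint>

static bool compressDisp8(int64_t Disp, unsigned MemObjSize, int8_t &CDisp8) {
  if (Disp % MemObjSize != 0)        // unaligned: must use disp32
    return false;
  int64_t Scaled = Disp / MemObjSize;
  if (Scaled != (int8_t)Scaled)      // out of range: must use disp32
    return false;
  CDisp8 = (int8_t)Scaled;
  return true;
}

// E.g. with a 64-byte memory object:  disp 256   -> CDisp8 4
//                                     disp 260   -> disp32 (not a multiple)
//                                     disp 16384 -> disp32 (256 > 127)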
if (ForceDisp8) - EmitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, CurByte, OS, Fixups); + EmitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, CurByte, OS, Fixups, ImmOffset); else if (ForceDisp32 || Disp.getImm() != 0) EmitImmediate(Disp, MI.getLoc(), 4, MCFixupKind(X86::reloc_signed_4byte), CurByte, OS, Fixups); @@ -457,6 +530,8 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, int MemOperand, const MCInst &MI, const MCInstrDesc &Desc, raw_ostream &OS) const { + bool HasEVEX = (TSFlags >> X86II::VEXShift) & X86II::EVEX; + bool HasEVEX_K = HasEVEX && ((TSFlags >> X86II::VEXShift) & X86II::EVEX_K); bool HasVEX_4V = (TSFlags >> X86II::VEXShift) & X86II::VEX_4V; bool HasVEX_4VOp3 = (TSFlags >> X86II::VEXShift) & X86II::VEX_4VOp3; bool HasMemOp4 = (TSFlags >> X86II::VEXShift) & X86II::MemOp4; @@ -468,6 +543,7 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, // 0: Same as REX_R=1 (64 bit mode only) // unsigned char VEX_R = 0x1; + unsigned char EVEX_R2 = 0x1; // VEX_X: equivalent to REX.X, only used when a // register is used for index in SIB Byte. @@ -488,7 +564,7 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, unsigned char VEX_W = 0; // XOP: Use XOP prefix byte 0x8f instead of VEX. - unsigned char XOP = 0; + bool XOP = false; // VEX_5M (VEX m-mmmmm field): // @@ -498,12 +574,14 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, // 0b00011: implied 0F 3A leading opcode bytes // 0b00100-0b11111: Reserved for future use // 0b01000: XOP map select - 08h instructions with imm byte - // 0b10001: XOP map select - 09h instructions with no imm byte + // 0b01001: XOP map select - 09h instructions with no imm byte + // 0b01010: XOP map select - 0Ah instructions with imm dword unsigned char VEX_5M = 0x1; // VEX_4V (VEX vvvv field): a register specifier // (in 1's complement form) or 1111 if unused. unsigned char VEX_4V = 0xf; + unsigned char EVEX_V2 = 0x1; // VEX_L (Vector Length): // @@ -511,6 +589,7 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, // 1: 256-bit vector // unsigned char VEX_L = 0; + unsigned char EVEX_L2 = 0; // VEX_PP: opcode extension providing equivalent // functionality of a SIMD prefix @@ -522,6 +601,18 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, // unsigned char VEX_PP = 0; + // EVEX_U + unsigned char EVEX_U = 1; // Always '1' so far + + // EVEX_z + unsigned char EVEX_z = 0; + + // EVEX_b + unsigned char EVEX_b = 0; + + // EVEX_aaa + unsigned char EVEX_aaa = 0; + // Encode the operand size opcode prefix as needed. 
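  // Summary of how legacy prefixes and opcode maps fold into the VEX/EVEX
  // fields set below (a sketch of the intent; only the cases handled by this
  // switch actually apply):
  //
  //   pp      00 = none, 01 = 66 (OpSize), 10 = F3 (XS), 11 = F2 (XD)
  //   m-mmmm  00001 = 0F, 00010 = 0F 38, 00011 = 0F 3A,
  //           01000/01001/01010 = XOP map 8/9/A (selected via the 8F escape)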
if (TSFlags & X86II::OpSize) VEX_PP = 0x01; @@ -530,10 +621,18 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, VEX_W = 1; if ((TSFlags >> X86II::VEXShift) & X86II::XOP) - XOP = 1; + XOP = true; if ((TSFlags >> X86II::VEXShift) & X86II::VEX_L) VEX_L = 1; + if (HasEVEX && ((TSFlags >> X86II::VEXShift) & X86II::EVEX_L2)) + EVEX_L2 = 1; + + if (HasEVEX_K && ((TSFlags >> X86II::VEXShift) & X86II::EVEX_Z)) + EVEX_z = 1; + + if (HasEVEX && ((TSFlags >> X86II::VEXShift) & X86II::EVEX_B)) + EVEX_b = 1; switch (TSFlags & X86II::Op0Mask) { default: llvm_unreachable("Invalid prefix!"); @@ -567,11 +666,11 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, case X86II::XOP9: VEX_5M = 0x9; break; - case X86II::A6: // Bypass: Not used by VEX - case X86II::A7: // Bypass: Not used by VEX - case X86II::TB: // Bypass: Not used by VEX - case 0: - break; // No prefix! + case X86II::XOPA: + VEX_5M = 0xA; + break; + case X86II::TB: // VEX_5M/VEX_PP already correct + break; } @@ -580,12 +679,19 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, unsigned CurOp = 0; if (NumOps > 1 && Desc.getOperandConstraint(1, MCOI::TIED_TO) == 0) ++CurOp; - else if (NumOps > 3 && Desc.getOperandConstraint(2, MCOI::TIED_TO) == 0) { - assert(Desc.getOperandConstraint(NumOps - 1, MCOI::TIED_TO) == 1); + else if (NumOps > 3 && Desc.getOperandConstraint(2, MCOI::TIED_TO) == 0 && + Desc.getOperandConstraint(3, MCOI::TIED_TO) == 1) + // Special case for AVX-512 GATHER with 2 TIED_TO operands + // Skip the first 2 operands: dst, mask_wb + CurOp += 2; + else if (NumOps > 3 && Desc.getOperandConstraint(2, MCOI::TIED_TO) == 0 && + Desc.getOperandConstraint(NumOps - 1, MCOI::TIED_TO) == 1) // Special case for GATHER with 2 TIED_TO operands // Skip the first 2 operands: dst, mask_wb CurOp += 2; - } + else if (NumOps > 2 && Desc.getOperandConstraint(NumOps - 2, MCOI::TIED_TO) == 0) + // SCATTER + ++CurOp; switch (TSFlags & X86II::FormMask) { case X86II::MRMInitReg: llvm_unreachable("FIXME: Remove this!"); @@ -595,18 +701,35 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, // MemAddr, src1(VEX_4V), src2(ModR/M) // MemAddr, src1(ModR/M), imm8 // - if (X86II::isX86_64ExtendedReg(MI.getOperand(X86::AddrBaseReg).getReg())) + if (X86II::isX86_64ExtendedReg(MI.getOperand(MemOperand + + X86::AddrBaseReg).getReg())) VEX_B = 0x0; - if (X86II::isX86_64ExtendedReg(MI.getOperand(X86::AddrIndexReg).getReg())) + if (X86II::isX86_64ExtendedReg(MI.getOperand(MemOperand + + X86::AddrIndexReg).getReg())) VEX_X = 0x0; + if (HasEVEX && X86II::is32ExtendedReg(MI.getOperand(MemOperand + + X86::AddrIndexReg).getReg())) + EVEX_V2 = 0x0; + + CurOp += X86::AddrNumOperands; - CurOp = X86::AddrNumOperands; - if (HasVEX_4V) - VEX_4V = getVEXRegisterEncoding(MI, CurOp++); + if (HasEVEX_K) + EVEX_aaa = getWriteMaskRegisterEncoding(MI, CurOp++); + + if (HasVEX_4V) { + VEX_4V = getVEXRegisterEncoding(MI, CurOp); + if (HasEVEX && X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg())) + EVEX_V2 = 0x0; + CurOp++; + } const MCOperand &MO = MI.getOperand(CurOp); - if (MO.isReg() && X86II::isX86_64ExtendedReg(MO.getReg())) - VEX_R = 0x0; + if (MO.isReg()) { + if (X86II::isX86_64ExtendedReg(MO.getReg())) + VEX_R = 0x0; + if (HasEVEX && X86II::is32ExtendedReg(MO.getReg())) + EVEX_R2 = 0x0; + } break; } case X86II::MRMSrcMem: @@ -619,11 +742,21 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, // FMA4: // dst(ModR/M.reg), 
src1(VEX_4V), src2(ModR/M), src3(VEX_I8IMM) // dst(ModR/M.reg), src1(VEX_4V), src2(VEX_I8IMM), src3(ModR/M), - if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp++).getReg())) + if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg())) VEX_R = 0x0; + if (HasEVEX && X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg())) + EVEX_R2 = 0x0; + CurOp++; + + if (HasEVEX_K) + EVEX_aaa = getWriteMaskRegisterEncoding(MI, CurOp++); - if (HasVEX_4V) + if (HasVEX_4V) { VEX_4V = getVEXRegisterEncoding(MI, CurOp); + if (HasEVEX && X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg())) + EVEX_V2 = 0x0; + CurOp++; + } if (X86II::isX86_64ExtendedReg( MI.getOperand(MemOperand+X86::AddrBaseReg).getReg())) @@ -631,6 +764,9 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, if (X86II::isX86_64ExtendedReg( MI.getOperand(MemOperand+X86::AddrIndexReg).getReg())) VEX_X = 0x0; + if (HasEVEX && X86II::is32ExtendedReg(MI.getOperand(MemOperand + + X86::AddrIndexReg).getReg())) + EVEX_V2 = 0x0; if (HasVEX_4VOp3) // Instruction format for 4VOp3: @@ -647,8 +783,15 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, // MRM[0-9]m instructions forms: // MemAddr // src1(VEX_4V), MemAddr - if (HasVEX_4V) - VEX_4V = getVEXRegisterEncoding(MI, 0); + if (HasVEX_4V) { + VEX_4V = getVEXRegisterEncoding(MI, CurOp); + if (HasEVEX && X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg())) + EVEX_V2 = 0x0; + CurOp++; + } + + if (HasEVEX_K) + EVEX_aaa = getWriteMaskRegisterEncoding(MI, CurOp++); if (X86II::isX86_64ExtendedReg( MI.getOperand(MemOperand+X86::AddrBaseReg).getReg())) @@ -669,16 +812,27 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, // dst(ModR/M.reg), src1(VEX_4V), src2(VEX_I8IMM), src3(ModR/M), if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg())) VEX_R = 0x0; + if (HasEVEX && X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg())) + EVEX_R2 = 0x0; CurOp++; - if (HasVEX_4V) - VEX_4V = getVEXRegisterEncoding(MI, CurOp++); + if (HasEVEX_K) + EVEX_aaa = getWriteMaskRegisterEncoding(MI, CurOp++); + + if (HasVEX_4V) { + VEX_4V = getVEXRegisterEncoding(MI, CurOp); + if (HasEVEX && X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg())) + EVEX_V2 = 0x0; + CurOp++; + } if (HasMemOp4) // Skip second register source (encoded in I8IMM) CurOp++; if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg())) VEX_B = 0x0; + if (HasEVEX && X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg())) + VEX_X = 0x0; CurOp++; if (HasVEX_4VOp3) VEX_4V = getVEXRegisterEncoding(MI, CurOp); @@ -690,13 +844,24 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, // dst(ModR/M), src1(VEX_4V), src2(ModR/M) if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg())) VEX_B = 0x0; + if (HasEVEX && X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg())) + VEX_X = 0x0; CurOp++; - if (HasVEX_4V) - VEX_4V = getVEXRegisterEncoding(MI, CurOp++); + if (HasEVEX_K) + EVEX_aaa = getWriteMaskRegisterEncoding(MI, CurOp++); + + if (HasVEX_4V) { + VEX_4V = getVEXRegisterEncoding(MI, CurOp); + if (HasEVEX && X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg())) + EVEX_V2 = 0x0; + CurOp++; + } if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg())) VEX_R = 0x0; + if (HasEVEX && X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg())) + EVEX_R2 = 0x0; break; case X86II::MRM0r: case X86II::MRM1r: case X86II::MRM2r: case X86II::MRM3r: @@ -704,9 +869,19 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned 
&CurByte, case X86II::MRM6r: case X86II::MRM7r: // MRM0r-MRM7r instructions forms: // dst(VEX_4V), src(ModR/M), imm8 - VEX_4V = getVEXRegisterEncoding(MI, 0); - if (X86II::isX86_64ExtendedReg(MI.getOperand(1).getReg())) + if (HasVEX_4V) { + VEX_4V = getVEXRegisterEncoding(MI, CurOp); + if (HasEVEX && X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg())) + EVEX_V2 = 0x0; + CurOp++; + } + if (HasEVEX_K) + EVEX_aaa = getWriteMaskRegisterEncoding(MI, CurOp++); + + if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg())) VEX_B = 0x0; + if (HasEVEX && X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg())) + VEX_X = 0x0; break; default: // RawFrm break; @@ -715,29 +890,58 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, // Emit segment override opcode prefix as needed. EmitSegmentOverridePrefix(TSFlags, CurByte, MemOperand, MI, OS); - // VEX opcode prefix can have 2 or 3 bytes - // - // 3 bytes: - // +-----+ +--------------+ +-------------------+ - // | C4h | | RXB | m-mmmm | | W | vvvv | L | pp | - // +-----+ +--------------+ +-------------------+ - // 2 bytes: - // +-----+ +-------------------+ - // | C5h | | R | vvvv | L | pp | - // +-----+ +-------------------+ - // - unsigned char LastByte = VEX_PP | (VEX_L << 2) | (VEX_4V << 3); + if (!HasEVEX) { + // VEX opcode prefix can have 2 or 3 bytes + // + // 3 bytes: + // +-----+ +--------------+ +-------------------+ + // | C4h | | RXB | m-mmmm | | W | vvvv | L | pp | + // +-----+ +--------------+ +-------------------+ + // 2 bytes: + // +-----+ +-------------------+ + // | C5h | | R | vvvv | L | pp | + // +-----+ +-------------------+ + // + unsigned char LastByte = VEX_PP | (VEX_L << 2) | (VEX_4V << 3); - if (VEX_B && VEX_X && !VEX_W && !XOP && (VEX_5M == 1)) { // 2 byte VEX prefix - EmitByte(0xC5, CurByte, OS); - EmitByte(LastByte | (VEX_R << 7), CurByte, OS); - return; - } + if (VEX_B && VEX_X && !VEX_W && !XOP && (VEX_5M == 1)) { // 2 byte VEX prefix + EmitByte(0xC5, CurByte, OS); + EmitByte(LastByte | (VEX_R << 7), CurByte, OS); + return; + } - // 3 byte VEX prefix - EmitByte(XOP ? 0x8F : 0xC4, CurByte, OS); - EmitByte(VEX_R << 7 | VEX_X << 6 | VEX_B << 5 | VEX_5M, CurByte, OS); - EmitByte(LastByte | (VEX_W << 7), CurByte, OS); + // 3 byte VEX prefix + EmitByte(XOP ? 0x8F : 0xC4, CurByte, OS); + EmitByte(VEX_R << 7 | VEX_X << 6 | VEX_B << 5 | VEX_5M, CurByte, OS); + EmitByte(LastByte | (VEX_W << 7), CurByte, OS); + } else { + // EVEX opcode prefix can have 4 bytes + // + // +-----+ +--------------+ +-------------------+ +------------------------+ + // | 62h | | RXBR' | 00mm | | W | vvvv | U | pp | | z | L'L | b | v' | aaa | + // +-----+ +--------------+ +-------------------+ +------------------------+ + assert((VEX_5M & 0x3) == VEX_5M + && "More than 2 significant bits in VEX.m-mmmm fields for EVEX!"); + + VEX_5M &= 0x3; + + EmitByte(0x62, CurByte, OS); + EmitByte((VEX_R << 7) | + (VEX_X << 6) | + (VEX_B << 5) | + (EVEX_R2 << 4) | + VEX_5M, CurByte, OS); + EmitByte((VEX_W << 7) | + (VEX_4V << 3) | + (EVEX_U << 2) | + VEX_PP, CurByte, OS); + EmitByte((EVEX_z << 7) | + (EVEX_L2 << 6) | + (VEX_L << 5) | + (EVEX_b << 4) | + (EVEX_V2 << 3) | + EVEX_aaa, CurByte, OS); + } } /// DetermineREXPrefix - Determine if the MCInst has to be encoded with a X86-64 @@ -1007,6 +1211,10 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS, bool HasMemOp4 = (TSFlags >> X86II::VEXShift) & X86II::MemOp4; const unsigned MemOp4_I8IMMOperand = 2; + // It uses the EVEX.aaa field? 
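For quick reference, the 62h-escaped EVEX prefix emitted a few hunks above packs its payload into three bytes after the escape. The standalone sketch below performs the same packing with the field names used in the hunk; EVEXFields and packEVEX are hypothetical helpers for exposition, not LLVM API, and they assume each field is already in its encoded (inverted, width-reduced) form exactly as EmitVEXOpcodePrefix leaves it.

#include <array>
#include <cstdint>

// Hypothetical mirror of the EVEX packing shown in the hunk above.
// R/X/B/R2 are the inverted one-bit register extensions, VVVV the inverted
// 4-bit register specifier, MM the VEX.m-mmmm value reduced to 2 bits.
struct EVEXFields {
  uint8_t R, X, B, R2, MM;
  uint8_t W, VVVV, U, PP;
  uint8_t Z, L2, L, Bcast, V2, AAA;
};

std::array<uint8_t, 4> packEVEX(const EVEXFields &F) {
  return {
    uint8_t(0x62),                                                     // escape
    uint8_t((F.R << 7) | (F.X << 6) | (F.B << 5) | (F.R2 << 4) | F.MM),
    uint8_t((F.W << 7) | (F.VVVV << 3) | (F.U << 2) | F.PP),
    uint8_t((F.Z << 7) | (F.L2 << 6) | (F.L << 5) | (F.Bcast << 4) |
            (F.V2 << 3) | F.AAA),
  };
}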
+ bool HasEVEX = (TSFlags >> X86II::VEXShift) & X86II::EVEX; + bool HasEVEX_K = HasEVEX && ((TSFlags >> X86II::VEXShift) & X86II::EVEX_K); + // Determine where the memory operand starts, if present. int MemoryOperand = X86II::getMemoryOperandNo(TSFlags, Opcode); if (MemoryOperand != -1) MemoryOperand += CurOp; @@ -1057,6 +1265,9 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS, EmitByte(BaseOpcode, CurByte, OS); SrcRegNum = CurOp + 1; + if (HasEVEX_K) // Skip writemask + SrcRegNum++; + if (HasVEX_4V) // Skip 1st src (which is encoded in VEX_VVVV) ++SrcRegNum; @@ -1069,6 +1280,9 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS, EmitByte(BaseOpcode, CurByte, OS); SrcRegNum = CurOp + X86::AddrNumOperands; + if (HasEVEX_K) // Skip writemask + SrcRegNum++; + if (HasVEX_4V) // Skip 1st src (which is encoded in VEX_VVVV) ++SrcRegNum; @@ -1082,6 +1296,9 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS, EmitByte(BaseOpcode, CurByte, OS); SrcRegNum = CurOp + 1; + if (HasEVEX_K) // Skip writemask + SrcRegNum++; + if (HasVEX_4V) // Skip 1st src (which is encoded in VEX_VVVV) ++SrcRegNum; @@ -1100,6 +1317,12 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS, case X86II::MRMSrcMem: { int AddrOperands = X86::AddrNumOperands; unsigned FirstMemOp = CurOp+1; + + if (HasEVEX_K) { // Skip writemask + ++AddrOperands; + ++FirstMemOp; + } + if (HasVEX_4V) { ++AddrOperands; ++FirstMemOp; // Skip the register source (which is encoded in VEX_VVVV). diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp index 5e84530..1cbdafd 100644 --- a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp @@ -263,7 +263,7 @@ static MCRegisterInfo *createX86MCRegisterInfo(StringRef TT) { return X; } -static MCAsmInfo *createX86MCAsmInfo(const Target &T, StringRef TT) { +static MCAsmInfo *createX86MCAsmInfo(const MCRegisterInfo &MRI, StringRef TT) { Triple TheTriple(TT); bool is64Bit = TheTriple.getArch() == Triple::x86_64; @@ -290,14 +290,16 @@ static MCAsmInfo *createX86MCAsmInfo(const Target &T, StringRef TT) { int stackGrowth = is64Bit ? -8 : -4; // Initial state of the frame pointer is esp+stackGrowth. - MachineLocation Dst(MachineLocation::VirtualFP); - MachineLocation Src(is64Bit ? X86::RSP : X86::ESP, stackGrowth); - MAI->addInitialFrameState(0, Dst, Src); + unsigned StackPtr = is64Bit ? X86::RSP : X86::ESP; + MCCFIInstruction Inst = MCCFIInstruction::createDefCfa( + 0, MRI.getDwarfRegNum(StackPtr, true), -stackGrowth); + MAI->addInitialFrameState(Inst); // Add return address to move list - MachineLocation CSDst(is64Bit ? X86::RSP : X86::ESP, stackGrowth); - MachineLocation CSSrc(is64Bit ? X86::RIP : X86::EIP); - MAI->addInitialFrameState(0, CSDst, CSSrc); + unsigned InstPtr = is64Bit ? 
X86::RIP : X86::EIP; + MCCFIInstruction Inst2 = MCCFIInstruction::createOffset( + 0, MRI.getDwarfRegNum(InstPtr, true), stackGrowth); + MAI->addInitialFrameState(Inst2); return MAI; } @@ -366,7 +368,7 @@ static MCStreamer *createMCStreamer(const Target &T, StringRef TT, if (TheTriple.isOSWindows() && TheTriple.getEnvironment() != Triple::ELF) return createWinCOFFStreamer(Ctx, MAB, *_Emitter, _OS, RelaxAll); - return createELFStreamer(Ctx, MAB, _OS, _Emitter, RelaxAll, NoExecStack); + return createELFStreamer(Ctx, 0, MAB, _OS, _Emitter, RelaxAll, NoExecStack); } static MCInstPrinter *createX86MCInstPrinter(const Target &T, @@ -382,6 +384,17 @@ static MCInstPrinter *createX86MCInstPrinter(const Target &T, return 0; } +static MCRelocationInfo *createX86MCRelocationInfo(StringRef TT, + MCContext &Ctx) { + Triple TheTriple(TT); + if (TheTriple.isEnvironmentMachO() && TheTriple.getArch() == Triple::x86_64) + return createX86_64MachORelocationInfo(Ctx); + else if (TheTriple.isOSBinFormatELF()) + return createX86_64ELFRelocationInfo(Ctx); + // Default to the stock relocation info. + return llvm::createMCRelocationInfo(TT, Ctx); +} + static MCInstrAnalysis *createX86MCInstrAnalysis(const MCInstrInfo *Info) { return new MCInstrAnalysis(Info); } @@ -439,4 +452,10 @@ extern "C" void LLVMInitializeX86TargetMC() { createX86MCInstPrinter); TargetRegistry::RegisterMCInstPrinter(TheX86_64Target, createX86MCInstPrinter); + + // Register the MC relocation info. + TargetRegistry::RegisterMCRelocationInfo(TheX86_32Target, + createX86MCRelocationInfo); + TargetRegistry::RegisterMCRelocationInfo(TheX86_64Target, + createX86MCRelocationInfo); } diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h index 981aa1a..41ae435 100644 --- a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h @@ -25,6 +25,7 @@ class MCInstrInfo; class MCObjectWriter; class MCRegisterInfo; class MCSubtargetInfo; +class MCRelocationInfo; class Target; class StringRef; class raw_ostream; @@ -78,8 +79,10 @@ MCCodeEmitter *createX86MCCodeEmitter(const MCInstrInfo &MCII, const MCSubtargetInfo &STI, MCContext &Ctx); -MCAsmBackend *createX86_32AsmBackend(const Target &T, StringRef TT, StringRef CPU); -MCAsmBackend *createX86_64AsmBackend(const Target &T, StringRef TT, StringRef CPU); +MCAsmBackend *createX86_32AsmBackend(const Target &T, const MCRegisterInfo &MRI, + StringRef TT, StringRef CPU); +MCAsmBackend *createX86_64AsmBackend(const Target &T, const MCRegisterInfo &MRI, + StringRef TT, StringRef CPU); /// createX86MachObjectWriter - Construct an X86 Mach-O object writer. MCObjectWriter *createX86MachObjectWriter(raw_ostream &OS, @@ -94,6 +97,12 @@ MCObjectWriter *createX86ELFObjectWriter(raw_ostream &OS, uint16_t EMachine); /// createX86WinCOFFObjectWriter - Construct an X86 Win COFF object writer. MCObjectWriter *createX86WinCOFFObjectWriter(raw_ostream &OS, bool Is64Bit); + +/// createX86_64MachORelocationInfo - Construct X86-64 Mach-O relocation info. +MCRelocationInfo *createX86_64MachORelocationInfo(MCContext &Ctx); + +/// createX86_64ELFORelocationInfo - Construct X86-64 ELF relocation info. 
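Stepping back to the initial frame state that createX86MCAsmInfo now builds with MCCFIInstruction above: the two records say that on function entry the CFA is the incoming stack pointer plus one pushed slot, and the return address sits at CFA minus that slot. A minimal sketch of the resulting unwind rule for the 64-bit case (stackGrowth == -8), using hypothetical types rather than MC classes:

#include <cstdint>

// Hypothetical illustration of the x86-64 entry unwind rule installed above.
// Equivalent assembler directives would be:
//   .cfi_def_cfa %rsp, 8      (CFA = RSP + 8)
//   .cfi_offset %rip, -8      (return address stored at CFA - 8)
struct EntryUnwindState {
  uint64_t CFA;            // canonical frame address at function entry
  uint64_t ReturnAddrSlot; // where CALL pushed the return address
};

EntryUnwindState unwindAtEntry(uint64_t RSPOnEntry) {
  EntryUnwindState S;
  S.CFA = RSPOnEntry + 8;
  S.ReturnAddrSlot = S.CFA - 8; // == RSPOnEntry
  return S;
}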
+MCRelocationInfo *createX86_64ELFRelocationInfo(MCContext &Ctx); } // End llvm namespace diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MachORelocationInfo.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MachORelocationInfo.cpp new file mode 100644 index 0000000..209b1d0e --- /dev/null +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MachORelocationInfo.cpp @@ -0,0 +1,116 @@ +//===-- X86MachORelocationInfo.cpp ----------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "MCTargetDesc/X86MCTargetDesc.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/MC/MCRelocationInfo.h" +#include "llvm/Object/MachO.h" + +using namespace llvm; +using namespace object; +using namespace MachO; + +namespace { +class X86_64MachORelocationInfo : public MCRelocationInfo { +public: + X86_64MachORelocationInfo(MCContext &Ctx) : MCRelocationInfo(Ctx) {} + + const MCExpr *createExprForRelocation(RelocationRef Rel) { + const MachOObjectFile *Obj = cast<MachOObjectFile>(Rel.getObjectFile()); + + uint64_t RelType; Rel.getType(RelType); + symbol_iterator SymI = Rel.getSymbol(); + + StringRef SymName; SymI->getName(SymName); + uint64_t SymAddr; SymI->getAddress(SymAddr); + + any_relocation_info RE = Obj->getRelocation(Rel.getRawDataRefImpl()); + bool isPCRel = Obj->getAnyRelocationPCRel(RE); + + MCSymbol *Sym = Ctx.GetOrCreateSymbol(SymName); + // FIXME: check that the value is actually the same. + if (Sym->isVariable() == false) + Sym->setVariableValue(MCConstantExpr::Create(SymAddr, Ctx)); + const MCExpr *Expr = 0; + + switch(RelType) { + case X86_64_RELOC_TLV: + Expr = MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_TLVP, Ctx); + break; + case X86_64_RELOC_SIGNED_4: + Expr = MCBinaryExpr::CreateAdd(MCSymbolRefExpr::Create(Sym, Ctx), + MCConstantExpr::Create(4, Ctx), + Ctx); + break; + case X86_64_RELOC_SIGNED_2: + Expr = MCBinaryExpr::CreateAdd(MCSymbolRefExpr::Create(Sym, Ctx), + MCConstantExpr::Create(2, Ctx), + Ctx); + break; + case X86_64_RELOC_SIGNED_1: + Expr = MCBinaryExpr::CreateAdd(MCSymbolRefExpr::Create(Sym, Ctx), + MCConstantExpr::Create(1, Ctx), + Ctx); + break; + case X86_64_RELOC_GOT_LOAD: + Expr = MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_GOTPCREL, Ctx); + break; + case X86_64_RELOC_GOT: + Expr = MCSymbolRefExpr::Create(Sym, isPCRel ? + MCSymbolRefExpr::VK_GOTPCREL : + MCSymbolRefExpr::VK_GOT, + Ctx); + break; + case X86_64_RELOC_SUBTRACTOR: + { + RelocationRef RelNext; + Obj->getRelocationNext(Rel.getRawDataRefImpl(), RelNext); + any_relocation_info RENext = Obj->getRelocation(RelNext.getRawDataRefImpl()); + + // X86_64_SUBTRACTOR must be followed by a relocation of type + // X86_64_RELOC_UNSIGNED. + // NOTE: Scattered relocations don't exist on x86_64. 
+ unsigned RType = Obj->getAnyRelocationType(RENext); + if (RType != X86_64_RELOC_UNSIGNED) + report_fatal_error("Expected X86_64_RELOC_UNSIGNED after " + "X86_64_RELOC_SUBTRACTOR."); + + const MCExpr *LHS = MCSymbolRefExpr::Create(Sym, Ctx); + + symbol_iterator RSymI = RelNext.getSymbol(); + uint64_t RSymAddr; + RSymI->getAddress(RSymAddr); + StringRef RSymName; + RSymI->getName(RSymName); + + MCSymbol *RSym = Ctx.GetOrCreateSymbol(RSymName); + if (RSym->isVariable() == false) + RSym->setVariableValue(MCConstantExpr::Create(RSymAddr, Ctx)); + + const MCExpr *RHS = MCSymbolRefExpr::Create(RSym, Ctx); + + Expr = MCBinaryExpr::CreateSub(LHS, RHS, Ctx); + break; + } + default: + Expr = MCSymbolRefExpr::Create(Sym, Ctx); + break; + } + return Expr; + } +}; +} // End unnamed namespace + +/// createX86_64MachORelocationInfo - Construct an X86-64 Mach-O RelocationInfo. +MCRelocationInfo *llvm::createX86_64MachORelocationInfo(MCContext &Ctx) { + return new X86_64MachORelocationInfo(Ctx); +} diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp index 64f005c..eb7c0b1 100644 --- a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp @@ -16,12 +16,11 @@ #include "llvm/MC/MCMachObjectWriter.h" #include "llvm/MC/MCSectionMachO.h" #include "llvm/MC/MCValue.h" -#include "llvm/Object/MachOFormat.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/Format.h" +#include "llvm/Support/MachO.h" using namespace llvm; -using namespace llvm::object; namespace { class X86MachObjectWriter : public MCMachObjectTargetWriter { @@ -132,7 +131,7 @@ void X86MachObjectWriter::RecordX86_64Relocation(MachObjectWriter *Writer, if (Target.isAbsolute()) { // constant // SymbolNum of 0 indicates the absolute section. - Type = macho::RIT_X86_64_Unsigned; + Type = MachO::X86_64_RELOC_UNSIGNED; Index = 0; // FIXME: I believe this is broken, I don't think the linker can understand @@ -141,26 +140,31 @@ void X86MachObjectWriter::RecordX86_64Relocation(MachObjectWriter *Writer, // is to use an absolute symbol (which we don't support yet). if (IsPCRel) { IsExtern = 1; - Type = macho::RIT_X86_64_Branch; + Type = MachO::X86_64_RELOC_BRANCH; } } else if (Target.getSymB()) { // A - B + constant const MCSymbol *A = &Target.getSymA()->getSymbol(); + if (A->isTemporary()) + A = &A->AliasedSymbol(); MCSymbolData &A_SD = Asm.getSymbolData(*A); const MCSymbolData *A_Base = Asm.getAtom(&A_SD); const MCSymbol *B = &Target.getSymB()->getSymbol(); + if (B->isTemporary()) + B = &B->AliasedSymbol(); MCSymbolData &B_SD = Asm.getSymbolData(*B); const MCSymbolData *B_Base = Asm.getAtom(&B_SD); // Neither symbol can be modified. if (Target.getSymA()->getKind() != MCSymbolRefExpr::VK_None || Target.getSymB()->getKind() != MCSymbolRefExpr::VK_None) - report_fatal_error("unsupported relocation of modified symbol"); + report_fatal_error("unsupported relocation of modified symbol", false); // We don't support PCrel relocations of differences. Darwin 'as' doesn't // implement most of these correctly. 
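The X86_64MachORelocationInfo class introduced above lets a disassembler re-symbolize operands, printing sym@TLVP, sym@GOTPCREL or sym + 4 instead of raw values. The sketch below approximates that mapping as plain strings; the enum values follow the X86_64_RELOC_* constants used in the switch, and the SUBTRACTOR pair case (which has to consult the following UNSIGNED entry) is left out for brevity. This is expository code, not the LLVM implementation.

#include <cstdint>
#include <string>

// Text-producing approximation of createExprForRelocation above.
enum : uint64_t {
  RELOC_UNSIGNED = 0, RELOC_SIGNED = 1, RELOC_BRANCH = 2, RELOC_GOT_LOAD = 3,
  RELOC_GOT = 4, RELOC_SIGNED_1 = 6, RELOC_SIGNED_2 = 7, RELOC_SIGNED_4 = 8,
  RELOC_TLV = 9
};

std::string exprForReloc(uint64_t RelType, const std::string &Sym,
                         bool IsPCRel) {
  switch (RelType) {
  case RELOC_TLV:      return Sym + "@TLVP";
  case RELOC_GOT_LOAD: return Sym + "@GOTPCREL";
  case RELOC_GOT:      return Sym + (IsPCRel ? "@GOTPCREL" : "@GOT");
  case RELOC_SIGNED_1: return Sym + " + 1";
  case RELOC_SIGNED_2: return Sym + " + 2";
  case RELOC_SIGNED_4: return Sym + " + 4";
  default:             return Sym; // UNSIGNED / SIGNED / BRANCH: bare symbol
  }
}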
if (IsPCRel) - report_fatal_error("unsupported pc-relative relocation of difference"); + report_fatal_error("unsupported pc-relative relocation of difference", + false); // The support for the situation where one or both of the symbols would // require a local relocation is handled just like if the symbols were @@ -173,7 +177,13 @@ void X86MachObjectWriter::RecordX86_64Relocation(MachObjectWriter *Writer, // single SIGNED relocation); reject it for now. Except the case where both // symbols don't have a base, equal but both NULL. if (A_Base == B_Base && A_Base) - report_fatal_error("unsupported relocation with identical base"); + report_fatal_error("unsupported relocation with identical base", false); + + // A subtraction expression where both symbols are undefined is a + // non-relocatable expression. + if (A->isUndefined() && B->isUndefined()) + report_fatal_error("unsupported relocation with subtraction expression", + false); Value += Writer->getSymbolAddress(&A_SD, Layout) - (A_Base == NULL ? 0 : Writer->getSymbolAddress(A_Base, Layout)); @@ -188,15 +198,15 @@ void X86MachObjectWriter::RecordX86_64Relocation(MachObjectWriter *Writer, Index = A_SD.getFragment()->getParent()->getOrdinal() + 1; IsExtern = 0; } - Type = macho::RIT_X86_64_Unsigned; - - macho::RelocationEntry MRE; - MRE.Word0 = FixupOffset; - MRE.Word1 = ((Index << 0) | - (IsPCRel << 24) | - (Log2Size << 25) | - (IsExtern << 27) | - (Type << 28)); + Type = MachO::X86_64_RELOC_UNSIGNED; + + MachO::any_relocation_info MRE; + MRE.r_word0 = FixupOffset; + MRE.r_word1 = ((Index << 0) | + (IsPCRel << 24) | + (Log2Size << 25) | + (IsExtern << 27) | + (Type << 28)); Writer->addRelocation(Fragment->getParent(), MRE); if (B_Base) { @@ -207,7 +217,7 @@ void X86MachObjectWriter::RecordX86_64Relocation(MachObjectWriter *Writer, Index = B_SD.getFragment()->getParent()->getOrdinal() + 1; IsExtern = 0; } - Type = macho::RIT_X86_64_Subtractor; + Type = MachO::X86_64_RELOC_SUBTRACTOR; } else { const MCSymbol *Symbol = &Target.getSymA()->getSymbol(); MCSymbolData &SD = Asm.getSymbolData(*Symbol); @@ -252,11 +262,11 @@ void X86MachObjectWriter::RecordX86_64Relocation(MachObjectWriter *Writer, return; } else { report_fatal_error("unsupported relocation of variable '" + - Symbol->getName() + "'"); + Symbol->getName() + "'", false); } } else { report_fatal_error("unsupported relocation of undefined symbol '" + - Symbol->getName() + "'"); + Symbol->getName() + "'", false); } MCSymbolRefExpr::VariantKind Modifier = Target.getSymA()->getKind(); @@ -267,15 +277,16 @@ void X86MachObjectWriter::RecordX86_64Relocation(MachObjectWriter *Writer, // rewrite the movq to an leaq at link time if the symbol ends up in // the same linkage unit. 
if (unsigned(Fixup.getKind()) == X86::reloc_riprel_4byte_movq_load) - Type = macho::RIT_X86_64_GOTLoad; + Type = MachO::X86_64_RELOC_GOT_LOAD; else - Type = macho::RIT_X86_64_GOT; + Type = MachO::X86_64_RELOC_GOT; } else if (Modifier == MCSymbolRefExpr::VK_TLVP) { - Type = macho::RIT_X86_64_TLV; + Type = MachO::X86_64_RELOC_TLV; } else if (Modifier != MCSymbolRefExpr::VK_None) { - report_fatal_error("unsupported symbol modifier in relocation"); + report_fatal_error("unsupported symbol modifier in relocation", + false); } else { - Type = macho::RIT_X86_64_Signed; + Type = MachO::X86_64_RELOC_SIGNED; // The Darwin x86_64 relocation format has a problem where it cannot // encode an address (L<foo> + <constant>) which is outside the atom @@ -292,34 +303,40 @@ void X86MachObjectWriter::RecordX86_64Relocation(MachObjectWriter *Writer, // (the additional bias), but instead appear to just look at the final // offset. switch (-(Target.getConstant() + (1LL << Log2Size))) { - case 1: Type = macho::RIT_X86_64_Signed1; break; - case 2: Type = macho::RIT_X86_64_Signed2; break; - case 4: Type = macho::RIT_X86_64_Signed4; break; + case 1: Type = MachO::X86_64_RELOC_SIGNED_1; break; + case 2: Type = MachO::X86_64_RELOC_SIGNED_2; break; + case 4: Type = MachO::X86_64_RELOC_SIGNED_4; break; } } } else { if (Modifier != MCSymbolRefExpr::VK_None) report_fatal_error("unsupported symbol modifier in branch " - "relocation"); + "relocation", false); - Type = macho::RIT_X86_64_Branch; + Type = MachO::X86_64_RELOC_BRANCH; } } else { if (Modifier == MCSymbolRefExpr::VK_GOT) { - Type = macho::RIT_X86_64_GOT; + Type = MachO::X86_64_RELOC_GOT; } else if (Modifier == MCSymbolRefExpr::VK_GOTPCREL) { // GOTPCREL is allowed as a modifier on non-PCrel instructions, in which // case all we do is set the PCrel bit in the relocation entry; this is // used with exception handling, for example. The source is required to // include any necessary offset directly. 
- Type = macho::RIT_X86_64_GOT; + Type = MachO::X86_64_RELOC_GOT; IsPCRel = 1; } else if (Modifier == MCSymbolRefExpr::VK_TLVP) { - report_fatal_error("TLVP symbol modifier should have been rip-rel"); + report_fatal_error("TLVP symbol modifier should have been rip-rel", + false); } else if (Modifier != MCSymbolRefExpr::VK_None) - report_fatal_error("unsupported symbol modifier in relocation"); - else - Type = macho::RIT_X86_64_Unsigned; + report_fatal_error("unsupported symbol modifier in relocation", false); + else { + Type = MachO::X86_64_RELOC_UNSIGNED; + unsigned Kind = Fixup.getKind(); + if (Kind == X86::reloc_signed_4byte) + report_fatal_error("32-bit absolute addressing is not supported in " + "64-bit mode", false); + } } } @@ -327,13 +344,13 @@ void X86MachObjectWriter::RecordX86_64Relocation(MachObjectWriter *Writer, FixedValue = Value; // struct relocation_info (8 bytes) - macho::RelocationEntry MRE; - MRE.Word0 = FixupOffset; - MRE.Word1 = ((Index << 0) | - (IsPCRel << 24) | - (Log2Size << 25) | - (IsExtern << 27) | - (Type << 28)); + MachO::any_relocation_info MRE; + MRE.r_word0 = FixupOffset; + MRE.r_word1 = ((Index << 0) | + (IsPCRel << 24) | + (Log2Size << 25) | + (IsExtern << 27) | + (Type << 28)); Writer->addRelocation(Fragment->getParent(), MRE); } @@ -347,7 +364,7 @@ bool X86MachObjectWriter::RecordScatteredRelocation(MachObjectWriter *Writer, uint64_t &FixedValue) { uint32_t FixupOffset = Layout.getFragmentOffset(Fragment)+Fixup.getOffset(); unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, Fixup.getKind()); - unsigned Type = macho::RIT_Vanilla; + unsigned Type = MachO::GENERIC_RELOC_VANILLA; // See <reloc.h>. const MCSymbol *A = &Target.getSymA()->getSymbol(); @@ -355,7 +372,8 @@ bool X86MachObjectWriter::RecordScatteredRelocation(MachObjectWriter *Writer, if (!A_SD->getFragment()) report_fatal_error("symbol '" + A->getName() + - "' can not be undefined in a subtraction expression"); + "' can not be undefined in a subtraction expression", + false); uint32_t Value = Writer->getSymbolAddress(A_SD, Layout); uint64_t SecAddr = Writer->getSectionAddress(A_SD->getFragment()->getParent()); @@ -367,22 +385,23 @@ bool X86MachObjectWriter::RecordScatteredRelocation(MachObjectWriter *Writer, if (!B_SD->getFragment()) report_fatal_error("symbol '" + B->getSymbol().getName() + - "' can not be undefined in a subtraction expression"); + "' can not be undefined in a subtraction expression", + false); // Select the appropriate difference relocation type. // // Note that there is no longer any semantic difference between these two // relocation types from the linkers point of view, this is done solely for // pedantic compatibility with 'as'. - Type = A_SD->isExternal() ? (unsigned)macho::RIT_Difference : - (unsigned)macho::RIT_Generic_LocalDifference; + Type = A_SD->isExternal() ? (unsigned)MachO::GENERIC_RELOC_SECTDIFF : + (unsigned)MachO::GENERIC_RELOC_LOCAL_SECTDIFF; Value2 = Writer->getSymbolAddress(B_SD, Layout); FixedValue -= Writer->getSectionAddress(B_SD->getFragment()->getParent()); } // Relocations are written out in reverse order, so the PAIR comes first. - if (Type == macho::RIT_Difference || - Type == macho::RIT_Generic_LocalDifference) { + if (Type == MachO::GENERIC_RELOC_SECTDIFF || + Type == MachO::GENERIC_RELOC_LOCAL_SECTDIFF) { // If the offset is too large to fit in a scattered relocation, // we're hosed. It's an unfortunate limitation of the MachO format. 
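That 24-bit limitation comes straight from the relocation word layout that both paths above now fill in via MachO::any_relocation_info. A small sketch of the packing follows; the helper names are illustrative only.

#include <cstdint>

// Non-scattered entries (as in RecordX86_64Relocation above):
//   r_word0 = r_address
//   r_word1 = symbolnum | pcrel<<24 | length<<25 | extern<<27 | type<<28
// Scattered entries (RecordScatteredRelocation):
//   r_word0 = address(24 bits) | type<<24 | length<<28 | pcrel<<30 | R_SCATTERED
//   r_word1 = r_value
uint32_t packWord1(uint32_t SymbolNum, bool IsPCRel, unsigned Log2Size,
                   bool IsExtern, unsigned Type) {
  return (SymbolNum << 0) | (uint32_t(IsPCRel) << 24) | (Log2Size << 25) |
         (uint32_t(IsExtern) << 27) | (uint32_t(Type) << 28);
}

bool fitsScatteredAddress(uint64_t FixupOffset) {
  // Only 24 bits of r_address remain once type, length, pcrel and the
  // R_SCATTERED flag are packed into r_word0 -- hence the 0xffffff check.
  return FixupOffset <= 0xffffff;
}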
if (FixupOffset > 0xffffff) { @@ -396,13 +415,13 @@ bool X86MachObjectWriter::RecordScatteredRelocation(MachObjectWriter *Writer, llvm_unreachable("fatal error returned?!"); } - macho::RelocationEntry MRE; - MRE.Word0 = ((0 << 0) | - (macho::RIT_Pair << 24) | - (Log2Size << 28) | - (IsPCRel << 30) | - macho::RF_Scattered); - MRE.Word1 = Value2; + MachO::any_relocation_info MRE; + MRE.r_word0 = ((0 << 0) | // r_address + (MachO::GENERIC_RELOC_PAIR << 24) | // r_type + (Log2Size << 28) | + (IsPCRel << 30) | + MachO::R_SCATTERED); + MRE.r_word1 = Value2; Writer->addRelocation(Fragment->getParent(), MRE); } else { // If the offset is more than 24-bits, it won't fit in a scattered @@ -416,13 +435,13 @@ bool X86MachObjectWriter::RecordScatteredRelocation(MachObjectWriter *Writer, return false; } - macho::RelocationEntry MRE; - MRE.Word0 = ((FixupOffset << 0) | - (Type << 24) | - (Log2Size << 28) | - (IsPCRel << 30) | - macho::RF_Scattered); - MRE.Word1 = Value; + MachO::any_relocation_info MRE; + MRE.r_word0 = ((FixupOffset << 0) | + (Type << 24) | + (Log2Size << 28) | + (IsPCRel << 30) | + MachO::R_SCATTERED); + MRE.r_word1 = Value; Writer->addRelocation(Fragment->getParent(), MRE); return true; } @@ -464,13 +483,13 @@ void X86MachObjectWriter::RecordTLVPRelocation(MachObjectWriter *Writer, } // struct relocation_info (8 bytes) - macho::RelocationEntry MRE; - MRE.Word0 = Value; - MRE.Word1 = ((Index << 0) | - (IsPCRel << 24) | - (Log2Size << 25) | - (1 << 27) | // Extern - (macho::RIT_Generic_TLV << 28)); // Type + MachO::any_relocation_info MRE; + MRE.r_word0 = Value; + MRE.r_word1 = ((Index << 0) | + (IsPCRel << 24) | + (Log2Size << 25) | + (1 << 27) | // r_extern + (MachO::GENERIC_RELOC_TLV << 28)); // r_type Writer->addRelocation(Fragment->getParent(), MRE); } @@ -530,7 +549,7 @@ void X86MachObjectWriter::RecordX86Relocation(MachObjectWriter *Writer, // // FIXME: Currently, these are never generated (see code below). I cannot // find a case where they are actually emitted. - Type = macho::RIT_Vanilla; + Type = MachO::GENERIC_RELOC_VANILLA; } else { // Resolve constant variables. 
if (SD->getSymbol().isVariable()) { @@ -561,17 +580,17 @@ void X86MachObjectWriter::RecordX86Relocation(MachObjectWriter *Writer, if (IsPCRel) FixedValue -= Writer->getSectionAddress(Fragment->getParent()); - Type = macho::RIT_Vanilla; + Type = MachO::GENERIC_RELOC_VANILLA; } // struct relocation_info (8 bytes) - macho::RelocationEntry MRE; - MRE.Word0 = FixupOffset; - MRE.Word1 = ((Index << 0) | - (IsPCRel << 24) | - (Log2Size << 25) | - (IsExtern << 27) | - (Type << 28)); + MachO::any_relocation_info MRE; + MRE.r_word0 = FixupOffset; + MRE.r_word1 = ((Index << 0) | + (IsPCRel << 24) | + (Log2Size << 25) | + (IsExtern << 27) | + (Type << 28)); Writer->addRelocation(Fragment->getParent(), MRE); } diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp index ed64a32..6da4142 100644 --- a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp @@ -27,7 +27,7 @@ namespace { public: X86WinCOFFObjectWriter(bool Is64Bit_); - ~X86WinCOFFObjectWriter(); + virtual ~X86WinCOFFObjectWriter(); virtual unsigned getRelocType(const MCValue &Target, const MCFixup &Fixup, diff --git a/contrib/llvm/lib/Target/X86/X86.td b/contrib/llvm/lib/Target/X86/X86.td index c865500..65c5552 100644 --- a/contrib/llvm/lib/Target/X86/X86.td +++ b/contrib/llvm/lib/Target/X86/X86.td @@ -50,10 +50,10 @@ def FeatureSSE3 : SubtargetFeature<"sse3", "X86SSELevel", "SSE3", def FeatureSSSE3 : SubtargetFeature<"ssse3", "X86SSELevel", "SSSE3", "Enable SSSE3 instructions", [FeatureSSE3]>; -def FeatureSSE41 : SubtargetFeature<"sse41", "X86SSELevel", "SSE41", +def FeatureSSE41 : SubtargetFeature<"sse4.1", "X86SSELevel", "SSE41", "Enable SSE 4.1 instructions", [FeatureSSSE3]>; -def FeatureSSE42 : SubtargetFeature<"sse42", "X86SSELevel", "SSE42", +def FeatureSSE42 : SubtargetFeature<"sse4.2", "X86SSELevel", "SSE42", "Enable SSE 4.2 instructions", [FeatureSSE41]>; def Feature3DNow : SubtargetFeature<"3dnow", "X863DNowLevel", "ThreeDNow", @@ -68,7 +68,7 @@ def Feature3DNowA : SubtargetFeature<"3dnowa", "X863DNowLevel", "ThreeDNowA", def Feature64Bit : SubtargetFeature<"64bit", "HasX86_64", "true", "Support 64-bit instructions", [FeatureCMOV]>; -def FeatureCMPXCHG16B : SubtargetFeature<"cmpxchg16b", "HasCmpxchg16b", "true", +def FeatureCMPXCHG16B : SubtargetFeature<"cx16", "HasCmpxchg16b", "true", "64-bit with cmpxchg16b", [Feature64Bit]>; def FeatureSlowBTMem : SubtargetFeature<"slow-bt-mem", "IsBTMemSlow", "true", @@ -86,6 +86,19 @@ def FeatureAVX : SubtargetFeature<"avx", "X86SSELevel", "AVX", def FeatureAVX2 : SubtargetFeature<"avx2", "X86SSELevel", "AVX2", "Enable AVX2 instructions", [FeatureAVX]>; +def FeatureAVX512 : SubtargetFeature<"avx512f", "X86SSELevel", "AVX512F", + "Enable AVX-512 instructions", + [FeatureAVX2]>; +def FeatureERI : SubtargetFeature<"avx512er", "HasERI", "true", + "Enable AVX-512 Exponential and Reciprocal Instructions", + [FeatureAVX512]>; +def FeatureCDI : SubtargetFeature<"avx512cd", "HasCDI", "true", + "Enable AVX-512 Conflict Detection Instructions", + [FeatureAVX512]>; +def FeaturePFI : SubtargetFeature<"avx512pf", "HasPFI", "true", + "Enable AVX-512 PreFetch Instructions", + [FeatureAVX512]>; + def FeaturePCLMUL : SubtargetFeature<"pclmul", "HasPCLMUL", "true", "Enable packed carry-less multiplication instructions", [FeatureSSE2]>; @@ -104,12 +117,15 @@ def FeatureVectorUAMem : SubtargetFeature<"vector-unaligned-mem", def 
FeatureAES : SubtargetFeature<"aes", "HasAES", "true", "Enable AES instructions", [FeatureSSE2]>; +def FeatureTBM : SubtargetFeature<"tbm", "HasTBM", "true", + "Enable TBM instructions">; def FeatureMOVBE : SubtargetFeature<"movbe", "HasMOVBE", "true", "Support MOVBE instruction">; -def FeatureRDRAND : SubtargetFeature<"rdrand", "HasRDRAND", "true", +def FeatureRDRAND : SubtargetFeature<"rdrnd", "HasRDRAND", "true", "Support RDRAND instruction">; def FeatureF16C : SubtargetFeature<"f16c", "HasF16C", "true", - "Support 16-bit floating point conversion instructions">; + "Support 16-bit floating point conversion instructions", + [FeatureAVX]>; def FeatureFSGSBase : SubtargetFeature<"fsgsbase", "HasFSGSBase", "true", "Support FS/GS Base instructions">; def FeatureLZCNT : SubtargetFeature<"lzcnt", "HasLZCNT", "true", @@ -124,6 +140,9 @@ def FeatureHLE : SubtargetFeature<"hle", "HasHLE", "true", "Support HLE">; def FeatureADX : SubtargetFeature<"adx", "HasADX", "true", "Support ADX instructions">; +def FeatureSHA : SubtargetFeature<"sha", "HasSHA", "true", + "Enable SHA instructions", + [FeatureSSE2]>; def FeaturePRFCHW : SubtargetFeature<"prfchw", "HasPRFCHW", "true", "Support PRFCHW instructions">; def FeatureRDSEED : SubtargetFeature<"rdseed", "HasRDSEED", "true", @@ -150,6 +169,8 @@ include "X86Schedule.td" def ProcIntelAtom : SubtargetFeature<"atom", "X86ProcFamily", "IntelAtom", "Intel Atom processors">; +def ProcIntelSLM : SubtargetFeature<"slm", "X86ProcFamily", "IntelSLM", + "Intel Silvermont processors">; class Proc<string Name, list<SubtargetFeature> Features> : ProcessorModel<Name, GenericModel, Features>; @@ -193,6 +214,14 @@ def : ProcessorModel<"atom", AtomModel, FeatureLEAUsesAG, FeaturePadShortFunctions]>; +// Atom Silvermont. +def : ProcessorModel<"slm", SLMModel, [ProcIntelSLM, + FeatureSSE42, FeatureCMPXCHG16B, + FeatureMOVBE, FeaturePOPCNT, + FeaturePCLMUL, FeatureAES, + FeatureCallRegIndirect, + FeaturePRFCHW, + FeatureSlowBTMem]>; // "Arrandale" along with corei3 and corei5 def : ProcessorModel<"corei7", SandyBridgeModel, [FeatureSSE42, FeatureCMPXCHG16B, FeatureSlowBTMem, @@ -227,6 +256,15 @@ def : ProcessorModel<"core-avx2", HaswellModel, FeatureBMI, FeatureBMI2, FeatureFMA, FeatureRTM, FeatureHLE]>; +// KNL +// FIXME: define KNL model +def : ProcessorModel<"knl", HaswellModel, + [FeatureAVX512, FeatureERI, FeatureCDI, FeaturePFI, + FeatureCMPXCHG16B, FeatureFastUAMem, FeaturePOPCNT, + FeatureAES, FeaturePCLMUL, FeatureRDRAND, FeatureF16C, + FeatureFSGSBase, FeatureMOVBE, FeatureLZCNT, FeatureBMI, + FeatureBMI2, FeatureFMA, FeatureRTM, FeatureHLE]>; + def : Proc<"k6", [FeatureMMX]>; def : Proc<"k6-2", [Feature3DNow]>; def : Proc<"k6-3", [Feature3DNow]>; @@ -254,21 +292,30 @@ def : Proc<"amdfam10", [FeatureSSE4A, FeaturePOPCNT, FeatureSlowBTMem]>; // Bobcat def : Proc<"btver1", [FeatureSSSE3, FeatureSSE4A, FeatureCMPXCHG16B, - FeatureLZCNT, FeaturePOPCNT]>; + FeaturePRFCHW, FeatureLZCNT, FeaturePOPCNT]>; // Jaguar def : Proc<"btver2", [FeatureAVX, FeatureSSE4A, FeatureCMPXCHG16B, - FeatureAES, FeaturePCLMUL, FeatureBMI, - FeatureF16C, FeatureMOVBE, FeatureLZCNT, - FeaturePOPCNT]>; + FeaturePRFCHW, FeatureAES, FeaturePCLMUL, + FeatureBMI, FeatureF16C, FeatureMOVBE, + FeatureLZCNT, FeaturePOPCNT]>; // Bulldozer def : Proc<"bdver1", [FeatureXOP, FeatureFMA4, FeatureCMPXCHG16B, - FeatureAES, FeaturePCLMUL, + FeatureAES, FeaturePRFCHW, FeaturePCLMUL, FeatureLZCNT, FeaturePOPCNT]>; // Piledriver def : Proc<"bdver2", [FeatureXOP, FeatureFMA4, FeatureCMPXCHG16B, - FeatureAES, 
FeaturePCLMUL, + FeatureAES, FeaturePRFCHW, FeaturePCLMUL, + FeatureF16C, FeatureLZCNT, + FeaturePOPCNT, FeatureBMI, FeatureTBM, + FeatureFMA]>; + +// Steamroller +def : Proc<"bdver3", [FeatureXOP, FeatureFMA4, FeatureCMPXCHG16B, + FeatureAES, FeaturePRFCHW, FeaturePCLMUL, FeatureF16C, FeatureLZCNT, - FeaturePOPCNT, FeatureBMI, FeatureFMA]>; + FeaturePOPCNT, FeatureBMI, FeatureTBM, + FeatureFMA, FeatureFSGSBase]>; + def : Proc<"geode", [Feature3DNowA]>; def : Proc<"winchip-c6", [FeatureMMX]>; diff --git a/contrib/llvm/lib/Target/X86/X86AsmPrinter.cpp b/contrib/llvm/lib/Target/X86/X86AsmPrinter.cpp index 6b228b0..1258441 100644 --- a/contrib/llvm/lib/Target/X86/X86AsmPrinter.cpp +++ b/contrib/llvm/lib/Target/X86/X86AsmPrinter.cpp @@ -96,7 +96,7 @@ void X86AsmPrinter::printSymbolOperand(const MachineOperand &MO, MO.getTargetFlags() == X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE) GVSym = GetSymbolWithGlobalValueBase(GV, "$non_lazy_ptr"); else - GVSym = Mang->getSymbol(GV); + GVSym = getSymbol(GV); // Handle dllimport linkage. if (MO.getTargetFlags() == X86II::MO_DLLIMPORT) @@ -109,21 +109,21 @@ void X86AsmPrinter::printSymbolOperand(const MachineOperand &MO, MMI->getObjFileInfo<MachineModuleInfoMachO>().getGVStubEntry(Sym); if (StubSym.getPointer() == 0) StubSym = MachineModuleInfoImpl:: - StubValueTy(Mang->getSymbol(GV), !GV->hasInternalLinkage()); + StubValueTy(getSymbol(GV), !GV->hasInternalLinkage()); } else if (MO.getTargetFlags() == X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE){ MCSymbol *Sym = GetSymbolWithGlobalValueBase(GV, "$non_lazy_ptr"); MachineModuleInfoImpl::StubValueTy &StubSym = MMI->getObjFileInfo<MachineModuleInfoMachO>().getHiddenGVStubEntry(Sym); if (StubSym.getPointer() == 0) StubSym = MachineModuleInfoImpl:: - StubValueTy(Mang->getSymbol(GV), !GV->hasInternalLinkage()); + StubValueTy(getSymbol(GV), !GV->hasInternalLinkage()); } else if (MO.getTargetFlags() == X86II::MO_DARWIN_STUB) { MCSymbol *Sym = GetSymbolWithGlobalValueBase(GV, "$stub"); MachineModuleInfoImpl::StubValueTy &StubSym = MMI->getObjFileInfo<MachineModuleInfoMachO>().getFnStubEntry(Sym); if (StubSym.getPointer() == 0) StubSym = MachineModuleInfoImpl:: - StubValueTy(Mang->getSymbol(GV), !GV->hasInternalLinkage()); + StubValueTy(getSymbol(GV), !GV->hasInternalLinkage()); } // If the name begins with a dollar-sign, enclose it in parens. We do this @@ -333,21 +333,21 @@ void X86AsmPrinter::printIntelMemReference(const MachineInstr *MI, unsigned Op, const MachineOperand &IndexReg = MI->getOperand(Op+2); const MachineOperand &DispSpec = MI->getOperand(Op+3); const MachineOperand &SegReg = MI->getOperand(Op+4); - + // If this has a segment register, print it. if (SegReg.getReg()) { printOperand(MI, Op+4, O, Modifier, AsmVariant); O << ':'; } - + O << '['; - + bool NeedPlus = false; if (BaseReg.getReg()) { printOperand(MI, Op, O, Modifier, AsmVariant); NeedPlus = true; } - + if (IndexReg.getReg()) { if (NeedPlus) O << " + "; if (ScaleVal != 1) @@ -394,6 +394,7 @@ bool X86AsmPrinter::printAsmMRegister(const MachineOperand &MO, char Mode, Reg = getX86SubSuperRegister(Reg, MVT::i32); break; case 'q': // Print DImode register + // FIXME: gcc will actually print e instead of r for 32-bit. 
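printAsmMRegister above handles the 'b', 'w', 'k' and 'q' inline-asm operand modifiers, which select the 8-, 16-, 32- and 64-bit views of the same operand (the FIXME notes that gcc prints the 32-bit 'e' name for 'q' on 32-bit targets). A small user-level illustration, assuming a GCC/Clang-style toolchain targeting x86-64; the increments are arbitrary and only serve to show the sub-register names being substituted.

// Each modifier picks a different view of operand 0; with r allocated to RAX
// the substituted names would be %al, %ax, %eax and %rax respectively.
unsigned long demoModifiers(unsigned long x) {
  unsigned long r = x;
  asm("incb %b0\n\t"   // 8-bit sub-register
      "incw %w0\n\t"   // 16-bit sub-register
      "incl %k0\n\t"   // 32-bit sub-register
      "incq %q0"       // full 64-bit register
      : "+r"(r));
  return r;
}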
Reg = getX86SubSuperRegister(Reg, MVT::i64); break; } @@ -518,6 +519,27 @@ bool X86AsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, void X86AsmPrinter::EmitStartOfAsmFile(Module &M) { if (Subtarget->isTargetEnvMacho()) OutStreamer.SwitchSection(getObjFileLowering().getTextSection()); + + if (Subtarget->isTargetCOFF()) { + // Emit an absolute @feat.00 symbol. This appears to be some kind of + // compiler features bitfield read by link.exe. + if (!Subtarget->is64Bit()) { + MCSymbol *S = MMI->getContext().GetOrCreateSymbol(StringRef("@feat.00")); + OutStreamer.BeginCOFFSymbolDef(S); + OutStreamer.EmitCOFFSymbolStorageClass(COFF::IMAGE_SYM_CLASS_STATIC); + OutStreamer.EmitCOFFSymbolType(COFF::IMAGE_SYM_DTYPE_NULL); + OutStreamer.EndCOFFSymbolDef(); + // According to the PE-COFF spec, the LSB of this value marks the object + // for "registered SEH". This means that all SEH handler entry points + // must be registered in .sxdata. Use of any unregistered handlers will + // cause the process to terminate immediately. LLVM does not know how to + // register any SEH handlers, so its object files should be safe. + S->setAbsolute(); + OutStreamer.EmitSymbolAttribute(S, MCSA_Global); + OutStreamer.EmitAssignment( + S, MCConstantExpr::Create(int64_t(1), MMI->getContext())); + } + } } @@ -606,6 +628,8 @@ void X86AsmPrinter::EmitEndOfAsmFile(Module &M) { OutStreamer.AddBlankLine(); } + SM.serializeToStackMapSection(); + // Funny Darwin hack: This flag tells the linker that no global symbols // contain code that falls through to other global symbols (e.g. the obvious // implementation of multiple entry points). If this doesn't occur, the @@ -645,12 +669,12 @@ void X86AsmPrinter::EmitEndOfAsmFile(Module &M) { for (Module::const_iterator I = M.begin(), E = M.end(); I != E; ++I) if (I->hasDLLExportLinkage()) - DLLExportedFns.push_back(Mang->getSymbol(I)); + DLLExportedFns.push_back(getSymbol(I)); for (Module::const_global_iterator I = M.global_begin(), E = M.global_end(); I != E; ++I) if (I->hasDLLExportLinkage()) - DLLExportedGlobals.push_back(Mang->getSymbol(I)); + DLLExportedGlobals.push_back(getSymbol(I)); // Output linker support code for dllexported globals on windows. if (!DLLExportedGlobals.empty() || !DLLExportedFns.empty()) { @@ -702,48 +726,6 @@ void X86AsmPrinter::EmitEndOfAsmFile(Module &M) { } } -MachineLocation -X86AsmPrinter::getDebugValueLocation(const MachineInstr *MI) const { - MachineLocation Location; - assert (MI->getNumOperands() == 7 && "Invalid no. of machine operands!"); - // Frame address. Currently handles register +- offset only. - - if (MI->getOperand(0).isReg() && MI->getOperand(3).isImm()) - Location.set(MI->getOperand(0).getReg(), MI->getOperand(3).getImm()); - else { - DEBUG(dbgs() << "DBG_VALUE instruction ignored! " << *MI << "\n"); - } - return Location; -} - -void X86AsmPrinter::PrintDebugValueComment(const MachineInstr *MI, - raw_ostream &O) { - // Only the target-dependent form of DBG_VALUE should get here. - // Referencing the offset and metadata as NOps-2 and NOps-1 is - // probably portable to other targets; frame pointer location is not. - unsigned NOps = MI->getNumOperands(); - assert(NOps==7); - O << '\t' << MAI->getCommentString() << "DEBUG_VALUE: "; - // cast away const; DIetc do not take const operands for some reason. - DIVariable V(const_cast<MDNode *>(MI->getOperand(NOps-1).getMetadata())); - if (V.getContext().isSubprogram()) - O << DISubprogram(V.getContext()).getDisplayName() << ":"; - O << V.getName(); - O << " <- "; - // Frame address. 
Currently handles register +- offset only. - O << '['; - if (MI->getOperand(0).isReg() && MI->getOperand(0).getReg()) - printOperand(MI, 0, O); - else - O << "undef"; - O << '+'; printOperand(MI, 3, O); - O << ']'; - O << "+"; - printOperand(MI, NOps-2, O); -} - - - //===----------------------------------------------------------------------===// // Target Registry Stuff //===----------------------------------------------------------------------===// diff --git a/contrib/llvm/lib/Target/X86/X86AsmPrinter.h b/contrib/llvm/lib/Target/X86/X86AsmPrinter.h index bc7496b..24a768b 100644 --- a/contrib/llvm/lib/Target/X86/X86AsmPrinter.h +++ b/contrib/llvm/lib/Target/X86/X86AsmPrinter.h @@ -16,6 +16,7 @@ #include "llvm/CodeGen/AsmPrinter.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/ValueTypes.h" +#include "llvm/CodeGen/StackMaps.h" #include "llvm/Support/Compiler.h" namespace llvm { @@ -24,9 +25,21 @@ class MCStreamer; class LLVM_LIBRARY_VISIBILITY X86AsmPrinter : public AsmPrinter { const X86Subtarget *Subtarget; + StackMaps SM; + + // Parses operands of PATCHPOINT and STACKMAP to produce stack map Location + // structures. Returns a result location and an iterator to the operand + // immediately following the operands consumed. + // + // This method is implemented in X86MCInstLower.cpp. + static std::pair<StackMaps::Location, MachineInstr::const_mop_iterator> + stackmapOperandParser(MachineInstr::const_mop_iterator MOI, + MachineInstr::const_mop_iterator MOE, + const TargetMachine &TM); + public: explicit X86AsmPrinter(TargetMachine &TM, MCStreamer &Streamer) - : AsmPrinter(TM, Streamer) { + : AsmPrinter(TM, Streamer), SM(*this, stackmapOperandParser) { Subtarget = &TM.getSubtarget<X86Subtarget>(); } @@ -67,11 +80,6 @@ class LLVM_LIBRARY_VISIBILITY X86AsmPrinter : public AsmPrinter { unsigned AsmVariant = 1); virtual bool runOnMachineFunction(MachineFunction &F) LLVM_OVERRIDE; - - void PrintDebugValueComment(const MachineInstr *MI, raw_ostream &OS); - - virtual MachineLocation - getDebugValueLocation(const MachineInstr *MI) const LLVM_OVERRIDE; }; } // end namespace llvm diff --git a/contrib/llvm/lib/Target/X86/X86CallingConv.h b/contrib/llvm/lib/Target/X86/X86CallingConv.h new file mode 100644 index 0000000..e76f9fd --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86CallingConv.h @@ -0,0 +1,35 @@ +//=== X86CallingConv.h - X86 Custom Calling Convention Routines -*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the custom routines for the X86 Calling Convention that +// aren't done by tablegen. +// +//===----------------------------------------------------------------------===// + +#ifndef X86CALLINGCONV_H +#define X86CALLINGCONV_H + +#include "llvm/CodeGen/CallingConvLower.h" +#include "llvm/IR/CallingConv.h" + +namespace llvm { + +inline bool CC_X86_AnyReg_Error(unsigned &, MVT &, MVT &, + CCValAssign::LocInfo &, ISD::ArgFlagsTy &, + CCState &) { + llvm_unreachable("The AnyReg calling convention is only supported by the " \ + "stackmap and patchpoint intrinsics."); + // gracefully fallback to X86 C calling convention on Release builds. 
+ return false; +} + +} // End llvm namespace + +#endif + diff --git a/contrib/llvm/lib/Target/X86/X86CallingConv.td b/contrib/llvm/lib/Target/X86/X86CallingConv.td index 40c5d91..a78b5c0 100644 --- a/contrib/llvm/lib/Target/X86/X86CallingConv.td +++ b/contrib/llvm/lib/Target/X86/X86CallingConv.td @@ -49,6 +49,12 @@ def RetCC_X86Common : CallingConv<[ CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], CCAssignToReg<[YMM0,YMM1,YMM2,YMM3]>>, + // 512-bit vectors are returned in ZMM0 and ZMM1, when they fit. ZMM2 and ZMM3 + // can only be used by ABI non-compliant code. This vector type is only + // supported while using the AVX-512 target feature. + CCIfType<[v16i32, v8i64, v16f32, v8f64], + CCAssignToReg<[ZMM0,ZMM1,ZMM2,ZMM3]>>, + // MMX vector types are always returned in MM0. If the target doesn't have // MM0, it doesn't support these vector types. CCIfType<[x86mmx], CCAssignToReg<[MM0]>>, @@ -99,6 +105,10 @@ def RetCC_Intel_OCL_BI : CallingConv<[ CCIfType<[v8f32, v4f64, v8i32, v4i64], CCAssignToReg<[YMM0,YMM1,YMM2,YMM3]>>, + // 512-bit FP vectors + CCIfType<[v16f32, v8f64, v16i32, v8i64], + CCAssignToReg<[ZMM0,ZMM1,ZMM2,ZMM3]>>, + // i32, i64 in the standard way CCDelegateTo<RetCC_X86Common> ]>; @@ -141,6 +151,26 @@ def RetCC_X86_64_HiPE : CallingConv<[ CCIfType<[i64], CCAssignToReg<[R15, RBP, RAX, RDX]>> ]>; +// X86-64 WebKit_JS return-value convention. +def RetCC_X86_64_WebKit_JS : CallingConv<[ + // Promote all types to i64 + CCIfType<[i8, i16, i32], CCPromoteToType<i64>>, + + // Return: RAX + CCIfType<[i64], CCAssignToReg<[RAX]>> +]>; + +// X86-64 AnyReg return-value convention. No explicit register is specified for +// the return-value. The register allocator is allowed and expected to choose +// any free register. +// +// This calling convention is currently only supported by the stackmap and +// patchpoint intrinsics. All other uses will result in an assert on Debug +// builds. On Release builds we fallback to the X86 C calling convention. +def RetCC_X86_64_AnyReg : CallingConv<[ + CCCustom<"CC_X86_AnyReg_Error"> +]>; + // This is the root return-value convention for the X86-32 backend. def RetCC_X86_32 : CallingConv<[ // If FastCC, use RetCC_X86_32_Fast. @@ -157,6 +187,10 @@ def RetCC_X86_64 : CallingConv<[ // HiPE uses RetCC_X86_64_HiPE CCIfCC<"CallingConv::HiPE", CCDelegateTo<RetCC_X86_64_HiPE>>, + // Handle JavaScript calls. + CCIfCC<"CallingConv::WebKit_JS", CCDelegateTo<RetCC_X86_64_WebKit_JS>>, + CCIfCC<"CallingConv::AnyReg", CCDelegateTo<RetCC_X86_64_AnyReg>>, + // Handle explicit CC selection CCIfCC<"CallingConv::X86_64_Win64", CCDelegateTo<RetCC_X86_Win64_C>>, CCIfCC<"CallingConv::X86_64_SysV", CCDelegateTo<RetCC_X86_64_C>>, @@ -213,10 +247,15 @@ def CC_X86_64_C : CallingConv<[ // fixed arguments to vararg functions are supposed to be passed in // registers. Actually modeling that would be a lot of work, though. CCIfNotVarArg<CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], - CCIfSubtarget<"hasAVX()", + CCIfSubtarget<"hasFp256()", CCAssignToReg<[YMM0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7]>>>>, + // The first 8 512-bit vector arguments are passed in ZMM registers. + CCIfNotVarArg<CCIfType<[v16i32, v8i64, v16f32, v8f64], + CCIfSubtarget<"hasAVX512()", + CCAssignToReg<[ZMM0, ZMM1, ZMM2, ZMM3, ZMM4, ZMM5, ZMM6, ZMM7]>>>>, + // Integer/FP values get stored in stack slots that are 8 bytes in size and // 8-byte aligned if there are no more registers to hold them. 
CCIfType<[i32, i64, f32, f64], CCAssignToStack<8, 8>>, @@ -230,7 +269,11 @@ def CC_X86_64_C : CallingConv<[ // 256-bit vectors get 32-byte stack slots that are 32-byte aligned. CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], - CCAssignToStack<32, 32>> + CCAssignToStack<32, 32>>, + + // 512-bit vectors get 64-byte stack slots that are 64-byte aligned. + CCIfType<[v16i32, v8i64, v16f32, v8f64], + CCAssignToStack<64, 64>> ]>; // Calling convention used on Win64 @@ -251,16 +294,19 @@ def CC_X86_Win64_C : CallingConv<[ // 256 bit vectors are passed by pointer CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], CCPassIndirect<i64>>, + // 512 bit vectors are passed by pointer + CCIfType<[v16i32, v16f32, v8f64, v8i64], CCPassIndirect<i64>>, + // The first 4 MMX vector arguments are passed in GPRs. CCIfType<[x86mmx], CCBitConvertToType<i64>>, // The first 4 integer arguments are passed in integer registers. CCIfType<[i32], CCAssignToRegWithShadow<[ECX , EDX , R8D , R9D ], [XMM0, XMM1, XMM2, XMM3]>>, - + // Do not pass the sret argument in RCX, the Win64 thiscall calling - // convention requires "this" to be passed in RCX. - CCIfCC<"CallingConv::X86_ThisCall", + // convention requires "this" to be passed in RCX. + CCIfCC<"CallingConv::X86_ThisCall", CCIfSRet<CCIfType<[i64], CCAssignToRegWithShadow<[RDX , R8 , R9 ], [XMM1, XMM2, XMM3]>>>>, @@ -307,6 +353,25 @@ def CC_X86_64_HiPE : CallingConv<[ CCIfType<[i32, i64, f32, f64], CCAssignToStack<8, 8>> ]>; +def CC_X86_64_WebKit_JS : CallingConv<[ + // Promote i8/i16 arguments to i32. + CCIfType<[i8, i16], CCPromoteToType<i32>>, + + // Integer/FP values are always stored in stack slots that are 8 bytes in size + // and 8-byte aligned. + CCIfType<[i32, i64, f32, f64], CCAssignToStack<8, 8>> +]>; + +// No explicit register is specified for the AnyReg calling convention. The +// register allocator may assign the arguments to any free register. +// +// This calling convention is currently only supported by the stackmap and +// patchpoint intrinsics. All other uses will result in an assert on Debug +// builds. On Release builds we fallback to the X86 C calling convention. +def CC_X86_64_AnyReg : CallingConv<[ + CCCustom<"CC_X86_AnyReg_Error"> +]>; + //===----------------------------------------------------------------------===// // X86 C Calling Convention //===----------------------------------------------------------------------===// @@ -332,7 +397,7 @@ def CC_X86_32_Common : CallingConv<[ // Integer/Float values get stored in stack slots that are 4 bytes in // size and 4-byte aligned. CCIfType<[i32, f32], CCAssignToStack<4, 4>>, - + // Doubles get 8-byte slots that are 4-byte aligned. CCIfType<[f64], CCAssignToStack<8, 4>>, @@ -345,7 +410,7 @@ def CC_X86_32_Common : CallingConv<[ // The first 4 AVX 256-bit vector arguments are passed in YMM registers. CCIfNotVarArg<CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], - CCIfSubtarget<"hasAVX()", + CCIfSubtarget<"hasFp256()", CCAssignToReg<[YMM0, YMM1, YMM2, YMM3]>>>>, // Other SSE vectors get 16-byte stack slots that are 16-byte aligned. @@ -469,6 +534,10 @@ def CC_Intel_OCL_BI : CallingConv<[ CCIfType<[v8f32, v4f64, v8i32, v4i64], CCAssignToReg<[YMM0, YMM1, YMM2, YMM3]>>, + // The 512-bit vector arguments are passed in ZMM registers. 
+ CCIfType<[v16f32, v8f64, v16i32, v8i64], + CCAssignToReg<[ZMM0, ZMM1, ZMM2, ZMM3]>>, + CCIfSubtarget<"isTargetWin64()", CCDelegateTo<CC_X86_Win64_C>>, CCIfSubtarget<"is64Bit()", CCDelegateTo<CC_X86_64_C>>, CCDelegateTo<CC_X86_32_C> @@ -494,6 +563,8 @@ def CC_X86_32 : CallingConv<[ def CC_X86_64 : CallingConv<[ CCIfCC<"CallingConv::GHC", CCDelegateTo<CC_X86_64_GHC>>, CCIfCC<"CallingConv::HiPE", CCDelegateTo<CC_X86_64_HiPE>>, + CCIfCC<"CallingConv::WebKit_JS", CCDelegateTo<CC_X86_64_WebKit_JS>>, + CCIfCC<"CallingConv::AnyReg", CCDelegateTo<CC_X86_64_AnyReg>>, CCIfCC<"CallingConv::X86_64_Win64", CCDelegateTo<CC_X86_Win64_C>>, CCIfCC<"CallingConv::X86_64_SysV", CCDelegateTo<CC_X86_64_C>>, @@ -532,9 +603,13 @@ def CSR_MostRegs_64 : CalleeSavedRegs<(add RBX, RCX, RDX, RSI, RDI, R8, R9, R10, // Standard C + YMM6-15 def CSR_Win64_Intel_OCL_BI_AVX : CalleeSavedRegs<(add RBX, RBP, RDI, RSI, R12, - R13, R14, R15, + R13, R14, R15, (sequence "YMM%u", 6, 15))>; +def CSR_Win64_Intel_OCL_BI_AVX512 : CalleeSavedRegs<(add RBX, RBP, RDI, RSI, + R12, R13, R14, R15, + (sequence "ZMM%u", 6, 21), + K4, K5, K6, K7)>; //Standard C + XMM 8-15 def CSR_64_Intel_OCL_BI : CalleeSavedRegs<(add CSR_64, (sequence "XMM%u", 8, 15))>; @@ -542,3 +617,7 @@ def CSR_64_Intel_OCL_BI : CalleeSavedRegs<(add CSR_64, //Standard C + YMM 8-15 def CSR_64_Intel_OCL_BI_AVX : CalleeSavedRegs<(add CSR_64, (sequence "YMM%u", 8, 15))>; + +def CSR_64_Intel_OCL_BI_AVX512 : CalleeSavedRegs<(add CSR_64, + (sequence "ZMM%u", 16, 31), + K4, K5, K6, K7)>; diff --git a/contrib/llvm/lib/Target/X86/X86CodeEmitter.cpp b/contrib/llvm/lib/Target/X86/X86CodeEmitter.cpp index 8fea6ed..14385ed 100644 --- a/contrib/llvm/lib/Target/X86/X86CodeEmitter.cpp +++ b/contrib/llvm/lib/Target/X86/X86CodeEmitter.cpp @@ -53,13 +53,8 @@ namespace { static char ID; explicit Emitter(X86TargetMachine &tm, CodeEmitter &mce) : MachineFunctionPass(ID), II(0), TD(0), TM(tm), - MCE(mce), PICBaseOffset(0), Is64BitMode(false), - IsPIC(TM.getRelocationModel() == Reloc::PIC_) {} - Emitter(X86TargetMachine &tm, CodeEmitter &mce, - const X86InstrInfo &ii, const DataLayout &td, bool is64) - : MachineFunctionPass(ID), II(&ii), TD(&td), TM(tm), - MCE(mce), PICBaseOffset(0), Is64BitMode(is64), - IsPIC(TM.getRelocationModel() == Reloc::PIC_) {} + MCE(mce), PICBaseOffset(0), Is64BitMode(false), + IsPIC(TM.getRelocationModel() == Reloc::PIC_) {} bool runOnMachineFunction(MachineFunction &MF); @@ -845,7 +840,7 @@ void Emitter<CodeEmitter>::emitVEXOpcodePrefix(uint64_t TSFlags, unsigned char VEX_W = 0; // XOP: Use XOP prefix byte 0x8f instead of VEX. 
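The new XOPA case above (and its twin in the MC emitter earlier in this patch) extends the map-select handling to XOP's 0Ah opcode map. For reference, the sketch below lists the m-mmmm/map values involved and restates the condition under which the short two-byte C5 VEX form can be used instead of the three-byte C4/8F form; the enum and helper are illustrative, not LLVM definitions.

#include <cstdint>

// VEX.m-mmmm / XOP map-select values as used above. 0x1-0x3 imply the
// 0F / 0F 38 / 0F 3A escapes; 0x8-0xA are the XOP maps (08h: imm8,
// 09h: no immediate, 0Ah: imm32, e.g. the TBM BEXTRI form).
enum VEXMapSelect : uint8_t {
  VEX_MAP_0F   = 0x1,
  VEX_MAP_0F38 = 0x2,
  VEX_MAP_0F3A = 0x3,
  XOP_MAP_8    = 0x8,
  XOP_MAP_9    = 0x9,
  XOP_MAP_A    = 0xA,
};

// The compact C5 form only carries R, vvvv, L and pp, so it is usable only
// when X and B are still at their default (inverted) value of 1, W is 0,
// the prefix is not XOP, and the map is the plain 0F escape.
bool canUseTwoByteVEX(bool VEX_X, bool VEX_B, bool VEX_W, bool IsXOP,
                      uint8_t MapSelect) {
  return VEX_X && VEX_B && !VEX_W && !IsXOP && MapSelect == VEX_MAP_0F;
}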
- unsigned char XOP = 0; + bool XOP = false; // VEX_5M (VEX m-mmmmm field): // @@ -855,7 +850,8 @@ void Emitter<CodeEmitter>::emitVEXOpcodePrefix(uint64_t TSFlags, // 0b00011: implied 0F 3A leading opcode bytes // 0b00100-0b11111: Reserved for future use // 0b01000: XOP map select - 08h instructions with imm byte - // 0b10001: XOP map select - 09h instructions with no imm byte + // 0b01001: XOP map select - 09h instructions with no imm byte + // 0b01010: XOP map select - 0Ah instructions with imm dword unsigned char VEX_5M = 0x1; // VEX_4V (VEX vvvv field): a register specifier @@ -887,7 +883,7 @@ void Emitter<CodeEmitter>::emitVEXOpcodePrefix(uint64_t TSFlags, VEX_W = 1; if ((TSFlags >> X86II::VEXShift) & X86II::XOP) - XOP = 1; + XOP = true; if ((TSFlags >> X86II::VEXShift) & X86II::VEX_L) VEX_L = 1; @@ -924,11 +920,11 @@ void Emitter<CodeEmitter>::emitVEXOpcodePrefix(uint64_t TSFlags, case X86II::XOP9: VEX_5M = 0x9; break; - case X86II::A6: // Bypass: Not used by VEX - case X86II::A7: // Bypass: Not used by VEX - case X86II::TB: // Bypass: Not used by VEX - case 0: - break; // No prefix! + case X86II::XOPA: + VEX_5M = 0xA; + break; + case X86II::TB: // VEX_5M/VEX_PP already correct + break; } @@ -987,11 +983,14 @@ void Emitter<CodeEmitter>::emitVEXOpcodePrefix(uint64_t TSFlags, // FMA4: // dst(ModR/M.reg), src1(VEX_4V), src2(ModR/M), src3(VEX_I8IMM) // dst(ModR/M.reg), src1(VEX_4V), src2(VEX_I8IMM), src3(ModR/M), - if (X86II::isX86_64ExtendedReg(MI.getOperand(0).getReg())) + if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg())) VEX_R = 0x0; + CurOp++; - if (HasVEX_4V) - VEX_4V = getVEXRegisterEncoding(MI, 1); + if (HasVEX_4V) { + VEX_4V = getVEXRegisterEncoding(MI, CurOp); + CurOp++; + } if (X86II::isX86_64ExtendedReg( MI.getOperand(MemOperand+X86::AddrBaseReg).getReg())) @@ -1001,7 +1000,7 @@ void Emitter<CodeEmitter>::emitVEXOpcodePrefix(uint64_t TSFlags, VEX_X = 0x0; if (HasVEX_4VOp3) - VEX_4V = getVEXRegisterEncoding(MI, X86::AddrNumOperands+1); + VEX_4V = getVEXRegisterEncoding(MI, CurOp+X86::AddrNumOperands); break; case X86II::MRM0m: case X86II::MRM1m: case X86II::MRM2m: case X86II::MRM3m: @@ -1011,7 +1010,7 @@ void Emitter<CodeEmitter>::emitVEXOpcodePrefix(uint64_t TSFlags, // MemAddr // src1(VEX_4V), MemAddr if (HasVEX_4V) - VEX_4V = getVEXRegisterEncoding(MI, 0); + VEX_4V = getVEXRegisterEncoding(MI, CurOp++); if (X86II::isX86_64ExtendedReg( MI.getOperand(MemOperand+X86::AddrBaseReg).getReg())) @@ -1064,8 +1063,10 @@ void Emitter<CodeEmitter>::emitVEXOpcodePrefix(uint64_t TSFlags, case X86II::MRM6r: case X86II::MRM7r: // MRM0r-MRM7r instructions forms: // dst(VEX_4V), src(ModR/M), imm8 - VEX_4V = getVEXRegisterEncoding(MI, 0); - if (X86II::isX86_64ExtendedReg(MI.getOperand(1).getReg())) + VEX_4V = getVEXRegisterEncoding(MI, CurOp); + CurOp++; + + if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg())) VEX_B = 0x0; break; default: // RawFrm @@ -1270,7 +1271,7 @@ void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI, unsigned rt = Is64BitMode ? X86::reloc_pcrel_word : (IsPIC ? X86::reloc_picrel_word : X86::reloc_absolute_word); - if (Opcode == X86::MOV64ri64i32) + if (Opcode == X86::MOV32ri64) rt = X86::reloc_absolute_word; // FIXME: add X86II flag? // This should not occur on Darwin for relocatable objects. 
if (Opcode == X86::MOV64ri) diff --git a/contrib/llvm/lib/Target/X86/X86FastISel.cpp b/contrib/llvm/lib/Target/X86/X86FastISel.cpp index 725d3fa..97f96ab 100644 --- a/contrib/llvm/lib/Target/X86/X86FastISel.cpp +++ b/contrib/llvm/lib/Target/X86/X86FastISel.cpp @@ -14,6 +14,7 @@ //===----------------------------------------------------------------------===// #include "X86.h" +#include "X86CallingConv.h" #include "X86ISelLowering.h" #include "X86InstrBuilder.h" #include "X86RegisterInfo.h" @@ -45,10 +46,6 @@ class X86FastISel : public FastISel { /// make the right decision when generating code for different targets. const X86Subtarget *Subtarget; - /// RegInfo - X86 register info. - /// - const X86RegisterInfo *RegInfo; - /// X86ScalarSSEf32, X86ScalarSSEf64 - Select between SSE or x87 /// floating point ops. /// When SSE is available, use it for f32 operations. @@ -63,7 +60,6 @@ public: Subtarget = &TM.getSubtarget<X86Subtarget>(); X86ScalarSSEf64 = Subtarget->hasSSE2(); X86ScalarSSEf32 = Subtarget->hasSSE1(); - RegInfo = static_cast<const X86RegisterInfo*>(TM.getRegisterInfo()); } virtual bool TargetSelectInstruction(const Instruction *I); @@ -84,8 +80,10 @@ private: bool X86FastEmitLoad(EVT VT, const X86AddressMode &AM, unsigned &RR); - bool X86FastEmitStore(EVT VT, const Value *Val, const X86AddressMode &AM); - bool X86FastEmitStore(EVT VT, unsigned Val, const X86AddressMode &AM); + bool X86FastEmitStore(EVT VT, const Value *Val, const X86AddressMode &AM, + bool Aligned = false); + bool X86FastEmitStore(EVT VT, unsigned ValReg, const X86AddressMode &AM, + bool Aligned = false); bool X86FastEmitExtend(ISD::NodeType Opc, EVT DstVT, unsigned Src, EVT SrcVT, unsigned &ResultReg); @@ -128,6 +126,8 @@ private: return static_cast<const X86TargetMachine *>(&TM); } + bool handleConstantAddresses(const Value *V, X86AddressMode &AM); + unsigned TargetMaterializeConstant(const Constant *C); unsigned TargetMaterializeAlloca(const AllocaInst *C); @@ -238,7 +238,8 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, const X86AddressMode &AM, /// and a displacement offset, or a GlobalAddress, /// i.e. V. Return true if it is possible. bool -X86FastISel::X86FastEmitStore(EVT VT, unsigned Val, const X86AddressMode &AM) { +X86FastISel::X86FastEmitStore(EVT VT, unsigned ValReg, + const X86AddressMode &AM, bool Aligned) { // Get opcode and regclass of the output for the given store instruction. unsigned Opc = 0; switch (VT.getSimpleVT().SimpleTy) { @@ -248,8 +249,8 @@ X86FastISel::X86FastEmitStore(EVT VT, unsigned Val, const X86AddressMode &AM) { // Mask out all but lowest bit. unsigned AndResult = createResultReg(&X86::GR8RegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, - TII.get(X86::AND8ri), AndResult).addReg(Val).addImm(1); - Val = AndResult; + TII.get(X86::AND8ri), AndResult).addReg(ValReg).addImm(1); + ValReg = AndResult; } // FALLTHROUGH, handling i1 as i8. case MVT::i8: Opc = X86::MOV8mr; break; @@ -265,26 +266,35 @@ X86FastISel::X86FastEmitStore(EVT VT, unsigned Val, const X86AddressMode &AM) { (Subtarget->hasAVX() ? X86::VMOVSDmr : X86::MOVSDmr) : X86::ST_Fp64m; break; case MVT::v4f32: - Opc = X86::MOVAPSmr; + if (Aligned) + Opc = Subtarget->hasAVX() ? X86::VMOVAPSmr : X86::MOVAPSmr; + else + Opc = Subtarget->hasAVX() ? X86::VMOVUPSmr : X86::MOVUPSmr; break; case MVT::v2f64: - Opc = X86::MOVAPDmr; + if (Aligned) + Opc = Subtarget->hasAVX() ? X86::VMOVAPDmr : X86::MOVAPDmr; + else + Opc = Subtarget->hasAVX() ? 
X86::VMOVUPDmr : X86::MOVUPDmr; break; case MVT::v4i32: case MVT::v2i64: case MVT::v8i16: case MVT::v16i8: - Opc = X86::MOVDQAmr; + if (Aligned) + Opc = Subtarget->hasAVX() ? X86::VMOVDQAmr : X86::MOVDQAmr; + else + Opc = Subtarget->hasAVX() ? X86::VMOVDQUmr : X86::MOVDQUmr; break; } addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, - DL, TII.get(Opc)), AM).addReg(Val); + DL, TII.get(Opc)), AM).addReg(ValReg); return true; } bool X86FastISel::X86FastEmitStore(EVT VT, const Value *Val, - const X86AddressMode &AM) { + const X86AddressMode &AM, bool Aligned) { // Handle 'null' like i32/i64 0. if (isa<ConstantPointerNull>(Val)) Val = Constant::getNullValue(TD.getIntPtrType(Val->getContext())); @@ -319,7 +329,7 @@ bool X86FastISel::X86FastEmitStore(EVT VT, const Value *Val, if (ValReg == 0) return false; - return X86FastEmitStore(VT, ValReg, AM); + return X86FastEmitStore(VT, ValReg, AM, Aligned); } /// X86FastEmitExtend - Emit a machine instruction to extend a value Src of @@ -337,9 +347,126 @@ bool X86FastISel::X86FastEmitExtend(ISD::NodeType Opc, EVT DstVT, return true; } +bool X86FastISel::handleConstantAddresses(const Value *V, X86AddressMode &AM) { + // Handle constant address. + if (const GlobalValue *GV = dyn_cast<GlobalValue>(V)) { + // Can't handle alternate code models yet. + if (TM.getCodeModel() != CodeModel::Small) + return false; + + // Can't handle TLS yet. + if (const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV)) + if (GVar->isThreadLocal()) + return false; + + // Can't handle TLS yet, part 2 (this is slightly crazy, but this is how + // it works...). + if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV)) + if (const GlobalVariable *GVar = + dyn_cast_or_null<GlobalVariable>(GA->resolveAliasedGlobal(false))) + if (GVar->isThreadLocal()) + return false; + + // RIP-relative addresses can't have additional register operands, so if + // we've already folded stuff into the addressing mode, just force the + // global value into its own register, which we can use as the basereg. + if (!Subtarget->isPICStyleRIPRel() || + (AM.Base.Reg == 0 && AM.IndexReg == 0)) { + // Okay, we've committed to selecting this global. Set up the address. + AM.GV = GV; + + // Allow the subtarget to classify the global. + unsigned char GVFlags = Subtarget->ClassifyGlobalReference(GV, TM); + + // If this reference is relative to the pic base, set it now. + if (isGlobalRelativeToPICBase(GVFlags)) { + // FIXME: How do we know Base.Reg is free?? + AM.Base.Reg = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF); + } + + // Unless the ABI requires an extra load, return a direct reference to + // the global. + if (!isGlobalStubReference(GVFlags)) { + if (Subtarget->isPICStyleRIPRel()) { + // Use rip-relative addressing if we can. Above we verified that the + // base and index registers are unused. + assert(AM.Base.Reg == 0 && AM.IndexReg == 0); + AM.Base.Reg = X86::RIP; + } + AM.GVOpFlags = GVFlags; + return true; + } + + // Ok, we need to do a load from a stub. If we've already loaded from + // this stub, reuse the loaded pointer, otherwise emit the load now. + DenseMap<const Value*, unsigned>::iterator I = LocalValueMap.find(V); + unsigned LoadReg; + if (I != LocalValueMap.end() && I->second != 0) { + LoadReg = I->second; + } else { + // Issue load from stub. + unsigned Opc = 0; + const TargetRegisterClass *RC = NULL; + X86AddressMode StubAM; + StubAM.Base.Reg = AM.Base.Reg; + StubAM.GV = GV; + StubAM.GVOpFlags = GVFlags; + + // Prepare for inserting code in the local-value area. 
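The X86FastEmitStore changes above pick aligned or unaligned vector store opcodes based on the new Aligned flag and on AVX availability. A hedged sketch of that selection as a standalone helper working on mnemonic strings, not LLVM's opcode enums.

#include <string>

enum class VecTy { V4F32, V2F64, VInt128 }; // v4i32/v2i64/v8i16/v16i8 share one opcode

// MOVA* when the store is ABI-aligned, MOVU* otherwise, with a 'V' prefix
// for the VEX-encoded forms used when AVX is available.
std::string vectorStoreMnemonic(VecTy Ty, bool Aligned, bool HasAVX) {
  std::string Base;
  switch (Ty) {
  case VecTy::V4F32:   Base = Aligned ? "MOVAPS" : "MOVUPS"; break;
  case VecTy::V2F64:   Base = Aligned ? "MOVAPD" : "MOVUPD"; break;
  case VecTy::VInt128: Base = Aligned ? "MOVDQA" : "MOVDQU"; break;
  }
  return (HasAVX ? "V" + Base : Base) + "mr";
}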
+ SavePoint SaveInsertPt = enterLocalValueArea(); + + if (TLI.getPointerTy() == MVT::i64) { + Opc = X86::MOV64rm; + RC = &X86::GR64RegClass; + + if (Subtarget->isPICStyleRIPRel()) + StubAM.Base.Reg = X86::RIP; + } else { + Opc = X86::MOV32rm; + RC = &X86::GR32RegClass; + } + + LoadReg = createResultReg(RC); + MachineInstrBuilder LoadMI = + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), LoadReg); + addFullAddress(LoadMI, StubAM); + + // Ok, back to normal mode. + leaveLocalValueArea(SaveInsertPt); + + // Prevent loading GV stub multiple times in same MBB. + LocalValueMap[V] = LoadReg; + } + + // Now construct the final address. Note that the Disp, Scale, + // and Index values may already be set here. + AM.Base.Reg = LoadReg; + AM.GV = 0; + return true; + } + } + + // If all else fails, try to materialize the value in a register. + if (!AM.GV || !Subtarget->isPICStyleRIPRel()) { + if (AM.Base.Reg == 0) { + AM.Base.Reg = getRegForValue(V); + return AM.Base.Reg != 0; + } + if (AM.IndexReg == 0) { + assert(AM.Scale == 1 && "Scale with no index!"); + AM.IndexReg = getRegForValue(V); + return AM.IndexReg != 0; + } + } + + return false; +} + /// X86SelectAddress - Attempt to fill in an address from the given value. /// bool X86FastISel::X86SelectAddress(const Value *V, X86AddressMode &AM) { + SmallVector<const Value *, 32> GEPs; +redo_gep: const User *U = NULL; unsigned Opcode = Instruction::UserOp1; if (const Instruction *I = dyn_cast<Instruction>(V)) { @@ -434,13 +561,8 @@ bool X86FastISel::X86SelectAddress(const Value *V, X86AddressMode &AM) { Disp += CI->getSExtValue() * S; break; } - if (isa<AddOperator>(Op) && - (!isa<Instruction>(Op) || - FuncInfo.MBBMap[cast<Instruction>(Op)->getParent()] - == FuncInfo.MBB) && - isa<ConstantInt>(cast<AddOperator>(Op)->getOperand(1))) { - // An add (in the same block) with a constant operand. Fold the - // constant. + if (canFoldAddIntoGEP(U, Op)) { + // A compatible add with a constant operand. Fold the constant. ConstantInt *CI = cast<ConstantInt>(cast<AddOperator>(Op)->getOperand(1)); Disp += CI->getSExtValue() * S; @@ -462,139 +584,43 @@ bool X86FastISel::X86SelectAddress(const Value *V, X86AddressMode &AM) { goto unsupported_gep; } } + // Check for displacement overflow. if (!isInt<32>(Disp)) break; - // Ok, the GEP indices were covered by constant-offset and scaled-index - // addressing. Update the address state and move on to examining the base. + AM.IndexReg = IndexReg; AM.Scale = Scale; AM.Disp = (uint32_t)Disp; - if (X86SelectAddress(U->getOperand(0), AM)) + GEPs.push_back(V); + + if (const GetElementPtrInst *GEP = + dyn_cast<GetElementPtrInst>(U->getOperand(0))) { + // Ok, the GEP indices were covered by constant-offset and scaled-index + // addressing. Update the address state and move on to examining the base. + V = GEP; + goto redo_gep; + } else if (X86SelectAddress(U->getOperand(0), AM)) { return true; + } // If we couldn't merge the gep value into this addr mode, revert back to // our address and just match the value instead of completely failing. AM = SavedAM; - break; - unsupported_gep: - // Ok, the GEP indices weren't all covered. - break; - } - } - - // Handle constant address. - if (const GlobalValue *GV = dyn_cast<GlobalValue>(V)) { - // Can't handle alternate code models yet. - if (TM.getCodeModel() != CodeModel::Small) - return false; - - // Can't handle TLS yet. 
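The GEP-folding path above accumulates constant offsets into the displacement and bails out once the total no longer fits the 32-bit displacement field (the isInt<32>(Disp) check). A minimal standalone model of that rule; the struct is a sketch, not the LLVM X86AddressMode.

#include <cstdint>

// base + index*scale + disp32: the displacement is the only component with a
// fixed signed 32-bit range, so constant GEP offsets can only be folded while
// the running total still fits.
struct AddrModeSketch {
  int64_t Disp = 0;
};

bool foldConstantOffset(AddrModeSketch &AM, int64_t ScaledOffset) {
  int64_t NewDisp = AM.Disp + ScaledOffset;
  if (NewDisp != static_cast<int32_t>(NewDisp))
    return false;                 // would overflow the disp32 field
  AM.Disp = NewDisp;
  return true;
}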
- if (const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV)) - if (GVar->isThreadLocal()) - return false; - // Can't handle TLS yet, part 2 (this is slightly crazy, but this is how - // it works...). - if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV)) - if (const GlobalVariable *GVar = - dyn_cast_or_null<GlobalVariable>(GA->resolveAliasedGlobal(false))) - if (GVar->isThreadLocal()) - return false; - - // RIP-relative addresses can't have additional register operands, so if - // we've already folded stuff into the addressing mode, just force the - // global value into its own register, which we can use as the basereg. - if (!Subtarget->isPICStyleRIPRel() || - (AM.Base.Reg == 0 && AM.IndexReg == 0)) { - // Okay, we've committed to selecting this global. Set up the address. - AM.GV = GV; - - // Allow the subtarget to classify the global. - unsigned char GVFlags = Subtarget->ClassifyGlobalReference(GV, TM); - - // If this reference is relative to the pic base, set it now. - if (isGlobalRelativeToPICBase(GVFlags)) { - // FIXME: How do we know Base.Reg is free?? - AM.Base.Reg = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF); - } - - // Unless the ABI requires an extra load, return a direct reference to - // the global. - if (!isGlobalStubReference(GVFlags)) { - if (Subtarget->isPICStyleRIPRel()) { - // Use rip-relative addressing if we can. Above we verified that the - // base and index registers are unused. - assert(AM.Base.Reg == 0 && AM.IndexReg == 0); - AM.Base.Reg = X86::RIP; - } - AM.GVOpFlags = GVFlags; + for (SmallVectorImpl<const Value *>::reverse_iterator + I = GEPs.rbegin(), E = GEPs.rend(); I != E; ++I) + if (handleConstantAddresses(*I, AM)) return true; - } - - // Ok, we need to do a load from a stub. If we've already loaded from - // this stub, reuse the loaded pointer, otherwise emit the load now. - DenseMap<const Value*, unsigned>::iterator I = LocalValueMap.find(V); - unsigned LoadReg; - if (I != LocalValueMap.end() && I->second != 0) { - LoadReg = I->second; - } else { - // Issue load from stub. - unsigned Opc = 0; - const TargetRegisterClass *RC = NULL; - X86AddressMode StubAM; - StubAM.Base.Reg = AM.Base.Reg; - StubAM.GV = GV; - StubAM.GVOpFlags = GVFlags; - - // Prepare for inserting code in the local-value area. - SavePoint SaveInsertPt = enterLocalValueArea(); - - if (TLI.getPointerTy() == MVT::i64) { - Opc = X86::MOV64rm; - RC = &X86::GR64RegClass; - - if (Subtarget->isPICStyleRIPRel()) - StubAM.Base.Reg = X86::RIP; - } else { - Opc = X86::MOV32rm; - RC = &X86::GR32RegClass; - } - - LoadReg = createResultReg(RC); - MachineInstrBuilder LoadMI = - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), LoadReg); - addFullAddress(LoadMI, StubAM); - // Ok, back to normal mode. - leaveLocalValueArea(SaveInsertPt); - - // Prevent loading GV stub multiple times in same MBB. - LocalValueMap[V] = LoadReg; - } - - // Now construct the final address. Note that the Disp, Scale, - // and Index values may already be set here. - AM.Base.Reg = LoadReg; - AM.GV = 0; - return true; - } + return false; + unsupported_gep: + // Ok, the GEP indices weren't all covered. + break; } - - // If all else fails, try to materialize the value in a register. 
- if (!AM.GV || !Subtarget->isPICStyleRIPRel()) { - if (AM.Base.Reg == 0) { - AM.Base.Reg = getRegForValue(V); - return AM.Base.Reg != 0; - } - if (AM.IndexReg == 0) { - assert(AM.Scale == 1 && "Scale with no index!"); - AM.IndexReg = getRegForValue(V); - return AM.IndexReg != 0; - } } - return false; + return handleConstantAddresses(V, AM); } /// X86SelectCallAddress - Attempt to fill in an address from the given value. @@ -602,9 +628,35 @@ bool X86FastISel::X86SelectAddress(const Value *V, X86AddressMode &AM) { bool X86FastISel::X86SelectCallAddress(const Value *V, X86AddressMode &AM) { const User *U = NULL; unsigned Opcode = Instruction::UserOp1; - if (const Instruction *I = dyn_cast<Instruction>(V)) { + const Instruction *I = dyn_cast<Instruction>(V); + // Record if the value is defined in the same basic block. + // + // This information is crucial to know whether or not folding an + // operand is valid. + // Indeed, FastISel generates or reuses a virtual register for all + // operands of all instructions it selects. Obviously, the definition and + // its uses must use the same virtual register otherwise the produced + // code is incorrect. + // Before instruction selection, FunctionLoweringInfo::set sets the virtual + // registers for values that are alive across basic blocks. This ensures + // that the values are consistently set between across basic block, even + // if different instruction selection mechanisms are used (e.g., a mix of + // SDISel and FastISel). + // For values local to a basic block, the instruction selection process + // generates these virtual registers with whatever method is appropriate + // for its needs. In particular, FastISel and SDISel do not share the way + // local virtual registers are set. + // Therefore, this is impossible (or at least unsafe) to share values + // between basic blocks unless they use the same instruction selection + // method, which is not guarantee for X86. + // Moreover, things like hasOneUse could not be used accurately, if we + // allow to reference values across basic blocks whereas they are not + // alive across basic blocks initially. + bool InMBB = true; + if (I) { Opcode = I->getOpcode(); U = I; + InMBB = I->getParent() == FuncInfo.MBB->getBasicBlock(); } else if (const ConstantExpr *C = dyn_cast<ConstantExpr>(V)) { Opcode = C->getOpcode(); U = C; @@ -613,18 +665,22 @@ bool X86FastISel::X86SelectCallAddress(const Value *V, X86AddressMode &AM) { switch (Opcode) { default: break; case Instruction::BitCast: - // Look past bitcasts. - return X86SelectCallAddress(U->getOperand(0), AM); + // Look past bitcasts if its operand is in the same BB. + if (InMBB) + return X86SelectCallAddress(U->getOperand(0), AM); + break; case Instruction::IntToPtr: - // Look past no-op inttoptrs. - if (TLI.getValueType(U->getOperand(0)->getType()) == TLI.getPointerTy()) + // Look past no-op inttoptrs if its operand is in the same BB. + if (InMBB && + TLI.getValueType(U->getOperand(0)->getType()) == TLI.getPointerTy()) return X86SelectCallAddress(U->getOperand(0), AM); break; case Instruction::PtrToInt: - // Look past no-op ptrtoints. - if (TLI.getValueType(U->getType()) == TLI.getPointerTy()) + // Look past no-op ptrtoints if its operand is in the same BB. 
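The long comment above explains why call-address folding must not look through casts whose operands are defined in another basic block: FastISel and SDISel assign block-local virtual registers differently, so only same-block values are safe to fold. A toy illustration of the guard; the types are stand-ins, not LLVM classes.

struct BasicBlockSk {};
struct InstrSk { const BasicBlockSk *Parent; };

// Folding through a bitcast/inttoptr/ptrtoint is only attempted when the
// operand's defining instruction sits in the block currently being selected;
// otherwise the operand keeps its own cross-block virtual register.
bool mayLookThrough(const InstrSk *Def, const BasicBlockSk *CurrentBlock) {
  return Def != nullptr && Def->Parent == CurrentBlock;
}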
+ if (InMBB && + TLI.getValueType(U->getType()) == TLI.getPointerTy()) return X86SelectCallAddress(U->getOperand(0), AM); break; } @@ -693,6 +749,10 @@ bool X86FastISel::X86SelectStore(const Instruction *I) { if (S->isAtomic()) return false; + unsigned SABIAlignment = + TD.getABITypeAlignment(S->getValueOperand()->getType()); + bool Aligned = S->getAlignment() == 0 || S->getAlignment() >= SABIAlignment; + MVT VT; if (!isTypeLegal(I->getOperand(0)->getType(), VT, /*AllowI1=*/true)) return false; @@ -701,7 +761,7 @@ bool X86FastISel::X86SelectStore(const Instruction *I) { if (!X86SelectAddress(I->getOperand(1), AM)) return false; - return X86FastEmitStore(VT, I->getOperand(0), AM); + return X86FastEmitStore(VT, I->getOperand(0), AM, Aligned); } /// X86SelectRet - Select and emit code to implement ret instructions. @@ -1006,10 +1066,6 @@ bool X86FastISel::X86SelectCmp(const Instruction *I) { } bool X86FastISel::X86SelectZExt(const Instruction *I) { - // Handle zero-extension from i1 to i8, which is common. - if (!I->getOperand(0)->getType()->isIntegerTy(1)) - return false; - EVT DstVT = TLI.getValueType(I->getType()); if (!TLI.isTypeLegal(DstVT)) return false; @@ -1018,12 +1074,37 @@ bool X86FastISel::X86SelectZExt(const Instruction *I) { if (ResultReg == 0) return false; - // Set the high bits to zero. - ResultReg = FastEmitZExtFromI1(MVT::i8, ResultReg, /*TODO: Kill=*/false); - if (ResultReg == 0) - return false; + // Handle zero-extension from i1 to i8, which is common. + MVT SrcVT = TLI.getSimpleValueType(I->getOperand(0)->getType()); + if (SrcVT.SimpleTy == MVT::i1) { + // Set the high bits to zero. + ResultReg = FastEmitZExtFromI1(MVT::i8, ResultReg, /*TODO: Kill=*/false); + SrcVT = MVT::i8; + + if (ResultReg == 0) + return false; + } + + if (DstVT == MVT::i64) { + // Handle extension to 64-bits via sub-register shenanigans. 
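The i64 zero-extension path introduced here (and continued in the next hunk) relies on an x86-64 guarantee: writing a 32-bit register clears bits 63:32, so a 32-bit MOVZX followed by SUBREG_TO_REG already yields the 64-bit result. The same arithmetic on plain integers, as an illustration only.

#include <cstdint>

uint64_t zextI8ToI64ViaI32(uint8_t Src) {
  uint32_t Low = Src;                    // MOVZX32rr8: zero-extend i8 -> i32
  return static_cast<uint64_t>(Low);     // SUBREG_TO_REG: upper 32 bits known zero
}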
+ unsigned MovInst; + + switch (SrcVT.SimpleTy) { + case MVT::i8: MovInst = X86::MOVZX32rr8; break; + case MVT::i16: MovInst = X86::MOVZX32rr16; break; + case MVT::i32: MovInst = X86::MOV32rr; break; + default: llvm_unreachable("Unexpected zext to i64 source type"); + } + + unsigned Result32 = createResultReg(&X86::GR32RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(MovInst), Result32) + .addReg(ResultReg); - if (DstVT != MVT::i8) { + ResultReg = createResultReg(&X86::GR64RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::SUBREG_TO_REG), + ResultReg) + .addImm(0).addReg(Result32).addImm(X86::sub_32bit); + } else if (DstVT != MVT::i8) { ResultReg = FastEmit_r(MVT::i8, DstVT.getSimpleVT(), ISD::ZERO_EXTEND, ResultReg, /*Kill=*/true); if (ResultReg == 0) @@ -1274,8 +1355,8 @@ bool X86FastISel::X86SelectDivRem(const Instruction *I) { { &X86::GR16RegClass, X86::AX, X86::DX, { { X86::IDIV16r, X86::CWD, Copy, X86::AX, S }, // SDiv { X86::IDIV16r, X86::CWD, Copy, X86::DX, S }, // SRem - { X86::DIV16r, X86::MOV16r0, Copy, X86::AX, U }, // UDiv - { X86::DIV16r, X86::MOV16r0, Copy, X86::DX, U }, // URem + { X86::DIV16r, X86::MOV32r0, Copy, X86::AX, U }, // UDiv + { X86::DIV16r, X86::MOV32r0, Copy, X86::DX, U }, // URem } }, // i16 { &X86::GR32RegClass, X86::EAX, X86::EDX, { @@ -1288,8 +1369,8 @@ bool X86FastISel::X86SelectDivRem(const Instruction *I) { { &X86::GR64RegClass, X86::RAX, X86::RDX, { { X86::IDIV64r, X86::CQO, Copy, X86::RAX, S }, // SDiv { X86::IDIV64r, X86::CQO, Copy, X86::RDX, S }, // SRem - { X86::DIV64r, X86::MOV64r0, Copy, X86::RAX, U }, // UDiv - { X86::DIV64r, X86::MOV64r0, Copy, X86::RDX, U }, // URem + { X86::DIV64r, X86::MOV32r0, Copy, X86::RAX, U }, // UDiv + { X86::DIV64r, X86::MOV32r0, Copy, X86::RDX, U }, // URem } }, // i64 }; @@ -1335,17 +1416,63 @@ bool X86FastISel::X86SelectDivRem(const Instruction *I) { if (OpEntry.IsOpSigned) BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(OpEntry.OpSignExtend)); - else + else { + unsigned Zero32 = createResultReg(&X86::GR32RegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, - TII.get(OpEntry.OpSignExtend), TypeEntry.HighInReg); + TII.get(X86::MOV32r0), Zero32); + + // Copy the zero into the appropriate sub/super/identical physical + // register. Unfortunately the operations needed are not uniform enough to + // fit neatly into the table above. + if (VT.SimpleTy == MVT::i16) { + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(Copy), TypeEntry.HighInReg) + .addReg(Zero32, 0, X86::sub_16bit); + } else if (VT.SimpleTy == MVT::i32) { + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(Copy), TypeEntry.HighInReg) + .addReg(Zero32); + } else if (VT.SimpleTy == MVT::i64) { + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(TargetOpcode::SUBREG_TO_REG), TypeEntry.HighInReg) + .addImm(0).addReg(Zero32).addImm(X86::sub_32bit); + } + } } // Generate the DIV/IDIV instruction. BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(OpEntry.OpDivRem)).addReg(Op1Reg); - // Copy output register into result register. - unsigned ResultReg = createResultReg(TypeEntry.RC); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, - TII.get(Copy), ResultReg).addReg(OpEntry.DivRemResultReg); + // For i8 remainder, we can't reference AH directly, as we'll end + // up with bogus copies like %R9B = COPY %AH. Reference AX + // instead to prevent AH references in a REX instruction. 
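The DIV/IDIV rewrite above always materializes the zero with MOV32r0 and then copies it into DX/EDX/RDX as needed, because the hardware divides the double-width High:Low pair. A plain-C++ version of the unsigned 16-bit case, assuming a nonzero divisor; this is a sketch of the semantics, not the selected machine code.

#include <cstdint>

void udiv16(uint16_t AX, uint16_t Divisor, uint16_t &Quotient, uint16_t &Remainder) {
  uint32_t DXAX = (uint32_t{0} << 16) | AX;            // DX zeroed via the MOV32r0 copy
  Quotient  = static_cast<uint16_t>(DXAX / Divisor);   // DIV16r: quotient ends up in AX
  Remainder = static_cast<uint16_t>(DXAX % Divisor);   // ...and the remainder in DX
}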
+ // + // The current assumption of the fast register allocator is that isel + // won't generate explicit references to the GPR8_NOREX registers. If + // the allocator and/or the backend get enhanced to be more robust in + // that regard, this can be, and should be, removed. + unsigned ResultReg = 0; + if ((I->getOpcode() == Instruction::SRem || + I->getOpcode() == Instruction::URem) && + OpEntry.DivRemResultReg == X86::AH && Subtarget->is64Bit()) { + unsigned SourceSuperReg = createResultReg(&X86::GR16RegClass); + unsigned ResultSuperReg = createResultReg(&X86::GR16RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(Copy), SourceSuperReg).addReg(X86::AX); + + // Shift AX right by 8 bits instead of using AH. + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::SHR16ri), + ResultSuperReg).addReg(SourceSuperReg).addImm(8); + + // Now reference the 8-bit subreg of the result. + ResultReg = FastEmitInst_extractsubreg(MVT::i8, ResultSuperReg, + /*Kill=*/true, X86::sub_8bit); + } + // Copy the result out of the physreg if we haven't already. + if (!ResultReg) { + ResultReg = createResultReg(TypeEntry.RC); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Copy), ResultReg) + .addReg(OpEntry.DivRemResultReg); + } UpdateValueMap(I, ResultReg); return true; @@ -1698,8 +1825,6 @@ bool X86FastISel::FastLowerArguments() { const TargetRegisterClass *RC64 = TLI.getRegClassFor(MVT::i64); for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end(); I != E; ++I, ++Idx) { - if (I->use_empty()) - continue; bool is32Bit = TLI.getValueType(I->getType()) == MVT::i32; const TargetRegisterClass *RC = is32Bit ? RC32 : RC64; unsigned SrcReg = is32Bit ? GPR32ArgRegs[Idx] : GPR64ArgRegs[Idx]; @@ -1988,6 +2113,8 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) { } else { unsigned LocMemOffset = VA.getLocMemOffset(); X86AddressMode AM; + const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo*>( + getTargetMachine()->getRegisterInfo()); AM.Base.Reg = RegInfo->getStackRegister(); AM.Disp = LocMemOffset; const Value *ArgVal = ArgVals[VA.getValNo()]; diff --git a/contrib/llvm/lib/Target/X86/X86FixupLEAs.cpp b/contrib/llvm/lib/Target/X86/X86FixupLEAs.cpp index e8c88a3..38a8351 100644 --- a/contrib/llvm/lib/Target/X86/X86FixupLEAs.cpp +++ b/contrib/llvm/lib/Target/X86/X86FixupLEAs.cpp @@ -144,8 +144,8 @@ FunctionPass *llvm::createX86FixupLEAs() { bool FixupLEAPass::runOnMachineFunction(MachineFunction &Func) { MF = &Func; - TII = Func.getTarget().getInstrInfo(); TM = &MF->getTarget(); + TII = TM->getInstrInfo(); DEBUG(dbgs() << "Start X86FixupLEAs\n";); // Process all basic blocks. diff --git a/contrib/llvm/lib/Target/X86/X86FloatingPoint.cpp b/contrib/llvm/lib/Target/X86/X86FloatingPoint.cpp index 0585b43..48470da 100644 --- a/contrib/llvm/lib/Target/X86/X86FloatingPoint.cpp +++ b/contrib/llvm/lib/Target/X86/X86FloatingPoint.cpp @@ -115,9 +115,10 @@ namespace { unsigned Mask = 0; for (MachineBasicBlock::livein_iterator I = MBB->livein_begin(), E = MBB->livein_end(); I != E; ++I) { - unsigned Reg = *I - X86::FP0; - if (Reg < 8) - Mask |= 1 << Reg; + unsigned Reg = *I; + if (Reg < X86::FP0 || Reg > X86::FP6) + continue; + Mask |= 1 << (Reg - X86::FP0); } return Mask; } @@ -893,8 +894,8 @@ void FPS::adjustLiveRegs(unsigned Mask, MachineBasicBlock::iterator I) { // Produce implicit-defs for free by using killed registers. 
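The remainder fix above copies AX into a fresh 16-bit register, shifts it right by 8, and extracts the low byte, so AH is never named in a REX-prefixed context. The equivalent arithmetic, for illustration:

#include <cstdint>

uint8_t i8RemainderFromAX(uint16_t AX) {
  uint16_t Shifted = AX >> 8;              // SHR16ri $8 moves AH down into the low byte
  return static_cast<uint8_t>(Shifted);    // EXTRACT_SUBREG sub_8bit takes that byte
}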
while (Kills && Defs) { - unsigned KReg = CountTrailingZeros_32(Kills); - unsigned DReg = CountTrailingZeros_32(Defs); + unsigned KReg = countTrailingZeros(Kills); + unsigned DReg = countTrailingZeros(Defs); DEBUG(dbgs() << "Renaming %FP" << KReg << " as imp %FP" << DReg << "\n"); std::swap(Stack[getSlot(KReg)], Stack[getSlot(DReg)]); std::swap(RegMap[KReg], RegMap[DReg]); @@ -917,7 +918,7 @@ void FPS::adjustLiveRegs(unsigned Mask, MachineBasicBlock::iterator I) { // Manually kill the rest. while (Kills) { - unsigned KReg = CountTrailingZeros_32(Kills); + unsigned KReg = countTrailingZeros(Kills); DEBUG(dbgs() << "Killing %FP" << KReg << "\n"); freeStackSlotBefore(I, KReg); Kills &= ~(1 << KReg); @@ -925,7 +926,7 @@ void FPS::adjustLiveRegs(unsigned Mask, MachineBasicBlock::iterator I) { // Load zeros for all the imp-defs. while(Defs) { - unsigned DReg = CountTrailingZeros_32(Defs); + unsigned DReg = countTrailingZeros(Defs); DEBUG(dbgs() << "Defining %FP" << DReg << " as 0\n"); BuildMI(*MBB, I, DebugLoc(), TII->get(X86::LD_F0)); pushReg(DReg); @@ -1636,7 +1637,7 @@ void FPS::handleSpecialFP(MachineBasicBlock::iterator &I) { // Note: this might be a non-optimal pop sequence. We might be able to do // better by trying to pop in stack order or something. while (FPKills) { - unsigned FPReg = CountTrailingZeros_32(FPKills); + unsigned FPReg = countTrailingZeros(FPKills); if (isLive(FPReg)) freeStackSlotAfter(InsertPt, FPReg); FPKills &= ~(1U << FPReg); @@ -1661,6 +1662,7 @@ void FPS::handleSpecialFP(MachineBasicBlock::iterator &I) { BuildMI(*MBB, I, MI->getDebugLoc(), TII->get(X86::CALLpcrel32)) .addExternalSymbol("_ftol2") .addReg(X86::ST0, RegState::ImplicitKill) + .addReg(X86::ECX, RegState::ImplicitDefine) .addReg(X86::EAX, RegState::Define | RegState::Implicit) .addReg(X86::EDX, RegState::Define | RegState::Implicit) .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit); diff --git a/contrib/llvm/lib/Target/X86/X86FrameLowering.cpp b/contrib/llvm/lib/Target/X86/X86FrameLowering.cpp index 42b4e73..a06ba9d 100644 --- a/contrib/llvm/lib/Target/X86/X86FrameLowering.cpp +++ b/contrib/llvm/lib/Target/X86/X86FrameLowering.cpp @@ -307,12 +307,12 @@ void X86FrameLowering::emitCalleeSavedFrameMoves(MachineFunction &MF, unsigned FramePtr) const { MachineFrameInfo *MFI = MF.getFrameInfo(); MachineModuleInfo &MMI = MF.getMMI(); + const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo(); // Add callee saved registers to move list. const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo(); if (CSI.empty()) return; - std::vector<MachineMove> &Moves = MMI.getFrameMoves(); const X86RegisterInfo *RegInfo = TM.getRegisterInfo(); bool HasFP = hasFP(MF); @@ -360,276 +360,18 @@ void X86FrameLowering::emitCalleeSavedFrameMoves(MachineFunction &MF, if (HasFP && FramePtr == Reg) continue; - MachineLocation CSDst(MachineLocation::VirtualFP, Offset); - MachineLocation CSSrc(Reg); - Moves.push_back(MachineMove(Label, CSDst, CSSrc)); + unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true); + MMI.addFrameInst(MCCFIInstruction::createOffset(Label, DwarfReg, Offset)); } } -/// getCompactUnwindRegNum - Get the compact unwind number for a given -/// register. The number corresponds to the enum lists in -/// compact_unwind_encoding.h. 
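The CountTrailingZeros_32 to countTrailingZeros renames above all drive the same pattern: walk the set bits of a register mask from lowest to highest, clearing each bit as it is handled. A portable sketch; LLVM uses its MathExtras helpers rather than this loop.

#include <cstdint>

unsigned ctz32(uint32_t V) {          // assumes V != 0
  unsigned N = 0;
  while (!(V & 1)) { V >>= 1; ++N; }
  return N;
}

template <typename Fn>
void forEachSetBit(uint32_t Mask, Fn Visit) {
  while (Mask) {
    unsigned Bit = ctz32(Mask);        // lowest live %FP register / pending kill
    Visit(Bit);
    Mask &= ~(1u << Bit);              // clear it and continue
  }
}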
-static int getCompactUnwindRegNum(unsigned Reg, bool is64Bit) { - static const uint16_t CU32BitRegs[] = { - X86::EBX, X86::ECX, X86::EDX, X86::EDI, X86::ESI, X86::EBP, 0 - }; - static const uint16_t CU64BitRegs[] = { - X86::RBX, X86::R12, X86::R13, X86::R14, X86::R15, X86::RBP, 0 - }; - const uint16_t *CURegs = is64Bit ? CU64BitRegs : CU32BitRegs; - for (int Idx = 1; *CURegs; ++CURegs, ++Idx) - if (*CURegs == Reg) - return Idx; - - return -1; -} - -// Number of registers that can be saved in a compact unwind encoding. -#define CU_NUM_SAVED_REGS 6 - -/// encodeCompactUnwindRegistersWithoutFrame - Create the permutation encoding -/// used with frameless stacks. It is passed the number of registers to be saved -/// and an array of the registers saved. -static uint32_t -encodeCompactUnwindRegistersWithoutFrame(unsigned SavedRegs[CU_NUM_SAVED_REGS], - unsigned RegCount, bool Is64Bit) { - // The saved registers are numbered from 1 to 6. In order to encode the order - // in which they were saved, we re-number them according to their place in the - // register order. The re-numbering is relative to the last re-numbered - // register. E.g., if we have registers {6, 2, 4, 5} saved in that order: - // - // Orig Re-Num - // ---- ------ - // 6 6 - // 2 2 - // 4 3 - // 5 3 - // - for (unsigned i = 0; i != CU_NUM_SAVED_REGS; ++i) { - int CUReg = getCompactUnwindRegNum(SavedRegs[i], Is64Bit); - if (CUReg == -1) return ~0U; - SavedRegs[i] = CUReg; - } - - // Reverse the list. - std::swap(SavedRegs[0], SavedRegs[5]); - std::swap(SavedRegs[1], SavedRegs[4]); - std::swap(SavedRegs[2], SavedRegs[3]); - - uint32_t RenumRegs[CU_NUM_SAVED_REGS]; - for (unsigned i = CU_NUM_SAVED_REGS - RegCount; i < CU_NUM_SAVED_REGS; ++i) { - unsigned Countless = 0; - for (unsigned j = CU_NUM_SAVED_REGS - RegCount; j < i; ++j) - if (SavedRegs[j] < SavedRegs[i]) - ++Countless; - - RenumRegs[i] = SavedRegs[i] - Countless - 1; - } - - // Take the renumbered values and encode them into a 10-bit number. - uint32_t permutationEncoding = 0; - switch (RegCount) { - case 6: - permutationEncoding |= 120 * RenumRegs[0] + 24 * RenumRegs[1] - + 6 * RenumRegs[2] + 2 * RenumRegs[3] - + RenumRegs[4]; - break; - case 5: - permutationEncoding |= 120 * RenumRegs[1] + 24 * RenumRegs[2] - + 6 * RenumRegs[3] + 2 * RenumRegs[4] - + RenumRegs[5]; - break; - case 4: - permutationEncoding |= 60 * RenumRegs[2] + 12 * RenumRegs[3] - + 3 * RenumRegs[4] + RenumRegs[5]; - break; - case 3: - permutationEncoding |= 20 * RenumRegs[3] + 4 * RenumRegs[4] - + RenumRegs[5]; - break; - case 2: - permutationEncoding |= 5 * RenumRegs[4] + RenumRegs[5]; - break; - case 1: - permutationEncoding |= RenumRegs[5]; - break; - } - - assert((permutationEncoding & 0x3FF) == permutationEncoding && - "Invalid compact register encoding!"); - return permutationEncoding; -} - -/// encodeCompactUnwindRegistersWithFrame - Return the registers encoded for a -/// compact encoding with a frame pointer. -static uint32_t -encodeCompactUnwindRegistersWithFrame(unsigned SavedRegs[CU_NUM_SAVED_REGS], - bool Is64Bit) { - // Encode the registers in the order they were saved, 3-bits per register. The - // registers are numbered from 1 to CU_NUM_SAVED_REGS. - uint32_t RegEnc = 0; - for (int I = CU_NUM_SAVED_REGS - 1, Idx = 0; I != -1; --I) { - unsigned Reg = SavedRegs[I]; - if (Reg == 0) continue; - - int CURegNum = getCompactUnwindRegNum(Reg, Is64Bit); - if (CURegNum == -1) return ~0U; - - // Encode the 3-bit register number in order, skipping over 3-bits for each - // register. 
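The frameless encoder being deleted here renumbers each saved register by how many smaller register numbers precede it, which is what turns the {6, 2, 4, 5} example in its comment into {6, 2, 3, 3}. A sketch of just that renumbering step; the real code also reverses the list and shifts to zero-based values before packing.

#include <vector>

std::vector<unsigned> renumberSavedRegs(const std::vector<unsigned> &Regs) {
  std::vector<unsigned> Out(Regs.size());
  for (unsigned I = 0; I < Regs.size(); ++I) {
    unsigned Smaller = 0;
    for (unsigned J = 0; J < I; ++J)
      if (Regs[J] < Regs[I])
        ++Smaller;
    Out[I] = Regs[I] - Smaller;   // e.g. {6, 2, 4, 5} -> {6, 2, 3, 3}
  }
  return Out;
}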
- RegEnc |= (CURegNum & 0x7) << (Idx++ * 3); - } - - assert((RegEnc & 0x3FFFF) == RegEnc && "Invalid compact register encoding!"); - return RegEnc; -} - -uint32_t X86FrameLowering::getCompactUnwindEncoding(MachineFunction &MF) const { - const X86RegisterInfo *RegInfo = TM.getRegisterInfo(); - unsigned FramePtr = RegInfo->getFrameRegister(MF); - unsigned StackPtr = RegInfo->getStackRegister(); - - bool Is64Bit = STI.is64Bit(); - bool HasFP = hasFP(MF); - - unsigned SavedRegs[CU_NUM_SAVED_REGS] = { 0, 0, 0, 0, 0, 0 }; - unsigned SavedRegIdx = 0; - - unsigned OffsetSize = (Is64Bit ? 8 : 4); - - unsigned PushInstr = (Is64Bit ? X86::PUSH64r : X86::PUSH32r); - unsigned PushInstrSize = 1; - unsigned MoveInstr = (Is64Bit ? X86::MOV64rr : X86::MOV32rr); - unsigned MoveInstrSize = (Is64Bit ? 3 : 2); - unsigned SubtractInstrIdx = (Is64Bit ? 3 : 2); - - unsigned StackDivide = (Is64Bit ? 8 : 4); - - unsigned InstrOffset = 0; - unsigned StackAdjust = 0; - unsigned StackSize = 0; - - MachineBasicBlock &MBB = MF.front(); // Prologue is in entry BB. - bool ExpectEnd = false; - for (MachineBasicBlock::iterator - MBBI = MBB.begin(), MBBE = MBB.end(); MBBI != MBBE; ++MBBI) { - MachineInstr &MI = *MBBI; - unsigned Opc = MI.getOpcode(); - if (Opc == X86::PROLOG_LABEL) continue; - if (!MI.getFlag(MachineInstr::FrameSetup)) break; - - // We don't exect any more prolog instructions. - if (ExpectEnd) return CU::UNWIND_MODE_DWARF; - - if (Opc == PushInstr) { - // If there are too many saved registers, we cannot use compact encoding. - if (SavedRegIdx >= CU_NUM_SAVED_REGS) return CU::UNWIND_MODE_DWARF; - - unsigned Reg = MI.getOperand(0).getReg(); - if (Reg == (Is64Bit ? X86::RAX : X86::EAX)) { - ExpectEnd = true; - continue; - } - - SavedRegs[SavedRegIdx++] = MI.getOperand(0).getReg(); - StackAdjust += OffsetSize; - InstrOffset += PushInstrSize; - } else if (Opc == MoveInstr) { - unsigned SrcReg = MI.getOperand(1).getReg(); - unsigned DstReg = MI.getOperand(0).getReg(); - - if (DstReg != FramePtr || SrcReg != StackPtr) - return CU::UNWIND_MODE_DWARF; - - StackAdjust = 0; - memset(SavedRegs, 0, sizeof(SavedRegs)); - SavedRegIdx = 0; - InstrOffset += MoveInstrSize; - } else if (Opc == X86::SUB64ri32 || Opc == X86::SUB64ri8 || - Opc == X86::SUB32ri || Opc == X86::SUB32ri8) { - if (StackSize) - // We already have a stack size. - return CU::UNWIND_MODE_DWARF; - - if (!MI.getOperand(0).isReg() || - MI.getOperand(0).getReg() != MI.getOperand(1).getReg() || - MI.getOperand(0).getReg() != StackPtr || !MI.getOperand(2).isImm()) - // We need this to be a stack adjustment pointer. Something like: - // - // %RSP<def> = SUB64ri8 %RSP, 48 - return CU::UNWIND_MODE_DWARF; - - StackSize = MI.getOperand(2).getImm() / StackDivide; - SubtractInstrIdx += InstrOffset; - ExpectEnd = true; - } - } - - // Encode that we are using EBP/RBP as the frame pointer. - uint32_t CompactUnwindEncoding = 0; - StackAdjust /= StackDivide; - if (HasFP) { - if ((StackAdjust & 0xFF) != StackAdjust) - // Offset was too big for compact encoding. - return CU::UNWIND_MODE_DWARF; - - // Get the encoding of the saved registers when we have a frame pointer. 
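For the frame-pointer case, the deleted code packs up to six saved registers at three bits apiece, in push order. A standalone version that works directly on the 1..6 compact-unwind register numbers, with zero meaning "no register in this slot".

#include <cstdint>

uint32_t encodeRegsWithFrame(const unsigned SavedRegs[6]) {
  uint32_t RegEnc = 0;
  unsigned Idx = 0;
  for (int I = 5; I >= 0; --I) {
    unsigned Reg = SavedRegs[I];
    if (Reg == 0)
      continue;
    RegEnc |= (Reg & 0x7) << (Idx++ * 3);
  }
  return RegEnc;                  // always fits in 18 bits (0x3FFFF)
}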
- uint32_t RegEnc = encodeCompactUnwindRegistersWithFrame(SavedRegs, Is64Bit); - if (RegEnc == ~0U) return CU::UNWIND_MODE_DWARF; - - CompactUnwindEncoding |= CU::UNWIND_MODE_BP_FRAME; - CompactUnwindEncoding |= (StackAdjust & 0xFF) << 16; - CompactUnwindEncoding |= RegEnc & CU::UNWIND_BP_FRAME_REGISTERS; - } else { - ++StackAdjust; - uint32_t TotalStackSize = StackAdjust + StackSize; - if ((TotalStackSize & 0xFF) == TotalStackSize) { - // Frameless stack with a small stack size. - CompactUnwindEncoding |= CU::UNWIND_MODE_STACK_IMMD; - - // Encode the stack size. - CompactUnwindEncoding |= (TotalStackSize & 0xFF) << 16; - } else { - if ((StackAdjust & 0x7) != StackAdjust) - // The extra stack adjustments are too big for us to handle. - return CU::UNWIND_MODE_DWARF; - - // Frameless stack with an offset too large for us to encode compactly. - CompactUnwindEncoding |= CU::UNWIND_MODE_STACK_IND; - - // Encode the offset to the nnnnnn value in the 'subl $nnnnnn, ESP' - // instruction. - CompactUnwindEncoding |= (SubtractInstrIdx & 0xFF) << 16; - - // Encode any extra stack stack adjustments (done via push instructions). - CompactUnwindEncoding |= (StackAdjust & 0x7) << 13; - } - - // Encode the number of registers saved. - CompactUnwindEncoding |= (SavedRegIdx & 0x7) << 10; - - // Get the encoding of the saved registers when we don't have a frame - // pointer. - uint32_t RegEnc = - encodeCompactUnwindRegistersWithoutFrame(SavedRegs, SavedRegIdx, - Is64Bit); - if (RegEnc == ~0U) return CU::UNWIND_MODE_DWARF; - - // Encode the register encoding. - CompactUnwindEncoding |= - RegEnc & CU::UNWIND_FRAMELESS_STACK_REG_PERMUTATION; - } - - return CompactUnwindEncoding; -} - /// usesTheStack - This function checks if any of the users of EFLAGS /// copies the EFLAGS. We know that the code that lowers COPY of EFLAGS has /// to use the stack, and if we don't adjust the stack we clobber the first /// frame index. /// See X86InstrInfo::copyPhysReg. -static bool usesTheStack(MachineFunction &MF) { - MachineRegisterInfo &MRI = MF.getRegInfo(); +static bool usesTheStack(const MachineFunction &MF) { + const MachineRegisterInfo &MRI = MF.getRegInfo(); for (MachineRegisterInfo::reg_iterator ri = MRI.reg_begin(X86::EFLAGS), re = MRI.reg_end(); ri != re; ++ri) @@ -732,7 +474,6 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { // REG < 64 => DW_CFA_offset + Reg // ELSE => DW_CFA_offset_extended - std::vector<MachineMove> &Moves = MMI.getFrameMoves(); uint64_t NumBytes = 0; int stackGrowth = -SlotSize; @@ -765,20 +506,14 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { .addSym(FrameLabel); // Define the current CFA rule to use the provided offset. - if (StackSize) { - MachineLocation SPDst(MachineLocation::VirtualFP); - MachineLocation SPSrc(MachineLocation::VirtualFP, 2 * stackGrowth); - Moves.push_back(MachineMove(FrameLabel, SPDst, SPSrc)); - } else { - MachineLocation SPDst(StackPtr); - MachineLocation SPSrc(StackPtr, stackGrowth); - Moves.push_back(MachineMove(FrameLabel, SPDst, SPSrc)); - } + assert(StackSize); + MMI.addFrameInst( + MCCFIInstruction::createDefCfaOffset(FrameLabel, 2 * stackGrowth)); // Change the rule for the FramePtr to be an "offset" rule. 
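The deleted encoder assembles the final 32-bit compact-unwind word from a mode constant, a scaled stack adjustment in bits 23:16, and the register encoding in the low bits, falling back to DWARF when a field overflows. A sketch of the frame-pointer variant, using the constants that are also removed from X86FrameLowering.h further down.

#include <cstdint>

const uint32_t UNWIND_MODE_BP_FRAME      = 0x01000000;
const uint32_t UNWIND_MODE_DWARF         = 0x04000000;
const uint32_t UNWIND_BP_FRAME_REGISTERS = 0x00007FFF;

uint32_t encodeBPFrame(uint32_t ScaledStackAdjust, uint32_t RegEnc) {
  if ((ScaledStackAdjust & 0xFF) != ScaledStackAdjust)
    return UNWIND_MODE_DWARF;                 // adjustment too big for 8 bits
  return UNWIND_MODE_BP_FRAME
       | ((ScaledStackAdjust & 0xFF) << 16)
       | (RegEnc & UNWIND_BP_FRAME_REGISTERS);
}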
- MachineLocation FPDst(MachineLocation::VirtualFP, 2 * stackGrowth); - MachineLocation FPSrc(FramePtr); - Moves.push_back(MachineMove(FrameLabel, FPDst, FPSrc)); + unsigned DwarfFramePtr = RegInfo->getDwarfRegNum(FramePtr, true); + MMI.addFrameInst(MCCFIInstruction::createOffset(FrameLabel, DwarfFramePtr, + 2 * stackGrowth)); } // Update EBP with the new base value. @@ -794,9 +529,9 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { .addSym(FrameLabel); // Define the current CFA to use the EBP/RBP register. - MachineLocation FPDst(FramePtr); - MachineLocation FPSrc(MachineLocation::VirtualFP); - Moves.push_back(MachineMove(FrameLabel, FPDst, FPSrc)); + unsigned DwarfFramePtr = RegInfo->getDwarfRegNum(FramePtr, true); + MMI.addFrameInst( + MCCFIInstruction::createDefCfaRegister(FrameLabel, DwarfFramePtr)); } // Mark the FramePtr as live-in in every block except the entry. @@ -824,10 +559,9 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { BuildMI(MBB, MBBI, DL, TII.get(X86::PROLOG_LABEL)).addSym(Label); // Define the current CFA rule to use the provided offset. - unsigned Ptr = StackSize ? MachineLocation::VirtualFP : StackPtr; - MachineLocation SPDst(Ptr); - MachineLocation SPSrc(Ptr, StackOffset); - Moves.push_back(MachineMove(Label, SPDst, SPSrc)); + assert(StackSize); + MMI.addFrameInst( + MCCFIInstruction::createDefCfaOffset(Label, StackOffset)); StackOffset += stackGrowth; } } @@ -872,7 +606,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { // responsible for adjusting the stack pointer. Touching the stack at 4K // increments is necessary to ensure that the guard pages used by the OS // virtual memory manager are allocated in correct sequence. - if (NumBytes >= 4096 && STI.isTargetCOFF() && !STI.isTargetEnvMacho()) { + if (NumBytes >= 4096 && STI.isOSWindows() && !STI.isTargetEnvMacho()) { const char *StackProbeSymbol; bool isSPUpdateNeeded = false; @@ -923,11 +657,14 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit) .setMIFlag(MachineInstr::FrameSetup); - // MSVC x64's __chkstk needs to adjust %rsp. - // FIXME: %rax preserves the offset and should be available. - if (isSPUpdateNeeded) - emitSPUpdate(MBB, MBBI, StackPtr, -(int64_t)NumBytes, Is64Bit, IsLP64, - UseLEA, TII, *RegInfo); + // MSVC x64's __chkstk does not adjust %rsp itself. + // It also does not clobber %rax so we can reuse it when adjusting %rsp. + if (isSPUpdateNeeded) { + BuildMI(MBB, MBBI, DL, TII.get(X86::SUB64rr), StackPtr) + .addReg(StackPtr) + .addReg(X86::RAX) + .setMIFlag(MachineInstr::FrameSetup); + } if (isEAXAlive) { // Restore EAX @@ -961,27 +698,15 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { if (!HasFP && NumBytes) { // Define the current CFA rule to use the provided offset. - if (StackSize) { - MachineLocation SPDst(MachineLocation::VirtualFP); - MachineLocation SPSrc(MachineLocation::VirtualFP, - -StackSize + stackGrowth); - Moves.push_back(MachineMove(Label, SPDst, SPSrc)); - } else { - MachineLocation SPDst(StackPtr); - MachineLocation SPSrc(StackPtr, stackGrowth); - Moves.push_back(MachineMove(Label, SPDst, SPSrc)); - } + assert(StackSize); + MMI.addFrameInst(MCCFIInstruction::createDefCfaOffset( + Label, -StackSize + stackGrowth)); } // Emit DWARF info specifying the offsets of the callee-saved registers. if (PushedRegs) emitCalleeSavedFrameMoves(MF, Label, HasFP ? 
FramePtr : StackPtr); } - - // Darwin 10.7 and greater has support for compact unwind encoding. - if (STI.getTargetTriple().isMacOSX() && - !STI.getTargetTriple().isMacOSXVersionLT(10, 7)) - MMI.setCompactUnwindEncoding(getCompactUnwindEncoding(MF)); } void X86FrameLowering::emitEpilogue(MachineFunction &MF, @@ -1336,7 +1061,7 @@ X86FrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, unsigned SlotSize = RegInfo->getSlotSize(); X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); - int32_t TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta(); + int64_t TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta(); if (TailCallReturnAddrDelta < 0) { // create RETURNADDR area @@ -1349,7 +1074,7 @@ X86FrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, // } // [EBP] MFI->CreateFixedObject(-TailCallReturnAddrDelta, - (-1U*SlotSize)+TailCallReturnAddrDelta, true); + TailCallReturnAddrDelta - SlotSize, true); } if (hasFP(MF)) { diff --git a/contrib/llvm/lib/Target/X86/X86FrameLowering.h b/contrib/llvm/lib/Target/X86/X86FrameLowering.h index 6e309d8..3d3b011 100644 --- a/contrib/llvm/lib/Target/X86/X86FrameLowering.h +++ b/contrib/llvm/lib/Target/X86/X86FrameLowering.h @@ -20,32 +20,6 @@ namespace llvm { -namespace CU { - - /// Compact unwind encoding values. - enum CompactUnwindEncodings { - /// [RE]BP based frame where [RE]BP is pused on the stack immediately after - /// the return address, then [RE]SP is moved to [RE]BP. - UNWIND_MODE_BP_FRAME = 0x01000000, - - /// A frameless function with a small constant stack size. - UNWIND_MODE_STACK_IMMD = 0x02000000, - - /// A frameless function with a large constant stack size. - UNWIND_MODE_STACK_IND = 0x03000000, - - /// No compact unwind encoding is available. - UNWIND_MODE_DWARF = 0x04000000, - - /// Mask for encoding the frame registers. - UNWIND_BP_FRAME_REGISTERS = 0x00007FFF, - - /// Mask for encoding the frameless registers. - UNWIND_FRAMELESS_STACK_REG_PERMUTATION = 0x000003FF - }; - -} // end CU namespace - class MCSymbol; class X86TargetMachine; @@ -91,7 +65,6 @@ public: int getFrameIndexOffset(const MachineFunction &MF, int FI) const; int getFrameIndexReference(const MachineFunction &MF, int FI, unsigned &FrameReg) const; - uint32_t getCompactUnwindEncoding(MachineFunction &MF) const; void eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, diff --git a/contrib/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/contrib/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp index 6fe6bd8..36d1690 100644 --- a/contrib/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/contrib/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -79,7 +79,8 @@ namespace { } bool hasBaseOrIndexReg() const { - return IndexReg.getNode() != 0 || Base_Reg.getNode() != 0; + return BaseType == FrameIndexBase || + IndexReg.getNode() != 0 || Base_Reg.getNode() != 0; } /// isRIPRelative - Return true if this addressing mode is already RIP @@ -141,10 +142,6 @@ namespace { /// SelectionDAG operations. /// class X86DAGToDAGISel : public SelectionDAGISel { - /// X86Lowering - This object fully describes how to lower LLVM code to an - /// X86-specific SelectionDAG. - const X86TargetLowering &X86Lowering; - /// Subtarget - Keep a pointer to the X86Subtarget around so that we can /// make the right decision when generating code for different targets. 
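The processFunctionBeforeCalleeSavedScan hunk above widens TailCallReturnAddrDelta to int64_t and rewrites the fixed-object offset as plain signed arithmetic (delta minus one slot) instead of the -1U*SlotSize form, presumably to avoid unsigned wrap-around now that the delta is 64-bit. The new computation in isolation:

#include <cstdint>

int64_t returnAddrAreaOffset(int64_t TailCallReturnAddrDelta, unsigned SlotSize) {
  // Matches the second argument of CreateFixedObject in the hunk above.
  return TailCallReturnAddrDelta - static_cast<int64_t>(SlotSize);
}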
const X86Subtarget *Subtarget; @@ -156,7 +153,6 @@ namespace { public: explicit X86DAGToDAGISel(X86TargetMachine &tm, CodeGenOpt::Level OptLevel) : SelectionDAGISel(tm, OptLevel), - X86Lowering(*tm.getTargetLowering()), Subtarget(&tm.getSubtarget<X86Subtarget>()), OptForSize(false) {} @@ -188,7 +184,7 @@ namespace { SDNode *Select(SDNode *N); SDNode *SelectGather(SDNode *N, unsigned Opc); SDNode *SelectAtomic64(SDNode *Node, unsigned Opc); - SDNode *SelectAtomicLoadArith(SDNode *Node, EVT NVT); + SDNode *SelectAtomicLoadArith(SDNode *Node, MVT NVT); bool FoldOffsetIntoAddress(uint64_t Offset, X86ISelAddressMode &AM); bool MatchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM); @@ -200,9 +196,13 @@ namespace { bool SelectAddr(SDNode *Parent, SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment); + bool SelectMOV64Imm32(SDValue N, SDValue &Imm); bool SelectLEAAddr(SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment); + bool SelectLEA64_32Addr(SDValue N, SDValue &Base, + SDValue &Scale, SDValue &Index, SDValue &Disp, + SDValue &Segment); bool SelectTLSADDRAddr(SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment); @@ -229,14 +229,15 @@ namespace { SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment) { Base = (AM.BaseType == X86ISelAddressMode::FrameIndexBase) ? - CurDAG->getTargetFrameIndex(AM.Base_FrameIndex, TLI.getPointerTy()) : + CurDAG->getTargetFrameIndex(AM.Base_FrameIndex, + getTargetLowering()->getPointerTy()) : AM.Base_Reg; Scale = getI8Imm(AM.Scale); Index = AM.IndexReg; // These are 32-bit even in 64-bit mode since RIP relative offset // is 32-bit. if (AM.GV) - Disp = CurDAG->getTargetGlobalAddress(AM.GV, DebugLoc(), + Disp = CurDAG->getTargetGlobalAddress(AM.GV, SDLoc(), MVT::i32, AM.Disp, AM.SymbolFlags); else if (AM.CP) @@ -373,7 +374,7 @@ static void MoveBelowOrigChain(SelectionDAG *CurDAG, SDValue Load, else Ops.push_back(Chain.getOperand(i)); SDValue NewChain = - CurDAG->getNode(ISD::TokenFactor, Load.getDebugLoc(), + CurDAG->getNode(ISD::TokenFactor, SDLoc(Load), MVT::Other, &Ops[0], Ops.size()); Ops.clear(); Ops.push_back(NewChain); @@ -491,8 +492,8 @@ void X86DAGToDAGISel::PreprocessISelDAG() { if (N->getOpcode() != ISD::FP_ROUND && N->getOpcode() != ISD::FP_EXTEND) continue; - EVT SrcVT = N->getOperand(0).getValueType(); - EVT DstVT = N->getValueType(0); + MVT SrcVT = N->getOperand(0).getSimpleValueType(); + MVT DstVT = N->getSimpleValueType(0); // If any of the sources are vectors, no fp stack involved. if (SrcVT.isVector() || DstVT.isVector()) @@ -500,8 +501,10 @@ void X86DAGToDAGISel::PreprocessISelDAG() { // If the source and destination are SSE registers, then this is a legal // conversion that should not be lowered. - bool SrcIsSSE = X86Lowering.isScalarFPTypeInSSEReg(SrcVT); - bool DstIsSSE = X86Lowering.isScalarFPTypeInSSEReg(DstVT); + const X86TargetLowering *X86Lowering = + static_cast<const X86TargetLowering *>(getTargetLowering()); + bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(SrcVT); + bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(DstVT); if (SrcIsSSE && DstIsSSE) continue; @@ -517,14 +520,14 @@ void X86DAGToDAGISel::PreprocessISelDAG() { // Here we could have an FP stack truncation or an FPStack <-> SSE convert. // FPStack has extload and truncstore. SSE can fold direct loads into other // operations. Based on this, decide what we want to do. 
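The PreprocessISelDAG logic above keeps FP_ROUND/FP_EXTEND nodes only when both sides live in SSE registers; otherwise the conversion is forced through a stack temporary, and the hunk that follows picks the in-memory type. The decision it makes boils down to this illustrative helper, which is not the LLVM code itself.

// Returns true when the stack temporary should use the source type,
// false when it should use the destination type.
bool memoryTempUsesSourceType(bool IsFPRound, bool SrcIsSSE) {
  if (IsFPRound)
    return false;     // FP_ROUND must use the destination type: no "trunc load"
  return SrcIsSSE;    // otherwise keep the SSE side's type in memory
}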
- EVT MemVT; + MVT MemVT; if (N->getOpcode() == ISD::FP_ROUND) MemVT = DstVT; // FP_ROUND must use DstVT, we can't do a 'trunc load'. else MemVT = SrcIsSSE ? SrcVT : DstVT; SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT); - DebugLoc dl = N->getDebugLoc(); + SDLoc dl(N); // FIXME: optimize the case where the src/dest is a load or store? SDValue Store = CurDAG->getTruncStore(CurDAG->getEntryNode(), dl, @@ -781,8 +784,8 @@ static bool FoldMaskAndShiftToExtract(SelectionDAG &DAG, SDValue N, Mask != (0xffu << ScaleLog)) return true; - EVT VT = N.getValueType(); - DebugLoc DL = N.getDebugLoc(); + MVT VT = N.getSimpleValueType(); + SDLoc DL(N); SDValue Eight = DAG.getConstant(8, MVT::i8); SDValue NewMask = DAG.getConstant(0xff, VT); SDValue Srl = DAG.getNode(ISD::SRL, DL, VT, X, Eight); @@ -829,8 +832,8 @@ static bool FoldMaskedShiftToScaledMask(SelectionDAG &DAG, SDValue N, if (ShiftAmt != 1 && ShiftAmt != 2 && ShiftAmt != 3) return true; - EVT VT = N.getValueType(); - DebugLoc DL = N.getDebugLoc(); + MVT VT = N.getSimpleValueType(); + SDLoc DL(N); SDValue NewMask = DAG.getConstant(Mask >> ShiftAmt, VT); SDValue NewAnd = DAG.getNode(ISD::AND, DL, VT, X, NewMask); SDValue NewShift = DAG.getNode(ISD::SHL, DL, VT, NewAnd, Shift.getOperand(1)); @@ -886,8 +889,8 @@ static bool FoldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N, return true; unsigned ShiftAmt = Shift.getConstantOperandVal(1); - unsigned MaskLZ = CountLeadingZeros_64(Mask); - unsigned MaskTZ = CountTrailingZeros_64(Mask); + unsigned MaskLZ = countLeadingZeros(Mask); + unsigned MaskTZ = countTrailingZeros(Mask); // The amount of shift we're trying to fit into the addressing mode is taken // from the trailing zeros of the mask. @@ -902,7 +905,7 @@ static bool FoldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N, // Scale the leading zero count down based on the actual size of the value. // Also scale it down based on the size of the shift. - MaskLZ -= (64 - X.getValueSizeInBits()) + ShiftAmt; + MaskLZ -= (64 - X.getSimpleValueType().getSizeInBits()) + ShiftAmt; // The final check is to ensure that any masked out high bits of X are // already known to be zero. Otherwise, the mask has a semantic impact @@ -912,31 +915,31 @@ static bool FoldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N, // replace them with zero extensions cheaply if necessary. bool ReplacingAnyExtend = false; if (X.getOpcode() == ISD::ANY_EXTEND) { - unsigned ExtendBits = - X.getValueSizeInBits() - X.getOperand(0).getValueSizeInBits(); + unsigned ExtendBits = X.getSimpleValueType().getSizeInBits() - + X.getOperand(0).getSimpleValueType().getSizeInBits(); // Assume that we'll replace the any-extend with a zero-extend, and // narrow the search to the extended value. X = X.getOperand(0); MaskLZ = ExtendBits > MaskLZ ? 0 : MaskLZ - ExtendBits; ReplacingAnyExtend = true; } - APInt MaskedHighBits = APInt::getHighBitsSet(X.getValueSizeInBits(), - MaskLZ); + APInt MaskedHighBits = + APInt::getHighBitsSet(X.getSimpleValueType().getSizeInBits(), MaskLZ); APInt KnownZero, KnownOne; DAG.ComputeMaskedBits(X, KnownZero, KnownOne); if (MaskedHighBits != KnownZero) return true; // We've identified a pattern that can be transformed into a single shift // and an addressing mode. Make it so. - EVT VT = N.getValueType(); + MVT VT = N.getSimpleValueType(); if (ReplacingAnyExtend) { assert(X.getValueType() != VT); // We looked through an ANY_EXTEND node, insert a ZERO_EXTEND. 
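The mask-and-shift folds touched above (FoldMaskedShiftToScaledMask and friends) rest on the identity ((X << C) & Mask) == ((X & (Mask >> C)) << C), which moves the shift outward so it can become the addressing-mode scale. A quick property check, assuming C is less than 64 so the shifts are well defined:

#include <cstdint>

bool shiftMaskFoldHolds(uint64_t X, uint64_t Mask, unsigned C) {
  uint64_t Before = (X << C) & Mask;
  uint64_t After  = (X & (Mask >> C)) << C;
  return Before == After;          // holds for every X and Mask when C < 64
}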
- SDValue NewX = DAG.getNode(ISD::ZERO_EXTEND, X.getDebugLoc(), VT, X); + SDValue NewX = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(X), VT, X); InsertDAGNode(DAG, N, NewX); X = NewX; } - DebugLoc DL = N.getDebugLoc(); + SDLoc DL(N); SDValue NewSRLAmt = DAG.getConstant(ShiftAmt + AMShiftAmt, MVT::i8); SDValue NewSRL = DAG.getNode(ISD::SRL, DL, VT, X, NewSRLAmt); SDValue NewSHLAmt = DAG.getConstant(AMShiftAmt, MVT::i8); @@ -960,7 +963,7 @@ static bool FoldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N, bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM, unsigned Depth) { - DebugLoc dl = N.getDebugLoc(); + SDLoc dl(N); DEBUG({ dbgs() << "MatchAddress: "; AM.dump(); @@ -1057,7 +1060,7 @@ bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM, // We only handle up to 64-bit values here as those are what matter for // addressing mode optimizations. - if (X.getValueSizeInBits() > 64) break; + if (X.getSimpleValueType().getSizeInBits() > 64) break; // The mask used for the transform is expected to be post-shift, but we // found the shift first so just apply the shift to the mask before passing @@ -1242,7 +1245,7 @@ bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM, // We only handle up to 64-bit values here as those are what matter for // addressing mode optimizations. - if (X.getValueSizeInBits() > 64) break; + if (X.getSimpleValueType().getSizeInBits() > 64) break; if (!isa<ConstantSDNode>(N.getOperand(1))) break; @@ -1321,7 +1324,7 @@ bool X86DAGToDAGISel::SelectAddr(SDNode *Parent, SDValue N, SDValue &Base, if (MatchAddress(N, AM)) return false; - EVT VT = N.getValueType(); + MVT VT = N.getSimpleValueType(); if (AM.BaseType == X86ISelAddressMode::RegBase) { if (!AM.Base_Reg.getNode()) AM.Base_Reg = CurDAG->getRegister(0, VT); @@ -1380,6 +1383,71 @@ bool X86DAGToDAGISel::SelectScalarSSELoad(SDNode *Root, } +bool X86DAGToDAGISel::SelectMOV64Imm32(SDValue N, SDValue &Imm) { + if (const ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N)) { + uint64_t ImmVal = CN->getZExtValue(); + if ((uint32_t)ImmVal != (uint64_t)ImmVal) + return false; + + Imm = CurDAG->getTargetConstant(ImmVal, MVT::i64); + return true; + } + + // In static codegen with small code model, we can get the address of a label + // into a register with 'movl'. TableGen has already made sure we're looking + // at a label of some kind. + assert(N->getOpcode() == X86ISD::Wrapper && + "Unexpected node type for MOV32ri64"); + N = N.getOperand(0); + + if (N->getOpcode() != ISD::TargetConstantPool && + N->getOpcode() != ISD::TargetJumpTable && + N->getOpcode() != ISD::TargetGlobalAddress && + N->getOpcode() != ISD::TargetExternalSymbol && + N->getOpcode() != ISD::TargetBlockAddress) + return false; + + Imm = N; + return TM.getCodeModel() == CodeModel::Small; +} + +bool X86DAGToDAGISel::SelectLEA64_32Addr(SDValue N, SDValue &Base, + SDValue &Scale, SDValue &Index, + SDValue &Disp, SDValue &Segment) { + if (!SelectLEAAddr(N, Base, Scale, Index, Disp, Segment)) + return false; + + SDLoc DL(N); + RegisterSDNode *RN = dyn_cast<RegisterSDNode>(Base); + if (RN && RN->getReg() == 0) + Base = CurDAG->getRegister(0, MVT::i64); + else if (Base.getValueType() == MVT::i32 && !dyn_cast<FrameIndexSDNode>(N)) { + // Base could already be %rip, particularly in the x32 ABI. 
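SelectMOV64Imm32 above accepts a 64-bit constant only when its low 32 bits already reproduce the full value, because a 32-bit mov zero-extends into the whole 64-bit register on x86-64. The check in isolation:

#include <cstdint>

bool fitsInMov32ZeroExtended(uint64_t ImmVal) {
  return static_cast<uint64_t>(static_cast<uint32_t>(ImmVal)) == ImmVal;
}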
+ Base = SDValue(CurDAG->getMachineNode( + TargetOpcode::SUBREG_TO_REG, DL, MVT::i64, + CurDAG->getTargetConstant(0, MVT::i64), + Base, + CurDAG->getTargetConstant(X86::sub_32bit, MVT::i32)), + 0); + } + + RN = dyn_cast<RegisterSDNode>(Index); + if (RN && RN->getReg() == 0) + Index = CurDAG->getRegister(0, MVT::i64); + else { + assert(Index.getValueType() == MVT::i32 && + "Expect to be extending 32-bit registers for use in LEA"); + Index = SDValue(CurDAG->getMachineNode( + TargetOpcode::SUBREG_TO_REG, DL, MVT::i64, + CurDAG->getTargetConstant(0, MVT::i64), + Index, + CurDAG->getTargetConstant(X86::sub_32bit, MVT::i32)), + 0); + } + + return true; +} + /// SelectLEAAddr - it calls SelectAddr and determines if the maximal addressing /// mode it matches can be cost effectively emitted as an LEA instruction. bool X86DAGToDAGISel::SelectLEAAddr(SDValue N, @@ -1398,7 +1466,7 @@ bool X86DAGToDAGISel::SelectLEAAddr(SDValue N, assert (T == AM.Segment); AM.Segment = Copy; - EVT VT = N.getValueType(); + MVT VT = N.getSimpleValueType(); unsigned Complexity = 0; if (AM.BaseType == X86ISelAddressMode::RegBase) if (AM.Base_Reg.getNode()) @@ -1487,7 +1555,8 @@ bool X86DAGToDAGISel::TryFoldLoad(SDNode *P, SDValue N, /// SDNode *X86DAGToDAGISel::getGlobalBaseReg() { unsigned GlobalBaseReg = getInstrInfo()->getGlobalBaseReg(MF); - return CurDAG->getRegister(GlobalBaseReg, TLI.getPointerTy()).getNode(); + return CurDAG->getRegister(GlobalBaseReg, + getTargetLowering()->getPointerTy()).getNode(); } SDNode *X86DAGToDAGISel::SelectAtomic64(SDNode *Node, unsigned Opc) { @@ -1502,7 +1571,7 @@ SDNode *X86DAGToDAGISel::SelectAtomic64(SDNode *Node, unsigned Opc) { MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); MemOp[0] = cast<MemSDNode>(Node)->getMemOperand(); const SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, In2L, In2H, Chain}; - SDNode *ResNode = CurDAG->getMachineNode(Opc, Node->getDebugLoc(), + SDNode *ResNode = CurDAG->getMachineNode(Opc, SDLoc(Node), MVT::i32, MVT::i32, MVT::Other, Ops); cast<MachineSDNode>(ResNode)->setMemRefs(MemOp, MemOp + 1); return ResNode; @@ -1637,8 +1706,8 @@ static const uint16_t AtomicOpcTbl[AtomicOpcEnd][AtomicSzEnd] = { // + empty, the operand is not needed any more with the new op selected. // + non-empty, otherwise. static SDValue getAtomicLoadArithTargetConstant(SelectionDAG *CurDAG, - DebugLoc dl, - enum AtomicOpc &Op, EVT NVT, + SDLoc dl, + enum AtomicOpc &Op, MVT NVT, SDValue Val) { if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Val)) { int64_t CNVal = CN->getSExtValue(); @@ -1685,11 +1754,11 @@ static SDValue getAtomicLoadArithTargetConstant(SelectionDAG *CurDAG, return Val; } -SDNode *X86DAGToDAGISel::SelectAtomicLoadArith(SDNode *Node, EVT NVT) { +SDNode *X86DAGToDAGISel::SelectAtomicLoadArith(SDNode *Node, MVT NVT) { if (Node->hasAnyUseOfValue(0)) return 0; - DebugLoc dl = Node->getDebugLoc(); + SDLoc dl(Node); // Optimize common patterns for __sync_or_and_fetch and similar arith // operations where the result is not used. This allows us to use the "lock" @@ -1725,7 +1794,7 @@ SDNode *X86DAGToDAGISel::SelectAtomicLoadArith(SDNode *Node, EVT NVT) { bool isCN = Val.getNode() && (Val.getOpcode() == ISD::TargetConstant); unsigned Opc = 0; - switch (NVT.getSimpleVT().SimpleTy) { + switch (NVT.SimpleTy) { default: return 0; case MVT::i8: if (isCN) @@ -1920,7 +1989,7 @@ static bool isLoadIncOrDecStore(StoreSDNode *StoreNode, unsigned Opc, if (ChainCheck) // Make a new TokenFactor with all the other input chains except // for the load. 
- InputChain = CurDAG->getNode(ISD::TokenFactor, Chain.getDebugLoc(), + InputChain = CurDAG->getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, &ChainOps[0], ChainOps.size()); } if (!ChainCheck) @@ -1968,7 +2037,7 @@ SDNode *X86DAGToDAGISel::SelectGather(SDNode *Node, unsigned Opc) { SDValue Segment = CurDAG->getRegister(0, MVT::i32); const SDValue Ops[] = { VSrc, Base, getI8Imm(Scale->getSExtValue()), VIdx, Disp, Segment, VMask, Chain}; - SDNode *ResNode = CurDAG->getMachineNode(Opc, Node->getDebugLoc(), VTs, Ops); + SDNode *ResNode = CurDAG->getMachineNode(Opc, SDLoc(Node), VTs, Ops); // Node has 2 outputs: VDst and MVT::Other. // ResNode has 3 outputs: VDst, VMask_wb, and MVT::Other. // We replace VDst of Node with VDst of ResNode, and Other of Node with Other @@ -1979,10 +2048,10 @@ SDNode *X86DAGToDAGISel::SelectGather(SDNode *Node, unsigned Opc) { } SDNode *X86DAGToDAGISel::Select(SDNode *Node) { - EVT NVT = Node->getValueType(0); + MVT NVT = Node->getSimpleValueType(0); unsigned Opc, MOpc; unsigned Opcode = Node->getOpcode(); - DebugLoc dl = Node->getDebugLoc(); + SDLoc dl(Node); DEBUG(dbgs() << "Selecting: "; Node->dump(CurDAG); dbgs() << '\n'); @@ -2014,6 +2083,8 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { case Intrinsic::x86_avx2_gather_d_d_256: case Intrinsic::x86_avx2_gather_q_d: case Intrinsic::x86_avx2_gather_q_d_256: { + if (!Subtarget->hasAVX2()) + break; unsigned Opc; switch (IntNo) { default: llvm_unreachable("Impossible intrinsic"); @@ -2118,7 +2189,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { break; unsigned ShlOp, Op; - EVT CstVT = NVT; + MVT CstVT = NVT; // Check the minimum bitwidth for the new constant. // TODO: AND32ri is the same as AND64ri32 with zext imm. @@ -2133,7 +2204,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { if (NVT == CstVT) break; - switch (NVT.getSimpleVT().SimpleTy) { + switch (NVT.SimpleTy) { default: llvm_unreachable("Unsupported VT!"); case MVT::i32: assert(CstVT == MVT::i8); @@ -2170,7 +2241,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { SDValue N1 = Node->getOperand(1); unsigned LoReg; - switch (NVT.getSimpleVT().SimpleTy) { + switch (NVT.SimpleTy) { default: llvm_unreachable("Unsupported VT!"); case MVT::i8: LoReg = X86::AL; Opc = X86::MUL8r; break; case MVT::i16: LoReg = X86::AX; Opc = X86::MUL16r; break; @@ -2199,7 +2270,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { bool isSigned = Opcode == ISD::SMUL_LOHI; bool hasBMI2 = Subtarget->hasBMI2(); if (!isSigned) { - switch (NVT.getSimpleVT().SimpleTy) { + switch (NVT.SimpleTy) { default: llvm_unreachable("Unsupported VT!"); case MVT::i8: Opc = X86::MUL8r; MOpc = X86::MUL8m; break; case MVT::i16: Opc = X86::MUL16r; MOpc = X86::MUL16m; break; @@ -2209,7 +2280,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { MOpc = hasBMI2 ? X86::MULX64rm : X86::MUL64m; break; } } else { - switch (NVT.getSimpleVT().SimpleTy) { + switch (NVT.SimpleTy) { default: llvm_unreachable("Unsupported VT!"); case MVT::i8: Opc = X86::IMUL8r; MOpc = X86::IMUL8m; break; case MVT::i16: Opc = X86::IMUL16r; MOpc = X86::IMUL16m; break; @@ -2336,9 +2407,6 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { DEBUG(dbgs() << "=> "; ResHi.getNode()->dump(CurDAG); dbgs() << '\n'); } - // Propagate ordering to the last node, for now. 
- CurDAG->AssignOrdering(InFlag.getNode(), CurDAG->GetOrdering(Node)); - return NULL; } @@ -2349,7 +2417,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { bool isSigned = Opcode == ISD::SDIVREM; if (!isSigned) { - switch (NVT.getSimpleVT().SimpleTy) { + switch (NVT.SimpleTy) { default: llvm_unreachable("Unsupported VT!"); case MVT::i8: Opc = X86::DIV8r; MOpc = X86::DIV8m; break; case MVT::i16: Opc = X86::DIV16r; MOpc = X86::DIV16m; break; @@ -2357,7 +2425,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { case MVT::i64: Opc = X86::DIV64r; MOpc = X86::DIV64m; break; } } else { - switch (NVT.getSimpleVT().SimpleTy) { + switch (NVT.SimpleTy) { default: llvm_unreachable("Unsupported VT!"); case MVT::i8: Opc = X86::IDIV8r; MOpc = X86::IDIV8m; break; case MVT::i16: Opc = X86::IDIV16r; MOpc = X86::IDIV16m; break; @@ -2367,27 +2435,24 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { } unsigned LoReg, HiReg, ClrReg; - unsigned ClrOpcode, SExtOpcode; - switch (NVT.getSimpleVT().SimpleTy) { + unsigned SExtOpcode; + switch (NVT.SimpleTy) { default: llvm_unreachable("Unsupported VT!"); case MVT::i8: LoReg = X86::AL; ClrReg = HiReg = X86::AH; - ClrOpcode = 0; SExtOpcode = X86::CBW; break; case MVT::i16: LoReg = X86::AX; HiReg = X86::DX; - ClrOpcode = X86::MOV16r0; ClrReg = X86::DX; + ClrReg = X86::DX; SExtOpcode = X86::CWD; break; case MVT::i32: LoReg = X86::EAX; ClrReg = HiReg = X86::EDX; - ClrOpcode = X86::MOV32r0; SExtOpcode = X86::CDQ; break; case MVT::i64: LoReg = X86::RAX; ClrReg = HiReg = X86::RDX; - ClrOpcode = X86::MOV64r0; SExtOpcode = X86::CQO; break; } @@ -2425,8 +2490,29 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { SDValue(CurDAG->getMachineNode(SExtOpcode, dl, MVT::Glue, InFlag),0); } else { // Zero out the high part, effectively zero extending the input. - SDValue ClrNode = - SDValue(CurDAG->getMachineNode(ClrOpcode, dl, NVT), 0); + SDValue ClrNode = SDValue(CurDAG->getMachineNode(X86::MOV32r0, dl, NVT), 0); + switch (NVT.SimpleTy) { + case MVT::i16: + ClrNode = + SDValue(CurDAG->getMachineNode( + TargetOpcode::EXTRACT_SUBREG, dl, MVT::i16, ClrNode, + CurDAG->getTargetConstant(X86::sub_16bit, MVT::i32)), + 0); + break; + case MVT::i32: + break; + case MVT::i64: + ClrNode = + SDValue(CurDAG->getMachineNode( + TargetOpcode::SUBREG_TO_REG, dl, MVT::i64, + CurDAG->getTargetConstant(0, MVT::i64), ClrNode, + CurDAG->getTargetConstant(X86::sub_32bit, MVT::i32)), + 0); + break; + default: + llvm_unreachable("Unexpected division source"); + } + InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, ClrReg, ClrNode, InFlag).getValue(1); } @@ -2447,6 +2533,11 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { // Prevent use of AH in a REX instruction by referencing AX instead. // Shift it down 8 bits. + // + // The current assumption of the register allocator is that isel + // won't generate explicit references to the GPR8_NOREX registers. If + // the allocator and/or the backend get enhanced to be more robust in + // that regard, this can be, and should be, removed. if (HiReg == X86::AH && Subtarget->is64Bit() && !SDValue(Node, 1).use_empty()) { SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, @@ -2520,7 +2611,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { // On x86-32, only the ABCD registers have 8-bit subregisters. 
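The MOV32r0-based clearing sequence above (widened or narrowed with subregister nodes as needed) feeds a zeroed high register into the unsigned divide. A standalone model of why that register must be zero, in plain C++; this only illustrates the arithmetic, not the selected machine code:

#include <cassert>
#include <cstdint>
#include <utility>

// Hardware DIV divides the double-width value HI:LO by its operand and
// leaves the quotient in the low register (EAX) and the remainder in the
// high register (EDX). A plain 32-bit unsigned division therefore needs the
// high half to be zero, which is exactly what the MOV32r0 node provides.
static std::pair<uint32_t, uint32_t> udiv32(uint32_t lo, uint32_t divisor) {
  const uint32_t hi = 0; // contents of EDX before the DIV
  const uint64_t dividend = (static_cast<uint64_t>(hi) << 32) | lo;
  return {static_cast<uint32_t>(dividend / divisor),  // quotient  -> EAX
          static_cast<uint32_t>(dividend % divisor)}; // remainder -> EDX
}

int main() {
  assert(udiv32(100, 7).first == 14 && udiv32(100, 7).second == 2);
  return 0;
}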
if (!Subtarget->is64Bit()) { const TargetRegisterClass *TRC; - switch (N0.getValueType().getSimpleVT().SimpleTy) { + switch (N0.getSimpleValueType().SimpleTy) { case MVT::i32: TRC = &X86::GR32_ABCDRegClass; break; case MVT::i16: TRC = &X86::GR16_ABCDRegClass; break; default: llvm_unreachable("Unsupported TEST operand type!"); @@ -2555,7 +2646,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { // Put the value in an ABCD register. const TargetRegisterClass *TRC; - switch (N0.getValueType().getSimpleVT().SimpleTy) { + switch (N0.getSimpleValueType().SimpleTy) { case MVT::i64: TRC = &X86::GR64_ABCDRegClass; break; case MVT::i32: TRC = &X86::GR32_ABCDRegClass; break; case MVT::i16: TRC = &X86::GR16_ABCDRegClass; break; @@ -2667,7 +2758,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { EVT LdVT = LoadNode->getMemoryVT(); unsigned newOpc = getFusedLdStOpcode(LdVT, Opc); MachineSDNode *Result = CurDAG->getMachineNode(newOpc, - Node->getDebugLoc(), + SDLoc(Node), MVT::i32, MVT::Other, Ops); Result->setMemRefs(MemOp, MemOp + 2); diff --git a/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp b/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp index d5adb89..76eeb64 100644 --- a/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -16,6 +16,7 @@ #include "X86ISelLowering.h" #include "Utils/X86ShuffleDecode.h" #include "X86.h" +#include "X86CallingConv.h" #include "X86InstrBuilder.h" #include "X86TargetMachine.h" #include "X86TargetObjectFile.h" @@ -55,20 +56,17 @@ using namespace llvm; STATISTIC(NumTailCalls, "Number of tail calls"); // Forward declarations. -static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, +static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1, SDValue V2); -/// Generate a DAG to grab 128-bits from a vector > 128 bits. This -/// sets things up to match to an AVX VEXTRACTF128 instruction or a -/// simple subregister reference. Idx is an index in the 128 bits we -/// want. It need not be aligned to a 128-bit bounday. That makes -/// lowering EXTRACT_VECTOR_ELT operations easier. -static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal, - SelectionDAG &DAG, DebugLoc dl) { +static SDValue ExtractSubVector(SDValue Vec, unsigned IdxVal, + SelectionDAG &DAG, SDLoc dl, + unsigned vectorWidth) { + assert((vectorWidth == 128 || vectorWidth == 256) && + "Unsupported vector width"); EVT VT = Vec.getValueType(); - assert(VT.is256BitVector() && "Unexpected vector size!"); EVT ElVT = VT.getVectorElementType(); - unsigned Factor = VT.getSizeInBits()/128; + unsigned Factor = VT.getSizeInBits()/vectorWidth; EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT, VT.getVectorNumElements()/Factor); @@ -76,13 +74,12 @@ static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal, if (Vec.getOpcode() == ISD::UNDEF) return DAG.getUNDEF(ResultVT); - // Extract the relevant 128 bits. Generate an EXTRACT_SUBVECTOR - // we can match to VEXTRACTF128. - unsigned ElemsPerChunk = 128 / ElVT.getSizeInBits(); + // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR + unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits(); - // This is the index of the first element of the 128-bit chunk + // This is the index of the first element of the vectorWidth-bit chunk // we want. 
- unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / 128) + unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / vectorWidth) * ElemsPerChunk); // If the input is a buildvector just emit a smaller one. @@ -95,38 +92,71 @@ static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal, VecIdx); return Result; + +} +/// Generate a DAG to grab 128-bits from a vector > 128 bits. This +/// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128 +/// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4 +/// instructions or a simple subregister reference. Idx is an index in the +/// 128 bits we want. It need not be aligned to a 128-bit bounday. That makes +/// lowering EXTRACT_VECTOR_ELT operations easier. +static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal, + SelectionDAG &DAG, SDLoc dl) { + assert((Vec.getValueType().is256BitVector() || + Vec.getValueType().is512BitVector()) && "Unexpected vector size!"); + return ExtractSubVector(Vec, IdxVal, DAG, dl, 128); } -/// Generate a DAG to put 128-bits into a vector > 128 bits. This -/// sets things up to match to an AVX VINSERTF128 instruction or a -/// simple superregister reference. Idx is an index in the 128 bits -/// we want. It need not be aligned to a 128-bit bounday. That makes -/// lowering INSERT_VECTOR_ELT operations easier. -static SDValue Insert128BitVector(SDValue Result, SDValue Vec, - unsigned IdxVal, SelectionDAG &DAG, - DebugLoc dl) { +/// Generate a DAG to grab 256-bits from a 512-bit vector. +static SDValue Extract256BitVector(SDValue Vec, unsigned IdxVal, + SelectionDAG &DAG, SDLoc dl) { + assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!"); + return ExtractSubVector(Vec, IdxVal, DAG, dl, 256); +} + +static SDValue InsertSubVector(SDValue Result, SDValue Vec, + unsigned IdxVal, SelectionDAG &DAG, + SDLoc dl, unsigned vectorWidth) { + assert((vectorWidth == 128 || vectorWidth == 256) && + "Unsupported vector width"); // Inserting UNDEF is Result if (Vec.getOpcode() == ISD::UNDEF) return Result; - EVT VT = Vec.getValueType(); - assert(VT.is128BitVector() && "Unexpected vector size!"); - EVT ElVT = VT.getVectorElementType(); EVT ResultVT = Result.getValueType(); - // Insert the relevant 128 bits. - unsigned ElemsPerChunk = 128/ElVT.getSizeInBits(); + // Insert the relevant vectorWidth bits. + unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits(); - // This is the index of the first element of the 128-bit chunk + // This is the index of the first element of the vectorWidth-bit chunk // we want. - unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits())/128) + unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits())/vectorWidth) * ElemsPerChunk); SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal); return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx); } +/// Generate a DAG to put 128-bits into a vector > 128 bits. This +/// sets things up to match to an AVX VINSERTF128/VINSERTI128 or +/// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a +/// simple superregister reference. Idx is an index in the 128 bits +/// we want. It need not be aligned to a 128-bit bounday. That makes +/// lowering INSERT_VECTOR_ELT operations easier. 
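The index normalization shared by ExtractSubVector and InsertSubVector above rounds an element index down to the first element of its 128- or 256-bit chunk. A self-contained sketch of that arithmetic (plain C++, illustrative names):

#include <cassert>

// ((IdxVal * EltBits) / ChunkBits) picks the chunk the element lives in;
// multiplying by ElemsPerChunk converts that chunk number back into an
// element index, i.e. the index of the chunk's first element.
static unsigned normalizeSubvectorIndex(unsigned IdxVal, unsigned EltBits,
                                        unsigned ChunkBits) {
  unsigned ElemsPerChunk = ChunkBits / EltBits;
  return ((IdxVal * EltBits) / ChunkBits) * ElemsPerChunk;
}

int main() {
  // Extracting around element 5 of a v8f32 with 128-bit chunks starts the
  // subvector at element 4, i.e. the upper half of the 256-bit source.
  assert(normalizeSubvectorIndex(5, 32, 128) == 4);
  // With 64-bit elements and 256-bit chunks, element 3 still maps to 0.
  assert(normalizeSubvectorIndex(3, 64, 256) == 0);
  return 0;
}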
+static SDValue Insert128BitVector(SDValue Result, SDValue Vec, + unsigned IdxVal, SelectionDAG &DAG, + SDLoc dl) { + assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!"); + return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 128); +} + +static SDValue Insert256BitVector(SDValue Result, SDValue Vec, + unsigned IdxVal, SelectionDAG &DAG, + SDLoc dl) { + assert(Vec.getValueType().is256BitVector() && "Unexpected vector size!"); + return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 256); +} /// Concat two 128-bit vectors into a 256 bit vector using VINSERTF128 /// instructions. This is used because creating CONCAT_VECTOR nodes of @@ -134,11 +164,18 @@ static SDValue Insert128BitVector(SDValue Result, SDValue Vec, /// large BUILD_VECTORS. static SDValue Concat128BitVectors(SDValue V1, SDValue V2, EVT VT, unsigned NumElems, SelectionDAG &DAG, - DebugLoc dl) { + SDLoc dl) { SDValue V = Insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl); return Insert128BitVector(V, V2, NumElems/2, DAG, dl); } +static SDValue Concat256BitVectors(SDValue V1, SDValue V2, EVT VT, + unsigned NumElems, SelectionDAG &DAG, + SDLoc dl) { + SDValue V = Insert256BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl); + return Insert256BitVector(V, V2, NumElems/2, DAG, dl); +} + static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) { const X86Subtarget *Subtarget = &TM.getSubtarget<X86Subtarget>(); bool is64Bit = Subtarget->is64Bit(); @@ -163,7 +200,6 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) Subtarget = &TM.getSubtarget<X86Subtarget>(); X86ScalarSSEf64 = Subtarget->hasSSE2(); X86ScalarSSEf32 = Subtarget->hasSSE1(); - RegInfo = TM.getRegisterInfo(); TD = getDataLayout(); resetOperationActions(); @@ -202,6 +238,8 @@ void X86TargetLowering::resetOperationActions() { setSchedulingPreference(Sched::ILP); else setSchedulingPreference(Sched::RegPressure); + const X86RegisterInfo *RegInfo = + static_cast<const X86RegisterInfo*>(TM.getRegisterInfo()); setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister()); // Bypass expensive divides on Atom when compiling with O2 @@ -562,10 +600,6 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::EH_LABEL, MVT::Other, Expand); } - setOperationAction(ISD::EXCEPTIONADDR, MVT::i64, Expand); - setOperationAction(ISD::EHSELECTION, MVT::i64, Expand); - setOperationAction(ISD::EXCEPTIONADDR, MVT::i32, Expand); - setOperationAction(ISD::EHSELECTION, MVT::i32, Expand); if (Subtarget->is64Bit()) { setExceptionPointerRegister(X86::RAX); setExceptionSelectorRegister(X86::RDX); @@ -585,10 +619,12 @@ void X86TargetLowering::resetOperationActions() { // VASTART needs to be custom lowered to use the VarArgsFrameIndex setOperationAction(ISD::VASTART , MVT::Other, Custom); setOperationAction(ISD::VAEND , MVT::Other, Expand); - if (Subtarget->is64Bit()) { + if (Subtarget->is64Bit() && !Subtarget->isTargetWin64()) { + // TargetInfo::X86_64ABIBuiltinVaList setOperationAction(ISD::VAARG , MVT::Other, Custom); setOperationAction(ISD::VACOPY , MVT::Other, Custom); } else { + // TargetInfo::CharPtrBuiltinVaList setOperationAction(ISD::VAARG , MVT::Other, Expand); setOperationAction(ISD::VACOPY , MVT::Other, Expand); } @@ -596,7 +632,7 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); - if (Subtarget->isTargetCOFF() && !Subtarget->isTargetEnvMacho()) + if (Subtarget->isOSWindows() && !Subtarget->isTargetEnvMacho()) 
setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ? MVT::i64 : MVT::i32, Custom); else if (TM.Options.EnableSegmentedStacks) @@ -999,7 +1035,7 @@ void X86TargetLowering::resetOperationActions() { setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, Legal); } - if (Subtarget->hasSSE41()) { + if (!TM.Options.UseSoftFloat && Subtarget->hasSSE41()) { setOperationAction(ISD::FFLOOR, MVT::f32, Legal); setOperationAction(ISD::FCEIL, MVT::f32, Legal); setOperationAction(ISD::FTRUNC, MVT::f32, Legal); @@ -1115,9 +1151,6 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::FNEG, MVT::v4f64, Custom); setOperationAction(ISD::FABS, MVT::v4f64, Custom); - setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom); - setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom); - setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Custom); setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal); @@ -1125,7 +1158,6 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal); setOperationAction(ISD::FP_ROUND, MVT::v4f32, Legal); - setOperationAction(ISD::ZERO_EXTEND, MVT::v8i32, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::v8i8, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Custom); @@ -1158,10 +1190,16 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v8i32, Custom); + setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::v4i64, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::v8i32, Custom); + setOperationAction(ISD::ZERO_EXTEND, MVT::v16i16, Custom); setOperationAction(ISD::ANY_EXTEND, MVT::v4i64, Custom); setOperationAction(ISD::ANY_EXTEND, MVT::v8i32, Custom); + setOperationAction(ISD::ANY_EXTEND, MVT::v16i16, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom); if (Subtarget->hasFMA() || Subtarget->hasFMA4()) { setOperationAction(ISD::FMA, MVT::v8f32, Legal); @@ -1262,6 +1300,152 @@ void X86TargetLowering::resetOperationActions() { } } + if (!TM.Options.UseSoftFloat && Subtarget->hasAVX512()) { + addRegisterClass(MVT::v16i32, &X86::VR512RegClass); + addRegisterClass(MVT::v16f32, &X86::VR512RegClass); + addRegisterClass(MVT::v8i64, &X86::VR512RegClass); + addRegisterClass(MVT::v8f64, &X86::VR512RegClass); + + addRegisterClass(MVT::v8i1, &X86::VK8RegClass); + addRegisterClass(MVT::v16i1, &X86::VK16RegClass); + + setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, Legal); + setOperationAction(ISD::LOAD, MVT::v16f32, Legal); + setOperationAction(ISD::LOAD, MVT::v8f64, Legal); + setOperationAction(ISD::LOAD, MVT::v8i64, Legal); + setOperationAction(ISD::LOAD, MVT::v16i32, Legal); + setOperationAction(ISD::LOAD, MVT::v16i1, Legal); + + setOperationAction(ISD::FADD, MVT::v16f32, Legal); + setOperationAction(ISD::FSUB, MVT::v16f32, Legal); + setOperationAction(ISD::FMUL, MVT::v16f32, Legal); + setOperationAction(ISD::FDIV, MVT::v16f32, Legal); + setOperationAction(ISD::FSQRT, MVT::v16f32, Legal); + setOperationAction(ISD::FNEG, MVT::v16f32, Custom); + + setOperationAction(ISD::FADD, MVT::v8f64, Legal); + setOperationAction(ISD::FSUB, MVT::v8f64, Legal); + setOperationAction(ISD::FMUL, MVT::v8f64, Legal); + setOperationAction(ISD::FDIV, MVT::v8f64, Legal); + setOperationAction(ISD::FSQRT, MVT::v8f64, Legal); + setOperationAction(ISD::FNEG, MVT::v8f64, Custom); + 
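The register classes added at the top of this AVX-512 block pair each 512-bit data type with a one-bit-per-lane mask type. A trivial standalone check of the arithmetic behind those pairings (plain C++):

// Sixteen 32-bit or eight 64-bit lanes fill one 512-bit ZMM register; the
// matching v16i1 / v8i1 mask types carry one predicate bit per lane.
static_assert(16 * 32 == 512, "v16i32 and v16f32 span a full ZMM register");
static_assert(8 * 64 == 512, "v8i64 and v8f64 span a full ZMM register");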
setOperationAction(ISD::FMA, MVT::v8f64, Legal); + setOperationAction(ISD::FMA, MVT::v16f32, Legal); + setOperationAction(ISD::SDIV, MVT::v16i32, Custom); + + setOperationAction(ISD::FP_TO_SINT, MVT::i32, Legal); + setOperationAction(ISD::FP_TO_UINT, MVT::i32, Legal); + setOperationAction(ISD::SINT_TO_FP, MVT::i32, Legal); + setOperationAction(ISD::UINT_TO_FP, MVT::i32, Legal); + if (Subtarget->is64Bit()) { + setOperationAction(ISD::FP_TO_UINT, MVT::i64, Legal); + setOperationAction(ISD::FP_TO_SINT, MVT::i64, Legal); + setOperationAction(ISD::SINT_TO_FP, MVT::i64, Legal); + setOperationAction(ISD::UINT_TO_FP, MVT::i64, Legal); + } + setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal); + setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal); + setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal); + setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal); + setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal); + setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal); + setOperationAction(ISD::FP_ROUND, MVT::v8f32, Legal); + setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Legal); + + setOperationAction(ISD::TRUNCATE, MVT::i1, Legal); + setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v8i1, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v16i1, Custom); + setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom); + setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom); + setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom); + setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom); + setOperationAction(ISD::SIGN_EXTEND, MVT::v16i8, Custom); + setOperationAction(ISD::SIGN_EXTEND, MVT::v8i16, Custom); + setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom); + + setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f64, Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i64, Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f32, Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i32, Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i1, Custom); + + setOperationAction(ISD::SETCC, MVT::v16i1, Custom); + setOperationAction(ISD::SETCC, MVT::v8i1, Custom); + + setOperationAction(ISD::MUL, MVT::v8i64, Custom); + + setOperationAction(ISD::BUILD_VECTOR, MVT::v8i1, Custom); + setOperationAction(ISD::BUILD_VECTOR, MVT::v16i1, Custom); + setOperationAction(ISD::SELECT, MVT::v8f64, Custom); + setOperationAction(ISD::SELECT, MVT::v8i64, Custom); + setOperationAction(ISD::SELECT, MVT::v16f32, Custom); + + setOperationAction(ISD::ADD, MVT::v8i64, Legal); + setOperationAction(ISD::ADD, MVT::v16i32, Legal); + + setOperationAction(ISD::SUB, MVT::v8i64, Legal); + setOperationAction(ISD::SUB, MVT::v16i32, Legal); + + setOperationAction(ISD::MUL, MVT::v16i32, Legal); + + setOperationAction(ISD::SRL, MVT::v8i64, Custom); + setOperationAction(ISD::SRL, MVT::v16i32, Custom); + + setOperationAction(ISD::SHL, MVT::v8i64, Custom); + setOperationAction(ISD::SHL, MVT::v16i32, Custom); + + setOperationAction(ISD::SRA, MVT::v8i64, Custom); + setOperationAction(ISD::SRA, MVT::v16i32, Custom); + + setOperationAction(ISD::AND, MVT::v8i64, Legal); + setOperationAction(ISD::OR, MVT::v8i64, Legal); + setOperationAction(ISD::XOR, MVT::v8i64, Legal); + setOperationAction(ISD::AND, MVT::v16i32, Legal); + setOperationAction(ISD::OR, MVT::v16i32, Legal); + setOperationAction(ISD::XOR, MVT::v16i32, Legal); + + // Custom lower several nodes. 
+ for (int i = MVT::FIRST_VECTOR_VALUETYPE; + i <= MVT::LAST_VECTOR_VALUETYPE; ++i) { + MVT VT = (MVT::SimpleValueType)i; + + unsigned EltSize = VT.getVectorElementType().getSizeInBits(); + // Extract subvector is special because the value type + // (result) is 256/128-bit but the source is 512-bit wide. + if (VT.is128BitVector() || VT.is256BitVector()) + setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); + + if (VT.getVectorElementType() == MVT::i1) + setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal); + + // Do not attempt to custom lower other non-512-bit vectors + if (!VT.is512BitVector()) + continue; + + if ( EltSize >= 32) { + setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); + setOperationAction(ISD::BUILD_VECTOR, VT, Custom); + setOperationAction(ISD::VSELECT, VT, Legal); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); + setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom); + setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); + } + } + for (int i = MVT::v32i8; i != MVT::v8i64; ++i) { + MVT VT = (MVT::SimpleValueType)i; + + // Do not attempt to promote non-256-bit vectors + if (!VT.is512BitVector()) + continue; + + setOperationAction(ISD::SELECT, VT, Promote); + AddPromotedToType (ISD::SELECT, VT, MVT::v8i64); + } + }// has AVX-512 + // SIGN_EXTEND_INREGs are evaluated by the extend type. Handle the expansion // of this type with custom code. for (int VT = MVT::FIRST_VECTOR_VALUETYPE; @@ -1273,6 +1457,7 @@ void X86TargetLowering::resetOperationActions() { // We want to custom lower some of our intrinsics. setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom); + setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't // handle type legalization for these operations here. @@ -1361,8 +1546,17 @@ void X86TargetLowering::resetOperationActions() { setPrefFunctionAlignment(4); // 2^4 bytes. } -EVT X86TargetLowering::getSetCCResultType(EVT VT) const { - if (!VT.isVector()) return MVT::i8; +EVT X86TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const { + if (!VT.isVector()) + return MVT::i8; + + const TargetMachine &TM = getTargetMachine(); + if (!TM.Options.UseSoftFloat && Subtarget->hasAVX512()) + switch(VT.getVectorNumElements()) { + case 8: return MVT::v8i1; + case 16: return MVT::v16i1; + } + return VT.changeVectorElementTypeToInteger(); } @@ -1504,9 +1698,9 @@ X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI, SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table, SelectionDAG &DAG) const { if (!Subtarget->is64Bit()) - // This doesn't have DebugLoc associated with it, but is not really the + // This doesn't have SDLoc associated with it, but is not really the // same as a Register. 
- return DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc(), getPointerTy()); + return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy()); return Table; } @@ -1571,6 +1765,13 @@ bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace, return true; } +bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS, + unsigned DestAS) const { + assert(SrcAS != DestAS && "Expected different address spaces!"); + + return SrcAS < 256 && DestAS < 256; +} + //===----------------------------------------------------------------------===// // Return Value Calling Convention Implementation //===----------------------------------------------------------------------===// @@ -1588,12 +1789,17 @@ X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv, return CCInfo.CheckReturn(Outs, RetCC_X86); } +const uint16_t *X86TargetLowering::getScratchRegisters(CallingConv::ID) const { + static const uint16_t ScratchRegs[] = { X86::R11, 0 }; + return ScratchRegs; +} + SDValue X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl<ISD::OutputArg> &Outs, const SmallVectorImpl<SDValue> &OutVals, - DebugLoc dl, SelectionDAG &DAG) const { + SDLoc dl, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); @@ -1761,7 +1967,7 @@ SDValue X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl<ISD::InputArg> &Ins, - DebugLoc dl, SelectionDAG &DAG, + SDLoc dl, SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { // Assign locations to each value returned by this call. @@ -1868,7 +2074,7 @@ argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) { static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain, ISD::ArgFlagsTy Flags, SelectionDAG &DAG, - DebugLoc dl) { + SDLoc dl) { SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32); return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(), @@ -1912,7 +2118,7 @@ SDValue X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv, const SmallVectorImpl<ISD::InputArg> &Ins, - DebugLoc dl, SelectionDAG &DAG, + SDLoc dl, SelectionDAG &DAG, const CCValAssign &VA, MachineFrameInfo *MFI, unsigned i) const { @@ -1954,7 +2160,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl<ISD::InputArg> &Ins, - DebugLoc dl, + SDLoc dl, SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { @@ -2008,12 +2214,18 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, RC = &X86::FR32RegClass; else if (RegVT == MVT::f64) RC = &X86::FR64RegClass; + else if (RegVT.is512BitVector()) + RC = &X86::VR512RegClass; else if (RegVT.is256BitVector()) RC = &X86::VR256RegClass; else if (RegVT.is128BitVector()) RC = &X86::VR128RegClass; else if (RegVT == MVT::x86mmx) RC = &X86::VR64RegClass; + else if (RegVT == MVT::v8i1) + RC = &X86::VK8RegClass; + else if (RegVT == MVT::v16i1) + RC = &X86::VK16RegClass; else llvm_unreachable("Unknown argument type!"); @@ -2230,7 +2442,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr, SDValue Arg, - DebugLoc dl, SelectionDAG &DAG, + SDLoc dl, SelectionDAG &DAG, const CCValAssign &VA, ISD::ArgFlagsTy Flags) const { unsigned LocMemOffset = VA.getLocMemOffset(); @@ -2250,7 +2462,7 @@ SDValue 
X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall, bool Is64Bit, - int FPDiff, DebugLoc dl) const { + int FPDiff, SDLoc dl) const { // Adjust the Return address stack slot. EVT VT = getPointerTy(); OutRetAddr = getReturnAddressFrameIndex(DAG); @@ -2266,12 +2478,13 @@ X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG, static SDValue EmitTailCallStoreRetAddr(SelectionDAG & DAG, MachineFunction &MF, SDValue Chain, SDValue RetAddrFrIdx, EVT PtrVT, - unsigned SlotSize, int FPDiff, DebugLoc dl) { + unsigned SlotSize, int FPDiff, SDLoc dl) { // Store the return address to the appropriate stack slot. if (!FPDiff) return Chain; // Calculate the new stack slot for the return address. int NewReturnAddrFI = - MF.getFrameInfo()->CreateFixedObject(SlotSize, FPDiff-SlotSize, false); + MF.getFrameInfo()->CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize, + false); SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT); Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx, MachinePointerInfo::getFixedStack(NewReturnAddrFI), @@ -2283,10 +2496,10 @@ SDValue X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SmallVectorImpl<SDValue> &InVals) const { SelectionDAG &DAG = CLI.DAG; - DebugLoc &dl = CLI.DL; - SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs; - SmallVector<SDValue, 32> &OutVals = CLI.OutVals; - SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins; + SDLoc &dl = CLI.DL; + SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs; + SmallVectorImpl<SDValue> &OutVals = CLI.OutVals; + SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins; SDValue Chain = CLI.Chain; SDValue Callee = CLI.Callee; CallingConv::ID CallConv = CLI.CallConv; @@ -2358,7 +2571,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, } if (!IsSibcall) - Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true)); + Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true), + dl); SDValue RetAddrFrIdx; // Load return address for tail calls. @@ -2372,6 +2586,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // Walk the register/memloc assignments, inserting copies/loads. In the case // of tail call optimization arguments are handle later. + const X86RegisterInfo *RegInfo = + static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo()); for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { CCValAssign &VA = ArgLocs[i]; EVT RegVT = VA.getLocVT(); @@ -2447,7 +2663,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // GOT pointer. if (!isTailCall) { RegsToPass.push_back(std::make_pair(unsigned(X86::EBX), - DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc(), getPointerTy()))); + DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy()))); } else { // If we are tail calling and generating PIC/GOT style code load the // address of the callee into ECX. 
The value in ecx is used as target of @@ -2644,7 +2860,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, if (!IsSibcall && isTailCall) { Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true), - DAG.getIntPtrConstant(0, true), InFlag); + DAG.getIntPtrConstant(0, true), InFlag, dl); InFlag = Chain.getValue(1); } @@ -2703,7 +2919,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, DAG.getIntPtrConstant(NumBytes, true), DAG.getIntPtrConstant(NumBytesForCalleeToPush, true), - InFlag); + InFlag, dl); InFlag = Chain.getValue(1); } @@ -2751,6 +2967,8 @@ X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize, SelectionDAG& DAG) const { MachineFunction &MF = DAG.getMachineFunction(); const TargetMachine &TM = MF.getTarget(); + const X86RegisterInfo *RegInfo = + static_cast<const X86RegisterInfo*>(TM.getRegisterInfo()); const TargetFrameLowering &TFI = *TM.getFrameLowering(); unsigned StackAlignment = TFI.getStackAlignment(); uint64_t AlignMask = StackAlignment - 1; @@ -2864,6 +3082,8 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to // emit a special epilogue. + const X86RegisterInfo *RegInfo = + static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo()); if (RegInfo->needsStackRealignment(MF)) return false; @@ -3066,7 +3286,7 @@ static bool isTargetShuffle(unsigned Opcode) { } } -static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, +static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT, SDValue V1, SelectionDAG &DAG) { switch(Opc) { default: llvm_unreachable("Unknown x86 shuffle node"); @@ -3077,7 +3297,7 @@ static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, } } -static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, +static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT, SDValue V1, unsigned TargetMask, SelectionDAG &DAG) { switch(Opc) { @@ -3091,7 +3311,7 @@ static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, } } -static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, +static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT, SDValue V1, SDValue V2, unsigned TargetMask, SelectionDAG &DAG) { switch(Opc) { @@ -3104,7 +3324,7 @@ static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, } } -static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, +static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT, SDValue V1, SDValue V2, SelectionDAG &DAG) { switch(Opc) { default: llvm_unreachable("Unknown x86 shuffle node"); @@ -3123,13 +3343,16 @@ static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); + const X86RegisterInfo *RegInfo = + static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo()); X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); int ReturnAddrIndex = FuncInfo->getRAIndex(); if (ReturnAddrIndex == 0) { // Set up a frame object for the return address. 
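The (int64_t) casts added to the frame-offset computations nearby — FPDiff minus the slot size above, and the negated slot size in the CreateFixedObject call that follows — keep the offsets from wrapping when an unsigned slot size is subtracted. A small standalone illustration of the conversion behavior being guarded against (plain C++, assuming the usual 32-bit unsigned int):

#include <cassert>
#include <cstdint>

int main() {
  unsigned SlotSize = 8; // size of the return-address slot
  int FPDiff = 0;        // no frame-size difference in this example

  // Mixed int/unsigned arithmetic is performed as unsigned, so the negative
  // offset wraps around before it ever reaches an int64_t parameter.
  int64_t wrapped = FPDiff - SlotSize;          // 4294967288 with 32-bit unsigned
  // Casting to int64_t first keeps the subtraction signed.
  int64_t correct = (int64_t)FPDiff - SlotSize; // -8

  assert(wrapped == 4294967288LL && correct == -8);
  return 0;
}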
unsigned SlotSize = RegInfo->getSlotSize(); - ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize, -SlotSize, + ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize, + -(int64_t)SlotSize, false); FuncInfo->setRAIndex(ReturnAddrIndex); } @@ -3336,7 +3559,7 @@ static bool isSequentialOrUndefInRange(ArrayRef<int> Mask, /// isPSHUFDMask - Return true if the node specifies a shuffle of elements that /// is suitable for input to PSHUFD or PSHUFW. That is, it doesn't reference /// the second operand. -static bool isPSHUFDMask(ArrayRef<int> Mask, EVT VT) { +static bool isPSHUFDMask(ArrayRef<int> Mask, MVT VT) { if (VT == MVT::v4f32 || VT == MVT::v4i32 ) return (Mask[0] < 4 && Mask[1] < 4 && Mask[2] < 4 && Mask[3] < 4); if (VT == MVT::v2f64 || VT == MVT::v2i64) @@ -3346,7 +3569,7 @@ static bool isPSHUFDMask(ArrayRef<int> Mask, EVT VT) { /// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that /// is suitable for input to PSHUFHW. -static bool isPSHUFHWMask(ArrayRef<int> Mask, EVT VT, bool HasInt256) { +static bool isPSHUFHWMask(ArrayRef<int> Mask, MVT VT, bool HasInt256) { if (VT != MVT::v8i16 && (!HasInt256 || VT != MVT::v16i16)) return false; @@ -3375,7 +3598,7 @@ static bool isPSHUFHWMask(ArrayRef<int> Mask, EVT VT, bool HasInt256) { /// isPSHUFLWMask - Return true if the node specifies a shuffle of elements that /// is suitable for input to PSHUFLW. -static bool isPSHUFLWMask(ArrayRef<int> Mask, EVT VT, bool HasInt256) { +static bool isPSHUFLWMask(ArrayRef<int> Mask, MVT VT, bool HasInt256) { if (VT != MVT::v8i16 && (!HasInt256 || VT != MVT::v16i16)) return false; @@ -3404,14 +3627,14 @@ static bool isPSHUFLWMask(ArrayRef<int> Mask, EVT VT, bool HasInt256) { /// isPALIGNRMask - Return true if the node specifies a shuffle of elements that /// is suitable for input to PALIGNR. -static bool isPALIGNRMask(ArrayRef<int> Mask, EVT VT, +static bool isPALIGNRMask(ArrayRef<int> Mask, MVT VT, const X86Subtarget *Subtarget) { if ((VT.is128BitVector() && !Subtarget->hasSSSE3()) || (VT.is256BitVector() && !Subtarget->hasInt256())) return false; unsigned NumElts = VT.getVectorNumElements(); - unsigned NumLanes = VT.getSizeInBits()/128; + unsigned NumLanes = VT.is512BitVector() ? 1: VT.getSizeInBits()/128; unsigned NumLaneElts = NumElts/NumLanes; // Do not handle 64-bit element shuffles with palignr. @@ -3494,10 +3717,7 @@ static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask, /// specifies a shuffle of elements that is suitable for input to 128/256-bit /// SHUFPS and SHUFPD. If Commuted is true, then it checks for sources to be /// reverse of what x86 shuffles want. -static bool isSHUFPMask(ArrayRef<int> Mask, EVT VT, bool HasFp256, - bool Commuted = false) { - if (!HasFp256 && VT.is256BitVector()) - return false; +static bool isSHUFPMask(ArrayRef<int> Mask, MVT VT, bool Commuted = false) { unsigned NumElems = VT.getVectorNumElements(); unsigned NumLanes = VT.getSizeInBits()/128; @@ -3506,6 +3726,10 @@ static bool isSHUFPMask(ArrayRef<int> Mask, EVT VT, bool HasFp256, if (NumLaneElems != 2 && NumLaneElems != 4) return false; + unsigned EltSize = VT.getVectorElementType().getSizeInBits(); + bool symetricMaskRequired = + (VT.getSizeInBits() >= 256) && (EltSize == 32); + // VSHUFPSY divides the resulting vector into 4 chunks. // The sources are also splitted into 4 chunks, and each destination // chunk must come from a different source chunk. 
@@ -3525,6 +3749,7 @@ static bool isSHUFPMask(ArrayRef<int> Mask, EVT VT, bool HasFp256, // // DST => Y3..Y2, X3..X2, Y1..Y0, X1..X0 // + SmallVector<int, 4> MaskVal(NumLaneElems, -1); unsigned HalfLaneElems = NumLaneElems/2; for (unsigned l = 0; l != NumElems; l += NumLaneElems) { for (unsigned i = 0; i != NumLaneElems; ++i) { @@ -3535,9 +3760,13 @@ static bool isSHUFPMask(ArrayRef<int> Mask, EVT VT, bool HasFp256, // For VSHUFPSY, the mask of the second half must be the same as the // first but with the appropriate offsets. This works in the same way as // VPERMILPS works with masks. - if (NumElems != 8 || l == 0 || Mask[i] < 0) + if (!symetricMaskRequired || Idx < 0) + continue; + if (MaskVal[i] < 0) { + MaskVal[i] = Idx - l; continue; - if (!isUndefOrEqual(Idx, Mask[i]+l)) + } + if ((signed)(Idx - l) != MaskVal[i]) return false; } } @@ -3547,7 +3776,7 @@ static bool isSHUFPMask(ArrayRef<int> Mask, EVT VT, bool HasFp256, /// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand /// specifies a shuffle of elements that is suitable for input to MOVHLPS. -static bool isMOVHLPSMask(ArrayRef<int> Mask, EVT VT) { +static bool isMOVHLPSMask(ArrayRef<int> Mask, MVT VT) { if (!VT.is128BitVector()) return false; @@ -3566,7 +3795,7 @@ static bool isMOVHLPSMask(ArrayRef<int> Mask, EVT VT) { /// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form /// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef, /// <2, 3, 2, 3> -static bool isMOVHLPS_v_undef_Mask(ArrayRef<int> Mask, EVT VT) { +static bool isMOVHLPS_v_undef_Mask(ArrayRef<int> Mask, MVT VT) { if (!VT.is128BitVector()) return false; @@ -3583,7 +3812,7 @@ static bool isMOVHLPS_v_undef_Mask(ArrayRef<int> Mask, EVT VT) { /// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand /// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}. -static bool isMOVLPMask(ArrayRef<int> Mask, EVT VT) { +static bool isMOVLPMask(ArrayRef<int> Mask, MVT VT) { if (!VT.is128BitVector()) return false; @@ -3605,7 +3834,7 @@ static bool isMOVLPMask(ArrayRef<int> Mask, EVT VT) { /// isMOVLHPSMask - Return true if the specified VECTOR_SHUFFLE operand /// specifies a shuffle of elements that is suitable for input to MOVLHPS. -static bool isMOVLHPSMask(ArrayRef<int> Mask, EVT VT) { +static bool isMOVLHPSMask(ArrayRef<int> Mask, MVT VT) { if (!VT.is128BitVector()) return false; @@ -3631,8 +3860,8 @@ static bool isMOVLHPSMask(ArrayRef<int> Mask, EVT VT) { static SDValue Compact8x32ShuffleNode(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { - MVT VT = SVOp->getValueType(0).getSimpleVT(); - DebugLoc dl = SVOp->getDebugLoc(); + MVT VT = SVOp->getSimpleValueType(0); + SDLoc dl(SVOp); if (VT != MVT::v8i32 && VT != MVT::v8f32) return SDValue(); @@ -3674,73 +3903,92 @@ SDValue Compact8x32ShuffleNode(ShuffleVectorSDNode *SVOp, /// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand /// specifies a shuffle of elements that is suitable for input to UNPCKL. 
-static bool isUNPCKLMask(ArrayRef<int> Mask, EVT VT, +static bool isUNPCKLMask(ArrayRef<int> Mask, MVT VT, bool HasInt256, bool V2IsSplat = false) { - unsigned NumElts = VT.getVectorNumElements(); - assert((VT.is128BitVector() || VT.is256BitVector()) && - "Unsupported vector type for unpckh"); + assert(VT.getSizeInBits() >= 128 && + "Unsupported vector type for unpckl"); - if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 && - (!HasInt256 || (NumElts != 16 && NumElts != 32))) + // AVX defines UNPCK* to operate independently on 128-bit lanes. + unsigned NumLanes; + unsigned NumOf256BitLanes; + unsigned NumElts = VT.getVectorNumElements(); + if (VT.is256BitVector()) { + if (NumElts != 4 && NumElts != 8 && + (!HasInt256 || (NumElts != 16 && NumElts != 32))) return false; + NumLanes = 2; + NumOf256BitLanes = 1; + } else if (VT.is512BitVector()) { + assert(VT.getScalarType().getSizeInBits() >= 32 && + "Unsupported vector type for unpckh"); + NumLanes = 2; + NumOf256BitLanes = 2; + } else { + NumLanes = 1; + NumOf256BitLanes = 1; + } - // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate - // independently on 128-bit lanes. - unsigned NumLanes = VT.getSizeInBits()/128; - unsigned NumLaneElts = NumElts/NumLanes; + unsigned NumEltsInStride = NumElts/NumOf256BitLanes; + unsigned NumLaneElts = NumEltsInStride/NumLanes; - for (unsigned l = 0; l != NumLanes; ++l) { - for (unsigned i = l*NumLaneElts, j = l*NumLaneElts; - i != (l+1)*NumLaneElts; - i += 2, ++j) { - int BitI = Mask[i]; - int BitI1 = Mask[i+1]; - if (!isUndefOrEqual(BitI, j)) - return false; - if (V2IsSplat) { - if (!isUndefOrEqual(BitI1, NumElts)) + for (unsigned l256 = 0; l256 < NumOf256BitLanes; l256 += 1) { + for (unsigned l = 0; l != NumEltsInStride; l += NumLaneElts) { + for (unsigned i = 0, j = l; i != NumLaneElts; i += 2, ++j) { + int BitI = Mask[l256*NumEltsInStride+l+i]; + int BitI1 = Mask[l256*NumEltsInStride+l+i+1]; + if (!isUndefOrEqual(BitI, j+l256*NumElts)) return false; - } else { - if (!isUndefOrEqual(BitI1, j + NumElts)) + if (V2IsSplat && !isUndefOrEqual(BitI1, NumElts)) + return false; + if (!isUndefOrEqual(BitI1, j+l256*NumElts+NumEltsInStride)) return false; } } } - return true; } /// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand /// specifies a shuffle of elements that is suitable for input to UNPCKH. -static bool isUNPCKHMask(ArrayRef<int> Mask, EVT VT, +static bool isUNPCKHMask(ArrayRef<int> Mask, MVT VT, bool HasInt256, bool V2IsSplat = false) { - unsigned NumElts = VT.getVectorNumElements(); - - assert((VT.is128BitVector() || VT.is256BitVector()) && + assert(VT.getSizeInBits() >= 128 && "Unsupported vector type for unpckh"); - if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 && - (!HasInt256 || (NumElts != 16 && NumElts != 32))) + // AVX defines UNPCK* to operate independently on 128-bit lanes. + unsigned NumLanes; + unsigned NumOf256BitLanes; + unsigned NumElts = VT.getVectorNumElements(); + if (VT.is256BitVector()) { + if (NumElts != 4 && NumElts != 8 && + (!HasInt256 || (NumElts != 16 && NumElts != 32))) return false; + NumLanes = 2; + NumOf256BitLanes = 1; + } else if (VT.is512BitVector()) { + assert(VT.getScalarType().getSizeInBits() >= 32 && + "Unsupported vector type for unpckh"); + NumLanes = 2; + NumOf256BitLanes = 2; + } else { + NumLanes = 1; + NumOf256BitLanes = 1; + } - // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate - // independently on 128-bit lanes. 
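The rewritten unpack-mask checks keep the rule that UNPCK* interleaves the low (or high) elements of its two sources within each 128-bit lane; the extra 256-bit stride they add for the new 512-bit types is not modelled here. A sketch of the mask shape accepted in the 128/256-bit case, ignoring the undef and splat special cases, in plain C++ with illustrative names:

#include <cassert>
#include <vector>

// Build the canonical "unpack low" shuffle mask: within every 128-bit lane,
// the low elements of the two sources are interleaved. Indices below NumElts
// refer to the first source, indices from NumElts upward to the second.
static std::vector<int> unpackLowMask(unsigned NumElts, unsigned EltBits) {
  std::vector<int> Mask;
  unsigned LaneElts = 128 / EltBits;
  for (unsigned Lane = 0; Lane < NumElts; Lane += LaneElts)
    for (unsigned i = 0; i != LaneElts / 2; ++i) {
      Mask.push_back(static_cast<int>(Lane + i));           // source 1
      Mask.push_back(static_cast<int>(Lane + i + NumElts)); // source 2
    }
  return Mask;
}

int main() {
  // v8i32 (256 bits, two lanes): <0,8,1,9, 4,12,5,13>.
  const std::vector<int> Expected = {0, 8, 1, 9, 4, 12, 5, 13};
  assert(unpackLowMask(8, 32) == Expected);
  return 0;
}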
- unsigned NumLanes = VT.getSizeInBits()/128; - unsigned NumLaneElts = NumElts/NumLanes; + unsigned NumEltsInStride = NumElts/NumOf256BitLanes; + unsigned NumLaneElts = NumEltsInStride/NumLanes; - for (unsigned l = 0; l != NumLanes; ++l) { - for (unsigned i = l*NumLaneElts, j = (l*NumLaneElts)+NumLaneElts/2; - i != (l+1)*NumLaneElts; i += 2, ++j) { - int BitI = Mask[i]; - int BitI1 = Mask[i+1]; - if (!isUndefOrEqual(BitI, j)) - return false; - if (V2IsSplat) { - if (isUndefOrEqual(BitI1, NumElts)) + for (unsigned l256 = 0; l256 < NumOf256BitLanes; l256 += 1) { + for (unsigned l = 0; l != NumEltsInStride; l += NumLaneElts) { + for (unsigned i = 0, j = l+NumLaneElts/2; i != NumLaneElts; i += 2, ++j) { + int BitI = Mask[l256*NumEltsInStride+l+i]; + int BitI1 = Mask[l256*NumEltsInStride+l+i+1]; + if (!isUndefOrEqual(BitI, j+l256*NumElts)) return false; - } else { - if (!isUndefOrEqual(BitI1, j+NumElts)) + if (V2IsSplat && !isUndefOrEqual(BitI1, NumElts)) + return false; + if (!isUndefOrEqual(BitI1, j+l256*NumElts+NumEltsInStride)) return false; } } @@ -3751,10 +3999,12 @@ static bool isUNPCKHMask(ArrayRef<int> Mask, EVT VT, /// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form /// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef, /// <0, 0, 1, 1> -static bool isUNPCKL_v_undef_Mask(ArrayRef<int> Mask, EVT VT, bool HasInt256) { +static bool isUNPCKL_v_undef_Mask(ArrayRef<int> Mask, MVT VT, bool HasInt256) { unsigned NumElts = VT.getVectorNumElements(); bool Is256BitVec = VT.is256BitVector(); + if (VT.is512BitVector()) + return false; assert((VT.is128BitVector() || VT.is256BitVector()) && "Unsupported vector type for unpckh"); @@ -3774,12 +4024,10 @@ static bool isUNPCKL_v_undef_Mask(ArrayRef<int> Mask, EVT VT, bool HasInt256) { unsigned NumLanes = VT.getSizeInBits()/128; unsigned NumLaneElts = NumElts/NumLanes; - for (unsigned l = 0; l != NumLanes; ++l) { - for (unsigned i = l*NumLaneElts, j = l*NumLaneElts; - i != (l+1)*NumLaneElts; - i += 2, ++j) { - int BitI = Mask[i]; - int BitI1 = Mask[i+1]; + for (unsigned l = 0; l != NumElts; l += NumLaneElts) { + for (unsigned i = 0, j = l; i != NumLaneElts; i += 2, ++j) { + int BitI = Mask[l+i]; + int BitI1 = Mask[l+i+1]; if (!isUndefOrEqual(BitI, j)) return false; @@ -3794,9 +4042,12 @@ static bool isUNPCKL_v_undef_Mask(ArrayRef<int> Mask, EVT VT, bool HasInt256) { /// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form /// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. 
vector_shuffle v, undef, /// <2, 2, 3, 3> -static bool isUNPCKH_v_undef_Mask(ArrayRef<int> Mask, EVT VT, bool HasInt256) { +static bool isUNPCKH_v_undef_Mask(ArrayRef<int> Mask, MVT VT, bool HasInt256) { unsigned NumElts = VT.getVectorNumElements(); + if (VT.is512BitVector()) + return false; + assert((VT.is128BitVector() || VT.is256BitVector()) && "Unsupported vector type for unpckh"); @@ -3809,11 +4060,10 @@ static bool isUNPCKH_v_undef_Mask(ArrayRef<int> Mask, EVT VT, bool HasInt256) { unsigned NumLanes = VT.getSizeInBits()/128; unsigned NumLaneElts = NumElts/NumLanes; - for (unsigned l = 0; l != NumLanes; ++l) { - for (unsigned i = l*NumLaneElts, j = (l*NumLaneElts)+NumLaneElts/2; - i != (l+1)*NumLaneElts; i += 2, ++j) { - int BitI = Mask[i]; - int BitI1 = Mask[i+1]; + for (unsigned l = 0; l != NumElts; l += NumLaneElts) { + for (unsigned i = 0, j = l+NumLaneElts/2; i != NumLaneElts; i += 2, ++j) { + int BitI = Mask[l+i]; + int BitI1 = Mask[l+i+1]; if (!isUndefOrEqual(BitI, j)) return false; if (!isUndefOrEqual(BitI1, j)) @@ -3850,7 +4100,7 @@ static bool isMOVLMask(ArrayRef<int> Mask, EVT VT) { /// vector_shuffle <4, 5, 6, 7, 12, 13, 14, 15> /// The first half comes from the second half of V1 and the second half from the /// the second half of V2. -static bool isVPERM2X128Mask(ArrayRef<int> Mask, EVT VT, bool HasFp256) { +static bool isVPERM2X128Mask(ArrayRef<int> Mask, MVT VT, bool HasFp256) { if (!HasFp256 || !VT.is256BitVector()) return false; @@ -3882,7 +4132,7 @@ static bool isVPERM2X128Mask(ArrayRef<int> Mask, EVT VT, bool HasFp256) { /// getShuffleVPERM2X128Immediate - Return the appropriate immediate to shuffle /// the specified VECTOR_MASK mask with VPERM2F128/VPERM2I128 instructions. static unsigned getShuffleVPERM2X128Immediate(ShuffleVectorSDNode *SVOp) { - MVT VT = SVOp->getValueType(0).getSimpleVT(); + MVT VT = SVOp->getSimpleValueType(0); unsigned HalfSize = VT.getVectorNumElements()/2; @@ -3903,6 +4153,44 @@ static unsigned getShuffleVPERM2X128Immediate(ShuffleVectorSDNode *SVOp) { return (FstHalf | (SndHalf << 4)); } +// Symetric in-lane mask. Each lane has 4 elements (for imm8) +static bool isPermImmMask(ArrayRef<int> Mask, MVT VT, unsigned& Imm8) { + unsigned EltSize = VT.getVectorElementType().getSizeInBits(); + if (EltSize < 32) + return false; + + unsigned NumElts = VT.getVectorNumElements(); + Imm8 = 0; + if (VT.is128BitVector() || (VT.is256BitVector() && EltSize == 64)) { + for (unsigned i = 0; i != NumElts; ++i) { + if (Mask[i] < 0) + continue; + Imm8 |= Mask[i] << (i*2); + } + return true; + } + + unsigned LaneSize = 4; + SmallVector<int, 4> MaskVal(LaneSize, -1); + + for (unsigned l = 0; l != NumElts; l += LaneSize) { + for (unsigned i = 0; i != LaneSize; ++i) { + if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize)) + return false; + if (Mask[i+l] < 0) + continue; + if (MaskVal[i] < 0) { + MaskVal[i] = Mask[i+l] - l; + Imm8 |= MaskVal[i] << (i*2); + continue; + } + if (Mask[i+l] != (signed)(MaskVal[i]+l)) + return false; + } + } + return true; +} + /// isVPERMILPMask - Return true if the specified VECTOR_SHUFFLE operand /// specifies a shuffle of elements that is suitable for input to VPERMILPD*. /// Note that VPERMIL mask matching is different depending whether theunderlying @@ -3910,38 +4198,39 @@ static unsigned getShuffleVPERM2X128Immediate(ShuffleVectorSDNode *SVOp) { /// to the same elements of the low, but to the higher half of the source. 
/// In VPERMILPD the two lanes could be shuffled independently of each other /// with the same restriction that lanes can't be crossed. Also handles PSHUFDY. -static bool isVPERMILPMask(ArrayRef<int> Mask, EVT VT, bool HasFp256) { - if (!HasFp256) +static bool isVPERMILPMask(ArrayRef<int> Mask, MVT VT) { + unsigned EltSize = VT.getVectorElementType().getSizeInBits(); + if (VT.getSizeInBits() < 256 || EltSize < 32) return false; - + bool symetricMaskRequired = (EltSize == 32); unsigned NumElts = VT.getVectorNumElements(); - // Only match 256-bit with 32/64-bit types - if (!VT.is256BitVector() || (NumElts != 4 && NumElts != 8)) - return false; unsigned NumLanes = VT.getSizeInBits()/128; unsigned LaneSize = NumElts/NumLanes; + // 2 or 4 elements in one lane + + SmallVector<int, 4> ExpectedMaskVal(LaneSize, -1); for (unsigned l = 0; l != NumElts; l += LaneSize) { for (unsigned i = 0; i != LaneSize; ++i) { if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize)) return false; - if (NumElts != 8 || l == 0) - continue; - // VPERMILPS handling - if (Mask[i] < 0) - continue; - if (!isUndefOrEqual(Mask[i+l], Mask[i]+l)) - return false; + if (symetricMaskRequired) { + if (ExpectedMaskVal[i] < 0 && Mask[i+l] >= 0) { + ExpectedMaskVal[i] = Mask[i+l] - l; + continue; + } + if (!isUndefOrEqual(Mask[i+l], ExpectedMaskVal[i]+l)) + return false; + } } } - return true; } /// isCommutedMOVLMask - Returns true if the shuffle mask is except the reverse /// of what x86 movss want. X86 movs requires the lowest element to be lowest /// element of vector 2 and the other elements to come from vector 1 in order. -static bool isCommutedMOVLMask(ArrayRef<int> Mask, EVT VT, +static bool isCommutedMOVLMask(ArrayRef<int> Mask, MVT VT, bool V2IsSplat = false, bool V2IsUndef = false) { if (!VT.is128BitVector()) return false; @@ -3965,7 +4254,7 @@ static bool isCommutedMOVLMask(ArrayRef<int> Mask, EVT VT, /// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand /// specifies a shuffle of elements that is suitable for input to MOVSHDUP. /// Masks to match: <1, 1, 3, 3> or <1, 1, 3, 3, 5, 5, 7, 7> -static bool isMOVSHDUPMask(ArrayRef<int> Mask, EVT VT, +static bool isMOVSHDUPMask(ArrayRef<int> Mask, MVT VT, const X86Subtarget *Subtarget) { if (!Subtarget->hasSSE3()) return false; @@ -3973,7 +4262,8 @@ static bool isMOVSHDUPMask(ArrayRef<int> Mask, EVT VT, unsigned NumElems = VT.getVectorNumElements(); if ((VT.is128BitVector() && NumElems != 4) || - (VT.is256BitVector() && NumElems != 8)) + (VT.is256BitVector() && NumElems != 8) || + (VT.is512BitVector() && NumElems != 16)) return false; // "i+1" is the value the indexed mask element must have @@ -3988,7 +4278,7 @@ static bool isMOVSHDUPMask(ArrayRef<int> Mask, EVT VT, /// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand /// specifies a shuffle of elements that is suitable for input to MOVSLDUP. 
/// Masks to match: <0, 0, 2, 2> or <0, 0, 2, 2, 4, 4, 6, 6> -static bool isMOVSLDUPMask(ArrayRef<int> Mask, EVT VT, +static bool isMOVSLDUPMask(ArrayRef<int> Mask, MVT VT, const X86Subtarget *Subtarget) { if (!Subtarget->hasSSE3()) return false; @@ -3996,7 +4286,8 @@ static bool isMOVSLDUPMask(ArrayRef<int> Mask, EVT VT, unsigned NumElems = VT.getVectorNumElements(); if ((VT.is128BitVector() && NumElems != 4) || - (VT.is256BitVector() && NumElems != 8)) + (VT.is256BitVector() && NumElems != 8) || + (VT.is512BitVector() && NumElems != 16)) return false; // "i" is the value the indexed mask element must have @@ -4011,7 +4302,7 @@ static bool isMOVSLDUPMask(ArrayRef<int> Mask, EVT VT, /// isMOVDDUPYMask - Return true if the specified VECTOR_SHUFFLE operand /// specifies a shuffle of elements that is suitable for input to 256-bit /// version of MOVDDUP. -static bool isMOVDDUPYMask(ArrayRef<int> Mask, EVT VT, bool HasFp256) { +static bool isMOVDDUPYMask(ArrayRef<int> Mask, MVT VT, bool HasFp256) { if (!HasFp256 || !VT.is256BitVector()) return false; @@ -4031,7 +4322,7 @@ static bool isMOVDDUPYMask(ArrayRef<int> Mask, EVT VT, bool HasFp256) { /// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand /// specifies a shuffle of elements that is suitable for input to 128-bit /// version of MOVDDUP. -static bool isMOVDDUPMask(ArrayRef<int> Mask, EVT VT) { +static bool isMOVDDUPMask(ArrayRef<int> Mask, MVT VT) { if (!VT.is128BitVector()) return false; @@ -4045,49 +4336,66 @@ static bool isMOVDDUPMask(ArrayRef<int> Mask, EVT VT) { return true; } -/// isVEXTRACTF128Index - Return true if the specified +/// isVEXTRACTIndex - Return true if the specified /// EXTRACT_SUBVECTOR operand specifies a vector extract that is -/// suitable for input to VEXTRACTF128. -bool X86::isVEXTRACTF128Index(SDNode *N) { +/// suitable for instruction that extract 128 or 256 bit vectors +static bool isVEXTRACTIndex(SDNode *N, unsigned vecWidth) { + assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width"); if (!isa<ConstantSDNode>(N->getOperand(1).getNode())) return false; - // The index should be aligned on a 128-bit boundary. + // The index should be aligned on a vecWidth-bit boundary. uint64_t Index = cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue(); - MVT VT = N->getValueType(0).getSimpleVT(); + MVT VT = N->getSimpleValueType(0); unsigned ElSize = VT.getVectorElementType().getSizeInBits(); - bool Result = (Index * ElSize) % 128 == 0; + bool Result = (Index * ElSize) % vecWidth == 0; return Result; } -/// isVINSERTF128Index - Return true if the specified INSERT_SUBVECTOR +/// isVINSERTIndex - Return true if the specified INSERT_SUBVECTOR /// operand specifies a subvector insert that is suitable for input to -/// VINSERTF128. -bool X86::isVINSERTF128Index(SDNode *N) { +/// insertion of 128 or 256-bit subvectors +static bool isVINSERTIndex(SDNode *N, unsigned vecWidth) { + assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width"); if (!isa<ConstantSDNode>(N->getOperand(2).getNode())) return false; - - // The index should be aligned on a 128-bit boundary. + // The index should be aligned on a vecWidth-bit boundary. 
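The generalized isVEXTRACTIndex/isVINSERTIndex checks here, and the chunk-index immediates computed from the same quantities a bit further down, reduce to two small pieces of arithmetic. A standalone sketch (plain C++, illustrative names):

#include <cassert>

// A subvector extract/insert is only suitable for these instructions when
// the starting element lies on a chunk boundary of the wider vector.
static bool isAlignedSubvectorIndex(unsigned EltIndex, unsigned EltBits,
                                    unsigned ChunkBits) {
  return (EltIndex * EltBits) % ChunkBits == 0;
}

// The instruction immediate is simply the chunk number of that element.
static unsigned subvectorImmediate(unsigned EltIndex, unsigned EltBits,
                                   unsigned ChunkBits) {
  unsigned ElemsPerChunk = ChunkBits / EltBits;
  return EltIndex / ElemsPerChunk;
}

int main() {
  // 32-bit elements, 128-bit chunks: elements 0 and 4 are valid starting
  // points, element 2 is not; element 4 selects chunk 1, the upper half of
  // a 256-bit source.
  assert(isAlignedSubvectorIndex(0, 32, 128));
  assert(!isAlignedSubvectorIndex(2, 32, 128));
  assert(isAlignedSubvectorIndex(4, 32, 128));
  assert(subvectorImmediate(4, 32, 128) == 1);
  return 0;
}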
uint64_t Index = cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue(); - MVT VT = N->getValueType(0).getSimpleVT(); + MVT VT = N->getSimpleValueType(0); unsigned ElSize = VT.getVectorElementType().getSizeInBits(); - bool Result = (Index * ElSize) % 128 == 0; + bool Result = (Index * ElSize) % vecWidth == 0; return Result; } +bool X86::isVINSERT128Index(SDNode *N) { + return isVINSERTIndex(N, 128); +} + +bool X86::isVINSERT256Index(SDNode *N) { + return isVINSERTIndex(N, 256); +} + +bool X86::isVEXTRACT128Index(SDNode *N) { + return isVEXTRACTIndex(N, 128); +} + +bool X86::isVEXTRACT256Index(SDNode *N) { + return isVEXTRACTIndex(N, 256); +} + /// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle /// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP* instructions. /// Handles 128-bit and 256-bit. static unsigned getShuffleSHUFImmediate(ShuffleVectorSDNode *N) { - MVT VT = N->getValueType(0).getSimpleVT(); + MVT VT = N->getSimpleValueType(0); - assert((VT.is128BitVector() || VT.is256BitVector()) && + assert((VT.getSizeInBits() >= 128) && "Unsupported vector type for PSHUF/SHUFP"); // Handle 128 and 256-bit vector lengths. AVX defines PSHUF/SHUFP to operate @@ -4096,10 +4404,10 @@ static unsigned getShuffleSHUFImmediate(ShuffleVectorSDNode *N) { unsigned NumLanes = VT.getSizeInBits()/128; unsigned NumLaneElts = NumElts/NumLanes; - assert((NumLaneElts == 2 || NumLaneElts == 4) && - "Only supports 2 or 4 elements per lane"); + assert((NumLaneElts == 2 || NumLaneElts == 4 || NumLaneElts == 8) && + "Only supports 2, 4 or 8 elements per lane"); - unsigned Shift = (NumLaneElts == 4) ? 1 : 0; + unsigned Shift = (NumLaneElts >= 4) ? 1 : 0; unsigned Mask = 0; for (unsigned i = 0; i != NumElts; ++i) { int Elt = N->getMaskElt(i); @@ -4115,7 +4423,7 @@ static unsigned getShuffleSHUFImmediate(ShuffleVectorSDNode *N) { /// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle /// the specified VECTOR_SHUFFLE mask with the PSHUFHW instruction. static unsigned getShufflePSHUFHWImmediate(ShuffleVectorSDNode *N) { - MVT VT = N->getValueType(0).getSimpleVT(); + MVT VT = N->getSimpleValueType(0); assert((VT == MVT::v8i16 || VT == MVT::v16i16) && "Unsupported vector type for PSHUFHW"); @@ -4139,7 +4447,7 @@ static unsigned getShufflePSHUFHWImmediate(ShuffleVectorSDNode *N) { /// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle /// the specified VECTOR_SHUFFLE mask with the PSHUFLW instruction. static unsigned getShufflePSHUFLWImmediate(ShuffleVectorSDNode *N) { - MVT VT = N->getValueType(0).getSimpleVT(); + MVT VT = N->getSimpleValueType(0); assert((VT == MVT::v8i16 || VT == MVT::v16i16) && "Unsupported vector type for PSHUFHW"); @@ -4163,11 +4471,12 @@ static unsigned getShufflePSHUFLWImmediate(ShuffleVectorSDNode *N) { /// getShufflePALIGNRImmediate - Return the appropriate immediate to shuffle /// the specified VECTOR_SHUFFLE mask with the PALIGNR instruction. static unsigned getShufflePALIGNRImmediate(ShuffleVectorSDNode *SVOp) { - MVT VT = SVOp->getValueType(0).getSimpleVT(); - unsigned EltSize = VT.getVectorElementType().getSizeInBits() >> 3; + MVT VT = SVOp->getSimpleValueType(0); + unsigned EltSize = VT.is512BitVector() ? 1 : + VT.getVectorElementType().getSizeInBits() >> 3; unsigned NumElts = VT.getVectorNumElements(); - unsigned NumLanes = VT.getSizeInBits()/128; + unsigned NumLanes = VT.is512BitVector() ? 
1 : VT.getSizeInBits()/128; unsigned NumLaneElts = NumElts/NumLanes; int Val = 0; @@ -4184,61 +4493,64 @@ static unsigned getShufflePALIGNRImmediate(ShuffleVectorSDNode *SVOp) { return (Val - i) * EltSize; } -/// getExtractVEXTRACTF128Immediate - Return the appropriate immediate -/// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF128 -/// instructions. -unsigned X86::getExtractVEXTRACTF128Immediate(SDNode *N) { +static unsigned getExtractVEXTRACTImmediate(SDNode *N, unsigned vecWidth) { + assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width"); if (!isa<ConstantSDNode>(N->getOperand(1).getNode())) - llvm_unreachable("Illegal extract subvector for VEXTRACTF128"); + llvm_unreachable("Illegal extract subvector for VEXTRACT"); uint64_t Index = cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue(); - MVT VecVT = N->getOperand(0).getValueType().getSimpleVT(); + MVT VecVT = N->getOperand(0).getSimpleValueType(); MVT ElVT = VecVT.getVectorElementType(); - unsigned NumElemsPerChunk = 128 / ElVT.getSizeInBits(); + unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits(); return Index / NumElemsPerChunk; } -/// getInsertVINSERTF128Immediate - Return the appropriate immediate -/// to insert at the specified INSERT_SUBVECTOR index with VINSERTF128 -/// instructions. -unsigned X86::getInsertVINSERTF128Immediate(SDNode *N) { +static unsigned getInsertVINSERTImmediate(SDNode *N, unsigned vecWidth) { + assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width"); if (!isa<ConstantSDNode>(N->getOperand(2).getNode())) - llvm_unreachable("Illegal insert subvector for VINSERTF128"); + llvm_unreachable("Illegal insert subvector for VINSERT"); uint64_t Index = cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue(); - MVT VecVT = N->getValueType(0).getSimpleVT(); + MVT VecVT = N->getSimpleValueType(0); MVT ElVT = VecVT.getVectorElementType(); - unsigned NumElemsPerChunk = 128 / ElVT.getSizeInBits(); + unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits(); return Index / NumElemsPerChunk; } -/// getShuffleCLImmediate - Return the appropriate immediate to shuffle -/// the specified VECTOR_SHUFFLE mask with VPERMQ and VPERMPD instructions. -/// Handles 256-bit. -static unsigned getShuffleCLImmediate(ShuffleVectorSDNode *N) { - MVT VT = N->getValueType(0).getSimpleVT(); - - unsigned NumElts = VT.getVectorNumElements(); +/// getExtractVEXTRACT128Immediate - Return the appropriate immediate +/// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF128 +/// and VINSERTI128 instructions. +unsigned X86::getExtractVEXTRACT128Immediate(SDNode *N) { + return getExtractVEXTRACTImmediate(N, 128); +} - assert((VT.is256BitVector() && NumElts == 4) && - "Unsupported vector type for VPERMQ/VPERMPD"); +/// getExtractVEXTRACT256Immediate - Return the appropriate immediate +/// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF64x4 +/// and VINSERTI64x4 instructions. +unsigned X86::getExtractVEXTRACT256Immediate(SDNode *N) { + return getExtractVEXTRACTImmediate(N, 256); +} - unsigned Mask = 0; - for (unsigned i = 0; i != NumElts; ++i) { - int Elt = N->getMaskElt(i); - if (Elt < 0) - continue; - Mask |= Elt << (i*2); - } +/// getInsertVINSERT128Immediate - Return the appropriate immediate +/// to insert at the specified INSERT_SUBVECTOR index with VINSERTF128 +/// and VINSERTI128 instructions. 
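Once the index is known to be aligned, getExtractVEXTRACTImmediate and getInsertVINSERTImmediate above encode it simply as the ordinal of the 128- or 256-bit chunk containing the element. A hedged standalone model of that computation (names are illustrative):

#include <cstdint>

// The VEXTRACT*/VINSERT* immediate is the number of the chunk holding the
// element: elementIndex / (chunkBits / elementBits).
static unsigned subvectorImmediate(uint64_t eltIndex, unsigned eltBits,
                                   unsigned chunkBits) {
  unsigned eltsPerChunk = chunkBits / eltBits;
  return eltIndex / eltsPerChunk;
}

// Extracting the upper half of a v8i32 (element index 4, 32-bit elements,
// 128-bit chunks) yields immediate 1; with 256-bit chunks on a v16i32,
// element index 8 also yields 1.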
+unsigned X86::getInsertVINSERT128Immediate(SDNode *N) { + return getInsertVINSERTImmediate(N, 128); +} - return Mask; +/// getInsertVINSERT256Immediate - Return the appropriate immediate +/// to insert at the specified INSERT_SUBVECTOR index with VINSERTF46x4 +/// and VINSERTI64x4 instructions. +unsigned X86::getInsertVINSERT256Immediate(SDNode *N) { + return getInsertVINSERTImmediate(N, 256); } + /// isZeroNode - Returns true if Elt is a constant zero or a floating point /// constant +0.0. bool X86::isZeroNode(SDValue Elt) { @@ -4253,7 +4565,7 @@ bool X86::isZeroNode(SDValue Elt) { /// their permute mask. static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { - MVT VT = SVOp->getValueType(0).getSimpleVT(); + MVT VT = SVOp->getSimpleValueType(0); unsigned NumElems = VT.getVectorNumElements(); SmallVector<int, 8> MaskVec; @@ -4267,7 +4579,7 @@ static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp, } MaskVec.push_back(Idx); } - return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(1), + return DAG.getVectorShuffle(VT, SDLoc(SVOp), SVOp->getOperand(1), SVOp->getOperand(0), &MaskVec[0]); } @@ -4275,7 +4587,7 @@ static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp, /// match movhlps. The lower half elements should come from upper half of /// V1 (and in order), and the upper half elements should come from the upper /// half of V2 (and in order). -static bool ShouldXformToMOVHLPS(ArrayRef<int> Mask, EVT VT) { +static bool ShouldXformToMOVHLPS(ArrayRef<int> Mask, MVT VT) { if (!VT.is128BitVector()) return false; if (VT.getVectorNumElements() != 4) @@ -4332,7 +4644,7 @@ static bool WillBeConstantPoolLoad(SDNode *N) { /// half of V2 (and in order). And since V1 will become the source of the /// MOVLP, it must be either a vector load or a scalar load to vector. static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2, - ArrayRef<int> Mask, EVT VT) { + ArrayRef<int> Mask, MVT VT) { if (!VT.is128BitVector()) return false; @@ -4400,7 +4712,7 @@ static bool isZeroShuffle(ShuffleVectorSDNode *N) { /// getZeroVector - Returns a vector of specified type with all zero elements. /// static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget, - SelectionDAG &DAG, DebugLoc dl) { + SelectionDAG &DAG, SDLoc dl) { assert(VT.isVector() && "Expected a vector type"); // Always build SSE zero vectors as <4 x i32> bitcasted @@ -4428,6 +4740,11 @@ static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget, Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops, array_lengthof(Ops)); } + } else if (VT.is512BitVector()) { // AVX-512 + SDValue Cst = DAG.getTargetConstant(0, MVT::i32); + SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, + Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; + Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i32, Ops, 16); } else llvm_unreachable("Unexpected vector type"); @@ -4439,7 +4756,7 @@ static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget, /// no AVX2 supprt, use two <4 x i32> inserted in a <8 x i32> appropriately. /// Then bitcast to their original type, ensuring they get CSE'd. static SDValue getOnesVector(MVT VT, bool HasInt256, SelectionDAG &DAG, - DebugLoc dl) { + SDLoc dl) { assert(VT.isVector() && "Expected a vector type"); SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32); @@ -4473,7 +4790,7 @@ static void NormalizeMask(SmallVectorImpl<int> &Mask, unsigned NumElems) { /// getMOVLMask - Returns a vector_shuffle mask for an movs{s|d}, movd /// operation of specified width. 
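CommuteVectorShuffle above swaps the two shuffle sources, so every defined mask entry has to be re-pointed at the other operand. A small sketch of just the mask transformation, assuming -1 means undef (the helper name is invented for illustration):

#include <vector>

// Swap which source each mask entry refers to: entries in [0, N) move to
// [N, 2N) and vice versa; undef (-1) entries are preserved.
static std::vector<int> commuteShuffleMask(const std::vector<int> &mask,
                                           int numElems) {
  std::vector<int> out;
  out.reserve(mask.size());
  for (int idx : mask) {
    if (idx < 0)
      out.push_back(idx);
    else if (idx < numElems)
      out.push_back(idx + numElems);
    else
      out.push_back(idx - numElems);
  }
  return out;
}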
-static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, +static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1, SDValue V2) { unsigned NumElems = VT.getVectorNumElements(); SmallVector<int, 8> Mask; @@ -4484,7 +4801,7 @@ static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, } /// getUnpackl - Returns a vector_shuffle node for an unpackl operation. -static SDValue getUnpackl(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, +static SDValue getUnpackl(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1, SDValue V2) { unsigned NumElems = VT.getVectorNumElements(); SmallVector<int, 8> Mask; @@ -4496,7 +4813,7 @@ static SDValue getUnpackl(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, } /// getUnpackh - Returns a vector_shuffle node for an unpackh operation. -static SDValue getUnpackh(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, +static SDValue getUnpackh(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1, SDValue V2) { unsigned NumElems = VT.getVectorNumElements(); SmallVector<int, 8> Mask; @@ -4512,9 +4829,9 @@ static SDValue getUnpackh(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, // Generate shuffles which repeat i16 and i8 several times until they can be // represented by v4f32 and then be manipulated by target suported shuffles. static SDValue PromoteSplati8i16(SDValue V, SelectionDAG &DAG, int &EltNo) { - EVT VT = V.getValueType(); + MVT VT = V.getSimpleValueType(); int NumElems = VT.getVectorNumElements(); - DebugLoc dl = V.getDebugLoc(); + SDLoc dl(V); while (NumElems > 4) { if (EltNo < NumElems/2) { @@ -4530,8 +4847,8 @@ static SDValue PromoteSplati8i16(SDValue V, SelectionDAG &DAG, int &EltNo) { /// getLegalSplat - Generate a legal splat with supported x86 shuffles static SDValue getLegalSplat(SelectionDAG &DAG, SDValue V, int EltNo) { - EVT VT = V.getValueType(); - DebugLoc dl = V.getDebugLoc(); + MVT VT = V.getSimpleValueType(); + SDLoc dl(V); if (VT.is128BitVector()) { V = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V); @@ -4556,9 +4873,9 @@ static SDValue getLegalSplat(SelectionDAG &DAG, SDValue V, int EltNo) { /// PromoteSplat - Splat is promoted to target supported vector shuffles. static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) { - EVT SrcVT = SV->getValueType(0); + MVT SrcVT = SV->getSimpleValueType(0); SDValue V1 = SV->getOperand(0); - DebugLoc dl = SV->getDebugLoc(); + SDLoc dl(SV); int EltNo = SV->getSplatIndex(); int NumElems = SrcVT.getVectorNumElements(); @@ -4579,7 +4896,7 @@ static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) { // instruction because the target has no such instruction. Generate shuffles // which repeat i16 and i8 several times until they fit in i32, and then can // be manipulated by target suported shuffles. - EVT EltVT = SrcVT.getVectorElementType(); + MVT EltVT = SrcVT.getVectorElementType(); if (EltVT == MVT::i8 || EltVT == MVT::i16) V1 = PromoteSplati8i16(V1, DAG, EltNo); @@ -4601,15 +4918,15 @@ static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx, bool IsZero, const X86Subtarget *Subtarget, SelectionDAG &DAG) { - EVT VT = V2.getValueType(); + MVT VT = V2.getSimpleValueType(); SDValue V1 = IsZero - ? getZeroVector(VT, Subtarget, DAG, V2.getDebugLoc()) : DAG.getUNDEF(VT); + ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT); unsigned NumElems = VT.getVectorNumElements(); SmallVector<int, 16> MaskVec; for (unsigned i = 0; i != NumElems; ++i) // If this is the insertion idx, put the low elt of V2 here. 
MaskVec.push_back(i == Idx ? NumElems : i); - return DAG.getVectorShuffle(VT, V2.getDebugLoc(), V1, V2, &MaskVec[0]); + return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, &MaskVec[0]); } /// getTargetShuffleMask - Calculates the shuffle mask corresponding to the @@ -4719,7 +5036,7 @@ static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG, // Recurse into target specific vector shuffles to find scalars. if (isTargetShuffle(Opcode)) { - MVT ShufVT = V.getValueType().getSimpleVT(); + MVT ShufVT = V.getSimpleValueType(); unsigned NumElems = ShufVT.getVectorNumElements(); SmallVector<int, 16> ShuffleMask; bool IsUnary; @@ -4760,19 +5077,27 @@ static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG, /// getNumOfConsecutiveZeros - Return the number of elements of a vector /// shuffle operation which come from a consecutively from a zero. The /// search can start in two different directions, from left or right. -static -unsigned getNumOfConsecutiveZeros(ShuffleVectorSDNode *SVOp, unsigned NumElems, - bool ZerosFromLeft, SelectionDAG &DAG) { - unsigned i; - for (i = 0; i != NumElems; ++i) { - unsigned Index = ZerosFromLeft ? i : NumElems-i-1; +/// We count undefs as zeros until PreferredNum is reached. +static unsigned getNumOfConsecutiveZeros(ShuffleVectorSDNode *SVOp, + unsigned NumElems, bool ZerosFromLeft, + SelectionDAG &DAG, + unsigned PreferredNum = -1U) { + unsigned NumZeros = 0; + for (unsigned i = 0; i != NumElems; ++i) { + unsigned Index = ZerosFromLeft ? i : NumElems - i - 1; SDValue Elt = getShuffleScalarElt(SVOp, Index, DAG, 0); - if (!(Elt.getNode() && - (Elt.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elt)))) + if (!Elt.getNode()) + break; + + if (X86::isZeroNode(Elt)) + ++NumZeros; + else if (Elt.getOpcode() == ISD::UNDEF) // Undef as zero up to PreferredNum. + NumZeros = std::min(NumZeros + 1, PreferredNum); + else break; } - return i; + return NumZeros; } /// isShuffleMaskConsecutive - Check if the shuffle mask indicies [MaskI, MaskE) @@ -4809,9 +5134,11 @@ bool isShuffleMaskConsecutive(ShuffleVectorSDNode *SVOp, /// logical left shift of a vector. static bool isVectorShiftRight(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG, bool &isLeft, SDValue &ShVal, unsigned &ShAmt) { - unsigned NumElems = SVOp->getValueType(0).getVectorNumElements(); - unsigned NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems, - false /* check zeros from right */, DAG); + unsigned NumElems = + SVOp->getSimpleValueType(0).getVectorNumElements(); + unsigned NumZeros = getNumOfConsecutiveZeros( + SVOp, NumElems, false /* check zeros from right */, DAG, + SVOp->getMaskElt(0)); unsigned OpSrc; if (!NumZeros) @@ -4842,9 +5169,11 @@ static bool isVectorShiftRight(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG, /// logical left shift of a vector. 
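The reworked getNumOfConsecutiveZeros above counts zero elements from either end of the shuffle result and now treats undef elements as zero only up to the count the caller would prefer, so a run of undefs can no longer inflate the inferred shift amount. A standalone model of that counting rule, using 0 for known zero, 1 for known nonzero, -1 for undef (names illustrative):

#include <algorithm>
#include <vector>

static unsigned countLeadingZeroElts(const std::vector<int> &elts,
                                     unsigned preferredNum) {
  unsigned numZeros = 0;
  for (int e : elts) {
    if (e == 0)
      ++numZeros;                                      // known zero
    else if (e == -1)
      numZeros = std::min(numZeros + 1, preferredNum); // undef: capped
    else
      break;                                           // first known nonzero
  }
  return numZeros;
}

// With elts = {-1, -1, 0, 1} and preferredNum = 1 this returns 2 (one capped
// undef plus one real zero); unconditionally counting undefs as zero, as the
// removed code did, would have returned 3.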
static bool isVectorShiftLeft(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG, bool &isLeft, SDValue &ShVal, unsigned &ShAmt) { - unsigned NumElems = SVOp->getValueType(0).getVectorNumElements(); - unsigned NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems, - true /* check zeros from left */, DAG); + unsigned NumElems = + SVOp->getSimpleValueType(0).getVectorNumElements(); + unsigned NumZeros = getNumOfConsecutiveZeros( + SVOp, NumElems, true /* check zeros from left */, DAG, + NumElems - SVOp->getMaskElt(NumElems - 1) - 1); unsigned OpSrc; if (!NumZeros) @@ -4877,7 +5206,7 @@ static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG, bool &isLeft, SDValue &ShVal, unsigned &ShAmt) { // Although the logic below support any bitwidth size, there are no // shift instructions which handle more than 128-bit vectors. - if (!SVOp->getValueType(0).is128BitVector()) + if (!SVOp->getSimpleValueType(0).is128BitVector()) return false; if (isVectorShiftLeft(SVOp, DAG, isLeft, ShVal, ShAmt) || @@ -4897,7 +5226,7 @@ static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros, if (NumNonZero > 8) return SDValue(); - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); SDValue V(0, 0); bool First = true; for (unsigned i = 0; i < 16; ++i) { @@ -4945,7 +5274,7 @@ static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros, if (NumNonZero > 4) return SDValue(); - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); SDValue V(0, 0); bool First = true; for (unsigned i = 0; i < 8; ++i) { @@ -4971,7 +5300,7 @@ static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros, /// static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits, SelectionDAG &DAG, - const TargetLowering &TLI, DebugLoc dl) { + const TargetLowering &TLI, SDLoc dl) { assert(VT.is128BitVector() && "Unknown type for VShift"); EVT ShVT = MVT::v2i64; unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ; @@ -4982,9 +5311,8 @@ static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, TLI.getScalarShiftAmountTy(SrcOp.getValueType())))); } -SDValue -X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl, - SelectionDAG &DAG) const { +static SDValue +LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, SDLoc dl, SelectionDAG &DAG) { // Check if the scalar load can be widened into a vector load. And if // the address is "base + cst" see if the cst can be "absorbed" into @@ -5036,7 +5364,7 @@ X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl, return SDValue(); int64_t StartOffset = Offset & ~(RequiredAlign-1); if (StartOffset) - Ptr = DAG.getNode(ISD::ADD, Ptr.getDebugLoc(), Ptr.getValueType(), + Ptr = DAG.getNode(ISD::ADD, SDLoc(Ptr), Ptr.getValueType(), Ptr,DAG.getConstant(StartOffset, Ptr.getValueType())); int EltNo = (Offset - StartOffset) >> 2; @@ -5067,7 +5395,8 @@ X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl, /// rather than undef via VZEXT_LOAD, but we do not detect that case today. /// There's even a handy isZeroNode for that purpose. static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts, - DebugLoc &DL, SelectionDAG &DAG) { + SDLoc &DL, SelectionDAG &DAG, + bool isAfterLegalize) { EVT EltVT = VT.getVectorElementType(); unsigned NumElems = Elts.size(); @@ -5103,15 +5432,33 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts, // load of the entire vector width starting at the base pointer. If we found // consecutive loads for the low half, generate a vzext_load node. 
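EltsFromConsecutiveLoads, continued below, only fires when every defined element is a load taken from consecutive addresses off one base pointer. A rough offset-level model of that consecutiveness test; the real code queries the SelectionDAG for actual load nodes, so everything here is illustrative:

#include <cstdint>
#include <vector>

// offsets[i] < 0 marks an undef/missing element; every defined element must
// sit at base + i * eltBytes for the combine to emit one wide load.
static bool areConsecutiveLoadOffsets(const std::vector<int64_t> &offsets,
                                      int64_t eltBytes) {
  int64_t base = -1;
  for (size_t i = 0; i != offsets.size(); ++i) {
    if (offsets[i] < 0)
      continue;                            // undef element, no constraint
    if (base < 0)
      base = offsets[i] - (int64_t)i * eltBytes;
    else if (offsets[i] != base + (int64_t)i * eltBytes)
      return false;
  }
  return base >= 0;                        // at least one real load was seen
}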
if (LastLoadedElt == NumElems - 1) { + + if (isAfterLegalize && + !DAG.getTargetLoweringInfo().isOperationLegal(ISD::LOAD, VT)) + return SDValue(); + + SDValue NewLd = SDValue(); + if (DAG.InferPtrAlignment(LDBase->getBasePtr()) >= 16) - return DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(), - LDBase->getPointerInfo(), - LDBase->isVolatile(), LDBase->isNonTemporal(), - LDBase->isInvariant(), 0); - return DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(), - LDBase->getPointerInfo(), - LDBase->isVolatile(), LDBase->isNonTemporal(), - LDBase->isInvariant(), LDBase->getAlignment()); + NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(), + LDBase->getPointerInfo(), + LDBase->isVolatile(), LDBase->isNonTemporal(), + LDBase->isInvariant(), 0); + NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(), + LDBase->getPointerInfo(), + LDBase->isVolatile(), LDBase->isNonTemporal(), + LDBase->isInvariant(), LDBase->getAlignment()); + + if (LDBase->hasAnyUseOfValue(1)) { + SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, + SDValue(LDBase, 1), + SDValue(NewLd.getNode(), 1)); + DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain); + DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1), + SDValue(NewLd.getNode(), 1)); + } + + return NewLd; } if (NumElems == 4 && LastLoadedElt == 1 && DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) { @@ -5148,15 +5495,15 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts, /// a scalar load, or a constant. /// The VBROADCAST node is returned when a pattern is found, /// or SDValue() otherwise. -SDValue -X86TargetLowering::LowerVectorBroadcast(SDValue Op, SelectionDAG &DAG) const { +static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget, + SelectionDAG &DAG) { if (!Subtarget->hasFp256()) return SDValue(); - MVT VT = Op.getValueType().getSimpleVT(); - DebugLoc dl = Op.getDebugLoc(); + MVT VT = Op.getSimpleValueType(); + SDLoc dl(Op); - assert((VT.is128BitVector() || VT.is256BitVector()) && + assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) && "Unsupported vector type for broadcast."); SDValue Ld; @@ -5200,7 +5547,7 @@ X86TargetLowering::LowerVectorBroadcast(SDValue Op, SelectionDAG &DAG) const { return SDValue(); // Use the register form of the broadcast instruction available on AVX2. - if (VT.is256BitVector()) + if (VT.getSizeInBits() >= 256) Sc = Extract128BitVector(Sc, 0, DAG, dl); return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Sc); } @@ -5212,13 +5559,18 @@ X86TargetLowering::LowerVectorBroadcast(SDValue Op, SelectionDAG &DAG) const { // The scalar_to_vector node and the suspected // load node must have exactly one user. // Constants may have multiple users. - if (!ConstSplatVal && (!Sc.hasOneUse() || !Ld.hasOneUse())) + + // AVX-512 has register version of the broadcast + bool hasRegVer = Subtarget->hasAVX512() && VT.is512BitVector() && + Ld.getValueType().getSizeInBits() >= 32; + if (!ConstSplatVal && ((!Sc.hasOneUse() || !Ld.hasOneUse()) && + !hasRegVer)) return SDValue(); break; } } - bool Is256 = VT.is256BitVector(); + bool IsGE256 = (VT.getSizeInBits() >= 256); // Handle the broadcasting a single constant scalar from the constant pool // into a vector. 
On Sandybridge it is still better to load a constant vector @@ -5228,7 +5580,7 @@ X86TargetLowering::LowerVectorBroadcast(SDValue Op, SelectionDAG &DAG) const { assert(!CVT.isVector() && "Must not broadcast a vector type"); unsigned ScalarSize = CVT.getSizeInBits(); - if (ScalarSize == 32 || (Is256 && ScalarSize == 64)) { + if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)) { const Constant *C = 0; if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld)) C = CI->getConstantIntValue(); @@ -5237,7 +5589,8 @@ X86TargetLowering::LowerVectorBroadcast(SDValue Op, SelectionDAG &DAG) const { assert(C && "Invalid constant type"); - SDValue CP = DAG.getConstantPool(C, getPointerTy()); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy()); unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment(); Ld = DAG.getLoad(CVT, dl, DAG.getEntryNode(), CP, MachinePointerInfo::getConstantPool(), @@ -5252,14 +5605,14 @@ X86TargetLowering::LowerVectorBroadcast(SDValue Op, SelectionDAG &DAG) const { // Handle AVX2 in-register broadcasts. if (!IsLoad && Subtarget->hasInt256() && - (ScalarSize == 32 || (Is256 && ScalarSize == 64))) + (ScalarSize == 32 || (IsGE256 && ScalarSize == 64))) return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); // The scalar source must be a normal load. if (!IsLoad) return SDValue(); - if (ScalarSize == 32 || (Is256 && ScalarSize == 64)) + if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)) return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); // The integer check is needed for the 64-bit into 128-bit so it doesn't match @@ -5273,15 +5626,15 @@ X86TargetLowering::LowerVectorBroadcast(SDValue Op, SelectionDAG &DAG) const { return SDValue(); } -SDValue -X86TargetLowering::buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) const { - EVT VT = Op.getValueType(); +static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) { + MVT VT = Op.getSimpleValueType(); // Skip if insert_vec_elt is not supported. - if (!isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT)) + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT)) return SDValue(); - DebugLoc DL = Op.getDebugLoc(); + SDLoc DL(Op); unsigned NumElems = Op.getNumOperands(); SDValue VecIn1; @@ -5347,19 +5700,128 @@ X86TargetLowering::buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) const { return NV; } +// Lower BUILD_VECTOR operation for v8i1 and v16i1 types. 
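LowerBUILD_VECTORvXi1, added below, handles the all-constant case by packing the i1 elements into a single immediate and bitcasting that to the mask type. The packing step in isolation, with names invented for the sketch:

#include <cstdint>
#include <vector>

// Each defined element contributes one bit at its index; undef (-1) elements
// contribute nothing, matching the "Immediate |= (1ULL << idx)" loop below.
static uint64_t packConstantMaskBits(const std::vector<int> &elts) {
  uint64_t imm = 0;
  for (size_t idx = 0; idx != elts.size(); ++idx)
    if (elts[idx] > 0)                 // element is the constant 'true'
      imm |= (1ULL << idx);
  return imm;
}

// A v8i1 build_vector of <1,0,1,1,0,0,0,1> packs to 0b10001101 = 0x8D, which
// the lowering then materializes as a 16-bit constant and bitcasts into the
// predicate type.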
+SDValue +X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const { + + MVT VT = Op.getSimpleValueType(); + assert((VT.getVectorElementType() == MVT::i1) && (VT.getSizeInBits() <= 16) && + "Unexpected type in LowerBUILD_VECTORvXi1!"); + + SDLoc dl(Op); + if (ISD::isBuildVectorAllZeros(Op.getNode())) { + SDValue Cst = DAG.getTargetConstant(0, MVT::i1); + SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, + Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; + return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, + Ops, VT.getVectorNumElements()); + } + + if (ISD::isBuildVectorAllOnes(Op.getNode())) { + SDValue Cst = DAG.getTargetConstant(1, MVT::i1); + SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, + Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; + return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, + Ops, VT.getVectorNumElements()); + } + + bool AllContants = true; + uint64_t Immediate = 0; + for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) { + SDValue In = Op.getOperand(idx); + if (In.getOpcode() == ISD::UNDEF) + continue; + if (!isa<ConstantSDNode>(In)) { + AllContants = false; + break; + } + if (cast<ConstantSDNode>(In)->getZExtValue()) + Immediate |= (1ULL << idx); + } + + if (AllContants) { + SDValue FullMask = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1, + DAG.getConstant(Immediate, MVT::i16)); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, FullMask, + DAG.getIntPtrConstant(0)); + } + + // Splat vector (with undefs) + SDValue In = Op.getOperand(0); + for (unsigned i = 1, e = Op.getNumOperands(); i != e; ++i) { + if (Op.getOperand(i) != In && Op.getOperand(i).getOpcode() != ISD::UNDEF) + llvm_unreachable("Unsupported predicate operation"); + } + + SDValue EFLAGS, X86CC; + if (In.getOpcode() == ISD::SETCC) { + SDValue Op0 = In.getOperand(0); + SDValue Op1 = In.getOperand(1); + ISD::CondCode CC = cast<CondCodeSDNode>(In.getOperand(2))->get(); + bool isFP = Op1.getValueType().isFloatingPoint(); + unsigned X86CCVal = TranslateX86CC(CC, isFP, Op0, Op1, DAG); + + assert(X86CCVal != X86::COND_INVALID && "Unsupported predicate operation"); + + X86CC = DAG.getConstant(X86CCVal, MVT::i8); + EFLAGS = EmitCmp(Op0, Op1, X86CCVal, DAG); + EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG); + } else if (In.getOpcode() == X86ISD::SETCC) { + X86CC = In.getOperand(0); + EFLAGS = In.getOperand(1); + } else { + // The algorithm: + // Bit1 = In & 0x1 + // if (Bit1 != 0) + // ZF = 0 + // else + // ZF = 1 + // if (ZF == 0) + // res = allOnes ### CMOVNE -1, %res + // else + // res = allZero + MVT InVT = In.getSimpleValueType(); + SDValue Bit1 = DAG.getNode(ISD::AND, dl, InVT, In, DAG.getConstant(1, InVT)); + EFLAGS = EmitTest(Bit1, X86::COND_NE, DAG); + X86CC = DAG.getConstant(X86::COND_NE, MVT::i8); + } + + if (VT == MVT::v16i1) { + SDValue Cst1 = DAG.getConstant(-1, MVT::i16); + SDValue Cst0 = DAG.getConstant(0, MVT::i16); + SDValue CmovOp = DAG.getNode(X86ISD::CMOV, dl, MVT::i16, + Cst0, Cst1, X86CC, EFLAGS); + return DAG.getNode(ISD::BITCAST, dl, VT, CmovOp); + } + + if (VT == MVT::v8i1) { + SDValue Cst1 = DAG.getConstant(-1, MVT::i32); + SDValue Cst0 = DAG.getConstant(0, MVT::i32); + SDValue CmovOp = DAG.getNode(X86ISD::CMOV, dl, MVT::i32, + Cst0, Cst1, X86CC, EFLAGS); + CmovOp = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, CmovOp); + return DAG.getNode(ISD::BITCAST, dl, VT, CmovOp); + } + llvm_unreachable("Unsupported predicate operation"); +} + SDValue X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); - MVT VT 
= Op.getValueType().getSimpleVT(); + MVT VT = Op.getSimpleValueType(); MVT ExtVT = VT.getVectorElementType(); unsigned NumElems = Op.getNumOperands(); + // Generate vectors for predicate vectors. + if (VT.getScalarType() == MVT::i1 && Subtarget->hasAVX512()) + return LowerBUILD_VECTORvXi1(Op, DAG); + // Vectors containing all zeros can be matched by pxor and xorps later if (ISD::isBuildVectorAllZeros(Op.getNode())) { // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd // and 2) ensure that i64 scalars are eliminated on x86-32 hosts. - if (VT == MVT::v4i32 || VT == MVT::v8i32) + if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) return Op; return getZeroVector(VT, Subtarget, DAG, dl); @@ -5372,10 +5834,11 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { if (VT == MVT::v4i32 || (VT == MVT::v8i32 && Subtarget->hasInt256())) return Op; - return getOnesVector(VT, Subtarget->hasInt256(), DAG, dl); + if (!VT.is512BitVector()) + return getOnesVector(VT, Subtarget->hasInt256(), DAG, dl); } - SDValue Broadcast = LowerVectorBroadcast(Op, DAG); + SDValue Broadcast = LowerVectorBroadcast(Op, Subtarget, DAG); if (Broadcast.getNode()) return Broadcast; @@ -5408,7 +5871,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { // Special case for single non-zero, non-undef, element. if (NumNonZero == 1) { - unsigned Idx = CountTrailingZeros_32(NonZeros); + unsigned Idx = countTrailingZeros(NonZeros); SDValue Item = Op.getOperand(Idx); // If this is an insertion of an i64 value on x86-32, and if the top bits of @@ -5454,7 +5917,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 || (ExtVT == MVT::i64 && Subtarget->is64Bit())) { - if (VT.is256BitVector()) { + if (VT.is256BitVector() || VT.is512BitVector()) { SDValue ZeroVec = getZeroVector(VT, Subtarget, DAG, dl); return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, ZeroVec, Item, DAG.getIntPtrConstant(0)); @@ -5517,7 +5980,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0> // Check if it's possible to issue this instead. // shuffle (vload ptr)), undef, <1, 1, 1, 1> - unsigned Idx = CountTrailingZeros_32(NonZeros); + unsigned Idx = countTrailingZeros(NonZeros); SDValue Item = Op.getOperand(Idx); if (Op.getNode()->isOnlyUserOf(Item.getNode())) return LowerAsSplatVectorLoad(Item, VT, dl, DAG); @@ -5552,7 +6015,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { if (EVTBits == 64) { if (NumNonZero == 1) { // One half is zero or undef. - unsigned Idx = CountTrailingZeros_32(NonZeros); + unsigned Idx = countTrailingZeros(NonZeros); SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(Idx)); return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG); @@ -5619,7 +6082,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { V[i] = Op.getOperand(i); // Check for elements which are consecutive loads. - SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG); + SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG, false); if (LD.getNode()) return LD; @@ -5682,22 +6145,25 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { // LowerAVXCONCAT_VECTORS - 256-bit AVX can use the vinsertf128 instruction // to create 256-bit vectors from two other 128-bit ones. 
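The BUILD_VECTOR lowering above records which operands are nonzero in the NonZeros bitmask, so "the single nonzero element" is simply the lowest set bit; the patch swaps CountTrailingZeros_32 for the newer countTrailingZeros to find it. A standalone illustration of the idiom using a portable loop instead of the LLVM helper:

#include <cstdint>

// Index of the lowest set bit; returns 32 for an empty mask, which is what a
// count-trailing-zeros helper reports for a zero 32-bit input.
static unsigned lowestSetBit(uint32_t nonZeros) {
  unsigned idx = 0;
  while (idx < 32 && !(nonZeros & (1u << idx)))
    ++idx;
  return idx;
}

// If only operand 5 of the build_vector is nonzero, nonZeros == 1u << 5 and
// lowestSetBit returns 5 -- the element the single-insert path works on.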
static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) { - DebugLoc dl = Op.getDebugLoc(); - MVT ResVT = Op.getValueType().getSimpleVT(); + SDLoc dl(Op); + MVT ResVT = Op.getSimpleValueType(); - assert(ResVT.is256BitVector() && "Value type must be 256-bit wide"); + assert((ResVT.is256BitVector() || + ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide"); SDValue V1 = Op.getOperand(0); SDValue V2 = Op.getOperand(1); unsigned NumElems = ResVT.getVectorNumElements(); + if(ResVT.is256BitVector()) + return Concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl); - return Concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl); + return Concat256BitVectors(V1, V2, ResVT, NumElems, DAG, dl); } static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) { assert(Op.getNumOperands() == 2); - // 256-bit AVX can use the vinsertf128 instruction to create 256-bit vectors + // AVX/AVX-512 can use the vinsertf128 instruction to create 256-bit vectors // from two other 128-bit ones. return LowerAVXCONCAT_VECTORS(Op, DAG); } @@ -5708,11 +6174,15 @@ LowerVECTOR_SHUFFLEtoBlend(ShuffleVectorSDNode *SVOp, const X86Subtarget *Subtarget, SelectionDAG &DAG) { SDValue V1 = SVOp->getOperand(0); SDValue V2 = SVOp->getOperand(1); - DebugLoc dl = SVOp->getDebugLoc(); - MVT VT = SVOp->getValueType(0).getSimpleVT(); + SDLoc dl(SVOp); + MVT VT = SVOp->getSimpleValueType(0); MVT EltVT = VT.getVectorElementType(); unsigned NumElems = VT.getVectorNumElements(); + // There is no blend with immediate in AVX-512. + if (VT.is512BitVector()) + return SDValue(); + if (!Subtarget->hasSSE41() || EltVT == MVT::i8) return SDValue(); if (!Subtarget->hasInt256() && VT == MVT::v16i16) @@ -5769,7 +6239,7 @@ LowerVECTOR_SHUFFLEv8i16(SDValue Op, const X86Subtarget *Subtarget, ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); SDValue V1 = SVOp->getOperand(0); SDValue V2 = SVOp->getOperand(1); - DebugLoc dl = SVOp->getDebugLoc(); + SDLoc dl(SVOp); SmallVector<int, 8> MaskVals; // Determine if more than 1 of the words in each of the low and high quadwords @@ -6018,13 +6488,13 @@ LowerVECTOR_SHUFFLEv8i16(SDValue Op, const X86Subtarget *Subtarget, // 1. [ssse3] 1 x pshufb // 2. [ssse3] 2 x pshufb + 1 x por // 3. [all] v8i16 shuffle + N x pextrw + rotate + pinsrw -static -SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp, - SelectionDAG &DAG, - const X86TargetLowering &TLI) { +static SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp, + const X86Subtarget* Subtarget, + SelectionDAG &DAG) { + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); SDValue V1 = SVOp->getOperand(0); SDValue V2 = SVOp->getOperand(1); - DebugLoc dl = SVOp->getDebugLoc(); + SDLoc dl(SVOp); ArrayRef<int> MaskVals = SVOp->getMask(); // Promote splats to a larger type which usually leads to more efficient code. @@ -6037,7 +6507,7 @@ SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp, // present, fall back to case 3. // If SSSE3, use 1 pshufb instruction per vector with elements in the result. 
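The SSSE3 strategy named in the comment above builds a PSHUFB control vector directly from the byte shuffle mask: each control byte selects a source byte by index, and setting bit 7 (0x80) forces that output byte to zero, which is how lanes owned by the other source are blanked before the two PSHUFB results are OR'd together. A sketch of the control-mask construction for one source (illustrative only):

#include <cstdint>
#include <vector>

// Control bytes that pull the lanes coming from the first source of a
// 16-byte shuffle; lanes owned by the second source (mask >= 16) and undefs
// (mask < 0) are zeroed via 0x80.
static std::vector<uint8_t> pshufbMaskForV1(const std::vector<int> &mask) {
  std::vector<uint8_t> ctrl(16, 0x80);
  for (size_t i = 0; i != 16 && i != mask.size(); ++i)
    if (mask[i] >= 0 && mask[i] < 16)
      ctrl[i] = (uint8_t)mask[i];
  return ctrl;
}

// The analogous mask for the second source uses (mask[i] - 16) where
// mask[i] >= 16; a por of the two pshufb results yields the full shuffle.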
- if (TLI.getSubtarget()->hasSSSE3()) { + if (Subtarget->hasSSSE3()) { SmallVector<SDValue,16> pshufbMask; // If all result elements are from one input vector, then only translate @@ -6150,10 +6620,10 @@ static SDValue LowerVECTOR_SHUFFLEv32i8(ShuffleVectorSDNode *SVOp, const X86Subtarget *Subtarget, SelectionDAG &DAG) { - MVT VT = SVOp->getValueType(0).getSimpleVT(); + MVT VT = SVOp->getSimpleValueType(0); SDValue V1 = SVOp->getOperand(0); SDValue V2 = SVOp->getOperand(1); - DebugLoc dl = SVOp->getDebugLoc(); + SDLoc dl(SVOp); SmallVector<int, 32> MaskVals(SVOp->getMask().begin(), SVOp->getMask().end()); bool V2IsUndef = V2.getOpcode() == ISD::UNDEF; @@ -6198,8 +6668,8 @@ SDValue LowerVECTOR_SHUFFLEv32i8(ShuffleVectorSDNode *SVOp, static SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { - MVT VT = SVOp->getValueType(0).getSimpleVT(); - DebugLoc dl = SVOp->getDebugLoc(); + MVT VT = SVOp->getSimpleValueType(0); + SDLoc dl(SVOp); unsigned NumElems = VT.getVectorNumElements(); MVT NewVT; unsigned Scale; @@ -6235,9 +6705,9 @@ SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp, /// getVZextMovL - Return a zero-extending vector move low node. /// -static SDValue getVZextMovL(MVT VT, EVT OpVT, +static SDValue getVZextMovL(MVT VT, MVT OpVT, SDValue SrcOp, SelectionDAG &DAG, - const X86Subtarget *Subtarget, DebugLoc dl) { + const X86Subtarget *Subtarget, SDLoc dl) { if (VT == MVT::v2f64 || VT == MVT::v4f32) { LoadSDNode *LD = NULL; if (!isScalarLoadToVector(SrcOp.getNode(), &LD)) @@ -6277,12 +6747,12 @@ LowerVECTOR_SHUFFLE_256(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { if (NewOp.getNode()) return NewOp; - MVT VT = SVOp->getValueType(0).getSimpleVT(); + MVT VT = SVOp->getSimpleValueType(0); unsigned NumElems = VT.getVectorNumElements(); unsigned NumLaneElems = NumElems / 2; - DebugLoc dl = SVOp->getDebugLoc(); + SDLoc dl(SVOp); MVT EltVT = VT.getVectorElementType(); MVT NVT = MVT::getVectorVT(EltVT, NumLaneElems); SDValue Output[2]; @@ -6388,8 +6858,8 @@ static SDValue LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { SDValue V1 = SVOp->getOperand(0); SDValue V2 = SVOp->getOperand(1); - DebugLoc dl = SVOp->getDebugLoc(); - MVT VT = SVOp->getValueType(0).getSimpleVT(); + SDLoc dl(SVOp); + MVT VT = SVOp->getSimpleValueType(0); assert(VT.is128BitVector() && "Unsupported vector size"); @@ -6539,8 +7009,8 @@ static bool MayFoldVectorLoad(SDValue V) { } static -SDValue getMOVDDup(SDValue &Op, DebugLoc &dl, SDValue V1, SelectionDAG &DAG) { - EVT VT = Op.getValueType(); +SDValue getMOVDDup(SDValue &Op, SDLoc &dl, SDValue V1, SelectionDAG &DAG) { + MVT VT = Op.getSimpleValueType(); // Canonizalize to v2f64. 
V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, V1); @@ -6550,11 +7020,11 @@ SDValue getMOVDDup(SDValue &Op, DebugLoc &dl, SDValue V1, SelectionDAG &DAG) { } static -SDValue getMOVLowToHigh(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, +SDValue getMOVLowToHigh(SDValue &Op, SDLoc &dl, SelectionDAG &DAG, bool HasSSE2) { SDValue V1 = Op.getOperand(0); SDValue V2 = Op.getOperand(1); - EVT VT = Op.getValueType(); + MVT VT = Op.getSimpleValueType(); assert(VT != MVT::v2i64 && "unsupported shuffle type"); @@ -6569,10 +7039,10 @@ SDValue getMOVLowToHigh(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, } static -SDValue getMOVHighToLow(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG) { +SDValue getMOVHighToLow(SDValue &Op, SDLoc &dl, SelectionDAG &DAG) { SDValue V1 = Op.getOperand(0); SDValue V2 = Op.getOperand(1); - EVT VT = Op.getValueType(); + MVT VT = Op.getSimpleValueType(); assert((VT == MVT::v4i32 || VT == MVT::v4f32) && "unsupported shuffle type"); @@ -6585,10 +7055,10 @@ SDValue getMOVHighToLow(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG) { } static -SDValue getMOVLP(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, bool HasSSE2) { +SDValue getMOVLP(SDValue &Op, SDLoc &dl, SelectionDAG &DAG, bool HasSSE2) { SDValue V1 = Op.getOperand(0); SDValue V2 = Op.getOperand(1); - EVT VT = Op.getValueType(); + MVT VT = Op.getSimpleValueType(); unsigned NumElems = VT.getVectorNumElements(); // Use MOVLPS and MOVLPD in case V1 or V2 are loads. During isel, the second @@ -6642,20 +7112,20 @@ SDValue getMOVLP(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, bool HasSSE2) { } // Reduce a vector shuffle to zext. -SDValue -X86TargetLowering::LowerVectorIntExtend(SDValue Op, SelectionDAG &DAG) const { +static SDValue LowerVectorIntExtend(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { // PMOVZX is only available from SSE41. if (!Subtarget->hasSSE41()) return SDValue(); - EVT VT = Op.getValueType(); + MVT VT = Op.getSimpleValueType(); // Only AVX2 support 256-bit vector integer extending. if (!Subtarget->hasInt256() && VT.is256BitVector()) return SDValue(); ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); - DebugLoc DL = Op.getDebugLoc(); + SDLoc DL(Op); SDValue V1 = Op.getOperand(0); SDValue V2 = Op.getOperand(1); unsigned NumElems = VT.getVectorNumElements(); @@ -6687,12 +7157,11 @@ X86TargetLowering::LowerVectorIntExtend(SDValue Op, SelectionDAG &DAG) const { return SDValue(); } - LLVMContext *Context = DAG.getContext(); unsigned NBits = VT.getVectorElementType().getSizeInBits() << Shift; - EVT NeVT = EVT::getIntegerVT(*Context, NBits); - EVT NVT = EVT::getVectorVT(*Context, NeVT, NumElems >> Shift); + MVT NeVT = MVT::getIntegerVT(NBits); + MVT NVT = MVT::getVectorVT(NeVT, NumElems >> Shift); - if (!isTypeLegal(NVT)) + if (!DAG.getTargetLoweringInfo().isTypeLegal(NVT)) return SDValue(); // Simplify the operand as it's prepared to be fed into shuffle. 
@@ -6700,8 +7169,8 @@ X86TargetLowering::LowerVectorIntExtend(SDValue Op, SelectionDAG &DAG) const { if (V1.getOpcode() == ISD::BITCAST && V1.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR && V1.getOperand(0).getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT && - V1.getOperand(0) - .getOperand(0).getValueType().getSizeInBits() == SignificantBits) { + V1.getOperand(0).getOperand(0) + .getSimpleValueType().getSizeInBits() == SignificantBits) { // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x) SDValue V = V1.getOperand(0).getOperand(0).getOperand(0); ConstantSDNode *CIdx = @@ -6710,19 +7179,19 @@ X86TargetLowering::LowerVectorIntExtend(SDValue Op, SelectionDAG &DAG) const { // selection to fold it. Otherwise, we will short the conversion sequence. if (CIdx && CIdx->getZExtValue() == 0 && (!ISD::isNormalLoad(V.getNode()) || !V.hasOneUse())) { - if (V.getValueSizeInBits() > V1.getValueSizeInBits()) { + MVT FullVT = V.getSimpleValueType(); + MVT V1VT = V1.getSimpleValueType(); + if (FullVT.getSizeInBits() > V1VT.getSizeInBits()) { // The "ext_vec_elt" node is wider than the result node. // In this case we should extract subvector from V. // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast (extract_subvector x)). - unsigned Ratio = V.getValueSizeInBits() / V1.getValueSizeInBits(); - EVT FullVT = V.getValueType(); - EVT SubVecVT = EVT::getVectorVT(*Context, - FullVT.getVectorElementType(), + unsigned Ratio = FullVT.getSizeInBits() / V1VT.getSizeInBits(); + MVT SubVecVT = MVT::getVectorVT(FullVT.getVectorElementType(), FullVT.getVectorNumElements()/Ratio); - V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVecVT, V, + V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVecVT, V, DAG.getIntPtrConstant(0)); } - V1 = DAG.getNode(ISD::BITCAST, DL, V1.getValueType(), V); + V1 = DAG.getNode(ISD::BITCAST, DL, V1VT, V); } } @@ -6730,11 +7199,12 @@ X86TargetLowering::LowerVectorIntExtend(SDValue Op, SelectionDAG &DAG) const { DAG.getNode(X86ISD::VZEXT, DL, NVT, V1)); } -SDValue -X86TargetLowering::NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG) const { +static SDValue +NormalizeVectorShuffle(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); - MVT VT = Op.getValueType().getSimpleVT(); - DebugLoc dl = Op.getDebugLoc(); + MVT VT = Op.getSimpleValueType(); + SDLoc dl(Op); SDValue V1 = Op.getOperand(0); SDValue V2 = Op.getOperand(1); @@ -6744,13 +7214,13 @@ X86TargetLowering::NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG) const { // Handle splat operations if (SVOp->isSplat()) { // Use vbroadcast whenever the splat comes from a foldable load - SDValue Broadcast = LowerVectorBroadcast(Op, DAG); + SDValue Broadcast = LowerVectorBroadcast(Op, Subtarget, DAG); if (Broadcast.getNode()) return Broadcast; } // Check integer expanding shuffles. 
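LowerVectorIntExtend, called right after this comment, recognizes shuffles that behave like a zero extension: every ratio-th result element carries the next source element in order and everything in between is zero or undef, so the whole shuffle can become a single VZEXT (PMOVZX). A hedged standalone recognizer for that mask shape; the legality checks in the real code are richer:

#include <vector>

// True if 'mask' (over 2*numElems inputs: V1 elements 0..numElems-1, then an
// all-zeros V2) spreads V1's leading elements 'ratio' positions apart with
// zero/undef padding, i.e. it is a zero-extend by 'ratio'.
static bool isZeroExtendMask(const std::vector<int> &mask, int numElems,
                             int ratio) {
  for (int i = 0; i != numElems; ++i) {
    int m = mask[i];
    if (i % ratio == 0) {
      if (m != i / ratio)              // must be the next V1 element, in order
        return false;
    } else if (m >= 0 && m < numElems) {
      return false;                    // padding lane must be zero (V2) or undef
    }
  }
  return true;
}

// A v8i16 zero-extend to v4i32 corresponds to ratio == 2 and a mask like
// <0, 8, 1, 9, 2, 10, 3, 11> when V2 is the zero vector.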
- SDValue NewOp = LowerVectorIntExtend(Op, DAG); + SDValue NewOp = LowerVectorIntExtend(Op, Subtarget, DAG); if (NewOp.getNode()) return NewOp; @@ -6768,7 +7238,7 @@ X86TargetLowering::NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG) const { if (ISD::isBuildVectorAllZeros(V2.getNode())) { SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG); if (NewOp.getNode()) { - MVT NewVT = NewOp.getValueType().getSimpleVT(); + MVT NewVT = NewOp.getSimpleValueType(); if (isCommutedMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(), NewVT, true, false)) return getVZextMovL(VT, NewVT, NewOp.getOperand(0), @@ -6777,7 +7247,7 @@ X86TargetLowering::NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG) const { } else if (ISD::isBuildVectorAllZeros(V1.getNode())) { SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG); if (NewOp.getNode()) { - MVT NewVT = NewOp.getValueType().getSimpleVT(); + MVT NewVT = NewOp.getSimpleValueType(); if (isMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(), NewVT)) return getVZextMovL(VT, NewVT, NewOp.getOperand(1), DAG, Subtarget, dl); @@ -6792,8 +7262,8 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); SDValue V1 = Op.getOperand(0); SDValue V2 = Op.getOperand(1); - MVT VT = Op.getValueType().getSimpleVT(); - DebugLoc dl = Op.getDebugLoc(); + MVT VT = Op.getSimpleValueType(); + SDLoc dl(Op); unsigned NumElems = VT.getVectorNumElements(); bool V1IsUndef = V1.getOpcode() == ISD::UNDEF; bool V2IsUndef = V2.getOpcode() == ISD::UNDEF; @@ -6830,7 +7300,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { // Normalize the input vectors. Here splats, zeroed vectors, profitable // narrowing and commutation of operands should be handled. The actual code // doesn't include all of those, work in progress... - SDValue NewOp = NormalizeVectorShuffle(Op, DAG); + SDValue NewOp = NormalizeVectorShuffle(Op, Subtarget, DAG); if (NewOp.getNode()) return NewOp; @@ -6875,6 +7345,11 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { TargetMask, DAG); } + if (isPALIGNRMask(M, VT, Subtarget)) + return getTargetShuffleNode(X86ISD::PALIGNR, dl, VT, V1, V2, + getShufflePALIGNRImmediate(SVOp), + DAG); + // Check if this can be converted into a logical shift. bool isLeft = false; unsigned ShAmt = 0; @@ -6985,18 +7460,13 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { } // Normalize the node to match x86 shuffle ops if needed - if (!V2IsUndef && (isSHUFPMask(M, VT, HasFp256, /* Commuted */ true))) + if (!V2IsUndef && (isSHUFPMask(M, VT, /* Commuted */ true))) return CommuteVectorShuffle(SVOp, DAG); // The checks below are all present in isShuffleMaskLegal, but they are // inlined here right now to enable us to directly emit target specific // nodes, and remove one by one until they don't return Op anymore. 
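The PALIGNR handling that this hunk hoists earlier in LowerVECTOR_SHUFFLE pairs isPALIGNRMask with getShufflePALIGNRImmediate: when the mask selects one contiguous window out of the concatenated sources, the immediate is just the window's byte offset. A simplified standalone model for a single 128-bit lane; the real code also copes with undef runs and, after this patch, 512-bit types:

#include <vector>

// Returns the PALIGNR byte immediate if every defined mask element i equals
// rot + i for a single rotation 'rot', or -1 if the mask is not such a window.
static int palignrImmediate(const std::vector<int> &mask, int eltBytes) {
  int rot = -1;
  for (int i = 0, e = (int)mask.size(); i != e; ++i) {
    if (mask[i] < 0)
      continue;                        // undef imposes no constraint
    int r = mask[i] - i;
    if (r < 0)
      return -1;
    if (rot < 0)
      rot = r;
    else if (r != rot)
      return -1;
  }
  return rot < 0 ? 0 : rot * eltBytes;
}

// For v4i32 with mask <1, 2, 3, 4> the rotation is one element, i.e. an
// immediate of 4 bytes.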
- if (isPALIGNRMask(M, VT, Subtarget)) - return getTargetShuffleNode(X86ISD::PALIGNR, dl, VT, V1, V2, - getShufflePALIGNRImmediate(SVOp), - DAG); - if (ShuffleVectorSDNode::isSplatMask(&M[0], VT) && SVOp->getSplatIndex() == 0 && V2IsUndef) { if (VT == MVT::v2f64 || VT == MVT::v2i64) @@ -7013,7 +7483,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { getShufflePSHUFLWImmediate(SVOp), DAG); - if (isSHUFPMask(M, VT, HasFp256)) + if (isSHUFPMask(M, VT)) return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V2, getShuffleSHUFImmediate(SVOp), DAG); @@ -7032,8 +7502,8 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { return getTargetShuffleNode(X86ISD::MOVDDUP, dl, VT, V1, DAG); // Handle VPERMILPS/D* permutations - if (isVPERMILPMask(M, VT, HasFp256)) { - if (HasInt256 && VT == MVT::v8i32) + if (isVPERMILPMask(M, VT)) { + if ((HasInt256 && VT == MVT::v8i32) || VT == MVT::v16i32) return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1, getShuffleSHUFImmediate(SVOp), DAG); return getTargetShuffleNode(X86ISD::VPERMILP, dl, VT, V1, @@ -7049,21 +7519,28 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { if (BlendOp.getNode()) return BlendOp; - if (V2IsUndef && HasInt256 && (VT == MVT::v8i32 || VT == MVT::v8f32)) { - SmallVector<SDValue, 8> permclMask; - for (unsigned i = 0; i != 8; ++i) { - permclMask.push_back(DAG.getConstant((M[i]>=0) ? M[i] : 0, MVT::i32)); + unsigned Imm8; + if (V2IsUndef && HasInt256 && isPermImmMask(M, VT, Imm8)) + return getTargetShuffleNode(X86ISD::VPERMI, dl, VT, V1, Imm8, DAG); + + if ((V2IsUndef && HasInt256 && VT.is256BitVector() && NumElems == 8) || + VT.is512BitVector()) { + MVT MaskEltVT = MVT::getIntegerVT(VT.getVectorElementType().getSizeInBits()); + MVT MaskVectorVT = MVT::getVectorVT(MaskEltVT, NumElems); + SmallVector<SDValue, 16> permclMask; + for (unsigned i = 0; i != NumElems; ++i) { + permclMask.push_back(DAG.getConstant((M[i]>=0) ? 
M[i] : 0, MaskEltVT)); } - SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, - &permclMask[0], 8); - // Bitcast is for VPERMPS since mask is v8i32 but node takes v8f32 - return DAG.getNode(X86ISD::VPERMV, dl, VT, - DAG.getNode(ISD::BITCAST, dl, VT, Mask), V1); - } - if (V2IsUndef && HasInt256 && (VT == MVT::v4i64 || VT == MVT::v4f64)) - return getTargetShuffleNode(X86ISD::VPERMI, dl, VT, V1, - getShuffleCLImmediate(SVOp), DAG); + SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, MaskVectorVT, + &permclMask[0], NumElems); + if (V2IsUndef) + // Bitcast is for VPERMPS since mask is v8i32 but node takes v8f32 + return DAG.getNode(X86ISD::VPERMV, dl, VT, + DAG.getNode(ISD::BITCAST, dl, VT, Mask), V1); + return DAG.getNode(X86ISD::VPERMV3, dl, VT, + DAG.getNode(ISD::BITCAST, dl, VT, Mask), V1, V2); + } //===--------------------------------------------------------------------===// // Since no target specific shuffle was selected for this generic one, @@ -7079,7 +7556,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { } if (VT == MVT::v16i8) { - SDValue NewOp = LowerVECTOR_SHUFFLEv16i8(SVOp, DAG, *this); + SDValue NewOp = LowerVECTOR_SHUFFLEv16i8(SVOp, Subtarget, DAG); if (NewOp.getNode()) return NewOp; } @@ -7103,10 +7580,10 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { } static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) { - MVT VT = Op.getValueType().getSimpleVT(); - DebugLoc dl = Op.getDebugLoc(); + MVT VT = Op.getSimpleValueType(); + SDLoc dl(Op); - if (!Op.getOperand(0).getValueType().getSimpleVT().is128BitVector()) + if (!Op.getOperand(0).getSimpleValueType().is128BitVector()) return SDValue(); if (VT.getSizeInBits() == 8) { @@ -7167,25 +7644,45 @@ static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) { SDValue X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { - if (!isa<ConstantSDNode>(Op.getOperand(1))) - return SDValue(); - + SDLoc dl(Op); SDValue Vec = Op.getOperand(0); - MVT VecVT = Vec.getValueType().getSimpleVT(); + MVT VecVT = Vec.getSimpleValueType(); + SDValue Idx = Op.getOperand(1); + if (!isa<ConstantSDNode>(Idx)) { + if (VecVT.is512BitVector() || + (VecVT.is256BitVector() && Subtarget->hasInt256() && + VecVT.getVectorElementType().getSizeInBits() == 32)) { + + MVT MaskEltVT = + MVT::getIntegerVT(VecVT.getVectorElementType().getSizeInBits()); + MVT MaskVT = MVT::getVectorVT(MaskEltVT, VecVT.getSizeInBits() / + MaskEltVT.getSizeInBits()); + + Idx = DAG.getZExtOrTrunc(Idx, dl, MaskEltVT); + SDValue Mask = DAG.getNode(X86ISD::VINSERT, dl, MaskVT, + getZeroVector(MaskVT, Subtarget, DAG, dl), + Idx, DAG.getConstant(0, getPointerTy())); + SDValue Perm = DAG.getNode(X86ISD::VPERMV, dl, VecVT, Mask, Vec); + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), + Perm, DAG.getConstant(0, getPointerTy())); + } + return SDValue(); + } // If this is a 256-bit vector result, first extract the 128-bit vector and // then extract the element from the 128-bit vector. - if (VecVT.is256BitVector()) { - DebugLoc dl = Op.getNode()->getDebugLoc(); - unsigned NumElems = VecVT.getVectorNumElements(); - SDValue Idx = Op.getOperand(1); - unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); + if (VecVT.is256BitVector() || VecVT.is512BitVector()) { + unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); // Get the 128-bit vector. 
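For a constant index into a 256- or 512-bit vector, the code that follows first pulls out the 128-bit chunk holding the element and then reduces the index to be relative to that chunk; the replaced "IdxVal -= NumElems/2" just below only handled the two-chunk case. The arithmetic in isolation (names illustrative):

// Split a flat element index into (chunk number, index within the chunk)
// for 128-bit chunks of 'eltBits'-wide elements.
static void splitIndexBy128(unsigned idx, unsigned eltBits,
                            unsigned &chunk, unsigned &idxInChunk) {
  unsigned eltsPerChunk = 128 / eltBits;
  chunk = idx / eltsPerChunk;
  idxInChunk = idx - chunk * eltsPerChunk;   // same as idx % eltsPerChunk
}

// Element 11 of a v16i32 lives in chunk 2 at in-chunk index 3, so the
// lowering extracts 128-bit subvector 2 and then element 3 of it.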
Vec = Extract128BitVector(Vec, IdxVal, DAG, dl); + MVT EltVT = VecVT.getVectorElementType(); - if (IdxVal >= NumElems/2) - IdxVal -= NumElems/2; + unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits(); + + //if (IdxVal >= NumElems/2) + // IdxVal -= NumElems/2; + IdxVal -= (IdxVal/ElemsPerChunk)*ElemsPerChunk; return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec, DAG.getConstant(IdxVal, MVT::i32)); } @@ -7198,8 +7695,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, return Res; } - MVT VT = Op.getValueType().getSimpleVT(); - DebugLoc dl = Op.getDebugLoc(); + MVT VT = Op.getSimpleValueType(); // TODO: handle v16i8. if (VT.getSizeInBits() == 16) { SDValue Vec = Op.getOperand(0); @@ -7226,7 +7722,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, // SHUFPS the element to the lowest double word, then movss. int Mask[4] = { static_cast<int>(Idx), -1, -1, -1 }; - MVT VVT = Op.getOperand(0).getValueType().getSimpleVT(); + MVT VVT = Op.getOperand(0).getSimpleValueType(); SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0), DAG.getUNDEF(VVT), Mask); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec, @@ -7245,7 +7741,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, // Note if the lower 64 bits of the result of the UNPCKHPD is then stored // to a f64mem, the whole operation is folded into a single MOVHPDmr. int Mask[2] = { 1, -1 }; - MVT VVT = Op.getOperand(0).getValueType().getSimpleVT(); + MVT VVT = Op.getOperand(0).getSimpleValueType(); SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0), DAG.getUNDEF(VVT), Mask); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec, @@ -7256,9 +7752,9 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, } static SDValue LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) { - MVT VT = Op.getValueType().getSimpleVT(); + MVT VT = Op.getSimpleValueType(); MVT EltVT = VT.getVectorElementType(); - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); SDValue N0 = Op.getOperand(0); SDValue N1 = Op.getOperand(1); @@ -7310,29 +7806,30 @@ static SDValue LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) { SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { - MVT VT = Op.getValueType().getSimpleVT(); + MVT VT = Op.getSimpleValueType(); MVT EltVT = VT.getVectorElementType(); - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); SDValue N0 = Op.getOperand(0); SDValue N1 = Op.getOperand(1); SDValue N2 = Op.getOperand(2); // If this is a 256-bit vector result, first extract the 128-bit vector, // insert the element into the extracted half and then place it back. - if (VT.is256BitVector()) { + if (VT.is256BitVector() || VT.is512BitVector()) { if (!isa<ConstantSDNode>(N2)) return SDValue(); // Get the desired 128-bit vector half. - unsigned NumElems = VT.getVectorNumElements(); unsigned IdxVal = cast<ConstantSDNode>(N2)->getZExtValue(); SDValue V = Extract128BitVector(N0, IdxVal, DAG, dl); // Insert the element into the desired half. - bool Upper = IdxVal >= NumElems/2; + unsigned NumEltsIn128 = 128/EltVT.getSizeInBits(); + unsigned IdxIn128 = IdxVal - (IdxVal/NumEltsIn128) * NumEltsIn128; + V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1, - DAG.getConstant(Upper ? 
IdxVal-NumElems/2 : IdxVal, MVT::i32)); + DAG.getConstant(IdxIn128, MVT::i32)); // Insert the changed part back to the 256-bit vector return Insert128BitVector(N0, V, IdxVal, DAG, dl); @@ -7357,17 +7854,16 @@ X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { } static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) { - LLVMContext *Context = DAG.getContext(); - DebugLoc dl = Op.getDebugLoc(); - MVT OpVT = Op.getValueType().getSimpleVT(); + SDLoc dl(Op); + MVT OpVT = Op.getSimpleValueType(); // If this is a 256-bit vector result, first insert into a 128-bit // vector and then insert into the 256-bit vector. if (!OpVT.is128BitVector()) { // Insert into a 128-bit vector. - EVT VT128 = EVT::getVectorVT(*Context, - OpVT.getVectorElementType(), - OpVT.getVectorNumElements() / 2); + unsigned SizeFactor = OpVT.getSizeInBits()/128; + MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(), + OpVT.getVectorNumElements() / SizeFactor); Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0)); @@ -7390,16 +7886,22 @@ static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) { // upper bits of a vector. static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { - if (Subtarget->hasFp256()) { - DebugLoc dl = Op.getNode()->getDebugLoc(); - SDValue Vec = Op.getNode()->getOperand(0); - SDValue Idx = Op.getNode()->getOperand(1); + SDLoc dl(Op); + SDValue In = Op.getOperand(0); + SDValue Idx = Op.getOperand(1); + unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); + MVT ResVT = Op.getSimpleValueType(); + MVT InVT = In.getSimpleValueType(); - if (Op.getNode()->getValueType(0).is128BitVector() && - Vec.getNode()->getValueType(0).is256BitVector() && + if (Subtarget->hasFp256()) { + if (ResVT.is128BitVector() && + (InVT.is256BitVector() || InVT.is512BitVector()) && isa<ConstantSDNode>(Idx)) { - unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); - return Extract128BitVector(Vec, IdxVal, DAG, dl); + return Extract128BitVector(In, IdxVal, DAG, dl); + } + if (ResVT.is256BitVector() && InVT.is512BitVector() && + isa<ConstantSDNode>(Idx)) { + return Extract256BitVector(In, IdxVal, DAG, dl); } } return SDValue(); @@ -7411,17 +7913,25 @@ static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget, static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { if (Subtarget->hasFp256()) { - DebugLoc dl = Op.getNode()->getDebugLoc(); + SDLoc dl(Op.getNode()); SDValue Vec = Op.getNode()->getOperand(0); SDValue SubVec = Op.getNode()->getOperand(1); SDValue Idx = Op.getNode()->getOperand(2); - if (Op.getNode()->getValueType(0).is256BitVector() && - SubVec.getNode()->getValueType(0).is128BitVector() && + if ((Op.getNode()->getSimpleValueType(0).is256BitVector() || + Op.getNode()->getSimpleValueType(0).is512BitVector()) && + SubVec.getNode()->getSimpleValueType(0).is128BitVector() && isa<ConstantSDNode>(Idx)) { unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); return Insert128BitVector(Vec, SubVec, IdxVal, DAG, dl); } + + if (Op.getNode()->getSimpleValueType(0).is512BitVector() && + SubVec.getNode()->getSimpleValueType(0).is256BitVector() && + isa<ConstantSDNode>(Idx)) { + unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); + return Insert256BitVector(Vec, SubVec, IdxVal, DAG, dl); + } } return SDValue(); } @@ -7453,13 +7963,13 @@ X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const { SDValue Result = 
DAG.getTargetConstantPool(CP->getConstVal(), getPointerTy(), CP->getAlignment(), CP->getOffset(), OpFlag); - DebugLoc DL = CP->getDebugLoc(); + SDLoc DL(CP); Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); // With PIC, the address is actually $g + Offset. if (OpFlag) { Result = DAG.getNode(ISD::ADD, DL, getPointerTy(), DAG.getNode(X86ISD::GlobalBaseReg, - DebugLoc(), getPointerTy()), + SDLoc(), getPointerTy()), Result); } @@ -7485,14 +7995,14 @@ SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const { SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), getPointerTy(), OpFlag); - DebugLoc DL = JT->getDebugLoc(); + SDLoc DL(JT); Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); // With PIC, the address is actually $g + Offset. if (OpFlag) Result = DAG.getNode(ISD::ADD, DL, getPointerTy(), DAG.getNode(X86ISD::GlobalBaseReg, - DebugLoc(), getPointerTy()), + SDLoc(), getPointerTy()), Result); return Result; @@ -7523,7 +8033,7 @@ X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const { SDValue Result = DAG.getTargetExternalSymbol(Sym, getPointerTy(), OpFlag); - DebugLoc DL = Op.getDebugLoc(); + SDLoc DL(Op); Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); // With PIC, the address is actually $g + Offset. @@ -7531,7 +8041,7 @@ X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const { !Subtarget->is64Bit()) { Result = DAG.getNode(ISD::ADD, DL, getPointerTy(), DAG.getNode(X86ISD::GlobalBaseReg, - DebugLoc(), getPointerTy()), + SDLoc(), getPointerTy()), Result); } @@ -7552,7 +8062,7 @@ X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const { CodeModel::Model M = getTargetMachine().getCodeModel(); const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress(); int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset(); - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); SDValue Result = DAG.getTargetBlockAddress(BA, getPointerTy(), Offset, OpFlags); @@ -7573,7 +8083,7 @@ X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const { } SDValue -X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, DebugLoc dl, +X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, SDLoc dl, int64_t Offset, SelectionDAG &DAG) const { // Create the TargetGlobalAddress node, folding in the constant // offset if it is legal. @@ -7622,7 +8132,7 @@ SDValue X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const { const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset(); - return LowerGlobalAddress(GV, Op.getDebugLoc(), Offset, DAG); + return LowerGlobalAddress(GV, SDLoc(Op), Offset, DAG); } static SDValue @@ -7631,7 +8141,7 @@ GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA, unsigned char OperandFlags, bool LocalDynamic = false) { MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); - DebugLoc dl = GA->getDebugLoc(); + SDLoc dl(GA); SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0), GA->getOffset(), @@ -7660,10 +8170,10 @@ static SDValue LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT) { SDValue InFlag; - DebugLoc dl = GA->getDebugLoc(); // ? function entry point might be better + SDLoc dl(GA); // ? 
function entry point might be better SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX, DAG.getNode(X86ISD::GlobalBaseReg, - DebugLoc(), PtrVT), InFlag); + SDLoc(), PtrVT), InFlag); InFlag = Chain.getValue(1); return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD); @@ -7681,7 +8191,7 @@ static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT, bool is64Bit) { - DebugLoc dl = GA->getDebugLoc(); + SDLoc dl(GA); // Get the start address of the TLS block for this module. X86MachineFunctionInfo* MFI = DAG.getMachineFunction() @@ -7695,7 +8205,7 @@ static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA, } else { SDValue InFlag; SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX, - DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc(), PtrVT), InFlag); + DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InFlag); InFlag = Chain.getValue(1); Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSLDM, /*LocalDynamic=*/true); @@ -7720,16 +8230,15 @@ static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA, static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT, TLSModel::Model model, bool is64Bit, bool isPIC) { - DebugLoc dl = GA->getDebugLoc(); + SDLoc dl(GA); // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit). Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(), is64Bit ? 257 : 256)); - SDValue ThreadPointer = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), - DAG.getIntPtrConstant(0), - MachinePointerInfo(Ptr), - false, false, false, 0); + SDValue ThreadPointer = + DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0), + MachinePointerInfo(Ptr), false, false, false, 0); unsigned char OperandFlags = 0; // Most TLS accesses are not RIP relative, even on x86-64. 
One exception is @@ -7751,21 +8260,20 @@ static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, // emit "addl x@ntpoff,%eax" (local exec) // or "addl x@indntpoff,%eax" (initial exec) // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic) - SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, - GA->getValueType(0), - GA->getOffset(), OperandFlags); + SDValue TGA = + DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0), + GA->getOffset(), OperandFlags); SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA); if (model == TLSModel::InitialExec) { if (isPIC && !is64Bit) { Offset = DAG.getNode(ISD::ADD, dl, PtrVT, - DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc(), PtrVT), + DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Offset); } Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset, - MachinePointerInfo::getGOT(), false, false, false, - 0); + MachinePointerInfo::getGOT(), false, false, false, 0); } // The address of the thread local variable is the add of the thread @@ -7813,7 +8321,7 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { OpFlag = X86II::MO_TLVP_PIC_BASE; else OpFlag = X86II::MO_TLVP; - DebugLoc DL = Op.getDebugLoc(); + SDLoc DL(Op); SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL, GA->getValueType(0), GA->getOffset(), OpFlag); @@ -7823,7 +8331,7 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { if (PIC32) Offset = DAG.getNode(ISD::ADD, DL, getPointerTy(), DAG.getNode(X86ISD::GlobalBaseReg, - DebugLoc(), getPointerTy()), + SDLoc(), getPointerTy()), Offset); // Lowering the machine isd will make sure everything is in the right @@ -7860,7 +8368,7 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { // thread-localness. if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV)) GV = GA->resolveAliasedGlobal(false); - DebugLoc dl = GA->getDebugLoc(); + SDLoc dl(GA); SDValue Chain = DAG.getEntryNode(); // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or @@ -7918,11 +8426,16 @@ SDValue X86TargetLowering::LowerShiftParts(SDValue Op, SelectionDAG &DAG) const{ assert(Op.getNumOperands() == 3 && "Not a double-shift!"); EVT VT = Op.getValueType(); unsigned VTBits = VT.getSizeInBits(); - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); bool isSRA = Op.getOpcode() == ISD::SRA_PARTS; SDValue ShOpLo = Op.getOperand(0); SDValue ShOpHi = Op.getOperand(1); SDValue ShAmt = Op.getOperand(2); + // X86ISD::SHLD and X86ISD::SHRD have defined overflow behavior but the + // generic ISD nodes haven't. Insert an AND to be safe, it's optimized away + // during isel. + SDValue SafeShAmt = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt, + DAG.getConstant(VTBits - 1, MVT::i8)); SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi, DAG.getConstant(VTBits - 1, MVT::i8)) : DAG.getConstant(0, VT); @@ -7930,12 +8443,15 @@ SDValue X86TargetLowering::LowerShiftParts(SDValue Op, SelectionDAG &DAG) const{ SDValue Tmp2, Tmp3; if (Op.getOpcode() == ISD::SHL_PARTS) { Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt); - Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt); + Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, SafeShAmt); } else { Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt); - Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, ShAmt); + Tmp3 = DAG.getNode(isSRA ? 
ISD::SRA : ISD::SRL, dl, VT, ShOpHi, SafeShAmt); } + // If the shift amount is larger or equal than the width of a part we can't + // rely on the results of shld/shrd. Insert a test and select the appropriate + // values for large shift amounts. SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt, DAG.getConstant(VTBits, MVT::i8)); SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32, @@ -7977,7 +8493,7 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, return Op; } - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); unsigned Size = SrcVT.getSizeInBits()/8; MachineFunction &MF = DAG.getMachineFunction(); int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size, false); @@ -7993,7 +8509,7 @@ SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, SDValue StackSlot, SelectionDAG &DAG) const { // Build the FILD - DebugLoc DL = Op.getDebugLoc(); + SDLoc DL(Op); SDVTList Tys; bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType()); if (useSSE) @@ -8068,11 +8584,11 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op, #endif */ - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); LLVMContext *Context = DAG.getContext(); // Build some magic constants. - const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 }; + static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 }; Constant *C0 = ConstantDataVector::get(*Context, CV0); SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 16); @@ -8122,7 +8638,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op, // LowerUINT_TO_FP_i32 - 32-bit unsigned integer to float expansion. SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG) const { - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); // FP constant to bias correct the final result. SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), MVT::f64); @@ -8170,7 +8686,7 @@ SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op, SelectionDAG &DAG) const { SDValue N0 = Op.getOperand(0); EVT SVT = N0.getValueType(); - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); assert((SVT == MVT::v4i8 || SVT == MVT::v4i16 || SVT == MVT::v8i8 || SVT == MVT::v8i16) && @@ -8185,7 +8701,7 @@ SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op, SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const { SDValue N0 = Op.getOperand(0); - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); if (Op.getValueType().isVector()) return lowerUINT_TO_FP_vec(Op, DAG); @@ -8244,7 +8760,8 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, APInt FF(32, 0x5F800000ULL); // Check whether the sign bit is set. 
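// (Descriptive aside, not part of the patch: 0x5F800000 is 2^64 encoded as an
// IEEE-754 float. The SETLT-against-zero test that follows detects a set sign
// bit; in that case the signed conversion has produced a result that is 2^64
// too small, so the loaded fudge factor is added back in -- e.g. the bit
// pattern 0xFFFFFFFFFFFFFFFF converts as -1.0 and ends up at UINT64_MAX, up to
// f64 rounding, after the correction.)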
- SDValue SignSet = DAG.getSetCC(dl, getSetCCResultType(MVT::i64), + SDValue SignSet = DAG.getSetCC(dl, + getSetCCResultType(*DAG.getContext(), MVT::i64), Op.getOperand(0), DAG.getConstant(0, MVT::i64), ISD::SETLT); @@ -8273,7 +8790,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, std::pair<SDValue,SDValue> X86TargetLowering:: FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned, bool IsReplace) const { - DebugLoc DL = Op.getDebugLoc(); + SDLoc DL(Op); EVT DstTy = Op.getValueType(); @@ -8367,10 +8884,10 @@ X86TargetLowering:: FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG, const X86Subtarget *Subtarget) { - MVT VT = Op->getValueType(0).getSimpleVT(); + MVT VT = Op->getSimpleValueType(0); SDValue In = Op->getOperand(0); - MVT InVT = In.getValueType().getSimpleVT(); - DebugLoc dl = Op->getDebugLoc(); + MVT InVT = In.getSimpleValueType(); + SDLoc dl(Op); // Optimize vectors in AVX mode: // @@ -8385,7 +8902,8 @@ static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG, // Concat upper and lower parts. // - if (((VT != MVT::v8i32) || (InVT != MVT::v8i16)) && + if (((VT != MVT::v16i16) || (InVT != MVT::v16i8)) && + ((VT != MVT::v8i32) || (InVT != MVT::v8i16)) && ((VT != MVT::v4i64) || (InVT != MVT::v4i32))) return SDValue(); @@ -8407,8 +8925,39 @@ static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG, return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi); } -SDValue X86TargetLowering::LowerANY_EXTEND(SDValue Op, - SelectionDAG &DAG) const { +static SDValue LowerZERO_EXTEND_AVX512(SDValue Op, + SelectionDAG &DAG) { + MVT VT = Op->getValueType(0).getSimpleVT(); + SDValue In = Op->getOperand(0); + MVT InVT = In.getValueType().getSimpleVT(); + SDLoc DL(Op); + unsigned int NumElts = VT.getVectorNumElements(); + if (NumElts != 8 && NumElts != 16) + return SDValue(); + + if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1) + return DAG.getNode(X86ISD::VZEXT, DL, VT, In); + + EVT ExtVT = (NumElts == 8)? 
MVT::v8i64 : MVT::v16i32; + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + // Now we have only mask extension + assert(InVT.getVectorElementType() == MVT::i1); + SDValue Cst = DAG.getTargetConstant(1, ExtVT.getScalarType()); + const Constant *C = (dyn_cast<ConstantSDNode>(Cst))->getConstantIntValue(); + SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy()); + unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment(); + SDValue Ld = DAG.getLoad(Cst.getValueType(), DL, DAG.getEntryNode(), CP, + MachinePointerInfo::getConstantPool(), + false, false, false, Alignment); + + SDValue Brcst = DAG.getNode(X86ISD::VBROADCASTM, DL, ExtVT, In, Ld); + if (VT.is512BitVector()) + return Brcst; + return DAG.getNode(X86ISD::VTRUNC, DL, VT, Brcst); +} + +static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { if (Subtarget->hasFp256()) { SDValue Res = LowerAVXExtend(Op, DAG, Subtarget); if (Res.getNode()) @@ -8417,12 +8966,16 @@ SDValue X86TargetLowering::LowerANY_EXTEND(SDValue Op, return SDValue(); } -SDValue X86TargetLowering::LowerZERO_EXTEND(SDValue Op, - SelectionDAG &DAG) const { - DebugLoc DL = Op.getDebugLoc(); - MVT VT = Op.getValueType().getSimpleVT(); + +static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + MVT VT = Op.getSimpleValueType(); SDValue In = Op.getOperand(0); - MVT SVT = In.getValueType().getSimpleVT(); + MVT SVT = In.getSimpleValueType(); + + if (VT.is512BitVector() || SVT.getVectorElementType() == MVT::i1) + return LowerZERO_EXTEND_AVX512(Op, DAG); if (Subtarget->hasFp256()) { SDValue Res = LowerAVXExtend(Op, DAG, Subtarget); @@ -8430,33 +8983,44 @@ SDValue X86TargetLowering::LowerZERO_EXTEND(SDValue Op, return Res; } - if (!VT.is256BitVector() || !SVT.is128BitVector() || - VT.getVectorNumElements() != SVT.getVectorNumElements()) - return SDValue(); - - assert(Subtarget->hasFp256() && "256-bit vector is observed without AVX!"); - - // AVX2 has better support of integer extending. - if (Subtarget->hasInt256()) - return DAG.getNode(X86ISD::VZEXT, DL, VT, In); - - SDValue Lo = DAG.getNode(X86ISD::VZEXT, DL, MVT::v4i32, In); - static const int Mask[] = {4, 5, 6, 7, -1, -1, -1, -1}; - SDValue Hi = DAG.getNode(X86ISD::VZEXT, DL, MVT::v4i32, - DAG.getVectorShuffle(MVT::v8i16, DL, In, - DAG.getUNDEF(MVT::v8i16), - &Mask[0])); - - return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i32, Lo, Hi); + assert(!VT.is256BitVector() || !SVT.is128BitVector() || + VT.getVectorNumElements() != SVT.getVectorNumElements()); + return SDValue(); } SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { - DebugLoc DL = Op.getDebugLoc(); - MVT VT = Op.getValueType().getSimpleVT(); + SDLoc DL(Op); + MVT VT = Op.getSimpleValueType(); SDValue In = Op.getOperand(0); - MVT SVT = In.getValueType().getSimpleVT(); - - if ((VT == MVT::v4i32) && (SVT == MVT::v4i64)) { + MVT InVT = In.getSimpleValueType(); + assert(VT.getVectorNumElements() == InVT.getVectorNumElements() && + "Invalid TRUNCATE operation"); + + if (InVT.is512BitVector() || VT.getVectorElementType() == MVT::i1) { + if (VT.getVectorElementType().getSizeInBits() >=8) + return DAG.getNode(X86ISD::VTRUNC, DL, VT, In); + + assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type"); + unsigned NumElts = InVT.getVectorNumElements(); + assert ((NumElts == 8 || NumElts == 16) && "Unexpected vector type"); + if (InVT.getSizeInBits() < 512) { + MVT ExtVT = (NumElts == 16)? 
MVT::v16i32 : MVT::v8i64; + In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In); + InVT = ExtVT; + } + SDValue Cst = DAG.getTargetConstant(1, InVT.getVectorElementType()); + const Constant *C = (dyn_cast<ConstantSDNode>(Cst))->getConstantIntValue(); + SDValue CP = DAG.getConstantPool(C, getPointerTy()); + unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment(); + SDValue Ld = DAG.getLoad(Cst.getValueType(), DL, DAG.getEntryNode(), CP, + MachinePointerInfo::getConstantPool(), + false, false, false, Alignment); + SDValue OneV = DAG.getNode(X86ISD::VBROADCAST, DL, InVT, Ld); + SDValue And = DAG.getNode(ISD::AND, DL, InVT, OneV, In); + return DAG.getNode(X86ISD::TESTM, DL, VT, And, And); + } + + if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) { // On AVX2, v4i64 -> v4i32 becomes VPERMD. if (Subtarget->hasInt256()) { static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1}; @@ -8487,7 +9051,7 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask2); } - if ((VT == MVT::v8i16) && (SVT == MVT::v8i32)) { + if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) { // On AVX2, v8i32 -> v8i16 becomed PSHUFB. if (Subtarget->hasInt256()) { In = DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, In); @@ -8545,11 +9109,9 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { } // Handle truncation of V256 to V128 using shuffles. - if (!VT.is128BitVector() || !SVT.is256BitVector()) + if (!VT.is128BitVector() || !InVT.is256BitVector()) return SDValue(); - assert(VT.getVectorNumElements() != SVT.getVectorNumElements() && - "Invalid op"); assert(Subtarget->hasFp256() && "256-bit vector without AVX!"); unsigned NumElems = VT.getVectorNumElements(); @@ -8569,11 +9131,11 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const { - MVT VT = Op.getValueType().getSimpleVT(); + MVT VT = Op.getSimpleValueType(); if (VT.isVector()) { if (VT == MVT::v8i16) - return DAG.getNode(ISD::TRUNCATE, Op.getDebugLoc(), VT, - DAG.getNode(ISD::FP_TO_SINT, Op.getDebugLoc(), + return DAG.getNode(ISD::TRUNCATE, SDLoc(Op), VT, + DAG.getNode(ISD::FP_TO_SINT, SDLoc(Op), MVT::v8i32, Op.getOperand(0))); return SDValue(); } @@ -8586,7 +9148,7 @@ SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op, if (StackSlot.getNode()) // Load the result. - return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(), + return DAG.getLoad(Op.getValueType(), SDLoc(Op), FIST, StackSlot, MachinePointerInfo(), false, false, false, 0); @@ -8603,7 +9165,7 @@ SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op, if (StackSlot.getNode()) // Load the result. 
- return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(), + return DAG.getLoad(Op.getValueType(), SDLoc(Op), FIST, StackSlot, MachinePointerInfo(), false, false, false, 0); @@ -8612,10 +9174,10 @@ SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op, } static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) { - DebugLoc DL = Op.getDebugLoc(); - MVT VT = Op.getValueType().getSimpleVT(); + SDLoc DL(Op); + MVT VT = Op.getSimpleValueType(); SDValue In = Op.getOperand(0); - MVT SVT = In.getValueType().getSimpleVT(); + MVT SVT = In.getSimpleValueType(); assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!"); @@ -8626,8 +9188,8 @@ static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) { SDValue X86TargetLowering::LowerFABS(SDValue Op, SelectionDAG &DAG) const { LLVMContext *Context = DAG.getContext(); - DebugLoc dl = Op.getDebugLoc(); - MVT VT = Op.getValueType().getSimpleVT(); + SDLoc dl(Op); + MVT VT = Op.getSimpleValueType(); MVT EltVT = VT; unsigned NumElts = VT == MVT::f64 ? 2 : 4; if (VT.isVector()) { @@ -8660,8 +9222,8 @@ SDValue X86TargetLowering::LowerFABS(SDValue Op, SelectionDAG &DAG) const { SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) const { LLVMContext *Context = DAG.getContext(); - DebugLoc dl = Op.getDebugLoc(); - MVT VT = Op.getValueType().getSimpleVT(); + SDLoc dl(Op); + MVT VT = Op.getSimpleValueType(); MVT EltVT = VT; unsigned NumElts = VT == MVT::f64 ? 2 : 4; if (VT.isVector()) { @@ -8682,7 +9244,7 @@ SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) const { MachinePointerInfo::getConstantPool(), false, false, false, Alignment); if (VT.isVector()) { - MVT XORVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64; + MVT XORVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits()/64); return DAG.getNode(ISD::BITCAST, dl, VT, DAG.getNode(ISD::XOR, dl, XORVT, DAG.getNode(ISD::BITCAST, dl, XORVT, @@ -8697,9 +9259,9 @@ SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { LLVMContext *Context = DAG.getContext(); SDValue Op0 = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); - DebugLoc dl = Op.getDebugLoc(); - MVT VT = Op.getValueType().getSimpleVT(); - MVT SrcVT = Op1.getValueType().getSimpleVT(); + SDLoc dl(Op); + MVT VT = Op.getSimpleValueType(); + MVT SrcVT = Op1.getSimpleValueType(); // If second operand is smaller, extend it first. if (SrcVT.bitsLT(VT)) { @@ -8774,8 +9336,8 @@ SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) { SDValue N0 = Op.getOperand(0); - DebugLoc dl = Op.getDebugLoc(); - MVT VT = Op.getValueType().getSimpleVT(); + SDLoc dl(Op); + MVT VT = Op.getSimpleValueType(); // Lower ISD::FGETSIGN to (AND (X86ISD::FGETSIGNx86 ...) 1). SDValue xFGETSIGN = DAG.getNode(X86ISD::FGETSIGNx86, dl, VT, N0, @@ -8785,8 +9347,8 @@ static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) { // LowerVectorAllZeroTest - Check whether an OR'd tree is PTEST-able. 
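// (Sketch of the idea, for orientation: PTEST sets ZF when the bitwise AND of
// its two operands is all zeros, so PTEST(V, V) tests whether V itself is all
// zeros. An OR-tree of extracted elements compared against zero can therefore
// be re-rooted as a single PTEST over the OR of the source vectors instead of
// being scalarized element by element.)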
// -SDValue X86TargetLowering::LowerVectorAllZeroTest(SDValue Op, - SelectionDAG &DAG) const { +static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree."); if (!Subtarget->hasSSE41()) @@ -8796,7 +9358,7 @@ SDValue X86TargetLowering::LowerVectorAllZeroTest(SDValue Op, return SDValue(); SDNode *N = Op.getNode(); - DebugLoc DL = N->getDebugLoc(); + SDLoc DL(N); SmallVector<SDValue, 8> Opnds; DenseMap<SDValue, unsigned> VecInMap; @@ -8808,7 +9370,7 @@ SDValue X86TargetLowering::LowerVectorAllZeroTest(SDValue Op, Opnds.push_back(N->getOperand(1)); for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) { - SmallVector<SDValue, 8>::const_iterator I = Opnds.begin() + Slot; + SmallVectorImpl<SDValue>::const_iterator I = Opnds.begin() + Slot; // BFS traverse all OR'd operands. if (I->getOpcode() == ISD::OR) { Opnds.push_back(I->getOperand(0)); @@ -8880,7 +9442,7 @@ SDValue X86TargetLowering::LowerVectorAllZeroTest(SDValue Op, /// equivalent. SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, SelectionDAG &DAG) const { - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); // CF and OF aren't always set the way we want. Determine which // of these we need. @@ -8911,7 +9473,7 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, unsigned NumOperands = 0; // Truncate operations may prevent the merge of the SETCC instruction - // and the arithmetic intruction before it. Attempt to truncate the operands + // and the arithmetic instruction before it. Attempt to truncate the operands // of the arithmetic instruction and use a reduced bit-width instruction. bool NeedTruncation = false; SDValue ArithOp = Op; @@ -9019,7 +9581,7 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, case ISD::AND: Opcode = X86ISD::AND; break; case ISD::OR: { if (!NeedTruncation && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) { - SDValue EFLAGS = LowerVectorAllZeroTest(Op, DAG); + SDValue EFLAGS = LowerVectorAllZeroTest(Op, Subtarget, DAG); if (EFLAGS.getNode()) return EFLAGS; } @@ -9095,7 +9657,7 @@ SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, if (C->getAPIntValue() == 0) return EmitTest(Op0, X86CC, DAG); - DebugLoc dl = Op0.getDebugLoc(); + SDLoc dl(Op0); if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 || Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) { // Use SUB instead of CMP to enable CSE between SUB and CMP. @@ -9122,7 +9684,7 @@ SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp, // FUCOMI, which writes the comparison result to FPSW instead of EFLAGS. Hence // build an SDNode sequence that transfers the result from FPSW into EFLAGS: // (X86sahf (trunc (srl (X86fp_stsw (trunc (X86cmp ...)), 8)))) - DebugLoc dl = Cmp.getDebugLoc(); + SDLoc dl(Cmp); SDValue TruncFPSW = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Cmp); SDValue FNStSW = DAG.getNode(X86ISD::FNSTSW16r, dl, MVT::i16, TruncFPSW); SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, FNStSW, @@ -9139,7 +9701,7 @@ static bool isAllOnes(SDValue V) { /// LowerToBT - Result of 'and' is compared against zero. Turn it into a BT node /// if it's possible. 
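/// (For example, "(x & (1 << n)) != 0" can be selected as "bt x, n" with the
/// carry flag consumed by the following setcc/branch, avoiding materializing
/// the shifted mask.)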
SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC, - DebugLoc dl, SelectionDAG &DAG) const { + SDLoc dl, SelectionDAG &DAG) const { SDValue Op0 = And.getOperand(0); SDValue Op1 = And.getOperand(1); if (Op0.getOpcode() == ISD::TRUNCATE) @@ -9207,16 +9769,61 @@ SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC, return SDValue(); } +/// \brief - Turns an ISD::CondCode into a value suitable for SSE floating point +/// mask CMPs. +static int translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0, + SDValue &Op1) { + unsigned SSECC; + bool Swap = false; + + // SSE Condition code mapping: + // 0 - EQ + // 1 - LT + // 2 - LE + // 3 - UNORD + // 4 - NEQ + // 5 - NLT + // 6 - NLE + // 7 - ORD + switch (SetCCOpcode) { + default: llvm_unreachable("Unexpected SETCC condition"); + case ISD::SETOEQ: + case ISD::SETEQ: SSECC = 0; break; + case ISD::SETOGT: + case ISD::SETGT: Swap = true; // Fallthrough + case ISD::SETLT: + case ISD::SETOLT: SSECC = 1; break; + case ISD::SETOGE: + case ISD::SETGE: Swap = true; // Fallthrough + case ISD::SETLE: + case ISD::SETOLE: SSECC = 2; break; + case ISD::SETUO: SSECC = 3; break; + case ISD::SETUNE: + case ISD::SETNE: SSECC = 4; break; + case ISD::SETULE: Swap = true; // Fallthrough + case ISD::SETUGE: SSECC = 5; break; + case ISD::SETULT: Swap = true; // Fallthrough + case ISD::SETUGT: SSECC = 6; break; + case ISD::SETO: SSECC = 7; break; + case ISD::SETUEQ: + case ISD::SETONE: SSECC = 8; break; + } + if (Swap) + std::swap(Op0, Op1); + + return SSECC; +} + // Lower256IntVSETCC - Break a VSETCC 256-bit integer VSETCC into two new 128 // ones, and then concatenate the result back. static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) { - MVT VT = Op.getValueType().getSimpleVT(); + MVT VT = Op.getSimpleValueType(); assert(VT.is256BitVector() && Op.getOpcode() == ISD::SETCC && "Unsupported value type for operation"); unsigned NumElems = VT.getVectorNumElements(); - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); SDValue CC = Op.getOperand(2); // Extract the LHS vectors @@ -9237,61 +9844,62 @@ static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) { DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC)); } +static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) { + SDValue Op0 = Op.getOperand(0); + SDValue Op1 = Op.getOperand(1); + SDValue CC = Op.getOperand(2); + MVT VT = Op.getSimpleValueType(); + + assert(Op0.getValueType().getVectorElementType().getSizeInBits() >= 32 && + Op.getValueType().getScalarType() == MVT::i1 && + "Cannot set masked compare for this operation"); + + ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get(); + SDLoc dl(Op); + + bool Unsigned = false; + unsigned SSECC; + switch (SetCCOpcode) { + default: llvm_unreachable("Unexpected SETCC condition"); + case ISD::SETNE: SSECC = 4; break; + case ISD::SETEQ: SSECC = 0; break; + case ISD::SETUGT: Unsigned = true; + case ISD::SETGT: SSECC = 6; break; // NLE + case ISD::SETULT: Unsigned = true; + case ISD::SETLT: SSECC = 1; break; + case ISD::SETUGE: Unsigned = true; + case ISD::SETGE: SSECC = 5; break; // NLT + case ISD::SETULE: Unsigned = true; + case ISD::SETLE: SSECC = 2; break; + } + unsigned Opc = Unsigned ? 
X86ISD::CMPMU: X86ISD::CMPM; + return DAG.getNode(Opc, dl, VT, Op0, Op1, + DAG.getConstant(SSECC, MVT::i8)); + +} + static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { - SDValue Cond; SDValue Op0 = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); SDValue CC = Op.getOperand(2); - MVT VT = Op.getValueType().getSimpleVT(); + MVT VT = Op.getSimpleValueType(); ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get(); - bool isFP = Op.getOperand(1).getValueType().getSimpleVT().isFloatingPoint(); - DebugLoc dl = Op.getDebugLoc(); + bool isFP = Op.getOperand(1).getSimpleValueType().isFloatingPoint(); + SDLoc dl(Op); if (isFP) { #ifndef NDEBUG - MVT EltVT = Op0.getValueType().getVectorElementType().getSimpleVT(); + MVT EltVT = Op0.getSimpleValueType().getVectorElementType(); assert(EltVT == MVT::f32 || EltVT == MVT::f64); #endif - unsigned SSECC; - bool Swap = false; - - // SSE Condition code mapping: - // 0 - EQ - // 1 - LT - // 2 - LE - // 3 - UNORD - // 4 - NEQ - // 5 - NLT - // 6 - NLE - // 7 - ORD - switch (SetCCOpcode) { - default: llvm_unreachable("Unexpected SETCC condition"); - case ISD::SETOEQ: - case ISD::SETEQ: SSECC = 0; break; - case ISD::SETOGT: - case ISD::SETGT: Swap = true; // Fallthrough - case ISD::SETLT: - case ISD::SETOLT: SSECC = 1; break; - case ISD::SETOGE: - case ISD::SETGE: Swap = true; // Fallthrough - case ISD::SETLE: - case ISD::SETOLE: SSECC = 2; break; - case ISD::SETUO: SSECC = 3; break; - case ISD::SETUNE: - case ISD::SETNE: SSECC = 4; break; - case ISD::SETULE: Swap = true; // Fallthrough - case ISD::SETUGE: SSECC = 5; break; - case ISD::SETULT: Swap = true; // Fallthrough - case ISD::SETUGT: SSECC = 6; break; - case ISD::SETO: SSECC = 7; break; - case ISD::SETUEQ: - case ISD::SETONE: SSECC = 8; break; - } - if (Swap) - std::swap(Op0, Op1); - + unsigned SSECC = translateX86FSETCC(SetCCOpcode, Op0, Op1); + unsigned Opc = X86ISD::CMPP; + if (Subtarget->hasAVX512() && VT.getVectorElementType() == MVT::i1) { + assert(VT.getVectorNumElements() <= 16); + Opc = X86ISD::CMPM; + } // In the two special cases we can't handle, emit two comparisons. if (SSECC == 8) { unsigned CC0, CC1; @@ -9303,14 +9911,14 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget, CC0 = 7; CC1 = 4; CombineOpc = ISD::AND; } - SDValue Cmp0 = DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1, + SDValue Cmp0 = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(CC0, MVT::i8)); - SDValue Cmp1 = DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1, + SDValue Cmp1 = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(CC1, MVT::i8)); return DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1); } // Handle all other FP comparisons here. - return DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1, + return DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(SSECC, MVT::i8)); } @@ -9318,25 +9926,63 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget, if (VT.is256BitVector() && !Subtarget->hasInt256()) return Lower256IntVSETCC(Op, DAG); + bool MaskResult = (VT.getVectorElementType() == MVT::i1); + EVT OpVT = Op1.getValueType(); + if (Subtarget->hasAVX512()) { + if (Op1.getValueType().is512BitVector() || + (MaskResult && OpVT.getVectorElementType().getSizeInBits() >= 32)) + return LowerIntVSETCC_AVX512(Op, DAG); + + // In AVX-512 architecture setcc returns mask with i1 elements, + // But there is no compare instruction for i8 and i16 elements. + // We are not talking about 512-bit operands in this case, these + // types are illegal. 
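// (In other words: for an i1-result compare whose operands have i8/i16
// elements -- say v16i8 -- the code below first performs the SETCC in the
// legal operand type and then truncates the vector result down to the
// corresponding i1 mask type.)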
+ if (MaskResult && + (OpVT.getVectorElementType().getSizeInBits() < 32 && + OpVT.getVectorElementType().getSizeInBits() >= 8)) + return DAG.getNode(ISD::TRUNCATE, dl, VT, + DAG.getNode(ISD::SETCC, dl, OpVT, Op0, Op1, CC)); + } + // We are handling one of the integer comparisons here. Since SSE only has // GT and EQ comparisons for integer, swapping operands and multiple // operations may be required for some comparisons. unsigned Opc; - bool Swap = false, Invert = false, FlipSigns = false; - + bool Swap = false, Invert = false, FlipSigns = false, MinMax = false; + switch (SetCCOpcode) { default: llvm_unreachable("Unexpected SETCC condition"); case ISD::SETNE: Invert = true; - case ISD::SETEQ: Opc = X86ISD::PCMPEQ; break; + case ISD::SETEQ: Opc = MaskResult? X86ISD::PCMPEQM: X86ISD::PCMPEQ; break; case ISD::SETLT: Swap = true; - case ISD::SETGT: Opc = X86ISD::PCMPGT; break; + case ISD::SETGT: Opc = MaskResult? X86ISD::PCMPGTM: X86ISD::PCMPGT; break; case ISD::SETGE: Swap = true; - case ISD::SETLE: Opc = X86ISD::PCMPGT; Invert = true; break; + case ISD::SETLE: Opc = MaskResult? X86ISD::PCMPGTM: X86ISD::PCMPGT; + Invert = true; break; case ISD::SETULT: Swap = true; - case ISD::SETUGT: Opc = X86ISD::PCMPGT; FlipSigns = true; break; + case ISD::SETUGT: Opc = MaskResult? X86ISD::PCMPGTM: X86ISD::PCMPGT; + FlipSigns = true; break; case ISD::SETUGE: Swap = true; - case ISD::SETULE: Opc = X86ISD::PCMPGT; FlipSigns = true; Invert = true; break; + case ISD::SETULE: Opc = MaskResult? X86ISD::PCMPGTM: X86ISD::PCMPGT; + FlipSigns = true; Invert = true; break; + } + + // Special case: Use min/max operations for SETULE/SETUGE + MVT VET = VT.getVectorElementType(); + bool hasMinMax = + (Subtarget->hasSSE41() && (VET >= MVT::i8 && VET <= MVT::i32)) + || (Subtarget->hasSSE2() && (VET == MVT::i8)); + + if (hasMinMax) { + switch (SetCCOpcode) { + default: break; + case ISD::SETULE: Opc = X86ISD::UMIN; MinMax = true; break; + case ISD::SETUGE: Opc = X86ISD::UMAX; MinMax = true; break; + } + + if (MinMax) { Swap = false; Invert = false; FlipSigns = false; } } + if (Swap) std::swap(Op0, Op1); @@ -9370,8 +10016,8 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget, SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1); // Create masks for only the low parts/high parts of the 64 bit integers. - const int MaskHi[] = { 1, 1, 3, 3 }; - const int MaskLo[] = { 0, 0, 2, 2 }; + static const int MaskHi[] = { 1, 1, 3, 3 }; + static const int MaskLo[] = { 0, 0, 2, 2 }; SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi); SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo); SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi); @@ -9398,7 +10044,7 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget, SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1); // Make sure the lower and upper halves are both all-ones. - const int Mask[] = { 1, 0, 3, 2 }; + static const int Mask[] = { 1, 0, 3, 2 }; SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask); Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf); @@ -9423,20 +10069,23 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget, // If the logical-not of the result is required, perform that now. 
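// (The MinMax path a few lines further down relies on the identities
//  a <=u b <=> umin(a, b) == a and a >=u b <=> umax(a, b) == a, which is why
//  its result is compared back against Op0 with PCMPEQ.)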
if (Invert) Result = DAG.getNOT(dl, Result, VT); + + if (MinMax) + Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result); return Result; } SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { - MVT VT = Op.getValueType().getSimpleVT(); + MVT VT = Op.getSimpleValueType(); if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG); assert(VT == MVT::i8 && "SetCC type must be 8-bit integer"); SDValue Op0 = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get(); // Optimize to BT if possible. @@ -9473,7 +10122,7 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { } } - bool isFP = Op1.getValueType().getSimpleVT().isFloatingPoint(); + bool isFP = Op1.getSimpleValueType().isFloatingPoint(); unsigned X86CC = TranslateX86CC(CC, isFP, Op0, Op1, DAG); if (X86CC == X86::COND_INVALID) return SDValue(); @@ -9530,9 +10179,31 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { SDValue Cond = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); SDValue Op2 = Op.getOperand(2); - DebugLoc DL = Op.getDebugLoc(); + SDLoc DL(Op); + EVT VT = Op1.getValueType(); SDValue CC; + // Lower fp selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops + // are available. Otherwise fp cmovs get lowered into a less efficient branch + // sequence later on. + if (Cond.getOpcode() == ISD::SETCC && + ((Subtarget->hasSSE2() && (VT == MVT::f32 || VT == MVT::f64)) || + (Subtarget->hasSSE1() && VT == MVT::f32)) && + VT == Cond.getOperand(0).getValueType() && Cond->hasOneUse()) { + SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1); + int SSECC = translateX86FSETCC( + cast<CondCodeSDNode>(Cond.getOperand(2))->get(), CondOp0, CondOp1); + + if (SSECC != 8) { + unsigned Opcode = VT == MVT::f32 ? 
X86ISD::FSETCCss : X86ISD::FSETCCsd; + SDValue Cmp = DAG.getNode(Opcode, DL, VT, CondOp0, CondOp1, + DAG.getConstant(SSECC, MVT::i8)); + SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2); + SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1); + return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And); + } + } + if (Cond.getOpcode() == ISD::SETCC) { SDValue NewCond = LowerSETCC(Cond, DAG); if (NewCond.getNode()) @@ -9606,7 +10277,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { SDValue Cmp = Cond.getOperand(1); unsigned Opc = Cmp.getOpcode(); - MVT VT = Op.getValueType().getSimpleVT(); + MVT VT = Op.getSimpleValueType(); bool IllegalFPCMov = false; if (VT.isFloatingPoint() && !VT.isVector() && @@ -9715,15 +10386,50 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops, array_lengthof(Ops)); } -SDValue X86TargetLowering::LowerSIGN_EXTEND(SDValue Op, - SelectionDAG &DAG) const { - MVT VT = Op->getValueType(0).getSimpleVT(); +static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op, SelectionDAG &DAG) { + MVT VT = Op->getSimpleValueType(0); SDValue In = Op->getOperand(0); - MVT InVT = In.getValueType().getSimpleVT(); - DebugLoc dl = Op->getDebugLoc(); + MVT InVT = In.getSimpleValueType(); + SDLoc dl(Op); + + unsigned int NumElts = VT.getVectorNumElements(); + if (NumElts != 8 && NumElts != 16) + return SDValue(); + + if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1) + return DAG.getNode(X86ISD::VSEXT, dl, VT, In); + + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + assert (InVT.getVectorElementType() == MVT::i1 && "Unexpected vector type"); + + MVT ExtVT = (NumElts == 8) ? MVT::v8i64 : MVT::v16i32; + Constant *C = ConstantInt::get(*DAG.getContext(), + APInt::getAllOnesValue(ExtVT.getScalarType().getSizeInBits())); + + SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy()); + unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment(); + SDValue Ld = DAG.getLoad(ExtVT.getScalarType(), dl, DAG.getEntryNode(), CP, + MachinePointerInfo::getConstantPool(), + false, false, false, Alignment); + SDValue Brcst = DAG.getNode(X86ISD::VBROADCASTM, dl, ExtVT, In, Ld); + if (VT.is512BitVector()) + return Brcst; + return DAG.getNode(X86ISD::VTRUNC, dl, VT, Brcst); +} + +static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + MVT VT = Op->getSimpleValueType(0); + SDValue In = Op->getOperand(0); + MVT InVT = In.getSimpleValueType(); + SDLoc dl(Op); + + if (VT.is512BitVector() || InVT.getVectorElementType() == MVT::i1) + return LowerSIGN_EXTEND_AVX512(Op, DAG); if ((VT != MVT::v4i64 || InVT != MVT::v4i32) && - (VT != MVT::v8i32 || InVT != MVT::v8i16)) + (VT != MVT::v8i32 || InVT != MVT::v8i16) && + (VT != MVT::v16i16 || InVT != MVT::v16i8)) return SDValue(); if (Subtarget->hasInt256()) @@ -9793,7 +10499,7 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { SDValue Chain = Op.getOperand(0); SDValue Cond = Op.getOperand(1); SDValue Dest = Op.getOperand(2); - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); SDValue CC; bool Inverted = false; @@ -10063,12 +10769,13 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, "This should be used only on Windows targets or when segmented stacks " "are being used"); assert(!Subtarget->isTargetEnvMacho() && "Not implemented"); - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); // Get the inputs. 
SDValue Chain = Op.getOperand(0); SDValue Size = Op.getOperand(1); - // FIXME: Ensure alignment here + unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue(); + EVT VT = Op.getNode()->getValueType(0); bool Is64Bit = Subtarget->is64Bit(); EVT SPTy = Is64Bit ? MVT::i64 : MVT::i32; @@ -10106,12 +10813,20 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Flag); - Flag = Chain.getValue(1); - Chain = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(), - SPTy).getValue(1); + const X86RegisterInfo *RegInfo = + static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo()); + unsigned SPReg = RegInfo->getStackRegister(); + SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy); + Chain = SP.getValue(1); + + if (Align) { + SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0), + DAG.getConstant(-(uint64_t)Align, VT)); + Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP); + } - SDValue Ops1[2] = { Chain.getValue(0), Chain }; + SDValue Ops1[2] = { SP, Chain }; return DAG.getMergeValues(Ops1, 2, dl); } } @@ -10121,7 +10836,7 @@ SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); - DebugLoc DL = Op.getDebugLoc(); + SDLoc DL(Op); if (!Subtarget->is64Bit() || Subtarget->isTargetWin64()) { // vastart just stores the address of the VarArgsFrameIndex slot into the @@ -10188,7 +10903,7 @@ SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { SDValue SrcPtr = Op.getOperand(1); const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); unsigned Align = Op.getConstantOperandVal(3); - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); EVT ArgVT = Op.getNode()->getValueType(0); Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); @@ -10254,7 +10969,7 @@ static SDValue LowerVACOPY(SDValue Op, const X86Subtarget *Subtarget, SDValue SrcPtr = Op.getOperand(2); const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue(); const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); - DebugLoc DL = Op.getDebugLoc(); + SDLoc DL(Op); return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr, DAG.getIntPtrConstant(24), 8, /*isVolatile*/false, @@ -10262,25 +10977,37 @@ static SDValue LowerVACOPY(SDValue Op, const X86Subtarget *Subtarget, MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV)); } +// getTargetVShiftByConstNode - Handle vector element shifts where the shift +// amount is a constant. Takes immediate version of shift as input. +static SDValue getTargetVShiftByConstNode(unsigned Opc, SDLoc dl, EVT VT, + SDValue SrcOp, uint64_t ShiftAmt, + SelectionDAG &DAG) { + + // Check for ShiftAmt >= element width + if (ShiftAmt >= VT.getVectorElementType().getSizeInBits()) { + if (Opc == X86ISD::VSRAI) + ShiftAmt = VT.getVectorElementType().getSizeInBits() - 1; + else + return DAG.getConstant(0, VT); + } + + assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI) + && "Unknown target vector shift-by-constant node"); + + return DAG.getNode(Opc, dl, VT, SrcOp, DAG.getConstant(ShiftAmt, MVT::i8)); +} + // getTargetVShiftNode - Handle vector element shifts where the shift amount // may or may not be a constant. Takes immediate version of shift as input. 
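// (getTargetVShiftByConstNode above also folds out-of-range immediates: a
// logical shift by at least the element width becomes the zero vector, while
// an arithmetic shift is clamped to element-width - 1, mirroring what the
// packed shift instructions do with large counts.)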
-static SDValue getTargetVShiftNode(unsigned Opc, DebugLoc dl, EVT VT, +static SDValue getTargetVShiftNode(unsigned Opc, SDLoc dl, EVT VT, SDValue SrcOp, SDValue ShAmt, SelectionDAG &DAG) { assert(ShAmt.getValueType() == MVT::i32 && "ShAmt is not i32"); - if (isa<ConstantSDNode>(ShAmt)) { - // Constant may be a TargetConstant. Use a regular constant. - uint32_t ShiftAmt = cast<ConstantSDNode>(ShAmt)->getZExtValue(); - switch (Opc) { - default: llvm_unreachable("Unknown target vector shift node"); - case X86ISD::VSHLI: - case X86ISD::VSRLI: - case X86ISD::VSRAI: - return DAG.getNode(Opc, dl, VT, SrcOp, - DAG.getConstant(ShiftAmt, MVT::i32)); - } - } + // Catch shift-by-constant. + if (ConstantSDNode *CShAmt = dyn_cast<ConstantSDNode>(ShAmt)) + return getTargetVShiftByConstNode(Opc, dl, VT, SrcOp, + CShAmt->getZExtValue(), DAG); // Change opcode to non-immediate version switch (Opc) { @@ -10308,7 +11035,7 @@ static SDValue getTargetVShiftNode(unsigned Opc, DebugLoc dl, EVT VT, } static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) { - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); switch (IntNo) { default: return SDValue(); // Don't custom lower most intrinsics. @@ -10483,24 +11210,32 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) { case Intrinsic::x86_avx2_pmaxu_b: case Intrinsic::x86_avx2_pmaxu_w: case Intrinsic::x86_avx2_pmaxu_d: + case Intrinsic::x86_avx512_pmaxu_d: + case Intrinsic::x86_avx512_pmaxu_q: case Intrinsic::x86_sse2_pminu_b: case Intrinsic::x86_sse41_pminuw: case Intrinsic::x86_sse41_pminud: case Intrinsic::x86_avx2_pminu_b: case Intrinsic::x86_avx2_pminu_w: case Intrinsic::x86_avx2_pminu_d: + case Intrinsic::x86_avx512_pminu_d: + case Intrinsic::x86_avx512_pminu_q: case Intrinsic::x86_sse41_pmaxsb: case Intrinsic::x86_sse2_pmaxs_w: case Intrinsic::x86_sse41_pmaxsd: case Intrinsic::x86_avx2_pmaxs_b: case Intrinsic::x86_avx2_pmaxs_w: case Intrinsic::x86_avx2_pmaxs_d: + case Intrinsic::x86_avx512_pmaxs_d: + case Intrinsic::x86_avx512_pmaxs_q: case Intrinsic::x86_sse41_pminsb: case Intrinsic::x86_sse2_pmins_w: case Intrinsic::x86_sse41_pminsd: case Intrinsic::x86_avx2_pmins_b: case Intrinsic::x86_avx2_pmins_w: - case Intrinsic::x86_avx2_pmins_d: { + case Intrinsic::x86_avx2_pmins_d: + case Intrinsic::x86_avx512_pmins_d: + case Intrinsic::x86_avx512_pmins_q: { unsigned Opcode; switch (IntNo) { default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 
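// Illustrative summary, not part of the patch: the packed signed/unsigned
// min/max intrinsics listed above -- now including the 512-bit AVX-512
// variants -- are all funneled into the generic X86ISD::UMAX/UMIN/SMAX/SMIN
// nodes, so instruction selection and DAG combines only have to reason about
// one node per operation regardless of vector width.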
@@ -10510,6 +11245,8 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) { case Intrinsic::x86_avx2_pmaxu_b: case Intrinsic::x86_avx2_pmaxu_w: case Intrinsic::x86_avx2_pmaxu_d: + case Intrinsic::x86_avx512_pmaxu_d: + case Intrinsic::x86_avx512_pmaxu_q: Opcode = X86ISD::UMAX; break; case Intrinsic::x86_sse2_pminu_b: @@ -10518,6 +11255,8 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) { case Intrinsic::x86_avx2_pminu_b: case Intrinsic::x86_avx2_pminu_w: case Intrinsic::x86_avx2_pminu_d: + case Intrinsic::x86_avx512_pminu_d: + case Intrinsic::x86_avx512_pminu_q: Opcode = X86ISD::UMIN; break; case Intrinsic::x86_sse41_pmaxsb: @@ -10526,6 +11265,8 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) { case Intrinsic::x86_avx2_pmaxs_b: case Intrinsic::x86_avx2_pmaxs_w: case Intrinsic::x86_avx2_pmaxs_d: + case Intrinsic::x86_avx512_pmaxs_d: + case Intrinsic::x86_avx512_pmaxs_q: Opcode = X86ISD::SMAX; break; case Intrinsic::x86_sse41_pminsb: @@ -10534,6 +11275,8 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) { case Intrinsic::x86_avx2_pmins_b: case Intrinsic::x86_avx2_pmins_w: case Intrinsic::x86_avx2_pmins_d: + case Intrinsic::x86_avx512_pmins_d: + case Intrinsic::x86_avx512_pmins_q: Opcode = X86ISD::SMIN; break; } @@ -10546,10 +11289,14 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) { case Intrinsic::x86_sse2_max_pd: case Intrinsic::x86_avx_max_ps_256: case Intrinsic::x86_avx_max_pd_256: + case Intrinsic::x86_avx512_max_ps_512: + case Intrinsic::x86_avx512_max_pd_512: case Intrinsic::x86_sse_min_ps: case Intrinsic::x86_sse2_min_pd: case Intrinsic::x86_avx_min_ps_256: - case Intrinsic::x86_avx_min_pd_256: { + case Intrinsic::x86_avx_min_pd_256: + case Intrinsic::x86_avx512_min_ps_512: + case Intrinsic::x86_avx512_min_pd_512: { unsigned Opcode; switch (IntNo) { default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. @@ -10557,12 +11304,16 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) { case Intrinsic::x86_sse2_max_pd: case Intrinsic::x86_avx_max_ps_256: case Intrinsic::x86_avx_max_pd_256: + case Intrinsic::x86_avx512_max_ps_512: + case Intrinsic::x86_avx512_max_pd_512: Opcode = X86ISD::FMAX; break; case Intrinsic::x86_sse_min_ps: case Intrinsic::x86_sse2_min_pd: case Intrinsic::x86_avx_min_ps_256: case Intrinsic::x86_avx_min_pd_256: + case Intrinsic::x86_avx512_min_ps_512: + case Intrinsic::x86_avx512_min_pd_512: Opcode = X86ISD::FMIN; break; } @@ -10633,7 +11384,7 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) { case Intrinsic::x86_avx2_permd: case Intrinsic::x86_avx2_permps: // Operands intentionally swapped. Mask is last operand to intrinsic, - // but second operand for node/intruction. + // but second operand for node/instruction. return DAG.getNode(X86ISD::VPERMV, dl, Op.getValueType(), Op.getOperand(2), Op.getOperand(1)); @@ -10708,6 +11459,16 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) { SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test); return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); } + case Intrinsic::x86_avx512_kortestz: + case Intrinsic::x86_avx512_kortestc: { + unsigned X86CC = (IntNo == Intrinsic::x86_avx512_kortestz)? 
X86::COND_E: X86::COND_B; + SDValue LHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1, Op.getOperand(1)); + SDValue RHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1, Op.getOperand(2)); + SDValue CC = DAG.getConstant(X86CC, MVT::i8); + SDValue Test = DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS); + SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test); + return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); + } // SSE/AVX shift intrinsics case Intrinsic::x86_sse2_psll_w: @@ -10858,8 +11619,7 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) { X86CC = X86::COND_E; break; } - SmallVector<SDValue, 5> NewOps; - NewOps.append(Op->op_begin()+1, Op->op_end()); + SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end()); SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps.data(), NewOps.size()); SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, @@ -10876,8 +11636,7 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) { else Opcode = X86ISD::PCMPESTRI; - SmallVector<SDValue, 5> NewOps; - NewOps.append(Op->op_begin()+1, Op->op_end()); + SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end()); SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); return DAG.getNode(Opcode, dl, VTs, NewOps.data(), NewOps.size()); } @@ -10904,7 +11663,19 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) { case Intrinsic::x86_fma_vfmaddsub_ps_256: case Intrinsic::x86_fma_vfmaddsub_pd_256: case Intrinsic::x86_fma_vfmsubadd_ps_256: - case Intrinsic::x86_fma_vfmsubadd_pd_256: { + case Intrinsic::x86_fma_vfmsubadd_pd_256: + case Intrinsic::x86_fma_vfmadd_ps_512: + case Intrinsic::x86_fma_vfmadd_pd_512: + case Intrinsic::x86_fma_vfmsub_ps_512: + case Intrinsic::x86_fma_vfmsub_pd_512: + case Intrinsic::x86_fma_vfnmadd_ps_512: + case Intrinsic::x86_fma_vfnmadd_pd_512: + case Intrinsic::x86_fma_vfnmsub_ps_512: + case Intrinsic::x86_fma_vfnmsub_pd_512: + case Intrinsic::x86_fma_vfmaddsub_ps_512: + case Intrinsic::x86_fma_vfmaddsub_pd_512: + case Intrinsic::x86_fma_vfmsubadd_ps_512: + case Intrinsic::x86_fma_vfmsubadd_pd_512: { unsigned Opc; switch (IntNo) { default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 
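// Note on the kortest lowering above (descriptive, not part of the patch):
// KORTESTW ORs two 16-bit mask registers and only updates EFLAGS -- ZF is set
// when the OR is all zeros and CF when it is all ones -- so kortestz tests
// COND_E and kortestc tests COND_B, with the SETCC result zero-extended to the
// i32 value the intrinsic returns.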
@@ -10912,36 +11683,48 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) { case Intrinsic::x86_fma_vfmadd_pd: case Intrinsic::x86_fma_vfmadd_ps_256: case Intrinsic::x86_fma_vfmadd_pd_256: + case Intrinsic::x86_fma_vfmadd_ps_512: + case Intrinsic::x86_fma_vfmadd_pd_512: Opc = X86ISD::FMADD; break; case Intrinsic::x86_fma_vfmsub_ps: case Intrinsic::x86_fma_vfmsub_pd: case Intrinsic::x86_fma_vfmsub_ps_256: case Intrinsic::x86_fma_vfmsub_pd_256: + case Intrinsic::x86_fma_vfmsub_ps_512: + case Intrinsic::x86_fma_vfmsub_pd_512: Opc = X86ISD::FMSUB; break; case Intrinsic::x86_fma_vfnmadd_ps: case Intrinsic::x86_fma_vfnmadd_pd: case Intrinsic::x86_fma_vfnmadd_ps_256: case Intrinsic::x86_fma_vfnmadd_pd_256: + case Intrinsic::x86_fma_vfnmadd_ps_512: + case Intrinsic::x86_fma_vfnmadd_pd_512: Opc = X86ISD::FNMADD; break; case Intrinsic::x86_fma_vfnmsub_ps: case Intrinsic::x86_fma_vfnmsub_pd: case Intrinsic::x86_fma_vfnmsub_ps_256: case Intrinsic::x86_fma_vfnmsub_pd_256: + case Intrinsic::x86_fma_vfnmsub_ps_512: + case Intrinsic::x86_fma_vfnmsub_pd_512: Opc = X86ISD::FNMSUB; break; case Intrinsic::x86_fma_vfmaddsub_ps: case Intrinsic::x86_fma_vfmaddsub_pd: case Intrinsic::x86_fma_vfmaddsub_ps_256: case Intrinsic::x86_fma_vfmaddsub_pd_256: + case Intrinsic::x86_fma_vfmaddsub_ps_512: + case Intrinsic::x86_fma_vfmaddsub_pd_512: Opc = X86ISD::FMADDSUB; break; case Intrinsic::x86_fma_vfmsubadd_ps: case Intrinsic::x86_fma_vfmsubadd_pd: case Intrinsic::x86_fma_vfmsubadd_ps_256: case Intrinsic::x86_fma_vfmsubadd_pd_256: + case Intrinsic::x86_fma_vfmsubadd_ps_512: + case Intrinsic::x86_fma_vfmsubadd_pd_512: Opc = X86ISD::FMSUBADD; break; } @@ -10952,8 +11735,88 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) { } } -static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) { - DebugLoc dl = Op.getDebugLoc(); +static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, + SDValue Base, SDValue Index, + SDValue ScaleOp, SDValue Chain, + const X86Subtarget * Subtarget) { + SDLoc dl(Op); + ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp); + assert(C && "Invalid scale type"); + SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8); + SDValue Src = getZeroVector(Op.getValueType(), Subtarget, DAG, dl); + EVT MaskVT = MVT::getVectorVT(MVT::i1, + Index.getValueType().getVectorNumElements()); + SDValue MaskInReg = DAG.getConstant(~0, MaskVT); + SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other); + SDValue Disp = DAG.getTargetConstant(0, MVT::i32); + SDValue Segment = DAG.getRegister(0, MVT::i32); + SDValue Ops[] = {Src, MaskInReg, Base, Scale, Index, Disp, Segment, Chain}; + SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops); + SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) }; + return DAG.getMergeValues(RetOps, array_lengthof(RetOps), dl); +} + +static SDValue getMGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, + SDValue Src, SDValue Mask, SDValue Base, + SDValue Index, SDValue ScaleOp, SDValue Chain, + const X86Subtarget * Subtarget) { + SDLoc dl(Op); + ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp); + assert(C && "Invalid scale type"); + SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8); + EVT MaskVT = MVT::getVectorVT(MVT::i1, + Index.getValueType().getVectorNumElements()); + SDValue MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask); + SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other); + SDValue Disp = DAG.getTargetConstant(0, MVT::i32); + SDValue 
Segment = DAG.getRegister(0, MVT::i32); + if (Src.getOpcode() == ISD::UNDEF) + Src = getZeroVector(Op.getValueType(), Subtarget, DAG, dl); + SDValue Ops[] = {Src, MaskInReg, Base, Scale, Index, Disp, Segment, Chain}; + SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops); + SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) }; + return DAG.getMergeValues(RetOps, array_lengthof(RetOps), dl); +} + +static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, + SDValue Src, SDValue Base, SDValue Index, + SDValue ScaleOp, SDValue Chain) { + SDLoc dl(Op); + ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp); + assert(C && "Invalid scale type"); + SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8); + SDValue Disp = DAG.getTargetConstant(0, MVT::i32); + SDValue Segment = DAG.getRegister(0, MVT::i32); + EVT MaskVT = MVT::getVectorVT(MVT::i1, + Index.getValueType().getVectorNumElements()); + SDValue MaskInReg = DAG.getConstant(~0, MaskVT); + SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other); + SDValue Ops[] = {Base, Scale, Index, Disp, Segment, MaskInReg, Src, Chain}; + SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops); + return SDValue(Res, 1); +} + +static SDValue getMScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, + SDValue Src, SDValue Mask, SDValue Base, + SDValue Index, SDValue ScaleOp, SDValue Chain) { + SDLoc dl(Op); + ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp); + assert(C && "Invalid scale type"); + SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8); + SDValue Disp = DAG.getTargetConstant(0, MVT::i32); + SDValue Segment = DAG.getRegister(0, MVT::i32); + EVT MaskVT = MVT::getVectorVT(MVT::i1, + Index.getValueType().getVectorNumElements()); + SDValue MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask); + SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other); + SDValue Ops[] = {Base, Scale, Index, Disp, Segment, MaskInReg, Src, Chain}; + SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops); + return SDValue(Res, 1); +} + +static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc dl(Op); unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); switch (IntNo) { default: return SDValue(); // Don't custom lower most intrinsics. 
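// Rough sketch of what the gather/scatter helpers above assemble (descriptive
// only): the machine nodes take the usual X86 five-part memory operand, so for
// a gather, element i loads from
//
//   addr(i) = Base + Index[i] (treated as a signed index) * Scale + Disp
//
// with Disp hard-wired to 0 and Segment to register 0 here, and with the
// per-element predicate either an all-ones v<N>i1 mask (the unmasked
// int_gather form) or the intrinsic's mask operand bitcast to v<N>i1.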
@@ -10987,7 +11850,144 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) { return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid, SDValue(Result.getNode(), 2)); } - + //int_gather(index, base, scale); + case Intrinsic::x86_avx512_gather_qpd_512: + case Intrinsic::x86_avx512_gather_qps_512: + case Intrinsic::x86_avx512_gather_dpd_512: + case Intrinsic::x86_avx512_gather_qpi_512: + case Intrinsic::x86_avx512_gather_qpq_512: + case Intrinsic::x86_avx512_gather_dpq_512: + case Intrinsic::x86_avx512_gather_dps_512: + case Intrinsic::x86_avx512_gather_dpi_512: { + unsigned Opc; + switch (IntNo) { + default: llvm_unreachable("Unexpected intrinsic!"); + case Intrinsic::x86_avx512_gather_qps_512: Opc = X86::VGATHERQPSZrm; break; + case Intrinsic::x86_avx512_gather_qpd_512: Opc = X86::VGATHERQPDZrm; break; + case Intrinsic::x86_avx512_gather_dpd_512: Opc = X86::VGATHERDPDZrm; break; + case Intrinsic::x86_avx512_gather_dps_512: Opc = X86::VGATHERDPSZrm; break; + case Intrinsic::x86_avx512_gather_qpi_512: Opc = X86::VPGATHERQDZrm; break; + case Intrinsic::x86_avx512_gather_qpq_512: Opc = X86::VPGATHERQQZrm; break; + case Intrinsic::x86_avx512_gather_dpi_512: Opc = X86::VPGATHERDDZrm; break; + case Intrinsic::x86_avx512_gather_dpq_512: Opc = X86::VPGATHERDQZrm; break; + } + SDValue Chain = Op.getOperand(0); + SDValue Index = Op.getOperand(2); + SDValue Base = Op.getOperand(3); + SDValue Scale = Op.getOperand(4); + return getGatherNode(Opc, Op, DAG, Base, Index, Scale, Chain, Subtarget); + } + //int_gather_mask(v1, mask, index, base, scale); + case Intrinsic::x86_avx512_gather_qps_mask_512: + case Intrinsic::x86_avx512_gather_qpd_mask_512: + case Intrinsic::x86_avx512_gather_dpd_mask_512: + case Intrinsic::x86_avx512_gather_dps_mask_512: + case Intrinsic::x86_avx512_gather_qpi_mask_512: + case Intrinsic::x86_avx512_gather_qpq_mask_512: + case Intrinsic::x86_avx512_gather_dpi_mask_512: + case Intrinsic::x86_avx512_gather_dpq_mask_512: { + unsigned Opc; + switch (IntNo) { + default: llvm_unreachable("Unexpected intrinsic!"); + case Intrinsic::x86_avx512_gather_qps_mask_512: + Opc = X86::VGATHERQPSZrm; break; + case Intrinsic::x86_avx512_gather_qpd_mask_512: + Opc = X86::VGATHERQPDZrm; break; + case Intrinsic::x86_avx512_gather_dpd_mask_512: + Opc = X86::VGATHERDPDZrm; break; + case Intrinsic::x86_avx512_gather_dps_mask_512: + Opc = X86::VGATHERDPSZrm; break; + case Intrinsic::x86_avx512_gather_qpi_mask_512: + Opc = X86::VPGATHERQDZrm; break; + case Intrinsic::x86_avx512_gather_qpq_mask_512: + Opc = X86::VPGATHERQQZrm; break; + case Intrinsic::x86_avx512_gather_dpi_mask_512: + Opc = X86::VPGATHERDDZrm; break; + case Intrinsic::x86_avx512_gather_dpq_mask_512: + Opc = X86::VPGATHERDQZrm; break; + } + SDValue Chain = Op.getOperand(0); + SDValue Src = Op.getOperand(2); + SDValue Mask = Op.getOperand(3); + SDValue Index = Op.getOperand(4); + SDValue Base = Op.getOperand(5); + SDValue Scale = Op.getOperand(6); + return getMGatherNode(Opc, Op, DAG, Src, Mask, Base, Index, Scale, Chain, + Subtarget); + } + //int_scatter(base, index, v1, scale); + case Intrinsic::x86_avx512_scatter_qpd_512: + case Intrinsic::x86_avx512_scatter_qps_512: + case Intrinsic::x86_avx512_scatter_dpd_512: + case Intrinsic::x86_avx512_scatter_qpi_512: + case Intrinsic::x86_avx512_scatter_qpq_512: + case Intrinsic::x86_avx512_scatter_dpq_512: + case Intrinsic::x86_avx512_scatter_dps_512: + case Intrinsic::x86_avx512_scatter_dpi_512: { + unsigned Opc; + switch (IntNo) { + default: 
llvm_unreachable("Unexpected intrinsic!"); + case Intrinsic::x86_avx512_scatter_qpd_512: + Opc = X86::VSCATTERQPDZmr; break; + case Intrinsic::x86_avx512_scatter_qps_512: + Opc = X86::VSCATTERQPSZmr; break; + case Intrinsic::x86_avx512_scatter_dpd_512: + Opc = X86::VSCATTERDPDZmr; break; + case Intrinsic::x86_avx512_scatter_dps_512: + Opc = X86::VSCATTERDPSZmr; break; + case Intrinsic::x86_avx512_scatter_qpi_512: + Opc = X86::VPSCATTERQDZmr; break; + case Intrinsic::x86_avx512_scatter_qpq_512: + Opc = X86::VPSCATTERQQZmr; break; + case Intrinsic::x86_avx512_scatter_dpq_512: + Opc = X86::VPSCATTERDQZmr; break; + case Intrinsic::x86_avx512_scatter_dpi_512: + Opc = X86::VPSCATTERDDZmr; break; + } + SDValue Chain = Op.getOperand(0); + SDValue Base = Op.getOperand(2); + SDValue Index = Op.getOperand(3); + SDValue Src = Op.getOperand(4); + SDValue Scale = Op.getOperand(5); + return getScatterNode(Opc, Op, DAG, Src, Base, Index, Scale, Chain); + } + //int_scatter_mask(base, mask, index, v1, scale); + case Intrinsic::x86_avx512_scatter_qps_mask_512: + case Intrinsic::x86_avx512_scatter_qpd_mask_512: + case Intrinsic::x86_avx512_scatter_dpd_mask_512: + case Intrinsic::x86_avx512_scatter_dps_mask_512: + case Intrinsic::x86_avx512_scatter_qpi_mask_512: + case Intrinsic::x86_avx512_scatter_qpq_mask_512: + case Intrinsic::x86_avx512_scatter_dpi_mask_512: + case Intrinsic::x86_avx512_scatter_dpq_mask_512: { + unsigned Opc; + switch (IntNo) { + default: llvm_unreachable("Unexpected intrinsic!"); + case Intrinsic::x86_avx512_scatter_qpd_mask_512: + Opc = X86::VSCATTERQPDZmr; break; + case Intrinsic::x86_avx512_scatter_qps_mask_512: + Opc = X86::VSCATTERQPSZmr; break; + case Intrinsic::x86_avx512_scatter_dpd_mask_512: + Opc = X86::VSCATTERDPDZmr; break; + case Intrinsic::x86_avx512_scatter_dps_mask_512: + Opc = X86::VSCATTERDPSZmr; break; + case Intrinsic::x86_avx512_scatter_qpi_mask_512: + Opc = X86::VPSCATTERQDZmr; break; + case Intrinsic::x86_avx512_scatter_qpq_mask_512: + Opc = X86::VPSCATTERQQZmr; break; + case Intrinsic::x86_avx512_scatter_dpq_mask_512: + Opc = X86::VPSCATTERDQZmr; break; + case Intrinsic::x86_avx512_scatter_dpi_mask_512: + Opc = X86::VPSCATTERDDZmr; break; + } + SDValue Chain = Op.getOperand(0); + SDValue Base = Op.getOperand(2); + SDValue Mask = Op.getOperand(3); + SDValue Index = Op.getOperand(4); + SDValue Src = Op.getOperand(5); + SDValue Scale = Op.getOperand(6); + return getMScatterNode(Opc, Op, DAG, Src, Mask, Base, Index, Scale, Chain); + } // XTEST intrinsics. 
case Intrinsic::x86_xtest: { SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other); @@ -11008,13 +12008,14 @@ SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op, MFI->setReturnAddressIsTaken(true); unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); EVT PtrVT = getPointerTy(); if (Depth > 0) { SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); - SDValue Offset = - DAG.getConstant(RegInfo->getSlotSize(), PtrVT); + const X86RegisterInfo *RegInfo = + static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo()); + SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), PtrVT); return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset), @@ -11032,8 +12033,10 @@ SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { MFI->setFrameAddressIsTaken(true); EVT VT = Op.getValueType(); - DebugLoc dl = Op.getDebugLoc(); // FIXME probably not meaningful + SDLoc dl(Op); // FIXME probably not meaningful unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); + const X86RegisterInfo *RegInfo = + static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo()); unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction()); assert(((FrameReg == X86::RBP && VT == MVT::i64) || (FrameReg == X86::EBP && VT == MVT::i32)) && @@ -11048,6 +12051,8 @@ SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op, SelectionDAG &DAG) const { + const X86RegisterInfo *RegInfo = + static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo()); return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize()); } @@ -11055,9 +12060,11 @@ SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const { SDValue Chain = Op.getOperand(0); SDValue Offset = Op.getOperand(1); SDValue Handler = Op.getOperand(2); - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl (Op); EVT PtrVT = getPointerTy(); + const X86RegisterInfo *RegInfo = + static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo()); unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction()); assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) || (FrameReg == X86::EBP && PtrVT == MVT::i32)) && @@ -11078,7 +12085,7 @@ SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const { SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const { - DebugLoc DL = Op.getDebugLoc(); + SDLoc DL(Op); return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL, DAG.getVTList(MVT::i32, MVT::Other), Op.getOperand(0), Op.getOperand(1)); @@ -11086,7 +12093,7 @@ SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op, SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const { - DebugLoc DL = Op.getDebugLoc(); + SDLoc DL(Op); return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other, Op.getOperand(0), Op.getOperand(1)); } @@ -11101,7 +12108,7 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op, SDValue Trmp = Op.getOperand(1); // trampoline SDValue FPtr = Op.getOperand(2); // nested function SDValue Nest = Op.getOperand(3); // 'nest' parameter value - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl (Op); const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); const TargetRegisterInfo* TRI = getTargetMachine().getRegisterInfo(); @@ -11271,7 +12278,7 @@ SDValue 
X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op, const TargetFrameLowering &TFI = *TM.getFrameLowering(); unsigned StackAlignment = TFI.getStackAlignment(); EVT VT = Op.getValueType(); - DebugLoc DL = Op.getDebugLoc(); + SDLoc DL(Op); // Save FP Control Word to stack slot int SSFI = MF.getFrameInfo()->CreateStackObject(2, StackAlignment, false); @@ -11318,7 +12325,7 @@ static SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG) { EVT VT = Op.getValueType(); EVT OpVT = VT; unsigned NumBits = VT.getSizeInBits(); - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); Op = Op.getOperand(0); if (VT == MVT::i8) { @@ -11352,7 +12359,7 @@ static SDValue LowerCTLZ_ZERO_UNDEF(SDValue Op, SelectionDAG &DAG) { EVT VT = Op.getValueType(); EVT OpVT = VT; unsigned NumBits = VT.getSizeInBits(); - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); Op = Op.getOperand(0); if (VT == MVT::i8) { @@ -11376,7 +12383,7 @@ static SDValue LowerCTLZ_ZERO_UNDEF(SDValue Op, SelectionDAG &DAG) { static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) { EVT VT = Op.getValueType(); unsigned NumBits = VT.getSizeInBits(); - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); Op = Op.getOperand(0); // Issue a bsf (scan bits forward) which also sets EFLAGS. @@ -11402,7 +12409,7 @@ static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) { "Unsupported value type for operation"); unsigned NumElems = VT.getVectorNumElements(); - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); // Extract the LHS vectors SDValue LHS = Op.getOperand(0); @@ -11438,7 +12445,7 @@ static SDValue LowerSUB(SDValue Op, SelectionDAG &DAG) { static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); EVT VT = Op.getValueType(); // Decompose 256-bit ops into smaller 128-bit ops. @@ -11454,7 +12461,7 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget, "Should not custom lower when pmuldq is available!"); // Extract the odd parts. - const int UnpackMask[] = { 1, -1, 3, -1 }; + static const int UnpackMask[] = { 1, -1, 3, -1 }; SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask); SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask); @@ -11468,12 +12475,12 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget, // Merge the two vectors back together with a shuffle. This expands into 2 // shuffles. - const int ShufMask[] = { 0, 4, 2, 6 }; + static const int ShufMask[] = { 0, 4, 2, 6 }; return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask); } - assert((VT == MVT::v2i64 || VT == MVT::v4i64) && - "Only know how to lower V2I64/V4I64 multiply"); + assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) && + "Only know how to lower V2I64/V4I64/V8I64 multiply"); // Ahi = psrlqi(a, 32); // Bhi = psrlqi(b, 32); @@ -11486,13 +12493,12 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget, // AhiBlo = psllqi(AhiBlo, 32); // return AloBlo + AloBhi + AhiBlo; - SDValue ShAmt = DAG.getConstant(32, MVT::i32); - - SDValue Ahi = DAG.getNode(X86ISD::VSRLI, dl, VT, A, ShAmt); - SDValue Bhi = DAG.getNode(X86ISD::VSRLI, dl, VT, B, ShAmt); + SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG); + SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG); // Bit cast to 32-bit vectors for MULUDQ - EVT MulVT = (VT == MVT::v2i64) ? MVT::v4i32 : MVT::v8i32; + EVT MulVT = (VT == MVT::v2i64) ? MVT::v4i32 : + (VT == MVT::v4i64) ? 
MVT::v8i32 : MVT::v16i32; A = DAG.getNode(ISD::BITCAST, dl, MulVT, A); B = DAG.getNode(ISD::BITCAST, dl, MulVT, B); Ahi = DAG.getNode(ISD::BITCAST, dl, MulVT, Ahi); @@ -11502,19 +12508,19 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget, SDValue AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi); SDValue AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B); - AloBhi = DAG.getNode(X86ISD::VSHLI, dl, VT, AloBhi, ShAmt); - AhiBlo = DAG.getNode(X86ISD::VSHLI, dl, VT, AhiBlo, ShAmt); + AloBhi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, AloBhi, 32, DAG); + AhiBlo = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, AhiBlo, 32, DAG); SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi); return DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo); } -SDValue X86TargetLowering::LowerSDIV(SDValue Op, SelectionDAG &DAG) const { +static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG) { EVT VT = Op.getValueType(); EVT EltTy = VT.getVectorElementType(); unsigned NumElts = VT.getVectorNumElements(); SDValue N0 = Op.getOperand(0); - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); // Lower sdiv X, pow2-const. BuildVectorSDNode *C = dyn_cast<BuildVectorSDNode>(Op.getOperand(1)); @@ -11522,23 +12528,35 @@ SDValue X86TargetLowering::LowerSDIV(SDValue Op, SelectionDAG &DAG) const { return SDValue(); APInt SplatValue, SplatUndef; - unsigned MinSplatBits; + unsigned SplatBitSize; bool HasAnyUndefs; - if (!C->isConstantSplat(SplatValue, SplatUndef, MinSplatBits, HasAnyUndefs)) + if (!C->isConstantSplat(SplatValue, SplatUndef, SplatBitSize, + HasAnyUndefs) || + EltTy.getSizeInBits() < SplatBitSize) return SDValue(); if ((SplatValue != 0) && (SplatValue.isPowerOf2() || (-SplatValue).isPowerOf2())) { - unsigned lg2 = SplatValue.countTrailingZeros(); + unsigned Lg2 = SplatValue.countTrailingZeros(); // Splat the sign bit. - SDValue Sz = DAG.getConstant(EltTy.getSizeInBits()-1, MVT::i32); - SDValue SGN = getTargetVShiftNode(X86ISD::VSRAI, dl, VT, N0, Sz, DAG); + SmallVector<SDValue, 16> Sz(NumElts, + DAG.getConstant(EltTy.getSizeInBits() - 1, + EltTy)); + SDValue SGN = DAG.getNode(ISD::SRA, dl, VT, N0, + DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Sz[0], + NumElts)); // Add (N0 < 0) ? abs2 - 1 : 0; - SDValue Amt = DAG.getConstant(EltTy.getSizeInBits() - lg2, MVT::i32); - SDValue SRL = getTargetVShiftNode(X86ISD::VSRLI, dl, VT, SGN, Amt, DAG); + SmallVector<SDValue, 16> Amt(NumElts, + DAG.getConstant(EltTy.getSizeInBits() - Lg2, + EltTy)); + SDValue SRL = DAG.getNode(ISD::SRL, dl, VT, SGN, + DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Amt[0], + NumElts)); SDValue ADD = DAG.getNode(ISD::ADD, dl, VT, N0, SRL); - SDValue Lg2Amt = DAG.getConstant(lg2, MVT::i32); - SDValue SRA = getTargetVShiftNode(X86ISD::VSRAI, dl, VT, ADD, Lg2Amt, DAG); + SmallVector<SDValue, 16> Lg2Amt(NumElts, DAG.getConstant(Lg2, EltTy)); + SDValue SRA = DAG.getNode(ISD::SRA, dl, VT, ADD, + DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Lg2Amt[0], + NumElts)); // If we're dividing by a positive value, we're done. Otherwise, we must // negate the result. 
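The power-of-two sdiv path just above now builds the classic round-toward-zero fixup out of generic ISD::SRA/SRL/ADD nodes with build-vector shift amounts instead of the old X86ISD immediate-shift nodes. A scalar check of the same three-step sequence against C's truncating division (illustrative only; it assumes '>>' of a negative value is an arithmetic shift, which holds for every x86 compiler):

#include <cassert>
#include <cstdint>

// sdiv x, 2^lg2 with round-toward-zero, mirroring the lowering above:
//   sgn  = x >> 31                      (splat the sign bit)
//   bias = (uint32_t)sgn >> (32 - lg2)  (2^lg2 - 1 if x < 0, else 0)
//   res  = (x + bias) >> lg2            (arithmetic shift)
int32_t sdiv_pow2(int32_t x, unsigned lg2) {
  int32_t sgn = x >> 31;
  int32_t bias = (int32_t)((uint32_t)sgn >> (32 - lg2));
  return (x + bias) >> lg2;
}

int main() {
  for (int32_t x = -1024; x <= 1024; ++x)
    for (unsigned lg2 = 1; lg2 < 31; ++lg2)
      assert(sdiv_pow2(x, lg2) == x / (int32_t)(1u << lg2));
  return 0;
}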
@@ -11555,7 +12573,7 @@ SDValue X86TargetLowering::LowerSDIV(SDValue Op, SelectionDAG &DAG) const { static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, const X86Subtarget *Subtarget) { EVT VT = Op.getValueType(); - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); SDValue R = Op.getOperand(0); SDValue Amt = Op.getOperand(1); @@ -11567,23 +12585,26 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, if (VT == MVT::v2i64 || VT == MVT::v4i32 || VT == MVT::v8i16 || (Subtarget->hasInt256() && - (VT == MVT::v4i64 || VT == MVT::v8i32 || VT == MVT::v16i16))) { + (VT == MVT::v4i64 || VT == MVT::v8i32 || VT == MVT::v16i16)) || + (Subtarget->hasAVX512() && + (VT == MVT::v8i64 || VT == MVT::v16i32))) { if (Op.getOpcode() == ISD::SHL) - return DAG.getNode(X86ISD::VSHLI, dl, VT, R, - DAG.getConstant(ShiftAmt, MVT::i32)); + return getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, R, ShiftAmt, + DAG); if (Op.getOpcode() == ISD::SRL) - return DAG.getNode(X86ISD::VSRLI, dl, VT, R, - DAG.getConstant(ShiftAmt, MVT::i32)); + return getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, + DAG); if (Op.getOpcode() == ISD::SRA && VT != MVT::v2i64 && VT != MVT::v4i64) - return DAG.getNode(X86ISD::VSRAI, dl, VT, R, - DAG.getConstant(ShiftAmt, MVT::i32)); + return getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, ShiftAmt, + DAG); } if (VT == MVT::v16i8) { if (Op.getOpcode() == ISD::SHL) { // Make a large shift. - SDValue SHL = DAG.getNode(X86ISD::VSHLI, dl, MVT::v8i16, R, - DAG.getConstant(ShiftAmt, MVT::i32)); + SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, + MVT::v8i16, R, ShiftAmt, + DAG); SHL = DAG.getNode(ISD::BITCAST, dl, VT, SHL); // Zero out the rightmost bits. SmallVector<SDValue, 16> V(16, @@ -11594,8 +12615,9 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, } if (Op.getOpcode() == ISD::SRL) { // Make a large shift. - SDValue SRL = DAG.getNode(X86ISD::VSRLI, dl, MVT::v8i16, R, - DAG.getConstant(ShiftAmt, MVT::i32)); + SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, + MVT::v8i16, R, ShiftAmt, + DAG); SRL = DAG.getNode(ISD::BITCAST, dl, VT, SRL); // Zero out the leftmost bits. SmallVector<SDValue, 16> V(16, @@ -11626,8 +12648,9 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, if (Subtarget->hasInt256() && VT == MVT::v32i8) { if (Op.getOpcode() == ISD::SHL) { // Make a large shift. - SDValue SHL = DAG.getNode(X86ISD::VSHLI, dl, MVT::v16i16, R, - DAG.getConstant(ShiftAmt, MVT::i32)); + SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, + MVT::v16i16, R, ShiftAmt, + DAG); SHL = DAG.getNode(ISD::BITCAST, dl, VT, SHL); // Zero out the rightmost bits. SmallVector<SDValue, 32> V(32, @@ -11638,8 +12661,9 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, } if (Op.getOpcode() == ISD::SRL) { // Make a large shift. - SDValue SRL = DAG.getNode(X86ISD::VSRLI, dl, MVT::v16i16, R, - DAG.getConstant(ShiftAmt, MVT::i32)); + SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, + MVT::v16i16, R, ShiftAmt, + DAG); SRL = DAG.getNode(ISD::BITCAST, dl, VT, SRL); // Zero out the leftmost bits. 
SmallVector<SDValue, 32> V(32, @@ -11704,14 +12728,14 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, default: llvm_unreachable("Unknown shift opcode!"); case ISD::SHL: - return DAG.getNode(X86ISD::VSHLI, dl, VT, R, - DAG.getConstant(ShiftAmt, MVT::i32)); + return getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, R, ShiftAmt, + DAG); case ISD::SRL: - return DAG.getNode(X86ISD::VSRLI, dl, VT, R, - DAG.getConstant(ShiftAmt, MVT::i32)); + return getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, + DAG); case ISD::SRA: - return DAG.getNode(X86ISD::VSRAI, dl, VT, R, - DAG.getConstant(ShiftAmt, MVT::i32)); + return getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, ShiftAmt, + DAG); } } @@ -11721,7 +12745,7 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG, const X86Subtarget* Subtarget) { EVT VT = Op.getValueType(); - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); SDValue R = Op.getOperand(0); SDValue Amt = Op.getOperand(1); @@ -11729,7 +12753,8 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG, VT == MVT::v4i32 || VT == MVT::v8i16 || (Subtarget->hasInt256() && ((VT == MVT::v4i64 && Op.getOpcode() != ISD::SRA) || - VT == MVT::v8i32 || VT == MVT::v16i16))) { + VT == MVT::v8i32 || VT == MVT::v16i16)) || + (Subtarget->hasAVX512() && (VT == MVT::v8i64 || VT == MVT::v16i32))) { SDValue BaseShAmt; EVT EltVT = VT.getVectorElementType(); @@ -11797,6 +12822,8 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG, case MVT::v4i64: case MVT::v8i32: case MVT::v16i16: + case MVT::v16i32: + case MVT::v8i64: return getTargetVShiftNode(X86ISD::VSHLI, dl, VT, R, BaseShAmt, DAG); } case ISD::SRA: @@ -11806,6 +12833,8 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG, case MVT::v8i16: case MVT::v8i32: case MVT::v16i16: + case MVT::v16i32: + case MVT::v8i64: return getTargetVShiftNode(X86ISD::VSRAI, dl, VT, R, BaseShAmt, DAG); } case ISD::SRL: @@ -11817,6 +12846,8 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG, case MVT::v4i64: case MVT::v8i32: case MVT::v16i16: + case MVT::v16i32: + case MVT::v8i64: return getTargetVShiftNode(X86ISD::VSRLI, dl, VT, R, BaseShAmt, DAG); } } @@ -11825,7 +12856,8 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG, // Special case in 32-bit mode, where i64 is expanded into high and low parts. if (!Subtarget->is64Bit() && - (VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64)) && + (VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64) || + (Subtarget->hasAVX512() && VT == MVT::v8i64)) && Amt.getOpcode() == ISD::BITCAST && Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) { Amt = Amt.getOperand(0); @@ -11854,10 +12886,11 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG, return SDValue(); } -SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const { +static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget, + SelectionDAG &DAG) { EVT VT = Op.getValueType(); - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); SDValue R = Op.getOperand(0); SDValue Amt = Op.getOperand(1); SDValue V; @@ -11873,6 +12906,8 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const { if (V.getNode()) return V; + if (Subtarget->hasAVX512() && (VT == MVT::v16i32 || VT == MVT::v8i64)) + return Op; // AVX2 has VPSLLV/VPSRAV/VPSRLV. 
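For reference, the v16i8/v32i8 immediate-shift paths a little further up use the usual trick of shifting at 16-bit granularity and then masking away the bits that crossed a byte boundary, since x86 has no byte-granular vector shifts. A two-byte scalar model of the left-shift case (the helper name is made up; this is not the patch's code):

#include <cassert>
#include <cstdint>

// Two bytes packed into one 16-bit lane: do the 16-bit shift (the psllw
// step), then clear the low k bits of each byte (the pand step) to drop
// whatever leaked across the byte boundary.
uint16_t shl_bytes_via_u16(uint16_t v, unsigned k) {
  uint16_t wide = (uint16_t)(v << k);
  uint16_t keep = (uint16_t)((uint8_t)(0xFFu << k));  // per-byte mask
  keep = (uint16_t)(keep | (keep << 8));
  return (uint16_t)(wide & keep);
}

int main() {
  for (unsigned k = 0; k < 8; ++k)
    for (unsigned v = 0; v <= 0xFFFF; ++v) {
      uint8_t lo = (uint8_t)v, hi = (uint8_t)(v >> 8);
      uint16_t expect = (uint16_t)(((uint16_t)(uint8_t)(hi << k) << 8) |
                                   (uint8_t)(lo << k));
      assert(shl_bytes_via_u16((uint16_t)v, k) == expect);
    }
  return 0;
}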
if (Subtarget->hasInt256()) { if (Op.getOpcode() == ISD::SRL && @@ -11913,8 +12948,7 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const { // r = VSELECT(r, psllw(r & (char16)15, 4), a); SDValue M = DAG.getNode(ISD::AND, dl, VT, R, CM1); - M = getTargetVShiftNode(X86ISD::VSHLI, dl, MVT::v8i16, M, - DAG.getConstant(4, MVT::i32), DAG); + M = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, MVT::v8i16, M, 4, DAG); M = DAG.getNode(ISD::BITCAST, dl, VT, M); R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, M, R); @@ -11925,8 +12959,7 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const { // r = VSELECT(r, psllw(r & (char16)63, 2), a); M = DAG.getNode(ISD::AND, dl, VT, R, CM2); - M = getTargetVShiftNode(X86ISD::VSHLI, dl, MVT::v8i16, M, - DAG.getConstant(2, MVT::i32), DAG); + M = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, MVT::v8i16, M, 2, DAG); M = DAG.getNode(ISD::BITCAST, dl, VT, M); R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, M, R); @@ -11993,7 +13026,7 @@ static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) { SDValue RHS = N->getOperand(1); unsigned BaseOp = 0; unsigned Cond = 0; - DebugLoc DL = Op.getDebugLoc(); + SDLoc DL(Op); switch (Op.getOpcode()) { default: llvm_unreachable("Unknown ovf instruction!"); case ISD::SADDO: @@ -12060,7 +13093,7 @@ static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) { SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const { - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT(); EVT VT = Op.getValueType(); @@ -12069,7 +13102,6 @@ SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, unsigned BitsDiff = VT.getScalarType().getSizeInBits() - ExtraVT.getScalarType().getSizeInBits(); - SDValue ShAmt = DAG.getConstant(BitsDiff, MVT::i32); switch (VT.getSimpleVT().SimpleTy) { default: return SDValue(); @@ -12103,31 +13135,41 @@ SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, // fall through case MVT::v4i32: case MVT::v8i16: { - // (sext (vzext x)) -> (vsext x) SDValue Op0 = Op.getOperand(0); SDValue Op00 = Op0.getOperand(0); SDValue Tmp1; // Hopefully, this VECTOR_SHUFFLE is just a VZEXT. if (Op0.getOpcode() == ISD::BITCAST && - Op00.getOpcode() == ISD::VECTOR_SHUFFLE) - Tmp1 = LowerVectorIntExtend(Op00, DAG); - if (Tmp1.getNode()) { - SDValue Tmp1Op0 = Tmp1.getOperand(0); - assert(Tmp1Op0.getOpcode() == X86ISD::VZEXT && - "This optimization is invalid without a VZEXT."); - return DAG.getNode(X86ISD::VSEXT, dl, VT, Tmp1Op0.getOperand(0)); + Op00.getOpcode() == ISD::VECTOR_SHUFFLE) { + // (sext (vzext x)) -> (vsext x) + Tmp1 = LowerVectorIntExtend(Op00, Subtarget, DAG); + if (Tmp1.getNode()) { + EVT ExtraEltVT = ExtraVT.getVectorElementType(); + // This folding is only valid when the in-reg type is a vector of i8, + // i16, or i32. + if (ExtraEltVT == MVT::i8 || ExtraEltVT == MVT::i16 || + ExtraEltVT == MVT::i32) { + SDValue Tmp1Op0 = Tmp1.getOperand(0); + assert(Tmp1Op0.getOpcode() == X86ISD::VZEXT && + "This optimization is invalid without a VZEXT."); + return DAG.getNode(X86ISD::VSEXT, dl, VT, Tmp1Op0.getOperand(0)); + } + Op0 = Tmp1; + } } // If the above didn't work, then just use Shift-Left + Shift-Right. 
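The "Shift-Left + Shift-Right" fallback referred to just above is the standard sign_extend_inreg identity: shift the field up so its top bit lands in the sign position, then arithmetic-shift it back down by the same BitsDiff. A scalar sketch, assuming two's-complement wrap-around and an arithmetic '>>', both of which hold on x86 compilers:

#include <cassert>
#include <cstdint>

// Sign-extend the low `bits` bits of a 32-bit value in place.
int32_t sext_inreg(int32_t x, unsigned bits) {
  unsigned diff = 32u - bits;                     // BitsDiff in the lowering
  return (int32_t)((uint32_t)x << diff) >> diff;  // shl, then sra
}

int main() {
  assert(sext_inreg(0x000000FF, 8) == -1);       // i8 0xFF -> -1
  assert(sext_inreg(0x0000007F, 8) == 127);
  assert(sext_inreg(0x00008000, 16) == -32768);  // i16 0x8000 -> INT16_MIN
  assert(sext_inreg(0x00012345, 16) == 0x2345);  // bits above the field drop out
  return 0;
}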
- Tmp1 = getTargetVShiftNode(X86ISD::VSHLI, dl, VT, Op0, ShAmt, DAG); - return getTargetVShiftNode(X86ISD::VSRAI, dl, VT, Tmp1, ShAmt, DAG); + Tmp1 = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Op0, BitsDiff, + DAG); + return getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, Tmp1, BitsDiff, + DAG); } } } static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>( cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue()); SynchronizationScope FenceScope = static_cast<SynchronizationScope>( @@ -12164,7 +13206,7 @@ static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget *Subtarget, static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { EVT T = Op.getValueType(); - DebugLoc DL = Op.getDebugLoc(); + SDLoc DL(Op); unsigned Reg = 0; unsigned size = 0; switch(T.getSimpleVT().SimpleTy) { @@ -12198,7 +13240,7 @@ static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget *Subtarget, assert(Subtarget->is64Bit() && "Result not type legalized?"); SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); SDValue TheChain = Op.getOperand(0); - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1); SDValue rax = DAG.getCopyFromReg(rd, dl, X86::RAX, MVT::i64, rd.getValue(1)); SDValue rdx = DAG.getCopyFromReg(rax.getValue(1), dl, X86::RDX, MVT::i64, @@ -12212,9 +13254,10 @@ static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget *Subtarget, return DAG.getMergeValues(Ops, array_lengthof(Ops), dl); } -SDValue X86TargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const { - EVT SrcVT = Op.getOperand(0).getValueType(); - EVT DstVT = Op.getValueType(); +static SDValue LowerBITCAST(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + MVT SrcVT = Op.getOperand(0).getSimpleValueType(); + MVT DstVT = Op.getSimpleValueType(); assert(Subtarget->is64Bit() && !Subtarget->hasSSE2() && Subtarget->hasMMX() && "Unexpected custom BITCAST"); assert((DstVT == MVT::i64 || @@ -12234,7 +13277,7 @@ SDValue X86TargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const { static SDValue LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) { SDNode *Node = Op.getNode(); - DebugLoc dl = Node->getDebugLoc(); + SDLoc dl(Node); EVT T = Node->getValueType(0); SDValue negOp = DAG.getNode(ISD::SUB, dl, T, DAG.getConstant(0, T), Node->getOperand(2)); @@ -12250,7 +13293,7 @@ static SDValue LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) { static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) { SDNode *Node = Op.getNode(); - DebugLoc dl = Node->getDebugLoc(); + SDLoc dl(Node); EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT(); // Convert seq_cst store -> xchg @@ -12293,25 +13336,26 @@ static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) { } if (!ExtraOp) - return DAG.getNode(Opc, Op->getDebugLoc(), VTs, Op.getOperand(0), + return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1)); - return DAG.getNode(Opc, Op->getDebugLoc(), VTs, Op.getOperand(0), + return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1), Op.getOperand(2)); } -SDValue X86TargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const { +static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { assert(Subtarget->isTargetDarwin() && Subtarget->is64Bit()); // For MacOSX, we want to call an 
alternative entry point: __sincos_stret, // which returns the values as { float, float } (in XMM0) or // { double, double } (which is returned in XMM0, XMM1). - DebugLoc dl = Op.getDebugLoc(); + SDLoc dl(Op); SDValue Arg = Op.getOperand(0); EVT ArgVT = Arg.getValueType(); Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); - ArgListTy Args; - ArgListEntry Entry; + TargetLowering::ArgListTy Args; + TargetLowering::ArgListEntry Entry; Entry.Node = Arg; Entry.Ty = ArgTy; @@ -12324,7 +13368,8 @@ SDValue X86TargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const { // the small struct {f32, f32} is returned in (eax, edx). For f64, // the results are returned via SRet in memory. const char *LibcallName = isF64 ? "__sincos_stret" : "__sincosf_stret"; - SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy()); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + SDValue Callee = DAG.getExternalSymbol(LibcallName, TLI.getPointerTy()); Type *RetTy = isF64 ? (Type*)StructType::get(ArgTy, ArgTy, NULL) @@ -12335,7 +13380,7 @@ SDValue X86TargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const { CallingConv::C, /*isTaillCall=*/false, /*doesNotRet=*/false, /*isReturnValueUsed*/true, Callee, Args, DAG, dl); - std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); + std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI); if (isF64) // Returned in xmm0 and xmm1. @@ -12379,9 +13424,9 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG); case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG); case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG); - case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, DAG); - case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, DAG); - case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, DAG); + case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG); + case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, Subtarget, DAG); + case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, Subtarget, DAG); case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG); case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG); case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG); @@ -12397,7 +13442,8 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::VAARG: return LowerVAARG(Op, DAG); case ISD::VACOPY: return LowerVACOPY(Op, Subtarget, DAG); case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); - case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, DAG); + case ISD::INTRINSIC_VOID: + case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG); case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); case ISD::FRAME_TO_ARGS_OFFSET: @@ -12415,7 +13461,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::MUL: return LowerMUL(Op, Subtarget, DAG); case ISD::SRA: case ISD::SRL: - case ISD::SHL: return LowerShift(Op, DAG); + case ISD::SHL: return LowerShift(Op, Subtarget, DAG); case ISD::SADDO: case ISD::UADDO: case ISD::SSUBO: @@ -12423,7 +13469,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::SMULO: case ISD::UMULO: return LowerXALUO(Op, DAG); case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget,DAG); - case ISD::BITCAST: return LowerBITCAST(Op, DAG); + case ISD::BITCAST: return LowerBITCAST(Op, Subtarget, DAG); case ISD::ADDC: case 
ISD::ADDE: case ISD::SUBC: @@ -12431,14 +13477,14 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::ADD: return LowerADD(Op, DAG); case ISD::SUB: return LowerSUB(Op, DAG); case ISD::SDIV: return LowerSDIV(Op, DAG); - case ISD::FSINCOS: return LowerFSINCOS(Op, DAG); + case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG); } } static void ReplaceATOMIC_LOAD(SDNode *Node, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) { - DebugLoc dl = Node->getDebugLoc(); + SDLoc dl(Node); EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT(); // Convert wide load -> cmpxchg8b/cmpxchg16b @@ -12459,7 +13505,7 @@ static void ReplaceATOMIC_LOAD(SDNode *Node, static void ReplaceATOMIC_BINARY_64(SDNode *Node, SmallVectorImpl<SDValue>&Results, SelectionDAG &DAG, unsigned NewOp) { - DebugLoc dl = Node->getDebugLoc(); + SDLoc dl(Node); assert (Node->getValueType(0) == MVT::i64 && "Only know how to expand i64 atomics"); @@ -12484,7 +13530,7 @@ ReplaceATOMIC_BINARY_64(SDNode *Node, SmallVectorImpl<SDValue>&Results, void X86TargetLowering::ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue>&Results, SelectionDAG &DAG) const { - DebugLoc dl = N->getDebugLoc(); + SDLoc dl(N); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); switch (N->getOpcode()) { default: @@ -12668,6 +13714,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::SHLD: return "X86ISD::SHLD"; case X86ISD::SHRD: return "X86ISD::SHRD"; case X86ISD::FAND: return "X86ISD::FAND"; + case X86ISD::FANDN: return "X86ISD::FANDN"; case X86ISD::FOR: return "X86ISD::FOR"; case X86ISD::FXOR: return "X86ISD::FXOR"; case X86ISD::FSRL: return "X86ISD::FSRL"; @@ -12684,6 +13731,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::CMP: return "X86ISD::CMP"; case X86ISD::COMI: return "X86ISD::COMI"; case X86ISD::UCOMI: return "X86ISD::UCOMI"; + case X86ISD::CMPM: return "X86ISD::CMPM"; + case X86ISD::CMPMU: return "X86ISD::CMPMU"; case X86ISD::SETCC: return "X86ISD::SETCC"; case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY"; case X86ISD::FSETCCsd: return "X86ISD::FSETCCsd"; @@ -12743,6 +13792,9 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD"; case X86ISD::VZEXT: return "X86ISD::VZEXT"; case X86ISD::VSEXT: return "X86ISD::VSEXT"; + case X86ISD::VTRUNC: return "X86ISD::VTRUNC"; + case X86ISD::VTRUNCM: return "X86ISD::VTRUNCM"; + case X86ISD::VINSERT: return "X86ISD::VINSERT"; case X86ISD::VFPEXT: return "X86ISD::VFPEXT"; case X86ISD::VFPROUND: return "X86ISD::VFPROUND"; case X86ISD::VSHLDQ: return "X86ISD::VSHLDQ"; @@ -12756,6 +13808,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::CMPP: return "X86ISD::CMPP"; case X86ISD::PCMPEQ: return "X86ISD::PCMPEQ"; case X86ISD::PCMPGT: return "X86ISD::PCMPGT"; + case X86ISD::PCMPEQM: return "X86ISD::PCMPEQM"; + case X86ISD::PCMPGTM: return "X86ISD::PCMPGTM"; case X86ISD::ADD: return "X86ISD::ADD"; case X86ISD::SUB: return "X86ISD::SUB"; case X86ISD::ADC: return "X86ISD::ADC"; @@ -12770,9 +13824,14 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::BLSI: return "X86ISD::BLSI"; case X86ISD::BLSMSK: return "X86ISD::BLSMSK"; case X86ISD::BLSR: return "X86ISD::BLSR"; + case X86ISD::BZHI: return "X86ISD::BZHI"; + case X86ISD::BEXTR: return "X86ISD::BEXTR"; case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM"; case X86ISD::PTEST: return 
"X86ISD::PTEST"; case X86ISD::TESTP: return "X86ISD::TESTP"; + case X86ISD::TESTM: return "X86ISD::TESTM"; + case X86ISD::KORTEST: return "X86ISD::KORTEST"; + case X86ISD::KTEST: return "X86ISD::KTEST"; case X86ISD::PALIGNR: return "X86ISD::PALIGNR"; case X86ISD::PSHUFD: return "X86ISD::PSHUFD"; case X86ISD::PSHUFHW: return "X86ISD::PSHUFHW"; @@ -12791,9 +13850,11 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::UNPCKL: return "X86ISD::UNPCKL"; case X86ISD::UNPCKH: return "X86ISD::UNPCKH"; case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST"; + case X86ISD::VBROADCASTM: return "X86ISD::VBROADCASTM"; case X86ISD::VPERMILP: return "X86ISD::VPERMILP"; case X86ISD::VPERM2X128: return "X86ISD::VPERM2X128"; case X86ISD::VPERMV: return "X86ISD::VPERMV"; + case X86ISD::VPERMV3: return "X86ISD::VPERMV3"; case X86ISD::VPERMI: return "X86ISD::VPERMI"; case X86ISD::PMULUDQ: return "X86ISD::PMULUDQ"; case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS"; @@ -12879,6 +13940,20 @@ bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const { return NumBits1 > NumBits2; } +bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const { + if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) + return false; + + if (!isTypeLegal(EVT::getEVT(Ty1))) + return false; + + assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop"); + + // Assuming the caller doesn't have a zeroext or signext return parameter, + // truncation all the way down to i1 is valid. + return true; +} + bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const { return isInt<32>(Imm); } @@ -12930,6 +14005,27 @@ bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const { return false; } +bool +X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const { + if (!(Subtarget->hasFMA() || Subtarget->hasFMA4())) + return false; + + VT = VT.getScalarType(); + + if (!VT.isSimple()) + return false; + + switch (VT.getSimpleVT().SimpleTy) { + case MVT::f32: + case MVT::f64: + return true; + default: + break; + } + + return false; +} + bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const { // i16 instructions are longer (0x66 prefix) and potentially slower. return !(VT1 == MVT::i32 && VT2 == MVT::i16); @@ -12942,37 +14038,46 @@ bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const { bool X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M, EVT VT) const { + if (!VT.isSimple()) + return false; + + MVT SVT = VT.getSimpleVT(); + // Very little shuffling can be done for 64-bit vectors right now. if (VT.getSizeInBits() == 64) return false; // FIXME: pshufb, blends, shifts. 
- return (VT.getVectorNumElements() == 2 || + return (SVT.getVectorNumElements() == 2 || ShuffleVectorSDNode::isSplatMask(&M[0], VT) || - isMOVLMask(M, VT) || - isSHUFPMask(M, VT, Subtarget->hasFp256()) || - isPSHUFDMask(M, VT) || - isPSHUFHWMask(M, VT, Subtarget->hasInt256()) || - isPSHUFLWMask(M, VT, Subtarget->hasInt256()) || - isPALIGNRMask(M, VT, Subtarget) || - isUNPCKLMask(M, VT, Subtarget->hasInt256()) || - isUNPCKHMask(M, VT, Subtarget->hasInt256()) || - isUNPCKL_v_undef_Mask(M, VT, Subtarget->hasInt256()) || - isUNPCKH_v_undef_Mask(M, VT, Subtarget->hasInt256())); + isMOVLMask(M, SVT) || + isSHUFPMask(M, SVT) || + isPSHUFDMask(M, SVT) || + isPSHUFHWMask(M, SVT, Subtarget->hasInt256()) || + isPSHUFLWMask(M, SVT, Subtarget->hasInt256()) || + isPALIGNRMask(M, SVT, Subtarget) || + isUNPCKLMask(M, SVT, Subtarget->hasInt256()) || + isUNPCKHMask(M, SVT, Subtarget->hasInt256()) || + isUNPCKL_v_undef_Mask(M, SVT, Subtarget->hasInt256()) || + isUNPCKH_v_undef_Mask(M, SVT, Subtarget->hasInt256())); } bool X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask, EVT VT) const { - unsigned NumElts = VT.getVectorNumElements(); + if (!VT.isSimple()) + return false; + + MVT SVT = VT.getSimpleVT(); + unsigned NumElts = SVT.getVectorNumElements(); // FIXME: This collection of masks seems suspect. if (NumElts == 2) return true; - if (NumElts == 4 && VT.is128BitVector()) { - return (isMOVLMask(Mask, VT) || - isCommutedMOVLMask(Mask, VT, true) || - isSHUFPMask(Mask, VT, Subtarget->hasFp256()) || - isSHUFPMask(Mask, VT, Subtarget->hasFp256(), /* Commuted */ true)); + if (NumElts == 4 && SVT.is128BitVector()) { + return (isMOVLMask(Mask, SVT) || + isCommutedMOVLMask(Mask, SVT, true) || + isSHUFPMask(Mask, SVT) || + isSHUFPMask(Mask, SVT, /* Commuted */ true)); } return false; } @@ -14396,12 +15501,11 @@ X86TargetLowering::EmitLoweredWinAlloca(MachineInstr *MI, } else { // __chkstk(MSVCRT): does not update stack pointer. // Clobbers R10, R11 and EFLAGS. - // FIXME: RAX(allocated size) might be reused and not killed. BuildMI(*BB, MI, DL, TII->get(X86::W64ALLOCA)) .addExternalSymbol("__chkstk") .addReg(X86::RAX, RegState::Implicit) .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit); - // RAX has the offset to subtracted from RSP. + // RAX has the offset to be subtracted from RSP. BuildMI(*BB, MI, DL, TII->get(X86::SUB64rr), X86::RSP) .addReg(X86::RSP) .addReg(X86::RAX); @@ -14593,6 +15697,9 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI, // Setup MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup)) .addMBB(restoreMBB); + + const X86RegisterInfo *RegInfo = + static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo()); MIB.addRegMask(RegInfo->getNoPreservedMask()); thisMBB->addSuccessor(mainMBB); thisMBB->addSuccessor(restoreMBB); @@ -14638,6 +15745,8 @@ X86TargetLowering::emitEHSjLjLongJmp(MachineInstr *MI, (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass; unsigned Tmp = MRI.createVirtualRegister(RC); // Since FP is only updated here but NOT referenced, it's treated as GPR. + const X86RegisterInfo *RegInfo = + static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo()); unsigned FP = (PVT == MVT::i64) ? 
X86::RBP : X86::EBP; unsigned SP = RegInfo->getStackRegister(); @@ -14710,6 +15819,9 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, case X86::CMOV_V8F32: case X86::CMOV_V4F64: case X86::CMOV_V4I64: + case X86::CMOV_V16F32: + case X86::CMOV_V8F64: + case X86::CMOV_V8I64: case X86::CMOV_GR16: case X86::CMOV_GR32: case X86::CMOV_RFP32: @@ -15038,7 +16150,7 @@ static bool isShuffleLow128VectorInsertHigh(ShuffleVectorSDNode *SVOp) { static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget* Subtarget) { - DebugLoc dl = N->getDebugLoc(); + SDLoc dl(N); ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); SDValue V1 = SVOp->getOperand(0); SDValue V2 = SVOp->getOperand(1); @@ -15134,7 +16246,7 @@ static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG, static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget *Subtarget) { - DebugLoc dl = N->getDebugLoc(); + SDLoc dl(N); EVT VT = N->getValueType(0); // Don't create instructions with illegal types after legalize types has run. @@ -15158,7 +16270,7 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG, for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) Elts.push_back(getShuffleScalarElt(N, i, DAG, 0)); - return EltsFromConsecutiveLoads(VT, Elts, dl, DAG); + return EltsFromConsecutiveLoads(VT, Elts, dl, DAG, true); } /// PerformTruncateCombine - Converts truncate operation to @@ -15253,7 +16365,7 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG, // All checks match so transform back to vector_shuffle so that DAG combiner // can finish the job - DebugLoc dl = N->getDebugLoc(); + SDLoc dl(N); // Create shuffle node taking into account the case that its a unary shuffle SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(VT) : InVec.getOperand(1); @@ -15265,6 +16377,44 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG, EltNo); } +/// Extract one bit from mask vector, like v16i1 or v8i1. +/// AVX-512 feature. +static SDValue ExtractBitFromMaskVector(SDNode *N, SelectionDAG &DAG) { + SDValue Vec = N->getOperand(0); + SDLoc dl(Vec); + MVT VecVT = Vec.getSimpleValueType(); + SDValue Idx = N->getOperand(1); + MVT EltVT = N->getSimpleValueType(0); + + assert((VecVT.getVectorElementType() == MVT::i1 && EltVT == MVT::i8) || + "Unexpected operands in ExtractBitFromMaskVector"); + + // variable index + if (!isa<ConstantSDNode>(Idx)) { + MVT ExtVT = (VecVT == MVT::v8i1 ? 
MVT::v8i64 : MVT::v16i32); + SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Vec); + SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, + ExtVT.getVectorElementType(), Ext); + return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt); + } + + unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); + + MVT ScalarVT = MVT::getIntegerVT(VecVT.getSizeInBits()); + unsigned MaxShift = VecVT.getSizeInBits() - 1; + Vec = DAG.getNode(ISD::BITCAST, dl, ScalarVT, Vec); + Vec = DAG.getNode(ISD::SHL, dl, ScalarVT, Vec, + DAG.getConstant(MaxShift - IdxVal, ScalarVT)); + Vec = DAG.getNode(ISD::SRL, dl, ScalarVT, Vec, + DAG.getConstant(MaxShift, ScalarVT)); + + if (VecVT == MVT::v16i1) { + Vec = DAG.getNode(ISD::BITCAST, dl, MVT::i16, Vec); + return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Vec); + } + return DAG.getNode(ISD::BITCAST, dl, MVT::i8, Vec); +} + /// PerformEXTRACT_VECTOR_ELTCombine - Detect vector gather/scatter index /// generation and convert it from being a bunch of shuffles and extracts /// to a simple store and scalar loads to extract the elements. @@ -15275,12 +16425,17 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG, return NewOp; SDValue InputVector = N->getOperand(0); + + if (InputVector.getValueType().getVectorElementType() == MVT::i1 && + !DCI.isBeforeLegalize()) + return ExtractBitFromMaskVector(N, DAG); + // Detect whether we are trying to convert from mmx to i32 and the bitcast // from mmx to v2i32 has a single usage. if (InputVector.getNode()->getOpcode() == llvm::ISD::BITCAST && InputVector.getNode()->getOperand(0).getValueType() == MVT::x86mmx && InputVector.hasOneUse() && N->getValueType(0) == MVT::i32) - return DAG.getNode(X86ISD::MMX_MOVD2W, InputVector.getDebugLoc(), + return DAG.getNode(X86ISD::MMX_MOVD2W, SDLoc(InputVector), N->getValueType(0), InputVector.getNode()->getOperand(0)); @@ -15325,7 +16480,7 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); // Ok, we've now decided to do the transformation. - DebugLoc dl = InputVector.getDebugLoc(); + SDLoc dl(InputVector); // Store the value to a temporary stack slot. SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType()); @@ -15362,24 +16517,28 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG, } /// \brief Matches a VSELECT onto min/max or return 0 if the node doesn't match. -static unsigned matchIntegerMINMAX(SDValue Cond, EVT VT, SDValue LHS, - SDValue RHS, SelectionDAG &DAG, - const X86Subtarget *Subtarget) { +static std::pair<unsigned, bool> +matchIntegerMINMAX(SDValue Cond, EVT VT, SDValue LHS, SDValue RHS, + SelectionDAG &DAG, const X86Subtarget *Subtarget) { if (!VT.isVector()) - return 0; + return std::make_pair(0, false); + bool NeedSplit = false; switch (VT.getSimpleVT().SimpleTy) { - default: return 0; + default: return std::make_pair(0, false); case MVT::v32i8: case MVT::v16i16: case MVT::v8i32: if (!Subtarget->hasAVX2()) - return 0; + NeedSplit = true; + if (!Subtarget->hasAVX()) + return std::make_pair(0, false); + break; case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: if (!Subtarget->hasSSE2()) - return 0; + return std::make_pair(0, false); } // SSE2 has only a small subset of the operations. @@ -15390,6 +16549,7 @@ static unsigned matchIntegerMINMAX(SDValue Cond, EVT VT, SDValue LHS, ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); + unsigned Opc = 0; // Check for x CC y ? x : y. 
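Stepping back to the ExtractBitFromMaskVector helper introduced above: after bitcasting the k-mask to a scalar integer it isolates bit IdxVal with a shift pair, left by MaxShift - IdxVal and then logically right by MaxShift. The same arithmetic on an ordinary 16-bit mask (hypothetical helper, shown only to make the index math concrete):

#include <cassert>
#include <cstdint>

// Move bit `idx` up to the top bit, then logical-shift it down to bit 0.
unsigned extract_mask_bit(uint16_t mask, unsigned idx) {
  const unsigned max_shift = 15;                        // size-in-bits - 1
  uint16_t top = (uint16_t)(mask << (max_shift - idx)); // SHL
  return top >> max_shift;                              // SRL, yields 0 or 1
}

int main() {
  uint16_t m = 0xA006;
  for (unsigned i = 0; i < 16; ++i)
    assert(extract_mask_bit(m, i) == ((m >> i) & 1u));
  return 0;
}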
if (DAG.isEqualTo(LHS, Cond.getOperand(0)) && DAG.isEqualTo(RHS, Cond.getOperand(1))) { @@ -15397,16 +16557,16 @@ static unsigned matchIntegerMINMAX(SDValue Cond, EVT VT, SDValue LHS, default: break; case ISD::SETULT: case ISD::SETULE: - return hasUnsigned ? X86ISD::UMIN : 0; + Opc = hasUnsigned ? X86ISD::UMIN : 0; break; case ISD::SETUGT: case ISD::SETUGE: - return hasUnsigned ? X86ISD::UMAX : 0; + Opc = hasUnsigned ? X86ISD::UMAX : 0; break; case ISD::SETLT: case ISD::SETLE: - return hasSigned ? X86ISD::SMIN : 0; + Opc = hasSigned ? X86ISD::SMIN : 0; break; case ISD::SETGT: case ISD::SETGE: - return hasSigned ? X86ISD::SMAX : 0; + Opc = hasSigned ? X86ISD::SMAX : 0; break; } // Check for x CC y ? y : x -- a min/max with reversed arms. } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) && @@ -15415,20 +16575,20 @@ static unsigned matchIntegerMINMAX(SDValue Cond, EVT VT, SDValue LHS, default: break; case ISD::SETULT: case ISD::SETULE: - return hasUnsigned ? X86ISD::UMAX : 0; + Opc = hasUnsigned ? X86ISD::UMAX : 0; break; case ISD::SETUGT: case ISD::SETUGE: - return hasUnsigned ? X86ISD::UMIN : 0; + Opc = hasUnsigned ? X86ISD::UMIN : 0; break; case ISD::SETLT: case ISD::SETLE: - return hasSigned ? X86ISD::SMAX : 0; + Opc = hasSigned ? X86ISD::SMAX : 0; break; case ISD::SETGT: case ISD::SETGE: - return hasSigned ? X86ISD::SMIN : 0; + Opc = hasSigned ? X86ISD::SMIN : 0; break; } } - return 0; + return std::make_pair(Opc, NeedSplit); } /// PerformSELECTCombine - Do target-specific dag combines on SELECT and VSELECT @@ -15436,19 +16596,20 @@ static unsigned matchIntegerMINMAX(SDValue Cond, EVT VT, SDValue LHS, static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget *Subtarget) { - DebugLoc DL = N->getDebugLoc(); + SDLoc DL(N); SDValue Cond = N->getOperand(0); // Get the LHS/RHS of the select. SDValue LHS = N->getOperand(1); SDValue RHS = N->getOperand(2); EVT VT = LHS.getValueType(); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); // If we have SSE[12] support, try to form min/max nodes. SSE min/max // instructions match the semantics of the common C idiom x<y?x:y but not // x<=y?x:y, because of how they handle negative zero (which can be // ignored in unsafe-math mode). if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() && - VT != MVT::f80 && DAG.getTargetLoweringInfo().isTypeLegal(VT) && + VT != MVT::f80 && TLI.isTypeLegal(VT) && (Subtarget->hasSSE2() || (Subtarget->hasSSE1() && VT.getScalarType() == MVT::f32))) { ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); @@ -15587,6 +16748,22 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS); } + EVT CondVT = Cond.getValueType(); + if (Subtarget->hasAVX512() && VT.isVector() && CondVT.isVector() && + CondVT.getVectorElementType() == MVT::i1) { + // v16i8 (select v16i1, v16i8, v16i8) does not have a proper + // lowering on AVX-512. In this case we convert it to + // v16i8 (select v16i8, v16i8, v16i8) and use AVX instruction. 
+ // The same situation for all 128 and 256-bit vectors of i8 and i16 + EVT OpVT = LHS.getValueType(); + if ((OpVT.is128BitVector() || OpVT.is256BitVector()) && + (OpVT.getVectorElementType() == MVT::i8 || + OpVT.getVectorElementType() == MVT::i16)) { + Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, OpVT, Cond); + DCI.AddToWorklist(Cond.getNode()); + return DAG.getNode(N->getOpcode(), DL, OpVT, Cond, LHS, RHS); + } + } // If this is a select between two integer constants, try to do some // optimizations. if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(LHS)) { @@ -15704,16 +16881,19 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, case ISD::SETLT: case ISD::SETGT: { ISD::CondCode NewCC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGE; - Cond = DAG.getSetCC(Cond.getDebugLoc(), Cond.getValueType(), + Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(), Cond.getOperand(0), Cond.getOperand(1), NewCC); return DAG.getNode(ISD::SELECT, DL, VT, Cond, LHS, RHS); } } } + // Early exit check + if (!TLI.isTypeLegal(VT)) + return SDValue(); + // Match VSELECTs into subs with unsigned saturation. - if (!DCI.isBeforeLegalize() && - N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC && + if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC && // psubus is available in SSE2 and AVX2 for i8 and i16 vectors. ((Subtarget->hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) || (Subtarget->hasAVX2() && (VT == MVT::v32i8 || VT == MVT::v16i16)))) { @@ -15767,14 +16947,35 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, } // Try to match a min/max vector operation. - if (!DCI.isBeforeLegalize() && - N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC) - if (unsigned Op = matchIntegerMINMAX(Cond, VT, LHS, RHS, DAG, Subtarget)) - return DAG.getNode(Op, DL, N->getValueType(0), LHS, RHS); + if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC) { + std::pair<unsigned, bool> ret = matchIntegerMINMAX(Cond, VT, LHS, RHS, DAG, Subtarget); + unsigned Opc = ret.first; + bool NeedSplit = ret.second; + + if (Opc && NeedSplit) { + unsigned NumElems = VT.getVectorNumElements(); + // Extract the LHS vectors + SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, DL); + SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, DL); + + // Extract the RHS vectors + SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, DL); + SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, DL); + + // Create min/max for each subvector + LHS = DAG.getNode(Opc, DL, LHS1.getValueType(), LHS1, RHS1); + RHS = DAG.getNode(Opc, DL, LHS2.getValueType(), LHS2, RHS2); + + // Merge the result + return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LHS, RHS); + } else if (Opc) + return DAG.getNode(Opc, DL, VT, LHS, RHS); + } // Simplify vector selection if the selector will be produced by CMPP*/PCMP*. - if (!DCI.isBeforeLegalize() && N->getOpcode() == ISD::VSELECT && - Cond.getOpcode() == ISD::SETCC) { + if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC && + // Check if SETCC has already been promoted + TLI.getSetCCResultType(*DAG.getContext(), VT) == Cond.getValueType()) { assert(Cond.getValueType().isVector() && "vector select expects a vector selector!"); @@ -15821,7 +17022,6 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, // matched by one of the SSE/AVX BLEND instructions. These instructions only // depend on the highest bit in each word. Try to use SimplifyDemandedBits // to simplify previous instructions. 
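The SimplifyDemandedBits step described in that comment leans on the SSE4.1/AVX variable blends selecting purely on the sign bit of each mask element, which is why only the highest bit of the condition needs to be correct. A byte-wise model of PBLENDVB's selection rule (a mask byte with its top bit set takes the second source):

#include <cassert>
#include <cstdint>

// One lane of PBLENDVB: only bit 7 of the mask byte is consulted.
uint8_t pblendvb_lane(uint8_t a, uint8_t b, uint8_t mask) {
  return (mask & 0x80) ? b : a;
}

int main() {
  assert(pblendvb_lane(1, 2, 0x00) == 1);
  assert(pblendvb_lane(1, 2, 0x7F) == 1);   // low bits of the mask are ignored
  assert(pblendvb_lane(1, 2, 0x80) == 2);
  assert(pblendvb_lane(1, 2, 0xFF) == 2);
  return 0;
}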
- const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() && !DCI.isBeforeLegalize() && TLI.isOperationLegal(ISD::VSELECT, VT)) { unsigned BitWidth = Cond.getValueType().getScalarType().getSizeInBits(); @@ -15830,6 +17030,15 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, if (BitWidth == 1) return SDValue(); + // Check all uses of that condition operand to check whether it will be + // consumed by non-BLEND instructions, which may depend on all bits are set + // properly. + for (SDNode::use_iterator I = Cond->use_begin(), + E = Cond->use_end(); I != E; ++I) + if (I->getOpcode() != ISD::VSELECT) + // TODO: Add other opcodes eventually lowered into BLEND. + return SDValue(); + assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size"); APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 1); @@ -15980,7 +17189,7 @@ static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) { static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget *Subtarget) { - DebugLoc DL = N->getDebugLoc(); + SDLoc DL(N); // If the flag operand isn't dead, don't touch this CMOV. if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty()) @@ -16183,7 +17392,7 @@ static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG, } if (MulAmt2 && (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){ - DebugLoc DL = N->getDebugLoc(); + SDLoc DL(N); if (isPowerOf2_64(MulAmt2) && !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD)) @@ -16233,7 +17442,7 @@ static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) { APInt ShAmt = N1C->getAPIntValue(); Mask = Mask.shl(ShAmt); if (Mask != 0) - return DAG.getNode(ISD::AND, N->getDebugLoc(), VT, + return DAG.getNode(ISD::AND, SDLoc(N), VT, N00, DAG.getConstant(Mask, VT)); } } @@ -16249,7 +17458,39 @@ static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) { // hardware support for this operation. This is better expressed as an ADD // of two values. if (N1C && (1 == N1C->getZExtValue())) { - return DAG.getNode(ISD::ADD, N->getDebugLoc(), VT, N0, N0); + return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0); + } + } + + return SDValue(); +} + +/// \brief Returns a vector of 0s if the node in input is a vector logical +/// shift by a constant amount which is known to be bigger than or equal +/// to the vector element size in bits. +static SDValue performShiftToAllZeros(SDNode *N, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + EVT VT = N->getValueType(0); + + if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16 && + (!Subtarget->hasInt256() || + (VT != MVT::v4i64 && VT != MVT::v8i32 && VT != MVT::v16i16))) + return SDValue(); + + SDValue Amt = N->getOperand(1); + SDLoc DL(N); + if (isSplatVector(Amt.getNode())) { + SDValue SclrAmt = Amt->getOperand(0); + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(SclrAmt)) { + APInt ShiftAmt = C->getAPIntValue(); + unsigned MaxAmount = VT.getVectorElementType().getSizeInBits(); + + // SSE2/AVX2 logical shifts always return a vector of 0s + // if the shift amount is bigger than or equal to + // the element size. The constant shift amount will be + // encoded as a 8-bit immediate. 
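The check that follows relies on exactly the hardware behaviour spelled out in that comment: the SSE2/AVX2 logical shifts clamp to all-zeroes once the count reaches the element width, whereas C's shift operators leave that case undefined. A scalar model of a single PSRLW lane, for contrast:

#include <cassert>
#include <cstdint>

// One 16-bit lane of PSRLW: counts >= 16 produce 0 rather than being undefined.
uint16_t psrlw_lane(uint16_t v, unsigned count) {
  return count >= 16 ? 0 : (uint16_t)(v >> count);
}

int main() {
  assert(psrlw_lane(0xFFFF, 15) == 1);
  assert(psrlw_lane(0xFFFF, 16) == 0);   // saturates to zero
  assert(psrlw_lane(0xFFFF, 200) == 0);
  return 0;
}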
+ if (ShiftAmt.trunc(8).uge(MaxAmount)) + return getZeroVector(VT, Subtarget, DAG, DL); } } @@ -16265,6 +17506,12 @@ static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG, if (V.getNode()) return V; } + if (N->getOpcode() != ISD::SRA) { + // Try to fold this logical shift into a zero vector. + SDValue V = performShiftToAllZeros(N, DAG, Subtarget); + if (V.getNode()) return V; + } + return SDValue(); } @@ -16283,7 +17530,7 @@ static SDValue CMPEQCombine(SDNode *N, SelectionDAG &DAG, SDValue N1 = N->getOperand(1); SDValue CMP0 = N0->getOperand(1); SDValue CMP1 = N1->getOperand(1); - DebugLoc DL = N->getDebugLoc(); + SDLoc DL(N); // The SETCCs should both refer to the same CMP. if (CMP0.getOpcode() != X86ISD::CMP || CMP0 != CMP1) @@ -16402,7 +17649,7 @@ static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG, SDValue N0 = Narrow->getOperand(0); SDValue N1 = Narrow->getOperand(1); - DebugLoc DL = Narrow->getDebugLoc(); + SDLoc DL(Narrow); // The Left side has to be a trunc. if (N0.getOpcode() != ISD::TRUNCATE) @@ -16468,33 +17715,80 @@ static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG, if (R.getNode()) return R; - // Create BLSI, and BLSR instructions + // Create BLSI, BLSR, and BZHI instructions // BLSI is X & (-X) // BLSR is X & (X-1) - if (Subtarget->hasBMI() && (VT == MVT::i32 || VT == MVT::i64)) { + // BZHI is X & ((1 << Y) - 1) + // BEXTR is ((X >> imm) & (2**size-1)) + if (VT == MVT::i32 || VT == MVT::i64) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); - DebugLoc DL = N->getDebugLoc(); - - // Check LHS for neg - if (N0.getOpcode() == ISD::SUB && N0.getOperand(1) == N1 && - isZero(N0.getOperand(0))) - return DAG.getNode(X86ISD::BLSI, DL, VT, N1); - - // Check RHS for neg - if (N1.getOpcode() == ISD::SUB && N1.getOperand(1) == N0 && - isZero(N1.getOperand(0))) - return DAG.getNode(X86ISD::BLSI, DL, VT, N0); + SDLoc DL(N); + + if (Subtarget->hasBMI()) { + // Check LHS for neg + if (N0.getOpcode() == ISD::SUB && N0.getOperand(1) == N1 && + isZero(N0.getOperand(0))) + return DAG.getNode(X86ISD::BLSI, DL, VT, N1); + + // Check RHS for neg + if (N1.getOpcode() == ISD::SUB && N1.getOperand(1) == N0 && + isZero(N1.getOperand(0))) + return DAG.getNode(X86ISD::BLSI, DL, VT, N0); + + // Check LHS for X-1 + if (N0.getOpcode() == ISD::ADD && N0.getOperand(0) == N1 && + isAllOnes(N0.getOperand(1))) + return DAG.getNode(X86ISD::BLSR, DL, VT, N1); + + // Check RHS for X-1 + if (N1.getOpcode() == ISD::ADD && N1.getOperand(0) == N0 && + isAllOnes(N1.getOperand(1))) + return DAG.getNode(X86ISD::BLSR, DL, VT, N0); + } + + if (Subtarget->hasBMI2()) { + // Check for (and (add (shl 1, Y), -1), X) + if (N0.getOpcode() == ISD::ADD && isAllOnes(N0.getOperand(1))) { + SDValue N00 = N0.getOperand(0); + if (N00.getOpcode() == ISD::SHL) { + SDValue N001 = N00.getOperand(1); + assert(N001.getValueType() == MVT::i8 && "unexpected type"); + ConstantSDNode *C = dyn_cast<ConstantSDNode>(N00.getOperand(0)); + if (C && C->getZExtValue() == 1) + return DAG.getNode(X86ISD::BZHI, DL, VT, N1, N001); + } + } - // Check LHS for X-1 - if (N0.getOpcode() == ISD::ADD && N0.getOperand(0) == N1 && - isAllOnes(N0.getOperand(1))) - return DAG.getNode(X86ISD::BLSR, DL, VT, N1); + // Check for (and X, (add (shl 1, Y), -1)) + if (N1.getOpcode() == ISD::ADD && isAllOnes(N1.getOperand(1))) { + SDValue N10 = N1.getOperand(0); + if (N10.getOpcode() == ISD::SHL) { + SDValue N101 = N10.getOperand(1); + assert(N101.getValueType() == MVT::i8 && "unexpected type"); + ConstantSDNode *C = 
dyn_cast<ConstantSDNode>(N10.getOperand(0)); + if (C && C->getZExtValue() == 1) + return DAG.getNode(X86ISD::BZHI, DL, VT, N0, N101); + } + } + } - // Check RHS for X-1 - if (N1.getOpcode() == ISD::ADD && N1.getOperand(0) == N0 && - isAllOnes(N1.getOperand(1))) - return DAG.getNode(X86ISD::BLSR, DL, VT, N0); + // Check for BEXTR. + if ((Subtarget->hasBMI() || Subtarget->hasTBM()) && + (N0.getOpcode() == ISD::SRA || N0.getOpcode() == ISD::SRL)) { + ConstantSDNode *MaskNode = dyn_cast<ConstantSDNode>(N1); + ConstantSDNode *ShiftNode = dyn_cast<ConstantSDNode>(N0.getOperand(1)); + if (MaskNode && ShiftNode) { + uint64_t Mask = MaskNode->getZExtValue(); + uint64_t Shift = ShiftNode->getZExtValue(); + if (isMask_64(Mask)) { + uint64_t MaskSize = CountPopulation_64(Mask); + if (Shift + MaskSize <= VT.getSizeInBits()) + return DAG.getNode(X86ISD::BEXTR, DL, VT, N0.getOperand(0), + DAG.getConstant(Shift | (MaskSize << 8), VT)); + } + } + } // BEXTR return SDValue(); } @@ -16508,7 +17802,7 @@ static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG, SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); - DebugLoc DL = N->getDebugLoc(); + SDLoc DL(N); // Check LHS for vnot if (N0.getOpcode() == ISD::XOR && @@ -16592,7 +17886,7 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG, if ((SraAmt + 1) != EltBits) return SDValue(); - DebugLoc DL = N->getDebugLoc(); + SDLoc DL(N); // Now we know we at least have a plendvb with the mask val. See if // we can form a psignb/w/d. @@ -16641,7 +17935,7 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG, if (ShAmt1.getOpcode() == ISD::TRUNCATE) ShAmt1 = ShAmt1.getOperand(0); - DebugLoc DL = N->getDebugLoc(); + SDLoc DL(N); unsigned Opc = X86ISD::SHLD; SDValue Op0 = N0.getOperand(0); SDValue Op1 = N1.getOperand(0); @@ -16688,7 +17982,7 @@ static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); - DebugLoc DL = N->getDebugLoc(); + SDLoc DL(N); // Check pattern of XOR(ADD(X,Y), Y) where Y is SRA(X, size(X)-1) // and change it to SUB and CMOV. @@ -16738,7 +18032,7 @@ static SDValue PerformXorCombine(SDNode *N, SelectionDAG &DAG, // Create BLSMSK instructions by finding X ^ (X-1) SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); - DebugLoc DL = N->getDebugLoc(); + SDLoc DL(N); if (N0.getOpcode() == ISD::ADD && N0.getOperand(0) == N1 && isAllOnes(N0.getOperand(1))) @@ -16758,7 +18052,7 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG, LoadSDNode *Ld = cast<LoadSDNode>(N); EVT RegVT = Ld->getValueType(0); EVT MemVT = Ld->getMemoryVT(); - DebugLoc dl = Ld->getDebugLoc(); + SDLoc dl(Ld); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); unsigned RegSz = RegVT.getSizeInBits(); @@ -16953,7 +18247,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, StoreSDNode *St = cast<StoreSDNode>(N); EVT VT = St->getValue().getValueType(); EVT StVT = St->getMemoryVT(); - DebugLoc dl = St->getDebugLoc(); + SDLoc dl(St); SDValue StoredVal = St->getOperand(1); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); @@ -17116,8 +18410,8 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0)) return SDValue(); - DebugLoc LdDL = Ld->getDebugLoc(); - DebugLoc StDL = N->getDebugLoc(); + SDLoc LdDL(Ld); + SDLoc StDL(N); // If we are a 64-bit capable x86, lower to a single movq load/store pair. 
// Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store // pair instead. @@ -17210,7 +18504,7 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) { RHS.getOpcode() != ISD::VECTOR_SHUFFLE) return false; - EVT VT = LHS.getValueType(); + MVT VT = LHS.getSimpleValueType(); assert((VT.is128BitVector() || VT.is256BitVector()) && "Unsupported vector type for horizontal add/sub"); @@ -17281,23 +18575,24 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) { // LHS = VECTOR_SHUFFLE A, B, LMask // RHS = VECTOR_SHUFFLE A, B, RMask // Check that the masks correspond to performing a horizontal operation. - for (unsigned i = 0; i != NumElts; ++i) { - int LIdx = LMask[i], RIdx = RMask[i]; - - // Ignore any UNDEF components. - if (LIdx < 0 || RIdx < 0 || - (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) || - (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts))) - continue; + for (unsigned l = 0; l != NumElts; l += NumLaneElts) { + for (unsigned i = 0; i != NumLaneElts; ++i) { + int LIdx = LMask[i+l], RIdx = RMask[i+l]; + + // Ignore any UNDEF components. + if (LIdx < 0 || RIdx < 0 || + (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) || + (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts))) + continue; - // Check that successive elements are being operated on. If not, this is - // not a horizontal operation. - unsigned Src = (i/HalfLaneElts) % 2; // each lane is split between srcs - unsigned LaneStart = (i/NumLaneElts) * NumLaneElts; - int Index = 2*(i%HalfLaneElts) + NumElts*Src + LaneStart; - if (!(LIdx == Index && RIdx == Index + 1) && - !(IsCommutative && LIdx == Index + 1 && RIdx == Index)) - return false; + // Check that successive elements are being operated on. If not, this is + // not a horizontal operation. + unsigned Src = (i/HalfLaneElts); // each lane is split between srcs + int Index = 2*(i%HalfLaneElts) + NumElts*Src + l; + if (!(LIdx == Index && RIdx == Index + 1) && + !(IsCommutative && LIdx == Index + 1 && RIdx == Index)) + return false; + } } LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it. 
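For readers following the reworked per-lane mask check above, here is a small standalone sketch (illustrative only, not part of the patch) that evaluates the same Src/Index formula for a v8f32 horizontal operation and prints the (LIdx, RIdx) pairs the combine expects in each 128-bit lane:

#include <cstdio>

// Mirrors the check in isHorizontalBinOp: within each lane, element i must
// read Index = 2*(i % HalfLaneElts) + NumElts*Src + l from the two sources,
// with LIdx == Index and RIdx == Index + 1 (or swapped when commutative).
int main() {
  const unsigned NumElts = 8;              // v8f32
  const unsigned NumLaneElts = 4;          // f32 elements per 128-bit lane
  const unsigned HalfLaneElts = NumLaneElts / 2;

  for (unsigned l = 0; l != NumElts; l += NumLaneElts)
    for (unsigned i = 0; i != NumLaneElts; ++i) {
      unsigned Src = i / HalfLaneElts;     // each lane is split between srcs
      unsigned Index = 2 * (i % HalfLaneElts) + NumElts * Src + l;
      std::printf("lane %u elt %u: LIdx=%u RIdx=%u\n",
                  l / NumLaneElts, i, Index, Index + 1);
    }
  return 0;
}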
@@ -17316,7 +18611,7 @@ static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG, if (((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) || (Subtarget->hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) && isHorizontalBinOp(LHS, RHS, true)) - return DAG.getNode(X86ISD::FHADD, N->getDebugLoc(), VT, LHS, RHS); + return DAG.getNode(X86ISD::FHADD, SDLoc(N), VT, LHS, RHS); return SDValue(); } @@ -17331,7 +18626,7 @@ static SDValue PerformFSUBCombine(SDNode *N, SelectionDAG &DAG, if (((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) || (Subtarget->hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) && isHorizontalBinOp(LHS, RHS, false)) - return DAG.getNode(X86ISD::FHSUB, N->getDebugLoc(), VT, LHS, RHS); + return DAG.getNode(X86ISD::FHSUB, SDLoc(N), VT, LHS, RHS); return SDValue(); } @@ -17368,7 +18663,7 @@ static SDValue PerformFMinFMaxCombine(SDNode *N, SelectionDAG &DAG) { case X86ISD::FMAX: NewOp = X86ISD::FMAXC; break; } - return DAG.getNode(NewOp, N->getDebugLoc(), N->getValueType(0), + return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0), N->getOperand(0), N->getOperand(1)); } @@ -17385,6 +18680,19 @@ static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) { return SDValue(); } +/// PerformFANDNCombine - Do target-specific dag combines on X86ISD::FANDN nodes +static SDValue PerformFANDNCombine(SDNode *N, SelectionDAG &DAG) { + // FANDN(x, 0.0) -> 0.0 + // FANDN(0.0, x) -> x + if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) + if (C->getValueAPF().isPosZero()) + return N->getOperand(1); + if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1))) + if (C->getValueAPF().isPosZero()) + return N->getOperand(1); + return SDValue(); +} + static SDValue PerformBTCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI) { @@ -17412,12 +18720,12 @@ static SDValue PerformVZEXT_MOVLCombine(SDNode *N, SelectionDAG &DAG) { if (Op.getOpcode() == X86ISD::VZEXT_LOAD && VT.getVectorElementType().getSizeInBits() == OpVT.getVectorElementType().getSizeInBits()) { - return DAG.getNode(ISD::BITCAST, N->getDebugLoc(), VT, Op); + return DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op); } return SDValue(); } -static SDValue PerformSIGN_EXTEND_INREGCombine(SDNode *N, SelectionDAG &DAG, +static SDValue PerformSIGN_EXTEND_INREGCombine(SDNode *N, SelectionDAG &DAG, const X86Subtarget *Subtarget) { EVT VT = N->getValueType(0); if (!VT.isVector()) @@ -17426,7 +18734,7 @@ static SDValue PerformSIGN_EXTEND_INREGCombine(SDNode *N, SelectionDAG &DAG, SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT ExtraVT = cast<VTSDNode>(N1)->getVT(); - DebugLoc dl = N->getDebugLoc(); + SDLoc dl(N); // The SIGN_EXTEND_INREG to v4i64 is expensive operation on the // both SSE and AVX2 since there is no sign-extended shift right @@ -17437,14 +18745,14 @@ static SDValue PerformSIGN_EXTEND_INREGCombine(SDNode *N, SelectionDAG &DAG, N0.getOpcode() == ISD::SIGN_EXTEND)) { SDValue N00 = N0.getOperand(0); - // EXTLOAD has a better solution on AVX2, + // EXTLOAD has a better solution on AVX2, // it may be replaced with X86ISD::VSEXT node. 
if (N00.getOpcode() == ISD::LOAD && Subtarget->hasInt256()) if (!ISD::isNormalLoad(N00.getNode())) return SDValue(); if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) { - SDValue Tmp = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32, + SDValue Tmp = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32, N00, N1); return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp); } @@ -17473,7 +18781,7 @@ static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG, static SDValue PerformFMACombine(SDNode *N, SelectionDAG &DAG, const X86Subtarget* Subtarget) { - DebugLoc dl = N->getDebugLoc(); + SDLoc dl(N); EVT VT = N->getValueType(0); // Let legalize expand this if it isn't a legal type yet. @@ -17518,7 +18826,7 @@ static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG, // (and (i32 x86isd::setcc_carry), 1) // This eliminates the zext. This transformation is necessary because // ISD::SETCC is always legalized to i8. - DebugLoc dl = N->getDebugLoc(); + SDLoc dl(N); SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); @@ -17556,17 +18864,17 @@ static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG) { if ((CC == ISD::SETNE || CC == ISD::SETEQ) && LHS.getOpcode() == ISD::SUB) if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(LHS.getOperand(0))) if (C->getAPIntValue() == 0 && LHS.hasOneUse()) { - SDValue addV = DAG.getNode(ISD::ADD, N->getDebugLoc(), + SDValue addV = DAG.getNode(ISD::ADD, SDLoc(N), LHS.getValueType(), RHS, LHS.getOperand(1)); - return DAG.getSetCC(N->getDebugLoc(), N->getValueType(0), + return DAG.getSetCC(SDLoc(N), N->getValueType(0), addV, DAG.getConstant(0, addV.getValueType()), CC); } if ((CC == ISD::SETNE || CC == ISD::SETEQ) && RHS.getOpcode() == ISD::SUB) if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS.getOperand(0))) if (C->getAPIntValue() == 0 && RHS.hasOneUse()) { - SDValue addV = DAG.getNode(ISD::ADD, N->getDebugLoc(), + SDValue addV = DAG.getNode(ISD::ADD, SDLoc(N), RHS.getValueType(), LHS, RHS.getOperand(1)); - return DAG.getSetCC(N->getDebugLoc(), N->getValueType(0), + return DAG.getSetCC(SDLoc(N), N->getValueType(0), addV, DAG.getConstant(0, addV.getValueType()), CC); } return SDValue(); @@ -17575,7 +18883,7 @@ static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG) { // Helper function of PerformSETCCCombine. It is to materialize "setb reg" // as "sbb reg,reg", since it can be extended without zext and produces // an all-ones bit which is more useful than 0/1 in some cases. 
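Before the MaterializeSETB helper that follows, a scalar sketch (illustrative only) of the setb-versus-sbb trick described in the comment above: with the carry flag set, sbb of a register with itself yields all ones rather than 1, and AND-ing that with 1 recovers the plain setb value.

#include <cstdint>
#include <cstdio>

// sbb r, r computes r - r - CF = -CF, i.e. 0 or all-ones, while setb r
// produces 0 or 1.  The all-ones form is directly usable as a mask; the
// helper below masks it with 1 when the 0/1 value is what is needed.
int main() {
  for (int cf = 0; cf <= 1; ++cf) {
    int8_t setb = cf ? 1 : 0;                  // SETB r8
    int8_t sbb = cf ? int8_t(-1) : int8_t(0);  // SBB r8, r8
    std::printf("CF=%d setb=%d sbb=%d (sbb & 1)=%d\n",
                cf, setb, sbb, sbb & 1);
  }
  return 0;
}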
-static SDValue MaterializeSETB(DebugLoc DL, SDValue EFLAGS, SelectionDAG &DAG) { +static SDValue MaterializeSETB(SDLoc DL, SDValue EFLAGS, SelectionDAG &DAG) { return DAG.getNode(ISD::AND, DL, MVT::i8, DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8, DAG.getConstant(X86::COND_B, MVT::i8), EFLAGS), @@ -17586,7 +18894,7 @@ static SDValue MaterializeSETB(DebugLoc DL, SDValue EFLAGS, SelectionDAG &DAG) { static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget *Subtarget) { - DebugLoc DL = N->getDebugLoc(); + SDLoc DL(N); X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0)); SDValue EFLAGS = N->getOperand(1); @@ -17600,7 +18908,7 @@ static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG, if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() && EFLAGS.getValueType().isInteger() && !isa<ConstantSDNode>(EFLAGS.getOperand(1))) { - SDValue NewSub = DAG.getNode(X86ISD::SUB, EFLAGS.getDebugLoc(), + SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(), EFLAGS.getOperand(1), EFLAGS.getOperand(0)); SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo()); @@ -17630,7 +18938,7 @@ static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG, static SDValue PerformBrCondCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget *Subtarget) { - DebugLoc DL = N->getDebugLoc(); + SDLoc DL(N); SDValue Chain = N->getOperand(0); SDValue Dest = N->getOperand(1); SDValue EFLAGS = N->getOperand(3); @@ -17655,7 +18963,7 @@ static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG, // SINT_TO_FP(v4i8) -> SINT_TO_FP(SEXT(v4i8 to v4i32)) if (InVT == MVT::v8i8 || InVT == MVT::v4i8) { - DebugLoc dl = N->getDebugLoc(); + SDLoc dl(N); MVT DstVT = InVT == MVT::v4i8 ? MVT::v4i32 : MVT::v8i32; SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0); return DAG.getNode(ISD::SINT_TO_FP, dl, N->getValueType(0), P); @@ -17690,7 +18998,7 @@ static SDValue PerformADCCombine(SDNode *N, SelectionDAG &DAG, // We don't have a good way to replace an EFLAGS use, so only do this when // dead right now. SDValue(N, 1).use_empty()) { - DebugLoc DL = N->getDebugLoc(); + SDLoc DL(N); EVT VT = N->getValueType(0); SDValue CarryOut = DAG.getConstant(0, N->getValueType(1)); SDValue Res1 = DAG.getNode(ISD::AND, DL, VT, @@ -17709,7 +19017,7 @@ static SDValue PerformADCCombine(SDNode *N, SelectionDAG &DAG, // (sub (sete X, 0), Y) -> sbb 0, Y // (sub (setne X, 0), Y) -> adc -1, Y static SDValue OptimizeConditionalInDecrement(SDNode *N, SelectionDAG &DAG) { - DebugLoc DL = N->getDebugLoc(); + SDLoc DL(N); // Look through ZExts. SDValue Ext = N->getOperand(N->getOpcode() == ISD::SUB ? 
1 : 0); @@ -17755,7 +19063,7 @@ static SDValue PerformAddCombine(SDNode *N, SelectionDAG &DAG, if (((Subtarget->hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) || (Subtarget->hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) && isHorizontalBinOp(Op0, Op1, true)) - return DAG.getNode(X86ISD::HADD, N->getDebugLoc(), VT, Op0, Op1); + return DAG.getNode(X86ISD::HADD, SDLoc(N), VT, Op0, Op1); return OptimizeConditionalInDecrement(N, DAG); } @@ -17775,10 +19083,10 @@ static SDValue PerformSubCombine(SDNode *N, SelectionDAG &DAG, isa<ConstantSDNode>(Op1.getOperand(1))) { APInt XorC = cast<ConstantSDNode>(Op1.getOperand(1))->getAPIntValue(); EVT VT = Op0.getValueType(); - SDValue NewXor = DAG.getNode(ISD::XOR, Op1.getDebugLoc(), VT, + SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT, Op1.getOperand(0), DAG.getConstant(~XorC, VT)); - return DAG.getNode(ISD::ADD, N->getDebugLoc(), VT, NewXor, + return DAG.getNode(ISD::ADD, SDLoc(N), VT, NewXor, DAG.getConstant(C->getAPIntValue()+1, VT)); } } @@ -17788,7 +19096,7 @@ static SDValue PerformSubCombine(SDNode *N, SelectionDAG &DAG, if (((Subtarget->hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) || (Subtarget->hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) && isHorizontalBinOp(Op0, Op1, true)) - return DAG.getNode(X86ISD::HSUB, N->getDebugLoc(), VT, Op0, Op1); + return DAG.getNode(X86ISD::HSUB, SDLoc(N), VT, Op0, Op1); return OptimizeConditionalInDecrement(N, DAG); } @@ -17805,7 +19113,7 @@ static SDValue performVZEXTCombine(SDNode *N, SelectionDAG &DAG, if (In.getOpcode() != X86ISD::VZEXT) return SDValue(); - return DAG.getNode(X86ISD::VZEXT, N->getDebugLoc(), N->getValueType(0), + return DAG.getNode(X86ISD::VZEXT, SDLoc(N), N->getValueType(0), In.getOperand(0)); } @@ -17839,6 +19147,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case X86ISD::FMIN: case X86ISD::FMAX: return PerformFMinFMaxCombine(N, DAG); case X86ISD::FAND: return PerformFANDCombine(N, DAG); + case X86ISD::FANDN: return PerformFANDNCombine(N, DAG); case X86ISD::BT: return PerformBTCombine(N, DAG, DCI); case X86ISD::VZEXT_MOVL: return PerformVZEXT_MOVLCombine(N, DAG); case ISD::ANY_EXTEND: @@ -17994,6 +19303,22 @@ namespace { const VariadicFunction1<bool, StringRef, StringRef, matchAsmImpl> matchAsm={}; } +static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) { + + if (AsmPieces.size() == 3 || AsmPieces.size() == 4) { + if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{cc}") && + std::count(AsmPieces.begin(), AsmPieces.end(), "~{flags}") && + std::count(AsmPieces.begin(), AsmPieces.end(), "~{fpsr}")) { + + if (AsmPieces.size() == 3) + return true; + else if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{dirflag}")) + return true; + } + } + return false; +} + bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const { InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue()); @@ -18035,12 +19360,8 @@ bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const { const std::string &ConstraintsStr = IA->getConstraintString(); SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ","); array_pod_sort(AsmPieces.begin(), AsmPieces.end()); - if (AsmPieces.size() == 4 && - AsmPieces[0] == "~{cc}" && - AsmPieces[1] == "~{dirflag}" && - AsmPieces[2] == "~{flags}" && - AsmPieces[3] == "~{fpsr}") - return IntrinsicLowering::LowerToByteSwap(CI); + if (clobbersFlagRegisters(AsmPieces)) + return IntrinsicLowering::LowerToByteSwap(CI); } break; case 3: @@ -18053,11 +19374,7 @@ bool 
X86TargetLowering::ExpandInlineAsm(CallInst *CI) const { const std::string &ConstraintsStr = IA->getConstraintString(); SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ","); array_pod_sort(AsmPieces.begin(), AsmPieces.end()); - if (AsmPieces.size() == 4 && - AsmPieces[0] == "~{cc}" && - AsmPieces[1] == "~{dirflag}" && - AsmPieces[2] == "~{flags}" && - AsmPieces[3] == "~{fpsr}") + if (clobbersFlagRegisters(AsmPieces)) return IntrinsicLowering::LowerToByteSwap(CI); } @@ -18365,7 +19682,7 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, getTargetMachine()))) return; - Result = DAG.getTargetGlobalAddress(GV, Op.getDebugLoc(), + Result = DAG.getTargetGlobalAddress(GV, SDLoc(Op), GA->getValueType(0), Offset); break; } @@ -18380,7 +19697,7 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, std::pair<unsigned, const TargetRegisterClass*> X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, - EVT VT) const { + MVT VT) const { // First, see if this is a constraint that directly corresponds to an LLVM // register class. if (Constraint.size() == 1) { @@ -18447,7 +19764,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed if (!Subtarget->hasSSE1()) break; - switch (VT.getSimpleVT().SimpleTy) { + switch (VT.SimpleTy) { default: break; // Scalar SSE types. case MVT::f32: @@ -18472,6 +19789,11 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, case MVT::v8f32: case MVT::v4f64: return std::make_pair(0U, &X86::VR256RegClass); + case MVT::v8f64: + case MVT::v16f32: + case MVT::v16i32: + case MVT::v8i64: + return std::make_pair(0U, &X86::VR512RegClass); } break; } @@ -18582,7 +19904,13 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, } } else if (Res.second == &X86::FR32RegClass || Res.second == &X86::FR64RegClass || - Res.second == &X86::VR128RegClass) { + Res.second == &X86::VR128RegClass || + Res.second == &X86::VR256RegClass || + Res.second == &X86::FR32XRegClass || + Res.second == &X86::FR64XRegClass || + Res.second == &X86::VR128XRegClass || + Res.second == &X86::VR256XRegClass || + Res.second == &X86::VR512RegClass) { // Handle references to XMM physical registers that got mapped into the // wrong class. This can happen with constraints like {xmm0} where the // target independent register mapper will just pick the first match it can @@ -18596,6 +19924,8 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, Res.second = &X86::VR128RegClass; else if (X86::VR256RegClass.hasType(VT)) Res.second = &X86::VR256RegClass; + else if (X86::VR512RegClass.hasType(VT)) + Res.second = &X86::VR512RegClass; } return Res; diff --git a/contrib/llvm/lib/Target/X86/X86ISelLowering.h b/contrib/llvm/lib/Target/X86/X86ISelLowering.h index 2727e22..bc3dd60 100644 --- a/contrib/llvm/lib/Target/X86/X86ISelLowering.h +++ b/contrib/llvm/lib/Target/X86/X86ISelLowering.h @@ -53,6 +53,10 @@ namespace llvm { /// to X86::XORPS or X86::XORPD. FXOR, + /// FANDN - Bitwise logical ANDNOT of floating point values. This + /// corresponds to X86::ANDNPS or X86::ANDNPD. + FANDN, + /// FSRL - Bitwise logical right shift of floating point values. These /// corresponds to X86::PSRLDQ. FSRL, @@ -250,6 +254,12 @@ namespace llvm { // VSEXT - Vector integer signed-extend. VSEXT, + // VTRUNC - Vector integer truncate. + VTRUNC, + + // VTRUNC - Vector integer truncate with mask. 
+ VTRUNCM, + // VFPEXT - Vector FP extend. VFPEXT, @@ -270,6 +280,13 @@ namespace llvm { // PCMP* - Vector integer comparisons. PCMPEQ, PCMPGT, + // PCMP*M - Vector integer comparisons, the result is in a mask vector. + PCMPEQM, PCMPGTM, + + /// CMPM, CMPMU - Vector comparison generating mask bits for fp and + /// integer signed and unsigned data types. + CMPM, + CMPMU, // ADD, SUB, SMUL, etc. - Arithmetic operations with FLAGS results. ADD, SUB, ADC, SBB, SMUL, @@ -278,18 +295,27 @@ namespace llvm { BLSI, // BLSI - Extract lowest set isolated bit BLSMSK, // BLSMSK - Get mask up to lowest set bit BLSR, // BLSR - Reset lowest set bit + BZHI, // BZHI - Zero high bits + BEXTR, // BEXTR - Bit field extract UMUL, // LOW, HI, FLAGS = umul LHS, RHS // MUL_IMM - X86 specific multiply by immediate. MUL_IMM, - // PTEST - Vector bitwise comparisons + // PTEST - Vector bitwise comparisons. PTEST, - // TESTP - Vector packed fp sign bitwise comparisons + // TESTP - Vector packed fp sign bitwise comparisons. TESTP, + // TESTM - Vector "test" in AVX-512, the result is in a mask vector. + TESTM, + + // OR/AND test for masks + KORTEST, + KTEST, + // Several flavors of instructions with vector shuffle behaviors. PALIGNR, PSHUFD, @@ -310,9 +336,13 @@ namespace llvm { UNPCKH, VPERMILP, VPERMV, + VPERMV3, VPERMI, VPERM2X128, VBROADCAST, + // masked broadcast + VBROADCASTM, + VINSERT, // PMULUDQ - Vector multiply packed unsigned doubleword integers PMULUDQ, @@ -434,25 +464,45 @@ namespace llvm { /// Define some predicates that are used for node matching. namespace X86 { - /// isVEXTRACTF128Index - Return true if the specified + /// isVEXTRACT128Index - Return true if the specified + /// EXTRACT_SUBVECTOR operand specifies a vector extract that is + /// suitable for input to VEXTRACTF128, VEXTRACTI128 instructions. + bool isVEXTRACT128Index(SDNode *N); + + /// isVINSERT128Index - Return true if the specified + /// INSERT_SUBVECTOR operand specifies a subvector insert that is + /// suitable for input to VINSERTF128, VINSERTI128 instructions. + bool isVINSERT128Index(SDNode *N); + + /// isVEXTRACT256Index - Return true if the specified /// EXTRACT_SUBVECTOR operand specifies a vector extract that is - /// suitable for input to VEXTRACTF128. - bool isVEXTRACTF128Index(SDNode *N); + /// suitable for input to VEXTRACTF64X4, VEXTRACTI64X4 instructions. + bool isVEXTRACT256Index(SDNode *N); - /// isVINSERTF128Index - Return true if the specified + /// isVINSERT256Index - Return true if the specified /// INSERT_SUBVECTOR operand specifies a subvector insert that is - /// suitable for input to VINSERTF128. - bool isVINSERTF128Index(SDNode *N); + /// suitable for input to VINSERTF64X4, VINSERTI64X4 instructions. + bool isVINSERT256Index(SDNode *N); - /// getExtractVEXTRACTF128Immediate - Return the appropriate + /// getExtractVEXTRACT128Immediate - Return the appropriate /// immediate to extract the specified EXTRACT_SUBVECTOR index - /// with VEXTRACTF128 instructions. - unsigned getExtractVEXTRACTF128Immediate(SDNode *N); + /// with VEXTRACTF128, VEXTRACTI128 instructions. + unsigned getExtractVEXTRACT128Immediate(SDNode *N); - /// getInsertVINSERTF128Immediate - Return the appropriate + /// getInsertVINSERT128Immediate - Return the appropriate /// immediate to insert at the specified INSERT_SUBVECTOR index - /// with VINSERTF128 instructions. - unsigned getInsertVINSERTF128Immediate(SDNode *N); + /// with VINSERTF128, VINSERT128 instructions. 
+ unsigned getInsertVINSERT128Immediate(SDNode *N); + + /// getExtractVEXTRACT256Immediate - Return the appropriate + /// immediate to extract the specified EXTRACT_SUBVECTOR index + /// with VEXTRACTF64X4, VEXTRACTI64x4 instructions. + unsigned getExtractVEXTRACT256Immediate(SDNode *N); + + /// getInsertVINSERT256Immediate - Return the appropriate + /// immediate to insert at the specified INSERT_SUBVECTOR index + /// with VINSERTF64x4, VINSERTI64x4 instructions. + unsigned getInsertVINSERT256Immediate(SDNode *N); /// isZeroNode - Returns true if Elt is a constant zero or a floating point /// constant +0.0. @@ -511,7 +561,7 @@ namespace llvm { /// It returns EVT::Other if the type should be determined using generic /// target-independent logic. virtual EVT - getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign, + getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc, MachineFunction &MF) const; @@ -563,7 +613,7 @@ namespace llvm { virtual const char *getTargetNodeName(unsigned Opcode) const; /// getSetCCResultType - Return the value type to use for ISD::SETCC. - virtual EVT getSetCCResultType(EVT VT) const; + virtual EVT getSetCCResultType(LLVMContext &Context, EVT VT) const; /// computeMaskedBitsForTargetNode - Determine which of the bits specified /// in Mask are known to be either zero or one and return them in the @@ -610,7 +660,7 @@ namespace llvm { /// error, this returns a register number of 0. std::pair<unsigned, const TargetRegisterClass*> getRegForInlineAsmConstraint(const std::string &Constraint, - EVT VT) const; + MVT VT) const; /// isLegalAddressingMode - Return true if the addressing mode represented /// by AM is legal for this target, for a load/store of the specified type. @@ -634,6 +684,8 @@ namespace llvm { virtual bool isTruncateFree(Type *Ty1, Type *Ty2) const; virtual bool isTruncateFree(EVT VT1, EVT VT2) const; + virtual bool allowTruncateForTailCall(Type *Ty1, Type *Ty2) const; + /// isZExtFree - Return true if any actual instruction that defines a /// value of type Ty1 implicit zero-extends the value to Ty2 in the result /// register. This does not necessarily include registers defined in @@ -646,11 +698,11 @@ namespace llvm { virtual bool isZExtFree(EVT VT1, EVT VT2) const; virtual bool isZExtFree(SDValue Val, EVT VT2) const; - /// isFMAFasterThanMulAndAdd - Return true if an FMA operation is faster than - /// a pair of mul and add instructions. fmuladd intrinsics will be expanded to - /// FMAs when this method returns true (and FMAs are legal), otherwise fmuladd - /// is expanded to mul + add. - virtual bool isFMAFasterThanMulAndAdd(EVT) const { return true; } + /// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster + /// than a pair of fmul and fadd instructions. fmuladd intrinsics will be + /// expanded to FMAs when this method returns true, otherwise fmuladd is + /// expanded to fmul + fadd. + virtual bool isFMAFasterThanFMulAndFAdd(EVT VT) const; /// isNarrowingProfitable - Return true if it's profitable to narrow /// operations of type VT1 to VT2. e.g. on x86, it's profitable to narrow @@ -723,6 +775,8 @@ namespace llvm { SDValue BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, SDValue StackSlot, SelectionDAG &DAG) const; + virtual bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const LLVM_OVERRIDE; + /// \brief Reset the operation actions based on target options. 
virtual void resetOperationActions(); @@ -734,7 +788,6 @@ namespace llvm { /// Subtarget - Keep a pointer to the X86Subtarget around so that we can /// make the right decision when generating code for different targets. const X86Subtarget *Subtarget; - const X86RegisterInfo *RegInfo; const DataLayout *TD; /// Used to store the TargetOptions so that we don't waste time resetting @@ -760,16 +813,16 @@ namespace llvm { SDValue LowerCallResult(SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl<ISD::InputArg> &Ins, - DebugLoc dl, SelectionDAG &DAG, + SDLoc dl, SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const; SDValue LowerMemArgument(SDValue Chain, CallingConv::ID CallConv, const SmallVectorImpl<ISD::InputArg> &ArgInfo, - DebugLoc dl, SelectionDAG &DAG, + SDLoc dl, SelectionDAG &DAG, const CCValAssign &VA, MachineFrameInfo *MFI, unsigned i) const; SDValue LowerMemOpCallTo(SDValue Chain, SDValue StackPtr, SDValue Arg, - DebugLoc dl, SelectionDAG &DAG, + SDLoc dl, SelectionDAG &DAG, const CCValAssign &VA, ISD::ArgFlagsTy Flags) const; @@ -791,7 +844,7 @@ namespace llvm { bool IsCalleePop(bool isVarArg, CallingConv::ID CallConv) const; SDValue EmitTailCallLoadRetAddr(SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall, bool Is64Bit, - int FPDiff, DebugLoc dl) const; + int FPDiff, SDLoc dl) const; unsigned GetAlignedArgumentStackSize(unsigned StackSize, SelectionDAG &DAG) const; @@ -800,37 +853,32 @@ namespace llvm { bool isSigned, bool isReplace) const; - SDValue LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl, - SelectionDAG &DAG) const; SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerGlobalAddress(const GlobalValue *GV, DebugLoc dl, + SDValue LowerGlobalAddress(const GlobalValue *GV, SDLoc dl, int64_t Offset, SelectionDAG &DAG) const; SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const; SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const; SDValue LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const; SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerBITCAST(SDValue op, SelectionDAG &DAG) const; SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const; SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const; SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG) const; SDValue LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG) const; SDValue lowerUINT_TO_FP_vec(SDValue Op, SelectionDAG &DAG) const; SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerZERO_EXTEND(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerSIGN_EXTEND(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerANY_EXTEND(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFABS(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFNEG(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const; SDValue LowerToBT(SDValue And, ISD::CondCode CC, - DebugLoc dl, SelectionDAG &DAG) 
const; + SDLoc dl, SelectionDAG &DAG) const; SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const; @@ -847,25 +895,13 @@ namespace llvm { SDValue lowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerShift(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const; - - // Utility functions to help LowerVECTOR_SHUFFLE & LowerBUILD_VECTOR - SDValue LowerVectorBroadcast(SDValue Op, SelectionDAG &DAG) const; - SDValue NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG) const; - SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) const; - - SDValue LowerVectorAllZeroTest(SDValue Op, SelectionDAG &DAG) const; - - SDValue LowerVectorIntExtend(SDValue Op, SelectionDAG &DAG) const; virtual SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl<ISD::InputArg> &Ins, - DebugLoc dl, SelectionDAG &DAG, + SDLoc dl, SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const; virtual SDValue LowerCall(CallLoweringInfo &CLI, @@ -876,7 +912,7 @@ namespace llvm { CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl<ISD::OutputArg> &Outs, const SmallVectorImpl<SDValue> &OutVals, - DebugLoc dl, SelectionDAG &DAG) const; + SDLoc dl, SelectionDAG &DAG) const; virtual bool isUsedByReturnOnly(SDNode *N, SDValue &Chain) const; @@ -891,6 +927,8 @@ namespace llvm { const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const; + virtual const uint16_t *getScratchRegisters(CallingConv::ID CC) const; + /// Utility function to emit atomic-load-arith operations (and, or, xor, /// nand, max, min, umax, umin). It takes the corresponding instruction to /// expand, the associated machine basic block, and the associated X86 diff --git a/contrib/llvm/lib/Target/X86/X86InstrAVX512.td b/contrib/llvm/lib/Target/X86/X86InstrAVX512.td new file mode 100644 index 0000000..cb19fbd --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86InstrAVX512.td @@ -0,0 +1,3526 @@ +// Bitcasts between 512-bit vector types. 
Return the original type since +// no instruction is needed for the conversion +let Predicates = [HasAVX512] in { + def : Pat<(v8f64 (bitconvert (v16f32 VR512:$src))), (v8f64 VR512:$src)>; + def : Pat<(v8f64 (bitconvert (v16i32 VR512:$src))), (v8f64 VR512:$src)>; + def : Pat<(v8f64 (bitconvert (v8i64 VR512:$src))), (v8f64 VR512:$src)>; + def : Pat<(v16f32 (bitconvert (v16i32 VR512:$src))), (v16f32 VR512:$src)>; + def : Pat<(v16f32 (bitconvert (v8i64 VR512:$src))), (v16f32 VR512:$src)>; + def : Pat<(v16f32 (bitconvert (v8f64 VR512:$src))), (v16f32 VR512:$src)>; + def : Pat<(v8i64 (bitconvert (v16f32 VR512:$src))), (v8i64 VR512:$src)>; + def : Pat<(v8i64 (bitconvert (v16i32 VR512:$src))), (v8i64 VR512:$src)>; + def : Pat<(v8i64 (bitconvert (v8f64 VR512:$src))), (v8i64 VR512:$src)>; + def : Pat<(v16i32 (bitconvert (v16f32 VR512:$src))), (v16i32 VR512:$src)>; + def : Pat<(v16i32 (bitconvert (v8i64 VR512:$src))), (v16i32 VR512:$src)>; + def : Pat<(v16i32 (bitconvert (v8f64 VR512:$src))), (v16i32 VR512:$src)>; + def : Pat<(v8f64 (bitconvert (v8i64 VR512:$src))), (v8f64 VR512:$src)>; + + def : Pat<(v2i64 (bitconvert (v4i32 VR128X:$src))), (v2i64 VR128X:$src)>; + def : Pat<(v2i64 (bitconvert (v8i16 VR128X:$src))), (v2i64 VR128X:$src)>; + def : Pat<(v2i64 (bitconvert (v16i8 VR128X:$src))), (v2i64 VR128X:$src)>; + def : Pat<(v2i64 (bitconvert (v2f64 VR128X:$src))), (v2i64 VR128X:$src)>; + def : Pat<(v2i64 (bitconvert (v4f32 VR128X:$src))), (v2i64 VR128X:$src)>; + def : Pat<(v4i32 (bitconvert (v2i64 VR128X:$src))), (v4i32 VR128X:$src)>; + def : Pat<(v4i32 (bitconvert (v8i16 VR128X:$src))), (v4i32 VR128X:$src)>; + def : Pat<(v4i32 (bitconvert (v16i8 VR128X:$src))), (v4i32 VR128X:$src)>; + def : Pat<(v4i32 (bitconvert (v2f64 VR128X:$src))), (v4i32 VR128X:$src)>; + def : Pat<(v4i32 (bitconvert (v4f32 VR128X:$src))), (v4i32 VR128X:$src)>; + def : Pat<(v8i16 (bitconvert (v2i64 VR128X:$src))), (v8i16 VR128X:$src)>; + def : Pat<(v8i16 (bitconvert (v4i32 VR128X:$src))), (v8i16 VR128X:$src)>; + def : Pat<(v8i16 (bitconvert (v16i8 VR128X:$src))), (v8i16 VR128X:$src)>; + def : Pat<(v8i16 (bitconvert (v2f64 VR128X:$src))), (v8i16 VR128X:$src)>; + def : Pat<(v8i16 (bitconvert (v4f32 VR128X:$src))), (v8i16 VR128X:$src)>; + def : Pat<(v16i8 (bitconvert (v2i64 VR128X:$src))), (v16i8 VR128X:$src)>; + def : Pat<(v16i8 (bitconvert (v4i32 VR128X:$src))), (v16i8 VR128X:$src)>; + def : Pat<(v16i8 (bitconvert (v8i16 VR128X:$src))), (v16i8 VR128X:$src)>; + def : Pat<(v16i8 (bitconvert (v2f64 VR128X:$src))), (v16i8 VR128X:$src)>; + def : Pat<(v16i8 (bitconvert (v4f32 VR128X:$src))), (v16i8 VR128X:$src)>; + def : Pat<(v4f32 (bitconvert (v2i64 VR128X:$src))), (v4f32 VR128X:$src)>; + def : Pat<(v4f32 (bitconvert (v4i32 VR128X:$src))), (v4f32 VR128X:$src)>; + def : Pat<(v4f32 (bitconvert (v8i16 VR128X:$src))), (v4f32 VR128X:$src)>; + def : Pat<(v4f32 (bitconvert (v16i8 VR128X:$src))), (v4f32 VR128X:$src)>; + def : Pat<(v4f32 (bitconvert (v2f64 VR128X:$src))), (v4f32 VR128X:$src)>; + def : Pat<(v2f64 (bitconvert (v2i64 VR128X:$src))), (v2f64 VR128X:$src)>; + def : Pat<(v2f64 (bitconvert (v4i32 VR128X:$src))), (v2f64 VR128X:$src)>; + def : Pat<(v2f64 (bitconvert (v8i16 VR128X:$src))), (v2f64 VR128X:$src)>; + def : Pat<(v2f64 (bitconvert (v16i8 VR128X:$src))), (v2f64 VR128X:$src)>; + def : Pat<(v2f64 (bitconvert (v4f32 VR128X:$src))), (v2f64 VR128X:$src)>; + +// Bitcasts between 256-bit vector types. 
Return the original type since +// no instruction is needed for the conversion + def : Pat<(v4f64 (bitconvert (v8f32 VR256X:$src))), (v4f64 VR256X:$src)>; + def : Pat<(v4f64 (bitconvert (v8i32 VR256X:$src))), (v4f64 VR256X:$src)>; + def : Pat<(v4f64 (bitconvert (v4i64 VR256X:$src))), (v4f64 VR256X:$src)>; + def : Pat<(v4f64 (bitconvert (v16i16 VR256X:$src))), (v4f64 VR256X:$src)>; + def : Pat<(v4f64 (bitconvert (v32i8 VR256X:$src))), (v4f64 VR256X:$src)>; + def : Pat<(v8f32 (bitconvert (v8i32 VR256X:$src))), (v8f32 VR256X:$src)>; + def : Pat<(v8f32 (bitconvert (v4i64 VR256X:$src))), (v8f32 VR256X:$src)>; + def : Pat<(v8f32 (bitconvert (v4f64 VR256X:$src))), (v8f32 VR256X:$src)>; + def : Pat<(v8f32 (bitconvert (v32i8 VR256X:$src))), (v8f32 VR256X:$src)>; + def : Pat<(v8f32 (bitconvert (v16i16 VR256X:$src))), (v8f32 VR256X:$src)>; + def : Pat<(v4i64 (bitconvert (v8f32 VR256X:$src))), (v4i64 VR256X:$src)>; + def : Pat<(v4i64 (bitconvert (v8i32 VR256X:$src))), (v4i64 VR256X:$src)>; + def : Pat<(v4i64 (bitconvert (v4f64 VR256X:$src))), (v4i64 VR256X:$src)>; + def : Pat<(v4i64 (bitconvert (v32i8 VR256X:$src))), (v4i64 VR256X:$src)>; + def : Pat<(v4i64 (bitconvert (v16i16 VR256X:$src))), (v4i64 VR256X:$src)>; + def : Pat<(v32i8 (bitconvert (v4f64 VR256X:$src))), (v32i8 VR256X:$src)>; + def : Pat<(v32i8 (bitconvert (v4i64 VR256X:$src))), (v32i8 VR256X:$src)>; + def : Pat<(v32i8 (bitconvert (v8f32 VR256X:$src))), (v32i8 VR256X:$src)>; + def : Pat<(v32i8 (bitconvert (v8i32 VR256X:$src))), (v32i8 VR256X:$src)>; + def : Pat<(v32i8 (bitconvert (v16i16 VR256X:$src))), (v32i8 VR256X:$src)>; + def : Pat<(v8i32 (bitconvert (v32i8 VR256X:$src))), (v8i32 VR256X:$src)>; + def : Pat<(v8i32 (bitconvert (v16i16 VR256X:$src))), (v8i32 VR256X:$src)>; + def : Pat<(v8i32 (bitconvert (v8f32 VR256X:$src))), (v8i32 VR256X:$src)>; + def : Pat<(v8i32 (bitconvert (v4i64 VR256X:$src))), (v8i32 VR256X:$src)>; + def : Pat<(v8i32 (bitconvert (v4f64 VR256X:$src))), (v8i32 VR256X:$src)>; + def : Pat<(v16i16 (bitconvert (v8f32 VR256X:$src))), (v16i16 VR256X:$src)>; + def : Pat<(v16i16 (bitconvert (v8i32 VR256X:$src))), (v16i16 VR256X:$src)>; + def : Pat<(v16i16 (bitconvert (v4i64 VR256X:$src))), (v16i16 VR256X:$src)>; + def : Pat<(v16i16 (bitconvert (v4f64 VR256X:$src))), (v16i16 VR256X:$src)>; + def : Pat<(v16i16 (bitconvert (v32i8 VR256X:$src))), (v16i16 VR256X:$src)>; +} + +// +// AVX-512: VPXOR instruction writes zero to its upper part, it's safe build zeros. 
+// + +let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, + isPseudo = 1, Predicates = [HasAVX512] in { +def AVX512_512_SET0 : I<0, Pseudo, (outs VR512:$dst), (ins), "", + [(set VR512:$dst, (v16f32 immAllZerosV))]>; +} + +def : Pat<(v8i64 immAllZerosV), (AVX512_512_SET0)>; +def : Pat<(v16i32 immAllZerosV), (AVX512_512_SET0)>; +def : Pat<(v8f64 immAllZerosV), (AVX512_512_SET0)>; +def : Pat<(v16f32 immAllZerosV), (AVX512_512_SET0)>; + +//===----------------------------------------------------------------------===// +// AVX-512 - VECTOR INSERT +// +// -- 32x8 form -- +let neverHasSideEffects = 1, ExeDomain = SSEPackedSingle in { +def VINSERTF32x4rr : AVX512AIi8<0x18, MRMSrcReg, (outs VR512:$dst), + (ins VR512:$src1, VR128X:$src2, i8imm:$src3), + "vinsertf32x4\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + []>, EVEX_4V, EVEX_V512; +let mayLoad = 1 in +def VINSERTF32x4rm : AVX512AIi8<0x18, MRMSrcMem, (outs VR512:$dst), + (ins VR512:$src1, f128mem:$src2, i8imm:$src3), + "vinsertf32x4\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + []>, EVEX_4V, EVEX_V512, EVEX_CD8<32, CD8VT4>; +} + +// -- 64x4 fp form -- +let neverHasSideEffects = 1, ExeDomain = SSEPackedDouble in { +def VINSERTF64x4rr : AVX512AIi8<0x1a, MRMSrcReg, (outs VR512:$dst), + (ins VR512:$src1, VR256X:$src2, i8imm:$src3), + "vinsertf64x4\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + []>, EVEX_4V, EVEX_V512, VEX_W; +let mayLoad = 1 in +def VINSERTF64x4rm : AVX512AIi8<0x1a, MRMSrcMem, (outs VR512:$dst), + (ins VR512:$src1, i256mem:$src2, i8imm:$src3), + "vinsertf64x4\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + []>, EVEX_4V, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT4>; +} +// -- 32x4 integer form -- +let neverHasSideEffects = 1 in { +def VINSERTI32x4rr : AVX512AIi8<0x38, MRMSrcReg, (outs VR512:$dst), + (ins VR512:$src1, VR128X:$src2, i8imm:$src3), + "vinserti32x4\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + []>, EVEX_4V, EVEX_V512; +let mayLoad = 1 in +def VINSERTI32x4rm : AVX512AIi8<0x38, MRMSrcMem, (outs VR512:$dst), + (ins VR512:$src1, i128mem:$src2, i8imm:$src3), + "vinserti32x4\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + []>, EVEX_4V, EVEX_V512, EVEX_CD8<32, CD8VT4>; + +} + +let neverHasSideEffects = 1 in { +// -- 64x4 form -- +def VINSERTI64x4rr : AVX512AIi8<0x3a, MRMSrcReg, (outs VR512:$dst), + (ins VR512:$src1, VR256X:$src2, i8imm:$src3), + "vinserti64x4\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + []>, EVEX_4V, EVEX_V512, VEX_W; +let mayLoad = 1 in +def VINSERTI64x4rm : AVX512AIi8<0x3a, MRMSrcMem, (outs VR512:$dst), + (ins VR512:$src1, i256mem:$src2, i8imm:$src3), + "vinserti64x4\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + []>, EVEX_4V, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT4>; +} + +def : Pat<(vinsert128_insert:$ins (v16f32 VR512:$src1), (v4f32 VR128X:$src2), + (iPTR imm)), (VINSERTF32x4rr VR512:$src1, VR128X:$src2, + (INSERT_get_vinsert128_imm VR512:$ins))>; +def : Pat<(vinsert128_insert:$ins (v8f64 VR512:$src1), (v2f64 VR128X:$src2), + (iPTR imm)), (VINSERTF32x4rr VR512:$src1, VR128X:$src2, + (INSERT_get_vinsert128_imm VR512:$ins))>; +def : Pat<(vinsert128_insert:$ins (v8i64 VR512:$src1), (v2i64 VR128X:$src2), + (iPTR imm)), (VINSERTI32x4rr VR512:$src1, VR128X:$src2, + (INSERT_get_vinsert128_imm VR512:$ins))>; +def : Pat<(vinsert128_insert:$ins (v16i32 VR512:$src1), (v4i32 VR128X:$src2), + (iPTR imm)), (VINSERTI32x4rr VR512:$src1, VR128X:$src2, + (INSERT_get_vinsert128_imm VR512:$ins))>; + +def : 
Pat<(vinsert128_insert:$ins (v16f32 VR512:$src1), (loadv4f32 addr:$src2), + (iPTR imm)), (VINSERTF32x4rm VR512:$src1, addr:$src2, + (INSERT_get_vinsert128_imm VR512:$ins))>; +def : Pat<(vinsert128_insert:$ins (v16i32 VR512:$src1), + (bc_v4i32 (loadv2i64 addr:$src2)), + (iPTR imm)), (VINSERTI32x4rm VR512:$src1, addr:$src2, + (INSERT_get_vinsert128_imm VR512:$ins))>; +def : Pat<(vinsert128_insert:$ins (v8f64 VR512:$src1), (loadv2f64 addr:$src2), + (iPTR imm)), (VINSERTF32x4rm VR512:$src1, addr:$src2, + (INSERT_get_vinsert128_imm VR512:$ins))>; +def : Pat<(vinsert128_insert:$ins (v8i64 VR512:$src1), (loadv2i64 addr:$src2), + (iPTR imm)), (VINSERTI32x4rm VR512:$src1, addr:$src2, + (INSERT_get_vinsert128_imm VR512:$ins))>; + +def : Pat<(vinsert256_insert:$ins (v16f32 VR512:$src1), (v8f32 VR256X:$src2), + (iPTR imm)), (VINSERTF64x4rr VR512:$src1, VR256X:$src2, + (INSERT_get_vinsert256_imm VR512:$ins))>; +def : Pat<(vinsert256_insert:$ins (v8f64 VR512:$src1), (v4f64 VR256X:$src2), + (iPTR imm)), (VINSERTF64x4rr VR512:$src1, VR256X:$src2, + (INSERT_get_vinsert256_imm VR512:$ins))>; +def : Pat<(vinsert128_insert:$ins (v8i64 VR512:$src1), (v4i64 VR256X:$src2), + (iPTR imm)), (VINSERTI64x4rr VR512:$src1, VR256X:$src2, + (INSERT_get_vinsert256_imm VR512:$ins))>; +def : Pat<(vinsert128_insert:$ins (v16i32 VR512:$src1), (v8i32 VR256X:$src2), + (iPTR imm)), (VINSERTI64x4rr VR512:$src1, VR256X:$src2, + (INSERT_get_vinsert256_imm VR512:$ins))>; + +def : Pat<(vinsert256_insert:$ins (v16f32 VR512:$src1), (loadv8f32 addr:$src2), + (iPTR imm)), (VINSERTF64x4rm VR512:$src1, addr:$src2, + (INSERT_get_vinsert256_imm VR512:$ins))>; +def : Pat<(vinsert256_insert:$ins (v8f64 VR512:$src1), (loadv4f64 addr:$src2), + (iPTR imm)), (VINSERTF64x4rm VR512:$src1, addr:$src2, + (INSERT_get_vinsert256_imm VR512:$ins))>; +def : Pat<(vinsert256_insert:$ins (v8i64 VR512:$src1), (loadv4i64 addr:$src2), + (iPTR imm)), (VINSERTI64x4rm VR512:$src1, addr:$src2, + (INSERT_get_vinsert256_imm VR512:$ins))>; +def : Pat<(vinsert256_insert:$ins (v16i32 VR512:$src1), + (bc_v8i32 (loadv4i64 addr:$src2)), + (iPTR imm)), (VINSERTI64x4rm VR512:$src1, addr:$src2, + (INSERT_get_vinsert256_imm VR512:$ins))>; + +// vinsertps - insert f32 to XMM +def VINSERTPSzrr : AVX512AIi8<0x21, MRMSrcReg, (outs VR128X:$dst), + (ins VR128X:$src1, VR128X:$src2, u32u8imm:$src3), + "vinsertps{z}\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + [(set VR128X:$dst, (X86insrtps VR128X:$src1, VR128X:$src2, imm:$src3))]>, + EVEX_4V; +def VINSERTPSzrm: AVX512AIi8<0x21, MRMSrcMem, (outs VR128X:$dst), + (ins VR128X:$src1, f32mem:$src2, u32u8imm:$src3), + "vinsertps{z}\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + [(set VR128X:$dst, (X86insrtps VR128X:$src1, + (v4f32 (scalar_to_vector (loadf32 addr:$src2))), + imm:$src3))]>, EVEX_4V, EVEX_CD8<32, CD8VT1>; + +//===----------------------------------------------------------------------===// +// AVX-512 VECTOR EXTRACT +//--- +let neverHasSideEffects = 1, ExeDomain = SSEPackedSingle in { +// -- 32x4 form -- +def VEXTRACTF32x4rr : AVX512AIi8<0x19, MRMDestReg, (outs VR128X:$dst), + (ins VR512:$src1, i8imm:$src2), + "vextractf32x4\t{$src2, $src1, $dst|$dst, $src1, $src2}", + []>, EVEX, EVEX_V512; +def VEXTRACTF32x4mr : AVX512AIi8<0x19, MRMDestMem, (outs), + (ins f128mem:$dst, VR512:$src1, i8imm:$src2), + "vextractf32x4\t{$src2, $src1, $dst|$dst, $src1, $src2}", + []>, EVEX, EVEX_V512, EVEX_CD8<32, CD8VT4>; + +// -- 64x4 form -- +def VEXTRACTF64x4rr : AVX512AIi8<0x1b, MRMDestReg, (outs VR256X:$dst), + (ins 
VR512:$src1, i8imm:$src2), + "vextractf64x4\t{$src2, $src1, $dst|$dst, $src1, $src2}", + []>, EVEX, EVEX_V512, VEX_W; +let mayStore = 1 in +def VEXTRACTF64x4mr : AVX512AIi8<0x1b, MRMDestMem, (outs), + (ins f256mem:$dst, VR512:$src1, i8imm:$src2), + "vextractf64x4\t{$src2, $src1, $dst|$dst, $src1, $src2}", + []>, EVEX, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT4>; +} + +let neverHasSideEffects = 1 in { +// -- 32x4 form -- +def VEXTRACTI32x4rr : AVX512AIi8<0x39, MRMDestReg, (outs VR128X:$dst), + (ins VR512:$src1, i8imm:$src2), + "vextracti32x4\t{$src2, $src1, $dst|$dst, $src1, $src2}", + []>, EVEX, EVEX_V512; +def VEXTRACTI32x4mr : AVX512AIi8<0x39, MRMDestMem, (outs), + (ins i128mem:$dst, VR512:$src1, i8imm:$src2), + "vextracti32x4\t{$src2, $src1, $dst|$dst, $src1, $src2}", + []>, EVEX, EVEX_V512, EVEX_CD8<32, CD8VT4>; + +// -- 64x4 form -- +def VEXTRACTI64x4rr : AVX512AIi8<0x3b, MRMDestReg, (outs VR256X:$dst), + (ins VR512:$src1, i8imm:$src2), + "vextracti64x4\t{$src2, $src1, $dst|$dst, $src1, $src2}", + []>, EVEX, EVEX_V512, VEX_W; +let mayStore = 1 in +def VEXTRACTI64x4mr : AVX512AIi8<0x3b, MRMDestMem, (outs), + (ins i256mem:$dst, VR512:$src1, i8imm:$src2), + "vextracti64x4\t{$src2, $src1, $dst|$dst, $src1, $src2}", + []>, EVEX, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT4>; +} + +def : Pat<(vextract128_extract:$ext (v16f32 VR512:$src1), (iPTR imm)), + (v4f32 (VEXTRACTF32x4rr VR512:$src1, + (EXTRACT_get_vextract128_imm VR128X:$ext)))>; + +def : Pat<(vextract128_extract:$ext VR512:$src1, (iPTR imm)), + (v4i32 (VEXTRACTF32x4rr VR512:$src1, + (EXTRACT_get_vextract128_imm VR128X:$ext)))>; + +def : Pat<(vextract128_extract:$ext (v8f64 VR512:$src1), (iPTR imm)), + (v2f64 (VEXTRACTF32x4rr VR512:$src1, + (EXTRACT_get_vextract128_imm VR128X:$ext)))>; + +def : Pat<(vextract128_extract:$ext (v8i64 VR512:$src1), (iPTR imm)), + (v2i64 (VEXTRACTI32x4rr VR512:$src1, + (EXTRACT_get_vextract128_imm VR128X:$ext)))>; + + +def : Pat<(vextract256_extract:$ext (v16f32 VR512:$src1), (iPTR imm)), + (v8f32 (VEXTRACTF64x4rr VR512:$src1, + (EXTRACT_get_vextract256_imm VR256X:$ext)))>; + +def : Pat<(vextract256_extract:$ext (v16i32 VR512:$src1), (iPTR imm)), + (v8i32 (VEXTRACTI64x4rr VR512:$src1, + (EXTRACT_get_vextract256_imm VR256X:$ext)))>; + +def : Pat<(vextract256_extract:$ext (v8f64 VR512:$src1), (iPTR imm)), + (v4f64 (VEXTRACTF64x4rr VR512:$src1, + (EXTRACT_get_vextract256_imm VR256X:$ext)))>; + +def : Pat<(vextract256_extract:$ext (v8i64 VR512:$src1), (iPTR imm)), + (v4i64 (VEXTRACTI64x4rr VR512:$src1, + (EXTRACT_get_vextract256_imm VR256X:$ext)))>; + +// A 256-bit subvector extract from the first 512-bit vector position +// is a subregister copy that needs no instruction. 
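The subregister-copy patterns that follow encode the same fact one sees at the intrinsics level: taking the low 256 (or 128) bits of a 512-bit register is just a register alias. A minimal sketch, assuming an AVX-512F capable toolchain:

#include <immintrin.h>

// The cast intrinsic compiles to no instruction: the low 256 bits of zmmN
// are simply ymmN, matching the EXTRACT_SUBREG patterns below.
__m256i low_half(__m512i v) {
  return _mm512_castsi512_si256(v);
}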
+def : Pat<(v8i32 (extract_subvector (v16i32 VR512:$src), (iPTR 0))), + (v8i32 (EXTRACT_SUBREG (v16i32 VR512:$src), sub_ymm))>; +def : Pat<(v8f32 (extract_subvector (v16f32 VR512:$src), (iPTR 0))), + (v8f32 (EXTRACT_SUBREG (v16f32 VR512:$src), sub_ymm))>; +def : Pat<(v4i64 (extract_subvector (v8i64 VR512:$src), (iPTR 0))), + (v4i64 (EXTRACT_SUBREG (v8i64 VR512:$src), sub_ymm))>; +def : Pat<(v4f64 (extract_subvector (v8f64 VR512:$src), (iPTR 0))), + (v4f64 (EXTRACT_SUBREG (v8f64 VR512:$src), sub_ymm))>; + +// zmm -> xmm +def : Pat<(v4i32 (extract_subvector (v16i32 VR512:$src), (iPTR 0))), + (v4i32 (EXTRACT_SUBREG (v16i32 VR512:$src), sub_xmm))>; +def : Pat<(v2i64 (extract_subvector (v8i64 VR512:$src), (iPTR 0))), + (v2i64 (EXTRACT_SUBREG (v8i64 VR512:$src), sub_xmm))>; +def : Pat<(v2f64 (extract_subvector (v8f64 VR512:$src), (iPTR 0))), + (v2f64 (EXTRACT_SUBREG (v8f64 VR512:$src), sub_xmm))>; +def : Pat<(v4f32 (extract_subvector (v16f32 VR512:$src), (iPTR 0))), + (v4f32 (EXTRACT_SUBREG (v16f32 VR512:$src), sub_xmm))>; + + +// A 128-bit subvector insert to the first 512-bit vector position +// is a subregister copy that needs no instruction. +def : Pat<(insert_subvector undef, (v2i64 VR128X:$src), (iPTR 0)), + (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), + (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), + sub_ymm)>; +def : Pat<(insert_subvector undef, (v2f64 VR128X:$src), (iPTR 0)), + (INSERT_SUBREG (v8f64 (IMPLICIT_DEF)), + (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), + sub_ymm)>; +def : Pat<(insert_subvector undef, (v4i32 VR128X:$src), (iPTR 0)), + (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), + (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), + sub_ymm)>; +def : Pat<(insert_subvector undef, (v4f32 VR128X:$src), (iPTR 0)), + (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), + (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), + sub_ymm)>; + +def : Pat<(insert_subvector undef, (v4i64 VR256X:$src), (iPTR 0)), + (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)>; +def : Pat<(insert_subvector undef, (v4f64 VR256X:$src), (iPTR 0)), + (INSERT_SUBREG (v8f64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)>; +def : Pat<(insert_subvector undef, (v8i32 VR256X:$src), (iPTR 0)), + (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)>; +def : Pat<(insert_subvector undef, (v8f32 VR256X:$src), (iPTR 0)), + (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)>; + +// vextractps - extract 32 bits from XMM +def VEXTRACTPSzrr : AVX512AIi8<0x17, MRMDestReg, (outs GR32:$dst), + (ins VR128X:$src1, u32u8imm:$src2), + "vextractps{z}\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set GR32:$dst, (extractelt (bc_v4i32 (v4f32 VR128X:$src1)), imm:$src2))]>, + EVEX; + +def VEXTRACTPSzmr : AVX512AIi8<0x17, MRMDestMem, (outs), + (ins f32mem:$dst, VR128X:$src1, u32u8imm:$src2), + "vextractps{z}\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(store (extractelt (bc_v4i32 (v4f32 VR128X:$src1)), imm:$src2), + addr:$dst)]>, EVEX; + +//===---------------------------------------------------------------------===// +// AVX-512 BROADCAST +//--- +multiclass avx512_fp_broadcast<bits<8> opc, string OpcodeStr, + RegisterClass DestRC, + RegisterClass SrcRC, X86MemOperand x86memop> { + def rr : AVX5128I<opc, MRMSrcReg, (outs DestRC:$dst), (ins SrcRC:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + []>, EVEX; + def rm : AVX5128I<opc, MRMSrcMem, (outs DestRC:$dst), (ins x86memop:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),[]>, EVEX; +} +let ExeDomain = 
SSEPackedSingle in { + defm VBROADCASTSSZ : avx512_fp_broadcast<0x18, "vbroadcastss{z}", VR512, + VR128X, f32mem>, + EVEX_V512, EVEX_CD8<32, CD8VT1>; +} + +let ExeDomain = SSEPackedDouble in { + defm VBROADCASTSDZ : avx512_fp_broadcast<0x19, "vbroadcastsd{z}", VR512, + VR128X, f64mem>, + EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>; +} + +def : Pat<(v16f32 (X86VBroadcast (loadf32 addr:$src))), + (VBROADCASTSSZrm addr:$src)>; +def : Pat<(v8f64 (X86VBroadcast (loadf64 addr:$src))), + (VBROADCASTSDZrm addr:$src)>; + +def : Pat<(int_x86_avx512_vbroadcast_ss_512 addr:$src), + (VBROADCASTSSZrm addr:$src)>; +def : Pat<(int_x86_avx512_vbroadcast_sd_512 addr:$src), + (VBROADCASTSDZrm addr:$src)>; + +multiclass avx512_int_broadcast_reg<bits<8> opc, string OpcodeStr, + RegisterClass SrcRC, RegisterClass KRC> { + def Zrr : AVX5128I<opc, MRMSrcReg, (outs VR512:$dst), (ins SrcRC:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + []>, EVEX, EVEX_V512; + def Zkrr : AVX5128I<opc, MRMSrcReg, (outs VR512:$dst), + (ins KRC:$mask, SrcRC:$src), + !strconcat(OpcodeStr, + "\t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}"), + []>, EVEX, EVEX_V512, EVEX_KZ; +} + +defm VPBROADCASTDr : avx512_int_broadcast_reg<0x7C, "vpbroadcastd", GR32, VK16WM>; +defm VPBROADCASTQr : avx512_int_broadcast_reg<0x7C, "vpbroadcastq", GR64, VK8WM>, + VEX_W; + +def : Pat <(v16i32 (X86vzext VK16WM:$mask)), + (VPBROADCASTDrZkrr VK16WM:$mask, (i32 (MOV32ri 0x1)))>; + +def : Pat <(v8i64 (X86vzext VK8WM:$mask)), + (VPBROADCASTQrZkrr VK8WM:$mask, (i64 (MOV64ri 0x1)))>; + +def : Pat<(v16i32 (X86VBroadcast (i32 GR32:$src))), + (VPBROADCASTDrZrr GR32:$src)>; +def : Pat<(v8i64 (X86VBroadcast (i64 GR64:$src))), + (VPBROADCASTQrZrr GR64:$src)>; +def : Pat<(v8i64 (X86VBroadcastm VK8WM:$mask, (i64 GR64:$src))), + (VPBROADCASTQrZkrr VK8WM:$mask, GR64:$src)>; + +def : Pat<(v16i32 (int_x86_avx512_pbroadcastd_i32_512 (i32 GR32:$src))), + (VPBROADCASTDrZrr GR32:$src)>; +def : Pat<(v8i64 (int_x86_avx512_pbroadcastq_i64_512 (i64 GR64:$src))), + (VPBROADCASTQrZrr GR64:$src)>; + +multiclass avx512_int_broadcast_rm<bits<8> opc, string OpcodeStr, + X86MemOperand x86memop, PatFrag ld_frag, + RegisterClass DstRC, ValueType OpVT, ValueType SrcVT, + RegisterClass KRC> { + def rr : AVX5128I<opc, MRMSrcReg, (outs DstRC:$dst), (ins VR128X:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set DstRC:$dst, + (OpVT (X86VBroadcast (SrcVT VR128X:$src))))]>, EVEX; + def krr : AVX5128I<opc, MRMSrcReg, (outs DstRC:$dst), (ins KRC:$mask, + VR128X:$src), + !strconcat(OpcodeStr, + "\t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"), + [(set DstRC:$dst, + (OpVT (X86VBroadcastm KRC:$mask, (SrcVT VR128X:$src))))]>, + EVEX, EVEX_KZ; + let mayLoad = 1 in { + def rm : AVX5128I<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set DstRC:$dst, + (OpVT (X86VBroadcast (ld_frag addr:$src))))]>, EVEX; + def krm : AVX5128I<opc, MRMSrcMem, (outs DstRC:$dst), (ins KRC:$mask, + x86memop:$src), + !strconcat(OpcodeStr, + "\t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"), + [(set DstRC:$dst, (OpVT (X86VBroadcastm KRC:$mask, + (ld_frag addr:$src))))]>, EVEX, EVEX_KZ; + } +} + +defm VPBROADCASTDZ : avx512_int_broadcast_rm<0x58, "vpbroadcastd", i32mem, + loadi32, VR512, v16i32, v4i32, VK16WM>, + EVEX_V512, EVEX_CD8<32, CD8VT1>; +defm VPBROADCASTQZ : avx512_int_broadcast_rm<0x59, "vpbroadcastq", i64mem, + loadi64, VR512, v8i64, v2i64, VK8WM>, EVEX_V512, VEX_W, + EVEX_CD8<64, CD8VT1>; + +def : 
Pat<(v16i32 (int_x86_avx512_pbroadcastd_512 (v4i32 VR128X:$src))), + (VPBROADCASTDZrr VR128X:$src)>; +def : Pat<(v8i64 (int_x86_avx512_pbroadcastq_512 (v2i64 VR128X:$src))), + (VPBROADCASTQZrr VR128X:$src)>; + +def : Pat<(v16f32 (X86VBroadcast (v4f32 VR128X:$src))), + (VBROADCASTSSZrr VR128X:$src)>; +def : Pat<(v8f64 (X86VBroadcast (v2f64 VR128X:$src))), + (VBROADCASTSDZrr VR128X:$src)>; + +def : Pat<(v16f32 (int_x86_avx512_vbroadcast_ss_ps_512 (v4f32 VR128X:$src))), + (VBROADCASTSSZrr VR128X:$src)>; +def : Pat<(v8f64 (int_x86_avx512_vbroadcast_sd_pd_512 (v2f64 VR128X:$src))), + (VBROADCASTSDZrr VR128X:$src)>; + +// Provide fallback in case the load node that is used in the patterns above +// is used by additional users, which prevents the pattern selection. +def : Pat<(v16f32 (X86VBroadcast FR32X:$src)), + (VBROADCASTSSZrr (COPY_TO_REGCLASS FR32X:$src, VR128X))>; +def : Pat<(v8f64 (X86VBroadcast FR64X:$src)), + (VBROADCASTSDZrr (COPY_TO_REGCLASS FR64X:$src, VR128X))>; + + +let Predicates = [HasAVX512] in { +def : Pat<(v8i32 (X86VBroadcastm (v8i1 VK8WM:$mask), (loadi32 addr:$src))), + (EXTRACT_SUBREG + (v16i32 (VPBROADCASTDZkrm (COPY_TO_REGCLASS VK8WM:$mask, VK16WM), + addr:$src)), sub_ymm)>; +} +//===----------------------------------------------------------------------===// +// AVX-512 BROADCAST MASK TO VECTOR REGISTER +//--- + +multiclass avx512_mask_broadcast<bits<8> opc, string OpcodeStr, + RegisterClass DstRC, RegisterClass KRC, + ValueType OpVT, ValueType SrcVT> { +def rr : AVX512XS8I<opc, MRMDestReg, (outs DstRC:$dst), (ins KRC:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + []>, EVEX; +} + +defm VPBROADCASTMW2D : avx512_mask_broadcast<0x3A, "vpbroadcastmw2d", VR512, + VK16, v16i32, v16i1>, EVEX_V512; +defm VPBROADCASTMB2Q : avx512_mask_broadcast<0x2A, "vpbroadcastmb2q", VR512, + VK8, v8i64, v8i1>, EVEX_V512, VEX_W; + +//===----------------------------------------------------------------------===// +// AVX-512 - VPERM +// +// -- immediate form -- +multiclass avx512_perm_imm<bits<8> opc, string OpcodeStr, RegisterClass RC, + SDNode OpNode, PatFrag mem_frag, + X86MemOperand x86memop, ValueType OpVT> { + def ri : AVX512AIi8<opc, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, i8imm:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set RC:$dst, + (OpVT (OpNode RC:$src1, (i8 imm:$src2))))]>, + EVEX; + def mi : AVX512AIi8<opc, MRMSrcMem, (outs RC:$dst), + (ins x86memop:$src1, i8imm:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set RC:$dst, + (OpVT (OpNode (mem_frag addr:$src1), + (i8 imm:$src2))))]>, EVEX; +} + +defm VPERMQZ : avx512_perm_imm<0x00, "vpermq", VR512, X86VPermi, memopv8i64, + i512mem, v8i64>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; +let ExeDomain = SSEPackedDouble in +defm VPERMPDZ : avx512_perm_imm<0x01, "vpermpd", VR512, X86VPermi, memopv8f64, + f512mem, v8f64>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; + +// -- VPERM - register form -- +multiclass avx512_perm<bits<8> opc, string OpcodeStr, RegisterClass RC, + PatFrag mem_frag, X86MemOperand x86memop, ValueType OpVT> { + + def rr : AVX5128I<opc, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set RC:$dst, + (OpVT (X86VPermv RC:$src1, RC:$src2)))]>, EVEX_4V; + + def rm : AVX5128I<opc, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, x86memop:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set RC:$dst, + (OpVT (X86VPermv RC:$src1, 
(mem_frag addr:$src2))))]>, + EVEX_4V; +} + +defm VPERMDZ : avx512_perm<0x36, "vpermd", VR512, memopv16i32, i512mem, + v16i32>, EVEX_V512, EVEX_CD8<32, CD8VF>; +defm VPERMQZ : avx512_perm<0x36, "vpermq", VR512, memopv8i64, i512mem, + v8i64>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; +let ExeDomain = SSEPackedSingle in +defm VPERMPSZ : avx512_perm<0x16, "vpermps", VR512, memopv16f32, f512mem, + v16f32>, EVEX_V512, EVEX_CD8<32, CD8VF>; +let ExeDomain = SSEPackedDouble in +defm VPERMPDZ : avx512_perm<0x16, "vpermpd", VR512, memopv8f64, f512mem, + v8f64>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; + +// -- VPERM2I - 3 source operands form -- +multiclass avx512_perm_3src<bits<8> opc, string OpcodeStr, RegisterClass RC, + PatFrag mem_frag, X86MemOperand x86memop, + ValueType OpVT> { +let Constraints = "$src1 = $dst" in { + def rr : AVX5128I<opc, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2, RC:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + [(set RC:$dst, + (OpVT (X86VPermv3 RC:$src1, RC:$src2, RC:$src3)))]>, + EVEX_4V; + + def rm : AVX5128I<opc, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, RC:$src2, x86memop:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + [(set RC:$dst, + (OpVT (X86VPermv3 RC:$src1, RC:$src2, + (mem_frag addr:$src3))))]>, EVEX_4V; + } +} +defm VPERMI2D : avx512_perm_3src<0x76, "vpermi2d", VR512, memopv16i32, i512mem, + v16i32>, EVEX_V512, EVEX_CD8<32, CD8VF>; +defm VPERMI2Q : avx512_perm_3src<0x76, "vpermi2q", VR512, memopv8i64, i512mem, + v8i64>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; +defm VPERMI2PS : avx512_perm_3src<0x77, "vpermi2ps", VR512, memopv16f32, i512mem, + v16f32>, EVEX_V512, EVEX_CD8<32, CD8VF>; +defm VPERMI2PD : avx512_perm_3src<0x77, "vpermi2pd", VR512, memopv8f64, i512mem, + v8f64>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; + +//===----------------------------------------------------------------------===// +// AVX-512 - BLEND using mask +// +multiclass avx512_blendmask<bits<8> opc, string OpcodeStr, Intrinsic Int, + RegisterClass KRC, RegisterClass RC, + X86MemOperand x86memop, PatFrag mem_frag, + SDNode OpNode, ValueType vt> { + def rr : AVX5128I<opc, MRMSrcReg, (outs RC:$dst), + (ins KRC:$mask, RC:$src1, RC:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, ${dst} {${mask}}|${dst} {${mask}}, $src1, $src2}"), + [(set RC:$dst, (OpNode KRC:$mask, (vt RC:$src2), + (vt RC:$src1)))]>, EVEX_4V, EVEX_K; + def rr_Int : AVX5128I<opc, MRMSrcReg, (outs RC:$dst), + (ins KRC:$mask, RC:$src1, RC:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, ${dst} {${mask}}|${dst} {${mask}}, $src1, $src2}"), + [(set RC:$dst, (Int KRC:$mask, (vt RC:$src2), + (vt RC:$src1)))]>, EVEX_4V, EVEX_K; + + let mayLoad = 1 in { + def rm : AVX5128I<opc, MRMSrcMem, (outs RC:$dst), + (ins KRC:$mask, RC:$src1, x86memop:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $mask, $dst|$dst, $mask, $src1, $src2}"), + []>, + EVEX_4V, EVEX_K; + + def rm_Int : AVX5128I<opc, MRMSrcMem, (outs RC:$dst), + (ins KRC:$mask, RC:$src1, x86memop:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $mask, $dst|$dst, $mask, $src1, $src2}"), + [(set RC:$dst, (Int KRC:$mask, (vt RC:$src1), + (mem_frag addr:$src2)))]>, + EVEX_4V, EVEX_K; + } +} + +let ExeDomain = SSEPackedSingle in +defm VBLENDMPSZ : avx512_blendmask<0x65, "vblendmps", + int_x86_avx512_mskblend_ps_512, + VK16WM, VR512, f512mem, + memopv16f32, vselect, v16f32>, + EVEX_CD8<32, CD8VF>, EVEX_V512; +let ExeDomain = SSEPackedDouble in +defm VBLENDMPDZ : avx512_blendmask<0x65, "vblendmpd", + 
int_x86_avx512_mskblend_pd_512, + VK8WM, VR512, f512mem, + memopv8f64, vselect, v8f64>, + VEX_W, EVEX_CD8<64, CD8VF>, EVEX_V512; + +defm VPBLENDMDZ : avx512_blendmask<0x64, "vpblendmd", + int_x86_avx512_mskblend_d_512, + VK16WM, VR512, f512mem, + memopv16i32, vselect, v16i32>, + EVEX_CD8<32, CD8VF>, EVEX_V512; + +defm VPBLENDMQZ : avx512_blendmask<0x64, "vpblendmq", + int_x86_avx512_mskblend_q_512, + VK8WM, VR512, f512mem, + memopv8i64, vselect, v8i64>, + VEX_W, EVEX_CD8<64, CD8VF>, EVEX_V512; + +let Predicates = [HasAVX512] in { +def : Pat<(v8f32 (vselect (v8i1 VK8WM:$mask), (v8f32 VR256X:$src1), + (v8f32 VR256X:$src2))), + (EXTRACT_SUBREG + (v16f32 (VBLENDMPSZrr (COPY_TO_REGCLASS VK8WM:$mask, VK16WM), + (v16f32 (SUBREG_TO_REG (i32 0), VR256X:$src2, sub_ymm)), + (v16f32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)))), sub_ymm)>; + +def : Pat<(v8i32 (vselect (v8i1 VK8WM:$mask), (v8i32 VR256X:$src1), + (v8i32 VR256X:$src2))), + (EXTRACT_SUBREG + (v16i32 (VPBLENDMDZrr (COPY_TO_REGCLASS VK8WM:$mask, VK16WM), + (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src2, sub_ymm)), + (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)))), sub_ymm)>; +} + +multiclass avx512_icmp_packed<bits<8> opc, string OpcodeStr, RegisterClass KRC, + RegisterClass RC, X86MemOperand x86memop, PatFrag memop_frag, + SDNode OpNode, ValueType vt> { + def rr : AVX512BI<opc, MRMSrcReg, + (outs KRC:$dst), (ins RC:$src1, RC:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set KRC:$dst, (OpNode (vt RC:$src1), (vt RC:$src2)))], + IIC_SSE_ALU_F32P_RR>, EVEX_4V; + def rm : AVX512BI<opc, MRMSrcMem, + (outs KRC:$dst), (ins RC:$src1, x86memop:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set KRC:$dst, (OpNode (vt RC:$src1), (memop_frag addr:$src2)))], + IIC_SSE_ALU_F32P_RM>, EVEX_4V; +} + +defm VPCMPEQDZ : avx512_icmp_packed<0x76, "vpcmpeqd", VK16, VR512, i512mem, + memopv16i32, X86pcmpeqm, v16i32>, EVEX_V512; +defm VPCMPEQQZ : avx512_icmp_packed<0x29, "vpcmpeqq", VK8, VR512, i512mem, + memopv8i64, X86pcmpeqm, v8i64>, T8, EVEX_V512, VEX_W; + +defm VPCMPGTDZ : avx512_icmp_packed<0x66, "vpcmpgtd", VK16, VR512, i512mem, + memopv16i32, X86pcmpgtm, v16i32>, EVEX_V512; +defm VPCMPGTQZ : avx512_icmp_packed<0x37, "vpcmpgtq", VK8, VR512, i512mem, + memopv8i64, X86pcmpgtm, v8i64>, T8, EVEX_V512, VEX_W; + +def : Pat<(v8i1 (X86pcmpgtm (v8i32 VR256X:$src1), (v8i32 VR256X:$src2))), + (COPY_TO_REGCLASS (VPCMPGTDZrr + (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)), + (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src2, sub_ymm))), VK8)>; + +def : Pat<(v8i1 (X86pcmpeqm (v8i32 VR256X:$src1), (v8i32 VR256X:$src2))), + (COPY_TO_REGCLASS (VPCMPEQDZrr + (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)), + (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src2, sub_ymm))), VK8)>; + +multiclass avx512_icmp_cc<bits<8> opc, RegisterClass KRC, + RegisterClass RC, X86MemOperand x86memop, PatFrag memop_frag, + SDNode OpNode, ValueType vt, Operand CC, string asm, + string asm_alt> { + def rri : AVX512AIi8<opc, MRMSrcReg, + (outs KRC:$dst), (ins RC:$src1, RC:$src2, CC:$cc), asm, + [(set KRC:$dst, (OpNode (vt RC:$src1), (vt RC:$src2), imm:$cc))], + IIC_SSE_ALU_F32P_RR>, EVEX_4V; + def rmi : AVX512AIi8<opc, MRMSrcMem, + (outs KRC:$dst), (ins RC:$src1, x86memop:$src2, CC:$cc), asm, + [(set KRC:$dst, (OpNode (vt RC:$src1), (memop_frag addr:$src2), + imm:$cc))], IIC_SSE_ALU_F32P_RM>, EVEX_4V; + // Accept explicit immediate argument form instead of comparison code. 
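For a sense of how the two spellings reach user code: with the standard AVX-512F intrinsics, the predicate-suffixed mnemonic and the explicit-immediate form of the integer compare are exposed side by side. A small C sketch (the helper name is invented; illustration only, not part of the patch):

    #include <immintrin.h>

    /* Compare 16 packed i32 lanes into a k-register.  The "named" intrinsic
       corresponds to vpcmpeqd; the generic one passes the predicate as an
       immediate, which is the spelling the *_alt patterns below accept. */
    __mmask16 eq_mask(__m512i a, __m512i b) {
      __mmask16 named = _mm512_cmpeq_epi32_mask(a, b);
      __mmask16 byimm = _mm512_cmp_epi32_mask(a, b, _MM_CMPINT_EQ);
      return named & byimm;   /* the two masks are identical */
    }
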
+ let neverHasSideEffects = 1 in { + def rri_alt : AVX512AIi8<opc, MRMSrcReg, + (outs RC:$dst), (ins RC:$src1, RC:$src2, i8imm:$cc), + asm_alt, [], IIC_SSE_ALU_F32P_RR>, EVEX_4V; + def rmi_alt : AVX512AIi8<opc, MRMSrcMem, + (outs RC:$dst), (ins RC:$src1, x86memop:$src2, i8imm:$cc), + asm_alt, [], IIC_SSE_ALU_F32P_RM>, EVEX_4V; + } +} + +defm VPCMPDZ : avx512_icmp_cc<0x1F, VK16, VR512, i512mem, memopv16i32, + X86cmpm, v16i32, AVXCC, + "vpcmp${cc}d\t{$src2, $src1, $dst|$dst, $src1, $src2}", + "vpcmpd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}">, + EVEX_V512, EVEX_CD8<32, CD8VF>; +defm VPCMPUDZ : avx512_icmp_cc<0x1E, VK16, VR512, i512mem, memopv16i32, + X86cmpmu, v16i32, AVXCC, + "vpcmp${cc}ud\t{$src2, $src1, $dst|$dst, $src1, $src2}", + "vpcmpud\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}">, + EVEX_V512, EVEX_CD8<32, CD8VF>; + +defm VPCMPQZ : avx512_icmp_cc<0x1F, VK8, VR512, i512mem, memopv8i64, + X86cmpm, v8i64, AVXCC, + "vpcmp${cc}q\t{$src2, $src1, $dst|$dst, $src1, $src2}", + "vpcmpq\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}">, + VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>; +defm VPCMPUQZ : avx512_icmp_cc<0x1E, VK8, VR512, i512mem, memopv8i64, + X86cmpmu, v8i64, AVXCC, + "vpcmp${cc}uq\t{$src2, $src1, $dst|$dst, $src1, $src2}", + "vpcmpuq\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}">, + VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>; + +// avx512_cmp_packed - sse 1 & 2 compare packed instructions +multiclass avx512_cmp_packed<RegisterClass KRC, RegisterClass RC, + X86MemOperand x86memop, Operand CC, + SDNode OpNode, ValueType vt, string asm, + string asm_alt, Domain d> { + def rri : AVX512PIi8<0xC2, MRMSrcReg, + (outs KRC:$dst), (ins RC:$src1, RC:$src2, CC:$cc), asm, + [(set KRC:$dst, (OpNode (vt RC:$src1), (vt RC:$src2), imm:$cc))], d>; + def rmi : AVX512PIi8<0xC2, MRMSrcMem, + (outs KRC:$dst), (ins RC:$src1, x86memop:$src2, CC:$cc), asm, + [(set KRC:$dst, + (OpNode (vt RC:$src1), (memop addr:$src2), imm:$cc))], d>; + + // Accept explicit immediate argument form instead of comparison code. 
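The same split applies to the packed FP compares defined next, and the k-mask they produce is what the vblendm definitions earlier in this section consume. A hedged C sketch of that pairing, using the standard AVX-512F intrinsics (the helper name is invented for illustration):

    #include <immintrin.h>

    /* vcmpps with an explicit predicate immediate yields a 16-bit mask;
       vblendmps then picks, per lane, between the two sources. */
    __m512 clamp_to_limit(__m512 x, __m512 limit) {
      __mmask16 lt = _mm512_cmp_ps_mask(x, limit, _CMP_LT_OS);  /* x < limit       */
      return _mm512_mask_blend_ps(lt, limit, x);                /* lt ? x : limit  */
    }
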
+ let neverHasSideEffects = 1 in { + def rri_alt : AVX512PIi8<0xC2, MRMSrcReg, + (outs RC:$dst), (ins RC:$src1, RC:$src2, i8imm:$cc), + asm_alt, [], d>; + def rmi_alt : AVX512PIi8<0xC2, MRMSrcMem, + (outs RC:$dst), (ins RC:$src1, x86memop:$src2, i8imm:$cc), + asm_alt, [], d>; + } +} + +defm VCMPPSZ : avx512_cmp_packed<VK16, VR512, f512mem, AVXCC, X86cmpm, v16f32, + "vcmp${cc}ps\t{$src2, $src1, $dst|$dst, $src1, $src2}", + "vcmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", + SSEPackedSingle>, EVEX_4V, EVEX_V512, EVEX_CD8<32, CD8VF>; +defm VCMPPDZ : avx512_cmp_packed<VK8, VR512, f512mem, AVXCC, X86cmpm, v8f64, + "vcmp${cc}pd\t{$src2, $src1, $dst|$dst, $src1, $src2}", + "vcmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", + SSEPackedDouble>, OpSize, EVEX_4V, VEX_W, EVEX_V512, + EVEX_CD8<64, CD8VF>; + +def : Pat<(v8i1 (X86cmpm (v8f32 VR256X:$src1), (v8f32 VR256X:$src2), imm:$cc)), + (COPY_TO_REGCLASS (VCMPPSZrri + (v16f32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)), + (v16f32 (SUBREG_TO_REG (i32 0), VR256X:$src2, sub_ymm)), + imm:$cc), VK8)>; +def : Pat<(v8i1 (X86cmpm (v8i32 VR256X:$src1), (v8i32 VR256X:$src2), imm:$cc)), + (COPY_TO_REGCLASS (VPCMPDZrri + (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)), + (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src2, sub_ymm)), + imm:$cc), VK8)>; +def : Pat<(v8i1 (X86cmpmu (v8i32 VR256X:$src1), (v8i32 VR256X:$src2), imm:$cc)), + (COPY_TO_REGCLASS (VPCMPUDZrri + (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)), + (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src2, sub_ymm)), + imm:$cc), VK8)>; + +// Mask register copy, including +// - copy between mask registers +// - load/store mask registers +// - copy from GPR to mask register and vice versa +// +multiclass avx512_mask_mov<bits<8> opc_kk, bits<8> opc_km, bits<8> opc_mk, + string OpcodeStr, RegisterClass KRC, + ValueType vt, X86MemOperand x86memop> { + let neverHasSideEffects = 1 in { + def kk : I<opc_kk, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>; + let mayLoad = 1 in + def km : I<opc_km, MRMSrcMem, (outs KRC:$dst), (ins x86memop:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set KRC:$dst, (vt (load addr:$src)))]>; + let mayStore = 1 in + def mk : I<opc_mk, MRMDestMem, (outs), (ins x86memop:$dst, KRC:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>; + } +} + +multiclass avx512_mask_mov_gpr<bits<8> opc_kr, bits<8> opc_rk, + string OpcodeStr, + RegisterClass KRC, RegisterClass GRC> { + let neverHasSideEffects = 1 in { + def kr : I<opc_kr, MRMSrcReg, (outs KRC:$dst), (ins GRC:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>; + def rk : I<opc_rk, MRMSrcReg, (outs GRC:$dst), (ins KRC:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>; + } +} + +let Predicates = [HasAVX512] in { + defm KMOVW : avx512_mask_mov<0x90, 0x90, 0x91, "kmovw", VK16, v16i1, i16mem>, + VEX, TB; + defm KMOVW : avx512_mask_mov_gpr<0x92, 0x93, "kmovw", VK16, GR32>, + VEX, TB; +} + +let Predicates = [HasAVX512] in { + // GR16 from/to 16-bit mask + def : Pat<(v16i1 (bitconvert (i16 GR16:$src))), + (KMOVWkr (SUBREG_TO_REG (i32 0), GR16:$src, sub_16bit))>; + def : Pat<(i16 (bitconvert (v16i1 VK16:$src))), + (EXTRACT_SUBREG (KMOVWrk VK16:$src), sub_16bit)>; + + // Store kreg in memory + def : Pat<(store (v16i1 VK16:$src), addr:$dst), + (KMOVWmk addr:$dst, VK16:$src)>; + + def : Pat<(store (v8i1 VK8:$src), addr:$dst), + (KMOVWmk addr:$dst, (v16i1 (COPY_TO_REGCLASS VK8:$src, VK16)))>; +} +// With 
AVX-512 only, 8-bit mask is promoted to 16-bit mask. +let Predicates = [HasAVX512] in { + // GR from/to 8-bit mask without native support + def : Pat<(v8i1 (bitconvert (i8 GR8:$src))), + (COPY_TO_REGCLASS + (KMOVWkr (SUBREG_TO_REG (i32 0), GR8:$src, sub_8bit)), + VK8)>; + def : Pat<(i8 (bitconvert (v8i1 VK8:$src))), + (EXTRACT_SUBREG + (KMOVWrk (COPY_TO_REGCLASS VK8:$src, VK16)), + sub_8bit)>; +} + +// Mask unary operation +// - KNOT +multiclass avx512_mask_unop<bits<8> opc, string OpcodeStr, + RegisterClass KRC, SDPatternOperator OpNode> { + let Predicates = [HasAVX512] in + def rr : I<opc, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set KRC:$dst, (OpNode KRC:$src))]>; +} + +multiclass avx512_mask_unop_w<bits<8> opc, string OpcodeStr, + SDPatternOperator OpNode> { + defm W : avx512_mask_unop<opc, !strconcat(OpcodeStr, "w"), VK16, OpNode>, + VEX, TB; +} + +defm KNOT : avx512_mask_unop_w<0x44, "knot", not>; + +def : Pat<(xor VK16:$src1, (v16i1 immAllOnesV)), (KNOTWrr VK16:$src1)>; +def : Pat<(xor VK8:$src1, (v8i1 immAllOnesV)), + (COPY_TO_REGCLASS (KNOTWrr (COPY_TO_REGCLASS VK8:$src1, VK16)), VK8)>; + +// With AVX-512, 8-bit mask is promoted to 16-bit mask. +def : Pat<(not VK8:$src), + (COPY_TO_REGCLASS + (KNOTWrr (COPY_TO_REGCLASS VK8:$src, VK16)), VK8)>; + +// Mask binary operation +// - KADD, KAND, KANDN, KOR, KXNOR, KXOR +multiclass avx512_mask_binop<bits<8> opc, string OpcodeStr, + RegisterClass KRC, SDPatternOperator OpNode> { + let Predicates = [HasAVX512] in + def rr : I<opc, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src1, KRC:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set KRC:$dst, (OpNode KRC:$src1, KRC:$src2))]>; +} + +multiclass avx512_mask_binop_w<bits<8> opc, string OpcodeStr, + SDPatternOperator OpNode> { + defm W : avx512_mask_binop<opc, !strconcat(OpcodeStr, "w"), VK16, OpNode>, + VEX_4V, VEX_L, TB; +} + +def andn : PatFrag<(ops node:$i0, node:$i1), (and (not node:$i0), node:$i1)>; +def xnor : PatFrag<(ops node:$i0, node:$i1), (not (xor node:$i0, node:$i1))>; + +let isCommutable = 1 in { + defm KADD : avx512_mask_binop_w<0x4a, "kadd", add>; + defm KAND : avx512_mask_binop_w<0x41, "kand", and>; + let isCommutable = 0 in + defm KANDN : avx512_mask_binop_w<0x42, "kandn", andn>; + defm KOR : avx512_mask_binop_w<0x45, "kor", or>; + defm KXNOR : avx512_mask_binop_w<0x46, "kxnor", xnor>; + defm KXOR : avx512_mask_binop_w<0x47, "kxor", xor>; +} + +multiclass avx512_mask_binop_int<string IntName, string InstName> { + let Predicates = [HasAVX512] in + def : Pat<(!cast<Intrinsic>("int_x86_"##IntName##"_v16i1") + VK16:$src1, VK16:$src2), + (!cast<Instruction>(InstName##"Wrr") VK16:$src1, VK16:$src2)>; +} + +defm : avx512_mask_binop_int<"kadd", "KADD">; +defm : avx512_mask_binop_int<"kand", "KAND">; +defm : avx512_mask_binop_int<"kandn", "KANDN">; +defm : avx512_mask_binop_int<"kor", "KOR">; +defm : avx512_mask_binop_int<"kxnor", "KXNOR">; +defm : avx512_mask_binop_int<"kxor", "KXOR">; +// With AVX-512, 8-bit mask is promoted to 16-bit mask. 
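The word-sized mask logic defined above (kandw, korw, kxorw, knotw) is what compare results get combined with; the patterns that follow widen v8i1 operands so the same instructions cover 8-bit masks too. A short C sketch with the standard AVX-512F mask intrinsics (the helper name is invented; not part of the patch):

    #include <immintrin.h>

    /* Keep lanes where a == b and c is non-zero; the mask arithmetic
       stays entirely in k-registers (kandw / knotw). */
    __mmask16 lanes_to_keep(__m512i a, __m512i b, __m512i c) {
      __mmask16 eq   = _mm512_cmpeq_epi32_mask(a, b);
      __mmask16 zero = _mm512_cmpeq_epi32_mask(c, _mm512_setzero_si512());
      return _mm512_kand(eq, _mm512_knot(zero));
    }
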
+multiclass avx512_binop_pat<SDPatternOperator OpNode, Instruction Inst> { + let Predicates = [HasAVX512] in + def : Pat<(OpNode VK8:$src1, VK8:$src2), + (COPY_TO_REGCLASS + (Inst (COPY_TO_REGCLASS VK8:$src1, VK16), + (COPY_TO_REGCLASS VK8:$src2, VK16)), VK8)>; +} + +defm : avx512_binop_pat<and, KANDWrr>; +defm : avx512_binop_pat<andn, KANDNWrr>; +defm : avx512_binop_pat<or, KORWrr>; +defm : avx512_binop_pat<xnor, KXNORWrr>; +defm : avx512_binop_pat<xor, KXORWrr>; + +// Mask unpacking +multiclass avx512_mask_unpck<bits<8> opc, string OpcodeStr, + RegisterClass KRC1, RegisterClass KRC2> { + let Predicates = [HasAVX512] in + def rr : I<opc, MRMSrcReg, (outs KRC1:$dst), (ins KRC2:$src1, KRC2:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>; +} + +multiclass avx512_mask_unpck_bw<bits<8> opc, string OpcodeStr> { + defm BW : avx512_mask_unpck<opc, !strconcat(OpcodeStr, "bw"), VK16, VK8>, + VEX_4V, VEX_L, OpSize, TB; +} + +defm KUNPCK : avx512_mask_unpck_bw<0x4b, "kunpck">; + +multiclass avx512_mask_unpck_int<string IntName, string InstName> { + let Predicates = [HasAVX512] in + def : Pat<(!cast<Intrinsic>("int_x86_"##IntName##"_v16i1") + VK8:$src1, VK8:$src2), + (!cast<Instruction>(InstName##"BWrr") VK8:$src1, VK8:$src2)>; +} + +defm : avx512_mask_unpck_int<"kunpck", "KUNPCK">; +// Mask bit testing +multiclass avx512_mask_testop<bits<8> opc, string OpcodeStr, RegisterClass KRC, + SDNode OpNode> { + let Predicates = [HasAVX512], Defs = [EFLAGS] in + def rr : I<opc, MRMSrcReg, (outs), (ins KRC:$src1, KRC:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"), + [(set EFLAGS, (OpNode KRC:$src1, KRC:$src2))]>; +} + +multiclass avx512_mask_testop_w<bits<8> opc, string OpcodeStr, SDNode OpNode> { + defm W : avx512_mask_testop<opc, !strconcat(OpcodeStr, "w"), VK16, OpNode>, + VEX, TB; +} + +defm KORTEST : avx512_mask_testop_w<0x98, "kortest", X86kortest>; +defm KTEST : avx512_mask_testop_w<0x99, "ktest", X86ktest>; + +// Mask shift +multiclass avx512_mask_shiftop<bits<8> opc, string OpcodeStr, RegisterClass KRC, + SDNode OpNode> { + let Predicates = [HasAVX512] in + def ri : Ii8<opc, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src, i8imm:$imm), + !strconcat(OpcodeStr, + "\t{$imm, $src, $dst|$dst, $src, $imm}"), + [(set KRC:$dst, (OpNode KRC:$src, (i8 imm:$imm)))]>; +} + +multiclass avx512_mask_shiftop_w<bits<8> opc1, bits<8> opc2, string OpcodeStr, + SDNode OpNode> { + defm W : avx512_mask_shiftop<opc1, !strconcat(OpcodeStr, "w"), VK16, OpNode>, + VEX, OpSize, TA, VEX_W; +} + +defm KSHIFTL : avx512_mask_shiftop_w<0x32, 0x33, "kshiftl", shl>; +defm KSHIFTR : avx512_mask_shiftop_w<0x30, 0x31, "kshiftr", srl>; + +// Mask setting all 0s or 1s +multiclass avx512_mask_setop<RegisterClass KRC, ValueType VT, PatFrag Val> { + let Predicates = [HasAVX512] in + let isReMaterializable = 1, isAsCheapAsAMove = 1, isPseudo = 1 in + def #NAME# : I<0, Pseudo, (outs KRC:$dst), (ins), "", + [(set KRC:$dst, (VT Val))]>; +} + +multiclass avx512_mask_setop_w<PatFrag Val> { + defm B : avx512_mask_setop<VK8, v8i1, Val>; + defm W : avx512_mask_setop<VK16, v16i1, Val>; +} + +defm KSET0 : avx512_mask_setop_w<immAllZerosV>; +defm KSET1 : avx512_mask_setop_w<immAllOnesV>; + +// With AVX-512 only, 8-bit mask is promoted to 16-bit mask. 
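kortestw gives a cheap any-lane-set test on the masks produced above, and kshiftrw is what the extract_subvector pattern below uses to reach the upper eight bits of a 16-bit mask. A minimal C example of the kortest idiom, assuming the standard AVX-512F intrinsics (the helper name is invented for illustration):

    #include <immintrin.h>
    #include <stdbool.h>

    /* True if any of the 16 i32 lanes compare equal.
       _mm512_kortestz returns 1 exactly when (m | m) == 0. */
    bool any_lane_equal(__m512i a, __m512i b) {
      __mmask16 m = _mm512_cmpeq_epi32_mask(a, b);
      return !_mm512_kortestz(m, m);
    }
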
+let Predicates = [HasAVX512] in { + def : Pat<(v8i1 immAllZerosV), (COPY_TO_REGCLASS (KSET0W), VK8)>; + def : Pat<(v8i1 immAllOnesV), (COPY_TO_REGCLASS (KSET1W), VK8)>; +} +def : Pat<(v8i1 (extract_subvector (v16i1 VK16:$src), (iPTR 0))), + (v8i1 (COPY_TO_REGCLASS VK16:$src, VK8))>; + +def : Pat<(v16i1 (insert_subvector undef, (v8i1 VK8:$src), (iPTR 0))), + (v16i1 (COPY_TO_REGCLASS VK8:$src, VK16))>; + +def : Pat<(v8i1 (extract_subvector (v16i1 VK16:$src), (iPTR 8))), + (v8i1 (COPY_TO_REGCLASS (KSHIFTRWri VK16:$src, (i8 8)), VK8))>; + +//===----------------------------------------------------------------------===// +// AVX-512 - Aligned and unaligned load and store +// + +multiclass avx512_mov_packed<bits<8> opc, RegisterClass RC, RegisterClass KRC, + X86MemOperand x86memop, PatFrag ld_frag, + string asm, Domain d> { +let neverHasSideEffects = 1 in + def rr : AVX512PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src), + !strconcat(asm, "\t{$src, $dst|$dst, $src}"), [], d>, + EVEX; +let canFoldAsLoad = 1 in + def rm : AVX512PI<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), + !strconcat(asm, "\t{$src, $dst|$dst, $src}"), + [(set RC:$dst, (ld_frag addr:$src))], d>, EVEX; +let Constraints = "$src1 = $dst" in { + def rrk : AVX512PI<opc, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, KRC:$mask, RC:$src2), + !strconcat(asm, + "\t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"), [], d>, + EVEX, EVEX_K; + def rmk : AVX512PI<opc, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, KRC:$mask, x86memop:$src2), + !strconcat(asm, + "\t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"), + [], d>, EVEX, EVEX_K; +} +} + +defm VMOVAPSZ : avx512_mov_packed<0x28, VR512, VK16WM, f512mem, alignedloadv16f32, + "vmovaps", SSEPackedSingle>, + EVEX_V512, EVEX_CD8<32, CD8VF>; +defm VMOVAPDZ : avx512_mov_packed<0x28, VR512, VK8WM, f512mem, alignedloadv8f64, + "vmovapd", SSEPackedDouble>, + OpSize, EVEX_V512, VEX_W, + EVEX_CD8<64, CD8VF>; +defm VMOVUPSZ : avx512_mov_packed<0x10, VR512, VK16WM, f512mem, loadv16f32, + "vmovups", SSEPackedSingle>, + EVEX_V512, EVEX_CD8<32, CD8VF>; +defm VMOVUPDZ : avx512_mov_packed<0x10, VR512, VK8WM, f512mem, loadv8f64, + "vmovupd", SSEPackedDouble>, + OpSize, EVEX_V512, VEX_W, + EVEX_CD8<64, CD8VF>; +def VMOVAPSZmr : AVX512PI<0x29, MRMDestMem, (outs), (ins f512mem:$dst, VR512:$src), + "vmovaps\t{$src, $dst|$dst, $src}", + [(alignedstore512 (v16f32 VR512:$src), addr:$dst)], + SSEPackedSingle>, EVEX, EVEX_V512, EVEX_CD8<32, CD8VF>; +def VMOVAPDZmr : AVX512PI<0x29, MRMDestMem, (outs), (ins f512mem:$dst, VR512:$src), + "vmovapd\t{$src, $dst|$dst, $src}", + [(alignedstore512 (v8f64 VR512:$src), addr:$dst)], + SSEPackedDouble>, EVEX, EVEX_V512, + OpSize, VEX_W, EVEX_CD8<64, CD8VF>; +def VMOVUPSZmr : AVX512PI<0x11, MRMDestMem, (outs), (ins f512mem:$dst, VR512:$src), + "vmovups\t{$src, $dst|$dst, $src}", + [(store (v16f32 VR512:$src), addr:$dst)], + SSEPackedSingle>, EVEX, EVEX_V512, EVEX_CD8<32, CD8VF>; +def VMOVUPDZmr : AVX512PI<0x11, MRMDestMem, (outs), (ins f512mem:$dst, VR512:$src), + "vmovupd\t{$src, $dst|$dst, $src}", + [(store (v8f64 VR512:$src), addr:$dst)], + SSEPackedDouble>, EVEX, EVEX_V512, + OpSize, VEX_W, EVEX_CD8<64, CD8VF>; + +let neverHasSideEffects = 1 in { + def VMOVDQA32rr : AVX512BI<0x6F, MRMSrcReg, (outs VR512:$dst), + (ins VR512:$src), + "vmovdqa32\t{$src, $dst|$dst, $src}", []>, + EVEX, EVEX_V512; + def VMOVDQA64rr : AVX512BI<0x6F, MRMSrcReg, (outs VR512:$dst), + (ins VR512:$src), + "vmovdqa64\t{$src, $dst|$dst, $src}", []>, + EVEX, EVEX_V512, VEX_W; +let mayStore = 1 in { 
+ def VMOVDQA32mr : AVX512BI<0x7F, MRMDestMem, (outs), + (ins i512mem:$dst, VR512:$src), + "vmovdqa32\t{$src, $dst|$dst, $src}", []>, + EVEX, EVEX_V512, EVEX_CD8<32, CD8VF>; + def VMOVDQA64mr : AVX512BI<0x7F, MRMDestMem, (outs), + (ins i512mem:$dst, VR512:$src), + "vmovdqa64\t{$src, $dst|$dst, $src}", []>, + EVEX, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; +} +let mayLoad = 1 in { +def VMOVDQA32rm : AVX512BI<0x6F, MRMSrcMem, (outs VR512:$dst), + (ins i512mem:$src), + "vmovdqa32\t{$src, $dst|$dst, $src}", []>, + EVEX, EVEX_V512, EVEX_CD8<32, CD8VF>; +def VMOVDQA64rm : AVX512BI<0x6F, MRMSrcMem, (outs VR512:$dst), + (ins i512mem:$src), + "vmovdqa64\t{$src, $dst|$dst, $src}", []>, + EVEX, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; +} +} + +// 512-bit aligned load/store +def : Pat<(alignedloadv8i64 addr:$src), (VMOVDQA64rm addr:$src)>; +def : Pat<(alignedloadv16i32 addr:$src), (VMOVDQA32rm addr:$src)>; + +def : Pat<(alignedstore512 (v8i64 VR512:$src), addr:$dst), + (VMOVDQA64mr addr:$dst, VR512:$src)>; +def : Pat<(alignedstore512 (v16i32 VR512:$src), addr:$dst), + (VMOVDQA32mr addr:$dst, VR512:$src)>; + +multiclass avx512_mov_int<bits<8> load_opc, bits<8> store_opc, string asm, + RegisterClass RC, RegisterClass KRC, + PatFrag ld_frag, X86MemOperand x86memop> { +let neverHasSideEffects = 1 in + def rr : AVX512XSI<load_opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src), + !strconcat(asm, "\t{$src, $dst|$dst, $src}"), []>, EVEX; +let canFoldAsLoad = 1 in + def rm : AVX512XSI<load_opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), + !strconcat(asm, "\t{$src, $dst|$dst, $src}"), + [(set RC:$dst, (ld_frag addr:$src))]>, EVEX; +let mayStore = 1 in + def mr : AVX512XSI<store_opc, MRMDestMem, (outs), + (ins x86memop:$dst, VR512:$src), + !strconcat(asm, "\t{$src, $dst|$dst, $src}"), []>, EVEX; +let Constraints = "$src1 = $dst" in { + def rrk : AVX512XSI<load_opc, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, KRC:$mask, RC:$src2), + !strconcat(asm, + "\t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"), []>, + EVEX, EVEX_K; + def rmk : AVX512XSI<load_opc, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, KRC:$mask, x86memop:$src2), + !strconcat(asm, + "\t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"), + []>, EVEX, EVEX_K; +} +} + +defm VMOVDQU32 : avx512_mov_int<0x6F, 0x7F, "vmovdqu32", VR512, VK16WM, + memopv16i32, i512mem>, + EVEX_V512, EVEX_CD8<32, CD8VF>; +defm VMOVDQU64 : avx512_mov_int<0x6F, 0x7F, "vmovdqu64", VR512, VK8WM, + memopv8i64, i512mem>, + EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; + +// 512-bit unaligned load/store +def : Pat<(loadv8i64 addr:$src), (VMOVDQU64rm addr:$src)>; +def : Pat<(loadv16i32 addr:$src), (VMOVDQU32rm addr:$src)>; + +def : Pat<(store (v8i64 VR512:$src), addr:$dst), + (VMOVDQU64mr addr:$dst, VR512:$src)>; +def : Pat<(store (v16i32 VR512:$src), addr:$dst), + (VMOVDQU32mr addr:$dst, VR512:$src)>; + +let AddedComplexity = 20 in { +def : Pat<(v16f32 (vselect VK16WM:$mask, (v16f32 VR512:$src1), + (v16f32 VR512:$src2))), + (VMOVUPSZrrk VR512:$src2, VK16WM:$mask, VR512:$src1)>; +def : Pat<(v8f64 (vselect VK8WM:$mask, (v8f64 VR512:$src1), + (v8f64 VR512:$src2))), + (VMOVUPDZrrk VR512:$src2, VK8WM:$mask, VR512:$src1)>; +def : Pat<(v16i32 (vselect VK16WM:$mask, (v16i32 VR512:$src1), + (v16i32 VR512:$src2))), + (VMOVDQU32rrk VR512:$src2, VK16WM:$mask, VR512:$src1)>; +def : Pat<(v8i64 (vselect VK8WM:$mask, (v8i64 VR512:$src1), + (v8i64 VR512:$src2))), + (VMOVDQU64rrk VR512:$src2, VK8WM:$mask, VR512:$src1)>; +} +// Move Int Doubleword to Packed Double Int +// +def VMOVDI2PDIZrr : AVX512SI<0x6E, 
MRMSrcReg, (outs VR128X:$dst), (ins GR32:$src), + "vmovd{z}\t{$src, $dst|$dst, $src}", + [(set VR128X:$dst, + (v4i32 (scalar_to_vector GR32:$src)))], IIC_SSE_MOVDQ>, + EVEX, VEX_LIG; +def VMOVDI2PDIZrm : AVX512SI<0x6E, MRMSrcMem, (outs VR128X:$dst), (ins i32mem:$src), + "vmovd{z}\t{$src, $dst|$dst, $src}", + [(set VR128X:$dst, + (v4i32 (scalar_to_vector (loadi32 addr:$src))))], + IIC_SSE_MOVDQ>, EVEX, VEX_LIG, EVEX_CD8<32, CD8VT1>; +def VMOV64toPQIZrr : AVX512SI<0x6E, MRMSrcReg, (outs VR128X:$dst), (ins GR64:$src), + "vmovq{z}\t{$src, $dst|$dst, $src}", + [(set VR128X:$dst, + (v2i64 (scalar_to_vector GR64:$src)))], + IIC_SSE_MOVDQ>, EVEX, VEX_W, VEX_LIG; +let isCodeGenOnly = 1 in { +def VMOV64toSDZrr : AVX512SI<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src), + "vmovq{z}\t{$src, $dst|$dst, $src}", + [(set FR64:$dst, (bitconvert GR64:$src))], + IIC_SSE_MOVDQ>, EVEX, VEX_W, Sched<[WriteMove]>; +def VMOVSDto64Zrr : AVX512SI<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src), + "vmovq{z}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, (bitconvert FR64:$src))], + IIC_SSE_MOVDQ>, EVEX, VEX_W, Sched<[WriteMove]>; +} +def VMOVSDto64Zmr : AVX512SI<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src), + "vmovq{z}\t{$src, $dst|$dst, $src}", + [(store (i64 (bitconvert FR64:$src)), addr:$dst)], + IIC_SSE_MOVDQ>, EVEX, VEX_W, Sched<[WriteStore]>, + EVEX_CD8<64, CD8VT1>; + +// Move Int Doubleword to Single Scalar +// +let isCodeGenOnly = 1 in { +def VMOVDI2SSZrr : AVX512SI<0x6E, MRMSrcReg, (outs FR32X:$dst), (ins GR32:$src), + "vmovd{z}\t{$src, $dst|$dst, $src}", + [(set FR32X:$dst, (bitconvert GR32:$src))], + IIC_SSE_MOVDQ>, EVEX, VEX_LIG; + +def VMOVDI2SSZrm : AVX512SI<0x6E, MRMSrcMem, (outs FR32X:$dst), (ins i32mem:$src), + "vmovd{z}\t{$src, $dst|$dst, $src}", + [(set FR32X:$dst, (bitconvert (loadi32 addr:$src)))], + IIC_SSE_MOVDQ>, EVEX, VEX_LIG, EVEX_CD8<32, CD8VT1>; +} + +// Move Packed Doubleword Int to Packed Double Int +// +def VMOVPDI2DIZrr : AVX512SI<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128X:$src), + "vmovd{z}\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (vector_extract (v4i32 VR128X:$src), + (iPTR 0)))], IIC_SSE_MOVD_ToGP>, + EVEX, VEX_LIG; +def VMOVPDI2DIZmr : AVX512SI<0x7E, MRMDestMem, (outs), + (ins i32mem:$dst, VR128X:$src), + "vmovd{z}\t{$src, $dst|$dst, $src}", + [(store (i32 (vector_extract (v4i32 VR128X:$src), + (iPTR 0))), addr:$dst)], IIC_SSE_MOVDQ>, + EVEX, VEX_LIG, EVEX_CD8<32, CD8VT1>; + +// Move Packed Doubleword Int first element to Doubleword Int +// +def VMOVPQIto64Zrr : I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128X:$src), + "vmovq{z}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, (extractelt (v2i64 VR128X:$src), + (iPTR 0)))], + IIC_SSE_MOVD_ToGP>, TB, OpSize, EVEX, VEX_LIG, VEX_W, + Requires<[HasAVX512, In64BitMode]>; + +def VMOVPQIto64Zmr : I<0xD6, MRMDestMem, (outs), + (ins i64mem:$dst, VR128X:$src), + "vmovq{z}\t{$src, $dst|$dst, $src}", + [(store (extractelt (v2i64 VR128X:$src), (iPTR 0)), + addr:$dst)], IIC_SSE_MOVDQ>, + EVEX, OpSize, VEX_LIG, VEX_W, TB, EVEX_CD8<64, CD8VT1>, + Sched<[WriteStore]>, Requires<[HasAVX512, In64BitMode]>; + +// Move Scalar Single to Double Int +// +let isCodeGenOnly = 1 in { +def VMOVSS2DIZrr : AVX512SI<0x7E, MRMDestReg, (outs GR32:$dst), + (ins FR32X:$src), + "vmovd{z}\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (bitconvert FR32X:$src))], + IIC_SSE_MOVD_ToGP>, EVEX, VEX_LIG; +def VMOVSS2DIZmr : AVX512SI<0x7E, MRMDestMem, (outs), + (ins i32mem:$dst, FR32X:$src), + "vmovd{z}\t{$src, $dst|$dst, $src}", + [(store (i32 
(bitconvert FR32X:$src)), addr:$dst)], + IIC_SSE_MOVDQ>, EVEX, VEX_LIG, EVEX_CD8<32, CD8VT1>; +} + +// Move Quadword Int to Packed Quadword Int +// +def VMOVQI2PQIZrm : AVX512SI<0x6E, MRMSrcMem, (outs VR128X:$dst), + (ins i64mem:$src), + "vmovq{z}\t{$src, $dst|$dst, $src}", + [(set VR128X:$dst, + (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>, + EVEX, VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>; + +//===----------------------------------------------------------------------===// +// AVX-512 MOVSS, MOVSD +//===----------------------------------------------------------------------===// + +multiclass avx512_move_scalar <string asm, RegisterClass RC, + SDNode OpNode, ValueType vt, + X86MemOperand x86memop, PatFrag mem_pat> { + def rr : SI<0x10, MRMSrcReg, (outs VR128X:$dst), (ins VR128X:$src1, RC:$src2), + !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR128X:$dst, (vt (OpNode VR128X:$src1, + (scalar_to_vector RC:$src2))))], + IIC_SSE_MOV_S_RR>, EVEX_4V, VEX_LIG; + def rm : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), + !strconcat(asm, "\t{$src, $dst|$dst, $src}"), + [(set RC:$dst, (mem_pat addr:$src))], IIC_SSE_MOV_S_RM>, + EVEX, VEX_LIG; + def mr: SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src), + !strconcat(asm, "\t{$src, $dst|$dst, $src}"), + [(store RC:$src, addr:$dst)], IIC_SSE_MOV_S_MR>, + EVEX, VEX_LIG; +} + +let ExeDomain = SSEPackedSingle in +defm VMOVSSZ : avx512_move_scalar<"movss{z}", FR32X, X86Movss, v4f32, f32mem, + loadf32>, XS, EVEX_CD8<32, CD8VT1>; + +let ExeDomain = SSEPackedDouble in +defm VMOVSDZ : avx512_move_scalar<"movsd{z}", FR64X, X86Movsd, v2f64, f64mem, + loadf64>, XD, VEX_W, EVEX_CD8<64, CD8VT1>; + + +// For the disassembler +let isCodeGenOnly = 1 in { + def VMOVSSZrr_REV : SI<0x11, MRMDestReg, (outs VR128X:$dst), + (ins VR128X:$src1, FR32X:$src2), + "movss{z}\t{$src2, $src1, $dst|$dst, $src1, $src2}", [], + IIC_SSE_MOV_S_RR>, + XS, EVEX_4V, VEX_LIG; + def VMOVSDZrr_REV : SI<0x11, MRMDestReg, (outs VR128X:$dst), + (ins VR128X:$src1, FR64X:$src2), + "movsd{z}\t{$src2, $src1, $dst|$dst, $src1, $src2}", [], + IIC_SSE_MOV_S_RR>, + XD, EVEX_4V, VEX_LIG, VEX_W; +} + +let Predicates = [HasAVX512] in { + let AddedComplexity = 15 in { + // Move scalar to XMM zero-extended, zeroing a VR128X then do a + // MOVS{S,D} to the lower bits. + def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32X:$src)))), + (VMOVSSZrr (v4f32 (V_SET0)), FR32X:$src)>; + def : Pat<(v4f32 (X86vzmovl (v4f32 VR128X:$src))), + (VMOVSSZrr (v4f32 (V_SET0)), (COPY_TO_REGCLASS VR128X:$src, FR32X))>; + def : Pat<(v4i32 (X86vzmovl (v4i32 VR128X:$src))), + (VMOVSSZrr (v4i32 (V_SET0)), (COPY_TO_REGCLASS VR128X:$src, FR32X))>; + def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64X:$src)))), + (VMOVSDZrr (v2f64 (V_SET0)), FR64X:$src)>; + + // Move low f32 and clear high bits. + def : Pat<(v8f32 (X86vzmovl (v8f32 VR256X:$src))), + (SUBREG_TO_REG (i32 0), + (VMOVSSZrr (v4f32 (V_SET0)), + (EXTRACT_SUBREG (v8f32 VR256X:$src), sub_xmm)), sub_xmm)>; + def : Pat<(v8i32 (X86vzmovl (v8i32 VR256X:$src))), + (SUBREG_TO_REG (i32 0), + (VMOVSSZrr (v4i32 (V_SET0)), + (EXTRACT_SUBREG (v8i32 VR256X:$src), sub_xmm)), sub_xmm)>; + } + + let AddedComplexity = 20 in { + // MOVSSrm zeros the high parts of the register; represent this + // with SUBREG_TO_REG. 
The AVX versions also write: DST[255:128] <- 0 + def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))), + (COPY_TO_REGCLASS (VMOVSSZrm addr:$src), VR128X)>; + def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))), + (COPY_TO_REGCLASS (VMOVSSZrm addr:$src), VR128X)>; + def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))), + (COPY_TO_REGCLASS (VMOVSSZrm addr:$src), VR128X)>; + + // MOVSDrm zeros the high parts of the register; represent this + // with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0 + def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))), + (COPY_TO_REGCLASS (VMOVSDZrm addr:$src), VR128X)>; + def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))), + (COPY_TO_REGCLASS (VMOVSDZrm addr:$src), VR128X)>; + def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))), + (COPY_TO_REGCLASS (VMOVSDZrm addr:$src), VR128X)>; + def : Pat<(v2f64 (X86vzmovl (bc_v2f64 (loadv4f32 addr:$src)))), + (COPY_TO_REGCLASS (VMOVSDZrm addr:$src), VR128X)>; + def : Pat<(v2f64 (X86vzload addr:$src)), + (COPY_TO_REGCLASS (VMOVSDZrm addr:$src), VR128X)>; + + // Represent the same patterns above but in the form they appear for + // 256-bit types + def : Pat<(v8i32 (X86vzmovl (insert_subvector undef, + (v4i32 (scalar_to_vector (loadi32 addr:$src))), (iPTR 0)))), + (SUBREG_TO_REG (i32 0), (VMOVDI2PDIZrm addr:$src), sub_xmm)>; + def : Pat<(v8f32 (X86vzmovl (insert_subvector undef, + (v4f32 (scalar_to_vector (loadf32 addr:$src))), (iPTR 0)))), + (SUBREG_TO_REG (i32 0), (VMOVSSZrm addr:$src), sub_xmm)>; + def : Pat<(v4f64 (X86vzmovl (insert_subvector undef, + (v2f64 (scalar_to_vector (loadf64 addr:$src))), (iPTR 0)))), + (SUBREG_TO_REG (i32 0), (VMOVSDZrm addr:$src), sub_xmm)>; + } + def : Pat<(v8f32 (X86vzmovl (insert_subvector undef, + (v4f32 (scalar_to_vector FR32X:$src)), (iPTR 0)))), + (SUBREG_TO_REG (i32 0), (v4f32 (VMOVSSZrr (v4f32 (V_SET0)), + FR32X:$src)), sub_xmm)>; + def : Pat<(v4f64 (X86vzmovl (insert_subvector undef, + (v2f64 (scalar_to_vector FR64X:$src)), (iPTR 0)))), + (SUBREG_TO_REG (i64 0), (v2f64 (VMOVSDZrr (v2f64 (V_SET0)), + FR64X:$src)), sub_xmm)>; + def : Pat<(v4i64 (X86vzmovl (insert_subvector undef, + (v2i64 (scalar_to_vector (loadi64 addr:$src))), (iPTR 0)))), + (SUBREG_TO_REG (i64 0), (VMOVQI2PQIZrm addr:$src), sub_xmm)>; + + // Move low f64 and clear high bits. + def : Pat<(v4f64 (X86vzmovl (v4f64 VR256X:$src))), + (SUBREG_TO_REG (i32 0), + (VMOVSDZrr (v2f64 (V_SET0)), + (EXTRACT_SUBREG (v4f64 VR256X:$src), sub_xmm)), sub_xmm)>; + + def : Pat<(v4i64 (X86vzmovl (v4i64 VR256X:$src))), + (SUBREG_TO_REG (i32 0), (VMOVSDZrr (v2i64 (V_SET0)), + (EXTRACT_SUBREG (v4i64 VR256X:$src), sub_xmm)), sub_xmm)>; + + // Extract and store. 
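The two patterns after the comment select the store form of vmovss / vmovsd for writing only the low element. In C this is the long-standing _mm_store_ss / _mm_store_sd idiom (standard SSE/SSE2 intrinsics, shown only as an illustration of what these patterns match):

    #include <immintrin.h>

    /* Store just element 0; on an AVX-512 target the compiler is expected
       to select the EVEX-encoded scalar moves defined above. */
    void store_low(float *pf, __m128 v, double *pd, __m128d w) {
      _mm_store_ss(pf, v);   /* f32 low element */
      _mm_store_sd(pd, w);   /* f64 low element */
    }
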
+ def : Pat<(store (f32 (vector_extract (v4f32 VR128X:$src), (iPTR 0))), + addr:$dst), + (VMOVSSZmr addr:$dst, (COPY_TO_REGCLASS (v4f32 VR128X:$src), FR32X))>; + def : Pat<(store (f64 (vector_extract (v2f64 VR128X:$src), (iPTR 0))), + addr:$dst), + (VMOVSDZmr addr:$dst, (COPY_TO_REGCLASS (v2f64 VR128X:$src), FR64X))>; + + // Shuffle with VMOVSS + def : Pat<(v4i32 (X86Movss VR128X:$src1, VR128X:$src2)), + (VMOVSSZrr (v4i32 VR128X:$src1), + (COPY_TO_REGCLASS (v4i32 VR128X:$src2), FR32X))>; + def : Pat<(v4f32 (X86Movss VR128X:$src1, VR128X:$src2)), + (VMOVSSZrr (v4f32 VR128X:$src1), + (COPY_TO_REGCLASS (v4f32 VR128X:$src2), FR32X))>; + + // 256-bit variants + def : Pat<(v8i32 (X86Movss VR256X:$src1, VR256X:$src2)), + (SUBREG_TO_REG (i32 0), + (VMOVSSZrr (EXTRACT_SUBREG (v8i32 VR256X:$src1), sub_xmm), + (EXTRACT_SUBREG (v8i32 VR256X:$src2), sub_xmm)), + sub_xmm)>; + def : Pat<(v8f32 (X86Movss VR256X:$src1, VR256X:$src2)), + (SUBREG_TO_REG (i32 0), + (VMOVSSZrr (EXTRACT_SUBREG (v8f32 VR256X:$src1), sub_xmm), + (EXTRACT_SUBREG (v8f32 VR256X:$src2), sub_xmm)), + sub_xmm)>; + + // Shuffle with VMOVSD + def : Pat<(v2i64 (X86Movsd VR128X:$src1, VR128X:$src2)), + (VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>; + def : Pat<(v2f64 (X86Movsd VR128X:$src1, VR128X:$src2)), + (VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>; + def : Pat<(v4f32 (X86Movsd VR128X:$src1, VR128X:$src2)), + (VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>; + def : Pat<(v4i32 (X86Movsd VR128X:$src1, VR128X:$src2)), + (VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>; + + // 256-bit variants + def : Pat<(v4i64 (X86Movsd VR256X:$src1, VR256X:$src2)), + (SUBREG_TO_REG (i32 0), + (VMOVSDZrr (EXTRACT_SUBREG (v4i64 VR256X:$src1), sub_xmm), + (EXTRACT_SUBREG (v4i64 VR256X:$src2), sub_xmm)), + sub_xmm)>; + def : Pat<(v4f64 (X86Movsd VR256X:$src1, VR256X:$src2)), + (SUBREG_TO_REG (i32 0), + (VMOVSDZrr (EXTRACT_SUBREG (v4f64 VR256X:$src1), sub_xmm), + (EXTRACT_SUBREG (v4f64 VR256X:$src2), sub_xmm)), + sub_xmm)>; + + def : Pat<(v2f64 (X86Movlpd VR128X:$src1, VR128X:$src2)), + (VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>; + def : Pat<(v2i64 (X86Movlpd VR128X:$src1, VR128X:$src2)), + (VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>; + def : Pat<(v4f32 (X86Movlps VR128X:$src1, VR128X:$src2)), + (VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>; + def : Pat<(v4i32 (X86Movlps VR128X:$src1, VR128X:$src2)), + (VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>; +} + +let AddedComplexity = 15 in +def VMOVZPQILo2PQIZrr : AVX512XSI<0x7E, MRMSrcReg, (outs VR128X:$dst), + (ins VR128X:$src), + "vmovq{z}\t{$src, $dst|$dst, $src}", + [(set VR128X:$dst, (v2i64 (X86vzmovl + (v2i64 VR128X:$src))))], + IIC_SSE_MOVQ_RR>, EVEX, VEX_W; + +let AddedComplexity = 20 in +def VMOVZPQILo2PQIZrm : AVX512XSI<0x7E, MRMSrcMem, (outs VR128X:$dst), + (ins i128mem:$src), + "vmovq{z}\t{$src, $dst|$dst, $src}", + [(set VR128X:$dst, (v2i64 (X86vzmovl + (loadv2i64 addr:$src))))], + IIC_SSE_MOVDQ>, EVEX, VEX_W, + EVEX_CD8<8, CD8VT8>; + +let Predicates = [HasAVX512] in { + // AVX 128-bit movd/movq instruction write zeros in the high 128-bit part. 
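These patterns lean on the architectural guarantee restated in the comment: a 128-bit vmovd/vmovq from a GPR writes the scalar to element 0 and zeroes every bit above it. A small C illustration with the usual SSE2 intrinsics (the helper name is invented; a 64-bit target is assumed for the vmovq form):

    #include <immintrin.h>
    #include <stdint.h>

    /* Element 0 carries the scalar; all upper lanes are zeroed by the
       instruction itself, so no explicit masking is needed. */
    __m128i widen_scalars(int32_t x, int64_t y) {
      __m128i d = _mm_cvtsi32_si128(x);   /* vmovd */
      __m128i q = _mm_cvtsi64_si128(y);   /* vmovq */
      return _mm_add_epi64(d, q);
    }
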
+ let AddedComplexity = 20 in { + def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))), + (VMOVDI2PDIZrm addr:$src)>; + def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))), + (VMOV64toPQIZrr GR64:$src)>; + def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))), + (VMOVDI2PDIZrr GR32:$src)>; + + def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv4f32 addr:$src)))), + (VMOVDI2PDIZrm addr:$src)>; + def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))), + (VMOVDI2PDIZrm addr:$src)>; + def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))), + (VMOVZPQILo2PQIZrm addr:$src)>; + def : Pat<(v2f64 (X86vzmovl (v2f64 VR128X:$src))), + (VMOVZPQILo2PQIZrr VR128X:$src)>; + } + + // Use regular 128-bit instructions to match 256-bit scalar_to_vec+zext. + def : Pat<(v8i32 (X86vzmovl (insert_subvector undef, + (v4i32 (scalar_to_vector GR32:$src)),(iPTR 0)))), + (SUBREG_TO_REG (i32 0), (VMOVDI2PDIZrr GR32:$src), sub_xmm)>; + def : Pat<(v4i64 (X86vzmovl (insert_subvector undef, + (v2i64 (scalar_to_vector GR64:$src)),(iPTR 0)))), + (SUBREG_TO_REG (i64 0), (VMOV64toPQIZrr GR64:$src), sub_xmm)>; +} + +def : Pat<(v16i32 (X86Vinsert (v16i32 immAllZerosV), GR32:$src2, (iPTR 0))), + (SUBREG_TO_REG (i32 0), (VMOVDI2PDIZrr GR32:$src2), sub_xmm)>; + +def : Pat<(v8i64 (X86Vinsert (bc_v8i64 (v16i32 immAllZerosV)), GR64:$src2, (iPTR 0))), + (SUBREG_TO_REG (i32 0), (VMOV64toPQIZrr GR64:$src2), sub_xmm)>; + +def : Pat<(v16i32 (X86Vinsert undef, GR32:$src2, (iPTR 0))), + (SUBREG_TO_REG (i32 0), (VMOVDI2PDIZrr GR32:$src2), sub_xmm)>; + +def : Pat<(v8i64 (X86Vinsert undef, GR64:$src2, (iPTR 0))), + (SUBREG_TO_REG (i32 0), (VMOV64toPQIZrr GR64:$src2), sub_xmm)>; + +//===----------------------------------------------------------------------===// +// AVX-512 - Integer arithmetic +// +multiclass avx512_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, + ValueType OpVT, RegisterClass RC, PatFrag memop_frag, + X86MemOperand x86memop, PatFrag scalar_mfrag, + X86MemOperand x86scalar_mop, string BrdcstStr, + OpndItins itins, bit IsCommutable = 0> { + let isCommutable = IsCommutable in + def rr : AVX512BI<opc, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set RC:$dst, (OpVT (OpNode (OpVT RC:$src1), (OpVT RC:$src2))))], + itins.rr>, EVEX_4V; + def rm : AVX512BI<opc, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, x86memop:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set RC:$dst, (OpVT (OpNode (OpVT RC:$src1), (memop_frag addr:$src2))))], + itins.rm>, EVEX_4V; + def rmb : AVX512BI<opc, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, x86scalar_mop:$src2), + !strconcat(OpcodeStr, "\t{${src2}", BrdcstStr, + ", $src1, $dst|$dst, $src1, ${src2}", BrdcstStr, "}"), + [(set RC:$dst, (OpNode RC:$src1, + (OpVT (X86VBroadcast (scalar_mfrag addr:$src2)))))], + itins.rm>, EVEX_4V, EVEX_B; +} +multiclass avx512_binop_rm2<bits<8> opc, string OpcodeStr, + ValueType DstVT, ValueType SrcVT, RegisterClass RC, + PatFrag memop_frag, X86MemOperand x86memop, + OpndItins itins, + bit IsCommutable = 0> { + let isCommutable = IsCommutable in + def rr : AVX512BI<opc, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + []>, EVEX_4V, VEX_W; + def rm : AVX512BI<opc, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, x86memop:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + []>, EVEX_4V, VEX_W; +} + +defm 
VPADDDZ : avx512_binop_rm<0xFE, "vpaddd", add, v16i32, VR512, memopv16i32, + i512mem, loadi32, i32mem, "{1to16}", SSE_INTALU_ITINS_P, 1>, + EVEX_V512, EVEX_CD8<32, CD8VF>; + +defm VPSUBDZ : avx512_binop_rm<0xFA, "vpsubd", sub, v16i32, VR512, memopv16i32, + i512mem, loadi32, i32mem, "{1to16}", SSE_INTALU_ITINS_P, 0>, + EVEX_V512, EVEX_CD8<32, CD8VF>; + +defm VPMULLDZ : avx512_binop_rm<0x40, "vpmulld", mul, v16i32, VR512, memopv16i32, + i512mem, loadi32, i32mem, "{1to16}", SSE_INTALU_ITINS_P, 1>, + T8, EVEX_V512, EVEX_CD8<32, CD8VF>; + +defm VPADDQZ : avx512_binop_rm<0xD4, "vpaddq", add, v8i64, VR512, memopv8i64, + i512mem, loadi64, i64mem, "{1to8}", SSE_INTALU_ITINS_P, 1>, + EVEX_CD8<64, CD8VF>, EVEX_V512, VEX_W; + +defm VPSUBQZ : avx512_binop_rm<0xFB, "vpsubq", sub, v8i64, VR512, memopv8i64, + i512mem, loadi64, i64mem, "{1to8}", SSE_INTALU_ITINS_P, 0>, + EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; + +defm VPMULDQZ : avx512_binop_rm2<0x28, "vpmuldq", v8i64, v16i32, + VR512, memopv8i64, i512mem, SSE_INTALU_ITINS_P, 1>, T8, + EVEX_V512, EVEX_CD8<64, CD8VF>; + +defm VPMULUDQZ : avx512_binop_rm2<0xF4, "vpmuludq", v8i64, v16i32, + VR512, memopv8i64, i512mem, SSE_INTMUL_ITINS_P, 1>, EVEX_V512, + EVEX_CD8<64, CD8VF>; + +def : Pat<(v8i64 (X86pmuludq (v16i32 VR512:$src1), (v16i32 VR512:$src2))), + (VPMULUDQZrr VR512:$src1, VR512:$src2)>; + +defm VPMAXUDZ : avx512_binop_rm<0x3F, "vpmaxud", X86umax, v16i32, VR512, memopv16i32, + i512mem, loadi32, i32mem, "{1to16}", SSE_INTALU_ITINS_P, 1>, + T8, EVEX_V512, EVEX_CD8<32, CD8VF>; +defm VPMAXUQZ : avx512_binop_rm<0x3F, "vpmaxuq", X86umax, v8i64, VR512, memopv8i64, + i512mem, loadi64, i64mem, "{1to8}", SSE_INTALU_ITINS_P, 0>, + T8, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; + +defm VPMAXSDZ : avx512_binop_rm<0x3D, "vpmaxsd", X86smax, v16i32, VR512, memopv16i32, + i512mem, loadi32, i32mem, "{1to16}", SSE_INTALU_ITINS_P, 1>, + EVEX_V512, EVEX_CD8<32, CD8VF>; +defm VPMAXSQZ : avx512_binop_rm<0x3D, "vpmaxsq", X86smax, v8i64, VR512, memopv8i64, + i512mem, loadi64, i64mem, "{1to8}", SSE_INTALU_ITINS_P, 0>, + T8, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; + +defm VPMINUDZ : avx512_binop_rm<0x3B, "vpminud", X86umin, v16i32, VR512, memopv16i32, + i512mem, loadi32, i32mem, "{1to16}", SSE_INTALU_ITINS_P, 1>, + T8, EVEX_V512, EVEX_CD8<32, CD8VF>; +defm VPMINUQZ : avx512_binop_rm<0x3B, "vpminuq", X86umin, v8i64, VR512, memopv8i64, + i512mem, loadi64, i64mem, "{1to8}", SSE_INTALU_ITINS_P, 0>, + T8, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; + +defm VPMINSDZ : avx512_binop_rm<0x39, "vpminsd", X86smin, v16i32, VR512, memopv16i32, + i512mem, loadi32, i32mem, "{1to16}", SSE_INTALU_ITINS_P, 1>, + T8, EVEX_V512, EVEX_CD8<32, CD8VF>; +defm VPMINSQZ : avx512_binop_rm<0x39, "vpminsq", X86smin, v8i64, VR512, memopv8i64, + i512mem, loadi64, i64mem, "{1to8}", SSE_INTALU_ITINS_P, 0>, + T8, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; + +//===----------------------------------------------------------------------===// +// AVX-512 - Unpack Instructions +//===----------------------------------------------------------------------===// + +multiclass avx512_unpack_fp<bits<8> opc, SDNode OpNode, ValueType vt, + PatFrag mem_frag, RegisterClass RC, + X86MemOperand x86memop, string asm, + Domain d> { + def rr : AVX512PI<opc, MRMSrcReg, + (outs RC:$dst), (ins RC:$src1, RC:$src2), + asm, [(set RC:$dst, + (vt (OpNode RC:$src1, RC:$src2)))], + d>, EVEX_4V; + def rm : AVX512PI<opc, MRMSrcMem, + (outs RC:$dst), (ins RC:$src1, x86memop:$src2), + asm, [(set RC:$dst, + (vt (OpNode RC:$src1, + (bitconvert (mem_frag 
addr:$src2)))))], + d>, EVEX_4V; +} + +defm VUNPCKHPSZ: avx512_unpack_fp<0x15, X86Unpckh, v16f32, memopv8f64, + VR512, f512mem, "vunpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}", + SSEPackedSingle>, EVEX_V512, EVEX_CD8<32, CD8VF>; +defm VUNPCKHPDZ: avx512_unpack_fp<0x15, X86Unpckh, v8f64, memopv8f64, + VR512, f512mem, "vunpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", + SSEPackedDouble>, OpSize, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; +defm VUNPCKLPSZ: avx512_unpack_fp<0x14, X86Unpckl, v16f32, memopv8f64, + VR512, f512mem, "vunpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}", + SSEPackedSingle>, EVEX_V512, EVEX_CD8<32, CD8VF>; +defm VUNPCKLPDZ: avx512_unpack_fp<0x14, X86Unpckl, v8f64, memopv8f64, + VR512, f512mem, "vunpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", + SSEPackedDouble>, OpSize, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; + +multiclass avx512_unpack_int<bits<8> opc, string OpcodeStr, SDNode OpNode, + ValueType OpVT, RegisterClass RC, PatFrag memop_frag, + X86MemOperand x86memop> { + def rr : AVX512BI<opc, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set RC:$dst, (OpVT (OpNode (OpVT RC:$src1), (OpVT RC:$src2))))], + IIC_SSE_UNPCK>, EVEX_4V; + def rm : AVX512BI<opc, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, x86memop:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set RC:$dst, (OpVT (OpNode (OpVT RC:$src1), + (bitconvert (memop_frag addr:$src2)))))], + IIC_SSE_UNPCK>, EVEX_4V; +} +defm VPUNPCKLDQZ : avx512_unpack_int<0x62, "vpunpckldq", X86Unpckl, v16i32, + VR512, memopv16i32, i512mem>, EVEX_V512, + EVEX_CD8<32, CD8VF>; +defm VPUNPCKLQDQZ : avx512_unpack_int<0x6C, "vpunpcklqdq", X86Unpckl, v8i64, + VR512, memopv8i64, i512mem>, EVEX_V512, + VEX_W, EVEX_CD8<64, CD8VF>; +defm VPUNPCKHDQZ : avx512_unpack_int<0x6A, "vpunpckhdq", X86Unpckh, v16i32, + VR512, memopv16i32, i512mem>, EVEX_V512, + EVEX_CD8<32, CD8VF>; +defm VPUNPCKHQDQZ : avx512_unpack_int<0x6D, "vpunpckhqdq", X86Unpckh, v8i64, + VR512, memopv8i64, i512mem>, EVEX_V512, + VEX_W, EVEX_CD8<64, CD8VF>; +//===----------------------------------------------------------------------===// +// AVX-512 - PSHUFD +// + +multiclass avx512_pshuf_imm<bits<8> opc, string OpcodeStr, RegisterClass RC, + SDNode OpNode, PatFrag mem_frag, + X86MemOperand x86memop, ValueType OpVT> { + def ri : AVX512Ii8<opc, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, i8imm:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set RC:$dst, + (OpVT (OpNode RC:$src1, (i8 imm:$src2))))]>, + EVEX; + def mi : AVX512Ii8<opc, MRMSrcMem, (outs RC:$dst), + (ins x86memop:$src1, i8imm:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set RC:$dst, + (OpVT (OpNode (mem_frag addr:$src1), + (i8 imm:$src2))))]>, EVEX; +} + +defm VPSHUFDZ : avx512_pshuf_imm<0x70, "vpshufd", VR512, X86PShufd, memopv16i32, + i512mem, v16i32>, OpSize, EVEX_V512, EVEX_CD8<32, CD8VF>; + +let ExeDomain = SSEPackedSingle in +defm VPERMILPSZ : avx512_pshuf_imm<0x04, "vpermilps", VR512, X86VPermilp, + memopv16f32, i512mem, v16f32>, OpSize, TA, EVEX_V512, + EVEX_CD8<32, CD8VF>; +let ExeDomain = SSEPackedDouble in +defm VPERMILPDZ : avx512_pshuf_imm<0x05, "vpermilpd", VR512, X86VPermilp, + memopv8f64, i512mem, v8f64>, OpSize, TA, EVEX_V512, + VEX_W, EVEX_CD8<32, CD8VF>; + +def : Pat<(v16i32 (X86VPermilp VR512:$src1, (i8 imm:$imm))), + (VPERMILPSZri VR512:$src1, imm:$imm)>; +def : Pat<(v8i64 (X86VPermilp VR512:$src1, (i8 
imm:$imm))), + (VPERMILPDZri VR512:$src1, imm:$imm)>; + +//===----------------------------------------------------------------------===// +// AVX-512 Logical Instructions +//===----------------------------------------------------------------------===// + +defm VPANDDZ : avx512_binop_rm<0xDB, "vpandd", and, v16i32, VR512, memopv16i32, + i512mem, loadi32, i32mem, "{1to16}", SSE_BIT_ITINS_P, 1>, + EVEX_V512, EVEX_CD8<32, CD8VF>; +defm VPANDQZ : avx512_binop_rm<0xDB, "vpandq", and, v8i64, VR512, memopv8i64, + i512mem, loadi64, i64mem, "{1to8}", SSE_BIT_ITINS_P, 1>, + EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; +defm VPORDZ : avx512_binop_rm<0xEB, "vpord", or, v16i32, VR512, memopv16i32, + i512mem, loadi32, i32mem, "{1to16}", SSE_BIT_ITINS_P, 1>, + EVEX_V512, EVEX_CD8<32, CD8VF>; +defm VPORQZ : avx512_binop_rm<0xEB, "vporq", or, v8i64, VR512, memopv8i64, + i512mem, loadi64, i64mem, "{1to8}", SSE_BIT_ITINS_P, 1>, + EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; +defm VPXORDZ : avx512_binop_rm<0xEF, "vpxord", xor, v16i32, VR512, memopv16i32, + i512mem, loadi32, i32mem, "{1to16}", SSE_BIT_ITINS_P, 1>, + EVEX_V512, EVEX_CD8<32, CD8VF>; +defm VPXORQZ : avx512_binop_rm<0xEF, "vpxorq", xor, v8i64, VR512, memopv8i64, + i512mem, loadi64, i64mem, "{1to8}", SSE_BIT_ITINS_P, 1>, + EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; +defm VPANDNDZ : avx512_binop_rm<0xDF, "vpandnd", X86andnp, v16i32, VR512, + memopv16i32, i512mem, loadi32, i32mem, "{1to16}", + SSE_BIT_ITINS_P, 0>, EVEX_V512, EVEX_CD8<32, CD8VF>; +defm VPANDNQZ : avx512_binop_rm<0xDF, "vpandnq", X86andnp, v8i64, VR512, memopv8i64, + i512mem, loadi64, i64mem, "{1to8}", SSE_BIT_ITINS_P, 0>, + EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; + +//===----------------------------------------------------------------------===// +// AVX-512 FP arithmetic +//===----------------------------------------------------------------------===// + +multiclass avx512_binop_s<bits<8> opc, string OpcodeStr, SDNode OpNode, + SizeItins itins> { + defm SSZ : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss{z}"), OpNode, FR32X, + f32mem, itins.s, 0>, XS, EVEX_4V, VEX_LIG, + EVEX_CD8<32, CD8VT1>; + defm SDZ : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd{z}"), OpNode, FR64X, + f64mem, itins.d, 0>, XD, VEX_W, EVEX_4V, VEX_LIG, + EVEX_CD8<64, CD8VT1>; +} + +let isCommutable = 1 in { +defm VADD : avx512_binop_s<0x58, "add", fadd, SSE_ALU_ITINS_S>; +defm VMUL : avx512_binop_s<0x59, "mul", fmul, SSE_ALU_ITINS_S>; +defm VMIN : avx512_binop_s<0x5D, "min", X86fmin, SSE_ALU_ITINS_S>; +defm VMAX : avx512_binop_s<0x5F, "max", X86fmax, SSE_ALU_ITINS_S>; +} +let isCommutable = 0 in { +defm VSUB : avx512_binop_s<0x5C, "sub", fsub, SSE_ALU_ITINS_S>; +defm VDIV : avx512_binop_s<0x5E, "div", fdiv, SSE_ALU_ITINS_S>; +} + +multiclass avx512_fp_packed<bits<8> opc, string OpcodeStr, SDNode OpNode, + RegisterClass RC, ValueType vt, + X86MemOperand x86memop, PatFrag mem_frag, + X86MemOperand x86scalar_mop, PatFrag scalar_mfrag, + string BrdcstStr, + Domain d, OpndItins itins, bit commutable> { + let isCommutable = commutable in + def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], itins.rr, d>, + EVEX_4V, TB; + let mayLoad = 1 in { + def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set RC:$dst, (OpNode RC:$src1, (mem_frag addr:$src2)))], + itins.rm, d>, EVEX_4V, TB; + def rmb : PI<opc, 
MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, x86scalar_mop:$src2), + !strconcat(OpcodeStr, "\t{${src2}", BrdcstStr, + ", $src1, $dst|$dst, $src1, ${src2}", BrdcstStr, "}"), + [(set RC:$dst, (OpNode RC:$src1, + (vt (X86VBroadcast (scalar_mfrag addr:$src2)))))], + itins.rm, d>, EVEX_4V, EVEX_B, TB; + } +} + +defm VADDPSZ : avx512_fp_packed<0x58, "addps", fadd, VR512, v16f32, f512mem, + memopv16f32, f32mem, loadf32, "{1to16}", SSEPackedSingle, + SSE_ALU_ITINS_P.s, 1>, EVEX_V512, EVEX_CD8<32, CD8VF>; + +defm VADDPDZ : avx512_fp_packed<0x58, "addpd", fadd, VR512, v8f64, f512mem, + memopv8f64, f64mem, loadf64, "{1to8}", SSEPackedDouble, + SSE_ALU_ITINS_P.d, 1>, + EVEX_V512, OpSize, VEX_W, EVEX_CD8<64, CD8VF>; + +defm VMULPSZ : avx512_fp_packed<0x59, "mulps", fmul, VR512, v16f32, f512mem, + memopv16f32, f32mem, loadf32, "{1to16}", SSEPackedSingle, + SSE_ALU_ITINS_P.s, 1>, EVEX_V512, EVEX_CD8<32, CD8VF>; +defm VMULPDZ : avx512_fp_packed<0x59, "mulpd", fmul, VR512, v8f64, f512mem, + memopv8f64, f64mem, loadf64, "{1to8}", SSEPackedDouble, + SSE_ALU_ITINS_P.d, 1>, + EVEX_V512, OpSize, VEX_W, EVEX_CD8<64, CD8VF>; + +defm VMINPSZ : avx512_fp_packed<0x5D, "minps", X86fmin, VR512, v16f32, f512mem, + memopv16f32, f32mem, loadf32, "{1to16}", SSEPackedSingle, + SSE_ALU_ITINS_P.s, 1>, + EVEX_V512, EVEX_CD8<32, CD8VF>; +defm VMAXPSZ : avx512_fp_packed<0x5F, "maxps", X86fmax, VR512, v16f32, f512mem, + memopv16f32, f32mem, loadf32, "{1to16}", SSEPackedSingle, + SSE_ALU_ITINS_P.s, 1>, + EVEX_V512, EVEX_CD8<32, CD8VF>; + +defm VMINPDZ : avx512_fp_packed<0x5D, "minpd", X86fmin, VR512, v8f64, f512mem, + memopv8f64, f64mem, loadf64, "{1to8}", SSEPackedDouble, + SSE_ALU_ITINS_P.d, 1>, + EVEX_V512, OpSize, VEX_W, EVEX_CD8<64, CD8VF>; +defm VMAXPDZ : avx512_fp_packed<0x5F, "maxpd", X86fmax, VR512, v8f64, f512mem, + memopv8f64, f64mem, loadf64, "{1to8}", SSEPackedDouble, + SSE_ALU_ITINS_P.d, 1>, + EVEX_V512, OpSize, VEX_W, EVEX_CD8<64, CD8VF>; + +defm VSUBPSZ : avx512_fp_packed<0x5C, "subps", fsub, VR512, v16f32, f512mem, + memopv16f32, f32mem, loadf32, "{1to16}", SSEPackedSingle, + SSE_ALU_ITINS_P.s, 0>, EVEX_V512, EVEX_CD8<32, CD8VF>; +defm VDIVPSZ : avx512_fp_packed<0x5E, "divps", fdiv, VR512, v16f32, f512mem, + memopv16f32, f32mem, loadf32, "{1to16}", SSEPackedSingle, + SSE_ALU_ITINS_P.s, 0>, EVEX_V512, EVEX_CD8<32, CD8VF>; + +defm VSUBPDZ : avx512_fp_packed<0x5C, "subpd", fsub, VR512, v8f64, f512mem, + memopv8f64, f64mem, loadf64, "{1to8}", SSEPackedDouble, + SSE_ALU_ITINS_P.d, 0>, + EVEX_V512, OpSize, VEX_W, EVEX_CD8<64, CD8VF>; +defm VDIVPDZ : avx512_fp_packed<0x5E, "divpd", fdiv, VR512, v8f64, f512mem, + memopv8f64, f64mem, loadf64, "{1to8}", SSEPackedDouble, + SSE_ALU_ITINS_P.d, 0>, + EVEX_V512, OpSize, VEX_W, EVEX_CD8<64, CD8VF>; + +//===----------------------------------------------------------------------===// +// AVX-512 VPTESTM instructions +//===----------------------------------------------------------------------===// + +multiclass avx512_vptest<bits<8> opc, string OpcodeStr, RegisterClass KRC, + RegisterClass RC, X86MemOperand x86memop, PatFrag memop_frag, + SDNode OpNode, ValueType vt> { + def rr : AVX5128I<opc, MRMSrcReg, + (outs KRC:$dst), (ins RC:$src1, RC:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set KRC:$dst, (OpNode (vt RC:$src1), (vt RC:$src2)))]>, EVEX_4V; + def rm : AVX5128I<opc, MRMSrcMem, + (outs KRC:$dst), (ins RC:$src1, x86memop:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set KRC:$dst, (OpNode (vt RC:$src1), + 
(bitconvert (memop_frag addr:$src2))))]>, EVEX_4V; +} + +defm VPTESTMDZ : avx512_vptest<0x27, "vptestmd", VK16, VR512, f512mem, + memopv16i32, X86testm, v16i32>, EVEX_V512, + EVEX_CD8<32, CD8VF>; +defm VPTESTMQZ : avx512_vptest<0x27, "vptestmq", VK8, VR512, f512mem, + memopv8i64, X86testm, v8i64>, EVEX_V512, VEX_W, + EVEX_CD8<64, CD8VF>; + +//===----------------------------------------------------------------------===// +// AVX-512 Shift instructions +//===----------------------------------------------------------------------===// +multiclass avx512_shift_rmi<bits<8> opc, Format ImmFormR, Format ImmFormM, + string OpcodeStr, SDNode OpNode, RegisterClass RC, + ValueType vt, X86MemOperand x86memop, PatFrag mem_frag, + RegisterClass KRC> { + def ri : AVX512BIi8<opc, ImmFormR, (outs RC:$dst), + (ins RC:$src1, i8imm:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set RC:$dst, (vt (OpNode RC:$src1, (i8 imm:$src2))))], + SSE_INTSHIFT_ITINS_P.rr>, EVEX_4V; + def rik : AVX512BIi8<opc, ImmFormR, (outs RC:$dst), + (ins KRC:$mask, RC:$src1, i8imm:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}"), + [], SSE_INTSHIFT_ITINS_P.rr>, EVEX_4V, EVEX_K; + def mi: AVX512BIi8<opc, ImmFormM, (outs RC:$dst), + (ins x86memop:$src1, i8imm:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set RC:$dst, (OpNode (mem_frag addr:$src1), + (i8 imm:$src2)))], SSE_INTSHIFT_ITINS_P.rm>, EVEX_4V; + def mik: AVX512BIi8<opc, ImmFormM, (outs RC:$dst), + (ins KRC:$mask, x86memop:$src1, i8imm:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}"), + [], SSE_INTSHIFT_ITINS_P.rm>, EVEX_4V, EVEX_K; +} + +multiclass avx512_shift_rrm<bits<8> opc, string OpcodeStr, SDNode OpNode, + RegisterClass RC, ValueType vt, ValueType SrcVT, + PatFrag bc_frag, RegisterClass KRC> { + // src2 is always 128-bit + def rr : AVX512BI<opc, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, VR128X:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set RC:$dst, (vt (OpNode RC:$src1, (SrcVT VR128X:$src2))))], + SSE_INTSHIFT_ITINS_P.rr>, EVEX_4V; + def rrk : AVX512BI<opc, MRMSrcReg, (outs RC:$dst), + (ins KRC:$mask, RC:$src1, VR128X:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}"), + [], SSE_INTSHIFT_ITINS_P.rr>, EVEX_4V, EVEX_K; + def rm : AVX512BI<opc, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, i128mem:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set RC:$dst, (vt (OpNode RC:$src1, + (bc_frag (memopv2i64 addr:$src2)))))], + SSE_INTSHIFT_ITINS_P.rm>, EVEX_4V; + def rmk : AVX512BI<opc, MRMSrcMem, (outs RC:$dst), + (ins KRC:$mask, RC:$src1, i128mem:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}"), + [], SSE_INTSHIFT_ITINS_P.rm>, EVEX_4V, EVEX_K; +} + +defm VPSRLDZ : avx512_shift_rmi<0x72, MRM2r, MRM2m, "vpsrld", X86vsrli, + VR512, v16i32, i512mem, memopv16i32, VK16WM>, + EVEX_V512, EVEX_CD8<32, CD8VF>; +defm VPSRLDZ : avx512_shift_rrm<0xD2, "vpsrld", X86vsrl, + VR512, v16i32, v4i32, bc_v4i32, VK16WM>, EVEX_V512, + EVEX_CD8<32, CD8VQ>; + +defm VPSRLQZ : avx512_shift_rmi<0x73, MRM2r, MRM2m, "vpsrlq", X86vsrli, + VR512, v8i64, i512mem, memopv8i64, VK8WM>, EVEX_V512, + EVEX_CD8<64, CD8VF>, VEX_W; +defm VPSRLQZ : avx512_shift_rrm<0xD3, "vpsrlq", X86vsrl, + VR512, v8i64, v2i64, bc_v2i64, VK8WM>, EVEX_V512, + EVEX_CD8<64, CD8VQ>, VEX_W; + +defm VPSLLDZ : 
avx512_shift_rmi<0x72, MRM6r, MRM6m, "vpslld", X86vshli, + VR512, v16i32, i512mem, memopv16i32, VK16WM>, EVEX_V512, + EVEX_CD8<32, CD8VF>; +defm VPSLLDZ : avx512_shift_rrm<0xF2, "vpslld", X86vshl, + VR512, v16i32, v4i32, bc_v4i32, VK16WM>, EVEX_V512, + EVEX_CD8<32, CD8VQ>; + +defm VPSLLQZ : avx512_shift_rmi<0x73, MRM6r, MRM6m, "vpsllq", X86vshli, + VR512, v8i64, i512mem, memopv8i64, VK8WM>, EVEX_V512, + EVEX_CD8<64, CD8VF>, VEX_W; +defm VPSLLQZ : avx512_shift_rrm<0xF3, "vpsllq", X86vshl, + VR512, v8i64, v2i64, bc_v2i64, VK8WM>, EVEX_V512, + EVEX_CD8<64, CD8VQ>, VEX_W; + +defm VPSRADZ : avx512_shift_rmi<0x72, MRM4r, MRM4m, "vpsrad", X86vsrai, + VR512, v16i32, i512mem, memopv16i32, VK16WM>, + EVEX_V512, EVEX_CD8<32, CD8VF>; +defm VPSRADZ : avx512_shift_rrm<0xE2, "vpsrad", X86vsra, + VR512, v16i32, v4i32, bc_v4i32, VK16WM>, EVEX_V512, + EVEX_CD8<32, CD8VQ>; + +defm VPSRAQZ : avx512_shift_rmi<0x72, MRM4r, MRM4m, "vpsraq", X86vsrai, + VR512, v8i64, i512mem, memopv8i64, VK8WM>, EVEX_V512, + EVEX_CD8<64, CD8VF>, VEX_W; +defm VPSRAQZ : avx512_shift_rrm<0xE2, "vpsraq", X86vsra, + VR512, v8i64, v2i64, bc_v2i64, VK8WM>, EVEX_V512, + EVEX_CD8<64, CD8VQ>, VEX_W; + +//===-------------------------------------------------------------------===// +// Variable Bit Shifts +//===-------------------------------------------------------------------===// +multiclass avx512_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode, + RegisterClass RC, ValueType vt, + X86MemOperand x86memop, PatFrag mem_frag> { + def rr : AVX5128I<opc, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set RC:$dst, + (vt (OpNode RC:$src1, (vt RC:$src2))))]>, + EVEX_4V; + def rm : AVX5128I<opc, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, x86memop:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set RC:$dst, + (vt (OpNode RC:$src1, (mem_frag addr:$src2))))]>, + EVEX_4V; +} + +defm VPSLLVDZ : avx512_var_shift<0x47, "vpsllvd", shl, VR512, v16i32, + i512mem, memopv16i32>, EVEX_V512, + EVEX_CD8<32, CD8VF>; +defm VPSLLVQZ : avx512_var_shift<0x47, "vpsllvq", shl, VR512, v8i64, + i512mem, memopv8i64>, EVEX_V512, VEX_W, + EVEX_CD8<64, CD8VF>; +defm VPSRLVDZ : avx512_var_shift<0x45, "vpsrlvd", srl, VR512, v16i32, + i512mem, memopv16i32>, EVEX_V512, + EVEX_CD8<32, CD8VF>; +defm VPSRLVQZ : avx512_var_shift<0x45, "vpsrlvq", srl, VR512, v8i64, + i512mem, memopv8i64>, EVEX_V512, VEX_W, + EVEX_CD8<64, CD8VF>; +defm VPSRAVDZ : avx512_var_shift<0x46, "vpsravd", sra, VR512, v16i32, + i512mem, memopv16i32>, EVEX_V512, + EVEX_CD8<32, CD8VF>; +defm VPSRAVQZ : avx512_var_shift<0x46, "vpsravq", sra, VR512, v8i64, + i512mem, memopv8i64>, EVEX_V512, VEX_W, + EVEX_CD8<64, CD8VF>; + +//===----------------------------------------------------------------------===// +// AVX-512 - MOVDDUP +//===----------------------------------------------------------------------===// + +multiclass avx512_movddup<string OpcodeStr, RegisterClass RC, ValueType VT, + X86MemOperand x86memop, PatFrag memop_frag> { +def rr : AVX512PDI<0x12, MRMSrcReg, (outs RC:$dst), (ins RC:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set RC:$dst, (VT (X86Movddup RC:$src)))]>, EVEX; +def rm : AVX512PDI<0x12, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set RC:$dst, + (VT (X86Movddup (memop_frag addr:$src))))]>, EVEX; +} + +defm VMOVDDUPZ : avx512_movddup<"vmovddup", VR512, v8f64, f512mem, memopv8f64>, + VEX_W, 
EVEX_V512, EVEX_CD8<64, CD8VF>; +def : Pat<(X86Movddup (v8f64 (scalar_to_vector (loadf64 addr:$src)))), + (VMOVDDUPZrm addr:$src)>; + +//===---------------------------------------------------------------------===// +// Replicate Single FP - MOVSHDUP and MOVSLDUP +//===---------------------------------------------------------------------===// +multiclass avx512_replicate_sfp<bits<8> op, SDNode OpNode, string OpcodeStr, + ValueType vt, RegisterClass RC, PatFrag mem_frag, + X86MemOperand x86memop> { + def rr : AVX512XSI<op, MRMSrcReg, (outs RC:$dst), (ins RC:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set RC:$dst, (vt (OpNode RC:$src)))]>, EVEX; + let mayLoad = 1 in + def rm : AVX512XSI<op, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set RC:$dst, (OpNode (mem_frag addr:$src)))]>, EVEX; +} + +defm VMOVSHDUPZ : avx512_replicate_sfp<0x16, X86Movshdup, "vmovshdup", + v16f32, VR512, memopv16f32, f512mem>, EVEX_V512, + EVEX_CD8<32, CD8VF>; +defm VMOVSLDUPZ : avx512_replicate_sfp<0x12, X86Movsldup, "vmovsldup", + v16f32, VR512, memopv16f32, f512mem>, EVEX_V512, + EVEX_CD8<32, CD8VF>; + +def : Pat<(v16i32 (X86Movshdup VR512:$src)), (VMOVSHDUPZrr VR512:$src)>; +def : Pat<(v16i32 (X86Movshdup (memopv16i32 addr:$src))), + (VMOVSHDUPZrm addr:$src)>; +def : Pat<(v16i32 (X86Movsldup VR512:$src)), (VMOVSLDUPZrr VR512:$src)>; +def : Pat<(v16i32 (X86Movsldup (memopv16i32 addr:$src))), + (VMOVSLDUPZrm addr:$src)>; + +//===----------------------------------------------------------------------===// +// Move Low to High and High to Low packed FP Instructions +//===----------------------------------------------------------------------===// +def VMOVLHPSZrr : AVX512PSI<0x16, MRMSrcReg, (outs VR128X:$dst), + (ins VR128X:$src1, VR128X:$src2), + "vmovlhps{z}\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set VR128X:$dst, (v4f32 (X86Movlhps VR128X:$src1, VR128X:$src2)))], + IIC_SSE_MOV_LH>, EVEX_4V; +def VMOVHLPSZrr : AVX512PSI<0x12, MRMSrcReg, (outs VR128X:$dst), + (ins VR128X:$src1, VR128X:$src2), + "vmovhlps{z}\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set VR128X:$dst, (v4f32 (X86Movhlps VR128X:$src1, VR128X:$src2)))], + IIC_SSE_MOV_LH>, EVEX_4V; + +let Predicates = [HasAVX512] in { + // MOVLHPS patterns + def : Pat<(v4i32 (X86Movlhps VR128X:$src1, VR128X:$src2)), + (VMOVLHPSZrr VR128X:$src1, VR128X:$src2)>; + def : Pat<(v2i64 (X86Movlhps VR128X:$src1, VR128X:$src2)), + (VMOVLHPSZrr (v2i64 VR128X:$src1), VR128X:$src2)>; + + // MOVHLPS patterns + def : Pat<(v4i32 (X86Movhlps VR128X:$src1, VR128X:$src2)), + (VMOVHLPSZrr VR128X:$src1, VR128X:$src2)>; +} + +//===----------------------------------------------------------------------===// +// FMA - Fused Multiply Operations +// +let Constraints = "$src1 = $dst" in { +multiclass avx512_fma3p_rm<bits<8> opc, string OpcodeStr, + RegisterClass RC, X86MemOperand x86memop, + PatFrag mem_frag, X86MemOperand x86scalar_mop, PatFrag scalar_mfrag, + string BrdcstStr, SDNode OpNode, ValueType OpVT> { + def r: AVX512FMA3<opc, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2, RC:$src3), + !strconcat(OpcodeStr,"\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + [(set RC:$dst, (OpVT(OpNode RC:$src1, RC:$src2, RC:$src3)))]>; + + let mayLoad = 1 in + def m: AVX512FMA3<opc, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, RC:$src2, x86memop:$src3), + !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, + (mem_frag addr:$src3))))]>; + def mb: 
AVX512FMA3<opc, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, RC:$src2, x86scalar_mop:$src3), + !strconcat(OpcodeStr, "\t{${src3}", BrdcstStr, + ", $src2, $dst|$dst, $src2, ${src3}", BrdcstStr, "}"), + [(set RC:$dst, (OpNode RC:$src1, RC:$src2, + (OpVT (X86VBroadcast (scalar_mfrag addr:$src3)))))]>, EVEX_B; +} +} // Constraints = "$src1 = $dst" + +let ExeDomain = SSEPackedSingle in { + defm VFMADD213PSZ : avx512_fma3p_rm<0xA8, "vfmadd213ps", VR512, f512mem, + memopv16f32, f32mem, loadf32, "{1to16}", + X86Fmadd, v16f32>, EVEX_V512, + EVEX_CD8<32, CD8VF>; + defm VFMSUB213PSZ : avx512_fma3p_rm<0xAA, "vfmsub213ps", VR512, f512mem, + memopv16f32, f32mem, loadf32, "{1to16}", + X86Fmsub, v16f32>, EVEX_V512, + EVEX_CD8<32, CD8VF>; + defm VFMADDSUB213PSZ : avx512_fma3p_rm<0xA6, "vfmaddsub213ps", VR512, f512mem, + memopv16f32, f32mem, loadf32, "{1to16}", + X86Fmaddsub, v16f32>, + EVEX_V512, EVEX_CD8<32, CD8VF>; + defm VFMSUBADD213PSZ : avx512_fma3p_rm<0xA7, "vfmsubadd213ps", VR512, f512mem, + memopv16f32, f32mem, loadf32, "{1to16}", + X86Fmsubadd, v16f32>, + EVEX_V512, EVEX_CD8<32, CD8VF>; + defm VFNMADD213PSZ : avx512_fma3p_rm<0xAC, "vfnmadd213ps", VR512, f512mem, + memopv16f32, f32mem, loadf32, "{1to16}", + X86Fnmadd, v16f32>, EVEX_V512, + EVEX_CD8<32, CD8VF>; + defm VFNMSUB213PSZ : avx512_fma3p_rm<0xAE, "vfnmsub213ps", VR512, f512mem, + memopv16f32, f32mem, loadf32, "{1to16}", + X86Fnmsub, v16f32>, EVEX_V512, + EVEX_CD8<32, CD8VF>; +} +let ExeDomain = SSEPackedDouble in { + defm VFMADD213PDZ : avx512_fma3p_rm<0xA8, "vfmadd213pd", VR512, f512mem, + memopv8f64, f64mem, loadf64, "{1to8}", + X86Fmadd, v8f64>, EVEX_V512, + VEX_W, EVEX_CD8<64, CD8VF>; + defm VFMSUB213PDZ : avx512_fma3p_rm<0xAA, "vfmsub213pd", VR512, f512mem, + memopv8f64, f64mem, loadf64, "{1to8}", + X86Fmsub, v8f64>, EVEX_V512, VEX_W, + EVEX_CD8<64, CD8VF>; + defm VFMADDSUB213PDZ : avx512_fma3p_rm<0xA6, "vfmaddsub213pd", VR512, f512mem, + memopv8f64, f64mem, loadf64, "{1to8}", + X86Fmaddsub, v8f64>, EVEX_V512, VEX_W, + EVEX_CD8<64, CD8VF>; + defm VFMSUBADD213PDZ : avx512_fma3p_rm<0xA7, "vfmsubadd213pd", VR512, f512mem, + memopv8f64, f64mem, loadf64, "{1to8}", + X86Fmsubadd, v8f64>, EVEX_V512, VEX_W, + EVEX_CD8<64, CD8VF>; + defm VFNMADD213PDZ : avx512_fma3p_rm<0xAC, "vfnmadd213pd", VR512, f512mem, + memopv8f64, f64mem, loadf64, "{1to8}", + X86Fnmadd, v8f64>, EVEX_V512, VEX_W, + EVEX_CD8<64, CD8VF>; + defm VFNMSUB213PDZ : avx512_fma3p_rm<0xAE, "vfnmsub213pd", VR512, f512mem, + memopv8f64, f64mem, loadf64, "{1to8}", + X86Fnmsub, v8f64>, EVEX_V512, VEX_W, + EVEX_CD8<64, CD8VF>; +} + +let Constraints = "$src1 = $dst" in { +multiclass avx512_fma3p_m132<bits<8> opc, string OpcodeStr, + RegisterClass RC, X86MemOperand x86memop, + PatFrag mem_frag, X86MemOperand x86scalar_mop, PatFrag scalar_mfrag, + string BrdcstStr, SDNode OpNode, ValueType OpVT> { + let mayLoad = 1 in + def m: AVX512FMA3<opc, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, RC:$src3, x86memop:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src3, $dst|$dst, $src3, $src2}"), + [(set RC:$dst, (OpVT (OpNode RC:$src1, (mem_frag addr:$src2), RC:$src3)))]>; + def mb: AVX512FMA3<opc, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, RC:$src3, x86scalar_mop:$src2), + !strconcat(OpcodeStr, "\t{${src2}", BrdcstStr, + ", $src3, $dst|$dst, $src3, ${src2}", BrdcstStr, "}"), + [(set RC:$dst, (OpNode RC:$src1, + (OpVT (X86VBroadcast (scalar_mfrag addr:$src2))), RC:$src3))]>, EVEX_B; +} +} // Constraints = "$src1 = $dst" + + +let ExeDomain = SSEPackedSingle in { + defm VFMADD132PSZ : 
avx512_fma3p_m132<0x98, "vfmadd132ps", VR512, f512mem, + memopv16f32, f32mem, loadf32, "{1to16}", + X86Fmadd, v16f32>, EVEX_V512, + EVEX_CD8<32, CD8VF>; + defm VFMSUB132PSZ : avx512_fma3p_m132<0x9A, "vfmsub132ps", VR512, f512mem, + memopv16f32, f32mem, loadf32, "{1to16}", + X86Fmsub, v16f32>, EVEX_V512, + EVEX_CD8<32, CD8VF>; + defm VFMADDSUB132PSZ : avx512_fma3p_m132<0x96, "vfmaddsub132ps", VR512, f512mem, + memopv16f32, f32mem, loadf32, "{1to16}", + X86Fmaddsub, v16f32>, + EVEX_V512, EVEX_CD8<32, CD8VF>; + defm VFMSUBADD132PSZ : avx512_fma3p_m132<0x97, "vfmsubadd132ps", VR512, f512mem, + memopv16f32, f32mem, loadf32, "{1to16}", + X86Fmsubadd, v16f32>, + EVEX_V512, EVEX_CD8<32, CD8VF>; + defm VFNMADD132PSZ : avx512_fma3p_m132<0x9C, "vfnmadd132ps", VR512, f512mem, + memopv16f32, f32mem, loadf32, "{1to16}", + X86Fnmadd, v16f32>, EVEX_V512, + EVEX_CD8<32, CD8VF>; + defm VFNMSUB132PSZ : avx512_fma3p_m132<0x9E, "vfnmsub132ps", VR512, f512mem, + memopv16f32, f32mem, loadf32, "{1to16}", + X86Fnmsub, v16f32>, EVEX_V512, + EVEX_CD8<32, CD8VF>; +} +let ExeDomain = SSEPackedDouble in { + defm VFMADD132PDZ : avx512_fma3p_m132<0x98, "vfmadd132pd", VR512, f512mem, + memopv8f64, f64mem, loadf64, "{1to8}", + X86Fmadd, v8f64>, EVEX_V512, + VEX_W, EVEX_CD8<64, CD8VF>; + defm VFMSUB132PDZ : avx512_fma3p_m132<0x9A, "vfmsub132pd", VR512, f512mem, + memopv8f64, f64mem, loadf64, "{1to8}", + X86Fmsub, v8f64>, EVEX_V512, VEX_W, + EVEX_CD8<64, CD8VF>; + defm VFMADDSUB132PDZ : avx512_fma3p_m132<0x96, "vfmaddsub132pd", VR512, f512mem, + memopv8f64, f64mem, loadf64, "{1to8}", + X86Fmaddsub, v8f64>, EVEX_V512, VEX_W, + EVEX_CD8<64, CD8VF>; + defm VFMSUBADD132PDZ : avx512_fma3p_m132<0x97, "vfmsubadd132pd", VR512, f512mem, + memopv8f64, f64mem, loadf64, "{1to8}", + X86Fmsubadd, v8f64>, EVEX_V512, VEX_W, + EVEX_CD8<64, CD8VF>; + defm VFNMADD132PDZ : avx512_fma3p_m132<0x9C, "vfnmadd132pd", VR512, f512mem, + memopv8f64, f64mem, loadf64, "{1to8}", + X86Fnmadd, v8f64>, EVEX_V512, VEX_W, + EVEX_CD8<64, CD8VF>; + defm VFNMSUB132PDZ : avx512_fma3p_m132<0x9E, "vfnmsub132pd", VR512, f512mem, + memopv8f64, f64mem, loadf64, "{1to8}", + X86Fnmsub, v8f64>, EVEX_V512, VEX_W, + EVEX_CD8<64, CD8VF>; +} + +// Scalar FMA +let Constraints = "$src1 = $dst" in { +multiclass avx512_fma3s_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, + RegisterClass RC, ValueType OpVT, + X86MemOperand x86memop, Operand memop, + PatFrag mem_frag> { + let isCommutable = 1 in + def r : AVX512FMA3<opc, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2, RC:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + [(set RC:$dst, + (OpVT (OpNode RC:$src2, RC:$src1, RC:$src3)))]>; + let mayLoad = 1 in + def m : AVX512FMA3<opc, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, RC:$src2, f128mem:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + [(set RC:$dst, + (OpVT (OpNode RC:$src2, RC:$src1, + (mem_frag addr:$src3))))]>; +} + +} // Constraints = "$src1 = $dst" + +defm VFMADDSSZ : avx512_fma3s_rm<0xA9, "vfmadd213ss{z}", X86Fmadd, FR32X, + f32, f32mem, ssmem, loadf32>, EVEX_CD8<32, CD8VT1>; +defm VFMADDSDZ : avx512_fma3s_rm<0xA9, "vfmadd213sd{z}", X86Fmadd, FR64X, + f64, f64mem, sdmem, loadf64>, VEX_W, EVEX_CD8<64, CD8VT1>; +defm VFMSUBSSZ : avx512_fma3s_rm<0xAB, "vfmsub213ss{z}", X86Fmsub, FR32X, + f32, f32mem, ssmem, loadf32>, EVEX_CD8<32, CD8VT1>; +defm VFMSUBSDZ : avx512_fma3s_rm<0xAB, "vfmsub213sd{z}", X86Fmsub, FR64X, + f64, f64mem, sdmem, loadf64>, VEX_W, EVEX_CD8<64, CD8VT1>; +defm VFNMADDSSZ : 
avx512_fma3s_rm<0xAD, "vfnmadd213ss{z}", X86Fnmadd, FR32X, + f32, f32mem, ssmem, loadf32>, EVEX_CD8<32, CD8VT1>; +defm VFNMADDSDZ : avx512_fma3s_rm<0xAD, "vfnmadd213sd{z}", X86Fnmadd, FR64X, + f64, f64mem, sdmem, loadf64>, VEX_W, EVEX_CD8<64, CD8VT1>; +defm VFNMSUBSSZ : avx512_fma3s_rm<0xAF, "vfnmsub213ss{z}", X86Fnmsub, FR32X, + f32, f32mem, ssmem, loadf32>, EVEX_CD8<32, CD8VT1>; +defm VFNMSUBSDZ : avx512_fma3s_rm<0xAF, "vfnmsub213sd{z}", X86Fnmsub, FR64X, + f64, f64mem, sdmem, loadf64>, VEX_W, EVEX_CD8<64, CD8VT1>; + +//===----------------------------------------------------------------------===// +// AVX-512 Scalar convert from sign integer to float/double +//===----------------------------------------------------------------------===// + +multiclass avx512_vcvtsi<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC, + X86MemOperand x86memop, string asm> { +let neverHasSideEffects = 1 in { + def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src), + !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>, + EVEX_4V; + let mayLoad = 1 in + def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), + (ins DstRC:$src1, x86memop:$src), + !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>, + EVEX_4V; +} // neverHasSideEffects = 1 +} +let Predicates = [HasAVX512] in { +defm VCVTSI2SSZ : avx512_vcvtsi<0x2A, GR32, FR32X, i32mem, "cvtsi2ss{l}{z}">, + XS, VEX_LIG, EVEX_CD8<32, CD8VT1>; +defm VCVTSI642SSZ : avx512_vcvtsi<0x2A, GR64, FR32X, i64mem, "cvtsi2ss{q}{z}">, + XS, VEX_W, VEX_LIG, EVEX_CD8<64, CD8VT1>; +defm VCVTSI2SDZ : avx512_vcvtsi<0x2A, GR32, FR64X, i32mem, "cvtsi2sd{l}{z}">, + XD, VEX_LIG, EVEX_CD8<32, CD8VT1>; +defm VCVTSI642SDZ : avx512_vcvtsi<0x2A, GR64, FR64X, i64mem, "cvtsi2sd{q}{z}">, + XD, VEX_W, VEX_LIG, EVEX_CD8<64, CD8VT1>; + +def : Pat<(f32 (sint_to_fp (loadi32 addr:$src))), + (VCVTSI2SSZrm (f32 (IMPLICIT_DEF)), addr:$src)>; +def : Pat<(f32 (sint_to_fp (loadi64 addr:$src))), + (VCVTSI642SSZrm (f32 (IMPLICIT_DEF)), addr:$src)>; +def : Pat<(f64 (sint_to_fp (loadi32 addr:$src))), + (VCVTSI2SDZrm (f64 (IMPLICIT_DEF)), addr:$src)>; +def : Pat<(f64 (sint_to_fp (loadi64 addr:$src))), + (VCVTSI642SDZrm (f64 (IMPLICIT_DEF)), addr:$src)>; + +def : Pat<(f32 (sint_to_fp GR32:$src)), + (VCVTSI2SSZrr (f32 (IMPLICIT_DEF)), GR32:$src)>; +def : Pat<(f32 (sint_to_fp GR64:$src)), + (VCVTSI642SSZrr (f32 (IMPLICIT_DEF)), GR64:$src)>; +def : Pat<(f64 (sint_to_fp GR32:$src)), + (VCVTSI2SDZrr (f64 (IMPLICIT_DEF)), GR32:$src)>; +def : Pat<(f64 (sint_to_fp GR64:$src)), + (VCVTSI642SDZrr (f64 (IMPLICIT_DEF)), GR64:$src)>; + +defm VCVTUSI2SSZ : avx512_vcvtsi<0x7B, GR32, FR32X, i32mem, "cvtusi2ss{l}{z}">, + XS, VEX_LIG, EVEX_CD8<32, CD8VT1>; +defm VCVTUSI642SSZ : avx512_vcvtsi<0x7B, GR64, FR32X, i64mem, "cvtusi2ss{q}{z}">, + XS, VEX_W, VEX_LIG, EVEX_CD8<64, CD8VT1>; +defm VCVTUSI2SDZ : avx512_vcvtsi<0x7B, GR32, FR64X, i32mem, "cvtusi2sd{l}{z}">, + XD, VEX_LIG, EVEX_CD8<32, CD8VT1>; +defm VCVTUSI642SDZ : avx512_vcvtsi<0x7B, GR64, FR64X, i64mem, "cvtusi2sd{q}{z}">, + XD, VEX_W, VEX_LIG, EVEX_CD8<64, CD8VT1>; + +def : Pat<(f32 (uint_to_fp (loadi32 addr:$src))), + (VCVTUSI2SSZrm (f32 (IMPLICIT_DEF)), addr:$src)>; +def : Pat<(f32 (uint_to_fp (loadi64 addr:$src))), + (VCVTUSI642SSZrm (f32 (IMPLICIT_DEF)), addr:$src)>; +def : Pat<(f64 (uint_to_fp (loadi32 addr:$src))), + (VCVTUSI2SDZrm (f64 (IMPLICIT_DEF)), addr:$src)>; +def : Pat<(f64 (uint_to_fp (loadi64 addr:$src))), + (VCVTUSI642SDZrm (f64 (IMPLICIT_DEF)), addr:$src)>; + +def : Pat<(f32 (uint_to_fp GR32:$src)), + (VCVTUSI2SSZrr 
(f32 (IMPLICIT_DEF)), GR32:$src)>; +def : Pat<(f32 (uint_to_fp GR64:$src)), + (VCVTUSI642SSZrr (f32 (IMPLICIT_DEF)), GR64:$src)>; +def : Pat<(f64 (uint_to_fp GR32:$src)), + (VCVTUSI2SDZrr (f64 (IMPLICIT_DEF)), GR32:$src)>; +def : Pat<(f64 (uint_to_fp GR64:$src)), + (VCVTUSI642SDZrr (f64 (IMPLICIT_DEF)), GR64:$src)>; +} + +//===----------------------------------------------------------------------===// +// AVX-512 Scalar convert from float/double to integer +//===----------------------------------------------------------------------===// +multiclass avx512_cvt_s_int<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC, + Intrinsic Int, Operand memop, ComplexPattern mem_cpat, + string asm> { +let neverHasSideEffects = 1 in { + def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), + !strconcat(asm,"\t{$src, $dst|$dst, $src}"), + [(set DstRC:$dst, (Int SrcRC:$src))]>, EVEX, VEX_LIG; + let mayLoad = 1 in + def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins memop:$src), + !strconcat(asm,"\t{$src, $dst|$dst, $src}"), []>, EVEX, VEX_LIG; +} // neverHasSideEffects = 1 +} +let Predicates = [HasAVX512] in { +// Convert float/double to signed/unsigned int 32/64 +defm VCVTSS2SIZ: avx512_cvt_s_int<0x2D, VR128X, GR32, int_x86_sse_cvtss2si, + ssmem, sse_load_f32, "cvtss2si{z}">, + XS, EVEX_CD8<32, CD8VT1>; +defm VCVTSS2SI64Z: avx512_cvt_s_int<0x2D, VR128X, GR64, int_x86_sse_cvtss2si64, + ssmem, sse_load_f32, "cvtss2si{z}">, + XS, VEX_W, EVEX_CD8<32, CD8VT1>; +defm VCVTSS2USIZ: avx512_cvt_s_int<0x79, VR128X, GR32, int_x86_avx512_cvtss2usi, + ssmem, sse_load_f32, "cvtss2usi{z}">, + XS, EVEX_CD8<32, CD8VT1>; +defm VCVTSS2USI64Z: avx512_cvt_s_int<0x79, VR128X, GR64, + int_x86_avx512_cvtss2usi64, ssmem, + sse_load_f32, "cvtss2usi{z}">, XS, VEX_W, + EVEX_CD8<32, CD8VT1>; +defm VCVTSD2SIZ: avx512_cvt_s_int<0x2D, VR128X, GR32, int_x86_sse2_cvtsd2si, + sdmem, sse_load_f64, "cvtsd2si{z}">, + XD, EVEX_CD8<64, CD8VT1>; +defm VCVTSD2SI64Z: avx512_cvt_s_int<0x2D, VR128X, GR64, int_x86_sse2_cvtsd2si64, + sdmem, sse_load_f64, "cvtsd2si{z}">, + XD, VEX_W, EVEX_CD8<64, CD8VT1>; +defm VCVTSD2USIZ: avx512_cvt_s_int<0x79, VR128X, GR32, int_x86_avx512_cvtsd2usi, + sdmem, sse_load_f64, "cvtsd2usi{z}">, + XD, EVEX_CD8<64, CD8VT1>; +defm VCVTSD2USI64Z: avx512_cvt_s_int<0x79, VR128X, GR64, + int_x86_avx512_cvtsd2usi64, sdmem, + sse_load_f64, "cvtsd2usi{z}">, XD, VEX_W, + EVEX_CD8<64, CD8VT1>; + +defm Int_VCVTSI2SSZ : sse12_cvt_sint_3addr<0x2A, GR32, VR128X, + int_x86_sse_cvtsi2ss, i32mem, loadi32, "cvtsi2ss{l}{z}", + SSE_CVT_Scalar, 0>, XS, EVEX_4V; +defm Int_VCVTSI2SS64Z : sse12_cvt_sint_3addr<0x2A, GR64, VR128X, + int_x86_sse_cvtsi642ss, i64mem, loadi64, "cvtsi2ss{q}{z}", + SSE_CVT_Scalar, 0>, XS, EVEX_4V, VEX_W; +defm Int_VCVTSI2SDZ : sse12_cvt_sint_3addr<0x2A, GR32, VR128X, + int_x86_sse2_cvtsi2sd, i32mem, loadi32, "cvtsi2sd{l}{z}", + SSE_CVT_Scalar, 0>, XD, EVEX_4V; +defm Int_VCVTSI2SD64Z : sse12_cvt_sint_3addr<0x2A, GR64, VR128X, + int_x86_sse2_cvtsi642sd, i64mem, loadi64, "cvtsi2sd{q}{z}", + SSE_CVT_Scalar, 0>, XD, EVEX_4V, VEX_W; + +defm Int_VCVTUSI2SSZ : sse12_cvt_sint_3addr<0x2A, GR32, VR128X, + int_x86_avx512_cvtusi2ss, i32mem, loadi32, "cvtusi2ss{l}{z}", + SSE_CVT_Scalar, 0>, XS, EVEX_4V; +defm Int_VCVTUSI2SS64Z : sse12_cvt_sint_3addr<0x2A, GR64, VR128X, + int_x86_avx512_cvtusi642ss, i64mem, loadi64, "cvtusi2ss{q}{z}", + SSE_CVT_Scalar, 0>, XS, EVEX_4V, VEX_W; +defm Int_VCVTUSI2SDZ : sse12_cvt_sint_3addr<0x2A, GR32, VR128X, + int_x86_avx512_cvtusi2sd, i32mem, loadi32, "cvtusi2sd{l}{z}", + SSE_CVT_Scalar, 
0>, XD, EVEX_4V; +defm Int_VCVTUSI2SD64Z : sse12_cvt_sint_3addr<0x2A, GR64, VR128X, + int_x86_avx512_cvtusi642sd, i64mem, loadi64, "cvtusi2sd{q}{z}", + SSE_CVT_Scalar, 0>, XD, EVEX_4V, VEX_W; + +// Convert float/double to signed/unsigned int 32/64 with truncation +defm Int_VCVTTSS2SIZ : avx512_cvt_s_int<0x2C, VR128X, GR32, int_x86_sse_cvttss2si, + ssmem, sse_load_f32, "cvttss2si{z}">, + XS, EVEX_CD8<32, CD8VT1>; +defm Int_VCVTTSS2SI64Z : avx512_cvt_s_int<0x2C, VR128X, GR64, + int_x86_sse_cvttss2si64, ssmem, sse_load_f32, + "cvttss2si{z}">, XS, VEX_W, + EVEX_CD8<32, CD8VT1>; +defm Int_VCVTTSD2SIZ : avx512_cvt_s_int<0x2C, VR128X, GR32, int_x86_sse2_cvttsd2si, + sdmem, sse_load_f64, "cvttsd2si{z}">, XD, + EVEX_CD8<64, CD8VT1>; +defm Int_VCVTTSD2SI64Z : avx512_cvt_s_int<0x2C, VR128X, GR64, + int_x86_sse2_cvttsd2si64, sdmem, sse_load_f64, + "cvttsd2si{z}">, XD, VEX_W, + EVEX_CD8<64, CD8VT1>; +defm Int_VCVTTSS2USIZ : avx512_cvt_s_int<0x78, VR128X, GR32, + int_x86_avx512_cvttss2usi, ssmem, sse_load_f32, + "cvttss2usi{z}">, XS, EVEX_CD8<32, CD8VT1>; +defm Int_VCVTTSS2USI64Z : avx512_cvt_s_int<0x78, VR128X, GR64, + int_x86_avx512_cvttss2usi64, ssmem, + sse_load_f32, "cvttss2usi{z}">, XS, VEX_W, + EVEX_CD8<32, CD8VT1>; +defm Int_VCVTTSD2USIZ : avx512_cvt_s_int<0x78, VR128X, GR32, + int_x86_avx512_cvttsd2usi, + sdmem, sse_load_f64, "cvttsd2usi{z}">, XD, + EVEX_CD8<64, CD8VT1>; +defm Int_VCVTTSD2USI64Z : avx512_cvt_s_int<0x78, VR128X, GR64, + int_x86_avx512_cvttsd2usi64, sdmem, + sse_load_f64, "cvttsd2usi{z}">, XD, VEX_W, + EVEX_CD8<64, CD8VT1>; +} + +multiclass avx512_cvt_s<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC, + SDNode OpNode, X86MemOperand x86memop, PatFrag ld_frag, + string asm> { + def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), + !strconcat(asm,"\t{$src, $dst|$dst, $src}"), + [(set DstRC:$dst, (OpNode SrcRC:$src))]>, EVEX; + def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), + !strconcat(asm,"\t{$src, $dst|$dst, $src}"), + [(set DstRC:$dst, (OpNode (ld_frag addr:$src)))]>, EVEX; +} + +defm VCVTTSS2SIZ : avx512_cvt_s<0x2C, FR32X, GR32, fp_to_sint, f32mem, + loadf32, "cvttss2si{z}">, XS, + EVEX_CD8<32, CD8VT1>; +defm VCVTTSS2USIZ : avx512_cvt_s<0x78, FR32X, GR32, fp_to_uint, f32mem, + loadf32, "cvttss2usi{z}">, XS, + EVEX_CD8<32, CD8VT1>; +defm VCVTTSS2SI64Z : avx512_cvt_s<0x2C, FR32X, GR64, fp_to_sint, f32mem, + loadf32, "cvttss2si{z}">, XS, VEX_W, + EVEX_CD8<32, CD8VT1>; +defm VCVTTSS2USI64Z : avx512_cvt_s<0x78, FR32X, GR64, fp_to_uint, f32mem, + loadf32, "cvttss2usi{z}">, XS, VEX_W, + EVEX_CD8<32, CD8VT1>; +defm VCVTTSD2SIZ : avx512_cvt_s<0x2C, FR64X, GR32, fp_to_sint, f64mem, + loadf64, "cvttsd2si{z}">, XD, + EVEX_CD8<64, CD8VT1>; +defm VCVTTSD2USIZ : avx512_cvt_s<0x78, FR64X, GR32, fp_to_uint, f64mem, + loadf64, "cvttsd2usi{z}">, XD, + EVEX_CD8<64, CD8VT1>; +defm VCVTTSD2SI64Z : avx512_cvt_s<0x2C, FR64X, GR64, fp_to_sint, f64mem, + loadf64, "cvttsd2si{z}">, XD, VEX_W, + EVEX_CD8<64, CD8VT1>; +defm VCVTTSD2USI64Z : avx512_cvt_s<0x78, FR64X, GR64, fp_to_uint, f64mem, + loadf64, "cvttsd2usi{z}">, XD, VEX_W, + EVEX_CD8<64, CD8VT1>; +//===----------------------------------------------------------------------===// +// AVX-512 Convert from float to double and back +//===----------------------------------------------------------------------===// +let neverHasSideEffects = 1 in { +def VCVTSS2SDZrr : AVX512XSI<0x5A, MRMSrcReg, (outs FR64X:$dst), + (ins FR32X:$src1, FR32X:$src2), + "vcvtss2sd{z}\t{$src2, $src1, $dst|$dst, $src1, $src2}", + []>, EVEX_4V, 
VEX_LIG, Sched<[WriteCvtF2F]>; +let mayLoad = 1 in +def VCVTSS2SDZrm : AVX512XSI<0x5A, MRMSrcMem, (outs FR64X:$dst), + (ins FR32X:$src1, f32mem:$src2), + "vcvtss2sd{z}\t{$src2, $src1, $dst|$dst, $src1, $src2}", + []>, EVEX_4V, VEX_LIG, Sched<[WriteCvtF2FLd, ReadAfterLd]>, + EVEX_CD8<32, CD8VT1>; + +// Convert scalar double to scalar single +def VCVTSD2SSZrr : AVX512XDI<0x5A, MRMSrcReg, (outs FR32X:$dst), + (ins FR64X:$src1, FR64X:$src2), + "vcvtsd2ss{z}\t{$src2, $src1, $dst|$dst, $src1, $src2}", + []>, EVEX_4V, VEX_LIG, VEX_W, Sched<[WriteCvtF2F]>; +let mayLoad = 1 in +def VCVTSD2SSZrm : AVX512XDI<0x5A, MRMSrcMem, (outs FR32X:$dst), + (ins FR64X:$src1, f64mem:$src2), + "vcvtsd2ss{z}\t{$src2, $src1, $dst|$dst, $src1, $src2}", + []>, EVEX_4V, VEX_LIG, VEX_W, + Sched<[WriteCvtF2FLd, ReadAfterLd]>, EVEX_CD8<64, CD8VT1>; +} + +def : Pat<(f64 (fextend FR32X:$src)), (VCVTSS2SDZrr FR32X:$src, FR32X:$src)>, + Requires<[HasAVX512]>; +def : Pat<(fextend (loadf32 addr:$src)), + (VCVTSS2SDZrm (f32 (IMPLICIT_DEF)), addr:$src)>, Requires<[HasAVX512]>; + +def : Pat<(extloadf32 addr:$src), + (VCVTSS2SDZrm (f32 (IMPLICIT_DEF)), addr:$src)>, + Requires<[HasAVX512, OptForSize]>; + +def : Pat<(extloadf32 addr:$src), + (VCVTSS2SDZrr (f32 (IMPLICIT_DEF)), (VMOVSSZrm addr:$src))>, + Requires<[HasAVX512, OptForSpeed]>; + +def : Pat<(f32 (fround FR64X:$src)), (VCVTSD2SSZrr FR64X:$src, FR64X:$src)>, + Requires<[HasAVX512]>; + +multiclass avx512_vcvt_fp<bits<8> opc, string asm, RegisterClass SrcRC, + RegisterClass DstRC, SDNode OpNode, PatFrag mem_frag, + X86MemOperand x86memop, ValueType OpVT, ValueType InVT, + Domain d> { +let neverHasSideEffects = 1 in { + def rr : AVX512PI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), + !strconcat(asm,"\t{$src, $dst|$dst, $src}"), + [(set DstRC:$dst, + (OpVT (OpNode (InVT SrcRC:$src))))], d>, EVEX; + let mayLoad = 1 in + def rm : AVX512PI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), + !strconcat(asm,"\t{$src, $dst|$dst, $src}"), + [(set DstRC:$dst, + (OpVT (OpNode (InVT (bitconvert (mem_frag addr:$src))))))], d>, EVEX; +} // neverHasSideEffects = 1 +} + +defm VCVTPD2PSZ : avx512_vcvt_fp<0x5A, "vcvtpd2ps", VR512, VR256X, fround, + memopv8f64, f512mem, v8f32, v8f64, + SSEPackedSingle>, EVEX_V512, VEX_W, OpSize, + EVEX_CD8<64, CD8VF>; + +defm VCVTPS2PDZ : avx512_vcvt_fp<0x5A, "vcvtps2pd", VR256X, VR512, fextend, + memopv4f64, f256mem, v8f64, v8f32, + SSEPackedDouble>, EVEX_V512, EVEX_CD8<32, CD8VH>; +def : Pat<(v8f64 (extloadv8f32 addr:$src)), + (VCVTPS2PDZrm addr:$src)>; + +//===----------------------------------------------------------------------===// +// AVX-512 Vector convert from sign integer to float/double +//===----------------------------------------------------------------------===// + +defm VCVTDQ2PSZ : avx512_vcvt_fp<0x5B, "vcvtdq2ps", VR512, VR512, sint_to_fp, + memopv8i64, i512mem, v16f32, v16i32, + SSEPackedSingle>, EVEX_V512, EVEX_CD8<32, CD8VF>; + +defm VCVTDQ2PDZ : avx512_vcvt_fp<0xE6, "vcvtdq2pd", VR256X, VR512, sint_to_fp, + memopv4i64, i256mem, v8f64, v8i32, + SSEPackedDouble>, EVEX_V512, XS, + EVEX_CD8<32, CD8VH>; + +defm VCVTTPS2DQZ : avx512_vcvt_fp<0x5B, "vcvttps2dq", VR512, VR512, fp_to_sint, + memopv16f32, f512mem, v16i32, v16f32, + SSEPackedSingle>, EVEX_V512, XS, + EVEX_CD8<32, CD8VF>; + +defm VCVTTPD2DQZ : avx512_vcvt_fp<0xE6, "vcvttpd2dq", VR512, VR256X, fp_to_sint, + memopv8f64, f512mem, v8i32, v8f64, + SSEPackedDouble>, EVEX_V512, OpSize, VEX_W, + EVEX_CD8<64, CD8VF>; + +defm VCVTTPS2UDQZ : avx512_vcvt_fp<0x78, "vcvttps2udq", VR512, 
VR512, fp_to_uint, + memopv16f32, f512mem, v16i32, v16f32, + SSEPackedSingle>, EVEX_V512, + EVEX_CD8<32, CD8VF>; + +defm VCVTTPD2UDQZ : avx512_vcvt_fp<0x78, "vcvttpd2udq", VR512, VR256X, fp_to_uint, + memopv8f64, f512mem, v8i32, v8f64, + SSEPackedDouble>, EVEX_V512, VEX_W, + EVEX_CD8<64, CD8VF>; + +defm VCVTUDQ2PDZ : avx512_vcvt_fp<0x7A, "vcvtudq2pd", VR256X, VR512, uint_to_fp, + memopv4i64, f256mem, v8f64, v8i32, + SSEPackedDouble>, EVEX_V512, XS, + EVEX_CD8<32, CD8VH>; + +defm VCVTUDQ2PSZ : avx512_vcvt_fp<0x7A, "vcvtudq2ps", VR512, VR512, uint_to_fp, + memopv16i32, f512mem, v16f32, v16i32, + SSEPackedSingle>, EVEX_V512, XD, + EVEX_CD8<32, CD8VF>; + +def : Pat<(v8i32 (fp_to_uint (v8f32 VR256X:$src1))), + (EXTRACT_SUBREG (v16i32 (VCVTTPS2UDQZrr + (v16f32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)))), sub_ymm)>; + + +def : Pat<(int_x86_avx512_cvtdq2_ps_512 VR512:$src), + (VCVTDQ2PSZrr VR512:$src)>; +def : Pat<(int_x86_avx512_cvtdq2_ps_512 (bitconvert (memopv8i64 addr:$src))), + (VCVTDQ2PSZrm addr:$src)>; + +def VCVTPS2DQZrr : AVX512BI<0x5B, MRMSrcReg, (outs VR512:$dst), (ins VR512:$src), + "vcvtps2dq\t{$src, $dst|$dst, $src}", + [(set VR512:$dst, + (int_x86_avx512_cvt_ps2dq_512 VR512:$src))], + IIC_SSE_CVT_PS_RR>, EVEX, EVEX_V512; +def VCVTPS2DQZrm : AVX512BI<0x5B, MRMSrcMem, (outs VR512:$dst), (ins f512mem:$src), + "vcvtps2dq\t{$src, $dst|$dst, $src}", + [(set VR512:$dst, + (int_x86_avx512_cvt_ps2dq_512 (memopv16f32 addr:$src)))], + IIC_SSE_CVT_PS_RM>, EVEX, EVEX_V512, EVEX_CD8<32, CD8VF>; + + +let Predicates = [HasAVX512] in { + def : Pat<(v8f32 (fround (loadv8f64 addr:$src))), + (VCVTPD2PSZrm addr:$src)>; + def : Pat<(v8f64 (extloadv8f32 addr:$src)), + (VCVTPS2PDZrm addr:$src)>; +} + +//===----------------------------------------------------------------------===// +// Half precision conversion instructions +//===----------------------------------------------------------------------===// +multiclass avx512_f16c_ph2ps<RegisterClass destRC, RegisterClass srcRC, + X86MemOperand x86memop, Intrinsic Int> { + def rr : AVX5128I<0x13, MRMSrcReg, (outs destRC:$dst), (ins srcRC:$src), + "vcvtph2ps\t{$src, $dst|$dst, $src}", + [(set destRC:$dst, (Int srcRC:$src))]>, EVEX; + let neverHasSideEffects = 1, mayLoad = 1 in + def rm : AVX5128I<0x13, MRMSrcMem, (outs destRC:$dst), (ins x86memop:$src), + "vcvtph2ps\t{$src, $dst|$dst, $src}", []>, EVEX; +} + +multiclass avx512_f16c_ps2ph<RegisterClass destRC, RegisterClass srcRC, + X86MemOperand x86memop, Intrinsic Int> { + def rr : AVX512AIi8<0x1D, MRMDestReg, (outs destRC:$dst), + (ins srcRC:$src1, i32i8imm:$src2), + "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set destRC:$dst, (Int srcRC:$src1, imm:$src2))]>, EVEX; + let neverHasSideEffects = 1, mayStore = 1 in + def mr : AVX512AIi8<0x1D, MRMDestMem, (outs), + (ins x86memop:$dst, srcRC:$src1, i32i8imm:$src2), + "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, EVEX; +} + +defm VCVTPH2PSZ : avx512_f16c_ph2ps<VR512, VR256X, f256mem, + int_x86_avx512_vcvtph2ps_512>, EVEX_V512, + EVEX_CD8<32, CD8VH>; +defm VCVTPS2PHZ : avx512_f16c_ps2ph<VR256X, VR512, f256mem, + int_x86_avx512_vcvtps2ph_512>, EVEX_V512, + EVEX_CD8<32, CD8VH>; + +let Defs = [EFLAGS], Predicates = [HasAVX512] in { + defm VUCOMISSZ : sse12_ord_cmp<0x2E, FR32X, X86cmp, f32, f32mem, loadf32, + "ucomiss{z}">, TB, EVEX, VEX_LIG, + EVEX_CD8<32, CD8VT1>; + defm VUCOMISDZ : sse12_ord_cmp<0x2E, FR64X, X86cmp, f64, f64mem, loadf64, + "ucomisd{z}">, TB, OpSize, EVEX, + VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>; + let Pattern = []<dag> 
in { + defm VCOMISSZ : sse12_ord_cmp<0x2F, VR128X, undef, v4f32, f128mem, load, + "comiss{z}">, TB, EVEX, VEX_LIG, + EVEX_CD8<32, CD8VT1>; + defm VCOMISDZ : sse12_ord_cmp<0x2F, VR128X, undef, v2f64, f128mem, load, + "comisd{z}">, TB, OpSize, EVEX, + VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>; + } + defm Int_VUCOMISSZ : sse12_ord_cmp<0x2E, VR128X, X86ucomi, v4f32, f128mem, + load, "ucomiss">, TB, EVEX, VEX_LIG, + EVEX_CD8<32, CD8VT1>; + defm Int_VUCOMISDZ : sse12_ord_cmp<0x2E, VR128X, X86ucomi, v2f64, f128mem, + load, "ucomisd">, TB, OpSize, EVEX, + VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>; + + defm Int_VCOMISSZ : sse12_ord_cmp<0x2F, VR128X, X86comi, v4f32, f128mem, + load, "comiss">, TB, EVEX, VEX_LIG, + EVEX_CD8<32, CD8VT1>; + defm Int_VCOMISDZ : sse12_ord_cmp<0x2F, VR128X, X86comi, v2f64, f128mem, + load, "comisd">, TB, OpSize, EVEX, + VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>; +} + +/// avx512_unop_p - AVX-512 unops in packed form. +multiclass avx512_fp_unop_p<bits<8> opc, string OpcodeStr, SDNode OpNode> { + def PSZr : AVX5128I<opc, MRMSrcReg, (outs VR512:$dst), (ins VR512:$src), + !strconcat(OpcodeStr, + "ps\t{$src, $dst|$dst, $src}"), + [(set VR512:$dst, (v16f32 (OpNode VR512:$src)))]>, + EVEX, EVEX_V512; + def PSZm : AVX5128I<opc, MRMSrcMem, (outs VR512:$dst), (ins f256mem:$src), + !strconcat(OpcodeStr, + "ps\t{$src, $dst|$dst, $src}"), + [(set VR512:$dst, (OpNode (memopv16f32 addr:$src)))]>, + EVEX, EVEX_V512, EVEX_CD8<32, CD8VF>; + def PDZr : AVX5128I<opc, MRMSrcReg, (outs VR512:$dst), (ins VR512:$src), + !strconcat(OpcodeStr, + "pd\t{$src, $dst|$dst, $src}"), + [(set VR512:$dst, (v8f64 (OpNode VR512:$src)))]>, + EVEX, EVEX_V512, VEX_W; + def PDZm : AVX5128I<opc, MRMSrcMem, (outs VR512:$dst), (ins f512mem:$src), + !strconcat(OpcodeStr, + "pd\t{$src, $dst|$dst, $src}"), + [(set VR512:$dst, (OpNode (memopv16f32 addr:$src)))]>, + EVEX, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; +} + +/// avx512_fp_unop_p_int - AVX-512 intrinsics unops in packed forms. +multiclass avx512_fp_unop_p_int<bits<8> opc, string OpcodeStr, + Intrinsic V16F32Int, Intrinsic V8F64Int> { + def PSZr_Int : AVX5128I<opc, MRMSrcReg, (outs VR512:$dst), (ins VR512:$src), + !strconcat(OpcodeStr, + "ps\t{$src, $dst|$dst, $src}"), + [(set VR512:$dst, (V16F32Int VR512:$src))]>, + EVEX, EVEX_V512; + def PSZm_Int : AVX5128I<opc, MRMSrcMem, (outs VR512:$dst), (ins f512mem:$src), + !strconcat(OpcodeStr, + "ps\t{$src, $dst|$dst, $src}"), + [(set VR512:$dst, + (V16F32Int (memopv16f32 addr:$src)))]>, EVEX, + EVEX_V512, EVEX_CD8<32, CD8VF>; + def PDZr_Int : AVX5128I<opc, MRMSrcReg, (outs VR512:$dst), (ins VR512:$src), + !strconcat(OpcodeStr, + "pd\t{$src, $dst|$dst, $src}"), + [(set VR512:$dst, (V8F64Int VR512:$src))]>, + EVEX, EVEX_V512, VEX_W; + def PDZm_Int : AVX5128I<opc, MRMSrcMem, (outs VR512:$dst), (ins f512mem:$src), + !strconcat(OpcodeStr, + "pd\t{$src, $dst|$dst, $src}"), + [(set VR512:$dst, + (V8F64Int (memopv8f64 addr:$src)))]>, + EVEX, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; +} + +/// avx512_fp_unop_s - AVX-512 unops in scalar form. 
+multiclass avx512_fp_unop_s<bits<8> opc, string OpcodeStr> { + let hasSideEffects = 0 in { + def SSZr : AVX5128I<opc, MRMSrcReg, (outs FR32X:$dst), + (ins FR32X:$src1, FR32X:$src2), + !strconcat(OpcodeStr, + "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + []>, EVEX_4V; + let mayLoad = 1 in { + def SSZm : AVX5128I<opc, MRMSrcMem, (outs FR32X:$dst), + (ins FR32X:$src1, f32mem:$src2), + !strconcat(OpcodeStr, + "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + []>, EVEX_4V, EVEX_CD8<32, CD8VT1>; + def SSZm_Int : AVX5128I<opc, MRMSrcMem, (outs VR128X:$dst), + (ins VR128X:$src1, ssmem:$src2), + !strconcat(OpcodeStr, + "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + []>, EVEX_4V, EVEX_CD8<32, CD8VT1>; + } + def SDZr : AVX5128I<opc, MRMSrcReg, (outs FR64X:$dst), + (ins FR64X:$src1, FR64X:$src2), + !strconcat(OpcodeStr, + "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, + EVEX_4V, VEX_W; + let mayLoad = 1 in { + def SDZm : AVX5128I<opc, MRMSrcMem, (outs FR64X:$dst), + (ins FR64X:$src1, f64mem:$src2), + !strconcat(OpcodeStr, + "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, + EVEX_4V, VEX_W, EVEX_CD8<64, CD8VT1>; + def SDZm_Int : AVX5128I<opc, MRMSrcMem, (outs VR128X:$dst), + (ins VR128X:$src1, sdmem:$src2), + !strconcat(OpcodeStr, + "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + []>, EVEX_4V, VEX_W, EVEX_CD8<64, CD8VT1>; + } +} +} + +defm VRCP14 : avx512_fp_unop_s<0x4D, "vrcp14">, + avx512_fp_unop_p<0x4C, "vrcp14", X86frcp>, + avx512_fp_unop_p_int<0x4C, "vrcp14", + int_x86_avx512_rcp14_ps_512, int_x86_avx512_rcp14_pd_512>; + +defm VRSQRT14 : avx512_fp_unop_s<0x4F, "vrsqrt14">, + avx512_fp_unop_p<0x4E, "vrsqrt14", X86frsqrt>, + avx512_fp_unop_p_int<0x4E, "vrsqrt14", + int_x86_avx512_rsqrt14_ps_512, int_x86_avx512_rsqrt14_pd_512>; + +def : Pat<(int_x86_avx512_rsqrt14_ss VR128X:$src), + (COPY_TO_REGCLASS (VRSQRT14SSZr (f32 (IMPLICIT_DEF)), + (COPY_TO_REGCLASS VR128X:$src, FR32)), + VR128X)>; +def : Pat<(int_x86_avx512_rsqrt14_ss sse_load_f32:$src), + (VRSQRT14SSZm_Int (v4f32 (IMPLICIT_DEF)), sse_load_f32:$src)>; + +def : Pat<(int_x86_avx512_rcp14_ss VR128X:$src), + (COPY_TO_REGCLASS (VRCP14SSZr (f32 (IMPLICIT_DEF)), + (COPY_TO_REGCLASS VR128X:$src, FR32)), + VR128X)>; +def : Pat<(int_x86_avx512_rcp14_ss sse_load_f32:$src), + (VRCP14SSZm_Int (v4f32 (IMPLICIT_DEF)), sse_load_f32:$src)>; + +let AddedComplexity = 20, Predicates = [HasERI] in { +defm VRCP28 : avx512_fp_unop_s<0xCB, "vrcp28">, + avx512_fp_unop_p<0xCA, "vrcp28", X86frcp>, + avx512_fp_unop_p_int<0xCA, "vrcp28", + int_x86_avx512_rcp28_ps_512, int_x86_avx512_rcp28_pd_512>; + +defm VRSQRT28 : avx512_fp_unop_s<0xCD, "vrsqrt28">, + avx512_fp_unop_p<0xCC, "vrsqrt28", X86frsqrt>, + avx512_fp_unop_p_int<0xCC, "vrsqrt28", + int_x86_avx512_rsqrt28_ps_512, int_x86_avx512_rsqrt28_pd_512>; +} + +let Predicates = [HasERI] in { + def : Pat<(int_x86_avx512_rsqrt28_ss VR128X:$src), + (COPY_TO_REGCLASS (VRSQRT28SSZr (f32 (IMPLICIT_DEF)), + (COPY_TO_REGCLASS VR128X:$src, FR32)), + VR128X)>; + def : Pat<(int_x86_avx512_rsqrt28_ss sse_load_f32:$src), + (VRSQRT28SSZm_Int (v4f32 (IMPLICIT_DEF)), sse_load_f32:$src)>; + + def : Pat<(int_x86_avx512_rcp28_ss VR128X:$src), + (COPY_TO_REGCLASS (VRCP28SSZr (f32 (IMPLICIT_DEF)), + (COPY_TO_REGCLASS VR128X:$src, FR32)), + VR128X)>; + def : Pat<(int_x86_avx512_rcp28_ss sse_load_f32:$src), + (VRCP28SSZm_Int (v4f32 (IMPLICIT_DEF)), sse_load_f32:$src)>; +} +multiclass avx512_sqrt_packed<bits<8> opc, string OpcodeStr, SDNode OpNode, + Intrinsic V16F32Int, Intrinsic V8F64Int, + OpndItins itins_s, OpndItins 
itins_d> { + def PSZrr :AVX512PSI<opc, MRMSrcReg, (outs VR512:$dst), (ins VR512:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR512:$dst, (v16f32 (OpNode VR512:$src)))], itins_s.rr>, + EVEX, EVEX_V512; + + let mayLoad = 1 in + def PSZrm : AVX512PSI<opc, MRMSrcMem, (outs VR512:$dst), (ins f512mem:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR512:$dst, + (OpNode (v16f32 (bitconvert (memopv16f32 addr:$src)))))], + itins_s.rm>, EVEX, EVEX_V512, EVEX_CD8<32, CD8VF>; + + def PDZrr : AVX512PDI<opc, MRMSrcReg, (outs VR512:$dst), (ins VR512:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR512:$dst, (v8f64 (OpNode VR512:$src)))], itins_d.rr>, + EVEX, EVEX_V512; + + let mayLoad = 1 in + def PDZrm : AVX512PDI<opc, MRMSrcMem, (outs VR512:$dst), (ins f512mem:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR512:$dst, (OpNode + (v8f64 (bitconvert (memopv16f32 addr:$src)))))], + itins_d.rm>, EVEX, EVEX_V512, EVEX_CD8<64, CD8VF>; + + def PSZr_Int : AVX512PSI<opc, MRMSrcReg, (outs VR512:$dst), (ins VR512:$src), + !strconcat(OpcodeStr, + "ps\t{$src, $dst|$dst, $src}"), + [(set VR512:$dst, (V16F32Int VR512:$src))]>, + EVEX, EVEX_V512; + def PSZm_Int : AVX512PSI<opc, MRMSrcMem, (outs VR512:$dst), (ins f512mem:$src), + !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"), + [(set VR512:$dst, + (V16F32Int (memopv16f32 addr:$src)))]>, EVEX, + EVEX_V512, EVEX_CD8<32, CD8VF>; + def PDZr_Int : AVX512PDI<opc, MRMSrcReg, (outs VR512:$dst), (ins VR512:$src), + !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"), + [(set VR512:$dst, (V8F64Int VR512:$src))]>, + EVEX, EVEX_V512, VEX_W; + def PDZm_Int : AVX512PDI<opc, MRMSrcMem, (outs VR512:$dst), (ins f512mem:$src), + !strconcat(OpcodeStr, + "pd\t{$src, $dst|$dst, $src}"), + [(set VR512:$dst, (V8F64Int (memopv8f64 addr:$src)))]>, + EVEX, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; +} + +multiclass avx512_sqrt_scalar<bits<8> opc, string OpcodeStr, + Intrinsic F32Int, Intrinsic F64Int, + OpndItins itins_s, OpndItins itins_d> { + def SSZr : SI<opc, MRMSrcReg, (outs FR32X:$dst), + (ins FR32X:$src1, FR32X:$src2), + !strconcat(OpcodeStr, + "ss{z}\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [], itins_s.rr>, XS, EVEX_4V; + def SSZr_Int : SIi8<opc, MRMSrcReg, (outs VR128X:$dst), + (ins VR128X:$src1, VR128X:$src2), + !strconcat(OpcodeStr, + "ss{z}\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR128X:$dst, + (F32Int VR128X:$src1, VR128X:$src2))], + itins_s.rr>, XS, EVEX_4V; + let mayLoad = 1 in { + def SSZm : SI<opc, MRMSrcMem, (outs FR32X:$dst), + (ins FR32X:$src1, f32mem:$src2), + !strconcat(OpcodeStr, + "ss{z}\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [], itins_s.rm>, XS, EVEX_4V, EVEX_CD8<32, CD8VT1>; + def SSZm_Int : SIi8<opc, MRMSrcMem, (outs VR128X:$dst), + (ins VR128X:$src1, ssmem:$src2), + !strconcat(OpcodeStr, + "ss{z}\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR128X:$dst, + (F32Int VR128X:$src1, sse_load_f32:$src2))], + itins_s.rm>, XS, EVEX_4V, EVEX_CD8<32, CD8VT1>; + } + def SDZr : SI<opc, MRMSrcReg, (outs FR64X:$dst), + (ins FR64X:$src1, FR64X:$src2), + !strconcat(OpcodeStr, + "sd{z}\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, + XD, EVEX_4V, VEX_W; + def SDZr_Int : SIi8<opc, MRMSrcReg, (outs VR128X:$dst), + (ins VR128X:$src1, VR128X:$src2), + !strconcat(OpcodeStr, + "sd{z}\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR128X:$dst, + (F64Int VR128X:$src1, VR128X:$src2))], + itins_s.rr>, XD, EVEX_4V, VEX_W; + let mayLoad = 1 in { + def SDZm : SI<opc, 
MRMSrcMem, (outs FR64X:$dst), + (ins FR64X:$src1, f64mem:$src2), + !strconcat(OpcodeStr, + "sd{z}\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, + XD, EVEX_4V, VEX_W, EVEX_CD8<64, CD8VT1>; + def SDZm_Int : SIi8<opc, MRMSrcMem, (outs VR128X:$dst), + (ins VR128X:$src1, sdmem:$src2), + !strconcat(OpcodeStr, + "sd{z}\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR128X:$dst, + (F64Int VR128X:$src1, sse_load_f64:$src2))]>, + XD, EVEX_4V, VEX_W, EVEX_CD8<64, CD8VT1>; + } +} + + +defm VSQRT : avx512_sqrt_scalar<0x51, "sqrt", + int_x86_avx512_sqrt_ss, int_x86_avx512_sqrt_sd, + SSE_SQRTSS, SSE_SQRTSD>, + avx512_sqrt_packed<0x51, "vsqrt", fsqrt, + int_x86_avx512_sqrt_ps_512, int_x86_avx512_sqrt_pd_512, + SSE_SQRTPS, SSE_SQRTPD>; + +let Predicates = [HasAVX512] in { + def : Pat<(f32 (fsqrt FR32X:$src)), + (VSQRTSSZr (f32 (IMPLICIT_DEF)), FR32X:$src)>; + def : Pat<(f32 (fsqrt (load addr:$src))), + (VSQRTSSZm (f32 (IMPLICIT_DEF)), addr:$src)>, + Requires<[OptForSize]>; + def : Pat<(f64 (fsqrt FR64X:$src)), + (VSQRTSDZr (f64 (IMPLICIT_DEF)), FR64X:$src)>; + def : Pat<(f64 (fsqrt (load addr:$src))), + (VSQRTSDZm (f64 (IMPLICIT_DEF)), addr:$src)>, + Requires<[OptForSize]>; + + def : Pat<(f32 (X86frsqrt FR32X:$src)), + (VRSQRT14SSZr (f32 (IMPLICIT_DEF)), FR32X:$src)>; + def : Pat<(f32 (X86frsqrt (load addr:$src))), + (VRSQRT14SSZm (f32 (IMPLICIT_DEF)), addr:$src)>, + Requires<[OptForSize]>; + + def : Pat<(f32 (X86frcp FR32X:$src)), + (VRCP14SSZr (f32 (IMPLICIT_DEF)), FR32X:$src)>; + def : Pat<(f32 (X86frcp (load addr:$src))), + (VRCP14SSZm (f32 (IMPLICIT_DEF)), addr:$src)>, + Requires<[OptForSize]>; + + def : Pat<(int_x86_sse_sqrt_ss VR128X:$src), + (COPY_TO_REGCLASS (VSQRTSSZr (f32 (IMPLICIT_DEF)), + (COPY_TO_REGCLASS VR128X:$src, FR32)), + VR128X)>; + def : Pat<(int_x86_sse_sqrt_ss sse_load_f32:$src), + (VSQRTSSZm_Int (v4f32 (IMPLICIT_DEF)), sse_load_f32:$src)>; + + def : Pat<(int_x86_sse2_sqrt_sd VR128X:$src), + (COPY_TO_REGCLASS (VSQRTSDZr (f64 (IMPLICIT_DEF)), + (COPY_TO_REGCLASS VR128X:$src, FR64)), + VR128X)>; + def : Pat<(int_x86_sse2_sqrt_sd sse_load_f64:$src), + (VSQRTSDZm_Int (v2f64 (IMPLICIT_DEF)), sse_load_f64:$src)>; +} + + +multiclass avx512_fp_unop_rm<bits<8> opcps, bits<8> opcpd, string OpcodeStr, + X86MemOperand x86memop, RegisterClass RC, + PatFrag mem_frag32, PatFrag mem_frag64, + Intrinsic V4F32Int, Intrinsic V2F64Int, + CD8VForm VForm> { +let ExeDomain = SSEPackedSingle in { + // Intrinsic operation, reg. 
+ // Vector intrinsic operation, reg + def PSr : AVX512AIi8<opcps, MRMSrcReg, + (outs RC:$dst), (ins RC:$src1, i32i8imm:$src2), + !strconcat(OpcodeStr, + "ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set RC:$dst, (V4F32Int RC:$src1, imm:$src2))]>; + + // Vector intrinsic operation, mem + def PSm : AVX512AIi8<opcps, MRMSrcMem, + (outs RC:$dst), (ins x86memop:$src1, i32i8imm:$src2), + !strconcat(OpcodeStr, + "ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set RC:$dst, + (V4F32Int (mem_frag32 addr:$src1),imm:$src2))]>, + EVEX_CD8<32, VForm>; +} // ExeDomain = SSEPackedSingle + +let ExeDomain = SSEPackedDouble in { + // Vector intrinsic operation, reg + def PDr : AVX512AIi8<opcpd, MRMSrcReg, + (outs RC:$dst), (ins RC:$src1, i32i8imm:$src2), + !strconcat(OpcodeStr, + "pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set RC:$dst, (V2F64Int RC:$src1, imm:$src2))]>; + + // Vector intrinsic operation, mem + def PDm : AVX512AIi8<opcpd, MRMSrcMem, + (outs RC:$dst), (ins x86memop:$src1, i32i8imm:$src2), + !strconcat(OpcodeStr, + "pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set RC:$dst, + (V2F64Int (mem_frag64 addr:$src1),imm:$src2))]>, + EVEX_CD8<64, VForm>; +} // ExeDomain = SSEPackedDouble +} + +multiclass avx512_fp_binop_rm<bits<8> opcss, bits<8> opcsd, + string OpcodeStr, + Intrinsic F32Int, + Intrinsic F64Int> { +let ExeDomain = GenericDomain in { + // Operation, reg. + let hasSideEffects = 0 in + def SSr : AVX512AIi8<opcss, MRMSrcReg, + (outs FR32X:$dst), (ins FR32X:$src1, FR32X:$src2, i32i8imm:$src3), + !strconcat(OpcodeStr, + "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + []>; + + // Intrinsic operation, reg. + def SSr_Int : AVX512AIi8<opcss, MRMSrcReg, + (outs VR128X:$dst), (ins VR128X:$src1, VR128X:$src2, i32i8imm:$src3), + !strconcat(OpcodeStr, + "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set VR128X:$dst, (F32Int VR128X:$src1, VR128X:$src2, imm:$src3))]>; + + // Intrinsic operation, mem. + def SSm : AVX512AIi8<opcss, MRMSrcMem, (outs VR128X:$dst), + (ins VR128X:$src1, ssmem:$src2, i32i8imm:$src3), + !strconcat(OpcodeStr, + "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set VR128X:$dst, (F32Int VR128X:$src1, + sse_load_f32:$src2, imm:$src3))]>, + EVEX_CD8<32, CD8VT1>; + + // Operation, reg. + let hasSideEffects = 0 in + def SDr : AVX512AIi8<opcsd, MRMSrcReg, + (outs FR64X:$dst), (ins FR64X:$src1, FR64X:$src2, i32i8imm:$src3), + !strconcat(OpcodeStr, + "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + []>, VEX_W; + + // Intrinsic operation, reg. + def SDr_Int : AVX512AIi8<opcsd, MRMSrcReg, + (outs VR128X:$dst), (ins VR128X:$src1, VR128X:$src2, i32i8imm:$src3), + !strconcat(OpcodeStr, + "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set VR128X:$dst, (F64Int VR128X:$src1, VR128X:$src2, imm:$src3))]>, + VEX_W; + + // Intrinsic operation, mem. 
+ def SDm : AVX512AIi8<opcsd, MRMSrcMem, + (outs VR128X:$dst), (ins VR128X:$src1, sdmem:$src2, i32i8imm:$src3), + !strconcat(OpcodeStr, + "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set VR128X:$dst, + (F64Int VR128X:$src1, sse_load_f64:$src2, imm:$src3))]>, + VEX_W, EVEX_CD8<64, CD8VT1>; +} // ExeDomain = GenericDomain +} + +let Predicates = [HasAVX512] in { + defm VRNDSCALE : avx512_fp_binop_rm<0x0A, 0x0B, "vrndscale", + int_x86_avx512_rndscale_ss, + int_x86_avx512_rndscale_sd>, EVEX_4V; + + defm VRNDSCALEZ : avx512_fp_unop_rm<0x08, 0x09, "vrndscale", f256mem, VR512, + memopv16f32, memopv8f64, + int_x86_avx512_rndscale_ps_512, + int_x86_avx512_rndscale_pd_512, CD8VF>, + EVEX, EVEX_V512; +} + +def : Pat<(ffloor FR32X:$src), + (VRNDSCALESSr (f32 (IMPLICIT_DEF)), FR32X:$src, (i32 0x1))>; +def : Pat<(f64 (ffloor FR64X:$src)), + (VRNDSCALESDr (f64 (IMPLICIT_DEF)), FR64X:$src, (i32 0x1))>; +def : Pat<(f32 (fnearbyint FR32X:$src)), + (VRNDSCALESSr (f32 (IMPLICIT_DEF)), FR32X:$src, (i32 0xC))>; +def : Pat<(f64 (fnearbyint FR64X:$src)), + (VRNDSCALESDr (f64 (IMPLICIT_DEF)), FR64X:$src, (i32 0xC))>; +def : Pat<(f32 (fceil FR32X:$src)), + (VRNDSCALESSr (f32 (IMPLICIT_DEF)), FR32X:$src, (i32 0x2))>; +def : Pat<(f64 (fceil FR64X:$src)), + (VRNDSCALESDr (f64 (IMPLICIT_DEF)), FR64X:$src, (i32 0x2))>; +def : Pat<(f32 (frint FR32X:$src)), + (VRNDSCALESSr (f32 (IMPLICIT_DEF)), FR32X:$src, (i32 0x4))>; +def : Pat<(f64 (frint FR64X:$src)), + (VRNDSCALESDr (f64 (IMPLICIT_DEF)), FR64X:$src, (i32 0x4))>; +def : Pat<(f32 (ftrunc FR32X:$src)), + (VRNDSCALESSr (f32 (IMPLICIT_DEF)), FR32X:$src, (i32 0x3))>; +def : Pat<(f64 (ftrunc FR64X:$src)), + (VRNDSCALESDr (f64 (IMPLICIT_DEF)), FR64X:$src, (i32 0x3))>; + +def : Pat<(v16f32 (ffloor VR512:$src)), + (VRNDSCALEZPSr VR512:$src, (i32 0x1))>; +def : Pat<(v16f32 (fnearbyint VR512:$src)), + (VRNDSCALEZPSr VR512:$src, (i32 0xC))>; +def : Pat<(v16f32 (fceil VR512:$src)), + (VRNDSCALEZPSr VR512:$src, (i32 0x2))>; +def : Pat<(v16f32 (frint VR512:$src)), + (VRNDSCALEZPSr VR512:$src, (i32 0x4))>; +def : Pat<(v16f32 (ftrunc VR512:$src)), + (VRNDSCALEZPSr VR512:$src, (i32 0x3))>; + +def : Pat<(v8f64 (ffloor VR512:$src)), + (VRNDSCALEZPDr VR512:$src, (i32 0x1))>; +def : Pat<(v8f64 (fnearbyint VR512:$src)), + (VRNDSCALEZPDr VR512:$src, (i32 0xC))>; +def : Pat<(v8f64 (fceil VR512:$src)), + (VRNDSCALEZPDr VR512:$src, (i32 0x2))>; +def : Pat<(v8f64 (frint VR512:$src)), + (VRNDSCALEZPDr VR512:$src, (i32 0x4))>; +def : Pat<(v8f64 (ftrunc VR512:$src)), + (VRNDSCALEZPDr VR512:$src, (i32 0x3))>; + +//------------------------------------------------- +// Integer truncate and extend operations +//------------------------------------------------- + +multiclass avx512_trunc_sat<bits<8> opc, string OpcodeStr, + RegisterClass dstRC, RegisterClass srcRC, + RegisterClass KRC, X86MemOperand x86memop> { + def rr : AVX512XS8I<opc, MRMDestReg, (outs dstRC:$dst), + (ins srcRC:$src), + !strconcat(OpcodeStr,"\t{$src, $dst|$dst, $src}"), + []>, EVEX; + + def krr : AVX512XS8I<opc, MRMDestReg, (outs dstRC:$dst), + (ins KRC:$mask, srcRC:$src), + !strconcat(OpcodeStr, + "\t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"), + []>, EVEX, EVEX_KZ; + + def mr : AVX512XS8I<opc, MRMDestMem, (outs), (ins x86memop:$dst, srcRC:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + []>, EVEX; +} +defm VPMOVQB : avx512_trunc_sat<0x32, "vpmovqb", VR128X, VR512, VK8WM, + i128mem>, EVEX_V512, EVEX_CD8<8, CD8VO>; +defm VPMOVSQB : avx512_trunc_sat<0x22, "vpmovsqb", VR128X, VR512, 
VK8WM, + i128mem>, EVEX_V512, EVEX_CD8<8, CD8VO>; +defm VPMOVUSQB : avx512_trunc_sat<0x12, "vpmovusqb", VR128X, VR512, VK8WM, + i128mem>, EVEX_V512, EVEX_CD8<8, CD8VO>; +defm VPMOVQW : avx512_trunc_sat<0x34, "vpmovqw", VR128X, VR512, VK8WM, + i128mem>, EVEX_V512, EVEX_CD8<16, CD8VQ>; +defm VPMOVSQW : avx512_trunc_sat<0x24, "vpmovsqw", VR128X, VR512, VK8WM, + i128mem>, EVEX_V512, EVEX_CD8<16, CD8VQ>; +defm VPMOVUSQW : avx512_trunc_sat<0x14, "vpmovusqw", VR128X, VR512, VK8WM, + i128mem>, EVEX_V512, EVEX_CD8<16, CD8VQ>; +defm VPMOVQD : avx512_trunc_sat<0x35, "vpmovqd", VR256X, VR512, VK8WM, + i256mem>, EVEX_V512, EVEX_CD8<32, CD8VH>; +defm VPMOVSQD : avx512_trunc_sat<0x25, "vpmovsqd", VR256X, VR512, VK8WM, + i256mem>, EVEX_V512, EVEX_CD8<32, CD8VH>; +defm VPMOVUSQD : avx512_trunc_sat<0x15, "vpmovusqd", VR256X, VR512, VK8WM, + i256mem>, EVEX_V512, EVEX_CD8<32, CD8VH>; +defm VPMOVDW : avx512_trunc_sat<0x33, "vpmovdw", VR256X, VR512, VK16WM, + i256mem>, EVEX_V512, EVEX_CD8<16, CD8VH>; +defm VPMOVSDW : avx512_trunc_sat<0x23, "vpmovsdw", VR256X, VR512, VK16WM, + i256mem>, EVEX_V512, EVEX_CD8<16, CD8VH>; +defm VPMOVUSDW : avx512_trunc_sat<0x13, "vpmovusdw", VR256X, VR512, VK16WM, + i256mem>, EVEX_V512, EVEX_CD8<16, CD8VH>; +defm VPMOVDB : avx512_trunc_sat<0x31, "vpmovdb", VR128X, VR512, VK16WM, + i128mem>, EVEX_V512, EVEX_CD8<8, CD8VQ>; +defm VPMOVSDB : avx512_trunc_sat<0x21, "vpmovsdb", VR128X, VR512, VK16WM, + i128mem>, EVEX_V512, EVEX_CD8<8, CD8VQ>; +defm VPMOVUSDB : avx512_trunc_sat<0x11, "vpmovusdb", VR128X, VR512, VK16WM, + i128mem>, EVEX_V512, EVEX_CD8<8, CD8VQ>; + +def : Pat<(v16i8 (X86vtrunc (v8i64 VR512:$src))), (VPMOVQBrr VR512:$src)>; +def : Pat<(v8i16 (X86vtrunc (v8i64 VR512:$src))), (VPMOVQWrr VR512:$src)>; +def : Pat<(v16i16 (X86vtrunc (v16i32 VR512:$src))), (VPMOVDWrr VR512:$src)>; +def : Pat<(v16i8 (X86vtrunc (v16i32 VR512:$src))), (VPMOVDBrr VR512:$src)>; +def : Pat<(v8i32 (X86vtrunc (v8i64 VR512:$src))), (VPMOVQDrr VR512:$src)>; + +def : Pat<(v16i8 (X86vtruncm VK16WM:$mask, (v16i32 VR512:$src))), + (VPMOVDBkrr VK16WM:$mask, VR512:$src)>; +def : Pat<(v16i16 (X86vtruncm VK16WM:$mask, (v16i32 VR512:$src))), + (VPMOVDWkrr VK16WM:$mask, VR512:$src)>; +def : Pat<(v8i16 (X86vtruncm VK8WM:$mask, (v8i64 VR512:$src))), + (VPMOVQWkrr VK8WM:$mask, VR512:$src)>; +def : Pat<(v8i32 (X86vtruncm VK8WM:$mask, (v8i64 VR512:$src))), + (VPMOVQDkrr VK8WM:$mask, VR512:$src)>; + + +multiclass avx512_extend<bits<8> opc, string OpcodeStr, RegisterClass DstRC, + RegisterClass SrcRC, SDNode OpNode, PatFrag mem_frag, + X86MemOperand x86memop, ValueType OpVT, ValueType InVT> { + + def rr : AVX5128I<opc, MRMSrcReg, (outs DstRC:$dst), + (ins SrcRC:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set DstRC:$dst, (OpVT (OpNode (InVT SrcRC:$src))))]>, EVEX; + def rm : AVX5128I<opc, MRMSrcMem, (outs DstRC:$dst), + (ins x86memop:$src), + !strconcat(OpcodeStr,"\t{$src, $dst|$dst, $src}"), + [(set DstRC:$dst, + (OpVT (OpNode (InVT (bitconvert (mem_frag addr:$src))))))]>, + EVEX; +} + +defm VPMOVZXBDZ: avx512_extend<0x31, "vpmovzxbd", VR512, VR128X, X86vzext, + memopv2i64, i128mem, v16i32, v16i8>, EVEX_V512, + EVEX_CD8<8, CD8VQ>; +defm VPMOVZXBQZ: avx512_extend<0x32, "vpmovzxbq", VR512, VR128X, X86vzext, + memopv2i64, i128mem, v8i64, v16i8>, EVEX_V512, + EVEX_CD8<8, CD8VO>; +defm VPMOVZXWDZ: avx512_extend<0x33, "vpmovzxwd", VR512, VR256X, X86vzext, + memopv4i64, i256mem, v16i32, v16i16>, EVEX_V512, + EVEX_CD8<16, CD8VH>; +defm VPMOVZXWQZ: avx512_extend<0x34, "vpmovzxwq", VR512, VR128X, X86vzext, + 
memopv2i64, i128mem, v8i64, v8i16>, EVEX_V512, + EVEX_CD8<16, CD8VQ>; +defm VPMOVZXDQZ: avx512_extend<0x35, "vpmovzxdq", VR512, VR256X, X86vzext, + memopv4i64, i256mem, v8i64, v8i32>, EVEX_V512, + EVEX_CD8<32, CD8VH>; + +defm VPMOVSXBDZ: avx512_extend<0x21, "vpmovsxbd", VR512, VR128X, X86vsext, + memopv2i64, i128mem, v16i32, v16i8>, EVEX_V512, + EVEX_CD8<8, CD8VQ>; +defm VPMOVSXBQZ: avx512_extend<0x22, "vpmovsxbq", VR512, VR128X, X86vsext, + memopv2i64, i128mem, v8i64, v16i8>, EVEX_V512, + EVEX_CD8<8, CD8VO>; +defm VPMOVSXWDZ: avx512_extend<0x23, "vpmovsxwd", VR512, VR256X, X86vsext, + memopv4i64, i256mem, v16i32, v16i16>, EVEX_V512, + EVEX_CD8<16, CD8VH>; +defm VPMOVSXWQZ: avx512_extend<0x24, "vpmovsxwq", VR512, VR128X, X86vsext, + memopv2i64, i128mem, v8i64, v8i16>, EVEX_V512, + EVEX_CD8<16, CD8VQ>; +defm VPMOVSXDQZ: avx512_extend<0x25, "vpmovsxdq", VR512, VR256X, X86vsext, + memopv4i64, i256mem, v8i64, v8i32>, EVEX_V512, + EVEX_CD8<32, CD8VH>; + +//===----------------------------------------------------------------------===// +// GATHER - SCATTER Operations + +multiclass avx512_gather<bits<8> opc, string OpcodeStr, RegisterClass KRC, + RegisterClass RC, X86MemOperand memop> { +let mayLoad = 1, + Constraints = "@earlyclobber $dst, $src1 = $dst, $mask = $mask_wb" in + def rm : AVX5128I<opc, MRMSrcMem, (outs RC:$dst, KRC:$mask_wb), + (ins RC:$src1, KRC:$mask, memop:$src2), + !strconcat(OpcodeStr, + "\t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"), + []>, EVEX, EVEX_K; +} +defm VGATHERDPDZ : avx512_gather<0x92, "vgatherdpd", VK8WM, VR512, vy64xmem>, + EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>; +defm VGATHERDPSZ : avx512_gather<0x92, "vgatherdps", VK16WM, VR512, vz32mem>, + EVEX_V512, EVEX_CD8<32, CD8VT1>; + +defm VGATHERQPDZ : avx512_gather<0x93, "vgatherqpd", VK8WM, VR512, vz64mem>, + EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>; +defm VGATHERQPSZ : avx512_gather<0x93, "vgatherqps", VK8WM, VR256X, vz64mem>, + EVEX_V512, EVEX_CD8<32, CD8VT1>; + +defm VPGATHERDQZ : avx512_gather<0x90, "vpgatherdq", VK8WM, VR512, vy64xmem>, + EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>; +defm VPGATHERDDZ : avx512_gather<0x90, "vpgatherdd", VK16WM, VR512, vz32mem>, + EVEX_V512, EVEX_CD8<32, CD8VT1>; + +defm VPGATHERQQZ : avx512_gather<0x91, "vpgatherqq", VK8WM, VR512, vz64mem>, + EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>; +defm VPGATHERQDZ : avx512_gather<0x91, "vpgatherqd", VK8WM, VR256X, vz64mem>, + EVEX_V512, EVEX_CD8<32, CD8VT1>; + +multiclass avx512_scatter<bits<8> opc, string OpcodeStr, RegisterClass KRC, + RegisterClass RC, X86MemOperand memop> { +let mayStore = 1, Constraints = "$mask = $mask_wb" in + def mr : AVX5128I<opc, MRMDestMem, (outs KRC:$mask_wb), + (ins memop:$dst, KRC:$mask, RC:$src2), + !strconcat(OpcodeStr, + "\t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"), + []>, EVEX, EVEX_K; +} + +defm VSCATTERDPDZ : avx512_scatter<0xA2, "vscatterdpd", VK8WM, VR512, vy64xmem>, + EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>; +defm VSCATTERDPSZ : avx512_scatter<0xA2, "vscatterdps", VK16WM, VR512, vz32mem>, + EVEX_V512, EVEX_CD8<32, CD8VT1>; + +defm VSCATTERQPDZ : avx512_scatter<0xA3, "vscatterqpd", VK8WM, VR512, vz64mem>, + EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>; +defm VSCATTERQPSZ : avx512_scatter<0xA3, "vscatterqps", VK8WM, VR256X, vz64mem>, + EVEX_V512, EVEX_CD8<32, CD8VT1>; + +defm VPSCATTERDQZ : avx512_scatter<0xA0, "vpscatterdq", VK8WM, VR512, vy64xmem>, + EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>; +defm VPSCATTERDDZ : avx512_scatter<0xA0, "vpscatterdd", VK16WM, VR512, vz32mem>, + EVEX_V512, EVEX_CD8<32, 
CD8VT1>; + +defm VPSCATTERQQZ : avx512_scatter<0xA1, "vpscatterqq", VK8WM, VR512, vz64mem>, + EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>; +defm VPSCATTERQDZ : avx512_scatter<0xA1, "vpscatterqd", VK8WM, VR256X, vz64mem>, + EVEX_V512, EVEX_CD8<32, CD8VT1>; + +//===----------------------------------------------------------------------===// +// VSHUFPS - VSHUFPD Operations + +multiclass avx512_shufp<RegisterClass RC, X86MemOperand x86memop, + ValueType vt, string OpcodeStr, PatFrag mem_frag, + Domain d> { + def rmi : AVX512PIi8<0xC6, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, x86memop:$src2, i8imm:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set RC:$dst, (vt (X86Shufp RC:$src1, (mem_frag addr:$src2), + (i8 imm:$src3))))], d, IIC_SSE_SHUFP>, + EVEX_4V, Sched<[WriteShuffleLd, ReadAfterLd]>; + def rri : AVX512PIi8<0xC6, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2, i8imm:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set RC:$dst, (vt (X86Shufp RC:$src1, RC:$src2, + (i8 imm:$src3))))], d, IIC_SSE_SHUFP>, + EVEX_4V, Sched<[WriteShuffle]>; +} + +defm VSHUFPSZ : avx512_shufp<VR512, f512mem, v16f32, "vshufps", memopv16f32, + SSEPackedSingle>, EVEX_V512, EVEX_CD8<32, CD8VF>; +defm VSHUFPDZ : avx512_shufp<VR512, f512mem, v8f64, "vshufpd", memopv8f64, + SSEPackedDouble>, OpSize, VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>; + +def : Pat<(v16i32 (X86Shufp VR512:$src1, VR512:$src2, (i8 imm:$imm))), + (VSHUFPSZrri VR512:$src1, VR512:$src2, imm:$imm)>; +def : Pat<(v16i32 (X86Shufp VR512:$src1, + (memopv16i32 addr:$src2), (i8 imm:$imm))), + (VSHUFPSZrmi VR512:$src1, addr:$src2, imm:$imm)>; + +def : Pat<(v8i64 (X86Shufp VR512:$src1, VR512:$src2, (i8 imm:$imm))), + (VSHUFPDZrri VR512:$src1, VR512:$src2, imm:$imm)>; +def : Pat<(v8i64 (X86Shufp VR512:$src1, + (memopv8i64 addr:$src2), (i8 imm:$imm))), + (VSHUFPDZrmi VR512:$src1, addr:$src2, imm:$imm)>; + +multiclass avx512_alignr<string OpcodeStr, RegisterClass RC, + X86MemOperand x86memop> { + def rri : AVX512AIi8<0x03, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2, i8imm:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + []>, EVEX_4V; + let mayLoad = 1 in + def rmi : AVX512AIi8<0x03, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, x86memop:$src2, i8imm:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + []>, EVEX_4V; +} +defm VALIGND : avx512_alignr<"valignd", VR512, i512mem>, + EVEX_V512, EVEX_CD8<32, CD8VF>; +defm VALIGNQ : avx512_alignr<"valignq", VR512, i512mem>, + VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>; + +def : Pat<(v16f32 (X86PAlignr VR512:$src1, VR512:$src2, (i8 imm:$imm))), + (VALIGNDrri VR512:$src2, VR512:$src1, imm:$imm)>; +def : Pat<(v8f64 (X86PAlignr VR512:$src1, VR512:$src2, (i8 imm:$imm))), + (VALIGNQrri VR512:$src2, VR512:$src1, imm:$imm)>; +def : Pat<(v16i32 (X86PAlignr VR512:$src1, VR512:$src2, (i8 imm:$imm))), + (VALIGNDrri VR512:$src2, VR512:$src1, imm:$imm)>; +def : Pat<(v8i64 (X86PAlignr VR512:$src1, VR512:$src2, (i8 imm:$imm))), + (VALIGNQrri VR512:$src2, VR512:$src1, imm:$imm)>; + +multiclass avx512_vpabs<bits<8> opc, string OpcodeStr, RegisterClass RC, + X86MemOperand x86memop> { + def rr : AVX5128I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>, + EVEX; + def rm : AVX5128I<opc, MRMSrcMem, (outs VR512:$dst), + (ins x86memop:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>, + 
EVEX; +} + +defm VPABSD : avx512_vpabs<0x1E, "vpabsd", VR512, i512mem>, EVEX_V512, + EVEX_CD8<32, CD8VF>; +defm VPABSQ : avx512_vpabs<0x1F, "vpabsq", VR512, i512mem>, EVEX_V512, VEX_W, + EVEX_CD8<64, CD8VF>; + +multiclass avx512_conflict<bits<8> opc, string OpcodeStr, + RegisterClass RC, RegisterClass KRC, PatFrag memop_frag, + X86MemOperand x86memop, PatFrag scalar_mfrag, + X86MemOperand x86scalar_mop, string BrdcstStr, + Intrinsic Int, Intrinsic maskInt, Intrinsic maskzInt> { + def rr : AVX5128I<opc, MRMSrcReg, (outs RC:$dst), + (ins RC:$src), + !strconcat(OpcodeStr, "\t{$src, ${dst} |${dst}, $src}"), + [(set RC:$dst, (Int RC:$src))]>, EVEX; + def rm : AVX5128I<opc, MRMSrcMem, (outs RC:$dst), + (ins x86memop:$src), + !strconcat(OpcodeStr, "\t{$src, ${dst}|${dst}, $src}"), + [(set RC:$dst, (Int (memop_frag addr:$src)))]>, EVEX; + def rmb : AVX5128I<opc, MRMSrcMem, (outs RC:$dst), + (ins x86scalar_mop:$src), + !strconcat(OpcodeStr, "\t{${src}", BrdcstStr, + ", ${dst}|${dst}, ${src}", BrdcstStr, "}"), + []>, EVEX, EVEX_B; + def rrkz : AVX5128I<opc, MRMSrcReg, (outs RC:$dst), + (ins KRC:$mask, RC:$src), + !strconcat(OpcodeStr, + "\t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"), + [(set RC:$dst, (maskzInt KRC:$mask, RC:$src))]>, EVEX, EVEX_KZ; + def rmkz : AVX5128I<opc, MRMSrcMem, (outs RC:$dst), + (ins KRC:$mask, x86memop:$src), + !strconcat(OpcodeStr, + "\t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"), + [(set RC:$dst, (maskzInt KRC:$mask, (memop_frag addr:$src)))]>, + EVEX, EVEX_KZ; + def rmbkz : AVX5128I<opc, MRMSrcMem, (outs RC:$dst), + (ins KRC:$mask, x86scalar_mop:$src), + !strconcat(OpcodeStr, "\t{${src}", BrdcstStr, + ", ${dst} {${mask}} {z}|${dst} {${mask}} {z}, ${src}", + BrdcstStr, "}"), + []>, EVEX, EVEX_KZ, EVEX_B; + + let Constraints = "$src1 = $dst" in { + def rrk : AVX5128I<opc, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, KRC:$mask, RC:$src2), + !strconcat(OpcodeStr, + "\t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"), + [(set RC:$dst, (maskInt RC:$src1, KRC:$mask, RC:$src2))]>, EVEX, EVEX_K; + def rmk : AVX5128I<opc, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, KRC:$mask, x86memop:$src2), + !strconcat(OpcodeStr, + "\t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"), + [(set RC:$dst, (maskInt RC:$src1, KRC:$mask, (memop_frag addr:$src2)))]>, EVEX, EVEX_K; + def rmbk : AVX5128I<opc, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, KRC:$mask, x86scalar_mop:$src2), + !strconcat(OpcodeStr, "\t{${src2}", BrdcstStr, + ", ${dst} {${mask}}|${dst} {${mask}}, ${src2}", BrdcstStr, "}"), + []>, EVEX, EVEX_K, EVEX_B; + } +} + +let Predicates = [HasCDI] in { +defm VPCONFLICTD : avx512_conflict<0xC4, "vpconflictd", VR512, VK16WM, + memopv16i32, i512mem, loadi32, i32mem, "{1to16}", + int_x86_avx512_conflict_d_512, + int_x86_avx512_conflict_d_mask_512, + int_x86_avx512_conflict_d_maskz_512>, + EVEX_V512, EVEX_CD8<32, CD8VF>; + +defm VPCONFLICTQ : avx512_conflict<0xC4, "vpconflictq", VR512, VK8WM, + memopv8i64, i512mem, loadi64, i64mem, "{1to8}", + int_x86_avx512_conflict_q_512, + int_x86_avx512_conflict_q_mask_512, + int_x86_avx512_conflict_q_maskz_512>, + EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; +} diff --git a/contrib/llvm/lib/Target/X86/X86InstrArithmetic.td b/contrib/llvm/lib/Target/X86/X86InstrArithmetic.td index 225e972..7fc9c443 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrArithmetic.td +++ b/contrib/llvm/lib/Target/X86/X86InstrArithmetic.td @@ -294,7 +294,7 @@ def IMUL64rmi8 : RIi8<0x6B, MRMSrcMem, // GR64 = [mem64]*I8 // unsigned division/remainder let 
hasSideEffects = 1 in { // so that we don't speculatively execute let SchedRW = [WriteIDiv] in { -let Defs = [AL,EFLAGS,AX], Uses = [AX] in +let Defs = [AL,AH,EFLAGS], Uses = [AX] in def DIV8r : I<0xF6, MRM6r, (outs), (ins GR8:$src), // AX/r8 = AL,AH "div{b}\t$src", [], IIC_DIV8_REG>; let Defs = [AX,DX,EFLAGS], Uses = [AX,DX] in @@ -310,7 +310,7 @@ def DIV64r : RI<0xF7, MRM6r, (outs), (ins GR64:$src), } // SchedRW let mayLoad = 1 in { -let Defs = [AL,EFLAGS,AX], Uses = [AX] in +let Defs = [AL,AH,EFLAGS], Uses = [AX] in def DIV8m : I<0xF6, MRM6m, (outs), (ins i8mem:$src), // AX/[mem8] = AL,AH "div{b}\t$src", [], IIC_DIV8_MEM>, SchedLoadReg<WriteIDivLd>; @@ -331,7 +331,7 @@ def DIV64m : RI<0xF7, MRM6m, (outs), (ins i64mem:$src), // Signed division/remainder. let SchedRW = [WriteIDiv] in { -let Defs = [AL,EFLAGS,AX], Uses = [AX] in +let Defs = [AL,AH,EFLAGS], Uses = [AX] in def IDIV8r : I<0xF6, MRM7r, (outs), (ins GR8:$src), // AX/r8 = AL,AH "idiv{b}\t$src", [], IIC_IDIV8>; let Defs = [AX,DX,EFLAGS], Uses = [AX,DX] in @@ -347,7 +347,7 @@ def IDIV64r: RI<0xF7, MRM7r, (outs), (ins GR64:$src), } // SchedRW let mayLoad = 1 in { -let Defs = [AL,EFLAGS,AX], Uses = [AX] in +let Defs = [AL,AH,EFLAGS], Uses = [AX] in def IDIV8m : I<0xF6, MRM7m, (outs), (ins i8mem:$src), // AX/[mem8] = AL,AH "idiv{b}\t$src", [], IIC_IDIV8>, SchedLoadReg<WriteIDivLd>; @@ -497,6 +497,21 @@ def DEC64_32r : I<0xFF, MRM1r, (outs GR32:$dst), (ins GR32:$src1), Requires<[In64BitMode]>; } // isConvertibleToThreeAddress = 1, CodeSize = 2 +let isCodeGenOnly = 1, CodeSize = 2 in { +def INC32_16r : I<0xFF, MRM0r, (outs GR16:$dst), (ins GR16:$src1), + "inc{w}\t$dst", [], IIC_UNARY_REG>, + OpSize, Requires<[In32BitMode]>; +def INC32_32r : I<0xFF, MRM0r, (outs GR32:$dst), (ins GR32:$src1), + "inc{l}\t$dst", [], IIC_UNARY_REG>, + Requires<[In32BitMode]>; +def DEC32_16r : I<0xFF, MRM1r, (outs GR16:$dst), (ins GR16:$src1), + "dec{w}\t$dst", [], IIC_UNARY_REG>, + OpSize, Requires<[In32BitMode]>; +def DEC32_32r : I<0xFF, MRM1r, (outs GR32:$dst), (ins GR32:$src1), + "dec{l}\t$dst", [], IIC_UNARY_REG>, + Requires<[In32BitMode]>; +} // isCodeGenOnly = 1, CodeSize = 2 + } // Constraints = "$src1 = $dst", SchedRW let CodeSize = 2, SchedRW = [WriteALULd, WriteRMW] in { @@ -578,7 +593,6 @@ let CodeSize = 2, SchedRW = [WriteALULd, WriteRMW] in { } // CodeSize = 2, SchedRW } // Defs = [EFLAGS] - /// X86TypeInfo - This is a bunch of information that describes relevant X86 /// information about value types. For example, it can tell you what the /// register class and preferred load to use. @@ -726,20 +740,25 @@ class BinOpRR_RFF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, : BinOpRR<opcode, mnemonic, typeinfo, (outs typeinfo.RegClass:$dst), [(set typeinfo.RegClass:$dst, EFLAGS, (opnode typeinfo.RegClass:$src1, typeinfo.RegClass:$src2, - EFLAGS))], IIC_BIN_NONMEM>; + EFLAGS))], IIC_BIN_CARRY_NONMEM>; // BinOpRR_Rev - Instructions like "add reg, reg, reg" (reversed encoding). -class BinOpRR_Rev<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo> +class BinOpRR_Rev<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + InstrItinClass itin = IIC_BIN_NONMEM> : ITy<opcode, MRMSrcReg, typeinfo, (outs typeinfo.RegClass:$dst), (ins typeinfo.RegClass:$src1, typeinfo.RegClass:$src2), - mnemonic, "{$src2, $dst|$dst, $src2}", [], IIC_BIN_NONMEM>, + mnemonic, "{$src2, $dst|$dst, $src2}", [], itin>, Sched<[WriteALU]> { // The disassembler should know about this, but not the asmparser. 
let isCodeGenOnly = 1; let hasSideEffects = 0; } +// BinOpRR_RDD_Rev - Instructions like "adc reg, reg, reg" (reversed encoding). +class BinOpRR_RFF_Rev<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo> + : BinOpRR_Rev<opcode, mnemonic, typeinfo, IIC_BIN_CARRY_NONMEM>; + // BinOpRR_F_Rev - Instructions like "cmp reg, reg" (reversed encoding). class BinOpRR_F_Rev<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo> : ITy<opcode, MRMSrcReg, typeinfo, (outs), @@ -753,10 +772,11 @@ class BinOpRR_F_Rev<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo> // BinOpRM - Instructions like "add reg, reg, [mem]". class BinOpRM<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, - dag outlist, list<dag> pattern> + dag outlist, list<dag> pattern, + InstrItinClass itin = IIC_BIN_MEM> : ITy<opcode, MRMSrcMem, typeinfo, outlist, (ins typeinfo.RegClass:$src1, typeinfo.MemOperand:$src2), - mnemonic, "{$src2, $src1|$src1, $src2}", pattern, IIC_BIN_NONMEM>, + mnemonic, "{$src2, $src1|$src1, $src2}", pattern, itin>, Sched<[WriteALULd, ReadAfterLd]>; // BinOpRM_R - Instructions like "add reg, reg, [mem]". @@ -786,14 +806,15 @@ class BinOpRM_RFF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, : BinOpRM<opcode, mnemonic, typeinfo, (outs typeinfo.RegClass:$dst), [(set typeinfo.RegClass:$dst, EFLAGS, (opnode typeinfo.RegClass:$src1, (typeinfo.LoadNode addr:$src2), - EFLAGS))]>; + EFLAGS))], IIC_BIN_CARRY_MEM>; // BinOpRI - Instructions like "add reg, reg, imm". class BinOpRI<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, - Format f, dag outlist, list<dag> pattern> + Format f, dag outlist, list<dag> pattern, + InstrItinClass itin = IIC_BIN_NONMEM> : ITy<opcode, f, typeinfo, outlist, (ins typeinfo.RegClass:$src1, typeinfo.ImmOperand:$src2), - mnemonic, "{$src2, $src1|$src1, $src2}", pattern, IIC_BIN_NONMEM>, + mnemonic, "{$src2, $src1|$src1, $src2}", pattern, itin>, Sched<[WriteALU]> { let ImmT = typeinfo.ImmEncoding; } @@ -824,14 +845,15 @@ class BinOpRI_RFF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, : BinOpRI<opcode, mnemonic, typeinfo, f, (outs typeinfo.RegClass:$dst), [(set typeinfo.RegClass:$dst, EFLAGS, (opnode typeinfo.RegClass:$src1, typeinfo.ImmOperator:$src2, - EFLAGS))]>; + EFLAGS))], IIC_BIN_CARRY_NONMEM>; // BinOpRI8 - Instructions like "add reg, reg, imm8". class BinOpRI8<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, - Format f, dag outlist, list<dag> pattern> + Format f, dag outlist, list<dag> pattern, + InstrItinClass itin = IIC_BIN_NONMEM> : ITy<opcode, f, typeinfo, outlist, (ins typeinfo.RegClass:$src1, typeinfo.Imm8Operand:$src2), - mnemonic, "{$src2, $src1|$src1, $src2}", pattern, IIC_BIN_NONMEM>, + mnemonic, "{$src2, $src1|$src1, $src2}", pattern, itin>, Sched<[WriteALU]> { let ImmT = Imm8; // Always 8-bit immediate. } @@ -863,14 +885,14 @@ class BinOpRI8_RFF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, : BinOpRI8<opcode, mnemonic, typeinfo, f, (outs typeinfo.RegClass:$dst), [(set typeinfo.RegClass:$dst, EFLAGS, (opnode typeinfo.RegClass:$src1, typeinfo.Imm8Operator:$src2, - EFLAGS))]>; + EFLAGS))], IIC_BIN_CARRY_NONMEM>; // BinOpMR - Instructions like "add [mem], reg". 
class BinOpMR<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, - list<dag> pattern> + list<dag> pattern, InstrItinClass itin = IIC_BIN_MEM> : ITy<opcode, MRMDestMem, typeinfo, (outs), (ins typeinfo.MemOperand:$dst, typeinfo.RegClass:$src), - mnemonic, "{$src, $dst|$dst, $src}", pattern, IIC_BIN_MEM>, + mnemonic, "{$src, $dst|$dst, $src}", pattern, itin>, Sched<[WriteALULd, WriteRMW]>; // BinOpMR_RMW - Instructions like "add [mem], reg". @@ -886,7 +908,7 @@ class BinOpMR_RMW_FF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, : BinOpMR<opcode, mnemonic, typeinfo, [(store (opnode (load addr:$dst), typeinfo.RegClass:$src, EFLAGS), addr:$dst), - (implicit EFLAGS)]>; + (implicit EFLAGS)], IIC_BIN_CARRY_MEM>; // BinOpMR_F - Instructions like "cmp [mem], reg". class BinOpMR_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, @@ -896,10 +918,11 @@ class BinOpMR_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, // BinOpMI - Instructions like "add [mem], imm". class BinOpMI<string mnemonic, X86TypeInfo typeinfo, - Format f, list<dag> pattern, bits<8> opcode = 0x80> + Format f, list<dag> pattern, bits<8> opcode = 0x80, + InstrItinClass itin = IIC_BIN_MEM> : ITy<opcode, f, typeinfo, (outs), (ins typeinfo.MemOperand:$dst, typeinfo.ImmOperand:$src), - mnemonic, "{$src, $dst|$dst, $src}", pattern, IIC_BIN_MEM>, + mnemonic, "{$src, $dst|$dst, $src}", pattern, itin>, Sched<[WriteALULd, WriteRMW]> { let ImmT = typeinfo.ImmEncoding; } @@ -917,7 +940,7 @@ class BinOpMI_RMW_FF<string mnemonic, X86TypeInfo typeinfo, : BinOpMI<mnemonic, typeinfo, f, [(store (opnode (typeinfo.VT (load addr:$dst)), typeinfo.ImmOperator:$src, EFLAGS), addr:$dst), - (implicit EFLAGS)]>; + (implicit EFLAGS)], 0x80, IIC_BIN_CARRY_MEM>; // BinOpMI_F - Instructions like "cmp [mem], imm". class BinOpMI_F<string mnemonic, X86TypeInfo typeinfo, @@ -929,10 +952,11 @@ class BinOpMI_F<string mnemonic, X86TypeInfo typeinfo, // BinOpMI8 - Instructions like "add [mem], imm8". class BinOpMI8<string mnemonic, X86TypeInfo typeinfo, - Format f, list<dag> pattern> + Format f, list<dag> pattern, + InstrItinClass itin = IIC_BIN_MEM> : ITy<0x82, f, typeinfo, (outs), (ins typeinfo.MemOperand:$dst, typeinfo.Imm8Operand:$src), - mnemonic, "{$src, $dst|$dst, $src}", pattern, IIC_BIN_MEM>, + mnemonic, "{$src, $dst|$dst, $src}", pattern, itin>, Sched<[WriteALULd, WriteRMW]> { let ImmT = Imm8; // Always 8-bit immediate. } @@ -951,7 +975,7 @@ class BinOpMI8_RMW_FF<string mnemonic, X86TypeInfo typeinfo, : BinOpMI8<mnemonic, typeinfo, f, [(store (opnode (load addr:$dst), typeinfo.Imm8Operator:$src, EFLAGS), addr:$dst), - (implicit EFLAGS)]>; + (implicit EFLAGS)], IIC_BIN_CARRY_MEM>; // BinOpMI8_F - Instructions like "cmp [mem], imm8". class BinOpMI8_F<string mnemonic, X86TypeInfo typeinfo, @@ -960,18 +984,28 @@ class BinOpMI8_F<string mnemonic, X86TypeInfo typeinfo, [(set EFLAGS, (opnode (load addr:$dst), typeinfo.Imm8Operator:$src))]>; -// BinOpAI - Instructions like "add %eax, %eax, imm". +// BinOpAI - Instructions like "add %eax, %eax, imm", that imp-def EFLAGS. 
class BinOpAI<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, - Register areg, string operands> + Register areg, string operands, + InstrItinClass itin = IIC_BIN_NONMEM> : ITy<opcode, RawFrm, typeinfo, (outs), (ins typeinfo.ImmOperand:$src), - mnemonic, operands, []>, Sched<[WriteALU]> { + mnemonic, operands, [], itin>, Sched<[WriteALU]> { let ImmT = typeinfo.ImmEncoding; let Uses = [areg]; - let Defs = [areg]; + let Defs = [areg, EFLAGS]; let hasSideEffects = 0; } +// BinOpAI_FF - Instructions like "adc %eax, %eax, imm", that implicitly define +// and use EFLAGS. +class BinOpAI_FF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + Register areg, string operands> + : BinOpAI<opcode, mnemonic, typeinfo, areg, operands, + IIC_BIN_CARRY_NONMEM> { + let Uses = [areg, EFLAGS]; +} + /// ArithBinOp_RF - This is an arithmetic binary operator where the pattern is /// defined with "(set GPR:$dst, EFLAGS, (...". /// @@ -1030,16 +1064,16 @@ multiclass ArithBinOp_RF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4, def NAME#16mi : BinOpMI_RMW<mnemonic, Xi16, opnode, MemMRM>; def NAME#32mi : BinOpMI_RMW<mnemonic, Xi32, opnode, MemMRM>; def NAME#64mi32 : BinOpMI_RMW<mnemonic, Xi64, opnode, MemMRM>; - - def NAME#8i8 : BinOpAI<BaseOpc4, mnemonic, Xi8 , AL, - "{$src, %al|AL, $src}">; - def NAME#16i16 : BinOpAI<BaseOpc4, mnemonic, Xi16, AX, - "{$src, %ax|AX, $src}">; - def NAME#32i32 : BinOpAI<BaseOpc4, mnemonic, Xi32, EAX, - "{$src, %eax|EAX, $src}">; - def NAME#64i32 : BinOpAI<BaseOpc4, mnemonic, Xi64, RAX, - "{$src, %rax|RAX, $src}">; - } + } // Defs = [EFLAGS] + + def NAME#8i8 : BinOpAI<BaseOpc4, mnemonic, Xi8 , AL, + "{$src, %al|al, $src}">; + def NAME#16i16 : BinOpAI<BaseOpc4, mnemonic, Xi16, AX, + "{$src, %ax|ax, $src}">; + def NAME#32i32 : BinOpAI<BaseOpc4, mnemonic, Xi32, EAX, + "{$src, %eax|eax, $src}">; + def NAME#64i32 : BinOpAI<BaseOpc4, mnemonic, Xi64, RAX, + "{$src, %rax|rax, $src}">; } /// ArithBinOp_RFF - This is an arithmetic binary operator where the pattern is @@ -1052,7 +1086,7 @@ multiclass ArithBinOp_RFF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4, string mnemonic, Format RegMRM, Format MemMRM, SDNode opnode, bit CommutableRR, bit ConvertibleToThreeAddress> { - let Defs = [EFLAGS] in { + let Uses = [EFLAGS], Defs = [EFLAGS] in { let Constraints = "$src1 = $dst" in { let isCommutable = CommutableRR, isConvertibleToThreeAddress = ConvertibleToThreeAddress in { @@ -1062,10 +1096,10 @@ multiclass ArithBinOp_RFF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4, def NAME#64rr : BinOpRR_RFF<BaseOpc, mnemonic, Xi64, opnode>; } // isCommutable - def NAME#8rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi8>; - def NAME#16rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi16>; - def NAME#32rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi32>; - def NAME#64rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi64>; + def NAME#8rr_REV : BinOpRR_RFF_Rev<BaseOpc2, mnemonic, Xi8>; + def NAME#16rr_REV : BinOpRR_RFF_Rev<BaseOpc2, mnemonic, Xi16>; + def NAME#32rr_REV : BinOpRR_RFF_Rev<BaseOpc2, mnemonic, Xi32>; + def NAME#64rr_REV : BinOpRR_RFF_Rev<BaseOpc2, mnemonic, Xi64>; def NAME#8rm : BinOpRM_RFF<BaseOpc2, mnemonic, Xi8 , opnode>; def NAME#16rm : BinOpRM_RFF<BaseOpc2, mnemonic, Xi16, opnode>; @@ -1101,16 +1135,16 @@ multiclass ArithBinOp_RFF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4, def NAME#16mi : BinOpMI_RMW_FF<mnemonic, Xi16, opnode, MemMRM>; def NAME#32mi : BinOpMI_RMW_FF<mnemonic, Xi32, opnode, MemMRM>; def NAME#64mi32 : BinOpMI_RMW_FF<mnemonic, Xi64, opnode, MemMRM>; - - def NAME#8i8 : 
BinOpAI<BaseOpc4, mnemonic, Xi8 , AL, - "{$src, %al|AL, $src}">; - def NAME#16i16 : BinOpAI<BaseOpc4, mnemonic, Xi16, AX, - "{$src, %ax|AX, $src}">; - def NAME#32i32 : BinOpAI<BaseOpc4, mnemonic, Xi32, EAX, - "{$src, %eax|EAX, $src}">; - def NAME#64i32 : BinOpAI<BaseOpc4, mnemonic, Xi64, RAX, - "{$src, %rax|RAX, $src}">; - } + } // Uses = [EFLAGS], Defs = [EFLAGS] + + def NAME#8i8 : BinOpAI_FF<BaseOpc4, mnemonic, Xi8 , AL, + "{$src, %al|al, $src}">; + def NAME#16i16 : BinOpAI_FF<BaseOpc4, mnemonic, Xi16, AX, + "{$src, %ax|ax, $src}">; + def NAME#32i32 : BinOpAI_FF<BaseOpc4, mnemonic, Xi32, EAX, + "{$src, %eax|eax, $src}">; + def NAME#64i32 : BinOpAI_FF<BaseOpc4, mnemonic, Xi64, RAX, + "{$src, %rax|rax, $src}">; } /// ArithBinOp_F - This is an arithmetic binary operator where the pattern is @@ -1168,16 +1202,16 @@ multiclass ArithBinOp_F<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4, def NAME#16mi : BinOpMI_F<mnemonic, Xi16, opnode, MemMRM>; def NAME#32mi : BinOpMI_F<mnemonic, Xi32, opnode, MemMRM>; def NAME#64mi32 : BinOpMI_F<mnemonic, Xi64, opnode, MemMRM>; - - def NAME#8i8 : BinOpAI<BaseOpc4, mnemonic, Xi8 , AL, - "{$src, %al|AL, $src}">; - def NAME#16i16 : BinOpAI<BaseOpc4, mnemonic, Xi16, AX, - "{$src, %ax|AX, $src}">; - def NAME#32i32 : BinOpAI<BaseOpc4, mnemonic, Xi32, EAX, - "{$src, %eax|EAX, $src}">; - def NAME#64i32 : BinOpAI<BaseOpc4, mnemonic, Xi64, RAX, - "{$src, %rax|RAX, $src}">; - } + } // Defs = [EFLAGS] + + def NAME#8i8 : BinOpAI<BaseOpc4, mnemonic, Xi8 , AL, + "{$src, %al|al, $src}">; + def NAME#16i16 : BinOpAI<BaseOpc4, mnemonic, Xi16, AX, + "{$src, %ax|ax, $src}">; + def NAME#32i32 : BinOpAI<BaseOpc4, mnemonic, Xi32, EAX, + "{$src, %eax|eax, $src}">; + def NAME#64i32 : BinOpAI<BaseOpc4, mnemonic, Xi64, RAX, + "{$src, %rax|rax, $src}">; } @@ -1195,12 +1229,10 @@ defm SUB : ArithBinOp_RF<0x28, 0x2A, 0x2C, "sub", MRM5r, MRM5m, } // Arithmetic. 
-let Uses = [EFLAGS] in { - defm ADC : ArithBinOp_RFF<0x10, 0x12, 0x14, "adc", MRM2r, MRM2m, X86adc_flag, - 1, 0>; - defm SBB : ArithBinOp_RFF<0x18, 0x1A, 0x1C, "sbb", MRM3r, MRM3m, X86sbb_flag, - 0, 0>; -} +defm ADC : ArithBinOp_RFF<0x10, 0x12, 0x14, "adc", MRM2r, MRM2m, X86adc_flag, + 1, 0>; +defm SBB : ArithBinOp_RFF<0x18, 0x1A, 0x1C, "sbb", MRM3r, MRM3m, X86sbb_flag, + 0, 0>; let isCompare = 1 in { defm CMP : ArithBinOp_F<0x38, 0x3A, 0x3C, "cmp", MRM7r, MRM7m, X86cmp, 0, 0>; @@ -1215,44 +1247,46 @@ defm CMP : ArithBinOp_F<0x38, 0x3A, 0x3C, "cmp", MRM7r, MRM7m, X86cmp, 0, 0>; def X86testpat : PatFrag<(ops node:$lhs, node:$rhs), (X86cmp (and_su node:$lhs, node:$rhs), 0)>; -let isCompare = 1, Defs = [EFLAGS] in { - let isCommutable = 1 in { - def TEST8rr : BinOpRR_F<0x84, "test", Xi8 , X86testpat, MRMSrcReg>; - def TEST16rr : BinOpRR_F<0x84, "test", Xi16, X86testpat, MRMSrcReg>; - def TEST32rr : BinOpRR_F<0x84, "test", Xi32, X86testpat, MRMSrcReg>; - def TEST64rr : BinOpRR_F<0x84, "test", Xi64, X86testpat, MRMSrcReg>; - } // isCommutable - - def TEST8rm : BinOpRM_F<0x84, "test", Xi8 , X86testpat>; - def TEST16rm : BinOpRM_F<0x84, "test", Xi16, X86testpat>; - def TEST32rm : BinOpRM_F<0x84, "test", Xi32, X86testpat>; - def TEST64rm : BinOpRM_F<0x84, "test", Xi64, X86testpat>; - - def TEST8ri : BinOpRI_F<0xF6, "test", Xi8 , X86testpat, MRM0r>; - def TEST16ri : BinOpRI_F<0xF6, "test", Xi16, X86testpat, MRM0r>; - def TEST32ri : BinOpRI_F<0xF6, "test", Xi32, X86testpat, MRM0r>; - def TEST64ri32 : BinOpRI_F<0xF6, "test", Xi64, X86testpat, MRM0r>; - - def TEST8mi : BinOpMI_F<"test", Xi8 , X86testpat, MRM0m, 0xF6>; - def TEST16mi : BinOpMI_F<"test", Xi16, X86testpat, MRM0m, 0xF6>; - def TEST32mi : BinOpMI_F<"test", Xi32, X86testpat, MRM0m, 0xF6>; - def TEST64mi32 : BinOpMI_F<"test", Xi64, X86testpat, MRM0m, 0xF6>; +let isCompare = 1 in { + let Defs = [EFLAGS] in { + let isCommutable = 1 in { + def TEST8rr : BinOpRR_F<0x84, "test", Xi8 , X86testpat, MRMSrcReg>; + def TEST16rr : BinOpRR_F<0x84, "test", Xi16, X86testpat, MRMSrcReg>; + def TEST32rr : BinOpRR_F<0x84, "test", Xi32, X86testpat, MRMSrcReg>; + def TEST64rr : BinOpRR_F<0x84, "test", Xi64, X86testpat, MRMSrcReg>; + } // isCommutable + + def TEST8rm : BinOpRM_F<0x84, "test", Xi8 , X86testpat>; + def TEST16rm : BinOpRM_F<0x84, "test", Xi16, X86testpat>; + def TEST32rm : BinOpRM_F<0x84, "test", Xi32, X86testpat>; + def TEST64rm : BinOpRM_F<0x84, "test", Xi64, X86testpat>; + + def TEST8ri : BinOpRI_F<0xF6, "test", Xi8 , X86testpat, MRM0r>; + def TEST16ri : BinOpRI_F<0xF6, "test", Xi16, X86testpat, MRM0r>; + def TEST32ri : BinOpRI_F<0xF6, "test", Xi32, X86testpat, MRM0r>; + def TEST64ri32 : BinOpRI_F<0xF6, "test", Xi64, X86testpat, MRM0r>; + + def TEST8mi : BinOpMI_F<"test", Xi8 , X86testpat, MRM0m, 0xF6>; + def TEST16mi : BinOpMI_F<"test", Xi16, X86testpat, MRM0m, 0xF6>; + def TEST32mi : BinOpMI_F<"test", Xi32, X86testpat, MRM0m, 0xF6>; + def TEST64mi32 : BinOpMI_F<"test", Xi64, X86testpat, MRM0m, 0xF6>; + + // When testing the result of EXTRACT_SUBREG sub_8bit_hi, make sure the + // register class is constrained to GR8_NOREX. 
+ let isPseudo = 1 in + def TEST8ri_NOREX : I<0, Pseudo, (outs), (ins GR8_NOREX:$src, i8imm:$mask), + "", [], IIC_BIN_NONMEM>, Sched<[WriteALU]>; + } // Defs = [EFLAGS] def TEST8i8 : BinOpAI<0xA8, "test", Xi8 , AL, - "{$src, %al|AL, $src}">; + "{$src, %al|al, $src}">; def TEST16i16 : BinOpAI<0xA8, "test", Xi16, AX, - "{$src, %ax|AX, $src}">; + "{$src, %ax|ax, $src}">; def TEST32i32 : BinOpAI<0xA8, "test", Xi32, EAX, - "{$src, %eax|EAX, $src}">; + "{$src, %eax|eax, $src}">; def TEST64i32 : BinOpAI<0xA8, "test", Xi64, RAX, - "{$src, %rax|RAX, $src}">; - - // When testing the result of EXTRACT_SUBREG sub_8bit_hi, make sure the - // register class is constrained to GR8_NOREX. - let isPseudo = 1 in - def TEST8ri_NOREX : I<0, Pseudo, (outs), (ins GR8_NOREX:$src, i8imm:$mask), - "", [], IIC_BIN_NONMEM>, Sched<[WriteALU]>; -} + "{$src, %rax|rax, $src}">; +} // isCompare //===----------------------------------------------------------------------===// // ANDN Instruction @@ -1294,12 +1328,12 @@ let neverHasSideEffects = 1 in { let isCommutable = 1 in def rr : I<0xF6, MRMSrcReg, (outs RC:$dst1, RC:$dst2), (ins RC:$src), !strconcat(mnemonic, "\t{$src, $dst2, $dst1|$dst1, $dst2, $src}"), - [], IIC_MUL8>, T8XD, VEX_4V, Sched<[WriteIMul]>; + [], IIC_MUL8>, T8XD, VEX_4V, Sched<[WriteIMul, WriteIMulH]>; let mayLoad = 1 in def rm : I<0xF6, MRMSrcMem, (outs RC:$dst1, RC:$dst2), (ins x86memop:$src), !strconcat(mnemonic, "\t{$src, $dst2, $dst1|$dst1, $dst2, $src}"), - [], IIC_MUL8>, T8XD, VEX_4V, Sched<[WriteIMulLd]>; + [], IIC_MUL8>, T8XD, VEX_4V, Sched<[WriteIMulLd, WriteIMulH]>; } } @@ -1328,7 +1362,7 @@ let hasSideEffects = 0, Predicates = [HasADX], Defs = [EFLAGS] in { def ADCX32rm : I<0xF6, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), "adcx{l}\t{$src, $dst|$dst, $src}", [], IIC_BIN_MEM>, T8, OpSize; - + def ADCX64rm : I<0xF6, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), "adcx{q}\t{$src, $dst|$dst, $src}", [], IIC_BIN_MEM>, T8, OpSize, REX_W, Requires<[In64BitMode]>; @@ -1353,7 +1387,7 @@ let hasSideEffects = 0, Predicates = [HasADX], Defs = [EFLAGS] in { def ADOX32rm : I<0xF6, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), "adox{l}\t{$src, $dst|$dst, $src}", [], IIC_BIN_MEM>, T8XS; - + def ADOX64rm : I<0xF6, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), "adox{q}\t{$src, $dst|$dst, $src}", [], IIC_BIN_MEM>, T8XS, REX_W, Requires<[In64BitMode]>; diff --git a/contrib/llvm/lib/Target/X86/X86InstrCompiler.td b/contrib/llvm/lib/Target/X86/X86InstrCompiler.td index d9ff0c6..7d10b67 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrCompiler.td +++ b/contrib/llvm/lib/Target/X86/X86InstrCompiler.td @@ -129,12 +129,13 @@ def SEG_ALLOCA_64 : I<0, Pseudo, (outs GR64:$dst), (ins GR64:$size), // The MSVC runtime contains an _ftol2 routine for converting floating-point // to integer values. It has a strange calling convention: the input is -// popped from the x87 stack, and the return value is given in EDX:EAX. No -// other registers (aside from flags) are touched. +// popped from the x87 stack, and the return value is given in EDX:EAX. ECX is +// used as a temporary register. No other registers (aside from flags) are +// touched. // Microsoft toolchains do not support 80-bit precision, so a WIN_FTOL_80 // variant is unnecessary. 
-let Defs = [EAX, EDX, EFLAGS], FPForm = SpecialFP in { +let Defs = [EAX, EDX, ECX, EFLAGS], FPForm = SpecialFP in { def WIN_FTOL_32 : I<0, Pseudo, (outs), (ins RFP32:$src), "# win32 fptoui", [(X86WinFTOL RFP32:$src)]>, @@ -216,48 +217,38 @@ def MORESTACK_RET_RESTORE_R10 : I<0, Pseudo, (outs), (ins), // Alias Instructions //===----------------------------------------------------------------------===// -// Alias instructions that map movr0 to xor. +// Alias instruction mapping movr0 to xor. // FIXME: remove when we can teach regalloc that xor reg, reg is ok. // FIXME: Set encoding to pseudo. let Defs = [EFLAGS], isReMaterializable = 1, isAsCheapAsAMove = 1, - isCodeGenOnly = 1 in { -def MOV8r0 : I<0x30, MRMInitReg, (outs GR8 :$dst), (ins), "", - [(set GR8:$dst, 0)], IIC_ALU_NONMEM>, Sched<[WriteZero]>; - -// We want to rewrite MOV16r0 in terms of MOV32r0, because it's a smaller -// encoding and avoids a partial-register update sometimes, but doing so -// at isel time interferes with rematerialization in the current register -// allocator. For now, this is rewritten when the instruction is lowered -// to an MCInst. -def MOV16r0 : I<0x31, MRMInitReg, (outs GR16:$dst), (ins), - "", - [(set GR16:$dst, 0)], IIC_ALU_NONMEM>, OpSize, - Sched<[WriteZero]>; - -// FIXME: Set encoding to pseudo. + isCodeGenOnly = 1 in def MOV32r0 : I<0x31, MRMInitReg, (outs GR32:$dst), (ins), "", [(set GR32:$dst, 0)], IIC_ALU_NONMEM>, Sched<[WriteZero]>; -} -// We want to rewrite MOV64r0 in terms of MOV32r0, because it's sometimes a -// smaller encoding, but doing so at isel time interferes with rematerialization -// in the current register allocator. For now, this is rewritten when the -// instruction is lowered to an MCInst. -// FIXME: AddedComplexity gives this a higher priority than MOV64ri32. Remove -// when we have a better way to specify isel priority. -let Defs = [EFLAGS], isCodeGenOnly=1, - AddedComplexity = 1, isReMaterializable = 1, isAsCheapAsAMove = 1 in -def MOV64r0 : I<0x31, MRMInitReg, (outs GR64:$dst), (ins), "", - [(set GR64:$dst, 0)], IIC_ALU_NONMEM>, Sched<[WriteZero]>; +// Other widths can also make use of the 32-bit xor, which may have a smaller +// encoding and avoid partial register updates. +def : Pat<(i8 0), (EXTRACT_SUBREG (MOV32r0), sub_8bit)>; +def : Pat<(i16 0), (EXTRACT_SUBREG (MOV32r0), sub_16bit)>; +def : Pat<(i64 0), (SUBREG_TO_REG (i64 0), (MOV32r0), sub_32bit)> { + let AddedComplexity = 20; +} // Materialize i64 constant where top 32-bits are zero. This could theoretically // use MOV32ri with a SUBREG_TO_REG to represent the zero-extension, however // that would make it more difficult to rematerialize. let AddedComplexity = 1, isReMaterializable = 1, isAsCheapAsAMove = 1, - isCodeGenOnly = 1 in -def MOV64ri64i32 : Ii32<0xB8, AddRegFrm, (outs GR64:$dst), (ins i64i32imm:$src), - "", [(set GR64:$dst, i64immZExt32:$src)], - IIC_ALU_NONMEM>, Sched<[WriteALU]>; + isCodeGenOnly = 1, neverHasSideEffects = 1 in +def MOV32ri64 : Ii32<0xb8, AddRegFrm, (outs GR32:$dst), (ins i64i32imm:$src), + "", [], IIC_ALU_NONMEM>, Sched<[WriteALU]>; + +// This 64-bit pseudo-move can be used for both a 64-bit constant that is +// actually the zero-extension of a 32-bit constant, and for labels in the +// x86-64 small code model. +def mov64imm32 : ComplexPattern<i64, 1, "SelectMOV64Imm32", [imm, X86Wrapper]>; + +let AddedComplexity = 1 in +def : Pat<(i64 mov64imm32:$src), + (SUBREG_TO_REG (i64 0), (MOV32ri64 mov64imm32:$src), sub_32bit)>; // Use sbb to materialize carry bit. 
let Uses = [EFLAGS], Defs = [EFLAGS], isPseudo = 1, SchedRW = [WriteALU] in { @@ -893,6 +884,24 @@ let Uses = [EFLAGS], usesCustomInserter = 1 in { [(set VR256:$dst, (v4i64 (X86cmov VR256:$t, VR256:$f, imm:$cond, EFLAGS)))]>; + def CMOV_V8I64 : I<0, Pseudo, + (outs VR512:$dst), (ins VR512:$t, VR512:$f, i8imm:$cond), + "#CMOV_V8I64 PSEUDO!", + [(set VR512:$dst, + (v8i64 (X86cmov VR512:$t, VR512:$f, imm:$cond, + EFLAGS)))]>; + def CMOV_V8F64 : I<0, Pseudo, + (outs VR512:$dst), (ins VR512:$t, VR512:$f, i8imm:$cond), + "#CMOV_V8F64 PSEUDO!", + [(set VR512:$dst, + (v8f64 (X86cmov VR512:$t, VR512:$f, imm:$cond, + EFLAGS)))]>; + def CMOV_V16F32 : I<0, Pseudo, + (outs VR512:$dst), (ins VR512:$t, VR512:$f, i8imm:$cond), + "#CMOV_V16F32 PSEUDO!", + [(set VR512:$dst, + (v16f32 (X86cmov VR512:$t, VR512:$f, imm:$cond, + EFLAGS)))]>; } @@ -926,8 +935,6 @@ def : Pat<(store (i32 (X86Wrapper texternalsym:$src)), addr:$dst), def : Pat<(store (i32 (X86Wrapper tblockaddress:$src)), addr:$dst), (MOV32mi addr:$dst, tblockaddress:$src)>; - - // ConstantPool GlobalAddress, ExternalSymbol, and JumpTable when not in small // code model mode, should use 'movabs'. FIXME: This is really a hack, the // 'movabs' predicate should handle this sort of thing. @@ -942,20 +949,6 @@ def : Pat<(i64 (X86Wrapper texternalsym:$dst)), def : Pat<(i64 (X86Wrapper tblockaddress:$dst)), (MOV64ri tblockaddress:$dst)>, Requires<[FarData]>; -// In static codegen with small code model, we can get the address of a label -// into a register with 'movl'. FIXME: This is a hack, the 'imm' predicate of -// the MOV64ri64i32 should accept these. -def : Pat<(i64 (X86Wrapper tconstpool :$dst)), - (MOV64ri64i32 tconstpool :$dst)>, Requires<[SmallCode]>; -def : Pat<(i64 (X86Wrapper tjumptable :$dst)), - (MOV64ri64i32 tjumptable :$dst)>, Requires<[SmallCode]>; -def : Pat<(i64 (X86Wrapper tglobaladdr :$dst)), - (MOV64ri64i32 tglobaladdr :$dst)>, Requires<[SmallCode]>; -def : Pat<(i64 (X86Wrapper texternalsym:$dst)), - (MOV64ri64i32 texternalsym:$dst)>, Requires<[SmallCode]>; -def : Pat<(i64 (X86Wrapper tblockaddress:$dst)), - (MOV64ri64i32 tblockaddress:$dst)>, Requires<[SmallCode]>; - // In kernel code model, we can get the address of a label // into a register with 'movq'. FIXME: This is a hack, the 'imm' predicate of // the MOV64ri32 should accept these. @@ -989,14 +982,12 @@ def : Pat<(store (i64 (X86Wrapper tblockaddress:$src)), addr:$dst), (MOV64mi32 addr:$dst, tblockaddress:$src)>, Requires<[NearData, IsStatic]>; - - // Calls // tls has some funny stuff here... 
// This corresponds to movabs $foo@tpoff, %rax def : Pat<(i64 (X86Wrapper tglobaltlsaddr :$dst)), - (MOV64ri tglobaltlsaddr :$dst)>; + (MOV64ri32 tglobaltlsaddr :$dst)>; // This corresponds to add $foo@tpoff, %rax def : Pat<(add GR64:$src1, (X86Wrapper tglobaltlsaddr :$dst)), (ADD64ri32 GR64:$src1, tglobaltlsaddr :$dst)>; @@ -1119,7 +1110,8 @@ defm : CMOVmr<X86_COND_NO, CMOVO16rm , CMOVO32rm , CMOVO64rm>; def : Pat<(zextloadi8i1 addr:$src), (MOV8rm addr:$src)>; def : Pat<(zextloadi16i1 addr:$src), (MOVZX16rm8 addr:$src)>; def : Pat<(zextloadi32i1 addr:$src), (MOVZX32rm8 addr:$src)>; -def : Pat<(zextloadi64i1 addr:$src), (MOVZX64rm8 addr:$src)>; +def : Pat<(zextloadi64i1 addr:$src), + (SUBREG_TO_REG (i64 0), (MOVZX32rm8 addr:$src), sub_32bit)>; // extload bool -> extload byte // When extloading from 16-bit and smaller memory locations into 64-bit @@ -1133,14 +1125,16 @@ def : Pat<(extloadi16i8 addr:$src), (MOVZX16rm8 addr:$src)>; def : Pat<(extloadi32i8 addr:$src), (MOVZX32rm8 addr:$src)>; def : Pat<(extloadi32i16 addr:$src), (MOVZX32rm16 addr:$src)>; -def : Pat<(extloadi64i1 addr:$src), (MOVZX64rm8 addr:$src)>; -def : Pat<(extloadi64i8 addr:$src), (MOVZX64rm8 addr:$src)>; -def : Pat<(extloadi64i16 addr:$src), (MOVZX64rm16 addr:$src)>; // For other extloads, use subregs, since the high contents of the register are // defined after an extload. +def : Pat<(extloadi64i1 addr:$src), + (SUBREG_TO_REG (i64 0), (MOVZX32rm8 addr:$src), sub_32bit)>; +def : Pat<(extloadi64i8 addr:$src), + (SUBREG_TO_REG (i64 0), (MOVZX32rm8 addr:$src), sub_32bit)>; +def : Pat<(extloadi64i16 addr:$src), + (SUBREG_TO_REG (i64 0), (MOVZX32rm16 addr:$src), sub_32bit)>; def : Pat<(extloadi64i32 addr:$src), - (SUBREG_TO_REG (i64 0), (MOV32rm addr:$src), - sub_32bit)>; + (SUBREG_TO_REG (i64 0), (MOV32rm addr:$src), sub_32bit)>; // anyext. Define these to do an explicit zero-extend to // avoid partial-register updates. 
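[Illustrative note, not part of the diff.] The extload/zextload rewrites just above replace the old 64-bit-destination MOVZX forms with a 32-bit MOVZX wrapped in SUBREG_TO_REG. They rely on the x86-64 rule that any write to a 32-bit GPR zeroes bits 63:32 of the containing 64-bit register. A minimal C++ model of that rule follows; writeSub32 is a made-up helper for the sketch, not LLVM code.

#include <cassert>
#include <cstdint>

// Models "write EAX, observe RAX": the stale upper half of the 64-bit
// register is discarded, so the 64-bit value is exactly the zero-extended
// 32-bit result.
static uint64_t writeSub32(uint64_t /*oldRax*/, uint32_t eaxResult) {
  return static_cast<uint64_t>(eaxResult);
}

int main() {
  uint64_t rax = ~0ULL;                              // stale bits in the full register
  uint8_t  cl  = 0x9c;                               // byte being zero-extended
  rax = writeSub32(rax, static_cast<uint32_t>(cl));  // like movzbl %cl, %eax
  assert(rax == 0x9c);                               // bits 63:32 are guaranteed zero
  return 0;
}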
@@ -1152,8 +1146,10 @@ def : Pat<(i32 (anyext GR8 :$src)), (MOVZX32rr8 GR8 :$src)>; def : Pat<(i32 (anyext GR16:$src)), (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR16:$src, sub_16bit)>; -def : Pat<(i64 (anyext GR8 :$src)), (MOVZX64rr8 GR8 :$src)>; -def : Pat<(i64 (anyext GR16:$src)), (MOVZX64rr16 GR16 :$src)>; +def : Pat<(i64 (anyext GR8 :$src)), + (SUBREG_TO_REG (i64 0), (MOVZX32rr8 GR8 :$src), sub_32bit)>; +def : Pat<(i64 (anyext GR16:$src)), + (SUBREG_TO_REG (i64 0), (MOVZX32rr16 GR16 :$src), sub_32bit)>; def : Pat<(i64 (anyext GR32:$src)), (SUBREG_TO_REG (i64 0), GR32:$src, sub_32bit)>; @@ -1318,13 +1314,19 @@ def : Pat<(and GR16:$src1, 0xff), // r & (2^32-1) ==> movz def : Pat<(and GR64:$src, 0x00000000FFFFFFFF), - (MOVZX64rr32 (EXTRACT_SUBREG GR64:$src, sub_32bit))>; + (SUBREG_TO_REG (i64 0), + (MOV32rr (EXTRACT_SUBREG GR64:$src, sub_32bit)), + sub_32bit)>; // r & (2^16-1) ==> movz def : Pat<(and GR64:$src, 0xffff), - (MOVZX64rr16 (i16 (EXTRACT_SUBREG GR64:$src, sub_16bit)))>; + (SUBREG_TO_REG (i64 0), + (MOVZX32rr16 (i16 (EXTRACT_SUBREG GR64:$src, sub_16bit))), + sub_32bit)>; // r & (2^8-1) ==> movz def : Pat<(and GR64:$src, 0xff), - (MOVZX64rr8 (i8 (EXTRACT_SUBREG GR64:$src, sub_8bit)))>; + (SUBREG_TO_REG (i64 0), + (MOVZX32rr8 (i8 (EXTRACT_SUBREG GR64:$src, sub_8bit))), + sub_32bit)>; // r & (2^8-1) ==> movz def : Pat<(and GR32:$src1, 0xff), (MOVZX32rr8 (EXTRACT_SUBREG GR32:$src1, sub_8bit))>, diff --git a/contrib/llvm/lib/Target/X86/X86InstrControl.td b/contrib/llvm/lib/Target/X86/X86InstrControl.td index 0e69651..e4ccc06 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrControl.td +++ b/contrib/llvm/lib/Target/X86/X86InstrControl.td @@ -49,10 +49,12 @@ let isTerminator = 1, isReturn = 1, isBarrier = 1, let isBarrier = 1, isBranch = 1, isTerminator = 1, SchedRW = [WriteJump] in { def JMP_4 : Ii32PCRel<0xE9, RawFrm, (outs), (ins brtarget:$dst), "jmp\t$dst", [(br bb:$dst)], IIC_JMP_REL>; + let hasSideEffects = 0 in def JMP_1 : Ii8PCRel<0xEB, RawFrm, (outs), (ins brtarget8:$dst), "jmp\t$dst", [], IIC_JMP_REL>; // FIXME : Intel syntax for JMP64pcrel32 such that it is not ambiguious // with JMP_1. + let hasSideEffects = 0 in def JMP64pcrel32 : I<0xE9, RawFrm, (outs), (ins brtarget:$dst), "jmpq\t$dst", [], IIC_JMP_REL>; } @@ -60,6 +62,7 @@ let isBarrier = 1, isBranch = 1, isTerminator = 1, SchedRW = [WriteJump] in { // Conditional Branches. let isBranch = 1, isTerminator = 1, Uses = [EFLAGS], SchedRW = [WriteJump] in { multiclass ICBr<bits<8> opc1, bits<8> opc4, string asm, PatFrag Cond> { + let hasSideEffects = 0 in def _1 : Ii8PCRel <opc1, RawFrm, (outs), (ins brtarget8:$dst), asm, [], IIC_Jcc>; def _4 : Ii32PCRel<opc4, RawFrm, (outs), (ins brtarget:$dst), asm, @@ -85,7 +88,7 @@ defm JLE : ICBr<0x7E, 0x8E, "jle\t$dst", X86_COND_LE>; defm JG : ICBr<0x7F, 0x8F, "jg\t$dst" , X86_COND_G>; // jcx/jecx/jrcx instructions. -let isBranch = 1, isTerminator = 1, SchedRW = [WriteJump] in { +let isBranch = 1, isTerminator = 1, hasSideEffects = 0, SchedRW = [WriteJump] in { // These are the 32-bit versions of this instruction for the asmparser. In // 32-bit mode, the address size prefix is jcxz and the unprefixed version is // jecxz. 
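[Illustrative note, not part of the diff.] The "r & (2^N-1) ==> movz" patterns earlier in the X86InstrCompiler.td hunk rest on the identity that masking with an all-ones low mask (0xff, 0xffff, 0xffffffff) is the same as truncating to that width and zero-extending back, which is exactly what the selected 32-bit MOVZX/MOV plus SUBREG_TO_REG sequences compute. A small C++ check of the identity:

#include <cassert>
#include <cstdint>

int main() {
  uint64_t r = 0x1122334455667788ULL;
  // and with a low mask == truncate to that width, then zero-extend
  assert((r & 0xffULL)       == static_cast<uint64_t>(static_cast<uint8_t>(r)));
  assert((r & 0xffffULL)     == static_cast<uint64_t>(static_cast<uint16_t>(r)));
  assert((r & 0xffffffffULL) == static_cast<uint64_t>(static_cast<uint32_t>(r)));
  return 0;
}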
diff --git a/contrib/llvm/lib/Target/X86/X86InstrExtension.td b/contrib/llvm/lib/Target/X86/X86InstrExtension.td index 6dc7175..4090550 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrExtension.td +++ b/contrib/llvm/lib/Target/X86/X86InstrExtension.td @@ -14,26 +14,26 @@ let neverHasSideEffects = 1 in { let Defs = [AX], Uses = [AL] in def CBW : I<0x98, RawFrm, (outs), (ins), - "{cbtw|cbw}", []>, OpSize; // AX = signext(AL) + "{cbtw|cbw}", [], IIC_CBW>, OpSize; // AX = signext(AL) let Defs = [EAX], Uses = [AX] in def CWDE : I<0x98, RawFrm, (outs), (ins), - "{cwtl|cwde}", []>; // EAX = signext(AX) + "{cwtl|cwde}", [], IIC_CBW>; // EAX = signext(AX) let Defs = [AX,DX], Uses = [AX] in def CWD : I<0x99, RawFrm, (outs), (ins), - "{cwtd|cwd}", []>, OpSize; // DX:AX = signext(AX) + "{cwtd|cwd}", [], IIC_CBW>, OpSize; // DX:AX = signext(AX) let Defs = [EAX,EDX], Uses = [EAX] in def CDQ : I<0x99, RawFrm, (outs), (ins), - "{cltd|cdq}", []>; // EDX:EAX = signext(EAX) + "{cltd|cdq}", [], IIC_CBW>; // EDX:EAX = signext(EAX) let Defs = [RAX], Uses = [EAX] in def CDQE : RI<0x98, RawFrm, (outs), (ins), - "{cltq|cdqe}", []>; // RAX = signext(EAX) + "{cltq|cdqe}", [], IIC_CBW>; // RAX = signext(EAX) let Defs = [RAX,RDX], Uses = [RAX] in def CQO : RI<0x99, RawFrm, (outs), (ins), - "{cqto|cqo}", []>; // RDX:RAX = signext(RAX) + "{cqto|cqo}", [], IIC_CBW>; // RDX:RAX = signext(RAX) } @@ -149,38 +149,24 @@ def MOVZX64rm16_Q : RI<0xB7, MRMSrcMem, (outs GR64:$dst), (ins i16mem:$src), "movz{wq|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX>, TB, Sched<[WriteALULd]>; -// FIXME: These should be Pat patterns. -let isCodeGenOnly = 1 in { - -// Use movzbl instead of movzbq when the destination is a register; it's -// equivalent due to implicit zero-extending, and it has a smaller encoding. -def MOVZX64rr8 : I<0xB6, MRMSrcReg, (outs GR64:$dst), (ins GR8 :$src), - "", [(set GR64:$dst, (zext GR8:$src))], IIC_MOVZX>, TB, - Sched<[WriteALU]>; -def MOVZX64rm8 : I<0xB6, MRMSrcMem, (outs GR64:$dst), (ins i8mem :$src), - "", [(set GR64:$dst, (zextloadi64i8 addr:$src))], IIC_MOVZX>, - TB, Sched<[WriteALULd]>; -// Use movzwl instead of movzwq when the destination is a register; it's -// equivalent due to implicit zero-extending, and it has a smaller encoding. -def MOVZX64rr16: I<0xB7, MRMSrcReg, (outs GR64:$dst), (ins GR16:$src), - "", [(set GR64:$dst, (zext GR16:$src))], IIC_MOVZX>, TB, - Sched<[WriteALU]>; -def MOVZX64rm16: I<0xB7, MRMSrcMem, (outs GR64:$dst), (ins i16mem:$src), - "", [(set GR64:$dst, (zextloadi64i16 addr:$src))], - IIC_MOVZX>, TB, Sched<[WriteALULd]>; - -// There's no movzlq instruction, but movl can be used for this purpose, using -// implicit zero-extension. The preferred way to do 32-bit-to-64-bit zero -// extension on x86-64 is to use a SUBREG_TO_REG to utilize implicit -// zero-extension, however this isn't possible when the 32-bit value is -// defined by a truncate or is copied from something where the high bits aren't -// necessarily all zero. In such cases, we fall back to these explicit zext -// instructions. -def MOVZX64rr32 : I<0x89, MRMDestReg, (outs GR64:$dst), (ins GR32:$src), - "", [(set GR64:$dst, (zext GR32:$src))], IIC_MOVZX>, - Sched<[WriteALU]>; -def MOVZX64rm32 : I<0x8B, MRMSrcMem, (outs GR64:$dst), (ins i32mem:$src), - "", [(set GR64:$dst, (zextloadi64i32 addr:$src))], - IIC_MOVZX>, Sched<[WriteALULd]>; -} - +// 64-bit zero-extension patterns use SUBREG_TO_REG and an operation writing a +// 32-bit register. 
+def : Pat<(i64 (zext GR8:$src)), + (SUBREG_TO_REG (i64 0), (MOVZX32rr8 GR8:$src), sub_32bit)>; +def : Pat<(zextloadi64i8 addr:$src), + (SUBREG_TO_REG (i64 0), (MOVZX32rm8 addr:$src), sub_32bit)>; + +def : Pat<(i64 (zext GR16:$src)), + (SUBREG_TO_REG (i64 0), (MOVZX32rr16 GR16:$src), sub_32bit)>; +def : Pat<(zextloadi64i16 addr:$src), + (SUBREG_TO_REG (i64 0), (MOVZX32rm16 addr:$src), sub_32bit)>; + +// The preferred way to do 32-bit-to-64-bit zero extension on x86-64 is to use a +// SUBREG_TO_REG to utilize implicit zero-extension, however this isn't possible +// when the 32-bit value is defined by a truncate or is copied from something +// where the high bits aren't necessarily all zero. In such cases, we fall back +// to these explicit zext instructions. +def : Pat<(i64 (zext GR32:$src)), + (SUBREG_TO_REG (i64 0), (MOV32rr GR32:$src), sub_32bit)>; +def : Pat<(i64 (zextloadi64i32 addr:$src)), + (SUBREG_TO_REG (i64 0), (MOV32rm addr:$src), sub_32bit)>; diff --git a/contrib/llvm/lib/Target/X86/X86InstrFMA.td b/contrib/llvm/lib/Target/X86/X86InstrFMA.td index 7759a8a2..69cd5a5 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrFMA.td +++ b/contrib/llvm/lib/Target/X86/X86InstrFMA.td @@ -74,43 +74,43 @@ let neverHasSideEffects = 1 in { // Fused Multiply-Add let ExeDomain = SSEPackedSingle in { - defm VFMADDPS : fma3p_forms<0x98, 0xA8, 0xB8, "vfmadd", "ps", memopv4f32, - memopv8f32, X86Fmadd, v4f32, v8f32>; - defm VFMSUBPS : fma3p_forms<0x9A, 0xAA, 0xBA, "vfmsub", "ps", memopv4f32, - memopv8f32, X86Fmsub, v4f32, v8f32>; + defm VFMADDPS : fma3p_forms<0x98, 0xA8, 0xB8, "vfmadd", "ps", loadv4f32, + loadv8f32, X86Fmadd, v4f32, v8f32>; + defm VFMSUBPS : fma3p_forms<0x9A, 0xAA, 0xBA, "vfmsub", "ps", loadv4f32, + loadv8f32, X86Fmsub, v4f32, v8f32>; defm VFMADDSUBPS : fma3p_forms<0x96, 0xA6, 0xB6, "vfmaddsub", "ps", - memopv4f32, memopv8f32, X86Fmaddsub, + loadv4f32, loadv8f32, X86Fmaddsub, v4f32, v8f32>; defm VFMSUBADDPS : fma3p_forms<0x97, 0xA7, 0xB7, "vfmsubadd", "ps", - memopv4f32, memopv8f32, X86Fmsubadd, + loadv4f32, loadv8f32, X86Fmsubadd, v4f32, v8f32>; } let ExeDomain = SSEPackedDouble in { - defm VFMADDPD : fma3p_forms<0x98, 0xA8, 0xB8, "vfmadd", "pd", memopv2f64, - memopv4f64, X86Fmadd, v2f64, v4f64>, VEX_W; - defm VFMSUBPD : fma3p_forms<0x9A, 0xAA, 0xBA, "vfmsub", "pd", memopv2f64, - memopv4f64, X86Fmsub, v2f64, v4f64>, VEX_W; + defm VFMADDPD : fma3p_forms<0x98, 0xA8, 0xB8, "vfmadd", "pd", loadv2f64, + loadv4f64, X86Fmadd, v2f64, v4f64>, VEX_W; + defm VFMSUBPD : fma3p_forms<0x9A, 0xAA, 0xBA, "vfmsub", "pd", loadv2f64, + loadv4f64, X86Fmsub, v2f64, v4f64>, VEX_W; defm VFMADDSUBPD : fma3p_forms<0x96, 0xA6, 0xB6, "vfmaddsub", "pd", - memopv2f64, memopv4f64, X86Fmaddsub, + loadv2f64, loadv4f64, X86Fmaddsub, v2f64, v4f64>, VEX_W; defm VFMSUBADDPD : fma3p_forms<0x97, 0xA7, 0xB7, "vfmsubadd", "pd", - memopv2f64, memopv4f64, X86Fmsubadd, + loadv2f64, loadv4f64, X86Fmsubadd, v2f64, v4f64>, VEX_W; } // Fused Negative Multiply-Add let ExeDomain = SSEPackedSingle in { - defm VFNMADDPS : fma3p_forms<0x9C, 0xAC, 0xBC, "vfnmadd", "ps", memopv4f32, - memopv8f32, X86Fnmadd, v4f32, v8f32>; - defm VFNMSUBPS : fma3p_forms<0x9E, 0xAE, 0xBE, "vfnmsub", "ps", memopv4f32, - memopv8f32, X86Fnmsub, v4f32, v8f32>; + defm VFNMADDPS : fma3p_forms<0x9C, 0xAC, 0xBC, "vfnmadd", "ps", loadv4f32, + loadv8f32, X86Fnmadd, v4f32, v8f32>; + defm VFNMSUBPS : fma3p_forms<0x9E, 0xAE, 0xBE, "vfnmsub", "ps", loadv4f32, + loadv8f32, X86Fnmsub, v4f32, v8f32>; } let ExeDomain = SSEPackedDouble in { - defm VFNMADDPD : fma3p_forms<0x9C, 
0xAC, 0xBC, "vfnmadd", "pd", memopv2f64, - memopv4f64, X86Fnmadd, v2f64, v4f64>, VEX_W; + defm VFNMADDPD : fma3p_forms<0x9C, 0xAC, 0xBC, "vfnmadd", "pd", loadv2f64, + loadv4f64, X86Fnmadd, v2f64, v4f64>, VEX_W; defm VFNMSUBPD : fma3p_forms<0x9E, 0xAE, 0xBE, "vfnmsub", "pd", - memopv2f64, memopv4f64, X86Fnmsub, v2f64, + loadv2f64, loadv4f64, X86Fnmsub, v2f64, v4f64>, VEX_W; } @@ -206,25 +206,26 @@ multiclass fma4s<bits<8> opc, string OpcodeStr, RegisterClass RC, !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set RC:$dst, - (OpVT (OpNode RC:$src1, RC:$src2, RC:$src3)))]>, VEX_W, MemOp4; + (OpVT (OpNode RC:$src1, RC:$src2, RC:$src3)))]>, VEX_W, VEX_LIG, MemOp4; def rm : FMA4<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, RC:$src2, x86memop:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set RC:$dst, (OpNode RC:$src1, RC:$src2, - (mem_frag addr:$src3)))]>, VEX_W, MemOp4; + (mem_frag addr:$src3)))]>, VEX_W, VEX_LIG, MemOp4; def mr : FMA4<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2, RC:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set RC:$dst, - (OpNode RC:$src1, (mem_frag addr:$src2), RC:$src3))]>; + (OpNode RC:$src1, (mem_frag addr:$src2), RC:$src3))]>, VEX_LIG; // For disassembler let isCodeGenOnly = 1, hasSideEffects = 0 in def rr_REV : FMA4<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2, RC:$src3), !strconcat(OpcodeStr, - "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), []>; + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), []>, + VEX_LIG; } multiclass fma4s_int<bits<8> opc, string OpcodeStr, Operand memop, @@ -235,19 +236,19 @@ multiclass fma4s_int<bits<8> opc, string OpcodeStr, Operand memop, !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set VR128:$dst, - (Int VR128:$src1, VR128:$src2, VR128:$src3))]>, VEX_W, MemOp4; + (Int VR128:$src1, VR128:$src2, VR128:$src3))]>, VEX_W, VEX_LIG, MemOp4; def rm_Int : FMA4<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, memop:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set VR128:$dst, (Int VR128:$src1, VR128:$src2, - mem_cpat:$src3))]>, VEX_W, MemOp4; + mem_cpat:$src3))]>, VEX_W, VEX_LIG, MemOp4; def mr_Int : FMA4<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, memop:$src2, VR128:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set VR128:$dst, - (Int VR128:$src1, mem_cpat:$src2, VR128:$src3))]>; + (Int VR128:$src1, mem_cpat:$src2, VR128:$src3))]>, VEX_LIG; } multiclass fma4p<bits<8> opc, string OpcodeStr, SDNode OpNode, @@ -338,31 +339,31 @@ defm VFNMSUBSD4 : fma4s<0x7F, "vfnmsubsd", FR64, f64mem, f64, let ExeDomain = SSEPackedSingle in { defm VFMADDPS4 : fma4p<0x68, "vfmaddps", X86Fmadd, v4f32, v8f32, - memopv4f32, memopv8f32>; + loadv4f32, loadv8f32>; defm VFMSUBPS4 : fma4p<0x6C, "vfmsubps", X86Fmsub, v4f32, v8f32, - memopv4f32, memopv8f32>; + loadv4f32, loadv8f32>; defm VFNMADDPS4 : fma4p<0x78, "vfnmaddps", X86Fnmadd, v4f32, v8f32, - memopv4f32, memopv8f32>; + loadv4f32, loadv8f32>; defm VFNMSUBPS4 : fma4p<0x7C, "vfnmsubps", X86Fnmsub, v4f32, v8f32, - memopv4f32, memopv8f32>; + loadv4f32, loadv8f32>; defm VFMADDSUBPS4 : fma4p<0x5C, "vfmaddsubps", X86Fmaddsub, v4f32, v8f32, - memopv4f32, memopv8f32>; + loadv4f32, loadv8f32>; defm VFMSUBADDPS4 : fma4p<0x5E, "vfmsubaddps", X86Fmsubadd, v4f32, v8f32, - memopv4f32, memopv8f32>; + 
loadv4f32, loadv8f32>; } let ExeDomain = SSEPackedDouble in { defm VFMADDPD4 : fma4p<0x69, "vfmaddpd", X86Fmadd, v2f64, v4f64, - memopv2f64, memopv4f64>; + loadv2f64, loadv4f64>; defm VFMSUBPD4 : fma4p<0x6D, "vfmsubpd", X86Fmsub, v2f64, v4f64, - memopv2f64, memopv4f64>; + loadv2f64, loadv4f64>; defm VFNMADDPD4 : fma4p<0x79, "vfnmaddpd", X86Fnmadd, v2f64, v4f64, - memopv2f64, memopv4f64>; + loadv2f64, loadv4f64>; defm VFNMSUBPD4 : fma4p<0x7D, "vfnmsubpd", X86Fnmsub, v2f64, v4f64, - memopv2f64, memopv4f64>; + loadv2f64, loadv4f64>; defm VFMADDSUBPD4 : fma4p<0x5D, "vfmaddsubpd", X86Fmaddsub, v2f64, v4f64, - memopv2f64, memopv4f64>; + loadv2f64, loadv4f64>; defm VFMSUBADDPD4 : fma4p<0x5F, "vfmsubaddpd", X86Fmsubadd, v2f64, v4f64, - memopv2f64, memopv4f64>; + loadv2f64, loadv4f64>; } diff --git a/contrib/llvm/lib/Target/X86/X86InstrFPStack.td b/contrib/llvm/lib/Target/X86/X86InstrFPStack.td index 2224a08..7c37888 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrFPStack.td +++ b/contrib/llvm/lib/Target/X86/X86InstrFPStack.td @@ -229,22 +229,22 @@ class FPrST0PInst<bits<8> o, string asm> // of some of the 'reverse' forms of the fsub and fdiv instructions. As such, // we have to put some 'r's in and take them out of weird places. def ADD_FST0r : FPST0rInst <0xC0, "fadd\t$op">; -def ADD_FrST0 : FPrST0Inst <0xC0, "fadd\t{%st(0), $op|$op, ST(0)}">; +def ADD_FrST0 : FPrST0Inst <0xC0, "fadd\t{%st(0), $op|$op, st(0)}">; def ADD_FPrST0 : FPrST0PInst<0xC0, "faddp\t$op">; def SUBR_FST0r : FPST0rInst <0xE8, "fsubr\t$op">; -def SUB_FrST0 : FPrST0Inst <0xE8, "fsub{r}\t{%st(0), $op|$op, ST(0)}">; +def SUB_FrST0 : FPrST0Inst <0xE8, "fsub{r}\t{%st(0), $op|$op, st(0)}">; def SUB_FPrST0 : FPrST0PInst<0xE8, "fsub{r}p\t$op">; def SUB_FST0r : FPST0rInst <0xE0, "fsub\t$op">; -def SUBR_FrST0 : FPrST0Inst <0xE0, "fsub{|r}\t{%st(0), $op|$op, ST(0)}">; +def SUBR_FrST0 : FPrST0Inst <0xE0, "fsub{|r}\t{%st(0), $op|$op, st(0)}">; def SUBR_FPrST0 : FPrST0PInst<0xE0, "fsub{|r}p\t$op">; def MUL_FST0r : FPST0rInst <0xC8, "fmul\t$op">; -def MUL_FrST0 : FPrST0Inst <0xC8, "fmul\t{%st(0), $op|$op, ST(0)}">; +def MUL_FrST0 : FPrST0Inst <0xC8, "fmul\t{%st(0), $op|$op, st(0)}">; def MUL_FPrST0 : FPrST0PInst<0xC8, "fmulp\t$op">; def DIVR_FST0r : FPST0rInst <0xF8, "fdivr\t$op">; -def DIV_FrST0 : FPrST0Inst <0xF8, "fdiv{r}\t{%st(0), $op|$op, ST(0)}">; +def DIV_FrST0 : FPrST0Inst <0xF8, "fdiv{r}\t{%st(0), $op|$op, st(0)}">; def DIV_FPrST0 : FPrST0PInst<0xF8, "fdiv{r}p\t$op">; def DIV_FST0r : FPST0rInst <0xF0, "fdiv\t$op">; -def DIVR_FrST0 : FPrST0Inst <0xF0, "fdiv{|r}\t{%st(0), $op|$op, ST(0)}">; +def DIVR_FrST0 : FPrST0Inst <0xF0, "fdiv{|r}\t{%st(0), $op|$op, st(0)}">; def DIVR_FPrST0 : FPrST0PInst<0xF0, "fdiv{|r}p\t$op">; def COM_FST0r : FPST0rInst <0xD0, "fcom\t$op">; @@ -337,21 +337,21 @@ defm CMOVNP : FPCMov<X86_COND_NP>; let Predicates = [HasCMov] in { // These are not factored because there's no clean way to pass DA/DB. 
def CMOVB_F : FPI<0xC0, AddRegFrm, (outs RST:$op), (ins), - "fcmovb\t{$op, %st(0)|ST(0), $op}">, DA; + "fcmovb\t{$op, %st(0)|st(0), $op}">, DA; def CMOVBE_F : FPI<0xD0, AddRegFrm, (outs RST:$op), (ins), - "fcmovbe\t{$op, %st(0)|ST(0), $op}">, DA; + "fcmovbe\t{$op, %st(0)|st(0), $op}">, DA; def CMOVE_F : FPI<0xC8, AddRegFrm, (outs RST:$op), (ins), - "fcmove\t{$op, %st(0)|ST(0), $op}">, DA; + "fcmove\t{$op, %st(0)|st(0), $op}">, DA; def CMOVP_F : FPI<0xD8, AddRegFrm, (outs RST:$op), (ins), - "fcmovu\t {$op, %st(0)|ST(0), $op}">, DA; + "fcmovu\t{$op, %st(0)|st(0), $op}">, DA; def CMOVNB_F : FPI<0xC0, AddRegFrm, (outs RST:$op), (ins), - "fcmovnb\t{$op, %st(0)|ST(0), $op}">, DB; + "fcmovnb\t{$op, %st(0)|st(0), $op}">, DB; def CMOVNBE_F: FPI<0xD0, AddRegFrm, (outs RST:$op), (ins), - "fcmovnbe\t{$op, %st(0)|ST(0), $op}">, DB; + "fcmovnbe\t{$op, %st(0)|st(0), $op}">, DB; def CMOVNE_F : FPI<0xC8, AddRegFrm, (outs RST:$op), (ins), - "fcmovne\t{$op, %st(0)|ST(0), $op}">, DB; + "fcmovne\t{$op, %st(0)|st(0), $op}">, DB; def CMOVNP_F : FPI<0xD8, AddRegFrm, (outs RST:$op), (ins), - "fcmovnu\t{$op, %st(0)|ST(0), $op}">, DB; + "fcmovnu\t{$op, %st(0)|st(0), $op}">, DB; } // Predicates = [HasCMov] // Floating point loads & stores. @@ -578,7 +578,7 @@ def COM_FIPr : FPI<0xF0, AddRegFrm, (outs), (ins RST:$reg), let SchedRW = [WriteALU] in { let Defs = [AX], Uses = [FPSW] in def FNSTSW16r : I<0xE0, RawFrm, // AX = fp flags - (outs), (ins), "fnstsw %ax", + (outs), (ins), "fnstsw\t{%ax|ax}", [(set AX, (X86fp_stsw FPSW))], IIC_FNSTSW>, DF; def FNSTCW16m : I<0xD9, MRM7m, // [mem16] = X87 control world diff --git a/contrib/llvm/lib/Target/X86/X86InstrFormats.td b/contrib/llvm/lib/Target/X86/X86InstrFormats.td index a71e024..0fd9011 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrFormats.td +++ b/contrib/llvm/lib/Target/X86/X86InstrFormats.td @@ -96,6 +96,20 @@ def SSEPackedSingle : Domain<1>; def SSEPackedDouble : Domain<2>; def SSEPackedInt : Domain<3>; +// Class specifying the vector form of the decompressed +// displacement of 8-bit. +class CD8VForm<bits<3> val> { + bits<3> Value = val; +} +def CD8VF : CD8VForm<0>; // v := VL +def CD8VH : CD8VForm<1>; // v := VL/2 +def CD8VQ : CD8VForm<2>; // v := VL/4 +def CD8VO : CD8VForm<3>; // v := VL/8 +def CD8VT1 : CD8VForm<4>; // v := 1 +def CD8VT2 : CD8VForm<5>; // v := 2 +def CD8VT4 : CD8VForm<6>; // v := 4 +def CD8VT8 : CD8VForm<7>; // v := 8 + // Prefix byte classes which are used to indicate to the ad-hoc machine code // emitter that various prefix bytes are required. 
class OpSize { bit hasOpSizePrefix = 1; } @@ -125,6 +139,7 @@ class T8XS { bits<5> Prefix = 18; } class TAXD { bits<5> Prefix = 19; } class XOP8 { bits<5> Prefix = 20; } class XOP9 { bits<5> Prefix = 21; } +class XOPA { bits<5> Prefix = 22; } class VEX { bit hasVEXPrefix = 1; } class VEX_W { bit hasVEX_WPrefix = 1; } class VEX_4V : VEX { bit hasVEX_4VPrefix = 1; } @@ -132,6 +147,19 @@ class VEX_4VOp3 : VEX { bit hasVEX_4VOp3Prefix = 1; } class VEX_I8IMM { bit hasVEX_i8ImmReg = 1; } class VEX_L { bit hasVEX_L = 1; } class VEX_LIG { bit ignoresVEX_L = 1; } +class EVEX : VEX { bit hasEVEXPrefix = 1; } +class EVEX_4V : VEX_4V { bit hasEVEXPrefix = 1; } +class EVEX_K { bit hasEVEX_K = 1; } +class EVEX_KZ : EVEX_K { bit hasEVEX_Z = 1; } +class EVEX_B { bit hasEVEX_B = 1; } +class EVEX_V512 { bit hasEVEX_L2 = 1; bit hasVEX_L = 0; } +class EVEX_CD8<int esize, CD8VForm form> { + bits<2> EVEX_CD8E = !if(!eq(esize, 8), 0b00, + !if(!eq(esize, 16), 0b01, + !if(!eq(esize, 32), 0b10, + !if(!eq(esize, 64), 0b11, ?)))); + bits<3> EVEX_CD8V = form.Value; +} class Has3DNow0F0FOpcode { bit has3DNow0F0FOpcode = 1; } class MemOp4 { bit hasMemOp4Prefix = 1; } class XOP { bit hasXOP_Prefix = 1; } @@ -177,6 +205,13 @@ class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins, // to be encoded in a immediate field? bit hasVEX_L = 0; // Does this inst use large (256-bit) registers? bit ignoresVEX_L = 0; // Does this instruction ignore the L-bit + bit hasEVEXPrefix = 0; // Does this inst require EVEX form? + bit hasEVEX_K = 0; // Does this inst require masking? + bit hasEVEX_Z = 0; // Does this inst set the EVEX_Z field? + bit hasEVEX_L2 = 0; // Does this inst set the EVEX_L2 field? + bit hasEVEX_B = 0; // Does this inst set the EVEX_B field? + bits<2> EVEX_CD8E = 0; // Compressed disp8 form - element-size. + bits<3> EVEX_CD8V = 0; // Compressed disp8 form - vector-width. bit has3DNow0F0FOpcode =0;// Wacky 3dNow! encoding? bit hasMemOp4Prefix = 0; // Same bit as VEX_W, but used for swapping operands bit hasXOP_Prefix = 0; // Does this inst require an XOP prefix? 
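[Illustrative note, not part of the diff.] The CD8VForm/EVEX_CD8 bits introduced above feed AVX-512's compressed-displacement ("disp8*N") encoding: an 8-bit memory displacement is scaled by a granularity N derived from the element size and the tuple form. The sketch below restates that rule in C++ for 512-bit (EVEX_V512) operands only and ignores the broadcast adjustment; it is a hedged illustration of the documented rule, not the actual LLVM encoder.

#include <cstdio>

// N for 512-bit operands: fractional forms take a fraction of the 64-byte
// vector, tuple forms take a multiple of the element size (broadcast not
// modeled). Forms are passed as "F", "H", "Q", "O", "T1", "T2", "T4", "T8".
static unsigned cd8Scale(unsigned elementBits, const char *form) {
  const unsigned vecBytes = 64, elemBytes = elementBits / 8;
  if (form[0] == 'F') return vecBytes;        // CD8VF: whole vector
  if (form[0] == 'H') return vecBytes / 2;    // CD8VH
  if (form[0] == 'Q') return vecBytes / 4;    // CD8VQ
  if (form[0] == 'O') return vecBytes / 8;    // CD8VO
  return elemBytes * (form[1] - '0');         // CD8VT1/2/4/8
}

int main() {
  // e.g. VPMOVDB is tagged EVEX_CD8<8, CD8VQ>, so an encoded disp8 of 1
  // means an effective displacement of 1 * 16 bytes (its i128mem operand).
  std::printf("N(VPMOVDB)          = %u\n", cd8Scale(8, "Q"));   // 16
  std::printf("N(tuple1, 64-bit)   = %u\n", cd8Scale(64, "T1")); // 8
  return 0;
}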
@@ -200,9 +235,16 @@ class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins, let TSFlags{37} = hasVEX_i8ImmReg; let TSFlags{38} = hasVEX_L; let TSFlags{39} = ignoresVEX_L; - let TSFlags{40} = has3DNow0F0FOpcode; - let TSFlags{41} = hasMemOp4Prefix; - let TSFlags{42} = hasXOP_Prefix; + let TSFlags{40} = hasEVEXPrefix; + let TSFlags{41} = hasEVEX_K; + let TSFlags{42} = hasEVEX_Z; + let TSFlags{43} = hasEVEX_L2; + let TSFlags{44} = hasEVEX_B; + let TSFlags{46-45} = EVEX_CD8E; + let TSFlags{49-47} = EVEX_CD8V; + let TSFlags{50} = has3DNow0F0FOpcode; + let TSFlags{51} = hasMemOp4Prefix; + let TSFlags{52} = hasXOP_Prefix; } class PseudoI<dag oops, dag iops, list<dag> pattern> @@ -292,13 +334,17 @@ class Iseg32 <bits<8> o, Format f, dag outs, dag ins, string asm, } def __xs : XS; +def __xd : XD; // SI - SSE 1 & 2 scalar instructions class SI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin = NoItinerary> : I<o, F, outs, ins, asm, pattern, itin> { - let Predicates = !if(hasVEXPrefix /* VEX */, [HasAVX], - !if(!eq(Prefix, __xs.Prefix), [UseSSE1], [UseSSE2])); + let Predicates = !if(hasEVEXPrefix /* EVEX */, [HasAVX512], + !if(hasVEXPrefix /* VEX */, [UseAVX], + !if(!eq(Prefix, __xs.Prefix), [UseSSE1], + !if(!eq(Prefix, __xd.Prefix), [UseSSE2], + !if(hasOpSizePrefix, [UseSSE2], [UseSSE1]))))); // AVX instructions have a 'v' prefix in the mnemonic let AsmString = !if(hasVEXPrefix, !strconcat("v", asm), asm); @@ -308,8 +354,9 @@ class SI<bits<8> o, Format F, dag outs, dag ins, string asm, class SIi8<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin = NoItinerary> : Ii8<o, F, outs, ins, asm, pattern, itin> { - let Predicates = !if(hasVEXPrefix /* VEX */, [HasAVX], - !if(!eq(Prefix, __xs.Prefix), [UseSSE1], [UseSSE2])); + let Predicates = !if(hasEVEXPrefix /* EVEX */, [HasAVX512], + !if(hasVEXPrefix /* VEX */, [UseAVX], + !if(!eq(Prefix, __xs.Prefix), [UseSSE1], [UseSSE2]))); // AVX instructions have a 'v' prefix in the mnemonic let AsmString = !if(hasVEXPrefix, !strconcat("v", asm), asm); @@ -319,8 +366,9 @@ class SIi8<bits<8> o, Format F, dag outs, dag ins, string asm, class PI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin, Domain d> : I<o, F, outs, ins, asm, pattern, itin, d> { - let Predicates = !if(hasVEXPrefix /* VEX */, [HasAVX], - !if(hasOpSizePrefix /* OpSize */, [UseSSE2], [UseSSE1])); + let Predicates = !if(hasEVEXPrefix /* EVEX */, [HasAVX512], + !if(hasVEXPrefix /* VEX */, [HasAVX], + !if(hasOpSizePrefix /* OpSize */, [UseSSE2], [UseSSE1]))); // AVX instructions have a 'v' prefix in the mnemonic let AsmString = !if(hasVEXPrefix, !strconcat("v", asm), asm); @@ -337,11 +385,12 @@ class MMXPI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> patter class PIi8<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin, Domain d> : Ii8<o, F, outs, ins, asm, pattern, itin, d> { - let Predicates = !if(hasVEX_4VPrefix /* VEX */, [HasAVX], - !if(hasOpSizePrefix /* OpSize */, [UseSSE2], [UseSSE1])); + let Predicates = !if(hasEVEXPrefix /* EVEX */, [HasAVX512], + !if(hasVEXPrefix /* VEX */, [HasAVX], + !if(hasOpSizePrefix /* OpSize */, [UseSSE2], [UseSSE1]))); // AVX instructions have a 'v' prefix in the mnemonic - let AsmString = !if(hasVEX_4VPrefix, !strconcat("v", asm), asm); + let AsmString = !if(hasVEXPrefix, !strconcat("v", asm), asm); } // SSE1 Instruction Templates: @@ -350,7 +399,7 @@ class PIi8<bits<8> o, 
Format F, dag outs, dag ins, string asm, // PSI - SSE1 instructions with TB prefix. // PSIi8 - SSE1 instructions with ImmT == Imm8 and TB prefix. // VSSI - SSE1 instructions with XS prefix in AVX form. -// VPSI - SSE1 instructions with TB prefix in AVX form. +// VPSI - SSE1 instructions with TB prefix in AVX form, packed single. class SSI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin = NoItinerary> @@ -381,10 +430,13 @@ class VPSI<bits<8> o, Format F, dag outs, dag ins, string asm, // SDIi8 - SSE2 instructions with ImmT == Imm8 and XD prefix. // S2SI - SSE2 instructions with XS prefix. // SSDIi8 - SSE2 instructions with ImmT == Imm8 and XS prefix. -// PDI - SSE2 instructions with TB and OpSize prefixes. +// PDI - SSE2 instructions with TB and OpSize prefixes, packed double domain. // PDIi8 - SSE2 instructions with ImmT == Imm8 and TB and OpSize prefixes. -// VSDI - SSE2 instructions with XD prefix in AVX form. -// VPDI - SSE2 instructions with TB and OpSize prefixes in AVX form. +// VSDI - SSE2 scalar instructions with XD prefix in AVX form. +// VPDI - SSE2 vector instructions with TB and OpSize prefixes in AVX form, +// packed double domain. +// VS2I - SSE2 scalar instructions with TB and OpSize prefixes in AVX form. +// S2I - SSE2 scalar instructions with TB and OpSize prefixes. // MMXSDIi8 - SSE2 instructions with ImmT == Imm8 and XD prefix as well as // MMX operands. // MMXSSDIi8 - SSE2 instructions with ImmT == Imm8 and XS prefix as well as @@ -413,7 +465,7 @@ class PDIi8<bits<8> o, Format F, dag outs, dag ins, string asm, class VSDI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin = NoItinerary> : I<o, F, outs, ins, !strconcat("v", asm), pattern, itin>, XD, - Requires<[HasAVX]>; + Requires<[UseAVX]>; class VS2SI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin = NoItinerary> : I<o, F, outs, ins, !strconcat("v", asm), pattern, itin>, XS, @@ -422,6 +474,14 @@ class VPDI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin = NoItinerary> : I<o, F, outs, ins, !strconcat("v", asm), pattern, itin, SSEPackedDouble>, TB, OpSize, Requires<[HasAVX]>; +class VS2I<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, !strconcat("v", asm), pattern, itin>, TB, + OpSize, Requires<[UseAVX]>; +class S2I<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, asm, pattern, itin>, TB, + OpSize, Requires<[UseSSE2]>; class MMXSDIi8<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin = NoItinerary> : Ii8<o, F, outs, ins, asm, pattern, itin>, XD, Requires<[HasSSE2]>; @@ -539,12 +599,80 @@ class AVX2AIi8<bits<8> o, Format F, dag outs, dag ins, string asm, : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA, OpSize, Requires<[HasAVX2]>; + +// AVX-512 Instruction Templates: +// Instructions introduced in AVX-512 (no SSE equivalent forms) +// +// AVX5128I - AVX-512 instructions with T8 and OpSize prefix. +// AVX512AIi8 - AVX-512 instructions with TA, OpSize prefix and ImmT = Imm8. +// AVX512PDI - AVX-512 instructions with TB, OpSize, double packed. +// AVX512PSI - AVX-512 instructions with TB, single packed. +// AVX512XS8I - AVX-512 instructions with T8 and XS prefixes. +// AVX512XSI - AVX-512 instructions with XS prefix, generic domain. 
+// AVX512BI - AVX-512 instructions with TB, OpSize, int packed domain. +// AVX512SI - AVX-512 scalar instructions with TB and OpSize prefixes. + +class AVX5128I<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8, OpSize, + Requires<[HasAVX512]>; +class AVX512XS8I<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8XS, + Requires<[HasAVX512]>; +class AVX512XSI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, asm, pattern, itin>, XS, + Requires<[HasAVX512]>; +class AVX512XDI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, XD, + Requires<[HasAVX512]>; +class AVX512BI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TB, OpSize, + Requires<[HasAVX512]>; +class AVX512BIi8<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TB, OpSize, + Requires<[HasAVX512]>; +class AVX512SI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TB, OpSize, + Requires<[HasAVX512]>; +class AVX512AIi8<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA, OpSize, + Requires<[HasAVX512]>; +class AVX512Ii8<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TB, + Requires<[HasAVX512]>; +class AVX512PDI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>, TB, + OpSize, Requires<[HasAVX512]>; +class AVX512PSI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, asm, pattern, itin, SSEPackedSingle>, TB, + Requires<[HasAVX512]>; +class AVX512PIi8<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, Domain d, InstrItinClass itin = NoItinerary> + : Ii8<o, F, outs, ins, asm, pattern, itin, d>, TB, Requires<[HasAVX512]>; +class AVX512PI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, Domain d, InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, asm, pattern, itin, d>, TB, Requires<[HasAVX512]>; +class AVX512FMA3<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag>pattern, InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, asm, pattern, itin>, T8, + OpSize, EVEX_4V, Requires<[HasAVX512]>; + // AES Instruction Templates: // // AES8I // These use the same encoding as the SSE4.2 T8 and TA encodings. 
class AES8I<bits<8> o, Format F, dag outs, dag ins, string asm, - list<dag>pattern, InstrItinClass itin = NoItinerary> + list<dag>pattern, InstrItinClass itin = IIC_AES> : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8, Requires<[HasAES]>; @@ -614,6 +742,13 @@ class RIi64<bits<8> o, Format f, dag outs, dag ins, string asm, let CodeSize = 3; } +class RIi64_NOREX<bits<8> o, Format f, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : X86Inst<o, f, Imm64, outs, ins, asm, itin> { + let Pattern = pattern; + let CodeSize = 3; +} + class RSSI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin = NoItinerary> : SSI<o, F, outs, ins, asm, pattern, itin>, REX_W; @@ -626,11 +761,18 @@ class RPDI<bits<8> o, Format F, dag outs, dag ins, string asm, class VRPDI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin = NoItinerary> : VPDI<o, F, outs, ins, asm, pattern, itin>, VEX_W; +class RS2I<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : S2I<o, F, outs, ins, asm, pattern, itin>, REX_W; +class VRS2I<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : VS2I<o, F, outs, ins, asm, pattern, itin>, VEX_W; // MMX Instruction templates // // MMXI - MMX instructions with TB prefix. +// MMXI32 - MMX instructions with TB prefix valid only in 32 bit mode. // MMXI64 - MMX instructions with TB prefix valid only in 64 bit mode. // MMX2I - MMX / SSE2 instructions with TB and OpSize prefixes. // MMXIi8 - MMX instructions with ImmT == Imm8 and TB prefix. @@ -640,6 +782,9 @@ class VRPDI<bits<8> o, Format F, dag outs, dag ins, string asm, class MMXI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin = NoItinerary> : I<o, F, outs, ins, asm, pattern, itin>, TB, Requires<[HasMMX]>; +class MMXI32<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, asm, pattern, itin>, TB, Requires<[HasMMX,In32BitMode]>; class MMXI64<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin = NoItinerary> : I<o, F, outs, ins, asm, pattern, itin>, TB, Requires<[HasMMX,In64BitMode]>; diff --git a/contrib/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/contrib/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td index 2a72fb6..1fed424 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/contrib/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -47,6 +47,8 @@ def X86for : SDNode<"X86ISD::FOR", SDTFPBinOp, [SDNPCommutative, SDNPAssociative]>; def X86fxor : SDNode<"X86ISD::FXOR", SDTFPBinOp, [SDNPCommutative, SDNPAssociative]>; +def X86fandn : SDNode<"X86ISD::FANDN", SDTFPBinOp, + [SDNPCommutative, SDNPAssociative]>; def X86frsqrt : SDNode<"X86ISD::FRSQRT", SDTFPUnaryOp>; def X86frcp : SDNode<"X86ISD::FRCP", SDTFPUnaryOp>; def X86fsrl : SDNode<"X86ISD::FSRL", SDTX86FPShiftOp>; @@ -103,6 +105,13 @@ def X86vsext : SDNode<"X86ISD::VSEXT", SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>, SDTCisInt<0>, SDTCisInt<1>]>>; +def X86vtrunc : SDNode<"X86ISD::VTRUNC", + SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>, + SDTCisInt<0>, SDTCisInt<1>]>>; +def X86vtruncm : SDNode<"X86ISD::VTRUNCM", + SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, + SDTCisInt<0>, SDTCisInt<1>, + SDTCisVec<2>, SDTCisInt<2>]>>; def X86vfpext : SDNode<"X86ISD::VFPEXT", SDTypeProfile<1, 
1, [SDTCisVec<0>, SDTCisVec<1>, SDTCisFP<0>, SDTCisFP<1>]>>; @@ -116,6 +125,15 @@ def X86cmpp : SDNode<"X86ISD::CMPP", SDTX86VFCMP>; def X86pcmpeq : SDNode<"X86ISD::PCMPEQ", SDTIntBinOp, [SDNPCommutative]>; def X86pcmpgt : SDNode<"X86ISD::PCMPGT", SDTIntBinOp>; +def X86IntCmpMask : SDTypeProfile<1, 2, + [SDTCisVec<0>, SDTCisSameAs<1, 2>, SDTCisInt<1>]>; +def X86pcmpeqm : SDNode<"X86ISD::PCMPEQM", X86IntCmpMask, [SDNPCommutative]>; +def X86pcmpgtm : SDNode<"X86ISD::PCMPGTM", X86IntCmpMask>; + +def X86CmpMaskCC : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<1, 2>, SDTCisVT<3, i8>]>; +def X86cmpm : SDNode<"X86ISD::CMPM", X86CmpMaskCC>; +def X86cmpmu : SDNode<"X86ISD::CMPMU", X86CmpMaskCC>; + def X86vshl : SDNode<"X86ISD::VSHL", SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, SDTCisVec<2>]>>; @@ -136,6 +154,11 @@ def SDTX86CmpPTest : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, def X86subus : SDNode<"X86ISD::SUBUS", SDTIntBinOp>; def X86ptest : SDNode<"X86ISD::PTEST", SDTX86CmpPTest>; def X86testp : SDNode<"X86ISD::TESTP", SDTX86CmpPTest>; +def X86kortest : SDNode<"X86ISD::KORTEST", SDTX86CmpPTest>; +def X86ktest : SDNode<"X86ISD::KTEST", SDTX86CmpPTest>; +def X86testm : SDNode<"X86ISD::TESTM", SDTypeProfile<1, 2, [SDTCisVec<0>, + SDTCisVec<1>, + SDTCisSameAs<2, 1>]>>; def X86pmuludq : SDNode<"X86ISD::PMULUDQ", SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, @@ -147,13 +170,17 @@ def X86pmuludq : SDNode<"X86ISD::PMULUDQ", def SDTShuff1Op : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<0,1>]>; def SDTShuff2Op : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, SDTCisSameAs<0,2>]>; +def SDTShuff3Op : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisSameAs<0,2>, SDTCisSameAs<0,3>]>; def SDTShuff2OpI : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, SDTCisInt<2>]>; def SDTShuff3OpI : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>, SDTCisSameAs<0,2>, SDTCisInt<3>]>; -def SDTVBroadcast : SDTypeProfile<1, 1, [SDTCisVec<0>]>; +def SDTVBroadcast : SDTypeProfile<1, 1, [SDTCisVec<0>]>; +def SDTVBroadcastm : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>]>; + def SDTBlend : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>, SDTCisSameAs<1,2>, SDTCisVT<3, i32>]>; @@ -188,10 +215,14 @@ def X86Unpckh : SDNode<"X86ISD::UNPCKH", SDTShuff2Op>; def X86VPermilp : SDNode<"X86ISD::VPERMILP", SDTShuff2OpI>; def X86VPermv : SDNode<"X86ISD::VPERMV", SDTShuff2Op>; def X86VPermi : SDNode<"X86ISD::VPERMI", SDTShuff2OpI>; +def X86VPermv3 : SDNode<"X86ISD::VPERMV3", SDTShuff3Op>; def X86VPerm2x128 : SDNode<"X86ISD::VPERM2X128", SDTShuff3OpI>; def X86VBroadcast : SDNode<"X86ISD::VBROADCAST", SDTVBroadcast>; +def X86VBroadcastm : SDNode<"X86ISD::VBROADCASTM", SDTVBroadcastm>; +def X86Vinsert : SDNode<"X86ISD::VINSERT", SDTypeProfile<1, 3, + [SDTCisSameAs<0, 1>, SDTCisPtrTy<3>]>, []>; def X86Blendi : SDNode<"X86ISD::BLENDI", SDTBlend>; def X86Fmadd : SDNode<"X86ISD::FMADD", SDTFma>; @@ -229,13 +260,13 @@ def sse_load_f64 : ComplexPattern<v2f64, 5, "SelectScalarSSELoad", [], def ssmem : Operand<v4f32> { let PrintMethod = "printf32mem"; let MIOperandInfo = (ops ptr_rc, i8imm, ptr_rc_nosp, i32imm, i8imm); - let ParserMatchClass = X86MemAsmOperand; + let ParserMatchClass = X86Mem32AsmOperand; let OperandType = "OPERAND_MEMORY"; } def sdmem : Operand<v2f64> { let PrintMethod = "printf64mem"; let MIOperandInfo = (ops ptr_rc, i8imm, ptr_rc_nosp, i32imm, i8imm); - let ParserMatchClass = X86MemAsmOperand; + let ParserMatchClass = X86Mem64AsmOperand; let OperandType = "OPERAND_MEMORY"; } @@ -255,9 
+286,16 @@ def loadv8f32 : PatFrag<(ops node:$ptr), (v8f32 (load node:$ptr))>; def loadv4f64 : PatFrag<(ops node:$ptr), (v4f64 (load node:$ptr))>; def loadv4i64 : PatFrag<(ops node:$ptr), (v4i64 (load node:$ptr))>; -// 128-/256-bit extload pattern fragments +// 512-bit load pattern fragments +def loadv16f32 : PatFrag<(ops node:$ptr), (v16f32 (load node:$ptr))>; +def loadv8f64 : PatFrag<(ops node:$ptr), (v8f64 (load node:$ptr))>; +def loadv16i32 : PatFrag<(ops node:$ptr), (v16i32 (load node:$ptr))>; +def loadv8i64 : PatFrag<(ops node:$ptr), (v8i64 (load node:$ptr))>; + +// 128-/256-/512-bit extload pattern fragments def extloadv2f32 : PatFrag<(ops node:$ptr), (v2f64 (extloadvf32 node:$ptr))>; def extloadv4f32 : PatFrag<(ops node:$ptr), (v4f64 (extloadvf32 node:$ptr))>; +def extloadv8f32 : PatFrag<(ops node:$ptr), (v8f64 (extloadvf32 node:$ptr))>; // Like 'store', but always requires 128-bit vector alignment. def alignedstore : PatFrag<(ops node:$val, node:$ptr), @@ -271,6 +309,12 @@ def alignedstore256 : PatFrag<(ops node:$val, node:$ptr), return cast<StoreSDNode>(N)->getAlignment() >= 32; }]>; +// Like 'store', but always requires 512-bit vector alignment. +def alignedstore512 : PatFrag<(ops node:$val, node:$ptr), + (store node:$val, node:$ptr), [{ + return cast<StoreSDNode>(N)->getAlignment() >= 64; +}]>; + // Like 'load', but always requires 128-bit vector alignment. def alignedload : PatFrag<(ops node:$ptr), (load node:$ptr), [{ return cast<LoadSDNode>(N)->getAlignment() >= 16; @@ -286,6 +330,11 @@ def alignedload256 : PatFrag<(ops node:$ptr), (load node:$ptr), [{ return cast<LoadSDNode>(N)->getAlignment() >= 32; }]>; +// Like 'load', but always requires 512-bit vector alignment. +def alignedload512 : PatFrag<(ops node:$ptr), (load node:$ptr), [{ + return cast<LoadSDNode>(N)->getAlignment() >= 64; +}]>; + def alignedloadfsf32 : PatFrag<(ops node:$ptr), (f32 (alignedload node:$ptr))>; def alignedloadfsf64 : PatFrag<(ops node:$ptr), @@ -309,6 +358,16 @@ def alignedloadv4f64 : PatFrag<(ops node:$ptr), def alignedloadv4i64 : PatFrag<(ops node:$ptr), (v4i64 (alignedload256 node:$ptr))>; +// 512-bit aligned load pattern fragments +def alignedloadv16f32 : PatFrag<(ops node:$ptr), + (v16f32 (alignedload512 node:$ptr))>; +def alignedloadv16i32 : PatFrag<(ops node:$ptr), + (v16i32 (alignedload512 node:$ptr))>; +def alignedloadv8f64 : PatFrag<(ops node:$ptr), + (v8f64 (alignedload512 node:$ptr))>; +def alignedloadv8i64 : PatFrag<(ops node:$ptr), + (v8i64 (alignedload512 node:$ptr))>; + // Like 'load', but uses special alignment checks suitable for use in // memory operands in most SSE instructions, which are required to // be naturally aligned on some targets but not on others. 
If the subtarget @@ -320,6 +379,16 @@ def memop : PatFrag<(ops node:$ptr), (load node:$ptr), [{ || cast<LoadSDNode>(N)->getAlignment() >= 16; }]>; +def memop4 : PatFrag<(ops node:$ptr), (load node:$ptr), [{ + return Subtarget->hasVectorUAMem() + || cast<LoadSDNode>(N)->getAlignment() >= 4; +}]>; + +def memop8 : PatFrag<(ops node:$ptr), (load node:$ptr), [{ + return Subtarget->hasVectorUAMem() + || cast<LoadSDNode>(N)->getAlignment() >= 8; +}]>; + def memopfsf32 : PatFrag<(ops node:$ptr), (f32 (memop node:$ptr))>; def memopfsf64 : PatFrag<(ops node:$ptr), (f64 (memop node:$ptr))>; @@ -335,6 +404,12 @@ def memopv8f32 : PatFrag<(ops node:$ptr), (v8f32 (memop node:$ptr))>; def memopv4f64 : PatFrag<(ops node:$ptr), (v4f64 (memop node:$ptr))>; def memopv4i64 : PatFrag<(ops node:$ptr), (v4i64 (memop node:$ptr))>; +// 512-bit memop pattern fragments +def memopv16f32 : PatFrag<(ops node:$ptr), (v16f32 (memop4 node:$ptr))>; +def memopv8f64 : PatFrag<(ops node:$ptr), (v8f64 (memop8 node:$ptr))>; +def memopv16i32 : PatFrag<(ops node:$ptr), (v16i32 (memop4 node:$ptr))>; +def memopv8i64 : PatFrag<(ops node:$ptr), (v8i64 (memop8 node:$ptr))>; + // SSSE3 uses MMX registers for some instructions. They aren't aligned on a // 16-byte boundary. // FIXME: 8 byte alignment for mmx reads is not required @@ -384,6 +459,10 @@ def bc_v16i16 : PatFrag<(ops node:$in), (v16i16 (bitconvert node:$in))>; def bc_v8i32 : PatFrag<(ops node:$in), (v8i32 (bitconvert node:$in))>; def bc_v4i64 : PatFrag<(ops node:$in), (v4i64 (bitconvert node:$in))>; +// 512-bit bitconvert pattern fragments +def bc_v16i32 : PatFrag<(ops node:$in), (v16i32 (bitconvert node:$in))>; +def bc_v8i64 : PatFrag<(ops node:$in), (v8i64 (bitconvert node:$in))>; + def vzmovl_v2i64 : PatFrag<(ops node:$src), (bitconvert (v2i64 (X86vzmovl (v2i64 (scalar_to_vector (loadi64 node:$src))))))>; @@ -405,28 +484,54 @@ def BYTE_imm : SDNodeXForm<imm, [{ return getI32Imm(N->getZExtValue() >> 3); }]>; -// EXTRACT_get_vextractf128_imm xform function: convert extract_subvector index -// to VEXTRACTF128 imm. -def EXTRACT_get_vextractf128_imm : SDNodeXForm<extract_subvector, [{ - return getI8Imm(X86::getExtractVEXTRACTF128Immediate(N)); +// EXTRACT_get_vextract128_imm xform function: convert extract_subvector index +// to VEXTRACTF128/VEXTRACTI128 imm. +def EXTRACT_get_vextract128_imm : SDNodeXForm<extract_subvector, [{ + return getI8Imm(X86::getExtractVEXTRACT128Immediate(N)); +}]>; + +// INSERT_get_vinsert128_imm xform function: convert insert_subvector index to +// VINSERTF128/VINSERTI128 imm. +def INSERT_get_vinsert128_imm : SDNodeXForm<insert_subvector, [{ + return getI8Imm(X86::getInsertVINSERT128Immediate(N)); +}]>; + +// EXTRACT_get_vextract256_imm xform function: convert extract_subvector index +// to VEXTRACTF64x4 imm. +def EXTRACT_get_vextract256_imm : SDNodeXForm<extract_subvector, [{ + return getI8Imm(X86::getExtractVEXTRACT256Immediate(N)); }]>; -// INSERT_get_vinsertf128_imm xform function: convert insert_subvector index to -// VINSERTF128 imm. -def INSERT_get_vinsertf128_imm : SDNodeXForm<insert_subvector, [{ - return getI8Imm(X86::getInsertVINSERTF128Immediate(N)); +// INSERT_get_vinsert256_imm xform function: convert insert_subvector index to +// VINSERTF64x4 imm. 
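The renamed xform functions above turn an extract_subvector/insert_subvector element index into the lane immediate expected by VEXTRACTF128/VINSERTF128 (128-bit chunks) and by the new 256-bit VEXTRACTF64x4/VINSERTF64x4 forms. The arithmetic amounts to the sketch below; the function name is invented and this is only an illustration of the computation, not the X86 helpers referenced in the xforms:

// Index counts elements of the wide source vector; the immediate selects
// which ChunkBits-sized slice the subvector occupies.
unsigned subvectorLaneImm(unsigned Index, unsigned EltBits, unsigned ChunkBits) {
  unsigned EltsPerChunk = ChunkBits / EltBits;
  return Index / EltsPerChunk;
}

// Example: extracting elements 8..15 of a v16f32 as a 256-bit subvector gives
// subvectorLaneImm(8, 32, 256) == 1, i.e. the upper 256-bit half.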
+def INSERT_get_vinsert256_imm : SDNodeXForm<insert_subvector, [{ + return getI8Imm(X86::getInsertVINSERT256Immediate(N)); }]>; -def vextractf128_extract : PatFrag<(ops node:$bigvec, node:$index), +def vextract128_extract : PatFrag<(ops node:$bigvec, node:$index), + (extract_subvector node:$bigvec, + node:$index), [{ + return X86::isVEXTRACT128Index(N); +}], EXTRACT_get_vextract128_imm>; + +def vinsert128_insert : PatFrag<(ops node:$bigvec, node:$smallvec, + node:$index), + (insert_subvector node:$bigvec, node:$smallvec, + node:$index), [{ + return X86::isVINSERT128Index(N); +}], INSERT_get_vinsert128_imm>; + + +def vextract256_extract : PatFrag<(ops node:$bigvec, node:$index), (extract_subvector node:$bigvec, node:$index), [{ - return X86::isVEXTRACTF128Index(N); -}], EXTRACT_get_vextractf128_imm>; + return X86::isVEXTRACT256Index(N); +}], EXTRACT_get_vextract256_imm>; -def vinsertf128_insert : PatFrag<(ops node:$bigvec, node:$smallvec, +def vinsert256_insert : PatFrag<(ops node:$bigvec, node:$smallvec, node:$index), (insert_subvector node:$bigvec, node:$smallvec, node:$index), [{ - return X86::isVINSERTF128Index(N); -}], INSERT_get_vinsertf128_imm>; + return X86::isVINSERT256Index(N); +}], INSERT_get_vinsert256_imm>; diff --git a/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp b/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp index 423bd44..2461773 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -24,6 +24,7 @@ #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/StackMaps.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/LLVMContext.h" #include "llvm/MC/MCAsmInfo.h" @@ -35,7 +36,7 @@ #include "llvm/Target/TargetOptions.h" #include <limits> -#define GET_INSTRINFO_CTOR +#define GET_INSTRINFO_CTOR_DTOR #include "X86GenInstrInfo.inc" using namespace llvm; @@ -81,6 +82,7 @@ enum { TB_ALIGN_NONE = 0 << TB_ALIGN_SHIFT, TB_ALIGN_16 = 16 << TB_ALIGN_SHIFT, TB_ALIGN_32 = 32 << TB_ALIGN_SHIFT, + TB_ALIGN_64 = 64 << TB_ALIGN_SHIFT, TB_ALIGN_MASK = 0xff << TB_ALIGN_SHIFT }; @@ -90,6 +92,9 @@ struct X86OpTblEntry { uint16_t Flags; }; +// Pin the vtable to this file. +void X86InstrInfo::anchor() {} + X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) : X86GenInstrInfo((tm.getSubtarget<X86Subtarget>().is64Bit() ? X86::ADJCALLSTACKDOWN64 @@ -97,7 +102,7 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) (tm.getSubtarget<X86Subtarget>().is64Bit() ? 
X86::ADJCALLSTACKUP64 : X86::ADJCALLSTACKUP32)), - TM(tm), RI(tm, *this) { + TM(tm), RI(tm) { static const X86OpTblEntry OpTbl2Addr[] = { { X86::ADC32ri, X86::ADC32mi, 0 }, @@ -298,8 +303,6 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) { X86::DIV64r, X86::DIV64m, TB_FOLDED_LOAD }, { X86::DIV8r, X86::DIV8m, TB_FOLDED_LOAD }, { X86::EXTRACTPSrr, X86::EXTRACTPSmr, TB_FOLDED_STORE }, - { X86::FsMOVAPDrr, X86::MOVSDmr, TB_FOLDED_STORE | TB_NO_REVERSE }, - { X86::FsMOVAPSrr, X86::MOVSSmr, TB_FOLDED_STORE | TB_NO_REVERSE }, { X86::IDIV16r, X86::IDIV16m, TB_FOLDED_LOAD }, { X86::IDIV32r, X86::IDIV32m, TB_FOLDED_LOAD }, { X86::IDIV64r, X86::IDIV64m, TB_FOLDED_LOAD }, @@ -356,8 +359,6 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) { X86::TEST8ri, X86::TEST8mi, TB_FOLDED_LOAD }, // AVX 128-bit versions of foldable instructions { X86::VEXTRACTPSrr,X86::VEXTRACTPSmr, TB_FOLDED_STORE }, - { X86::FsVMOVAPDrr, X86::VMOVSDmr, TB_FOLDED_STORE | TB_NO_REVERSE }, - { X86::FsVMOVAPSrr, X86::VMOVSSmr, TB_FOLDED_STORE | TB_NO_REVERSE }, { X86::VEXTRACTF128rr, X86::VEXTRACTF128mr, TB_FOLDED_STORE | TB_ALIGN_16 }, { X86::VMOVAPDrr, X86::VMOVAPDmr, TB_FOLDED_STORE | TB_ALIGN_16 }, { X86::VMOVAPSrr, X86::VMOVAPSmr, TB_FOLDED_STORE | TB_ALIGN_16 }, @@ -374,7 +375,9 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) { X86::VMOVAPSYrr, X86::VMOVAPSYmr, TB_FOLDED_STORE | TB_ALIGN_32 }, { X86::VMOVDQAYrr, X86::VMOVDQAYmr, TB_FOLDED_STORE | TB_ALIGN_32 }, { X86::VMOVUPDYrr, X86::VMOVUPDYmr, TB_FOLDED_STORE }, - { X86::VMOVUPSYrr, X86::VMOVUPSYmr, TB_FOLDED_STORE } + { X86::VMOVUPSYrr, X86::VMOVUPSYmr, TB_FOLDED_STORE }, + // AVX-512 foldable instructions + { X86::VMOVPDI2DIZrr,X86::VMOVPDI2DIZmr, TB_FOLDED_STORE } }; for (unsigned i = 0, e = array_lengthof(OpTbl0); i != e; ++i) { @@ -400,8 +403,6 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) { X86::CVTTSD2SIrr, X86::CVTTSD2SIrm, 0 }, { X86::CVTTSS2SI64rr, X86::CVTTSS2SI64rm, 0 }, { X86::CVTTSS2SIrr, X86::CVTTSS2SIrm, 0 }, - { X86::FsMOVAPDrr, X86::MOVSDrm, TB_NO_REVERSE }, - { X86::FsMOVAPSrr, X86::MOVSSrm, TB_NO_REVERSE }, { X86::IMUL16rri, X86::IMUL16rmi, 0 }, { X86::IMUL16rri8, X86::IMUL16rmi8, 0 }, { X86::IMUL32rri, X86::IMUL32rmi, 0 }, @@ -444,16 +445,12 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) { X86::MOVSX64rr8, X86::MOVSX64rm8, 0 }, { X86::MOVUPDrr, X86::MOVUPDrm, TB_ALIGN_16 }, { X86::MOVUPSrr, X86::MOVUPSrm, 0 }, - { X86::MOVZDI2PDIrr, X86::MOVZDI2PDIrm, 0 }, { X86::MOVZQI2PQIrr, X86::MOVZQI2PQIrm, 0 }, { X86::MOVZPQILo2PQIrr, X86::MOVZPQILo2PQIrm, TB_ALIGN_16 }, { X86::MOVZX16rr8, X86::MOVZX16rm8, 0 }, { X86::MOVZX32rr16, X86::MOVZX32rm16, 0 }, { X86::MOVZX32_NOREXrr8, X86::MOVZX32_NOREXrm8, 0 }, { X86::MOVZX32rr8, X86::MOVZX32rm8, 0 }, - { X86::MOVZX64rr16, X86::MOVZX64rm16, 0 }, - { X86::MOVZX64rr32, X86::MOVZX64rm32, 0 }, - { X86::MOVZX64rr8, X86::MOVZX64rm8, 0 }, { X86::PABSBrr128, X86::PABSBrm128, TB_ALIGN_16 }, { X86::PABSDrr128, X86::PABSDrm128, TB_ALIGN_16 }, { X86::PABSWrr128, X86::PABSWrm128, TB_ALIGN_16 }, @@ -496,8 +493,6 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) { X86::VCVTSD2SIrr, X86::VCVTSD2SIrm, 0 }, { X86::VCVTSS2SI64rr, X86::VCVTSS2SI64rm, 0 }, { X86::VCVTSS2SIrr, X86::VCVTSS2SIrm, 0 }, - { X86::FsVMOVAPDrr, X86::VMOVSDrm, TB_NO_REVERSE }, - { X86::FsVMOVAPSrr, X86::VMOVSSrm, TB_NO_REVERSE }, { X86::VMOV64toPQIrr, X86::VMOVQI2PQIrm, 0 }, { X86::VMOV64toSDrr, X86::VMOV64toSDrm, 0 }, { X86::VMOVAPDrr, X86::VMOVAPDrm, TB_ALIGN_16 }, @@ -510,7 +505,6 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) { 
X86::VMOVSHDUPrr, X86::VMOVSHDUPrm, TB_ALIGN_16 }, { X86::VMOVUPDrr, X86::VMOVUPDrm, 0 }, { X86::VMOVUPSrr, X86::VMOVUPSrm, 0 }, - { X86::VMOVZDI2PDIrr, X86::VMOVZDI2PDIrm, 0 }, { X86::VMOVZQI2PQIrr, X86::VMOVZQI2PQIrm, 0 }, { X86::VMOVZPQILo2PQIrr,X86::VMOVZPQILo2PQIrm, TB_ALIGN_16 }, { X86::VPABSBrr128, X86::VPABSBrm128, 0 }, @@ -555,11 +549,27 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) { X86::VBROADCASTSSYrr, X86::VBROADCASTSSYrm, TB_NO_REVERSE }, { X86::VBROADCASTSDYrr, X86::VBROADCASTSDYrm, TB_NO_REVERSE }, - // BMI/BMI2/LZCNT/POPCNT foldable instructions + // BMI/BMI2/LZCNT/POPCNT/TBM foldable instructions { X86::BEXTR32rr, X86::BEXTR32rm, 0 }, { X86::BEXTR64rr, X86::BEXTR64rm, 0 }, + { X86::BEXTRI32ri, X86::BEXTRI32mi, 0 }, + { X86::BEXTRI64ri, X86::BEXTRI64mi, 0 }, + { X86::BLCFILL32rr, X86::BLCFILL32rm, 0 }, + { X86::BLCFILL64rr, X86::BLCFILL64rm, 0 }, + { X86::BLCI32rr, X86::BLCI32rm, 0 }, + { X86::BLCI64rr, X86::BLCI64rm, 0 }, + { X86::BLCIC32rr, X86::BLCIC32rm, 0 }, + { X86::BLCIC64rr, X86::BLCIC64rm, 0 }, + { X86::BLCMSK32rr, X86::BLCMSK32rm, 0 }, + { X86::BLCMSK64rr, X86::BLCMSK64rm, 0 }, + { X86::BLCS32rr, X86::BLCS32rm, 0 }, + { X86::BLCS64rr, X86::BLCS64rm, 0 }, + { X86::BLSFILL32rr, X86::BLSFILL32rm, 0 }, + { X86::BLSFILL64rr, X86::BLSFILL64rm, 0 }, { X86::BLSI32rr, X86::BLSI32rm, 0 }, { X86::BLSI64rr, X86::BLSI64rm, 0 }, + { X86::BLSIC32rr, X86::BLSIC32rm, 0 }, + { X86::BLSIC64rr, X86::BLSIC64rm, 0 }, { X86::BLSMSK32rr, X86::BLSMSK32rm, 0 }, { X86::BLSMSK64rr, X86::BLSMSK64rm, 0 }, { X86::BLSR32rr, X86::BLSR32rm, 0 }, @@ -580,9 +590,27 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) { X86::SHRX64rr, X86::SHRX64rm, 0 }, { X86::SHLX32rr, X86::SHLX32rm, 0 }, { X86::SHLX64rr, X86::SHLX64rm, 0 }, + { X86::T1MSKC32rr, X86::T1MSKC32rm, 0 }, + { X86::T1MSKC64rr, X86::T1MSKC64rm, 0 }, { X86::TZCNT16rr, X86::TZCNT16rm, 0 }, { X86::TZCNT32rr, X86::TZCNT32rm, 0 }, { X86::TZCNT64rr, X86::TZCNT64rm, 0 }, + { X86::TZMSK32rr, X86::TZMSK32rm, 0 }, + { X86::TZMSK64rr, X86::TZMSK64rm, 0 }, + + // AVX-512 foldable instructions + { X86::VMOV64toPQIZrr, X86::VMOVQI2PQIZrm, 0 }, + { X86::VMOVDI2SSZrr, X86::VMOVDI2SSZrm, 0 }, + { X86::VMOVDQA32rr, X86::VMOVDQA32rm, TB_ALIGN_64 }, + { X86::VMOVDQA64rr, X86::VMOVDQA64rm, TB_ALIGN_64 }, + { X86::VMOVDQU32rr, X86::VMOVDQU32rm, 0 }, + { X86::VMOVDQU64rr, X86::VMOVDQU64rm, 0 }, + + // AES foldable instructions + { X86::AESIMCrr, X86::AESIMCrm, TB_ALIGN_16 }, + { X86::AESKEYGENASSIST128rr, X86::AESKEYGENASSIST128rm, TB_ALIGN_16 }, + { X86::VAESIMCrr, X86::VAESIMCrm, TB_ALIGN_16 }, + { X86::VAESKEYGENASSIST128rr, X86::VAESKEYGENASSIST128rm, TB_ALIGN_16 }, }; for (unsigned i = 0, e = array_lengthof(OpTbl1); i != e; ++i) { @@ -1180,6 +1208,52 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) { X86::PDEP64rr, X86::PDEP64rm, 0 }, { X86::PEXT32rr, X86::PEXT32rm, 0 }, { X86::PEXT64rr, X86::PEXT64rm, 0 }, + + // AVX-512 foldable instructions + { X86::VPADDDZrr, X86::VPADDDZrm, 0 }, + { X86::VPADDQZrr, X86::VPADDQZrm, 0 }, + { X86::VADDPSZrr, X86::VADDPSZrm, 0 }, + { X86::VADDPDZrr, X86::VADDPDZrm, 0 }, + { X86::VSUBPSZrr, X86::VSUBPSZrm, 0 }, + { X86::VSUBPDZrr, X86::VSUBPDZrm, 0 }, + { X86::VMULPSZrr, X86::VMULPSZrm, 0 }, + { X86::VMULPDZrr, X86::VMULPDZrm, 0 }, + { X86::VDIVPSZrr, X86::VDIVPSZrm, 0 }, + { X86::VDIVPDZrr, X86::VDIVPDZrm, 0 }, + { X86::VMINPSZrr, X86::VMINPSZrm, 0 }, + { X86::VMINPDZrr, X86::VMINPDZrm, 0 }, + { X86::VMAXPSZrr, X86::VMAXPSZrm, 0 }, + { X86::VMAXPDZrr, X86::VMAXPDZrm, 0 }, + { X86::VPERMPDZri, X86::VPERMPDZmi, 0 
}, + { X86::VPERMPSZrr, X86::VPERMPSZrm, 0 }, + { X86::VPSLLVDZrr, X86::VPSLLVDZrm, 0 }, + { X86::VPSLLVQZrr, X86::VPSLLVQZrm, 0 }, + { X86::VPSRAVDZrr, X86::VPSRAVDZrm, 0 }, + { X86::VPSRLVDZrr, X86::VPSRLVDZrm, 0 }, + { X86::VPSRLVQZrr, X86::VPSRLVQZrm, 0 }, + { X86::VSHUFPDZrri, X86::VSHUFPDZrmi, 0 }, + { X86::VSHUFPSZrri, X86::VSHUFPSZrmi, 0 }, + { X86::VALIGNQrri, X86::VALIGNQrmi, 0 }, + { X86::VALIGNDrri, X86::VALIGNDrmi, 0 }, + + // AES foldable instructions + { X86::AESDECLASTrr, X86::AESDECLASTrm, TB_ALIGN_16 }, + { X86::AESDECrr, X86::AESDECrm, TB_ALIGN_16 }, + { X86::AESENCLASTrr, X86::AESENCLASTrm, TB_ALIGN_16 }, + { X86::AESENCrr, X86::AESENCrm, TB_ALIGN_16 }, + { X86::VAESDECLASTrr, X86::VAESDECLASTrm, TB_ALIGN_16 }, + { X86::VAESDECrr, X86::VAESDECrm, TB_ALIGN_16 }, + { X86::VAESENCLASTrr, X86::VAESENCLASTrm, TB_ALIGN_16 }, + { X86::VAESENCrr, X86::VAESENCrm, TB_ALIGN_16 }, + + // SHA foldable instructions + { X86::SHA1MSG1rr, X86::SHA1MSG1rm, TB_ALIGN_16 }, + { X86::SHA1MSG2rr, X86::SHA1MSG2rm, TB_ALIGN_16 }, + { X86::SHA1NEXTErr, X86::SHA1NEXTErm, TB_ALIGN_16 }, + { X86::SHA1RNDS4rri, X86::SHA1RNDS4rmi, TB_ALIGN_16 }, + { X86::SHA256MSG1rr, X86::SHA256MSG1rm, TB_ALIGN_16 }, + { X86::SHA256MSG2rr, X86::SHA256MSG2rm, TB_ALIGN_16 }, + { X86::SHA256RNDS2rr, X86::SHA256RNDS2rm, TB_ALIGN_16 }, }; for (unsigned i = 0, e = array_lengthof(OpTbl2); i != e; ++i) { @@ -1341,6 +1415,11 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) { X86::VFMSUBADDPD4rr, X86::VFMSUBADDPD4rm, TB_ALIGN_16 }, { X86::VFMSUBADDPS4rrY, X86::VFMSUBADDPS4rmY, TB_ALIGN_32 }, { X86::VFMSUBADDPD4rrY, X86::VFMSUBADDPD4rmY, TB_ALIGN_32 }, + // AVX-512 VPERMI instructions with 3 source operands. + { X86::VPERMI2Drr, X86::VPERMI2Drm, 0 }, + { X86::VPERMI2Qrr, X86::VPERMI2Qrm, 0 }, + { X86::VPERMI2PSrr, X86::VPERMI2PSrm, 0 }, + { X86::VPERMI2PDrr, X86::VPERMI2PDrm, 0 }, }; for (unsigned i = 0, e = array_lengthof(OpTbl3); i != e; ++i) { @@ -1381,7 +1460,6 @@ X86InstrInfo::isCoalescableExtInstr(const MachineInstr &MI, case X86::MOVSX32rr8: case X86::MOVZX32rr8: case X86::MOVSX64rr8: - case X86::MOVZX64rr8: if (!TM.getSubtarget<X86Subtarget>().is64Bit()) // It's not always legal to reference the low 8-bit of the larger // register in 32-bit mode. @@ -1389,9 +1467,7 @@ X86InstrInfo::isCoalescableExtInstr(const MachineInstr &MI, case X86::MOVSX32rr16: case X86::MOVZX32rr16: case X86::MOVSX64rr16: - case X86::MOVZX64rr16: - case X86::MOVSX64rr32: - case X86::MOVZX64rr32: { + case X86::MOVSX64rr32: { if (MI.getOperand(0).getSubReg() || MI.getOperand(1).getSubReg()) // Be conservative. return false; @@ -1404,17 +1480,14 @@ X86InstrInfo::isCoalescableExtInstr(const MachineInstr &MI, case X86::MOVSX32rr8: case X86::MOVZX32rr8: case X86::MOVSX64rr8: - case X86::MOVZX64rr8: SubIdx = X86::sub_8bit; break; case X86::MOVSX32rr16: case X86::MOVZX32rr16: case X86::MOVSX64rr16: - case X86::MOVZX64rr16: SubIdx = X86::sub_16bit; break; case X86::MOVSX64rr32: - case X86::MOVZX64rr32: SubIdx = X86::sub_32bit; break; } @@ -1463,6 +1536,8 @@ static bool isFrameLoadOpcode(int Opcode) { case X86::VMOVDQAYrm: case X86::MMX_MOVD64rm: case X86::MMX_MOVQ64rm: + case X86::VMOVDQA32rm: + case X86::VMOVDQA64rm: return true; } } @@ -1722,37 +1797,16 @@ void X86InstrInfo::reMaterialize(MachineBasicBlock &MBB, unsigned DestReg, unsigned SubIdx, const MachineInstr *Orig, const TargetRegisterInfo &TRI) const { - DebugLoc DL = Orig->getDebugLoc(); - - // MOV32r0 etc. are implemented with xor which clobbers condition code. 
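Conceptually, the OpTbl arrays above (now including AVX-512, TBM, AES and SHA rows) populate lookup tables from a register-form opcode to its memory-operand twin plus flags such as the required alignment (TB_ALIGN_16/TB_ALIGN_64) and fold direction (TB_FOLDED_LOAD/TB_FOLDED_STORE). A rough sketch of that shape, with made-up opcode numbers and a plain std::unordered_map standing in for the real tables and flag encoding:

#include <unordered_map>

struct FoldInfo {
  unsigned MemOpcode; // opcode of the load/store-folded form
  unsigned MinAlign;  // required memory alignment in bytes; the real tables
                      // pack this into the TB_ALIGN_* bits of a flags word
};

// One map per folded-operand index (the real code keeps OpTbl0..OpTbl3 plus
// a reverse map used for unfolding).
using FoldTable = std::unordered_map<unsigned, FoldInfo>;

void addExampleEntry(FoldTable &T) {
  T[/*VMOVDQA64rr*/ 1001] = {/*VMOVDQA64rm*/ 1002, /*TB_ALIGN_64*/ 64};
}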
- // Re-materialize them as movri instructions to avoid side effects. - bool Clone = true; + // MOV32r0 is implemented with a xor which clobbers condition code. + // Re-materialize it as movri instructions to avoid side effects. unsigned Opc = Orig->getOpcode(); - switch (Opc) { - default: break; - case X86::MOV8r0: - case X86::MOV16r0: - case X86::MOV32r0: - case X86::MOV64r0: { - if (!isSafeToClobberEFLAGS(MBB, I)) { - switch (Opc) { - default: llvm_unreachable("Unreachable!"); - case X86::MOV8r0: Opc = X86::MOV8ri; break; - case X86::MOV16r0: Opc = X86::MOV16ri; break; - case X86::MOV32r0: Opc = X86::MOV32ri; break; - case X86::MOV64r0: Opc = X86::MOV64ri64i32; break; - } - Clone = false; - } - break; - } - } - - if (Clone) { + if (Opc == X86::MOV32r0 && !isSafeToClobberEFLAGS(MBB, I)) { + DebugLoc DL = Orig->getDebugLoc(); + BuildMI(MBB, I, DL, get(X86::MOV32ri)).addOperand(Orig->getOperand(0)) + .addImm(0); + } else { MachineInstr *MI = MBB.getParent()->CloneMachineInstr(Orig); MBB.insert(I, MI); - } else { - BuildMI(MBB, I, DL, get(Opc)).addOperand(Orig->getOperand(0)).addImm(0); } MachineInstr *NewMI = prior(I); @@ -1772,6 +1826,98 @@ static bool hasLiveCondCodeDef(MachineInstr *MI) { return false; } +/// getTruncatedShiftCount - check whether the shift count for a machine operand +/// is non-zero. +inline static unsigned getTruncatedShiftCount(MachineInstr *MI, + unsigned ShiftAmtOperandIdx) { + // The shift count is six bits with the REX.W prefix and five bits without. + unsigned ShiftCountMask = (MI->getDesc().TSFlags & X86II::REX_W) ? 63 : 31; + unsigned Imm = MI->getOperand(ShiftAmtOperandIdx).getImm(); + return Imm & ShiftCountMask; +} + +/// isTruncatedShiftCountForLEA - check whether the given shift count is appropriate +/// can be represented by a LEA instruction. +inline static bool isTruncatedShiftCountForLEA(unsigned ShAmt) { + // Left shift instructions can be transformed into load-effective-address + // instructions if we can encode them appropriately. + // A LEA instruction utilizes a SIB byte to encode it's scale factor. + // The SIB.scale field is two bits wide which means that we can encode any + // shift amount less than 4. + return ShAmt < 4 && ShAmt > 0; +} + +bool X86InstrInfo::classifyLEAReg(MachineInstr *MI, const MachineOperand &Src, + unsigned Opc, bool AllowSP, + unsigned &NewSrc, bool &isKill, bool &isUndef, + MachineOperand &ImplicitOp) const { + MachineFunction &MF = *MI->getParent()->getParent(); + const TargetRegisterClass *RC; + if (AllowSP) { + RC = Opc != X86::LEA32r ? &X86::GR64RegClass : &X86::GR32RegClass; + } else { + RC = Opc != X86::LEA32r ? + &X86::GR64_NOSPRegClass : &X86::GR32_NOSPRegClass; + } + unsigned SrcReg = Src.getReg(); + + // For both LEA64 and LEA32 the register already has essentially the right + // type (32-bit or 64-bit) we may just need to forbid SP. + if (Opc != X86::LEA64_32r) { + NewSrc = SrcReg; + isKill = Src.isKill(); + isUndef = Src.isUndef(); + + if (TargetRegisterInfo::isVirtualRegister(NewSrc) && + !MF.getRegInfo().constrainRegClass(NewSrc, RC)) + return false; + + return true; + } + + // This is for an LEA64_32r and incoming registers are 32-bit. One way or + // another we need to add 64-bit registers to the final MI. 
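getTruncatedShiftCount and isTruncatedShiftCountForLEA above encode two hardware facts: immediate shift counts are truncated to 5 bits (6 with REX.W), and an LEA's SIB byte can only express scale factors 1, 2, 4 and 8, i.e. left shifts by 1..3. A self-contained restatement, assuming nothing beyond those two rules (function names are illustrative):

// Effective shift count after the hardware truncation.
unsigned truncatedShiftCount(unsigned Imm, bool HasREXW) {
  return Imm & (HasREXW ? 63u : 31u);
}

// Scale factor an LEA could use for "shl $ShAmt", or 0 if not representable.
unsigned leaScaleForShift(unsigned ShAmt) {
  return (ShAmt > 0 && ShAmt < 4) ? (1u << ShAmt) : 0;
}

// e.g. "shl $3, %reg" can become "lea (,%reg,8), %dst", while "shl $4" cannot.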
+ if (TargetRegisterInfo::isPhysicalRegister(SrcReg)) { + ImplicitOp = Src; + ImplicitOp.setImplicit(); + + NewSrc = getX86SubSuperRegister(Src.getReg(), MVT::i64); + MachineBasicBlock::LivenessQueryResult LQR = + MI->getParent()->computeRegisterLiveness(&getRegisterInfo(), NewSrc, MI); + + switch (LQR) { + case MachineBasicBlock::LQR_Unknown: + // We can't give sane liveness flags to the instruction, abandon LEA + // formation. + return false; + case MachineBasicBlock::LQR_Live: + isKill = MI->killsRegister(SrcReg); + isUndef = false; + break; + default: + // The physreg itself is dead, so we have to use it as an <undef>. + isKill = false; + isUndef = true; + break; + } + } else { + // Virtual register of the wrong class, we have to create a temporary 64-bit + // vreg to feed into the LEA. + NewSrc = MF.getRegInfo().createVirtualRegister(RC); + BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), + get(TargetOpcode::COPY)) + .addReg(NewSrc, RegState::Define | RegState::Undef, X86::sub_32bit) + .addOperand(Src); + + // Which is obviously going to be dead after we're done with it. + isKill = true; + isUndef = false; + } + + // We've set all the parameters without issue. + return true; +} + /// convertToThreeAddressWithLEA - Helper for convertToThreeAddress when /// 16-bit LEA is disabled, use 32-bit LEA to form 3-address code by promoting /// to a 32-bit superregister and then truncating back down to a 16-bit @@ -1787,11 +1933,16 @@ X86InstrInfo::convertToThreeAddressWithLEA(unsigned MIOpc, bool isDead = MI->getOperand(0).isDead(); bool isKill = MI->getOperand(1).isKill(); - unsigned Opc = TM.getSubtarget<X86Subtarget>().is64Bit() - ? X86::LEA64_32r : X86::LEA32r; MachineRegisterInfo &RegInfo = MFI->getParent()->getRegInfo(); - unsigned leaInReg = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass); unsigned leaOutReg = RegInfo.createVirtualRegister(&X86::GR32RegClass); + unsigned Opc, leaInReg; + if (TM.getSubtarget<X86Subtarget>().is64Bit()) { + Opc = X86::LEA64_32r; + leaInReg = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass); + } else { + Opc = X86::LEA32r; + leaInReg = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass); + } // Build and insert into an implicit UNDEF value. This is OK because // well be shifting and then extracting the lower 16-bits. @@ -1841,7 +1992,10 @@ X86InstrInfo::convertToThreeAddressWithLEA(unsigned MIOpc, // just a single insert_subreg. addRegReg(MIB, leaInReg, true, leaInReg, false); } else { - leaInReg2 = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass); + if (TM.getSubtarget<X86Subtarget>().is64Bit()) + leaInReg2 = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass); + else + leaInReg2 = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass); // Build and insert into an implicit UNDEF value. This is OK because // well be shifting and then extracting the lower 16-bits. BuildMI(*MFI, &*MIB, MI->getDebugLoc(), get(X86::IMPLICIT_DEF),leaInReg2); @@ -1891,6 +2045,13 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, MachineBasicBlock::iterator &MBBI, LiveVariables *LV) const { MachineInstr *MI = MBBI; + + // The following opcodes also sets the condition code register(s). Only + // convert them to equivalent lea if the condition code register def's + // are dead! + if (hasLiveCondCodeDef(MI)) + return 0; + MachineFunction &MF = *MI->getParent()->getParent(); // All instructions input are two-addr instructions. Get the known operands. 
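classifyLEAReg above exists because LEA64_32r performs its address arithmetic on 64-bit registers and only the write-back is 32 bits, so a 32-bit source has to be presented as its 64-bit super-register (or copied into a fresh 64-bit vreg first). A tiny sketch of the instruction's arithmetic, purely to show why 64-bit inputs are needed; it is not the conversion code:

#include <cstdint>

// What an LEA64_32r computes: a full 64-bit effective address, truncated to
// 32 bits when written to the destination register.
uint32_t lea64_32(uint64_t Base, uint64_t Index, unsigned Scale, int32_t Disp) {
  return static_cast<uint32_t>(Base + Index * Scale + int64_t(Disp));
}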
const MachineOperand &Dest = MI->getOperand(0); @@ -1935,10 +2096,8 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, } case X86::SHL64ri: { assert(MI->getNumOperands() >= 3 && "Unknown shift instruction!"); - // NOTE: LEA doesn't produce flags like shift does, but LLVM never uses - // the flags produced by a shift yet, so this is safe. - unsigned ShAmt = MI->getOperand(2).getImm(); - if (ShAmt == 0 || ShAmt >= 4) return 0; + unsigned ShAmt = getTruncatedShiftCount(MI, 2); + if (!isTruncatedShiftCountForLEA(ShAmt)) return 0; // LEA can't handle RSP. if (TargetRegisterInfo::isVirtualRegister(Src.getReg()) && @@ -1953,29 +2112,34 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, } case X86::SHL32ri: { assert(MI->getNumOperands() >= 3 && "Unknown shift instruction!"); - // NOTE: LEA doesn't produce flags like shift does, but LLVM never uses - // the flags produced by a shift yet, so this is safe. - unsigned ShAmt = MI->getOperand(2).getImm(); - if (ShAmt == 0 || ShAmt >= 4) return 0; + unsigned ShAmt = getTruncatedShiftCount(MI, 2); + if (!isTruncatedShiftCountForLEA(ShAmt)) return 0; + + unsigned Opc = is64Bit ? X86::LEA64_32r : X86::LEA32r; // LEA can't handle ESP. - if (TargetRegisterInfo::isVirtualRegister(Src.getReg()) && - !MF.getRegInfo().constrainRegClass(Src.getReg(), - &X86::GR32_NOSPRegClass)) + bool isKill, isUndef; + unsigned SrcReg; + MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); + if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false, + SrcReg, isKill, isUndef, ImplicitOp)) return 0; - unsigned Opc = is64Bit ? X86::LEA64_32r : X86::LEA32r; - NewMI = BuildMI(MF, MI->getDebugLoc(), get(Opc)) + MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), get(Opc)) .addOperand(Dest) - .addReg(0).addImm(1 << ShAmt).addOperand(Src).addImm(0).addReg(0); + .addReg(0).addImm(1 << ShAmt) + .addReg(SrcReg, getKillRegState(isKill) | getUndefRegState(isUndef)) + .addImm(0).addReg(0); + if (ImplicitOp.getReg() != 0) + MIB.addOperand(ImplicitOp); + NewMI = MIB; + break; } case X86::SHL16ri: { assert(MI->getNumOperands() >= 3 && "Unknown shift instruction!"); - // NOTE: LEA doesn't produce flags like shift does, but LLVM never uses - // the flags produced by a shift yet, so this is safe. - unsigned ShAmt = MI->getOperand(2).getImm(); - if (ShAmt == 0 || ShAmt >= 4) return 0; + unsigned ShAmt = getTruncatedShiftCount(MI, 2); + if (!isTruncatedShiftCountForLEA(ShAmt)) return 0; if (DisableLEA16) return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) : 0; @@ -1985,11 +2149,6 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, break; } default: { - // The following opcodes also sets the condition code register(s). Only - // convert them to equivalent lea if the condition code register def's - // are dead! - if (hasLiveCondCodeDef(MI)) - return 0; switch (MIOpc) { default: return 0; @@ -1999,17 +2158,20 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, assert(MI->getNumOperands() >= 2 && "Unknown inc instruction!"); unsigned Opc = MIOpc == X86::INC64r ? X86::LEA64r : (is64Bit ? X86::LEA64_32r : X86::LEA32r); - const TargetRegisterClass *RC = MIOpc == X86::INC64r ? - (const TargetRegisterClass*)&X86::GR64_NOSPRegClass : - (const TargetRegisterClass*)&X86::GR32_NOSPRegClass; - - // LEA can't handle RSP. 
- if (TargetRegisterInfo::isVirtualRegister(Src.getReg()) && - !MF.getRegInfo().constrainRegClass(Src.getReg(), RC)) + bool isKill, isUndef; + unsigned SrcReg; + MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); + if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false, + SrcReg, isKill, isUndef, ImplicitOp)) return 0; - NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(Opc)) - .addOperand(Dest).addOperand(Src), 1); + MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), get(Opc)) + .addOperand(Dest) + .addReg(SrcReg, getKillRegState(isKill) | getUndefRegState(isUndef)); + if (ImplicitOp.getReg() != 0) + MIB.addOperand(ImplicitOp); + + NewMI = addOffset(MIB, 1); break; } case X86::INC16r: @@ -2026,16 +2188,22 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, assert(MI->getNumOperands() >= 2 && "Unknown dec instruction!"); unsigned Opc = MIOpc == X86::DEC64r ? X86::LEA64r : (is64Bit ? X86::LEA64_32r : X86::LEA32r); - const TargetRegisterClass *RC = MIOpc == X86::DEC64r ? - (const TargetRegisterClass*)&X86::GR64_NOSPRegClass : - (const TargetRegisterClass*)&X86::GR32_NOSPRegClass; - // LEA can't handle RSP. - if (TargetRegisterInfo::isVirtualRegister(Src.getReg()) && - !MF.getRegInfo().constrainRegClass(Src.getReg(), RC)) + + bool isKill, isUndef; + unsigned SrcReg; + MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); + if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false, + SrcReg, isKill, isUndef, ImplicitOp)) return 0; - NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(Opc)) - .addOperand(Dest).addOperand(Src), -1); + MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), get(Opc)) + .addOperand(Dest) + .addReg(SrcReg, getUndefRegState(isUndef) | getKillRegState(isKill)); + if (ImplicitOp.getReg() != 0) + MIB.addOperand(ImplicitOp); + + NewMI = addOffset(MIB, -1); + break; } case X86::DEC16r: @@ -2052,36 +2220,41 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, case X86::ADD32rr_DB: { assert(MI->getNumOperands() >= 3 && "Unknown add instruction!"); unsigned Opc; - const TargetRegisterClass *RC; - if (MIOpc == X86::ADD64rr || MIOpc == X86::ADD64rr_DB) { + if (MIOpc == X86::ADD64rr || MIOpc == X86::ADD64rr_DB) Opc = X86::LEA64r; - RC = &X86::GR64_NOSPRegClass; - } else { + else Opc = is64Bit ? X86::LEA64_32r : X86::LEA32r; - RC = &X86::GR32_NOSPRegClass; - } - - unsigned Src2 = MI->getOperand(2).getReg(); - bool isKill2 = MI->getOperand(2).isKill(); + bool isKill, isUndef; + unsigned SrcReg; + MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); + if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ true, + SrcReg, isKill, isUndef, ImplicitOp)) + return 0; - // LEA can't handle RSP. - if (TargetRegisterInfo::isVirtualRegister(Src2) && - !MF.getRegInfo().constrainRegClass(Src2, RC)) + const MachineOperand &Src2 = MI->getOperand(2); + bool isKill2, isUndef2; + unsigned SrcReg2; + MachineOperand ImplicitOp2 = MachineOperand::CreateReg(0, false); + if (!classifyLEAReg(MI, Src2, Opc, /*AllowSP=*/ false, + SrcReg2, isKill2, isUndef2, ImplicitOp2)) return 0; - NewMI = addRegReg(BuildMI(MF, MI->getDebugLoc(), get(Opc)) - .addOperand(Dest), - Src.getReg(), Src.isKill(), Src2, isKill2); + MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), get(Opc)) + .addOperand(Dest); + if (ImplicitOp.getReg() != 0) + MIB.addOperand(ImplicitOp); + if (ImplicitOp2.getReg() != 0) + MIB.addOperand(ImplicitOp2); + + NewMI = addRegReg(MIB, SrcReg, isKill, SrcReg2, isKill2); // Preserve undefness of the operands. 
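The SHL, INC/DEC and ADD conversions in convertToThreeAddress all rely on the same trick: rewrite a two-address arithmetic instruction as an LEA whose address expression performs the operation. A schematic summary of the address forms involved, with operands reduced to plain integers; it mirrors the cases handled here but is not the conversion code itself:

#include <cstdint>

struct LEAAddr { uint64_t Base, Index; unsigned Scale; int64_t Disp; };

// dst = src + 1               -> lea 1(src), dst
LEAAddr incForm(uint64_t Src)                { return {Src, 0, 1, +1}; }
// dst = src - 1               -> lea -1(src), dst
LEAAddr decForm(uint64_t Src)                { return {Src, 0, 1, -1}; }
// dst = src << n, n in 1..3   -> lea (,src,1<<n), dst
LEAAddr shlForm(uint64_t Src, unsigned N)    { return {0, Src, 1u << N, 0}; }
// dst = a + b                 -> lea (a,b), dst
LEAAddr addRRForm(uint64_t A, uint64_t B)    { return {A, B, 1, 0}; }
// dst = src + imm             -> lea imm(src), dst
LEAAddr addRIForm(uint64_t Src, int64_t Imm) { return {Src, 0, 1, Imm}; }

uint64_t eval(const LEAAddr &A) { return A.Base + A.Index * A.Scale + A.Disp; }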
- bool isUndef = MI->getOperand(1).isUndef(); - bool isUndef2 = MI->getOperand(2).isUndef(); NewMI->getOperand(1).setIsUndef(isUndef); NewMI->getOperand(3).setIsUndef(isUndef2); - if (LV && isKill2) - LV->replaceKillInstruction(Src2, MI, NewMI); + if (LV && Src2.isKill()) + LV->replaceKillInstruction(SrcReg2, MI, NewMI); break; } case X86::ADD16rr: @@ -2120,9 +2293,21 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, case X86::ADD32ri8_DB: { assert(MI->getNumOperands() >= 3 && "Unknown add instruction!"); unsigned Opc = is64Bit ? X86::LEA64_32r : X86::LEA32r; - NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(Opc)) - .addOperand(Dest).addOperand(Src), - MI->getOperand(2).getImm()); + + bool isKill, isUndef; + unsigned SrcReg; + MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); + if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ true, + SrcReg, isKill, isUndef, ImplicitOp)) + return 0; + + MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), get(Opc)) + .addOperand(Dest) + .addReg(SrcReg, getUndefRegState(isUndef) | getKillRegState(isKill)); + if (ImplicitOp.getReg() != 0) + MIB.addOperand(ImplicitOp); + + NewMI = addOffset(MIB, MI->getOperand(2).getImm()); break; } case X86::ADD16ri: @@ -2789,23 +2974,29 @@ static bool isHReg(unsigned Reg) { // Try and copy between VR128/VR64 and GR64 registers. static unsigned CopyToFromAsymmetricReg(unsigned DestReg, unsigned SrcReg, - bool HasAVX) { + const X86Subtarget& Subtarget) { + + // SrcReg(VR128) -> DestReg(GR64) // SrcReg(VR64) -> DestReg(GR64) // SrcReg(GR64) -> DestReg(VR128) // SrcReg(GR64) -> DestReg(VR64) + bool HasAVX = Subtarget.hasAVX(); + bool HasAVX512 = Subtarget.hasAVX512(); if (X86::GR64RegClass.contains(DestReg)) { - if (X86::VR128RegClass.contains(SrcReg)) + if (X86::VR128XRegClass.contains(SrcReg)) // Copy from a VR128 register to a GR64 register. - return HasAVX ? X86::VMOVPQIto64rr : X86::MOVPQIto64rr; + return HasAVX512 ? X86::VMOVPQIto64Zrr: (HasAVX ? X86::VMOVPQIto64rr : + X86::MOVPQIto64rr); if (X86::VR64RegClass.contains(SrcReg)) // Copy from a VR64 register to a GR64 register. return X86::MOVSDto64rr; } else if (X86::GR64RegClass.contains(SrcReg)) { // Copy from a GR64 register to a VR128 register. - if (X86::VR128RegClass.contains(DestReg)) - return HasAVX ? X86::VMOV64toPQIrr : X86::MOV64toPQIrr; + if (X86::VR128XRegClass.contains(DestReg)) + return HasAVX512 ? X86::VMOV64toPQIZrr: (HasAVX ? X86::VMOV64toPQIrr : + X86::MOV64toPQIrr); // Copy from a GR64 register to a VR64 register. if (X86::VR64RegClass.contains(DestReg)) return X86::MOV64toSDrr; @@ -2814,14 +3005,30 @@ static unsigned CopyToFromAsymmetricReg(unsigned DestReg, unsigned SrcReg, // SrcReg(FR32) -> DestReg(GR32) // SrcReg(GR32) -> DestReg(FR32) - if (X86::GR32RegClass.contains(DestReg) && X86::FR32RegClass.contains(SrcReg)) + if (X86::GR32RegClass.contains(DestReg) && X86::FR32XRegClass.contains(SrcReg)) // Copy from a FR32 register to a GR32 register. - return HasAVX ? X86::VMOVSS2DIrr : X86::MOVSS2DIrr; + return HasAVX512 ? X86::VMOVSS2DIZrr : (HasAVX ? X86::VMOVSS2DIrr : X86::MOVSS2DIrr); - if (X86::FR32RegClass.contains(DestReg) && X86::GR32RegClass.contains(SrcReg)) + if (X86::FR32XRegClass.contains(DestReg) && X86::GR32RegClass.contains(SrcReg)) // Copy from a GR32 register to a FR32 register. - return HasAVX ? X86::VMOVDI2SSrr : X86::MOVDI2SSrr; + return HasAVX512 ? X86::VMOVDI2SSZrr : (HasAVX ? 
X86::VMOVDI2SSrr : X86::MOVDI2SSrr); + return 0; +} +static +unsigned copyPhysRegOpcode_AVX512(unsigned& DestReg, unsigned& SrcReg) { + if (X86::VR128XRegClass.contains(DestReg, SrcReg) || + X86::VR256XRegClass.contains(DestReg, SrcReg) || + X86::VR512RegClass.contains(DestReg, SrcReg)) { + DestReg = get512BitSuperRegister(DestReg); + SrcReg = get512BitSuperRegister(SrcReg); + return X86::VMOVAPSZrr; + } + if ((X86::VK8RegClass.contains(DestReg) || + X86::VK16RegClass.contains(DestReg)) && + (X86::VK8RegClass.contains(SrcReg) || + X86::VK16RegClass.contains(SrcReg))) + return X86::KMOVWkk; return 0; } @@ -2831,7 +3038,8 @@ void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB, bool KillSrc) const { // First deal with the normal symmetric copies. bool HasAVX = TM.getSubtarget<X86Subtarget>().hasAVX(); - unsigned Opc; + bool HasAVX512 = TM.getSubtarget<X86Subtarget>().hasAVX512(); + unsigned Opc = 0; if (X86::GR64RegClass.contains(DestReg, SrcReg)) Opc = X86::MOV64rr; else if (X86::GR32RegClass.contains(DestReg, SrcReg)) @@ -2849,14 +3057,17 @@ void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB, "8-bit H register can not be copied outside GR8_NOREX"); } else Opc = X86::MOV8rr; - } else if (X86::VR128RegClass.contains(DestReg, SrcReg)) + } + else if (X86::VR64RegClass.contains(DestReg, SrcReg)) + Opc = X86::MMX_MOVQ64rr; + else if (HasAVX512) + Opc = copyPhysRegOpcode_AVX512(DestReg, SrcReg); + else if (X86::VR128RegClass.contains(DestReg, SrcReg)) Opc = HasAVX ? X86::VMOVAPSrr : X86::MOVAPSrr; else if (X86::VR256RegClass.contains(DestReg, SrcReg)) Opc = X86::VMOVAPSYrr; - else if (X86::VR64RegClass.contains(DestReg, SrcReg)) - Opc = X86::MMX_MOVQ64rr; - else - Opc = CopyToFromAsymmetricReg(DestReg, SrcReg, HasAVX); + if (!Opc) + Opc = CopyToFromAsymmetricReg(DestReg, SrcReg, TM.getSubtarget<X86Subtarget>()); if (Opc) { BuildMI(MBB, MI, DL, get(Opc), DestReg) @@ -2904,6 +3115,18 @@ static unsigned getLoadStoreRegOpcode(unsigned Reg, bool isStackAligned, const TargetMachine &TM, bool load) { + if (TM.getSubtarget<X86Subtarget>().hasAVX512()) { + if (X86::VK8RegClass.hasSubClassEq(RC) || + X86::VK16RegClass.hasSubClassEq(RC)) + return load ? X86::KMOVWkm : X86::KMOVWmk; + if (RC->getSize() == 4 && X86::FR32XRegClass.hasSubClassEq(RC)) + return load ? X86::VMOVSSZrm : X86::VMOVSSZmr; + if (RC->getSize() == 8 && X86::FR64XRegClass.hasSubClassEq(RC)) + return load ? X86::VMOVSDZrm : X86::VMOVSDZmr; + if (X86::VR512RegClass.hasSubClassEq(RC)) + return load ? X86::VMOVUPSZrm : X86::VMOVUPSZmr; + } + bool HasAVX = TM.getSubtarget<X86Subtarget>().hasAVX(); switch (RC->getSize()) { default: @@ -2945,7 +3168,8 @@ static unsigned getLoadStoreRegOpcode(unsigned Reg, assert(X86::RFP80RegClass.hasSubClassEq(RC) && "Unknown 10-byte regclass"); return load ? X86::LD_Fp80m : X86::ST_FpP80m; case 16: { - assert(X86::VR128RegClass.hasSubClassEq(RC) && "Unknown 16-byte regclass"); + assert((X86::VR128RegClass.hasSubClassEq(RC) || + X86::VR128XRegClass.hasSubClassEq(RC))&& "Unknown 16-byte regclass"); // If stack is realigned we can use aligned stores. if (isStackAligned) return load ? @@ -2957,12 +3181,19 @@ static unsigned getLoadStoreRegOpcode(unsigned Reg, (HasAVX ? X86::VMOVUPSmr : X86::MOVUPSmr); } case 32: - assert(X86::VR256RegClass.hasSubClassEq(RC) && "Unknown 32-byte regclass"); + assert((X86::VR256RegClass.hasSubClassEq(RC) || + X86::VR256XRegClass.hasSubClassEq(RC)) && "Unknown 32-byte regclass"); // If stack is realigned we can use aligned stores. if (isStackAligned) return load ? 
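copyPhysReg and getLoadStoreRegOpcode above now prefer AVX-512 forms when they are available: k-mask registers are copied and spilled with KMOVW, and VR128X/VR256X/VR512 copies are widened to the 512-bit super-registers and use the Z-suffixed VMOVAPS/VMOVUPS, with the aligned form only when the slot alignment allows it. A condensed, name-only sketch of that priority; the strings are labels for the opcodes chosen above, and the enum is invented for the sketch:

#include <string>

enum class RegKind { Mask, Vec512, Vec256, Vec128 };

std::string spillOpcode(RegKind K, bool Aligned, bool HasAVX512, bool HasAVX) {
  if (HasAVX512) {
    if (K == RegKind::Mask)   return "KMOVW";
    if (K == RegKind::Vec512) return Aligned ? "VMOVAPSZ" : "VMOVUPSZ";
  }
  if (K == RegKind::Vec256)   return Aligned ? "VMOVAPSY" : "VMOVUPSY";
  if (K == RegKind::Vec128)
    return HasAVX ? (Aligned ? "VMOVAPS" : "VMOVUPS")
                  : (Aligned ? "MOVAPS"  : "MOVUPS");
  return std::string(); // other register classes are handled elsewhere
}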
X86::VMOVAPSYrm : X86::VMOVAPSYmr; else return load ? X86::VMOVUPSYrm : X86::VMOVUPSYmr; + case 64: + assert(X86::VR512RegClass.hasSubClassEq(RC) && "Unknown 64-byte regclass"); + if (isStackAligned) + return load ? X86::VMOVAPSZrm : X86::VMOVAPSZmr; + else + return load ? X86::VMOVUPSZrm : X86::VMOVUPSZmr; } } @@ -2989,7 +3220,7 @@ void X86InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, const MachineFunction &MF = *MBB.getParent(); assert(MF.getFrameInfo()->getObjectSize(FrameIdx) >= RC->getSize() && "Stack slot too small for store"); - unsigned Alignment = RC->getSize() == 32 ? 32 : 16; + unsigned Alignment = std::max<uint32_t>(RC->getSize(), 16); bool isAligned = (TM.getFrameLowering()->getStackAlignment() >= Alignment) || RI.canRealignStack(MF); unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, TM); @@ -3005,7 +3236,7 @@ void X86InstrInfo::storeRegToAddr(MachineFunction &MF, unsigned SrcReg, MachineInstr::mmo_iterator MMOBegin, MachineInstr::mmo_iterator MMOEnd, SmallVectorImpl<MachineInstr*> &NewMIs) const { - unsigned Alignment = RC->getSize() == 32 ? 32 : 16; + unsigned Alignment = std::max<uint32_t>(RC->getSize(), 16); bool isAligned = MMOBegin != MMOEnd && (*MMOBegin)->getAlignment() >= Alignment; unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, TM); @@ -3025,7 +3256,7 @@ void X86InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const { const MachineFunction &MF = *MBB.getParent(); - unsigned Alignment = RC->getSize() == 32 ? 32 : 16; + unsigned Alignment = std::max<uint32_t>(RC->getSize(), 16); bool isAligned = (TM.getFrameLowering()->getStackAlignment() >= Alignment) || RI.canRealignStack(MF); unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, TM); @@ -3039,7 +3270,7 @@ void X86InstrInfo::loadRegFromAddr(MachineFunction &MF, unsigned DestReg, MachineInstr::mmo_iterator MMOBegin, MachineInstr::mmo_iterator MMOEnd, SmallVectorImpl<MachineInstr*> &NewMIs) const { - unsigned Alignment = RC->getSize() == 32 ? 32 : 16; + unsigned Alignment = std::max<uint32_t>(RC->getSize(), 16); bool isAligned = MMOBegin != MMOEnd && (*MMOBegin)->getAlignment() >= Alignment; unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, TM); @@ -3171,6 +3402,25 @@ inline static bool isRedundantFlagInstr(MachineInstr *FlagI, unsigned SrcReg, inline static bool isDefConvertible(MachineInstr *MI) { switch (MI->getOpcode()) { default: return false; + + // The shift instructions only modify ZF if their shift count is non-zero. + // N.B.: The processor truncates the shift count depending on the encoding. + case X86::SAR8ri: case X86::SAR16ri: case X86::SAR32ri:case X86::SAR64ri: + case X86::SHR8ri: case X86::SHR16ri: case X86::SHR32ri:case X86::SHR64ri: + return getTruncatedShiftCount(MI, 2) != 0; + + // Some left shift instructions can be turned into LEA instructions but only + // if their flags aren't used. Avoid transforming such instructions. 
+ case X86::SHL8ri: case X86::SHL16ri: case X86::SHL32ri:case X86::SHL64ri:{ + unsigned ShAmt = getTruncatedShiftCount(MI, 2); + if (isTruncatedShiftCountForLEA(ShAmt)) return false; + return ShAmt != 0; + } + + case X86::SHRD16rri8:case X86::SHRD32rri8:case X86::SHRD64rri8: + case X86::SHLD16rri8:case X86::SHLD32rri8:case X86::SHLD64rri8: + return getTruncatedShiftCount(MI, 3) != 0; + case X86::SUB64ri32: case X86::SUB64ri8: case X86::SUB32ri: case X86::SUB32ri8: case X86::SUB16ri: case X86::SUB16ri8: case X86::SUB8ri: case X86::SUB64rr: case X86::SUB32rr: @@ -3200,8 +3450,37 @@ inline static bool isDefConvertible(MachineInstr *MI) { case X86::OR8ri: case X86::OR64rr: case X86::OR32rr: case X86::OR16rr: case X86::OR8rr: case X86::OR64rm: case X86::OR32rm: case X86::OR16rm: case X86::OR8rm: + case X86::NEG8r: case X86::NEG16r: case X86::NEG32r: case X86::NEG64r: + case X86::SAR8r1: case X86::SAR16r1: case X86::SAR32r1:case X86::SAR64r1: + case X86::SHR8r1: case X86::SHR16r1: case X86::SHR32r1:case X86::SHR64r1: + case X86::SHL8r1: case X86::SHL16r1: case X86::SHL32r1:case X86::SHL64r1: + case X86::ADC32ri: case X86::ADC32ri8: + case X86::ADC32rr: case X86::ADC64ri32: + case X86::ADC64ri8: case X86::ADC64rr: + case X86::SBB32ri: case X86::SBB32ri8: + case X86::SBB32rr: case X86::SBB64ri32: + case X86::SBB64ri8: case X86::SBB64rr: case X86::ANDN32rr: case X86::ANDN32rm: case X86::ANDN64rr: case X86::ANDN64rm: + case X86::BEXTR32rr: case X86::BEXTR64rr: + case X86::BEXTR32rm: case X86::BEXTR64rm: + case X86::BLSI32rr: case X86::BLSI32rm: + case X86::BLSI64rr: case X86::BLSI64rm: + case X86::BLSMSK32rr:case X86::BLSMSK32rm: + case X86::BLSMSK64rr:case X86::BLSMSK64rm: + case X86::BLSR32rr: case X86::BLSR32rm: + case X86::BLSR64rr: case X86::BLSR64rm: + case X86::BZHI32rr: case X86::BZHI32rm: + case X86::BZHI64rr: case X86::BZHI64rm: + case X86::LZCNT16rr: case X86::LZCNT16rm: + case X86::LZCNT32rr: case X86::LZCNT32rm: + case X86::LZCNT64rr: case X86::LZCNT64rm: + case X86::POPCNT16rr:case X86::POPCNT16rm: + case X86::POPCNT32rr:case X86::POPCNT32rm: + case X86::POPCNT64rr:case X86::POPCNT64rm: + case X86::TZCNT16rr: case X86::TZCNT16rm: + case X86::TZCNT32rr: case X86::TZCNT32rm: + case X86::TZCNT64rr: case X86::TZCNT64rm: return true; } } @@ -3308,10 +3587,7 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2, // MOV32r0 etc. are implemented with xor which clobbers condition code. // They are safe to move up, if the definition to EFLAGS is dead and // earlier instructions do not read or write EFLAGS. - if (!Movr0Inst && (Instr->getOpcode() == X86::MOV8r0 || - Instr->getOpcode() == X86::MOV16r0 || - Instr->getOpcode() == X86::MOV32r0 || - Instr->getOpcode() == X86::MOV64r0) && + if (!Movr0Inst && Instr->getOpcode() == X86::MOV32r0 && Instr->registerDefIsDead(X86::EFLAGS, TRI)) { Movr0Inst = Instr; continue; @@ -3420,20 +3696,38 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2, // The instruction to be updated is either Sub or MI. Sub = IsCmpZero ? MI : Sub; - // Move Movr0Inst to the place right before Sub. + // Move Movr0Inst to the appropriate place before Sub. if (Movr0Inst) { - Sub->getParent()->remove(Movr0Inst); - Sub->getParent()->insert(MachineBasicBlock::iterator(Sub), Movr0Inst); + // Look backwards until we find a def that doesn't use the current EFLAGS. 
+ Def = Sub; + MachineBasicBlock::reverse_iterator + InsertI = MachineBasicBlock::reverse_iterator(++Def), + InsertE = Sub->getParent()->rend(); + for (; InsertI != InsertE; ++InsertI) { + MachineInstr *Instr = &*InsertI; + if (!Instr->readsRegister(X86::EFLAGS, TRI) && + Instr->modifiesRegister(X86::EFLAGS, TRI)) { + Sub->getParent()->remove(Movr0Inst); + Instr->getParent()->insert(MachineBasicBlock::iterator(Instr), + Movr0Inst); + break; + } + } + if (InsertI == InsertE) + return false; } // Make sure Sub instruction defines EFLAGS and mark the def live. - unsigned LastOperand = Sub->getNumOperands() - 1; - assert(Sub->getNumOperands() >= 2 && - Sub->getOperand(LastOperand).isReg() && - Sub->getOperand(LastOperand).getReg() == X86::EFLAGS && - "EFLAGS should be the last operand of SUB, ADD, OR, XOR, AND"); - Sub->getOperand(LastOperand).setIsDef(true); - Sub->getOperand(LastOperand).setIsDead(false); + unsigned i = 0, e = Sub->getNumOperands(); + for (; i != e; ++i) { + MachineOperand &MO = Sub->getOperand(i); + if (MO.isReg() && MO.isDef() && MO.getReg() == X86::EFLAGS) { + MO.setIsDead(false); + break; + } + } + assert(i != e && "Unable to locate a def EFLAGS operand"); + CmpInstr->eraseFromParent(); // Modify the condition code of instructions in OpsToUpdate. @@ -3558,6 +3852,8 @@ bool X86InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { case X86::AVX_SET0: assert(HasAVX && "AVX not supported"); return Expand2AddrUndef(MIB, get(X86::VXORPSYrr)); + case X86::AVX512_512_SET0: + return Expand2AddrUndef(MIB, get(X86::VPXORDZrr)); case X86::V_SETALLONES: return Expand2AddrUndef(MIB, get(HasAVX ? X86::VPCMPEQDrr : X86::PCMPEQDrr)); case X86::AVX2_SETALLONES: @@ -3565,23 +3861,13 @@ bool X86InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { case X86::TEST8ri_NOREX: MI->setDesc(get(X86::TEST8ri)); return true; + case X86::KSET0W: return Expand2AddrUndef(MIB, get(X86::KXORWrr)); + case X86::KSET1B: + case X86::KSET1W: return Expand2AddrUndef(MIB, get(X86::KXNORWrr)); } return false; } -MachineInstr* -X86InstrInfo::emitFrameIndexDebugValue(MachineFunction &MF, - int FrameIx, uint64_t Offset, - const MDNode *MDPtr, - DebugLoc DL) const { - X86AddressMode AM; - AM.BaseType = X86AddressMode::FrameIndexBase; - AM.Base.FrameIndex = FrameIx; - MachineInstrBuilder MIB = BuildMI(MF, DL, get(X86::DBG_VALUE)); - addFullAddress(MIB, AM).addImm(Offset).addMetadata(MDPtr); - return &*MIB; -} - static MachineInstr *FuseTwoAddrInst(MachineFunction &MF, unsigned Opcode, const SmallVectorImpl<MachineOperand> &MOs, MachineInstr *MI, @@ -3686,18 +3972,11 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, OpcodeTablePtr = &RegOp2MemOpTable2Addr; isTwoAddrFold = true; } else if (i == 0) { // If operand 0 - unsigned Opc = 0; - switch (MI->getOpcode()) { - default: break; - case X86::MOV64r0: Opc = X86::MOV64mi32; break; - case X86::MOV32r0: Opc = X86::MOV32mi; break; - case X86::MOV16r0: Opc = X86::MOV16mi; break; - case X86::MOV8r0: Opc = X86::MOV8mi; break; + if (MI->getOpcode() == X86::MOV32r0) { + NewMI = MakeM0Inst(*this, X86::MOV32mi, MOs, MI); + if (NewMI) + return NewMI; } - if (Opc) - NewMI = MakeM0Inst(*this, Opc, MOs, MI); - if (NewMI) - return NewMI; OpcodeTablePtr = &RegOp2MemOpTable0; } else if (i == 1) { @@ -3798,18 +4077,6 @@ static bool hasPartialRegUpdate(unsigned Opcode) { case X86::RSQRTSSr_Int: case X86::SQRTSSr: case X86::SQRTSSr_Int: - // AVX encoded versions - case X86::VCVTSD2SSrr: - case X86::Int_VCVTSD2SSrr: - case X86::VCVTSS2SDrr: - case 
X86::Int_VCVTSS2SDrr:
- case X86::VRCPSSr:
- case X86::VROUNDSDr:
- case X86::VROUNDSDr_Int:
- case X86::VROUNDSSr:
- case X86::VROUNDSSr_Int:
- case X86::VRSQRTSSr:
- case X86::VSQRTSSr:
return true; }
@@ -3841,10 +4108,77 @@ getPartialRegUpdateClearance(const MachineInstr *MI, unsigned OpNum,
return 16; }
+// Return true for any instruction that copies the high bits of the first source
+// operand into the unused high bits of the destination operand.
+static bool hasUndefRegUpdate(unsigned Opcode) {
+ switch (Opcode) {
+ case X86::VCVTSI2SSrr:
+ case X86::Int_VCVTSI2SSrr:
+ case X86::VCVTSI2SS64rr:
+ case X86::Int_VCVTSI2SS64rr:
+ case X86::VCVTSI2SDrr:
+ case X86::Int_VCVTSI2SDrr:
+ case X86::VCVTSI2SD64rr:
+ case X86::Int_VCVTSI2SD64rr:
+ case X86::VCVTSD2SSrr:
+ case X86::Int_VCVTSD2SSrr:
+ case X86::VCVTSS2SDrr:
+ case X86::Int_VCVTSS2SDrr:
+ case X86::VRCPSSr:
+ case X86::VROUNDSDr:
+ case X86::VROUNDSDr_Int:
+ case X86::VROUNDSSr:
+ case X86::VROUNDSSr_Int:
+ case X86::VRSQRTSSr:
+ case X86::VSQRTSSr:
+
+ // AVX-512
+ case X86::VCVTSD2SSZrr:
+ case X86::VCVTSS2SDZrr:
+ return true;
+ }
+
+ return false;
+}
+
+/// Inform the ExeDepsFix pass how many idle instructions we would like before
+/// certain undef register reads.
+///
+/// This catches the VCVTSI2SD family of instructions:
+///
+/// vcvtsi2sdq %rax, %xmm0<undef>, %xmm14
+///
+/// We should be careful *not* to catch VXOR idioms which are presumably
+/// handled specially in the pipeline:
+///
+/// vxorps %xmm1<undef>, %xmm1<undef>, %xmm1
+///
+/// Like getPartialRegUpdateClearance, this makes a strong assumption that the
+/// high bits that are passed-through are not live.
+unsigned X86InstrInfo::
+getUndefRegClearance(const MachineInstr *MI, unsigned &OpNum,
+ const TargetRegisterInfo *TRI) const {
+ if (!hasUndefRegUpdate(MI->getOpcode()))
+ return 0;
+
+ // Set the OpNum parameter to the first source operand.
+ OpNum = 1;
+
+ const MachineOperand &MO = MI->getOperand(OpNum);
+ if (MO.isUndef() && TargetRegisterInfo::isPhysicalRegister(MO.getReg())) {
+ // Use the same magic number as getPartialRegUpdateClearance.
+ return 16;
+ }
+ return 0;
+}
+
void X86InstrInfo::
breakPartialRegDependency(MachineBasicBlock::iterator MI, unsigned OpNum,
const TargetRegisterInfo *TRI) const {
unsigned Reg = MI->getOperand(OpNum).getReg();
+ // If MI kills this register, the false dependence is already broken.
+ if (MI->killsRegister(Reg, TRI))
+ return;
if (X86::VR128RegClass.contains(Reg)) {
// These instructions are all floating point domain, so xorps is the best
// choice.
@@ -3864,10 +4198,75 @@ breakPartialRegDependency(MachineBasicBlock::iterator MI, unsigned OpNum,
MI->addRegisterKilled(Reg, TRI, true);
}
-MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
- MachineInstr *MI,
- const SmallVectorImpl<unsigned> &Ops,
- int FrameIndex) const {
+static MachineInstr* foldPatchpoint(MachineFunction &MF,
+ MachineInstr *MI,
+ const SmallVectorImpl<unsigned> &Ops,
+ int FrameIndex,
+ const TargetInstrInfo &TII) {
+ unsigned StartIdx = 0;
+ switch (MI->getOpcode()) {
+ case TargetOpcode::STACKMAP:
+ StartIdx = 2; // Skip ID, nShadowBytes.
+ break;
+ case TargetOpcode::PATCHPOINT: {
+ // For PatchPoint, the call args are not foldable.
+ PatchPointOpers opers(MI);
+ StartIdx = opers.getVarIdx();
+ break;
+ }
+ default:
+ llvm_unreachable("unexpected stackmap opcode");
+ }
+
+ // Return false if any operands requested for folding are not foldable (not
+ // part of the stackmap's live values).
+ for (SmallVectorImpl<unsigned>::const_iterator I = Ops.begin(), E = Ops.end();
+ I != E; ++I) {
+ if (*I < StartIdx)
+ return 0;
+ }
+
+ MachineInstr *NewMI =
+ MF.CreateMachineInstr(TII.get(MI->getOpcode()), MI->getDebugLoc(), true);
+ MachineInstrBuilder MIB(MF, NewMI);
+
+ // No need to fold the return, the metadata, and the function arguments
+ for (unsigned i = 0; i < StartIdx; ++i)
+ MIB.addOperand(MI->getOperand(i));
+
+ for (unsigned i = StartIdx; i < MI->getNumOperands(); ++i) {
+ MachineOperand &MO = MI->getOperand(i);
+ if (std::find(Ops.begin(), Ops.end(), i) != Ops.end()) {
+ assert(MO.getReg() && "patchpoint can only fold a vreg operand");
+ // Compute the spill slot size and offset.
+ const TargetRegisterClass *RC = MF.getRegInfo().getRegClass(MO.getReg());
+ unsigned SpillSize;
+ unsigned SpillOffset;
+ bool Valid = TII.getStackSlotRange(RC, MO.getSubReg(), SpillSize,
+ SpillOffset, &MF.getTarget());
+ if (!Valid)
+ report_fatal_error("cannot spill patchpoint subregister operand");
+
+ MIB.addOperand(MachineOperand::CreateImm(StackMaps::IndirectMemRefOp));
+ MIB.addOperand(MachineOperand::CreateImm(SpillSize));
+ MIB.addOperand(MachineOperand::CreateFI(FrameIndex));
+ addOffset(MIB, SpillOffset);
+ }
+ else
+ MIB.addOperand(MO);
+ }
+ return NewMI;
+}
+
+MachineInstr*
+X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI,
+ const SmallVectorImpl<unsigned> &Ops,
+ int FrameIndex) const {
+ // Special case stack map and patch point intrinsics.
+ if (MI->getOpcode() == TargetOpcode::STACKMAP
+ || MI->getOpcode() == TargetOpcode::PATCHPOINT) {
+ return foldPatchpoint(MF, MI, Ops, FrameIndex, *this);
+ }
// Check switch flag
if (NoFusing) return NULL;
@@ -3914,6 +4313,12 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
MachineInstr *MI,
const SmallVectorImpl<unsigned> &Ops,
MachineInstr *LoadMI) const {
+ // If loading from a FrameIndex, fold directly from the FrameIndex.
+ unsigned NumOps = LoadMI->getDesc().getNumOperands();
+ int FrameIndex;
+ if (isLoadFromStackSlot(LoadMI, FrameIndex))
+ return foldMemoryOperandImpl(MF, MI, Ops, FrameIndex);
+
// Check switch flag
if (NoFusing) return NULL;
@@ -4039,7 +4444,6 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
return NULL;
// Folding a normal load. Just copy the load's address operands.
- unsigned NumOps = LoadMI->getDesc().getNumOperands(); for (unsigned i = NumOps - X86::AddrNumOperands; i != NumOps; ++i) MOs.push_back(LoadMI->getOperand(i)); break; @@ -4087,13 +4491,9 @@ bool X86InstrInfo::canFoldMemoryOperand(const MachineInstr *MI, if (isTwoAddr && NumOps >= 2 && OpNum < 2) { OpcodeTablePtr = &RegOp2MemOpTable2Addr; } else if (OpNum == 0) { // If operand 0 - switch (Opc) { - case X86::MOV8r0: - case X86::MOV16r0: - case X86::MOV32r0: - case X86::MOV64r0: return true; - default: break; - } + if (Opc == X86::MOV32r0) + return true; + OpcodeTablePtr = &RegOp2MemOpTable0; } else if (OpNum == 1) { OpcodeTablePtr = &RegOp2MemOpTable1; @@ -4254,7 +4654,7 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N, std::vector<SDValue> AddrOps; std::vector<SDValue> BeforeOps; std::vector<SDValue> AfterOps; - DebugLoc dl = N->getDebugLoc(); + SDLoc dl(N); unsigned NumOps = N->getNumOperands(); for (unsigned i = 0; i != NumOps-1; ++i) { SDValue Op = N->getOperand(i); @@ -4511,6 +4911,167 @@ bool X86InstrInfo::shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2, return true; } +bool X86InstrInfo::shouldScheduleAdjacent(MachineInstr* First, + MachineInstr *Second) const { + // Check if this processor supports macro-fusion. Since this is a minor + // heuristic, we haven't specifically reserved a feature. hasAVX is a decent + // proxy for SandyBridge+. + if (!TM.getSubtarget<X86Subtarget>().hasAVX()) + return false; + + enum { + FuseTest, + FuseCmp, + FuseInc + } FuseKind; + + switch(Second->getOpcode()) { + default: + return false; + case X86::JE_4: + case X86::JNE_4: + case X86::JL_4: + case X86::JLE_4: + case X86::JG_4: + case X86::JGE_4: + FuseKind = FuseInc; + break; + case X86::JB_4: + case X86::JBE_4: + case X86::JA_4: + case X86::JAE_4: + FuseKind = FuseCmp; + break; + case X86::JS_4: + case X86::JNS_4: + case X86::JP_4: + case X86::JNP_4: + case X86::JO_4: + case X86::JNO_4: + FuseKind = FuseTest; + break; + } + switch (First->getOpcode()) { + default: + return false; + case X86::TEST8rr: + case X86::TEST16rr: + case X86::TEST32rr: + case X86::TEST64rr: + case X86::TEST8ri: + case X86::TEST16ri: + case X86::TEST32ri: + case X86::TEST32i32: + case X86::TEST64i32: + case X86::TEST64ri32: + case X86::TEST8rm: + case X86::TEST16rm: + case X86::TEST32rm: + case X86::TEST64rm: + case X86::AND16i16: + case X86::AND16ri: + case X86::AND16ri8: + case X86::AND16rm: + case X86::AND16rr: + case X86::AND32i32: + case X86::AND32ri: + case X86::AND32ri8: + case X86::AND32rm: + case X86::AND32rr: + case X86::AND64i32: + case X86::AND64ri32: + case X86::AND64ri8: + case X86::AND64rm: + case X86::AND64rr: + case X86::AND8i8: + case X86::AND8ri: + case X86::AND8rm: + case X86::AND8rr: + return true; + case X86::CMP16i16: + case X86::CMP16ri: + case X86::CMP16ri8: + case X86::CMP16rm: + case X86::CMP16rr: + case X86::CMP32i32: + case X86::CMP32ri: + case X86::CMP32ri8: + case X86::CMP32rm: + case X86::CMP32rr: + case X86::CMP64i32: + case X86::CMP64ri32: + case X86::CMP64ri8: + case X86::CMP64rm: + case X86::CMP64rr: + case X86::CMP8i8: + case X86::CMP8ri: + case X86::CMP8rm: + case X86::CMP8rr: + case X86::ADD16i16: + case X86::ADD16ri: + case X86::ADD16ri8: + case X86::ADD16ri8_DB: + case X86::ADD16ri_DB: + case X86::ADD16rm: + case X86::ADD16rr: + case X86::ADD16rr_DB: + case X86::ADD32i32: + case X86::ADD32ri: + case X86::ADD32ri8: + case X86::ADD32ri8_DB: + case X86::ADD32ri_DB: + case X86::ADD32rm: + case X86::ADD32rr: + case X86::ADD32rr_DB: + case X86::ADD64i32: + case 
X86::ADD64ri32: + case X86::ADD64ri32_DB: + case X86::ADD64ri8: + case X86::ADD64ri8_DB: + case X86::ADD64rm: + case X86::ADD64rr: + case X86::ADD64rr_DB: + case X86::ADD8i8: + case X86::ADD8mi: + case X86::ADD8mr: + case X86::ADD8ri: + case X86::ADD8rm: + case X86::ADD8rr: + case X86::SUB16i16: + case X86::SUB16ri: + case X86::SUB16ri8: + case X86::SUB16rm: + case X86::SUB16rr: + case X86::SUB32i32: + case X86::SUB32ri: + case X86::SUB32ri8: + case X86::SUB32rm: + case X86::SUB32rr: + case X86::SUB64i32: + case X86::SUB64ri32: + case X86::SUB64ri8: + case X86::SUB64rm: + case X86::SUB64rr: + case X86::SUB8i8: + case X86::SUB8ri: + case X86::SUB8rm: + case X86::SUB8rr: + return FuseKind == FuseCmp || FuseKind == FuseInc; + case X86::INC16r: + case X86::INC32r: + case X86::INC64_16r: + case X86::INC64_32r: + case X86::INC64r: + case X86::INC8r: + case X86::DEC16r: + case X86::DEC32r: + case X86::DEC64_16r: + case X86::DEC64_32r: + case X86::DEC64r: + case X86::DEC8r: + return FuseKind == FuseInc; + } +} bool X86InstrInfo:: ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const { @@ -4704,6 +5265,37 @@ bool X86InstrInfo::isHighLatencyDef(int opc) const { case X86::VSQRTSSm: case X86::VSQRTSSm_Int: case X86::VSQRTSSr: + case X86::VSQRTPDZrm: + case X86::VSQRTPDZrr: + case X86::VSQRTPSZrm: + case X86::VSQRTPSZrr: + case X86::VSQRTSDZm: + case X86::VSQRTSDZm_Int: + case X86::VSQRTSDZr: + case X86::VSQRTSSZm_Int: + case X86::VSQRTSSZr: + case X86::VSQRTSSZm: + case X86::VDIVSDZrm: + case X86::VDIVSDZrr: + case X86::VDIVSSZrm: + case X86::VDIVSSZrr: + + case X86::VGATHERQPSZrm: + case X86::VGATHERQPDZrm: + case X86::VGATHERDPDZrm: + case X86::VGATHERDPSZrm: + case X86::VPGATHERQDZrm: + case X86::VPGATHERQQZrm: + case X86::VPGATHERDDZrm: + case X86::VPGATHERDQZrm: + case X86::VSCATTERQPDZmr: + case X86::VSCATTERQPSZmr: + case X86::VSCATTERDPDZmr: + case X86::VSCATTERDPSZmr: + case X86::VPSCATTERQDZmr: + case X86::VPSCATTERQQZmr: + case X86::VPSCATTERDDZmr: + case X86::VPSCATTERDQZmr: return true; } } diff --git a/contrib/llvm/lib/Target/X86/X86InstrInfo.h b/contrib/llvm/lib/Target/X86/X86InstrInfo.h index 260f054..600e392 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrInfo.h +++ b/contrib/llvm/lib/Target/X86/X86InstrInfo.h @@ -152,6 +152,8 @@ class X86InstrInfo : public X86GenInstrInfo { MemOp2RegOpTableType &M2RTable, unsigned RegOp, unsigned MemOp, unsigned Flags); + virtual void anchor(); + public: explicit X86InstrInfo(X86TargetMachine &tm); @@ -192,6 +194,19 @@ public: const MachineInstr *Orig, const TargetRegisterInfo &TRI) const; + /// Given an operand within a MachineInstr, insert preceding code to put it + /// into the right format for a particular kind of LEA instruction. This may + /// involve using an appropriate super-register instead (with an implicit use + /// of the original) or creating a new virtual register and inserting COPY + /// instructions to get the data into the right class. + /// + /// Reference parameters are set to indicate how caller should add this + /// operand to the LEA instruction. + bool classifyLEAReg(MachineInstr *MI, const MachineOperand &Src, + unsigned LEAOpcode, bool AllowSP, + unsigned &NewSrc, bool &isKill, + bool &isUndef, MachineOperand &ImplicitOp) const; + /// convertToThreeAddress - This method must be implemented by targets that /// set the M_CONVERTIBLE_TO_3_ADDR flag. 
When this flag is set, the target /// may be able to convert a two-address instruction into a true @@ -262,12 +277,6 @@ public: virtual bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const; - virtual - MachineInstr *emitFrameIndexDebugValue(MachineFunction &MF, - int FrameIx, uint64_t Offset, - const MDNode *MDPtr, - DebugLoc DL) const; - /// foldMemoryOperand - If this target supports it, fold a load or store of /// the specified stack slot into the specified machine instruction for the /// specified operand(s). If this is possible, the target should perform the @@ -332,6 +341,9 @@ public: int64_t Offset1, int64_t Offset2, unsigned NumLoads) const; + virtual bool shouldScheduleAdjacent(MachineInstr* First, + MachineInstr *Second) const LLVM_OVERRIDE; + virtual void getNoopForMachoTarget(MCInst &NopInst) const; virtual @@ -359,6 +371,8 @@ public: unsigned getPartialRegUpdateClearance(const MachineInstr *MI, unsigned OpNum, const TargetRegisterInfo *TRI) const; + unsigned getUndefRegClearance(const MachineInstr *MI, unsigned &OpNum, + const TargetRegisterInfo *TRI) const; void breakPartialRegDependency(MachineBasicBlock::iterator MI, unsigned OpNum, const TargetRegisterInfo *TRI) const; diff --git a/contrib/llvm/lib/Target/X86/X86InstrInfo.td b/contrib/llvm/lib/Target/X86/X86InstrInfo.td index 0743701..6e5d543 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrInfo.td +++ b/contrib/llvm/lib/Target/X86/X86InstrInfo.td @@ -248,11 +248,12 @@ def X86xor_flag : SDNode<"X86ISD::XOR", SDTBinaryArithWithFlags, [SDNPCommutative]>; def X86and_flag : SDNode<"X86ISD::AND", SDTBinaryArithWithFlags, [SDNPCommutative]>; -def X86andn_flag : SDNode<"X86ISD::ANDN", SDTBinaryArithWithFlags>; def X86blsi : SDNode<"X86ISD::BLSI", SDTIntUnaryOp>; def X86blsmsk : SDNode<"X86ISD::BLSMSK", SDTIntUnaryOp>; def X86blsr : SDNode<"X86ISD::BLSR", SDTIntUnaryOp>; +def X86bzhi : SDNode<"X86ISD::BZHI", SDTIntShiftOp>; +def X86bextr : SDNode<"X86ISD::BEXTR", SDTIntBinOp>; def X86mul_imm : SDNode<"X86ISD::MUL_IMM", SDTIntBinOp>; @@ -278,43 +279,52 @@ def ptr_rc_nosp : PointerLikeRegClass<1>; // *mem - Operand definitions for the funky X86 addressing mode operands. 
// -def X86MemAsmOperand : AsmOperandClass { - let Name = "Mem"; let PredicateMethod = "isMem"; +def X86MemAsmOperand : AsmOperandClass { + let Name = "Mem"; } -def X86Mem8AsmOperand : AsmOperandClass { - let Name = "Mem8"; let PredicateMethod = "isMem8"; +def X86Mem8AsmOperand : AsmOperandClass { + let Name = "Mem8"; let RenderMethod = "addMemOperands"; } -def X86Mem16AsmOperand : AsmOperandClass { - let Name = "Mem16"; let PredicateMethod = "isMem16"; +def X86Mem16AsmOperand : AsmOperandClass { + let Name = "Mem16"; let RenderMethod = "addMemOperands"; } -def X86Mem32AsmOperand : AsmOperandClass { - let Name = "Mem32"; let PredicateMethod = "isMem32"; +def X86Mem32AsmOperand : AsmOperandClass { + let Name = "Mem32"; let RenderMethod = "addMemOperands"; } -def X86Mem64AsmOperand : AsmOperandClass { - let Name = "Mem64"; let PredicateMethod = "isMem64"; +def X86Mem64AsmOperand : AsmOperandClass { + let Name = "Mem64"; let RenderMethod = "addMemOperands"; } -def X86Mem80AsmOperand : AsmOperandClass { - let Name = "Mem80"; let PredicateMethod = "isMem80"; +def X86Mem80AsmOperand : AsmOperandClass { + let Name = "Mem80"; let RenderMethod = "addMemOperands"; } -def X86Mem128AsmOperand : AsmOperandClass { - let Name = "Mem128"; let PredicateMethod = "isMem128"; +def X86Mem128AsmOperand : AsmOperandClass { + let Name = "Mem128"; let RenderMethod = "addMemOperands"; } -def X86Mem256AsmOperand : AsmOperandClass { - let Name = "Mem256"; let PredicateMethod = "isMem256"; +def X86Mem256AsmOperand : AsmOperandClass { + let Name = "Mem256"; let RenderMethod = "addMemOperands"; +} +def X86Mem512AsmOperand : AsmOperandClass { + let Name = "Mem512"; let RenderMethod = "addMemOperands"; } // Gather mem operands def X86MemVX32Operand : AsmOperandClass { - let Name = "MemVX32"; let PredicateMethod = "isMemVX32"; + let Name = "MemVX32"; let RenderMethod = "addMemOperands"; } def X86MemVY32Operand : AsmOperandClass { - let Name = "MemVY32"; let PredicateMethod = "isMemVY32"; + let Name = "MemVY32"; let RenderMethod = "addMemOperands"; +} +def X86MemVZ32Operand : AsmOperandClass { + let Name = "MemVZ32"; let RenderMethod = "addMemOperands"; } def X86MemVX64Operand : AsmOperandClass { - let Name = "MemVX64"; let PredicateMethod = "isMemVX64"; + let Name = "MemVX64"; let RenderMethod = "addMemOperands"; } def X86MemVY64Operand : AsmOperandClass { - let Name = "MemVY64"; let PredicateMethod = "isMemVY64"; + let Name = "MemVY64"; let RenderMethod = "addMemOperands"; +} +def X86MemVZ64Operand : AsmOperandClass { + let Name = "MemVZ64"; let RenderMethod = "addMemOperands"; } def X86AbsMemAsmOperand : AsmOperandClass { @@ -333,28 +343,36 @@ def opaque48mem : X86MemOperand<"printopaquemem">; def opaque80mem : X86MemOperand<"printopaquemem">; def opaque512mem : X86MemOperand<"printopaquemem">; -def i8mem : X86MemOperand<"printi8mem"> { +def i8mem : X86MemOperand<"printi8mem"> { let ParserMatchClass = X86Mem8AsmOperand; } -def i16mem : X86MemOperand<"printi16mem"> { +def i16mem : X86MemOperand<"printi16mem"> { let ParserMatchClass = X86Mem16AsmOperand; } -def i32mem : X86MemOperand<"printi32mem"> { +def i32mem : X86MemOperand<"printi32mem"> { let ParserMatchClass = X86Mem32AsmOperand; } -def i64mem : X86MemOperand<"printi64mem"> { +def i64mem : X86MemOperand<"printi64mem"> { let ParserMatchClass = X86Mem64AsmOperand; } -def i128mem : X86MemOperand<"printi128mem"> { +def i128mem : X86MemOperand<"printi128mem"> { let ParserMatchClass = X86Mem128AsmOperand; } -def i256mem : X86MemOperand<"printi256mem"> { +def i256mem : 
X86MemOperand<"printi256mem"> { let ParserMatchClass = X86Mem256AsmOperand; } -def f32mem : X86MemOperand<"printf32mem"> { +def i512mem : X86MemOperand<"printi512mem"> { + let ParserMatchClass = X86Mem512AsmOperand; } +def f32mem : X86MemOperand<"printf32mem"> { let ParserMatchClass = X86Mem32AsmOperand; } -def f64mem : X86MemOperand<"printf64mem"> { +def f64mem : X86MemOperand<"printf64mem"> { let ParserMatchClass = X86Mem64AsmOperand; } -def f80mem : X86MemOperand<"printf80mem"> { +def f80mem : X86MemOperand<"printf80mem"> { let ParserMatchClass = X86Mem80AsmOperand; } -def f128mem : X86MemOperand<"printf128mem"> { +def f128mem : X86MemOperand<"printf128mem"> { let ParserMatchClass = X86Mem128AsmOperand; } -def f256mem : X86MemOperand<"printf256mem">{ +def f256mem : X86MemOperand<"printf256mem">{ let ParserMatchClass = X86Mem256AsmOperand; } +def f512mem : X86MemOperand<"printf512mem">{ + let ParserMatchClass = X86Mem512AsmOperand; } +def v512mem : Operand<iPTR> { + let PrintMethod = "printf512mem"; + let MIOperandInfo = (ops ptr_rc, i8imm, VR512, i32imm, i8imm); + let ParserMatchClass = X86Mem512AsmOperand; } // Gather mem operands def vx32mem : X86MemOperand<"printi32mem">{ @@ -369,6 +387,15 @@ def vx64mem : X86MemOperand<"printi64mem">{ def vy64mem : X86MemOperand<"printi64mem">{ let MIOperandInfo = (ops ptr_rc, i8imm, VR256, i32imm, i8imm); let ParserMatchClass = X86MemVY64Operand; } +def vy64xmem : X86MemOperand<"printi64mem">{ + let MIOperandInfo = (ops ptr_rc, i8imm, VR256X, i32imm, i8imm); + let ParserMatchClass = X86MemVY64Operand; } +def vz32mem : X86MemOperand<"printi32mem">{ + let MIOperandInfo = (ops ptr_rc, i16imm, VR512, i32imm, i8imm); + let ParserMatchClass = X86MemVZ32Operand; } +def vz64mem : X86MemOperand<"printi64mem">{ + let MIOperandInfo = (ops ptr_rc, i8imm, VR512, i32imm, i8imm); + let ParserMatchClass = X86MemVZ64Operand; } } // A version of i8mem for use on x86-64 that uses GR64_NOREX instead of @@ -412,17 +439,49 @@ let OperandType = "OPERAND_PCREL", def i32imm_pcrel : Operand<i32>; def i16imm_pcrel : Operand<i16>; -def offset8 : Operand<i64>; -def offset16 : Operand<i64>; -def offset32 : Operand<i64>; -def offset64 : Operand<i64>; - // Branch targets have OtherVT type and print as pc-relative values. 
def brtarget : Operand<OtherVT>; def brtarget8 : Operand<OtherVT>; } +def X86MemOffs8AsmOperand : AsmOperandClass { + let Name = "MemOffs8"; + let RenderMethod = "addMemOffsOperands"; + let SuperClasses = [X86Mem8AsmOperand]; +} +def X86MemOffs16AsmOperand : AsmOperandClass { + let Name = "MemOffs16"; + let RenderMethod = "addMemOffsOperands"; + let SuperClasses = [X86Mem16AsmOperand]; +} +def X86MemOffs32AsmOperand : AsmOperandClass { + let Name = "MemOffs32"; + let RenderMethod = "addMemOffsOperands"; + let SuperClasses = [X86Mem32AsmOperand]; +} +def X86MemOffs64AsmOperand : AsmOperandClass { + let Name = "MemOffs64"; + let RenderMethod = "addMemOffsOperands"; + let SuperClasses = [X86Mem64AsmOperand]; +} + +let OperandType = "OPERAND_MEMORY" in { +def offset8 : Operand<i64> { + let ParserMatchClass = X86MemOffs8AsmOperand; + let PrintMethod = "printMemOffs8"; } +def offset16 : Operand<i64> { + let ParserMatchClass = X86MemOffs16AsmOperand; + let PrintMethod = "printMemOffs16"; } +def offset32 : Operand<i64> { + let ParserMatchClass = X86MemOffs32AsmOperand; + let PrintMethod = "printMemOffs32"; } +def offset64 : Operand<i64> { + let ParserMatchClass = X86MemOffs64AsmOperand; + let PrintMethod = "printMemOffs64"; } +} + + def SSECC : Operand<i8> { let PrintMethod = "printSSECC"; let OperandType = "OPERAND_IMMEDIATE"; @@ -443,6 +502,14 @@ class ImmZExtAsmOperandClass : AsmOperandClass { let RenderMethod = "addImmOperands"; } +def X86GR32orGR64AsmOperand : AsmOperandClass { + let Name = "GR32orGR64"; +} + +def GR32orGR64 : RegisterOperand<GR32> { + let ParserMatchClass = X86GR32orGR64AsmOperand; +} + // Sign-extended immediate classes. We don't need to define the full lattice // here because there is no instruction with an ambiguity between ImmSExti64i32 // and ImmSExti32i8. @@ -523,8 +590,7 @@ def i64i8imm : Operand<i64> { def lea64_32mem : Operand<i32> { let PrintMethod = "printi32mem"; - let AsmOperandLowerMethod = "lower_lea64_32mem"; - let MIOperandInfo = (ops GR32, i8imm, GR32_NOSP, i32imm, i8imm); + let MIOperandInfo = (ops GR64, i8imm, GR64_NOSP, i32imm, i8imm); let ParserMatchClass = X86MemAsmOperand; } @@ -546,7 +612,7 @@ def lea32addr : ComplexPattern<i32, 5, "SelectLEAAddr", [add, sub, mul, X86mul_imm, shl, or, frameindex], []>; // In 64-bit mode 32-bit LEAs can use RIP-relative addressing. 
-def lea64_32addr : ComplexPattern<i32, 5, "SelectLEAAddr", +def lea64_32addr : ComplexPattern<i32, 5, "SelectLEA64_32Addr", [add, sub, mul, X86mul_imm, shl, or, frameindex, X86WrapperRIP], []>; @@ -591,13 +657,22 @@ def HasSSE4A : Predicate<"Subtarget->hasSSE4A()">; def HasAVX : Predicate<"Subtarget->hasAVX()">; def HasAVX2 : Predicate<"Subtarget->hasAVX2()">; def HasAVX1Only : Predicate<"Subtarget->hasAVX() && !Subtarget->hasAVX2()">; +def HasAVX512 : Predicate<"Subtarget->hasAVX512()">; +def UseAVX : Predicate<"Subtarget->hasAVX() && !Subtarget->hasAVX512()">; +def UseAVX2 : Predicate<"Subtarget->hasAVX2() && !Subtarget->hasAVX512()">; +def NoAVX512 : Predicate<"!Subtarget->hasAVX512()">; +def HasCDI : Predicate<"Subtarget->hasCDI()">; +def HasPFI : Predicate<"Subtarget->hasPFI()">; +def HasERI : Predicate<"Subtarget->hasERI()">; def HasPOPCNT : Predicate<"Subtarget->hasPOPCNT()">; def HasAES : Predicate<"Subtarget->hasAES()">; def HasPCLMUL : Predicate<"Subtarget->hasPCLMUL()">; def HasFMA : Predicate<"Subtarget->hasFMA()">; +def UseFMAOnAVX : Predicate<"Subtarget->hasFMA() && !Subtarget->hasAVX512()">; def HasFMA4 : Predicate<"Subtarget->hasFMA4()">; def HasXOP : Predicate<"Subtarget->hasXOP()">; +def HasTBM : Predicate<"Subtarget->hasTBM()">; def HasMOVBE : Predicate<"Subtarget->hasMOVBE()">; def HasRDRAND : Predicate<"Subtarget->hasRDRAND()">; def HasF16C : Predicate<"Subtarget->hasF16C()">; @@ -609,9 +684,10 @@ def HasRTM : Predicate<"Subtarget->hasRTM()">; def HasHLE : Predicate<"Subtarget->hasHLE()">; def HasTSX : Predicate<"Subtarget->hasRTM() || Subtarget->hasHLE()">; def HasADX : Predicate<"Subtarget->hasADX()">; +def HasSHA : Predicate<"Subtarget->hasSHA()">; def HasPRFCHW : Predicate<"Subtarget->hasPRFCHW()">; def HasRDSEED : Predicate<"Subtarget->hasRDSEED()">; -def HasPrefetchW : Predicate<"Subtarget->has3DNow() || Subtarget->hasPRFCHW()">; +def HasPrefetchW : Predicate<"Subtarget->hasPRFCHW()">; def FPStackf32 : Predicate<"!Subtarget->hasSSE1()">; def FPStackf64 : Predicate<"!Subtarget->hasSSE2()">; def HasCmpxchg16b: Predicate<"Subtarget->hasCmpxchg16b()">; @@ -804,11 +880,11 @@ def POP32r : I<0x58, AddRegFrm, (outs GR32:$reg), (ins), "pop{l}\t$reg", [], IIC_POP_REG>; def POP16rmr: I<0x8F, MRM0r, (outs GR16:$reg), (ins), "pop{w}\t$reg", [], IIC_POP_REG>, OpSize; -def POP16rmm: I<0x8F, MRM0m, (outs i16mem:$dst), (ins), "pop{w}\t$dst", [], +def POP16rmm: I<0x8F, MRM0m, (outs), (ins i16mem:$dst), "pop{w}\t$dst", [], IIC_POP_MEM>, OpSize; def POP32rmr: I<0x8F, MRM0r, (outs GR32:$reg), (ins), "pop{l}\t$reg", [], IIC_POP_REG>; -def POP32rmm: I<0x8F, MRM0m, (outs i32mem:$dst), (ins), "pop{l}\t$dst", [], +def POP32rmm: I<0x8F, MRM0m, (outs), (ins i32mem:$dst), "pop{l}\t$dst", [], IIC_POP_MEM>; def POPF16 : I<0x9D, RawFrm, (outs), (ins), "popf{w}", [], IIC_POP_F>, OpSize; @@ -852,7 +928,7 @@ def POP64r : I<0x58, AddRegFrm, (outs GR64:$reg), (ins), "pop{q}\t$reg", [], IIC_POP_REG>; def POP64rmr: I<0x8F, MRM0r, (outs GR64:$reg), (ins), "pop{q}\t$reg", [], IIC_POP_REG>; -def POP64rmm: I<0x8F, MRM0m, (outs i64mem:$dst), (ins), "pop{q}\t$dst", [], +def POP64rmm: I<0x8F, MRM0m, (outs), (ins i64mem:$dst), "pop{q}\t$dst", [], IIC_POP_MEM>; } // mayLoad, SchedRW let mayStore = 1, SchedRW = [WriteStore] in { @@ -910,53 +986,56 @@ let Defs = [EFLAGS] in { def BSF16rr : I<0xBC, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src), "bsf{w}\t{$src, $dst|$dst, $src}", [(set GR16:$dst, EFLAGS, (X86bsf GR16:$src))], - IIC_BSF>, TB, OpSize, Sched<[WriteShift]>; + IIC_BIT_SCAN_REG>, TB, OpSize, 
Sched<[WriteShift]>; def BSF16rm : I<0xBC, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src), "bsf{w}\t{$src, $dst|$dst, $src}", [(set GR16:$dst, EFLAGS, (X86bsf (loadi16 addr:$src)))], - IIC_BSF>, TB, OpSize, Sched<[WriteShiftLd]>; + IIC_BIT_SCAN_MEM>, TB, OpSize, Sched<[WriteShiftLd]>; def BSF32rr : I<0xBC, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), "bsf{l}\t{$src, $dst|$dst, $src}", - [(set GR32:$dst, EFLAGS, (X86bsf GR32:$src))], IIC_BSF>, TB, + [(set GR32:$dst, EFLAGS, (X86bsf GR32:$src))], + IIC_BIT_SCAN_REG>, TB, Sched<[WriteShift]>; def BSF32rm : I<0xBC, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), "bsf{l}\t{$src, $dst|$dst, $src}", [(set GR32:$dst, EFLAGS, (X86bsf (loadi32 addr:$src)))], - IIC_BSF>, TB, Sched<[WriteShiftLd]>; + IIC_BIT_SCAN_MEM>, TB, Sched<[WriteShiftLd]>; def BSF64rr : RI<0xBC, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src), "bsf{q}\t{$src, $dst|$dst, $src}", [(set GR64:$dst, EFLAGS, (X86bsf GR64:$src))], - IIC_BSF>, TB, Sched<[WriteShift]>; + IIC_BIT_SCAN_REG>, TB, Sched<[WriteShift]>; def BSF64rm : RI<0xBC, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), "bsf{q}\t{$src, $dst|$dst, $src}", [(set GR64:$dst, EFLAGS, (X86bsf (loadi64 addr:$src)))], - IIC_BSF>, TB, Sched<[WriteShiftLd]>; + IIC_BIT_SCAN_MEM>, TB, Sched<[WriteShiftLd]>; def BSR16rr : I<0xBD, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src), "bsr{w}\t{$src, $dst|$dst, $src}", - [(set GR16:$dst, EFLAGS, (X86bsr GR16:$src))], IIC_BSR>, + [(set GR16:$dst, EFLAGS, (X86bsr GR16:$src))], + IIC_BIT_SCAN_REG>, TB, OpSize, Sched<[WriteShift]>; def BSR16rm : I<0xBD, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src), "bsr{w}\t{$src, $dst|$dst, $src}", [(set GR16:$dst, EFLAGS, (X86bsr (loadi16 addr:$src)))], - IIC_BSR>, TB, + IIC_BIT_SCAN_MEM>, TB, OpSize, Sched<[WriteShiftLd]>; def BSR32rr : I<0xBD, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), "bsr{l}\t{$src, $dst|$dst, $src}", - [(set GR32:$dst, EFLAGS, (X86bsr GR32:$src))], IIC_BSR>, TB, + [(set GR32:$dst, EFLAGS, (X86bsr GR32:$src))], + IIC_BIT_SCAN_REG>, TB, Sched<[WriteShift]>; def BSR32rm : I<0xBD, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), "bsr{l}\t{$src, $dst|$dst, $src}", [(set GR32:$dst, EFLAGS, (X86bsr (loadi32 addr:$src)))], - IIC_BSR>, TB, Sched<[WriteShiftLd]>; + IIC_BIT_SCAN_MEM>, TB, Sched<[WriteShiftLd]>; def BSR64rr : RI<0xBD, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src), "bsr{q}\t{$src, $dst|$dst, $src}", - [(set GR64:$dst, EFLAGS, (X86bsr GR64:$src))], IIC_BSR>, TB, + [(set GR64:$dst, EFLAGS, (X86bsr GR64:$src))], IIC_BIT_SCAN_REG>, TB, Sched<[WriteShift]>; def BSR64rm : RI<0xBD, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), "bsr{q}\t{$src, $dst|$dst, $src}", [(set GR64:$dst, EFLAGS, (X86bsr (loadi64 addr:$src)))], - IIC_BSR>, TB, Sched<[WriteShiftLd]>; + IIC_BIT_SCAN_MEM>, TB, Sched<[WriteShiftLd]>; } // Defs = [EFLAGS] let SchedRW = [WriteMicrocoded] in { @@ -1038,44 +1117,67 @@ def MOV64mi32 : RIi32<0xC7, MRM0m, (outs), (ins i64mem:$dst, i64i32imm:$src), [(store i64immSExt32:$src, addr:$dst)], IIC_MOV_MEM>; } // SchedRW +let hasSideEffects = 0 in { + /// moffs8, moffs16 and moffs32 versions of moves. The immediate is a /// 32-bit offset from the PC. These are only valid in x86-32 mode. 
let SchedRW = [WriteALU] in { +let mayLoad = 1 in { def MOV8o8a : Ii32 <0xA0, RawFrm, (outs), (ins offset8:$src), - "mov{b}\t{$src, %al|AL, $src}", [], IIC_MOV_MEM>, + "mov{b}\t{$src, %al|al, $src}", [], IIC_MOV_MEM>, Requires<[In32BitMode]>; def MOV16o16a : Ii32 <0xA1, RawFrm, (outs), (ins offset16:$src), - "mov{w}\t{$src, %ax|AL, $src}", [], IIC_MOV_MEM>, OpSize, + "mov{w}\t{$src, %ax|ax, $src}", [], IIC_MOV_MEM>, OpSize, Requires<[In32BitMode]>; def MOV32o32a : Ii32 <0xA1, RawFrm, (outs), (ins offset32:$src), - "mov{l}\t{$src, %eax|EAX, $src}", [], IIC_MOV_MEM>, + "mov{l}\t{$src, %eax|eax, $src}", [], IIC_MOV_MEM>, Requires<[In32BitMode]>; +} +let mayStore = 1 in { def MOV8ao8 : Ii32 <0xA2, RawFrm, (outs offset8:$dst), (ins), - "mov{b}\t{%al, $dst|$dst, AL}", [], IIC_MOV_MEM>, + "mov{b}\t{%al, $dst|$dst, al}", [], IIC_MOV_MEM>, Requires<[In32BitMode]>; def MOV16ao16 : Ii32 <0xA3, RawFrm, (outs offset16:$dst), (ins), - "mov{w}\t{%ax, $dst|$dst, AL}", [], IIC_MOV_MEM>, OpSize, + "mov{w}\t{%ax, $dst|$dst, ax}", [], IIC_MOV_MEM>, OpSize, Requires<[In32BitMode]>; def MOV32ao32 : Ii32 <0xA3, RawFrm, (outs offset32:$dst), (ins), - "mov{l}\t{%eax, $dst|$dst, EAX}", [], IIC_MOV_MEM>, + "mov{l}\t{%eax, $dst|$dst, eax}", [], IIC_MOV_MEM>, Requires<[In32BitMode]>; } +} -// FIXME: These definitions are utterly broken -// Just leave them commented out for now because they're useless outside -// of the large code model, and most compilers won't generate the instructions -// in question. -/* -def MOV64o8a : RIi8<0xA0, RawFrm, (outs), (ins offset8:$src), - "mov{q}\t{$src, %rax|RAX, $src}", []>; -def MOV64o64a : RIi32<0xA1, RawFrm, (outs), (ins offset64:$src), - "mov{q}\t{$src, %rax|RAX, $src}", []>; -def MOV64ao8 : RIi8<0xA2, RawFrm, (outs offset8:$dst), (ins), - "mov{q}\t{%rax, $dst|$dst, RAX}", []>; -def MOV64ao64 : RIi32<0xA3, RawFrm, (outs offset64:$dst), (ins), - "mov{q}\t{%rax, $dst|$dst, RAX}", []>; -*/ - +// These forms all have full 64-bit absolute addresses in their instructions +// and use the movabs mnemonic to indicate this specific form. 
+let mayLoad = 1 in { +def MOV64o8a : RIi64_NOREX<0xA0, RawFrm, (outs), (ins offset8:$src), + "movabs{b}\t{$src, %al|al, $src}", []>, + Requires<[In64BitMode]>; +def MOV64o16a : RIi64_NOREX<0xA1, RawFrm, (outs), (ins offset16:$src), + "movabs{w}\t{$src, %ax|ax, $src}", []>, OpSize, + Requires<[In64BitMode]>; +def MOV64o32a : RIi64_NOREX<0xA1, RawFrm, (outs), (ins offset32:$src), + "movabs{l}\t{$src, %eax|eax, $src}", []>, + Requires<[In64BitMode]>; +def MOV64o64a : RIi64<0xA1, RawFrm, (outs), (ins offset64:$src), + "movabs{q}\t{$src, %rax|rax, $src}", []>, + Requires<[In64BitMode]>; +} + +let mayStore = 1 in { +def MOV64ao8 : RIi64_NOREX<0xA2, RawFrm, (outs offset8:$dst), (ins), + "movabs{b}\t{%al, $dst|$dst, al}", []>, + Requires<[In64BitMode]>; +def MOV64ao16 : RIi64_NOREX<0xA3, RawFrm, (outs offset16:$dst), (ins), + "movabs{w}\t{%ax, $dst|$dst, ax}", []>, OpSize, + Requires<[In64BitMode]>; +def MOV64ao32 : RIi64_NOREX<0xA3, RawFrm, (outs offset32:$dst), (ins), + "movabs{l}\t{%eax, $dst|$dst, eax}", []>, + Requires<[In64BitMode]>; +def MOV64ao64 : RIi64<0xA3, RawFrm, (outs offset64:$dst), (ins), + "movabs{q}\t{%rax, $dst|$dst, rax}", []>, + Requires<[In64BitMode]>; +} +} // hasSideEffects = 0 let isCodeGenOnly = 1, hasSideEffects = 0, SchedRW = [WriteMove] in { def MOV8rr_REV : I<0x8A, MRMSrcReg, (outs GR8:$dst), (ins GR8:$src), @@ -1127,7 +1229,7 @@ def MOV8rr_NOREX : I<0x88, MRMDestReg, (outs GR8_NOREX:$dst), (ins GR8_NOREX:$src), "mov{b}\t{$src, $dst|$dst, $src} # NOREX", [], IIC_MOV>, Sched<[WriteMove]>; -let mayStore = 1 in +let mayStore = 1, neverHasSideEffects = 1 in def MOV8mr_NOREX : I<0x88, MRMDestMem, (outs), (ins i8mem_NOREX:$dst, GR8_NOREX:$src), "mov{b}\t{$src, $dst|$dst, $src} # NOREX", [], @@ -1408,17 +1510,17 @@ def XCHG64rr : RI<0x87, MRMSrcReg, (outs GR64:$dst), (ins GR64:$val,GR64:$src), // Swap between EAX and other registers. def XCHG16ar : I<0x90, AddRegFrm, (outs), (ins GR16:$src), - "xchg{w}\t{$src, %ax|AX, $src}", [], IIC_XCHG_REG>, OpSize; + "xchg{w}\t{$src, %ax|ax, $src}", [], IIC_XCHG_REG>, OpSize; def XCHG32ar : I<0x90, AddRegFrm, (outs), (ins GR32:$src), - "xchg{l}\t{$src, %eax|EAX, $src}", [], IIC_XCHG_REG>, + "xchg{l}\t{$src, %eax|eax, $src}", [], IIC_XCHG_REG>, Requires<[In32BitMode]>; // Uses GR32_NOAX in 64-bit mode to prevent encoding using the 0x90 NOP encoding. // xchg %eax, %eax needs to clear upper 32-bits of RAX so is not a NOP. 
def XCHG32ar64 : I<0x90, AddRegFrm, (outs), (ins GR32_NOAX:$src), - "xchg{l}\t{$src, %eax|EAX, $src}", [], IIC_XCHG_REG>, + "xchg{l}\t{$src, %eax|eax, $src}", [], IIC_XCHG_REG>, Requires<[In64BitMode]>; def XCHG64ar : RI<0x90, AddRegFrm, (outs), (ins GR64:$src), - "xchg{q}\t{$src, %rax|RAX, $src}", [], IIC_XCHG_REG>; + "xchg{q}\t{$src, %rax|rax, $src}", [], IIC_XCHG_REG>; } // SchedRW let SchedRW = [WriteALU] in { @@ -1768,6 +1870,30 @@ let Predicates = [HasBMI2], Defs = [EFLAGS] in { int_x86_bmi_bzhi_64, loadi64>, VEX_W; } +def : Pat<(X86bzhi GR32:$src1, GR8:$src2), + (BZHI32rr GR32:$src1, + (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>; +def : Pat<(X86bzhi (loadi32 addr:$src1), GR8:$src2), + (BZHI32rm addr:$src1, + (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>; +def : Pat<(X86bzhi GR64:$src1, GR8:$src2), + (BZHI64rr GR64:$src1, + (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>; +def : Pat<(X86bzhi (loadi64 addr:$src1), GR8:$src2), + (BZHI64rm addr:$src1, + (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>; + +let Predicates = [HasBMI] in { + def : Pat<(X86bextr GR32:$src1, GR32:$src2), + (BEXTR32rr GR32:$src1, GR32:$src2)>; + def : Pat<(X86bextr (loadi32 addr:$src1), GR32:$src2), + (BEXTR32rm addr:$src1, GR32:$src2)>; + def : Pat<(X86bextr GR64:$src1, GR64:$src2), + (BEXTR64rr GR64:$src1, GR64:$src2)>; + def : Pat<(X86bextr (loadi64 addr:$src1), GR64:$src2), + (BEXTR64rm addr:$src1, GR64:$src2)>; +} // HasBMI + multiclass bmi_pdep_pext<string mnemonic, RegisterClass RC, X86MemOperand x86memop, Intrinsic Int, PatFrag ld_frag> { @@ -1792,6 +1918,134 @@ let Predicates = [HasBMI2] in { } //===----------------------------------------------------------------------===// +// TBM Instructions +// +let Predicates = [HasTBM], Defs = [EFLAGS] in { + +multiclass tbm_ternary_imm_intr<bits<8> opc, RegisterClass RC, string OpcodeStr, + X86MemOperand x86memop, PatFrag ld_frag, + Intrinsic Int, Operand immtype, + SDPatternOperator immoperator> { + def ri : Ii32<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, immtype:$cntl), + !strconcat(OpcodeStr, + "\t{$cntl, $src1, $dst|$dst, $src1, $cntl}"), + [(set RC:$dst, (Int RC:$src1, immoperator:$cntl))]>, + XOP, XOPA, VEX; + def mi : Ii32<opc, MRMSrcMem, (outs RC:$dst), + (ins x86memop:$src1, immtype:$cntl), + !strconcat(OpcodeStr, + "\t{$cntl, $src1, $dst|$dst, $src1, $cntl}"), + [(set RC:$dst, (Int (ld_frag addr:$src1), immoperator:$cntl))]>, + XOP, XOPA, VEX; +} + +defm BEXTRI32 : tbm_ternary_imm_intr<0x10, GR32, "bextr", i32mem, loadi32, + int_x86_tbm_bextri_u32, i32imm, imm>; +defm BEXTRI64 : tbm_ternary_imm_intr<0x10, GR64, "bextr", i64mem, loadi64, + int_x86_tbm_bextri_u64, i64i32imm, + i64immSExt32>, VEX_W; + +multiclass tbm_binary_rm<bits<8> opc, Format FormReg, Format FormMem, + RegisterClass RC, string OpcodeStr, + X86MemOperand x86memop, PatFrag ld_frag> { +let hasSideEffects = 0 in { + def rr : I<opc, FormReg, (outs RC:$dst), (ins RC:$src), + !strconcat(OpcodeStr,"\t{$src, $dst|$dst, $src}"), + []>, XOP, XOP9, VEX_4V; + let mayLoad = 1 in + def rm : I<opc, FormMem, (outs RC:$dst), (ins x86memop:$src), + !strconcat(OpcodeStr,"\t{$src, $dst|$dst, $src}"), + []>, XOP, XOP9, VEX_4V; +} +} + +multiclass tbm_binary_intr<bits<8> opc, string OpcodeStr, + Format FormReg, Format FormMem> { + defm NAME#32 : tbm_binary_rm<opc, FormReg, FormMem, GR32, OpcodeStr, i32mem, + loadi32>; + defm NAME#64 : tbm_binary_rm<opc, FormReg, FormMem, GR64, OpcodeStr, i64mem, + loadi64>, VEX_W; +} + +defm BLCFILL : 
tbm_binary_intr<0x01, "blcfill", MRM1r, MRM1m>; +defm BLCI : tbm_binary_intr<0x02, "blci", MRM6r, MRM6m>; +defm BLCIC : tbm_binary_intr<0x01, "blcic", MRM5r, MRM5m>; +defm BLCMSK : tbm_binary_intr<0x02, "blcmsk", MRM1r, MRM1m>; +defm BLCS : tbm_binary_intr<0x01, "blcs", MRM3r, MRM3m>; +defm BLSFILL : tbm_binary_intr<0x01, "blsfill", MRM2r, MRM2m>; +defm BLSIC : tbm_binary_intr<0x01, "blsic", MRM6r, MRM6m>; +defm T1MSKC : tbm_binary_intr<0x01, "t1mskc", MRM7r, MRM7m>; +defm TZMSK : tbm_binary_intr<0x01, "tzmsk", MRM4r, MRM4m>; +} // HasTBM, EFLAGS + +//===----------------------------------------------------------------------===// +// Pattern fragments to auto generate TBM instructions. +//===----------------------------------------------------------------------===// + +let Predicates = [HasTBM] in { + def : Pat<(X86bextr GR32:$src1, (i32 imm:$src2)), + (BEXTRI32ri GR32:$src1, imm:$src2)>; + def : Pat<(X86bextr (loadi32 addr:$src1), (i32 imm:$src2)), + (BEXTRI32mi addr:$src1, imm:$src2)>; + def : Pat<(X86bextr GR64:$src1, i64immSExt32:$src2), + (BEXTRI64ri GR64:$src1, i64immSExt32:$src2)>; + def : Pat<(X86bextr (loadi64 addr:$src1), i64immSExt32:$src2), + (BEXTRI64mi addr:$src1, i64immSExt32:$src2)>; + + // FIXME: patterns for the load versions are not implemented + def : Pat<(and GR32:$src, (add GR32:$src, 1)), + (BLCFILL32rr GR32:$src)>; + def : Pat<(and GR64:$src, (add GR64:$src, 1)), + (BLCFILL64rr GR64:$src)>; + + def : Pat<(or GR32:$src, (not (add GR32:$src, 1))), + (BLCI32rr GR32:$src)>; + def : Pat<(or GR64:$src, (not (add GR64:$src, 1))), + (BLCI64rr GR64:$src)>; + + // Extra patterns because opt can optimize the above patterns to this. + def : Pat<(or GR32:$src, (sub -2, GR32:$src)), + (BLCI32rr GR32:$src)>; + def : Pat<(or GR64:$src, (sub -2, GR64:$src)), + (BLCI64rr GR64:$src)>; + + def : Pat<(and (not GR32:$src), (add GR32:$src, 1)), + (BLCIC32rr GR32:$src)>; + def : Pat<(and (not GR64:$src), (add GR64:$src, 1)), + (BLCIC64rr GR64:$src)>; + + def : Pat<(xor GR32:$src, (add GR32:$src, 1)), + (BLCMSK32rr GR32:$src)>; + def : Pat<(xor GR64:$src, (add GR64:$src, 1)), + (BLCMSK64rr GR64:$src)>; + + def : Pat<(or GR32:$src, (add GR32:$src, 1)), + (BLCS32rr GR32:$src)>; + def : Pat<(or GR64:$src, (add GR64:$src, 1)), + (BLCS64rr GR64:$src)>; + + def : Pat<(or GR32:$src, (add GR32:$src, -1)), + (BLSFILL32rr GR32:$src)>; + def : Pat<(or GR64:$src, (add GR64:$src, -1)), + (BLSFILL64rr GR64:$src)>; + + def : Pat<(or (not GR32:$src), (add GR32:$src, -1)), + (BLSIC32rr GR32:$src)>; + def : Pat<(or (not GR64:$src), (add GR64:$src, -1)), + (BLSIC64rr GR64:$src)>; + + def : Pat<(or (not GR32:$src), (add GR32:$src, 1)), + (T1MSKC32rr GR32:$src)>; + def : Pat<(or (not GR64:$src), (add GR64:$src, 1)), + (T1MSKC64rr GR64:$src)>; + + def : Pat<(and (not GR32:$src), (add GR32:$src, -1)), + (TZMSK32rr GR32:$src)>; + def : Pat<(and (not GR64:$src), (add GR64:$src, -1)), + (TZMSK64rr GR64:$src)>; +} // HasTBM + +//===----------------------------------------------------------------------===// // Subsystems. //===----------------------------------------------------------------------===// @@ -1815,6 +2069,7 @@ include "X86InstrXOP.td" // SSE, MMX and 3DNow! vector support. include "X86InstrSSE.td" +include "X86InstrAVX512.td" include "X86InstrMMX.td" include "X86Instr3DNow.td" @@ -1970,75 +2225,83 @@ def : InstAlias<"aad", (AAD8i8 10)>; def : InstAlias<"aam", (AAM8i8 10)>; // Disambiguate the mem/imm form of bt-without-a-suffix as btl. 
-def : InstAlias<"bt $imm, $mem", (BT32mi8 i32mem:$mem, i32i8imm:$imm)>;
+// Likewise for btc/btr/bts.
+def : InstAlias<"bt {$imm, $mem|$mem, $imm}",
+ (BT32mi8 i32mem:$mem, i32i8imm:$imm), 0>;
+def : InstAlias<"btc {$imm, $mem|$mem, $imm}",
+ (BTC32mi8 i32mem:$mem, i32i8imm:$imm), 0>;
+def : InstAlias<"btr {$imm, $mem|$mem, $imm}",
+ (BTR32mi8 i32mem:$mem, i32i8imm:$imm), 0>;
+def : InstAlias<"bts {$imm, $mem|$mem, $imm}",
+ (BTS32mi8 i32mem:$mem, i32i8imm:$imm), 0>;
// clr aliases.
-def : InstAlias<"clrb $reg", (XOR8rr GR8 :$reg, GR8 :$reg)>;
-def : InstAlias<"clrw $reg", (XOR16rr GR16:$reg, GR16:$reg)>;
-def : InstAlias<"clrl $reg", (XOR32rr GR32:$reg, GR32:$reg)>;
-def : InstAlias<"clrq $reg", (XOR64rr GR64:$reg, GR64:$reg)>;
+def : InstAlias<"clrb $reg", (XOR8rr GR8 :$reg, GR8 :$reg), 0>;
+def : InstAlias<"clrw $reg", (XOR16rr GR16:$reg, GR16:$reg), 0>;
+def : InstAlias<"clrl $reg", (XOR32rr GR32:$reg, GR32:$reg), 0>;
+def : InstAlias<"clrq $reg", (XOR64rr GR64:$reg, GR64:$reg), 0>;
// div and idiv aliases for explicit A register.
-def : InstAlias<"divb $src, %al", (DIV8r GR8 :$src)>;
-def : InstAlias<"divw $src, %ax", (DIV16r GR16:$src)>;
-def : InstAlias<"divl $src, %eax", (DIV32r GR32:$src)>;
-def : InstAlias<"divq $src, %rax", (DIV64r GR64:$src)>;
-def : InstAlias<"divb $src, %al", (DIV8m i8mem :$src)>;
-def : InstAlias<"divw $src, %ax", (DIV16m i16mem:$src)>;
-def : InstAlias<"divl $src, %eax", (DIV32m i32mem:$src)>;
-def : InstAlias<"divq $src, %rax", (DIV64m i64mem:$src)>;
-def : InstAlias<"idivb $src, %al", (IDIV8r GR8 :$src)>;
-def : InstAlias<"idivw $src, %ax", (IDIV16r GR16:$src)>;
-def : InstAlias<"idivl $src, %eax", (IDIV32r GR32:$src)>;
-def : InstAlias<"idivq $src, %rax", (IDIV64r GR64:$src)>;
-def : InstAlias<"idivb $src, %al", (IDIV8m i8mem :$src)>;
-def : InstAlias<"idivw $src, %ax", (IDIV16m i16mem:$src)>;
-def : InstAlias<"idivl $src, %eax", (IDIV32m i32mem:$src)>;
-def : InstAlias<"idivq $src, %rax", (IDIV64m i64mem:$src)>;
+def : InstAlias<"div{b}\t{$src, %al|al, $src}", (DIV8r GR8 :$src)>;
+def : InstAlias<"div{w}\t{$src, %ax|ax, $src}", (DIV16r GR16:$src)>;
+def : InstAlias<"div{l}\t{$src, %eax|eax, $src}", (DIV32r GR32:$src)>;
+def : InstAlias<"div{q}\t{$src, %rax|rax, $src}", (DIV64r GR64:$src)>;
+def : InstAlias<"div{b}\t{$src, %al|al, $src}", (DIV8m i8mem :$src)>;
+def : InstAlias<"div{w}\t{$src, %ax|ax, $src}", (DIV16m i16mem:$src)>;
+def : InstAlias<"div{l}\t{$src, %eax|eax, $src}", (DIV32m i32mem:$src)>;
+def : InstAlias<"div{q}\t{$src, %rax|rax, $src}", (DIV64m i64mem:$src)>;
+def : InstAlias<"idiv{b}\t{$src, %al|al, $src}", (IDIV8r GR8 :$src)>;
+def : InstAlias<"idiv{w}\t{$src, %ax|ax, $src}", (IDIV16r GR16:$src)>;
+def : InstAlias<"idiv{l}\t{$src, %eax|eax, $src}", (IDIV32r GR32:$src)>;
+def : InstAlias<"idiv{q}\t{$src, %rax|rax, $src}", (IDIV64r GR64:$src)>;
+def : InstAlias<"idiv{b}\t{$src, %al|al, $src}", (IDIV8m i8mem :$src)>;
+def : InstAlias<"idiv{w}\t{$src, %ax|ax, $src}", (IDIV16m i16mem:$src)>;
+def : InstAlias<"idiv{l}\t{$src, %eax|eax, $src}", (IDIV32m i32mem:$src)>;
+def : InstAlias<"idiv{q}\t{$src, %rax|rax, $src}", (IDIV64m i64mem:$src)>;
// Various unary fpstack operations default to operating on ST1.
// For example, "fxch" -> "fxch %st(1)" def : InstAlias<"faddp", (ADD_FPrST0 ST1), 0>; -def : InstAlias<"fsubp", (SUBR_FPrST0 ST1)>; -def : InstAlias<"fsubrp", (SUB_FPrST0 ST1)>; -def : InstAlias<"fmulp", (MUL_FPrST0 ST1)>; -def : InstAlias<"fdivp", (DIVR_FPrST0 ST1)>; -def : InstAlias<"fdivrp", (DIV_FPrST0 ST1)>; -def : InstAlias<"fxch", (XCH_F ST1)>; -def : InstAlias<"fcom", (COM_FST0r ST1)>; -def : InstAlias<"fcomp", (COMP_FST0r ST1)>; -def : InstAlias<"fcomi", (COM_FIr ST1)>; -def : InstAlias<"fcompi", (COM_FIPr ST1)>; -def : InstAlias<"fucom", (UCOM_Fr ST1)>; -def : InstAlias<"fucomp", (UCOM_FPr ST1)>; -def : InstAlias<"fucomi", (UCOM_FIr ST1)>; -def : InstAlias<"fucompi", (UCOM_FIPr ST1)>; +def : InstAlias<"fsub{|r}p", (SUBR_FPrST0 ST1), 0>; +def : InstAlias<"fsub{r|}p", (SUB_FPrST0 ST1), 0>; +def : InstAlias<"fmulp", (MUL_FPrST0 ST1), 0>; +def : InstAlias<"fdiv{|r}p", (DIVR_FPrST0 ST1), 0>; +def : InstAlias<"fdiv{r|}p", (DIV_FPrST0 ST1), 0>; +def : InstAlias<"fxch", (XCH_F ST1), 0>; +def : InstAlias<"fcom", (COM_FST0r ST1), 0>; +def : InstAlias<"fcomp", (COMP_FST0r ST1), 0>; +def : InstAlias<"fcomi", (COM_FIr ST1), 0>; +def : InstAlias<"fcompi", (COM_FIPr ST1), 0>; +def : InstAlias<"fucom", (UCOM_Fr ST1), 0>; +def : InstAlias<"fucomp", (UCOM_FPr ST1), 0>; +def : InstAlias<"fucomi", (UCOM_FIr ST1), 0>; +def : InstAlias<"fucompi", (UCOM_FIPr ST1), 0>; // Handle fmul/fadd/fsub/fdiv instructions with explicitly written st(0) op. // For example, "fadd %st(4), %st(0)" -> "fadd %st(4)". We also disambiguate // instructions like "fadd %st(0), %st(0)" as "fadd %st(0)" for consistency with // gas. multiclass FpUnaryAlias<string Mnemonic, Instruction Inst, bit EmitAlias = 1> { - def : InstAlias<!strconcat(Mnemonic, " $op, %st(0)"), + def : InstAlias<!strconcat(Mnemonic, "\t{$op, %st(0)|st(0), $op}"), (Inst RST:$op), EmitAlias>; - def : InstAlias<!strconcat(Mnemonic, " %st(0), %st(0)"), + def : InstAlias<!strconcat(Mnemonic, "\t{%st(0), %st(0)|st(0), st(0)}"), (Inst ST0), EmitAlias>; } defm : FpUnaryAlias<"fadd", ADD_FST0r>; defm : FpUnaryAlias<"faddp", ADD_FPrST0, 0>; defm : FpUnaryAlias<"fsub", SUB_FST0r>; -defm : FpUnaryAlias<"fsubp", SUBR_FPrST0>; +defm : FpUnaryAlias<"fsub{|r}p", SUBR_FPrST0>; defm : FpUnaryAlias<"fsubr", SUBR_FST0r>; -defm : FpUnaryAlias<"fsubrp", SUB_FPrST0>; +defm : FpUnaryAlias<"fsub{r|}p", SUB_FPrST0>; defm : FpUnaryAlias<"fmul", MUL_FST0r>; defm : FpUnaryAlias<"fmulp", MUL_FPrST0>; defm : FpUnaryAlias<"fdiv", DIV_FST0r>; -defm : FpUnaryAlias<"fdivp", DIVR_FPrST0>; +defm : FpUnaryAlias<"fdiv{|r}p", DIVR_FPrST0>; defm : FpUnaryAlias<"fdivr", DIVR_FST0r>; -defm : FpUnaryAlias<"fdivrp", DIV_FPrST0>; +defm : FpUnaryAlias<"fdiv{r|}p", DIV_FPrST0>; defm : FpUnaryAlias<"fcomi", COM_FIr, 0>; defm : FpUnaryAlias<"fucomi", UCOM_FIr, 0>; defm : FpUnaryAlias<"fcompi", COM_FIPr>; @@ -2048,16 +2311,16 @@ defm : FpUnaryAlias<"fucompi", UCOM_FIPr>; // Handle "f{mulp,addp} st(0), $op" the same as "f{mulp,addp} $op", since they // commute. We also allow fdiv[r]p/fsubrp even though they don't commute, // solely because gas supports it. 
-def : InstAlias<"faddp %st(0), $op", (ADD_FPrST0 RST:$op), 0>; -def : InstAlias<"fmulp %st(0), $op", (MUL_FPrST0 RST:$op)>; -def : InstAlias<"fsubp %st(0), $op", (SUBR_FPrST0 RST:$op)>; -def : InstAlias<"fsubrp %st(0), $op", (SUB_FPrST0 RST:$op)>; -def : InstAlias<"fdivp %st(0), $op", (DIVR_FPrST0 RST:$op)>; -def : InstAlias<"fdivrp %st(0), $op", (DIV_FPrST0 RST:$op)>; +def : InstAlias<"faddp\t{%st(0), $op|$op, st(0)}", (ADD_FPrST0 RST:$op), 0>; +def : InstAlias<"fmulp\t{%st(0), $op|$op, st(0)}", (MUL_FPrST0 RST:$op)>; +def : InstAlias<"fsub{|r}p\t{%st(0), $op|$op, st(0)}", (SUBR_FPrST0 RST:$op)>; +def : InstAlias<"fsub{r|}p\t{%st(0), $op|$op, st(0)}", (SUB_FPrST0 RST:$op)>; +def : InstAlias<"fdiv{|r}p\t{%st(0), $op|$op, st(0)}", (DIVR_FPrST0 RST:$op)>; +def : InstAlias<"fdiv{r|}p\t{%st(0), $op|$op, st(0)}", (DIV_FPrST0 RST:$op)>; // We accept "fnstsw %eax" even though it only writes %ax. -def : InstAlias<"fnstsw %eax", (FNSTSW16r)>; -def : InstAlias<"fnstsw %al" , (FNSTSW16r)>; +def : InstAlias<"fnstsw\t{%eax|eax}", (FNSTSW16r)>; +def : InstAlias<"fnstsw\t{%al|al}" , (FNSTSW16r)>; def : InstAlias<"fnstsw" , (FNSTSW16r)>; // lcall and ljmp aliases. This seems to be an odd mapping in 64-bit mode, but @@ -2076,12 +2339,12 @@ def : InstAlias<"imulq $imm, $r",(IMUL64rri32 GR64:$r, GR64:$r,i64i32imm:$imm)>; def : InstAlias<"imulq $imm, $r", (IMUL64rri8 GR64:$r, GR64:$r, i64i8imm:$imm)>; // inb %dx -> inb %al, %dx -def : InstAlias<"inb %dx", (IN8rr)>; -def : InstAlias<"inw %dx", (IN16rr)>; -def : InstAlias<"inl %dx", (IN32rr)>; -def : InstAlias<"inb $port", (IN8ri i8imm:$port)>; -def : InstAlias<"inw $port", (IN16ri i8imm:$port)>; -def : InstAlias<"inl $port", (IN32ri i8imm:$port)>; +def : InstAlias<"inb\t{%dx|dx}", (IN8rr), 0>; +def : InstAlias<"inw\t{%dx|dx}", (IN16rr), 0>; +def : InstAlias<"inl\t{%dx|dx}", (IN32rr), 0>; +def : InstAlias<"inb\t$port", (IN8ri i8imm:$port), 0>; +def : InstAlias<"inw\t$port", (IN16ri i8imm:$port), 0>; +def : InstAlias<"inl\t$port", (IN32ri i8imm:$port), 0>; // jmp and call aliases for lcall and ljmp. jmp $42,$5 -> ljmp @@ -2109,7 +2372,7 @@ def : InstAlias<"movq $src, $dst", // movsd with no operands (as opposed to the SSE scalar move of a double) is an // alias for movsl. (as in rep; movsd) -def : InstAlias<"movsd", (MOVSD)>; +def : InstAlias<"movsd", (MOVSD), 0>; // movsx aliases def : InstAlias<"movsx $src, $dst", (MOVSX16rr8 GR16:$dst, GR8:$src), 0>; @@ -2130,12 +2393,12 @@ def : InstAlias<"movzx $src, $dst", (MOVZX64rr16_Q GR64:$dst, GR16:$src), 0>; // Note: No GR32->GR64 movzx form. // outb %dx -> outb %al, %dx -def : InstAlias<"outb %dx", (OUT8rr)>; -def : InstAlias<"outw %dx", (OUT16rr)>; -def : InstAlias<"outl %dx", (OUT32rr)>; -def : InstAlias<"outb $port", (OUT8ir i8imm:$port)>; -def : InstAlias<"outw $port", (OUT16ir i8imm:$port)>; -def : InstAlias<"outl $port", (OUT32ir i8imm:$port)>; +def : InstAlias<"outb\t{%dx|dx}", (OUT8rr), 0>; +def : InstAlias<"outw\t{%dx|dx}", (OUT16rr), 0>; +def : InstAlias<"outl\t{%dx|dx}", (OUT32rr), 0>; +def : InstAlias<"outb\t$port", (OUT8ir i8imm:$port), 0>; +def : InstAlias<"outw\t$port", (OUT16ir i8imm:$port), 0>; +def : InstAlias<"outl\t$port", (OUT32ir i8imm:$port), 0>; // 'sldt <mem>' can be encoded with either sldtw or sldtq with the same // effect (both store to a 16-bit mem). 
Force to sldtw to avoid ambiguity @@ -2143,19 +2406,19 @@ def : InstAlias<"outl $port", (OUT32ir i8imm:$port)>; def : InstAlias<"sldt $mem", (SLDT16m i16mem:$mem)>; // shld/shrd op,op -> shld op, op, CL -def : InstAlias<"shldw $r2, $r1", (SHLD16rrCL GR16:$r1, GR16:$r2)>; -def : InstAlias<"shldl $r2, $r1", (SHLD32rrCL GR32:$r1, GR32:$r2)>; -def : InstAlias<"shldq $r2, $r1", (SHLD64rrCL GR64:$r1, GR64:$r2)>; -def : InstAlias<"shrdw $r2, $r1", (SHRD16rrCL GR16:$r1, GR16:$r2)>; -def : InstAlias<"shrdl $r2, $r1", (SHRD32rrCL GR32:$r1, GR32:$r2)>; -def : InstAlias<"shrdq $r2, $r1", (SHRD64rrCL GR64:$r1, GR64:$r2)>; - -def : InstAlias<"shldw $reg, $mem", (SHLD16mrCL i16mem:$mem, GR16:$reg)>; -def : InstAlias<"shldl $reg, $mem", (SHLD32mrCL i32mem:$mem, GR32:$reg)>; -def : InstAlias<"shldq $reg, $mem", (SHLD64mrCL i64mem:$mem, GR64:$reg)>; -def : InstAlias<"shrdw $reg, $mem", (SHRD16mrCL i16mem:$mem, GR16:$reg)>; -def : InstAlias<"shrdl $reg, $mem", (SHRD32mrCL i32mem:$mem, GR32:$reg)>; -def : InstAlias<"shrdq $reg, $mem", (SHRD64mrCL i64mem:$mem, GR64:$reg)>; +def : InstAlias<"shld{w}\t{$r2, $r1|$r1, $r2}", (SHLD16rrCL GR16:$r1, GR16:$r2), 0>; +def : InstAlias<"shld{l}\t{$r2, $r1|$r1, $r2}", (SHLD32rrCL GR32:$r1, GR32:$r2), 0>; +def : InstAlias<"shld{q}\t{$r2, $r1|$r1, $r2}", (SHLD64rrCL GR64:$r1, GR64:$r2), 0>; +def : InstAlias<"shrd{w}\t{$r2, $r1|$r1, $r2}", (SHRD16rrCL GR16:$r1, GR16:$r2), 0>; +def : InstAlias<"shrd{l}\t{$r2, $r1|$r1, $r2}", (SHRD32rrCL GR32:$r1, GR32:$r2), 0>; +def : InstAlias<"shrd{q}\t{$r2, $r1|$r1, $r2}", (SHRD64rrCL GR64:$r1, GR64:$r2), 0>; + +def : InstAlias<"shld{w}\t{$reg, $mem|$mem, $reg}", (SHLD16mrCL i16mem:$mem, GR16:$reg), 0>; +def : InstAlias<"shld{l}\t{$reg, $mem|$mem, $reg}", (SHLD32mrCL i32mem:$mem, GR32:$reg), 0>; +def : InstAlias<"shld{q}\t{$reg, $mem|$mem, $reg}", (SHLD64mrCL i64mem:$mem, GR64:$reg), 0>; +def : InstAlias<"shrd{w}\t{$reg, $mem|$mem, $reg}", (SHRD16mrCL i16mem:$mem, GR16:$reg), 0>; +def : InstAlias<"shrd{l}\t{$reg, $mem|$mem, $reg}", (SHRD32mrCL i32mem:$mem, GR32:$reg), 0>; +def : InstAlias<"shrd{q}\t{$reg, $mem|$mem, $reg}", (SHRD64mrCL i64mem:$mem, GR64:$reg), 0>; /* FIXME: This is disabled because the asm matcher is currently incapable of * matching a fixed immediate like $1. @@ -2186,19 +2449,19 @@ defm : ShiftRotateByOneAlias<"ror", "ROR">; FIXME */ // test: We accept "testX <reg>, <mem>" and "testX <mem>, <reg>" as synonyms. -def : InstAlias<"testb $val, $mem", (TEST8rm GR8 :$val, i8mem :$mem)>; -def : InstAlias<"testw $val, $mem", (TEST16rm GR16:$val, i16mem:$mem)>; -def : InstAlias<"testl $val, $mem", (TEST32rm GR32:$val, i32mem:$mem)>; -def : InstAlias<"testq $val, $mem", (TEST64rm GR64:$val, i64mem:$mem)>; +def : InstAlias<"test{b}\t{$val, $mem|$mem, $val}", (TEST8rm GR8 :$val, i8mem :$mem)>; +def : InstAlias<"test{w}\t{$val, $mem|$mem, $val}", (TEST16rm GR16:$val, i16mem:$mem)>; +def : InstAlias<"test{l}\t{$val, $mem|$mem, $val}", (TEST32rm GR32:$val, i32mem:$mem)>; +def : InstAlias<"test{q}\t{$val, $mem|$mem, $val}", (TEST64rm GR64:$val, i64mem:$mem)>; // xchg: We accept "xchgX <reg>, <mem>" and "xchgX <mem>, <reg>" as synonyms. 
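// A sketch of the synonym handling below: with the reversed operand order in
// the alias string, AT&T "xchgb (%rdx), %cl" parses through the alias while
// the opposite order is handled by the instruction definition itself; both
// select the same XCHG8rm encoding, so either spelling assembles identically.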
-def : InstAlias<"xchgb $mem, $val", (XCHG8rm GR8 :$val, i8mem :$mem)>; -def : InstAlias<"xchgw $mem, $val", (XCHG16rm GR16:$val, i16mem:$mem)>; -def : InstAlias<"xchgl $mem, $val", (XCHG32rm GR32:$val, i32mem:$mem)>; -def : InstAlias<"xchgq $mem, $val", (XCHG64rm GR64:$val, i64mem:$mem)>; +def : InstAlias<"xchg{b}\t{$mem, $val|$val, $mem}", (XCHG8rm GR8 :$val, i8mem :$mem)>; +def : InstAlias<"xchg{w}\t{$mem, $val|$val, $mem}", (XCHG16rm GR16:$val, i16mem:$mem)>; +def : InstAlias<"xchg{l}\t{$mem, $val|$val, $mem}", (XCHG32rm GR32:$val, i32mem:$mem)>; +def : InstAlias<"xchg{q}\t{$mem, $val|$val, $mem}", (XCHG64rm GR64:$val, i64mem:$mem)>; // xchg: We accept "xchgX <reg>, %eax" and "xchgX %eax, <reg>" as synonyms. -def : InstAlias<"xchgw %ax, $src", (XCHG16ar GR16:$src)>; -def : InstAlias<"xchgl %eax, $src", (XCHG32ar GR32:$src)>, Requires<[In32BitMode]>; -def : InstAlias<"xchgl %eax, $src", (XCHG32ar64 GR32_NOAX:$src)>, Requires<[In64BitMode]>; -def : InstAlias<"xchgq %rax, $src", (XCHG64ar GR64:$src)>; +def : InstAlias<"xchg{w}\t{%ax, $src|$src, ax}", (XCHG16ar GR16:$src)>; +def : InstAlias<"xchg{l}\t{%eax, $src|$src, eax}", (XCHG32ar GR32:$src)>, Requires<[In32BitMode]>; +def : InstAlias<"xchg{l}\t{%eax, $src|$src, eax}", (XCHG32ar64 GR32_NOAX:$src)>, Requires<[In64BitMode]>; +def : InstAlias<"xchg{q}\t{%rax, $src|$src, rax}", (XCHG64ar GR64:$src)>; diff --git a/contrib/llvm/lib/Target/X86/X86InstrMMX.td b/contrib/llvm/lib/Target/X86/X86InstrMMX.td index 10efad9..ba58143 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrMMX.td +++ b/contrib/llvm/lib/Target/X86/X86InstrMMX.td @@ -204,7 +204,7 @@ multiclass sse12_cvt_pint_3addr<bits<8> opc, RegisterClass SrcRC, //===----------------------------------------------------------------------===// def MMX_EMMS : MMXI<0x77, RawFrm, (outs), (ins), "emms", - [(int_x86_mmx_emms)]>; + [(int_x86_mmx_emms)], IIC_MMX_EMMS>; //===----------------------------------------------------------------------===// // MMX Scalar Instructions @@ -236,10 +236,10 @@ def MMX_MOVD64grr : MMXI<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR64:$src), (MMX_X86movd2w (x86mmx VR64:$src)))], IIC_MMX_MOV_REG_MM>, Sched<[WriteMove]>; -let neverHasSideEffects = 1 in def MMX_MOVD64to64rr : MMXRI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR64:$src), "movd\t{$src, $dst|$dst, $src}", - [], IIC_MMX_MOV_MM_RM>, Sched<[WriteMove]>; + [(set VR64:$dst, (bitconvert GR64:$src))], + IIC_MMX_MOV_MM_RM>, Sched<[WriteMove]>; // These are 64 bit moves, but since the OS X assembler doesn't // recognize a register-register movq, we write them as @@ -250,10 +250,6 @@ def MMX_MOVD64from64rr : MMXRI<0x7E, MRMDestReg, "movd\t{$src, $dst|$dst, $src}", [(set GR64:$dst, (bitconvert VR64:$src))], IIC_MMX_MOV_REG_MM>; -def MMX_MOVD64rrv164 : MMXRI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR64:$src), - "movd\t{$src, $dst|$dst, $src}", - [(set VR64:$dst, - (bitconvert GR64:$src))], IIC_MMX_MOV_MM_RM>; let neverHasSideEffects = 1 in def MMX_MOVQ64rr : MMXI<0x6F, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src), "movq\t{$src, $dst|$dst, $src}", [], @@ -289,7 +285,7 @@ def MMX_MOVQ2DQrr : MMXS2SIi8<0xD6, MRMSrcReg, (outs VR128:$dst), (i64 (bitconvert (x86mmx VR64:$src))))))], IIC_MMX_MOVQ_RR>; -let neverHasSideEffects = 1 in +let isCodeGenOnly = 1, hasSideEffects = 1 in { def MMX_MOVQ2FR64rr: MMXS2SIi8<0xD6, MRMSrcReg, (outs FR64:$dst), (ins VR64:$src), "movq2dq\t{$src, $dst|$dst, $src}", [], IIC_MMX_MOVQ_RR>; @@ -297,6 +293,7 @@ def MMX_MOVQ2FR64rr: MMXS2SIi8<0xD6, MRMSrcReg, (outs FR64:$dst), def MMX_MOVFR642Qrr: 
MMXSDIi8<0xD6, MRMSrcReg, (outs VR64:$dst), (ins FR64:$src), "movdq2q\t{$src, $dst|$dst, $src}", [], IIC_MMX_MOVQ_RR>; +} } // SchedRW def MMX_MOVNTQmr : MMXI<0xE7, MRMDestMem, (outs), (ins i64mem:$dst, VR64:$src), @@ -304,21 +301,15 @@ def MMX_MOVNTQmr : MMXI<0xE7, MRMDestMem, (outs), (ins i64mem:$dst, VR64:$src), [(int_x86_mmx_movnt_dq addr:$dst, VR64:$src)], IIC_MMX_MOVQ_RM>, Sched<[WriteStore]>; -let AddedComplexity = 15 in -// movd to MMX register zero-extends -def MMX_MOVZDI2PDIrr : MMXI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR32:$src), - "movd\t{$src, $dst|$dst, $src}", - [(set VR64:$dst, - (x86mmx (X86vzmovl (x86mmx (scalar_to_vector GR32:$src)))))], - IIC_MMX_MOV_MM_RM>, Sched<[WriteMove]>; -let AddedComplexity = 20 in -def MMX_MOVZDI2PDIrm : MMXI<0x6E, MRMSrcMem, (outs VR64:$dst), - (ins i32mem:$src), - "movd\t{$src, $dst|$dst, $src}", - [(set VR64:$dst, - (x86mmx (X86vzmovl (x86mmx - (scalar_to_vector (loadi32 addr:$src))))))], - IIC_MMX_MOV_MM_RM>, Sched<[WriteLoad]>; +let Predicates = [HasMMX] in { + let AddedComplexity = 15 in + // movd to MMX register zero-extends + def : Pat<(x86mmx (X86vzmovl (x86mmx (scalar_to_vector GR32:$src)))), + (MMX_MOVD64rr GR32:$src)>; + let AddedComplexity = 20 in + def : Pat<(x86mmx (X86vzmovl (x86mmx (scalar_to_vector (loadi32 addr:$src))))), + (MMX_MOVD64rm addr:$src)>; +} // Arithmetic Instructions defm MMX_PABSB : SS3I_unop_rm_int_mm<0x1C, "pabsb", int_x86_ssse3_pabs_b, @@ -358,21 +349,21 @@ defm MMX_PHADDSW : SS3I_binop_rm_int_mm<0x03, "phaddsw",int_x86_ssse3_phadd_sw, defm MMX_PSUBB : MMXI_binop_rm_int<0xF8, "psubb", int_x86_mmx_psub_b, MMX_INTALU_ITINS>; defm MMX_PSUBW : MMXI_binop_rm_int<0xF9, "psubw", int_x86_mmx_psub_w, - MMX_INTALU_ITINS, 1>; + MMX_INTALU_ITINS>; defm MMX_PSUBD : MMXI_binop_rm_int<0xFA, "psubd", int_x86_mmx_psub_d, - MMX_INTALU_ITINS, 1>; + MMX_INTALU_ITINS>; defm MMX_PSUBQ : MMXI_binop_rm_int<0xFB, "psubq", int_x86_mmx_psub_q, - MMX_INTALUQ_ITINS, 1>; + MMX_INTALUQ_ITINS>; defm MMX_PSUBSB : MMXI_binop_rm_int<0xE8, "psubsb" , int_x86_mmx_psubs_b, - MMX_INTALU_ITINS, 1>; + MMX_INTALU_ITINS>; defm MMX_PSUBSW : MMXI_binop_rm_int<0xE9, "psubsw" , int_x86_mmx_psubs_w, - MMX_INTALU_ITINS, 1>; + MMX_INTALU_ITINS>; defm MMX_PSUBUSB : MMXI_binop_rm_int<0xD8, "psubusb", int_x86_mmx_psubus_b, - MMX_INTALU_ITINS, 1>; + MMX_INTALU_ITINS>; defm MMX_PSUBUSW : MMXI_binop_rm_int<0xD9, "psubusw", int_x86_mmx_psubus_w, - MMX_INTALU_ITINS, 1>; + MMX_INTALU_ITINS>; defm MMX_PHSUBW : SS3I_binop_rm_int_mm<0x05, "phsubw", int_x86_ssse3_phsub_w, MMX_PHADDSUBW>; @@ -555,18 +546,18 @@ let Constraints = "$src1 = $dst" in { // Extract / Insert def MMX_PEXTRWirri: MMXIi8<0xC5, MRMSrcReg, - (outs GR32:$dst), (ins VR64:$src1, i32i8imm:$src2), - "pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}", - [(set GR32:$dst, (int_x86_mmx_pextr_w VR64:$src1, - (iPTR imm:$src2)))], - IIC_MMX_PEXTR>, Sched<[WriteShuffle]>; + (outs GR32orGR64:$dst), (ins VR64:$src1, i32i8imm:$src2), + "pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set GR32orGR64:$dst, (int_x86_mmx_pextr_w VR64:$src1, + (iPTR imm:$src2)))], + IIC_MMX_PEXTR>, Sched<[WriteShuffle]>; let Constraints = "$src1 = $dst" in { def MMX_PINSRWirri : MMXIi8<0xC4, MRMSrcReg, (outs VR64:$dst), - (ins VR64:$src1, GR32:$src2, i32i8imm:$src3), + (ins VR64:$src1, GR32orGR64:$src2, i32i8imm:$src3), "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(set VR64:$dst, (int_x86_mmx_pinsr_w VR64:$src1, - GR32:$src2, (iPTR imm:$src3)))], + GR32orGR64:$src2, (iPTR imm:$src3)))], IIC_MMX_PINSRW>, 
Sched<[WriteShuffle]>; def MMX_PINSRWirmi : MMXIi8<0xC4, MRMSrcMem, @@ -580,9 +571,10 @@ let Constraints = "$src1 = $dst" in { } // Mask creation -def MMX_PMOVMSKBrr : MMXI<0xD7, MRMSrcReg, (outs GR32:$dst), (ins VR64:$src), +def MMX_PMOVMSKBrr : MMXI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst), + (ins VR64:$src), "pmovmskb\t{$src, $dst|$dst, $src}", - [(set GR32:$dst, + [(set GR32orGR64:$dst, (int_x86_mmx_pmovmskb VR64:$src))]>; @@ -599,10 +591,10 @@ def : Pat<(x86mmx (MMX_X86movdq2q (loadv2i64 addr:$src))), // Misc. let SchedRW = [WriteShuffle] in { let Uses = [EDI] in -def MMX_MASKMOVQ : MMXI<0xF7, MRMSrcReg, (outs), (ins VR64:$src, VR64:$mask), - "maskmovq\t{$mask, $src|$src, $mask}", - [(int_x86_mmx_maskmovq VR64:$src, VR64:$mask, EDI)], - IIC_MMX_MASKMOV>; +def MMX_MASKMOVQ : MMXI32<0xF7, MRMSrcReg, (outs), (ins VR64:$src, VR64:$mask), + "maskmovq\t{$mask, $src|$src, $mask}", + [(int_x86_mmx_maskmovq VR64:$src, VR64:$mask, EDI)], + IIC_MMX_MASKMOV>; let Uses = [RDI] in def MMX_MASKMOVQ64: MMXI64<0xF7, MRMSrcReg, (outs), (ins VR64:$src, VR64:$mask), "maskmovq\t{$mask, $src|$src, $mask}", diff --git a/contrib/llvm/lib/Target/X86/X86InstrSSE.td b/contrib/llvm/lib/Target/X86/X86InstrSSE.td index cce938b..a5debc0 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrSSE.td +++ b/contrib/llvm/lib/Target/X86/X86InstrSSE.td @@ -151,6 +151,34 @@ def SSE_MOVU_ITINS : OpndItins< IIC_SSE_MOVU_P_RR, IIC_SSE_MOVU_P_RM >; +def SSE_DPPD_ITINS : OpndItins< + IIC_SSE_DPPD_RR, IIC_SSE_DPPD_RM +>; + +def SSE_DPPS_ITINS : OpndItins< + IIC_SSE_DPPS_RR, IIC_SSE_DPPD_RM +>; + +def DEFAULT_ITINS : OpndItins< + IIC_ALU_NONMEM, IIC_ALU_MEM +>; + +def SSE_EXTRACT_ITINS : OpndItins< + IIC_SSE_EXTRACTPS_RR, IIC_SSE_EXTRACTPS_RM +>; + +def SSE_INSERT_ITINS : OpndItins< + IIC_SSE_INSERTPS_RR, IIC_SSE_INSERTPS_RM +>; + +def SSE_MPSADBW_ITINS : OpndItins< + IIC_SSE_MPSADBW_RR, IIC_SSE_MPSADBW_RM +>; + +def SSE_PMULLD_ITINS : OpndItins< + IIC_SSE_PMULLD_RR, IIC_SSE_PMULLD_RM +>; + //===----------------------------------------------------------------------===// // SSE 1 & 2 Instructions Classes //===----------------------------------------------------------------------===// @@ -455,10 +483,10 @@ let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, // SSE 1 & 2 - Move FP Scalar Instructions // // Move Instructions. Register-to-register movss/movsd is not used for FR32/64 -// register copies because it's a partial register update; FsMOVAPSrr/FsMOVAPDrr -// is used instead. Register-to-register movss/movsd is not modeled as an -// INSERT_SUBREG because INSERT_SUBREG requires that the insert be implementable -// in terms of a copy, and just mentioned, we don't use movss/movsd for copies. +// register copies because it's a partial register update; Register-to-register +// movss/movsd is not modeled as an INSERT_SUBREG because INSERT_SUBREG requires +// that the insert be implementable in terms of a copy, and just mentioned, we +// don't use movss/movsd for copies. //===----------------------------------------------------------------------===// multiclass sse12_move_rr<RegisterClass RC, SDNode OpNode, ValueType vt, @@ -526,7 +554,7 @@ let canFoldAsLoad = 1, isReMaterializable = 1 in { } // Patterns -let Predicates = [HasAVX] in { +let Predicates = [UseAVX] in { let AddedComplexity = 15 in { // Move scalar to XMM zero-extended, zeroing a VR128 then do a // MOVS{S,D} to the lower bits. 
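// To illustrate the partial-register-update note above (a sketch in AT&T
// syntax): a full-width copy such as
//   movaps %xmm1, %xmm0    (writes the entire 128-bit register)
// is preferred over
//   movss  %xmm1, %xmm0    (merges into the low 32 bits only, keeping a
//                           dependency on the old value of %xmm0)
// which is why reg-reg movss/movsd is reserved for the insert and
// zero-extend patterns rather than for plain copies.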
@@ -1074,23 +1102,6 @@ let Predicates = [UseSSE1] in { (MOVUPSmr addr:$dst, VR128:$src)>; } -// Alias instruction to do FR32 or FR64 reg-to-reg copy using movaps. Upper -// bits are disregarded. FIXME: Set encoding to pseudo! -let neverHasSideEffects = 1, SchedRW = [WriteMove] in { -def FsVMOVAPSrr : VPSI<0x28, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src), - "movaps\t{$src, $dst|$dst, $src}", [], - IIC_SSE_MOVA_P_RR>, VEX; -def FsVMOVAPDrr : VPDI<0x28, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src), - "movapd\t{$src, $dst|$dst, $src}", [], - IIC_SSE_MOVA_P_RR>, VEX; -def FsMOVAPSrr : PSI<0x28, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src), - "movaps\t{$src, $dst|$dst, $src}", [], - IIC_SSE_MOVA_P_RR>; -def FsMOVAPDrr : PDI<0x28, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src), - "movapd\t{$src, $dst|$dst, $src}", [], - IIC_SSE_MOVA_P_RR>; -} - // Alias instruction to load FR32 or FR64 from f128mem using movaps. Upper // bits are disregarded. FIXME: Set encoding to pseudo! let canFoldAsLoad = 1, isReMaterializable = 1, SchedRW = [WriteLoad] in { @@ -1103,15 +1114,15 @@ let isCodeGenOnly = 1 in { "movapd\t{$src, $dst|$dst, $src}", [(set FR64:$dst, (alignedloadfsf64 addr:$src))], IIC_SSE_MOVA_P_RM>, VEX; + def FsMOVAPSrm : PSI<0x28, MRMSrcMem, (outs FR32:$dst), (ins f128mem:$src), + "movaps\t{$src, $dst|$dst, $src}", + [(set FR32:$dst, (alignedloadfsf32 addr:$src))], + IIC_SSE_MOVA_P_RM>; + def FsMOVAPDrm : PDI<0x28, MRMSrcMem, (outs FR64:$dst), (ins f128mem:$src), + "movapd\t{$src, $dst|$dst, $src}", + [(set FR64:$dst, (alignedloadfsf64 addr:$src))], + IIC_SSE_MOVA_P_RM>; } -def FsMOVAPSrm : PSI<0x28, MRMSrcMem, (outs FR32:$dst), (ins f128mem:$src), - "movaps\t{$src, $dst|$dst, $src}", - [(set FR32:$dst, (alignedloadfsf32 addr:$src))], - IIC_SSE_MOVA_P_RM>; -def FsMOVAPDrm : PDI<0x28, MRMSrcMem, (outs FR64:$dst), (ins f128mem:$src), - "movapd\t{$src, $dst|$dst, $src}", - [(set FR64:$dst, (alignedloadfsf64 addr:$src))], - IIC_SSE_MOVA_P_RM>; } //===----------------------------------------------------------------------===// @@ -1327,7 +1338,7 @@ let Predicates = [UseSSE2] in { // SSE 1 & 2 - Move Low to High and High to Low packed FP Instructions //===----------------------------------------------------------------------===// -let AddedComplexity = 20 in { +let AddedComplexity = 20, Predicates = [UseAVX] in { def VMOVLHPSrr : VPSI<0x16, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), "movlhps\t{$src2, $src1, $dst|$dst, $src1, $src2}", @@ -1358,7 +1369,7 @@ let Constraints = "$src1 = $dst", AddedComplexity = 20 in { IIC_SSE_MOV_LH>, Sched<[WriteShuffle]>; } -let Predicates = [HasAVX] in { +let Predicates = [UseAVX] in { // MOVLHPS patterns def : Pat<(v4i32 (X86Movlhps VR128:$src1, VR128:$src2)), (VMOVLHPSrr VR128:$src1, VR128:$src2)>; @@ -1440,7 +1451,7 @@ let neverHasSideEffects = 1 in { multiclass sse12_vcvt_avx<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC, X86MemOperand x86memop, string asm> { -let neverHasSideEffects = 1 in { +let neverHasSideEffects = 1, Predicates = [UseAVX] in { def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src), !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>, Sched<[WriteCvtI2F]>; @@ -1452,6 +1463,7 @@ let neverHasSideEffects = 1 in { } // neverHasSideEffects = 1 } +let Predicates = [UseAVX] in { defm VCVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, fp_to_sint, f32mem, loadf32, "cvttss2si\t{$src, $dst|$dst, $src}", SSE_CVT_SS2SI_32>, @@ -1485,7 +1497,7 @@ def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}", 
(VCVTTSD2SI64rr GR64:$dst, FR64:$src), 0>; def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}", (VCVTTSD2SI64rm GR64:$dst, f64mem:$src), 0>; - +} // The assembler can recognize rr 64-bit instructions by seeing a rxx // register, but the same isn't true when only using memory operands, // provide other assembly "l" and "q" forms to address this explicitly @@ -1499,12 +1511,12 @@ defm VCVTSI2SD : sse12_vcvt_avx<0x2A, GR32, FR64, i32mem, "cvtsi2sd{l}">, defm VCVTSI2SD64 : sse12_vcvt_avx<0x2A, GR64, FR64, i64mem, "cvtsi2sd{q}">, XD, VEX_4V, VEX_W, VEX_LIG; -def : InstAlias<"vcvtsi2ss\t{$src, $src1, $dst|$dst, $src1, $src}", +let Predicates = [UseAVX] in { + def : InstAlias<"vcvtsi2ss\t{$src, $src1, $dst|$dst, $src1, $src}", (VCVTSI2SSrm FR64:$dst, FR64:$src1, i32mem:$src)>; -def : InstAlias<"vcvtsi2sd\t{$src, $src1, $dst|$dst, $src1, $src}", + def : InstAlias<"vcvtsi2sd\t{$src, $src1, $dst|$dst, $src1, $src}", (VCVTSI2SDrm FR64:$dst, FR64:$src1, i32mem:$src)>; -let Predicates = [HasAVX] in { def : Pat<(f32 (sint_to_fp (loadi32 addr:$src))), (VCVTSI2SSrm (f32 (IMPLICIT_DEF)), addr:$src)>; def : Pat<(f32 (sint_to_fp (loadi64 addr:$src))), @@ -1606,19 +1618,21 @@ multiclass sse12_cvt_sint_3addr<bits<8> opc, RegisterClass SrcRC, itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>; } +let Predicates = [UseAVX] in { defm VCVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse2_cvtsd2si, sdmem, sse_load_f64, "cvtsd2si", SSE_CVT_SD2SI>, XD, VEX, VEX_LIG; defm VCVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse2_cvtsd2si64, sdmem, sse_load_f64, "cvtsd2si", SSE_CVT_SD2SI>, XD, VEX, VEX_W, VEX_LIG; - +} defm CVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse2_cvtsd2si, sdmem, sse_load_f64, "cvtsd2si", SSE_CVT_SD2SI>, XD; defm CVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse2_cvtsd2si64, sdmem, sse_load_f64, "cvtsd2si", SSE_CVT_SD2SI>, XD, REX_W; +let Predicates = [UseAVX] in { defm Int_VCVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128, int_x86_sse_cvtsi2ss, i32mem, loadi32, "cvtsi2ss{l}", SSE_CVT_Scalar, 0>, XS, VEX_4V; @@ -1633,7 +1647,7 @@ defm Int_VCVTSI2SD64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128, int_x86_sse2_cvtsi642sd, i64mem, loadi64, "cvtsi2sd{q}", SSE_CVT_Scalar, 0>, XD, VEX_4V, VEX_W; - +} let Constraints = "$src1 = $dst" in { defm Int_CVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128, int_x86_sse_cvtsi2ss, i32mem, loadi32, @@ -1652,6 +1666,7 @@ let Constraints = "$src1 = $dst" in { /// SSE 1 Only // Aliases for intrinsics +let Predicates = [UseAVX] in { defm Int_VCVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse_cvttss2si, ssmem, sse_load_f32, "cvttss2si", SSE_CVT_SS2SI_32>, XS, VEX; @@ -1666,6 +1681,7 @@ defm Int_VCVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, int_x86_sse2_cvttsd2si64, sdmem, sse_load_f64, "cvttsd2si", SSE_CVT_SD2SI>, XD, VEX, VEX_W; +} defm Int_CVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse_cvttss2si, ssmem, sse_load_f32, "cvttss2si", SSE_CVT_SS2SI_32>, XS; @@ -1679,13 +1695,14 @@ defm Int_CVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, int_x86_sse2_cvttsd2si64, sdmem, sse_load_f64, "cvttsd2si", SSE_CVT_SD2SI>, XD, REX_W; +let Predicates = [UseAVX] in { defm VCVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse_cvtss2si, ssmem, sse_load_f32, "cvtss2si", SSE_CVT_SS2SI_32>, XS, VEX, VEX_LIG; defm VCVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse_cvtss2si64, ssmem, sse_load_f32, "cvtss2si", SSE_CVT_SS2SI_64>, XS, VEX, VEX_W, VEX_LIG; - +} defm CVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse_cvtss2si, 
ssmem, sse_load_f32, "cvtss2si", SSE_CVT_SS2SI_32>, XS; @@ -1707,6 +1724,7 @@ defm CVTDQ2PS : sse12_cvt_p<0x5B, VR128, VR128, i128mem, SSEPackedSingle, SSE_CVT_PS>, TB, Requires<[UseSSE2]>; +let Predicates = [UseAVX] in { def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}", (VCVTSS2SIrr GR32:$dst, VR128:$src), 0>; def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}", @@ -1723,6 +1741,7 @@ def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}", (VCVTSD2SI64rr GR64:$dst, VR128:$src), 0>; def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}", (VCVTSD2SI64rm GR64:$dst, sdmem:$src), 0>; +} def : InstAlias<"cvtss2si{l}\t{$src, $dst|$dst, $src}", (CVTSS2SIrr GR32:$dst, VR128:$src), 0>; @@ -1744,7 +1763,7 @@ def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}", /// SSE 2 Only // Convert scalar double to scalar single -let neverHasSideEffects = 1 in { +let neverHasSideEffects = 1, Predicates = [UseAVX] in { def VCVTSD2SSrr : VSDI<0x5A, MRMSrcReg, (outs FR32:$dst), (ins FR64:$src1, FR64:$src2), "cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", [], @@ -1760,7 +1779,7 @@ def VCVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst), } def : Pat<(f32 (fround FR64:$src)), (VCVTSD2SSrr FR64:$src, FR64:$src)>, - Requires<[HasAVX]>; + Requires<[UseAVX]>; def CVTSD2SSrr : SDI<0x5A, MRMSrcReg, (outs FR32:$dst), (ins FR64:$src), "cvtsd2ss\t{$src, $dst|$dst, $src}", @@ -1778,27 +1797,27 @@ def Int_VCVTSD2SSrr: I<0x5A, MRMSrcReg, "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set VR128:$dst, (int_x86_sse2_cvtsd2ss VR128:$src1, VR128:$src2))], - IIC_SSE_CVT_Scalar_RR>, XD, VEX_4V, Requires<[HasAVX]>, + IIC_SSE_CVT_Scalar_RR>, XD, VEX_4V, Requires<[UseAVX]>, Sched<[WriteCvtF2F]>; def Int_VCVTSD2SSrm: I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2), "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set VR128:$dst, (int_x86_sse2_cvtsd2ss VR128:$src1, sse_load_f64:$src2))], - IIC_SSE_CVT_Scalar_RM>, XD, VEX_4V, Requires<[HasAVX]>, + IIC_SSE_CVT_Scalar_RM>, XD, VEX_4V, Requires<[UseAVX]>, Sched<[WriteCvtF2FLd, ReadAfterLd]>; let Constraints = "$src1 = $dst" in { def Int_CVTSD2SSrr: I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), - "cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", + "cvtsd2ss\t{$src2, $dst|$dst, $src2}", [(set VR128:$dst, (int_x86_sse2_cvtsd2ss VR128:$src1, VR128:$src2))], IIC_SSE_CVT_Scalar_RR>, XD, Requires<[UseSSE2]>, Sched<[WriteCvtF2F]>; def Int_CVTSD2SSrm: I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2), - "cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", + "cvtsd2ss\t{$src2, $dst|$dst, $src2}", [(set VR128:$dst, (int_x86_sse2_cvtsd2ss VR128:$src1, sse_load_f64:$src2))], IIC_SSE_CVT_Scalar_RM>, XD, Requires<[UseSSE2]>, @@ -1807,7 +1826,7 @@ def Int_CVTSD2SSrm: I<0x5A, MRMSrcReg, // Convert scalar single to scalar double // SSE2 instructions with XS prefix -let neverHasSideEffects = 1 in { +let neverHasSideEffects = 1, Predicates = [UseAVX] in { def VCVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR32:$src1, FR32:$src2), "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", @@ -1824,16 +1843,16 @@ def VCVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), } def : Pat<(f64 (fextend FR32:$src)), - (VCVTSS2SDrr FR32:$src, FR32:$src)>, Requires<[HasAVX]>; + (VCVTSS2SDrr FR32:$src, FR32:$src)>, Requires<[UseAVX]>; def : Pat<(fextend (loadf32 addr:$src)), - (VCVTSS2SDrm (f32 (IMPLICIT_DEF)), addr:$src)>, Requires<[HasAVX]>; + (VCVTSS2SDrm (f32 (IMPLICIT_DEF)), addr:$src)>, Requires<[UseAVX]>; def : 
Pat<(extloadf32 addr:$src), (VCVTSS2SDrm (f32 (IMPLICIT_DEF)), addr:$src)>, - Requires<[HasAVX, OptForSize]>; + Requires<[UseAVX, OptForSize]>; def : Pat<(extloadf32 addr:$src), (VCVTSS2SDrr (f32 (IMPLICIT_DEF)), (VMOVSSrm addr:$src))>, - Requires<[HasAVX, OptForSpeed]>; + Requires<[UseAVX, OptForSpeed]>; def CVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR32:$src), "cvtss2sd\t{$src, $dst|$dst, $src}", @@ -1861,14 +1880,14 @@ def Int_VCVTSS2SDrr: I<0x5A, MRMSrcReg, "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set VR128:$dst, (int_x86_sse2_cvtss2sd VR128:$src1, VR128:$src2))], - IIC_SSE_CVT_Scalar_RR>, XS, VEX_4V, Requires<[HasAVX]>, + IIC_SSE_CVT_Scalar_RR>, XS, VEX_4V, Requires<[UseAVX]>, Sched<[WriteCvtF2F]>; def Int_VCVTSS2SDrm: I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2), "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set VR128:$dst, (int_x86_sse2_cvtss2sd VR128:$src1, sse_load_f32:$src2))], - IIC_SSE_CVT_Scalar_RM>, XS, VEX_4V, Requires<[HasAVX]>, + IIC_SSE_CVT_Scalar_RM>, XS, VEX_4V, Requires<[UseAVX]>, Sched<[WriteCvtF2FLd, ReadAfterLd]>; let Constraints = "$src1 = $dst" in { // SSE2 instructions with XS prefix def Int_CVTSS2SDrr: I<0x5A, MRMSrcReg, @@ -1895,7 +1914,7 @@ def VCVTPS2DQrr : VPDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), def VCVTPS2DQrm : VPDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), "cvtps2dq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, - (int_x86_sse2_cvtps2dq (memopv4f32 addr:$src)))], + (int_x86_sse2_cvtps2dq (loadv4f32 addr:$src)))], IIC_SSE_CVT_PS_RM>, VEX, Sched<[WriteCvtF2ILd]>; def VCVTPS2DQYrr : VPDI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), "cvtps2dq\t{$src, $dst|$dst, $src}", @@ -1905,7 +1924,7 @@ def VCVTPS2DQYrr : VPDI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), def VCVTPS2DQYrm : VPDI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), "cvtps2dq\t{$src, $dst|$dst, $src}", [(set VR256:$dst, - (int_x86_avx_cvt_ps2dq_256 (memopv8f32 addr:$src)))], + (int_x86_avx_cvt_ps2dq_256 (loadv8f32 addr:$src)))], IIC_SSE_CVT_PS_RM>, VEX, VEX_L, Sched<[WriteCvtF2ILd]>; def CVTPS2DQrr : PDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvtps2dq\t{$src, $dst|$dst, $src}", @@ -1934,7 +1953,7 @@ def : InstAlias<"vcvtpd2dqx\t{$src, $dst|$dst, $src}", def VCVTPD2DQXrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), "vcvtpd2dqx\t{$src, $dst|$dst, $src}", [(set VR128:$dst, - (int_x86_sse2_cvtpd2dq (memopv2f64 addr:$src)))]>, VEX, + (int_x86_sse2_cvtpd2dq (loadv2f64 addr:$src)))]>, VEX, Sched<[WriteCvtF2ILd]>; // YMM only @@ -1946,7 +1965,7 @@ def VCVTPD2DQYrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src), def VCVTPD2DQYrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src), "vcvtpd2dq{y}\t{$src, $dst|$dst, $src}", [(set VR128:$dst, - (int_x86_avx_cvt_pd2dq_256 (memopv4f64 addr:$src)))]>, + (int_x86_avx_cvt_pd2dq_256 (loadv4f64 addr:$src)))]>, VEX, VEX_L, Sched<[WriteCvtF2ILd]>; def : InstAlias<"vcvtpd2dq\t{$src, $dst|$dst, $src}", (VCVTPD2DQYrr VR128:$dst, VR256:$src)>; @@ -1972,7 +1991,7 @@ def VCVTTPS2DQrr : VS2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), def VCVTTPS2DQrm : VS2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), "cvttps2dq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvttps2dq - (memopv4f32 addr:$src)))], + (loadv4f32 addr:$src)))], IIC_SSE_CVT_PS_RM>, VEX, Sched<[WriteCvtF2ILd]>; def VCVTTPS2DQYrr : VS2SI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), "cvttps2dq\t{$src, 
$dst|$dst, $src}", @@ -1982,7 +2001,7 @@ def VCVTTPS2DQYrr : VS2SI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), def VCVTTPS2DQYrm : VS2SI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), "cvttps2dq\t{$src, $dst|$dst, $src}", [(set VR256:$dst, (int_x86_avx_cvtt_ps2dq_256 - (memopv8f32 addr:$src)))], + (loadv8f32 addr:$src)))], IIC_SSE_CVT_PS_RM>, VEX, VEX_L, Sched<[WriteCvtF2ILd]>; @@ -1999,27 +2018,27 @@ def CVTTPS2DQrm : S2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), let Predicates = [HasAVX] in { def : Pat<(v4f32 (sint_to_fp (v4i32 VR128:$src))), (VCVTDQ2PSrr VR128:$src)>; - def : Pat<(v4f32 (sint_to_fp (bc_v4i32 (memopv2i64 addr:$src)))), + def : Pat<(v4f32 (sint_to_fp (bc_v4i32 (loadv2i64 addr:$src)))), (VCVTDQ2PSrm addr:$src)>; def : Pat<(int_x86_sse2_cvtdq2ps VR128:$src), (VCVTDQ2PSrr VR128:$src)>; - def : Pat<(int_x86_sse2_cvtdq2ps (bc_v4i32 (memopv2i64 addr:$src))), + def : Pat<(int_x86_sse2_cvtdq2ps (bc_v4i32 (loadv2i64 addr:$src))), (VCVTDQ2PSrm addr:$src)>; def : Pat<(v4i32 (fp_to_sint (v4f32 VR128:$src))), (VCVTTPS2DQrr VR128:$src)>; - def : Pat<(v4i32 (fp_to_sint (memopv4f32 addr:$src))), + def : Pat<(v4i32 (fp_to_sint (loadv4f32 addr:$src))), (VCVTTPS2DQrm addr:$src)>; def : Pat<(v8f32 (sint_to_fp (v8i32 VR256:$src))), (VCVTDQ2PSYrr VR256:$src)>; - def : Pat<(v8f32 (sint_to_fp (bc_v8i32 (memopv4i64 addr:$src)))), + def : Pat<(v8f32 (sint_to_fp (bc_v8i32 (loadv4i64 addr:$src)))), (VCVTDQ2PSYrm addr:$src)>; def : Pat<(v8i32 (fp_to_sint (v8f32 VR256:$src))), (VCVTTPS2DQYrr VR256:$src)>; - def : Pat<(v8i32 (fp_to_sint (memopv8f32 addr:$src))), + def : Pat<(v8i32 (fp_to_sint (loadv8f32 addr:$src))), (VCVTTPS2DQYrm addr:$src)>; } @@ -2056,7 +2075,7 @@ def : InstAlias<"vcvttpd2dqx\t{$src, $dst|$dst, $src}", def VCVTTPD2DQXrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), "cvttpd2dqx\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvttpd2dq - (memopv2f64 addr:$src)))], + (loadv2f64 addr:$src)))], IIC_SSE_CVT_PD_RM>, VEX, Sched<[WriteCvtF2ILd]>; // YMM only @@ -2068,7 +2087,7 @@ def VCVTTPD2DQYrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src), def VCVTTPD2DQYrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src), "cvttpd2dq{y}\t{$src, $dst|$dst, $src}", [(set VR128:$dst, - (int_x86_avx_cvtt_pd2dq_256 (memopv4f64 addr:$src)))], + (int_x86_avx_cvtt_pd2dq_256 (loadv4f64 addr:$src)))], IIC_SSE_CVT_PD_RM>, VEX, VEX_L, Sched<[WriteCvtF2ILd]>; def : InstAlias<"vcvttpd2dq\t{$src, $dst|$dst, $src}", (VCVTTPD2DQYrr VR128:$dst, VR256:$src)>; @@ -2076,7 +2095,7 @@ def : InstAlias<"vcvttpd2dq\t{$src, $dst|$dst, $src}", let Predicates = [HasAVX] in { def : Pat<(v4i32 (fp_to_sint (v4f64 VR256:$src))), (VCVTTPD2DQYrr VR256:$src)>; - def : Pat<(v4i32 (fp_to_sint (memopv4f64 addr:$src))), + def : Pat<(v4i32 (fp_to_sint (loadv4f64 addr:$src))), (VCVTTPD2DQYrm addr:$src)>; } // Predicates = [HasAVX] @@ -2110,7 +2129,7 @@ def VCVTPS2PDYrr : I<0x5A, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src), def VCVTPS2PDYrm : I<0x5A, MRMSrcMem, (outs VR256:$dst), (ins f128mem:$src), "vcvtps2pd\t{$src, $dst|$dst, $src}", [(set VR256:$dst, - (int_x86_avx_cvt_ps2_pd_256 (memopv4f32 addr:$src)))], + (int_x86_avx_cvt_ps2_pd_256 (loadv4f32 addr:$src)))], IIC_SSE_CVT_PD_RM>, TB, VEX, VEX_L, Sched<[WriteCvtF2FLd]>; } @@ -2140,7 +2159,7 @@ def VCVTDQ2PDYrm : S2SI<0xE6, MRMSrcMem, (outs VR256:$dst), (ins i128mem:$src), "vcvtdq2pd\t{$src, $dst|$dst, $src}", [(set VR256:$dst, (int_x86_avx_cvtdq2_pd_256 - (bitconvert (memopv2i64 addr:$src))))]>, VEX, 
VEX_L, + (bitconvert (loadv2i64 addr:$src))))]>, VEX, VEX_L, Sched<[WriteCvtI2FLd]>; def VCVTDQ2PDYrr : S2SI<0xE6, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src), "vcvtdq2pd\t{$src, $dst|$dst, $src}", @@ -2162,7 +2181,7 @@ def CVTDQ2PDrr : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), let Predicates = [HasAVX] in { def : Pat<(v4f64 (sint_to_fp (v4i32 VR128:$src))), (VCVTDQ2PDYrr VR128:$src)>; - def : Pat<(v4f64 (sint_to_fp (bc_v4i32 (memopv2i64 addr:$src)))), + def : Pat<(v4f64 (sint_to_fp (bc_v4i32 (loadv2i64 addr:$src)))), (VCVTDQ2PDYrm addr:$src)>; } // Predicates = [HasAVX] @@ -2181,7 +2200,7 @@ def : InstAlias<"vcvtpd2psx\t{$src, $dst|$dst, $src}", def VCVTPD2PSXrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), "cvtpd2psx\t{$src, $dst|$dst, $src}", [(set VR128:$dst, - (int_x86_sse2_cvtpd2ps (memopv2f64 addr:$src)))], + (int_x86_sse2_cvtpd2ps (loadv2f64 addr:$src)))], IIC_SSE_CVT_PD_RM>, VEX, Sched<[WriteCvtF2FLd]>; // YMM only @@ -2193,7 +2212,7 @@ def VCVTPD2PSYrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src), def VCVTPD2PSYrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src), "cvtpd2ps{y}\t{$src, $dst|$dst, $src}", [(set VR128:$dst, - (int_x86_avx_cvt_pd2_ps_256 (memopv4f64 addr:$src)))], + (int_x86_avx_cvt_pd2_ps_256 (loadv4f64 addr:$src)))], IIC_SSE_CVT_PD_RM>, VEX, VEX_L, Sched<[WriteCvtF2FLd]>; def : InstAlias<"vcvtpd2ps\t{$src, $dst|$dst, $src}", (VCVTPD2PSYrr VR128:$dst, VR256:$src)>; @@ -2215,13 +2234,13 @@ def CVTPD2PSrm : PDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), let Predicates = [HasAVX] in { def : Pat<(int_x86_avx_cvtdq2_ps_256 VR256:$src), (VCVTDQ2PSYrr VR256:$src)>; - def : Pat<(int_x86_avx_cvtdq2_ps_256 (bitconvert (memopv4i64 addr:$src))), + def : Pat<(int_x86_avx_cvtdq2_ps_256 (bitconvert (loadv4i64 addr:$src))), (VCVTDQ2PSYrm addr:$src)>; // Match fround and fextend for 128/256-bit conversions def : Pat<(v4f32 (X86vfpround (v2f64 VR128:$src))), (VCVTPD2PSrr VR128:$src)>; - def : Pat<(v4f32 (X86vfpround (memopv2f64 addr:$src))), + def : Pat<(v4f32 (X86vfpround (loadv2f64 addr:$src))), (VCVTPD2PSXrm addr:$src)>; def : Pat<(v4f32 (fround (v4f64 VR256:$src))), (VCVTPD2PSYrr VR256:$src)>; @@ -2299,7 +2318,7 @@ let Constraints = "$src1 = $dst" in { defm CMPSD : sse12_cmp_scalar<FR64, f64mem, SSECC, X86cmpsd, f64, loadf64, "cmp${cc}sd\t{$src2, $dst|$dst, $src2}", "cmpsd\t{$cc, $src2, $dst|$dst, $src2, $cc}", - SSE_ALU_F32S>, // same latency as 32 bit compare + SSE_ALU_F64S>, XD; } @@ -2334,7 +2353,7 @@ let Constraints = "$src1 = $dst" in { SSE_ALU_F32S>, XS; defm Int_CMPSD : sse12_cmp_scalar_int<f64mem, SSECC, int_x86_sse2_cmp_sd, "cmp${cc}sd\t{$src, $dst|$dst, $src}", - SSE_ALU_F32S>, // same latency as f32 + SSE_ALU_F64S>, XD; } @@ -2342,90 +2361,88 @@ let Constraints = "$src1 = $dst" in { // sse12_ord_cmp - Unordered/Ordered scalar fp compare and set EFLAGS multiclass sse12_ord_cmp<bits<8> opc, RegisterClass RC, SDNode OpNode, ValueType vt, X86MemOperand x86memop, - PatFrag ld_frag, string OpcodeStr, Domain d> { - def rr: PI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2), + PatFrag ld_frag, string OpcodeStr> { + def rr: SI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"), [(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))], - IIC_SSE_COMIS_RR, d>, + IIC_SSE_COMIS_RR>, Sched<[WriteFAdd]>; - def rm: PI<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2), + def rm: SI<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2), !strconcat(OpcodeStr, 
"\t{$src2, $src1|$src1, $src2}"), [(set EFLAGS, (OpNode (vt RC:$src1), (ld_frag addr:$src2)))], - IIC_SSE_COMIS_RM, d>, + IIC_SSE_COMIS_RM>, Sched<[WriteFAddLd, ReadAfterLd]>; } let Defs = [EFLAGS] in { defm VUCOMISS : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32, - "ucomiss", SSEPackedSingle>, TB, VEX, VEX_LIG; + "ucomiss">, TB, VEX, VEX_LIG; defm VUCOMISD : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64, - "ucomisd", SSEPackedDouble>, TB, OpSize, VEX, - VEX_LIG; + "ucomisd">, TB, OpSize, VEX, VEX_LIG; let Pattern = []<dag> in { defm VCOMISS : sse12_ord_cmp<0x2F, VR128, undef, v4f32, f128mem, load, - "comiss", SSEPackedSingle>, TB, VEX, - VEX_LIG; + "comiss">, TB, VEX, VEX_LIG; defm VCOMISD : sse12_ord_cmp<0x2F, VR128, undef, v2f64, f128mem, load, - "comisd", SSEPackedDouble>, TB, OpSize, VEX, - VEX_LIG; + "comisd">, TB, OpSize, VEX, VEX_LIG; } defm Int_VUCOMISS : sse12_ord_cmp<0x2E, VR128, X86ucomi, v4f32, f128mem, - load, "ucomiss", SSEPackedSingle>, TB, VEX; + load, "ucomiss">, TB, VEX; defm Int_VUCOMISD : sse12_ord_cmp<0x2E, VR128, X86ucomi, v2f64, f128mem, - load, "ucomisd", SSEPackedDouble>, TB, OpSize, VEX; + load, "ucomisd">, TB, OpSize, VEX; defm Int_VCOMISS : sse12_ord_cmp<0x2F, VR128, X86comi, v4f32, f128mem, - load, "comiss", SSEPackedSingle>, TB, VEX; + load, "comiss">, TB, VEX; defm Int_VCOMISD : sse12_ord_cmp<0x2F, VR128, X86comi, v2f64, f128mem, - load, "comisd", SSEPackedDouble>, TB, OpSize, VEX; + load, "comisd">, TB, OpSize, VEX; defm UCOMISS : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32, - "ucomiss", SSEPackedSingle>, TB; + "ucomiss">, TB; defm UCOMISD : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64, - "ucomisd", SSEPackedDouble>, TB, OpSize; + "ucomisd">, TB, OpSize; let Pattern = []<dag> in { defm COMISS : sse12_ord_cmp<0x2F, VR128, undef, v4f32, f128mem, load, - "comiss", SSEPackedSingle>, TB; + "comiss">, TB; defm COMISD : sse12_ord_cmp<0x2F, VR128, undef, v2f64, f128mem, load, - "comisd", SSEPackedDouble>, TB, OpSize; + "comisd">, TB, OpSize; } defm Int_UCOMISS : sse12_ord_cmp<0x2E, VR128, X86ucomi, v4f32, f128mem, - load, "ucomiss", SSEPackedSingle>, TB; + load, "ucomiss">, TB; defm Int_UCOMISD : sse12_ord_cmp<0x2E, VR128, X86ucomi, v2f64, f128mem, - load, "ucomisd", SSEPackedDouble>, TB, OpSize; + load, "ucomisd">, TB, OpSize; defm Int_COMISS : sse12_ord_cmp<0x2F, VR128, X86comi, v4f32, f128mem, load, - "comiss", SSEPackedSingle>, TB; + "comiss">, TB; defm Int_COMISD : sse12_ord_cmp<0x2F, VR128, X86comi, v2f64, f128mem, load, - "comisd", SSEPackedDouble>, TB, OpSize; + "comisd">, TB, OpSize; } // Defs = [EFLAGS] // sse12_cmp_packed - sse 1 & 2 compare packed instructions multiclass sse12_cmp_packed<RegisterClass RC, X86MemOperand x86memop, Operand CC, Intrinsic Int, string asm, - string asm_alt, Domain d> { + string asm_alt, Domain d, + OpndItins itins = SSE_ALU_F32P> { def rri : PIi8<0xC2, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2, CC:$cc), asm, [(set RC:$dst, (Int RC:$src1, RC:$src2, imm:$cc))], - IIC_SSE_CMPP_RR, d>, + itins.rr, d>, Sched<[WriteFAdd]>; def rmi : PIi8<0xC2, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2, CC:$cc), asm, [(set RC:$dst, (Int RC:$src1, (memop addr:$src2), imm:$cc))], - IIC_SSE_CMPP_RM, d>, + itins.rm, d>, Sched<[WriteFAddLd, ReadAfterLd]>; // Accept explicit immediate argument form instead of comparison code. 
let neverHasSideEffects = 1 in { def rri_alt : PIi8<0xC2, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2, i8imm:$cc), - asm_alt, [], IIC_SSE_CMPP_RR, d>, Sched<[WriteFAdd]>; + asm_alt, [], itins.rr, d>, Sched<[WriteFAdd]>; def rmi_alt : PIi8<0xC2, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2, i8imm:$cc), - asm_alt, [], IIC_SSE_CMPP_RM, d>, + asm_alt, [], itins.rm, d>, Sched<[WriteFAddLd, ReadAfterLd]>; } } @@ -2450,11 +2467,11 @@ let Constraints = "$src1 = $dst" in { defm CMPPS : sse12_cmp_packed<VR128, f128mem, SSECC, int_x86_sse_cmp_ps, "cmp${cc}ps\t{$src2, $dst|$dst, $src2}", "cmpps\t{$cc, $src2, $dst|$dst, $src2, $cc}", - SSEPackedSingle>, TB; + SSEPackedSingle, SSE_ALU_F32P>, TB; defm CMPPD : sse12_cmp_packed<VR128, f128mem, SSECC, int_x86_sse2_cmp_pd, "cmp${cc}pd\t{$src2, $dst|$dst, $src2}", "cmppd\t{$cc, $src2, $dst|$dst, $src2, $cc}", - SSEPackedDouble>, TB, OpSize; + SSEPackedDouble, SSE_ALU_F64P>, TB, OpSize; } let Predicates = [HasAVX] in { @@ -2514,16 +2531,16 @@ multiclass sse12_shuffle<RegisterClass RC, X86MemOperand x86memop, defm VSHUFPS : sse12_shuffle<VR128, f128mem, v4f32, "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", - memopv4f32, SSEPackedSingle>, TB, VEX_4V; + loadv4f32, SSEPackedSingle>, TB, VEX_4V; defm VSHUFPSY : sse12_shuffle<VR256, f256mem, v8f32, "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", - memopv8f32, SSEPackedSingle>, TB, VEX_4V, VEX_L; + loadv8f32, SSEPackedSingle>, TB, VEX_4V, VEX_L; defm VSHUFPD : sse12_shuffle<VR128, f128mem, v2f64, - "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src2, $src2, $src3}", - memopv2f64, SSEPackedDouble>, TB, OpSize, VEX_4V; + "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + loadv2f64, SSEPackedDouble>, TB, OpSize, VEX_4V; defm VSHUFPDY : sse12_shuffle<VR256, f256mem, v4f64, - "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src2, $src2, $src3}", - memopv4f64, SSEPackedDouble>, TB, OpSize, VEX_4V, VEX_L; + "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + loadv4f64, SSEPackedDouble>, TB, OpSize, VEX_4V, VEX_L; let Constraints = "$src1 = $dst" in { defm SHUFPS : sse12_shuffle<VR128, f128mem, v4f32, @@ -2538,13 +2555,13 @@ let Constraints = "$src1 = $dst" in { let Predicates = [HasAVX] in { def : Pat<(v4i32 (X86Shufp VR128:$src1, - (bc_v4i32 (memopv2i64 addr:$src2)), (i8 imm:$imm))), + (bc_v4i32 (loadv2i64 addr:$src2)), (i8 imm:$imm))), (VSHUFPSrmi VR128:$src1, addr:$src2, imm:$imm)>; def : Pat<(v4i32 (X86Shufp VR128:$src1, VR128:$src2, (i8 imm:$imm))), (VSHUFPSrri VR128:$src1, VR128:$src2, imm:$imm)>; def : Pat<(v2i64 (X86Shufp VR128:$src1, - (memopv2i64 addr:$src2), (i8 imm:$imm))), + (loadv2i64 addr:$src2), (i8 imm:$imm))), (VSHUFPDrmi VR128:$src1, addr:$src2, imm:$imm)>; def : Pat<(v2i64 (X86Shufp VR128:$src1, VR128:$src2, (i8 imm:$imm))), (VSHUFPDrri VR128:$src1, VR128:$src2, imm:$imm)>; @@ -2553,13 +2570,13 @@ let Predicates = [HasAVX] in { def : Pat<(v8i32 (X86Shufp VR256:$src1, VR256:$src2, (i8 imm:$imm))), (VSHUFPSYrri VR256:$src1, VR256:$src2, imm:$imm)>; def : Pat<(v8i32 (X86Shufp VR256:$src1, - (bc_v8i32 (memopv4i64 addr:$src2)), (i8 imm:$imm))), + (bc_v8i32 (loadv4i64 addr:$src2)), (i8 imm:$imm))), (VSHUFPSYrmi VR256:$src1, addr:$src2, imm:$imm)>; def : Pat<(v4i64 (X86Shufp VR256:$src1, VR256:$src2, (i8 imm:$imm))), (VSHUFPDYrri VR256:$src1, VR256:$src2, imm:$imm)>; def : Pat<(v4i64 (X86Shufp VR256:$src1, - (memopv4i64 addr:$src2), (i8 imm:$imm))), + (loadv4i64 addr:$src2), (i8 imm:$imm))), (VSHUFPDYrmi VR256:$src1, addr:$src2, 
imm:$imm)>; } @@ -2603,29 +2620,29 @@ multiclass sse12_unpack_interleave<bits<8> opc, SDNode OpNode, ValueType vt, Sched<[WriteShuffleLd, ReadAfterLd]>; } -defm VUNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, memopv4f32, +defm VUNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, loadv4f32, VR128, f128mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}", SSEPackedSingle>, TB, VEX_4V; -defm VUNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, memopv2f64, +defm VUNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, loadv2f64, VR128, f128mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", SSEPackedDouble>, TB, OpSize, VEX_4V; -defm VUNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, memopv4f32, +defm VUNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, loadv4f32, VR128, f128mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}", SSEPackedSingle>, TB, VEX_4V; -defm VUNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, memopv2f64, +defm VUNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, loadv2f64, VR128, f128mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", SSEPackedDouble>, TB, OpSize, VEX_4V; -defm VUNPCKHPSY: sse12_unpack_interleave<0x15, X86Unpckh, v8f32, memopv8f32, +defm VUNPCKHPSY: sse12_unpack_interleave<0x15, X86Unpckh, v8f32, loadv8f32, VR256, f256mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}", SSEPackedSingle>, TB, VEX_4V, VEX_L; -defm VUNPCKHPDY: sse12_unpack_interleave<0x15, X86Unpckh, v4f64, memopv4f64, +defm VUNPCKHPDY: sse12_unpack_interleave<0x15, X86Unpckh, v4f64, loadv4f64, VR256, f256mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", SSEPackedDouble>, TB, OpSize, VEX_4V, VEX_L; -defm VUNPCKLPSY: sse12_unpack_interleave<0x14, X86Unpckl, v8f32, memopv8f32, +defm VUNPCKLPSY: sse12_unpack_interleave<0x14, X86Unpckl, v8f32, loadv8f32, VR256, f256mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}", SSEPackedSingle>, TB, VEX_4V, VEX_L; -defm VUNPCKLPDY: sse12_unpack_interleave<0x14, X86Unpckl, v4f64, memopv4f64, +defm VUNPCKLPDY: sse12_unpack_interleave<0x14, X86Unpckl, v4f64, loadv4f64, VR256, f256mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", SSEPackedDouble>, TB, OpSize, VEX_4V, VEX_L; @@ -2645,20 +2662,20 @@ let Constraints = "$src1 = $dst" in { } // Constraints = "$src1 = $dst" let Predicates = [HasAVX1Only] in { - def : Pat<(v8i32 (X86Unpckl VR256:$src1, (bc_v8i32 (memopv4i64 addr:$src2)))), + def : Pat<(v8i32 (X86Unpckl VR256:$src1, (bc_v8i32 (loadv4i64 addr:$src2)))), (VUNPCKLPSYrm VR256:$src1, addr:$src2)>; def : Pat<(v8i32 (X86Unpckl VR256:$src1, VR256:$src2)), (VUNPCKLPSYrr VR256:$src1, VR256:$src2)>; - def : Pat<(v8i32 (X86Unpckh VR256:$src1, (bc_v8i32 (memopv4i64 addr:$src2)))), + def : Pat<(v8i32 (X86Unpckh VR256:$src1, (bc_v8i32 (loadv4i64 addr:$src2)))), (VUNPCKHPSYrm VR256:$src1, addr:$src2)>; def : Pat<(v8i32 (X86Unpckh VR256:$src1, VR256:$src2)), (VUNPCKHPSYrr VR256:$src1, VR256:$src2)>; - def : Pat<(v4i64 (X86Unpckl VR256:$src1, (memopv4i64 addr:$src2))), + def : Pat<(v4i64 (X86Unpckl VR256:$src1, (loadv4i64 addr:$src2))), (VUNPCKLPDYrm VR256:$src1, addr:$src2)>; def : Pat<(v4i64 (X86Unpckl VR256:$src1, VR256:$src2)), (VUNPCKLPDYrr VR256:$src1, VR256:$src2)>; - def : Pat<(v4i64 (X86Unpckh VR256:$src1, (memopv4i64 addr:$src2))), + def : Pat<(v4i64 (X86Unpckh VR256:$src1, (loadv4i64 addr:$src2))), (VUNPCKHPDYrm VR256:$src1, addr:$src2)>; def : Pat<(v4i64 (X86Unpckh VR256:$src1, VR256:$src2)), (VUNPCKHPDYrr VR256:$src1, VR256:$src2)>; @@ -2689,13 +2706,10 @@ let 
Predicates = [UseSSE2] in { /// sse12_extr_sign_mask - sse 1 & 2 unpack and interleave multiclass sse12_extr_sign_mask<RegisterClass RC, Intrinsic Int, string asm, Domain d> { - def rr32 : PI<0x50, MRMSrcReg, (outs GR32:$dst), (ins RC:$src), - !strconcat(asm, "\t{$src, $dst|$dst, $src}"), - [(set GR32:$dst, (Int RC:$src))], IIC_SSE_MOVMSK, d>, - Sched<[WriteVecLogic]>; - def rr64 : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins RC:$src), - !strconcat(asm, "\t{$src, $dst|$dst, $src}"), [], - IIC_SSE_MOVMSK, d>, REX_W, Sched<[WriteVecLogic]>; + def rr : PI<0x50, MRMSrcReg, (outs GR32orGR64:$dst), (ins RC:$src), + !strconcat(asm, "\t{$src, $dst|$dst, $src}"), + [(set GR32orGR64:$dst, (Int RC:$src))], IIC_SSE_MOVMSK, d>, + Sched<[WriteVecLogic]>; } let Predicates = [HasAVX] in { @@ -2712,29 +2726,15 @@ let Predicates = [HasAVX] in { OpSize, VEX, VEX_L; def : Pat<(i32 (X86fgetsign FR32:$src)), - (VMOVMSKPSrr32 (COPY_TO_REGCLASS FR32:$src, VR128))>; + (VMOVMSKPSrr (COPY_TO_REGCLASS FR32:$src, VR128))>; def : Pat<(i64 (X86fgetsign FR32:$src)), - (VMOVMSKPSrr64 (COPY_TO_REGCLASS FR32:$src, VR128))>; + (SUBREG_TO_REG (i64 0), + (VMOVMSKPSrr (COPY_TO_REGCLASS FR32:$src, VR128)), sub_32bit)>; def : Pat<(i32 (X86fgetsign FR64:$src)), - (VMOVMSKPDrr32 (COPY_TO_REGCLASS FR64:$src, VR128))>; + (VMOVMSKPDrr (COPY_TO_REGCLASS FR64:$src, VR128))>; def : Pat<(i64 (X86fgetsign FR64:$src)), - (VMOVMSKPDrr64 (COPY_TO_REGCLASS FR64:$src, VR128))>; - - // Assembler Only - def VMOVMSKPSr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR128:$src), - "movmskps\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVMSK, - SSEPackedSingle>, TB, VEX, Sched<[WriteVecLogic]>; - def VMOVMSKPDr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR128:$src), - "movmskpd\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVMSK, - SSEPackedDouble>, TB, - OpSize, VEX, Sched<[WriteVecLogic]>; - def VMOVMSKPSYr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR256:$src), - "movmskps\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVMSK, - SSEPackedSingle>, TB, VEX, VEX_L, Sched<[WriteVecLogic]>; - def VMOVMSKPDYr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR256:$src), - "movmskpd\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVMSK, - SSEPackedDouble>, TB, - OpSize, VEX, VEX_L, Sched<[WriteVecLogic]>; + (SUBREG_TO_REG (i64 0), + (VMOVMSKPDrr (COPY_TO_REGCLASS FR64:$src, VR128)), sub_32bit)>; } defm MOVMSKPS : sse12_extr_sign_mask<VR128, int_x86_sse_movmsk_ps, "movmskps", @@ -2743,16 +2743,18 @@ defm MOVMSKPD : sse12_extr_sign_mask<VR128, int_x86_sse2_movmsk_pd, "movmskpd", SSEPackedDouble>, TB, OpSize; def : Pat<(i32 (X86fgetsign FR32:$src)), - (MOVMSKPSrr32 (COPY_TO_REGCLASS FR32:$src, VR128))>, + (MOVMSKPSrr (COPY_TO_REGCLASS FR32:$src, VR128))>, Requires<[UseSSE1]>; def : Pat<(i64 (X86fgetsign FR32:$src)), - (MOVMSKPSrr64 (COPY_TO_REGCLASS FR32:$src, VR128))>, + (SUBREG_TO_REG (i64 0), + (MOVMSKPSrr (COPY_TO_REGCLASS FR32:$src, VR128)), sub_32bit)>, Requires<[UseSSE1]>; def : Pat<(i32 (X86fgetsign FR64:$src)), - (MOVMSKPDrr32 (COPY_TO_REGCLASS FR64:$src, VR128))>, + (MOVMSKPDrr (COPY_TO_REGCLASS FR64:$src, VR128))>, Requires<[UseSSE2]>; def : Pat<(i64 (X86fgetsign FR64:$src)), - (MOVMSKPDrr64 (COPY_TO_REGCLASS FR64:$src, VR128))>, + (SUBREG_TO_REG (i64 0), + (MOVMSKPDrr (COPY_TO_REGCLASS FR64:$src, VR128)), sub_32bit)>, Requires<[UseSSE2]>; //===---------------------------------------------------------------------===// @@ -2791,7 +2793,7 @@ multiclass PDI_binop_all<bits<8> opc, string OpcodeStr, SDNode Opcode, OpndItins itins, bit IsCommutable = 0> { let Predicates 
= [HasAVX] in defm V#NAME : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode, OpVT128, - VR128, memopv2i64, i128mem, itins, IsCommutable, 0>, VEX_4V; + VR128, loadv2i64, i128mem, itins, IsCommutable, 0>, VEX_4V; let Constraints = "$src1 = $dst" in defm NAME : PDI_binop_rm<opc, OpcodeStr, Opcode, OpVT128, VR128, @@ -2799,7 +2801,7 @@ let Constraints = "$src1 = $dst" in let Predicates = [HasAVX2] in defm V#NAME#Y : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode, - OpVT256, VR256, memopv4i64, i256mem, itins, + OpVT256, VR256, loadv4i64, i256mem, itins, IsCommutable, 0>, VEX_4V, VEX_L; } @@ -2839,16 +2841,18 @@ multiclass sse12_fp_alias_pack_logical<bits<8> opc, string OpcodeStr, } // Alias bitwise logical operations using SSE logical ops on packed FP values. -defm FsAND : sse12_fp_alias_pack_logical<0x54, "and", X86fand, - SSE_BIT_ITINS_P>; -defm FsOR : sse12_fp_alias_pack_logical<0x56, "or", X86for, - SSE_BIT_ITINS_P>; -defm FsXOR : sse12_fp_alias_pack_logical<0x57, "xor", X86fxor, - SSE_BIT_ITINS_P>; - -let neverHasSideEffects = 1, Pattern = []<dag>, isCommutable = 0 in - defm FsANDN : sse12_fp_alias_pack_logical<0x55, "andn", undef, +let isCodeGenOnly = 1 in { + defm FsAND : sse12_fp_alias_pack_logical<0x54, "and", X86fand, SSE_BIT_ITINS_P>; + defm FsOR : sse12_fp_alias_pack_logical<0x56, "or", X86for, + SSE_BIT_ITINS_P>; + defm FsXOR : sse12_fp_alias_pack_logical<0x57, "xor", X86fxor, + SSE_BIT_ITINS_P>; + + let isCommutable = 0 in + defm FsANDN : sse12_fp_alias_pack_logical<0x55, "andn", X86fandn, + SSE_BIT_ITINS_P>; +} /// sse12_fp_packed_logical - SSE 1 & 2 packed FP logical ops /// @@ -2858,14 +2862,14 @@ multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr, !strconcat(OpcodeStr, "ps"), f256mem, [(set VR256:$dst, (v4i64 (OpNode VR256:$src1, VR256:$src2)))], [(set VR256:$dst, (OpNode (bc_v4i64 (v8f32 VR256:$src1)), - (memopv4i64 addr:$src2)))], 0>, TB, VEX_4V, VEX_L; + (loadv4i64 addr:$src2)))], 0>, TB, VEX_4V, VEX_L; defm V#NAME#PDY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedDouble, !strconcat(OpcodeStr, "pd"), f256mem, [(set VR256:$dst, (OpNode (bc_v4i64 (v4f64 VR256:$src1)), (bc_v4i64 (v4f64 VR256:$src2))))], [(set VR256:$dst, (OpNode (bc_v4i64 (v4f64 VR256:$src1)), - (memopv4i64 addr:$src2)))], 0>, + (loadv4i64 addr:$src2)))], 0>, TB, OpSize, VEX_4V, VEX_L; // In AVX no need to add a pattern for 128-bit logical rr ps, because they @@ -2875,14 +2879,14 @@ multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr, defm V#NAME#PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle, !strconcat(OpcodeStr, "ps"), f128mem, [], [(set VR128:$dst, (OpNode (bc_v2i64 (v4f32 VR128:$src1)), - (memopv2i64 addr:$src2)))], 0>, TB, VEX_4V; + (loadv2i64 addr:$src2)))], 0>, TB, VEX_4V; defm V#NAME#PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble, !strconcat(OpcodeStr, "pd"), f128mem, [(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)), (bc_v2i64 (v2f64 VR128:$src2))))], [(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)), - (memopv2i64 addr:$src2)))], 0>, + (loadv2i64 addr:$src2)))], 0>, TB, OpSize, VEX_4V; let Constraints = "$src1 = $dst" in { @@ -2924,120 +2928,93 @@ let isCommutable = 0 in /// FIXME: once all 256-bit intrinsics are matched, cleanup and refactor those /// classes below -multiclass basic_sse12_fp_binop_s<bits<8> opc, string OpcodeStr, SDNode OpNode, - SizeItins itins, - bit Is2Addr = 1> { - defm SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"), - OpNode, FR32, f32mem, - itins.s, Is2Addr>, XS; - defm SD : 
sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"), - OpNode, FR64, f64mem, - itins.d, Is2Addr>, XD; -} - multiclass basic_sse12_fp_binop_p<bits<8> opc, string OpcodeStr, SDNode OpNode, SizeItins itins> { -let Predicates = [HasAVX] in { defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, - VR128, v4f32, f128mem, memopv4f32, + VR128, v4f32, f128mem, loadv4f32, SSEPackedSingle, itins.s, 0>, TB, VEX_4V; defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, - VR128, v2f64, f128mem, memopv2f64, + VR128, v2f64, f128mem, loadv2f64, SSEPackedDouble, itins.d, 0>, TB, OpSize, VEX_4V; defm V#NAME#PSY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), - OpNode, VR256, v8f32, f256mem, memopv8f32, + OpNode, VR256, v8f32, f256mem, loadv8f32, SSEPackedSingle, itins.s, 0>, TB, VEX_4V, VEX_L; defm V#NAME#PDY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), - OpNode, VR256, v4f64, f256mem, memopv4f64, + OpNode, VR256, v4f64, f256mem, loadv4f64, SSEPackedDouble, itins.d, 0>, TB, OpSize, VEX_4V, VEX_L; -} -let Constraints = "$src1 = $dst" in { - defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR128, - v4f32, f128mem, memopv4f32, SSEPackedSingle, - itins.s, 1>, TB; - defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR128, - v2f64, f128mem, memopv2f64, SSEPackedDouble, - itins.d, 1>, TB, OpSize; -} -} - -multiclass basic_sse12_fp_binop_s_int<bits<8> opc, string OpcodeStr, - SizeItins itins, - bit Is2Addr = 1> { - defm SS : sse12_fp_scalar_int<opc, OpcodeStr, VR128, - !strconcat(OpcodeStr, "ss"), "", "_ss", ssmem, sse_load_f32, - itins.s, Is2Addr>, XS; - defm SD : sse12_fp_scalar_int<opc, OpcodeStr, VR128, - !strconcat(OpcodeStr, "sd"), "2", "_sd", sdmem, sse_load_f64, - itins.d, Is2Addr>, XD; + let Constraints = "$src1 = $dst" in { + defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR128, + v4f32, f128mem, memopv4f32, SSEPackedSingle, + itins.s>, TB; + defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR128, + v2f64, f128mem, memopv2f64, SSEPackedDouble, + itins.d>, TB, OpSize; + } } -// Binary Arithmetic instructions -defm ADD : basic_sse12_fp_binop_p<0x58, "add", fadd, SSE_ALU_ITINS_P>; -defm MUL : basic_sse12_fp_binop_p<0x59, "mul", fmul, SSE_MUL_ITINS_P>; -let isCommutable = 0 in { - defm SUB : basic_sse12_fp_binop_p<0x5C, "sub", fsub, SSE_ALU_ITINS_P>; - defm DIV : basic_sse12_fp_binop_p<0x5E, "div", fdiv, SSE_DIV_ITINS_P>; - defm MAX : basic_sse12_fp_binop_p<0x5F, "max", X86fmax, SSE_ALU_ITINS_P>; - defm MIN : basic_sse12_fp_binop_p<0x5D, "min", X86fmin, SSE_ALU_ITINS_P>; -} +multiclass basic_sse12_fp_binop_s<bits<8> opc, string OpcodeStr, SDNode OpNode, + SizeItins itins> { + defm V#NAME#SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"), + OpNode, FR32, f32mem, itins.s, 0>, XS, VEX_4V, VEX_LIG; + defm V#NAME#SD : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"), + OpNode, FR64, f64mem, itins.d, 0>, XD, VEX_4V, VEX_LIG; -let isCodeGenOnly = 1 in { - defm MAXC: basic_sse12_fp_binop_p<0x5F, "max", X86fmaxc, SSE_ALU_ITINS_P>; - defm MINC: basic_sse12_fp_binop_p<0x5D, "min", X86fminc, SSE_ALU_ITINS_P>; + let Constraints = "$src1 = $dst" in { + defm SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"), + OpNode, FR32, f32mem, itins.s>, XS; + defm SD : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"), + OpNode, FR64, f64mem, itins.d>, XD; + } } -defm VADD : basic_sse12_fp_binop_s<0x58, "add", fadd, SSE_ALU_ITINS_S, 0>, - basic_sse12_fp_binop_s_int<0x58, "add", SSE_ALU_ITINS_S, 0>, - 
VEX_4V, VEX_LIG; -defm VMUL : basic_sse12_fp_binop_s<0x59, "mul", fmul, SSE_MUL_ITINS_S, 0>, - basic_sse12_fp_binop_s_int<0x59, "mul", SSE_MUL_ITINS_S, 0>, - VEX_4V, VEX_LIG; +multiclass basic_sse12_fp_binop_s_int<bits<8> opc, string OpcodeStr, + SizeItins itins> { + defm V#NAME#SS : sse12_fp_scalar_int<opc, OpcodeStr, VR128, + !strconcat(OpcodeStr, "ss"), "", "_ss", ssmem, sse_load_f32, + itins.s, 0>, XS, VEX_4V, VEX_LIG; + defm V#NAME#SD : sse12_fp_scalar_int<opc, OpcodeStr, VR128, + !strconcat(OpcodeStr, "sd"), "2", "_sd", sdmem, sse_load_f64, + itins.d, 0>, XD, VEX_4V, VEX_LIG; -let isCommutable = 0 in { - defm VSUB : basic_sse12_fp_binop_s<0x5C, "sub", fsub, SSE_ALU_ITINS_S, 0>, - basic_sse12_fp_binop_s_int<0x5C, "sub", SSE_ALU_ITINS_S, 0>, - VEX_4V, VEX_LIG; - defm VDIV : basic_sse12_fp_binop_s<0x5E, "div", fdiv, SSE_DIV_ITINS_S, 0>, - basic_sse12_fp_binop_s_int<0x5E, "div", SSE_DIV_ITINS_S, 0>, - VEX_4V, VEX_LIG; - defm VMAX : basic_sse12_fp_binop_s<0x5F, "max", X86fmax, SSE_ALU_ITINS_S, 0>, - basic_sse12_fp_binop_s_int<0x5F, "max", SSE_ALU_ITINS_S, 0>, - VEX_4V, VEX_LIG; - defm VMIN : basic_sse12_fp_binop_s<0x5D, "min", X86fmin, SSE_ALU_ITINS_S, 0>, - basic_sse12_fp_binop_s_int<0x5D, "min", SSE_ALU_ITINS_S, 0>, - VEX_4V, VEX_LIG; + let Constraints = "$src1 = $dst" in { + defm SS : sse12_fp_scalar_int<opc, OpcodeStr, VR128, + !strconcat(OpcodeStr, "ss"), "", "_ss", ssmem, sse_load_f32, + itins.s>, XS; + defm SD : sse12_fp_scalar_int<opc, OpcodeStr, VR128, + !strconcat(OpcodeStr, "sd"), "2", "_sd", sdmem, sse_load_f64, + itins.d>, XD; + } } -let Constraints = "$src1 = $dst" in { - defm ADD : basic_sse12_fp_binop_s<0x58, "add", fadd, SSE_ALU_ITINS_S>, - basic_sse12_fp_binop_s_int<0x58, "add", SSE_ALU_ITINS_S>; - defm MUL : basic_sse12_fp_binop_s<0x59, "mul", fmul, SSE_MUL_ITINS_S>, - basic_sse12_fp_binop_s_int<0x59, "mul", SSE_MUL_ITINS_S>; - - let isCommutable = 0 in { - defm SUB : basic_sse12_fp_binop_s<0x5C, "sub", fsub, SSE_ALU_ITINS_S>, - basic_sse12_fp_binop_s_int<0x5C, "sub", SSE_ALU_ITINS_S>; - defm DIV : basic_sse12_fp_binop_s<0x5E, "div", fdiv, SSE_DIV_ITINS_S>, - basic_sse12_fp_binop_s_int<0x5E, "div", SSE_DIV_ITINS_S>; - defm MAX : basic_sse12_fp_binop_s<0x5F, "max", X86fmax, SSE_ALU_ITINS_S>, - basic_sse12_fp_binop_s_int<0x5F, "max", SSE_ALU_ITINS_S>; - defm MIN : basic_sse12_fp_binop_s<0x5D, "min", X86fmin, SSE_ALU_ITINS_S>, - basic_sse12_fp_binop_s_int<0x5D, "min", SSE_ALU_ITINS_S>; - } +// Binary Arithmetic instructions +defm ADD : basic_sse12_fp_binop_p<0x58, "add", fadd, SSE_ALU_ITINS_P>, + basic_sse12_fp_binop_s<0x58, "add", fadd, SSE_ALU_ITINS_S>, + basic_sse12_fp_binop_s_int<0x58, "add", SSE_ALU_ITINS_S>; +defm MUL : basic_sse12_fp_binop_p<0x59, "mul", fmul, SSE_MUL_ITINS_P>, + basic_sse12_fp_binop_s<0x59, "mul", fmul, SSE_MUL_ITINS_S>, + basic_sse12_fp_binop_s_int<0x59, "mul", SSE_MUL_ITINS_S>; +let isCommutable = 0 in { + defm SUB : basic_sse12_fp_binop_p<0x5C, "sub", fsub, SSE_ALU_ITINS_P>, + basic_sse12_fp_binop_s<0x5C, "sub", fsub, SSE_ALU_ITINS_S>, + basic_sse12_fp_binop_s_int<0x5C, "sub", SSE_ALU_ITINS_S>; + defm DIV : basic_sse12_fp_binop_p<0x5E, "div", fdiv, SSE_DIV_ITINS_P>, + basic_sse12_fp_binop_s<0x5E, "div", fdiv, SSE_DIV_ITINS_S>, + basic_sse12_fp_binop_s_int<0x5E, "div", SSE_DIV_ITINS_S>; + defm MAX : basic_sse12_fp_binop_p<0x5F, "max", X86fmax, SSE_ALU_ITINS_P>, + basic_sse12_fp_binop_s<0x5F, "max", X86fmax, SSE_ALU_ITINS_S>, + basic_sse12_fp_binop_s_int<0x5F, "max", SSE_ALU_ITINS_S>; + defm MIN : basic_sse12_fp_binop_p<0x5D, "min", X86fmin, 
SSE_ALU_ITINS_P>, + basic_sse12_fp_binop_s<0x5D, "min", X86fmin, SSE_ALU_ITINS_S>, + basic_sse12_fp_binop_s_int<0x5D, "min", SSE_ALU_ITINS_S>; } let isCodeGenOnly = 1 in { - defm VMAXC: basic_sse12_fp_binop_s<0x5F, "max", X86fmaxc, SSE_ALU_ITINS_S, 0>, - VEX_4V, VEX_LIG; - defm VMINC: basic_sse12_fp_binop_s<0x5D, "min", X86fminc, SSE_ALU_ITINS_S, 0>, - VEX_4V, VEX_LIG; - let Constraints = "$src1 = $dst" in { - defm MAXC: basic_sse12_fp_binop_s<0x5F, "max", X86fmaxc, SSE_ALU_ITINS_S>; - defm MINC: basic_sse12_fp_binop_s<0x5D, "min", X86fminc, SSE_ALU_ITINS_S>; - } + defm MAXC: basic_sse12_fp_binop_p<0x5F, "max", X86fmaxc, SSE_ALU_ITINS_P>, + basic_sse12_fp_binop_s<0x5F, "max", X86fmaxc, SSE_ALU_ITINS_S>; + defm MINC: basic_sse12_fp_binop_p<0x5D, "min", X86fminc, SSE_ALU_ITINS_P>, + basic_sse12_fp_binop_s<0x5D, "min", X86fminc, SSE_ALU_ITINS_S>; } /// Unop Arithmetic @@ -3049,12 +3026,20 @@ let isCodeGenOnly = 1 in { /// And, we have a special variant form for a full-vector intrinsic form. let Sched = WriteFSqrt in { -def SSE_SQRTP : OpndItins< - IIC_SSE_SQRTP_RR, IIC_SSE_SQRTP_RM +def SSE_SQRTPS : OpndItins< + IIC_SSE_SQRTPS_RR, IIC_SSE_SQRTPS_RM +>; + +def SSE_SQRTSS : OpndItins< + IIC_SSE_SQRTSS_RR, IIC_SSE_SQRTSS_RM >; -def SSE_SQRTS : OpndItins< - IIC_SSE_SQRTS_RR, IIC_SSE_SQRTS_RM +def SSE_SQRTPD : OpndItins< + IIC_SSE_SQRTPD_RR, IIC_SSE_SQRTPD_RM +>; + +def SSE_SQRTSD : OpndItins< + IIC_SSE_SQRTSD_RR, IIC_SSE_SQRTSD_RM >; } @@ -3175,7 +3160,7 @@ let Predicates = [HasAVX] in { def V#NAME#PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), !strconcat("v", OpcodeStr, "ps\t{$src, $dst|$dst, $src}"), - [(set VR128:$dst, (OpNode (memopv4f32 addr:$src)))], + [(set VR128:$dst, (OpNode (loadv4f32 addr:$src)))], itins.rm>, VEX, Sched<[itins.Sched.Folded]>; def V#NAME#PSYr : PSI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), !strconcat("v", OpcodeStr, @@ -3185,7 +3170,7 @@ let Predicates = [HasAVX] in { def V#NAME#PSYm : PSI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), !strconcat("v", OpcodeStr, "ps\t{$src, $dst|$dst, $src}"), - [(set VR256:$dst, (OpNode (memopv8f32 addr:$src)))], + [(set VR256:$dst, (OpNode (loadv8f32 addr:$src)))], itins.rm>, VEX, VEX_L, Sched<[itins.Sched.Folded]>; } @@ -3212,7 +3197,7 @@ let Predicates = [HasAVX] in { def V#NAME#PSm_Int : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), !strconcat("v", OpcodeStr, "ps\t{$src, $dst|$dst, $src}"), - [(set VR128:$dst, (V4F32Int (memopv4f32 addr:$src)))], + [(set VR128:$dst, (V4F32Int (loadv4f32 addr:$src)))], itins.rm>, VEX, Sched<[itins.Sched.Folded]>; def V#NAME#PSYr_Int : PSI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), !strconcat("v", OpcodeStr, @@ -3223,7 +3208,7 @@ let Predicates = [HasAVX] in { (ins f256mem:$src), !strconcat("v", OpcodeStr, "ps\t{$src, $dst|$dst, $src}"), - [(set VR256:$dst, (V8F32Int (memopv8f32 addr:$src)))], + [(set VR256:$dst, (V8F32Int (loadv8f32 addr:$src)))], itins.rm>, VEX, VEX_L, Sched<[itins.Sched.Folded]>; } @@ -3293,7 +3278,7 @@ let Predicates = [HasAVX] in { def V#NAME#PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), !strconcat("v", OpcodeStr, "pd\t{$src, $dst|$dst, $src}"), - [(set VR128:$dst, (OpNode (memopv2f64 addr:$src)))], + [(set VR128:$dst, (OpNode (loadv2f64 addr:$src)))], itins.rm>, VEX, Sched<[itins.Sched.Folded]>; def V#NAME#PDYr : PDI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), !strconcat("v", OpcodeStr, @@ -3303,7 +3288,7 @@ let Predicates = [HasAVX] in { def V#NAME#PDYm : PDI<opc, MRMSrcMem, (outs 
VR256:$dst), (ins f256mem:$src), !strconcat("v", OpcodeStr, "pd\t{$src, $dst|$dst, $src}"), - [(set VR256:$dst, (OpNode (memopv4f64 addr:$src)))], + [(set VR256:$dst, (OpNode (loadv4f64 addr:$src)))], itins.rm>, VEX, VEX_L, Sched<[itins.Sched.Folded]>; } @@ -3319,47 +3304,48 @@ let Predicates = [HasAVX] in { // Square root. defm SQRT : sse1_fp_unop_s<0x51, "sqrt", fsqrt, int_x86_sse_sqrt_ss, - SSE_SQRTS>, - sse1_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTP>, + SSE_SQRTSS>, + sse1_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTPS>, sse2_fp_unop_s<0x51, "sqrt", fsqrt, int_x86_sse2_sqrt_sd, - SSE_SQRTS>, - sse2_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTP>; + SSE_SQRTSD>, + sse2_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTPD>; // Reciprocal approximations. Note that these typically require refinement // in order to obtain suitable precision. -defm RSQRT : sse1_fp_unop_rw<0x52, "rsqrt", X86frsqrt, SSE_SQRTS>, - sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SSE_SQRTP>, +defm RSQRT : sse1_fp_unop_rw<0x52, "rsqrt", X86frsqrt, SSE_SQRTSS>, + sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SSE_SQRTPS>, sse1_fp_unop_p_int<0x52, "rsqrt", int_x86_sse_rsqrt_ps, - int_x86_avx_rsqrt_ps_256, SSE_SQRTP>; + int_x86_avx_rsqrt_ps_256, SSE_SQRTPS>; defm RCP : sse1_fp_unop_rw<0x53, "rcp", X86frcp, SSE_RCPS>, sse1_fp_unop_p<0x53, "rcp", X86frcp, SSE_RCPP>, sse1_fp_unop_p_int<0x53, "rcp", int_x86_sse_rcp_ps, int_x86_avx_rcp_ps_256, SSE_RCPP>; -def : Pat<(f32 (fsqrt FR32:$src)), - (VSQRTSSr (f32 (IMPLICIT_DEF)), FR32:$src)>, Requires<[HasAVX]>; -def : Pat<(f32 (fsqrt (load addr:$src))), - (VSQRTSSm (f32 (IMPLICIT_DEF)), addr:$src)>, - Requires<[HasAVX, OptForSize]>; -def : Pat<(f64 (fsqrt FR64:$src)), - (VSQRTSDr (f64 (IMPLICIT_DEF)), FR64:$src)>, Requires<[HasAVX]>; -def : Pat<(f64 (fsqrt (load addr:$src))), - (VSQRTSDm (f64 (IMPLICIT_DEF)), addr:$src)>, - Requires<[HasAVX, OptForSize]>; - -def : Pat<(f32 (X86frsqrt FR32:$src)), - (VRSQRTSSr (f32 (IMPLICIT_DEF)), FR32:$src)>, Requires<[HasAVX]>; -def : Pat<(f32 (X86frsqrt (load addr:$src))), - (VRSQRTSSm (f32 (IMPLICIT_DEF)), addr:$src)>, - Requires<[HasAVX, OptForSize]>; - -def : Pat<(f32 (X86frcp FR32:$src)), - (VRCPSSr (f32 (IMPLICIT_DEF)), FR32:$src)>, Requires<[HasAVX]>; -def : Pat<(f32 (X86frcp (load addr:$src))), - (VRCPSSm (f32 (IMPLICIT_DEF)), addr:$src)>, - Requires<[HasAVX, OptForSize]>; - -let Predicates = [HasAVX] in { +let Predicates = [UseAVX] in { + def : Pat<(f32 (fsqrt FR32:$src)), + (VSQRTSSr (f32 (IMPLICIT_DEF)), FR32:$src)>, Requires<[HasAVX]>; + def : Pat<(f32 (fsqrt (load addr:$src))), + (VSQRTSSm (f32 (IMPLICIT_DEF)), addr:$src)>, + Requires<[HasAVX, OptForSize]>; + def : Pat<(f64 (fsqrt FR64:$src)), + (VSQRTSDr (f64 (IMPLICIT_DEF)), FR64:$src)>, Requires<[HasAVX]>; + def : Pat<(f64 (fsqrt (load addr:$src))), + (VSQRTSDm (f64 (IMPLICIT_DEF)), addr:$src)>, + Requires<[HasAVX, OptForSize]>; + + def : Pat<(f32 (X86frsqrt FR32:$src)), + (VRSQRTSSr (f32 (IMPLICIT_DEF)), FR32:$src)>, Requires<[HasAVX]>; + def : Pat<(f32 (X86frsqrt (load addr:$src))), + (VRSQRTSSm (f32 (IMPLICIT_DEF)), addr:$src)>, + Requires<[HasAVX, OptForSize]>; + + def : Pat<(f32 (X86frcp FR32:$src)), + (VRCPSSr (f32 (IMPLICIT_DEF)), FR32:$src)>, Requires<[HasAVX]>; + def : Pat<(f32 (X86frcp (load addr:$src))), + (VRCPSSm (f32 (IMPLICIT_DEF)), addr:$src)>, + Requires<[HasAVX, OptForSize]>; +} +let Predicates = [UseAVX] in { def : Pat<(int_x86_sse_sqrt_ss VR128:$src), (COPY_TO_REGCLASS (VSQRTSSr (f32 (IMPLICIT_DEF)), (COPY_TO_REGCLASS VR128:$src, FR32)), @@ -3373,7 +3359,9 @@ let Predicates = [HasAVX] in { 
VR128)>; def : Pat<(int_x86_sse2_sqrt_sd sse_load_f64:$src), (VSQRTSDm_Int (v2f64 (IMPLICIT_DEF)), sse_load_f64:$src)>; +} +let Predicates = [HasAVX] in { def : Pat<(int_x86_sse_rsqrt_ss VR128:$src), (COPY_TO_REGCLASS (VRSQRTSSr (f32 (IMPLICIT_DEF)), (COPY_TO_REGCLASS VR128:$src, FR32)), @@ -3657,7 +3645,7 @@ def MOVDQUrm : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), XS, Requires<[UseSSE2]>; } -let mayStore = 1, SchedRW = [WriteStore] in { +let mayStore = 1, neverHasSideEffects = 1, SchedRW = [WriteStore] in { def MOVDQAmr : PDI<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), "movdqa\t{$src, $dst|$dst, $src}", [/*(alignedstore (v2i64 VR128:$src), addr:$dst)*/], @@ -3720,7 +3708,7 @@ multiclass PDI_binop_all_int<bits<8> opc, string OpcodeStr, Intrinsic IntId128, bit IsCommutable = 0> { let Predicates = [HasAVX] in defm V#NAME : PDI_binop_rm_int<opc, !strconcat("v", OpcodeStr), IntId128, - VR128, memopv2i64, i128mem, itins, + VR128, loadv2i64, i128mem, itins, IsCommutable, 0>, VEX_4V; let Constraints = "$src1 = $dst" in @@ -3729,7 +3717,7 @@ let Constraints = "$src1 = $dst" in let Predicates = [HasAVX2] in defm V#NAME#Y : PDI_binop_rm_int<opc, !strconcat("v", OpcodeStr), IntId256, - VR256, memopv4i64, i256mem, itins, + VR256, loadv4i64, i256mem, itins, IsCommutable, 0>, VEX_4V, VEX_L; } @@ -3756,11 +3744,11 @@ multiclass PDI_binop_rmi<bits<8> opc, bits<8> opc2, Format ImmForm, (bc_frag (memopv2i64 addr:$src2)))))], itins.rm>, Sched<[WriteVecShiftLd, ReadAfterLd]>; def ri : PDIi8<opc2, ImmForm, (outs RC:$dst), - (ins RC:$src1, i32i8imm:$src2), + (ins RC:$src1, i8imm:$src2), !if(Is2Addr, !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), - [(set RC:$dst, (DstVT (OpNode2 RC:$src1, (i32 imm:$src2))))], itins.ri>, + [(set RC:$dst, (DstVT (OpNode2 RC:$src1, (i8 imm:$src2))))], itins.ri>, Sched<[WriteVecShift]>; } @@ -3844,15 +3832,15 @@ defm PAVGB : PDI_binop_all_int<0xE0, "pavgb", int_x86_sse2_pavg_b, defm PAVGW : PDI_binop_all_int<0xE3, "pavgw", int_x86_sse2_pavg_w, int_x86_avx2_pavg_w, SSE_INTALU_ITINS_P, 1>; defm PSADBW : PDI_binop_all_int<0xF6, "psadbw", int_x86_sse2_psad_bw, - int_x86_avx2_psad_bw, SSE_INTALU_ITINS_P, 1>; + int_x86_avx2_psad_bw, SSE_PMADD, 1>; let Predicates = [HasAVX] in defm VPMULUDQ : PDI_binop_rm2<0xF4, "vpmuludq", X86pmuludq, v2i64, v4i32, VR128, - memopv2i64, i128mem, SSE_INTMUL_ITINS_P, 1, 0>, + loadv2i64, i128mem, SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V; let Predicates = [HasAVX2] in defm VPMULUDQY : PDI_binop_rm2<0xF4, "vpmuludq", X86pmuludq, v4i64, v8i32, - VR256, memopv4i64, i256mem, + VR256, loadv4i64, i256mem, SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V, VEX_L; let Constraints = "$src1 = $dst" in defm PMULUDQ : PDI_binop_rm2<0xF4, "pmuludq", X86pmuludq, v2i64, v4i32, VR128, @@ -3988,12 +3976,14 @@ let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift] in { (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2), "pslldq\t{$src2, $dst|$dst, $src2}", [(set VR128:$dst, - (int_x86_sse2_psll_dq_bs VR128:$src1, imm:$src2))]>; + (int_x86_sse2_psll_dq_bs VR128:$src1, imm:$src2))], + IIC_SSE_INTSHDQ_P_RI>; def PSRLDQri : PDIi8<0x73, MRM3r, (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2), "psrldq\t{$src2, $dst|$dst, $src2}", [(set VR128:$dst, - (int_x86_sse2_psrl_dq_bs VR128:$src1, imm:$src2))]>; + (int_x86_sse2_psrl_dq_bs VR128:$src1, imm:$src2))], + IIC_SSE_INTSHDQ_P_RI>; // PSRADQri doesn't exist in SSE[1-3]. 
} } // Constraints = "$src1 = $dst" @@ -4077,14 +4067,14 @@ let Predicates = [HasAVX] in { "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR128:$dst, (vt128 (OpNode VR128:$src1, (i8 imm:$src2))))], - IIC_SSE_PSHUF>, VEX, Sched<[WriteShuffle]>; + IIC_SSE_PSHUF_RI>, VEX, Sched<[WriteShuffle]>; def V#NAME#mi : Ii8<0x70, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src1, i8imm:$src2), !strconcat("v", OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR128:$dst, - (vt128 (OpNode (bitconvert (memopv2i64 addr:$src1)), - (i8 imm:$src2))))], IIC_SSE_PSHUF>, VEX, + (vt128 (OpNode (bitconvert (loadv2i64 addr:$src1)), + (i8 imm:$src2))))], IIC_SSE_PSHUF_MI>, VEX, Sched<[WriteShuffleLd]>; } @@ -4095,14 +4085,14 @@ let Predicates = [HasAVX2] in { "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR256:$dst, (vt256 (OpNode VR256:$src1, (i8 imm:$src2))))], - IIC_SSE_PSHUF>, VEX, VEX_L, Sched<[WriteShuffle]>; + IIC_SSE_PSHUF_RI>, VEX, VEX_L, Sched<[WriteShuffle]>; def V#NAME#Ymi : Ii8<0x70, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src1, i8imm:$src2), !strconcat("v", OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR256:$dst, - (vt256 (OpNode (bitconvert (memopv4i64 addr:$src1)), - (i8 imm:$src2))))], IIC_SSE_PSHUF>, VEX, VEX_L, + (vt256 (OpNode (bitconvert (loadv4i64 addr:$src1)), + (i8 imm:$src2))))], IIC_SSE_PSHUF_MI>, VEX, VEX_L, Sched<[WriteShuffleLd]>; } @@ -4113,14 +4103,14 @@ let Predicates = [UseSSE2] in { "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR128:$dst, (vt128 (OpNode VR128:$src1, (i8 imm:$src2))))], - IIC_SSE_PSHUF>, Sched<[WriteShuffle]>; + IIC_SSE_PSHUF_RI>, Sched<[WriteShuffle]>; def mi : Ii8<0x70, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src1, i8imm:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR128:$dst, (vt128 (OpNode (bitconvert (memopv2i64 addr:$src1)), - (i8 imm:$src2))))], IIC_SSE_PSHUF>, + (i8 imm:$src2))))], IIC_SSE_PSHUF_MI>, Sched<[WriteShuffleLd]>; } } @@ -4131,7 +4121,7 @@ defm PSHUFHW : sse2_pshuffle<"pshufhw", v8i16, v16i16, X86PShufhw>, XS; defm PSHUFLW : sse2_pshuffle<"pshuflw", v8i16, v16i16, X86PShuflw>, XD; let Predicates = [HasAVX] in { - def : Pat<(v4f32 (X86PShufd (memopv4f32 addr:$src1), (i8 imm:$imm))), + def : Pat<(v4f32 (X86PShufd (loadv4f32 addr:$src1), (i8 imm:$imm))), (VPSHUFDmi addr:$src1, imm:$imm)>; def : Pat<(v4f32 (X86PShufd VR128:$src1, (i8 imm:$imm))), (VPSHUFDri VR128:$src1, imm:$imm)>; @@ -4254,13 +4244,13 @@ let ExeDomain = SSEPackedInt in { multiclass sse2_pinsrw<bit Is2Addr = 1> { def rri : Ii8<0xC4, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, - GR32:$src2, i32i8imm:$src3), + GR32orGR64:$src2, i32i8imm:$src3), !if(Is2Addr, "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}", "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set VR128:$dst, - (X86pinsrw VR128:$src1, GR32:$src2, imm:$src3))], IIC_SSE_PINSRW>, - Sched<[WriteShuffle]>; + (X86pinsrw VR128:$src1, GR32orGR64:$src2, imm:$src3))], + IIC_SSE_PINSRW>, Sched<[WriteShuffle]>; def rmi : Ii8<0xC4, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, i16mem:$src2, i32i8imm:$src3), @@ -4276,29 +4266,24 @@ multiclass sse2_pinsrw<bit Is2Addr = 1> { // Extract let Predicates = [HasAVX] in def VPEXTRWri : Ii8<0xC5, MRMSrcReg, - (outs GR32:$dst), (ins VR128:$src1, i32i8imm:$src2), + (outs GR32orGR64:$dst), (ins VR128:$src1, i32i8imm:$src2), "vpextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}", - [(set GR32:$dst, (X86pextrw (v8i16 VR128:$src1), - imm:$src2))]>, TB, OpSize, VEX, + [(set GR32orGR64:$dst, 
(X86pextrw (v8i16 VR128:$src1), + imm:$src2))]>, TB, OpSize, VEX, Sched<[WriteShuffle]>; def PEXTRWri : PDIi8<0xC5, MRMSrcReg, - (outs GR32:$dst), (ins VR128:$src1, i32i8imm:$src2), + (outs GR32orGR64:$dst), (ins VR128:$src1, i32i8imm:$src2), "pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}", - [(set GR32:$dst, (X86pextrw (v8i16 VR128:$src1), - imm:$src2))], IIC_SSE_PEXTRW>, + [(set GR32orGR64:$dst, (X86pextrw (v8i16 VR128:$src1), + imm:$src2))], IIC_SSE_PEXTRW>, Sched<[WriteShuffleLd, ReadAfterLd]>; // Insert -let Predicates = [HasAVX] in { - defm VPINSRW : sse2_pinsrw<0>, TB, OpSize, VEX_4V; - def VPINSRWrr64i : Ii8<0xC4, MRMSrcReg, (outs VR128:$dst), - (ins VR128:$src1, GR64:$src2, i32i8imm:$src3), - "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", - []>, TB, OpSize, VEX_4V, Sched<[WriteShuffle]>; -} +let Predicates = [HasAVX] in +defm VPINSRW : sse2_pinsrw<0>, TB, OpSize, VEX_4V; -let Constraints = "$src1 = $dst" in - defm PINSRW : sse2_pinsrw, TB, OpSize, Requires<[UseSSE2]>; +let Predicates = [UseSSE2], Constraints = "$src1 = $dst" in +defm PINSRW : sse2_pinsrw, TB, OpSize; } // ExeDomain = SSEPackedInt @@ -4308,24 +4293,23 @@ let Constraints = "$src1 = $dst" in let ExeDomain = SSEPackedInt, SchedRW = [WriteVecLogic] in { -def VPMOVMSKBrr : VPDI<0xD7, MRMSrcReg, (outs GR32:$dst), (ins VR128:$src), +def VPMOVMSKBrr : VPDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst), + (ins VR128:$src), "pmovmskb\t{$src, $dst|$dst, $src}", - [(set GR32:$dst, (int_x86_sse2_pmovmskb_128 VR128:$src))], + [(set GR32orGR64:$dst, (int_x86_sse2_pmovmskb_128 VR128:$src))], IIC_SSE_MOVMSK>, VEX; -def VPMOVMSKBr64r : VPDI<0xD7, MRMSrcReg, (outs GR64:$dst), (ins VR128:$src), - "pmovmskb\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVMSK>, VEX; let Predicates = [HasAVX2] in { -def VPMOVMSKBYrr : VPDI<0xD7, MRMSrcReg, (outs GR32:$dst), (ins VR256:$src), +def VPMOVMSKBYrr : VPDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst), + (ins VR256:$src), "pmovmskb\t{$src, $dst|$dst, $src}", - [(set GR32:$dst, (int_x86_avx2_pmovmskb VR256:$src))]>, VEX, VEX_L; -def VPMOVMSKBYr64r : VPDI<0xD7, MRMSrcReg, (outs GR64:$dst), (ins VR256:$src), - "pmovmskb\t{$src, $dst|$dst, $src}", []>, VEX, VEX_L; + [(set GR32orGR64:$dst, (int_x86_avx2_pmovmskb VR256:$src))]>, + VEX, VEX_L; } -def PMOVMSKBrr : PDI<0xD7, MRMSrcReg, (outs GR32:$dst), (ins VR128:$src), +def PMOVMSKBrr : PDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst), (ins VR128:$src), "pmovmskb\t{$src, $dst|$dst, $src}", - [(set GR32:$dst, (int_x86_sse2_pmovmskb_128 VR128:$src))], + [(set GR32orGR64:$dst, (int_x86_sse2_pmovmskb_128 VR128:$src))], IIC_SSE_MOVMSK>; } // ExeDomain = SSEPackedInt @@ -4336,25 +4320,25 @@ def PMOVMSKBrr : PDI<0xD7, MRMSrcReg, (outs GR32:$dst), (ins VR128:$src), let ExeDomain = SSEPackedInt, SchedRW = [WriteStore] in { -let Uses = [EDI] in +let Uses = [EDI], Predicates = [HasAVX,In32BitMode] in def VMASKMOVDQU : VPDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask), "maskmovdqu\t{$mask, $src|$src, $mask}", [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)], IIC_SSE_MASKMOV>, VEX; -let Uses = [RDI] in +let Uses = [RDI], Predicates = [HasAVX,In64BitMode] in def VMASKMOVDQU64 : VPDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask), "maskmovdqu\t{$mask, $src|$src, $mask}", [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)], IIC_SSE_MASKMOV>, VEX; -let Uses = [EDI] in +let Uses = [EDI], Predicates = [UseSSE2,In32BitMode] in def MASKMOVDQU : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask), "maskmovdqu\t{$mask, $src|$src, 
$mask}", [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)], IIC_SSE_MASKMOV>; -let Uses = [RDI] in +let Uses = [RDI], Predicates = [UseSSE2,In64BitMode] in def MASKMOVDQU64 : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask), "maskmovdqu\t{$mask, $src|$src, $mask}", [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)], @@ -4369,43 +4353,45 @@ def MASKMOVDQU64 : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask), //===---------------------------------------------------------------------===// // Move Int Doubleword to Packed Double Int // -def VMOVDI2PDIrr : VPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src), +def VMOVDI2PDIrr : VS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src), "movd\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (v4i32 (scalar_to_vector GR32:$src)))], IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>; -def VMOVDI2PDIrm : VPDI<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src), +def VMOVDI2PDIrm : VS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src), "movd\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (v4i32 (scalar_to_vector (loadi32 addr:$src))))], IIC_SSE_MOVDQ>, VEX, Sched<[WriteLoad]>; -def VMOV64toPQIrr : VRPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src), - "mov{d|q}\t{$src, $dst|$dst, $src}", +def VMOV64toPQIrr : VRS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src), + "movq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (v2i64 (scalar_to_vector GR64:$src)))], IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>; -def VMOV64toSDrr : VRPDI<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src), - "mov{d|q}\t{$src, $dst|$dst, $src}", +let isCodeGenOnly = 1 in +def VMOV64toSDrr : VRS2I<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src), + "movq\t{$src, $dst|$dst, $src}", [(set FR64:$dst, (bitconvert GR64:$src))], IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>; -def MOVDI2PDIrr : PDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src), +def MOVDI2PDIrr : S2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src), "movd\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (v4i32 (scalar_to_vector GR32:$src)))], IIC_SSE_MOVDQ>, Sched<[WriteMove]>; -def MOVDI2PDIrm : PDI<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src), +def MOVDI2PDIrm : S2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src), "movd\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (v4i32 (scalar_to_vector (loadi32 addr:$src))))], IIC_SSE_MOVDQ>, Sched<[WriteLoad]>; -def MOV64toPQIrr : RPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src), +def MOV64toPQIrr : RS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src), "mov{d|q}\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (v2i64 (scalar_to_vector GR64:$src)))], IIC_SSE_MOVDQ>, Sched<[WriteMove]>; -def MOV64toSDrr : RPDI<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src), +let isCodeGenOnly = 1 in +def MOV64toSDrr : RS2I<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src), "mov{d|q}\t{$src, $dst|$dst, $src}", [(set FR64:$dst, (bitconvert GR64:$src))], IIC_SSE_MOVDQ>, Sched<[WriteMove]>; @@ -4413,63 +4399,77 @@ def MOV64toSDrr : RPDI<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src), //===---------------------------------------------------------------------===// // Move Int Doubleword to Single Scalar // -def VMOVDI2SSrr : VPDI<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src), - "movd\t{$src, $dst|$dst, $src}", - [(set FR32:$dst, (bitconvert GR32:$src))], - IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>; - -def VMOVDI2SSrm : VPDI<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src), - "movd\t{$src, $dst|$dst, $src}", - [(set FR32:$dst, 
(bitconvert (loadi32 addr:$src)))], - IIC_SSE_MOVDQ>, - VEX, Sched<[WriteLoad]>; -def MOVDI2SSrr : PDI<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src), - "movd\t{$src, $dst|$dst, $src}", - [(set FR32:$dst, (bitconvert GR32:$src))], - IIC_SSE_MOVDQ>, Sched<[WriteMove]>; - -def MOVDI2SSrm : PDI<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src), - "movd\t{$src, $dst|$dst, $src}", - [(set FR32:$dst, (bitconvert (loadi32 addr:$src)))], - IIC_SSE_MOVDQ>, Sched<[WriteLoad]>; +let isCodeGenOnly = 1 in { + def VMOVDI2SSrr : VS2I<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src), + "movd\t{$src, $dst|$dst, $src}", + [(set FR32:$dst, (bitconvert GR32:$src))], + IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>; + + def VMOVDI2SSrm : VS2I<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src), + "movd\t{$src, $dst|$dst, $src}", + [(set FR32:$dst, (bitconvert (loadi32 addr:$src)))], + IIC_SSE_MOVDQ>, + VEX, Sched<[WriteLoad]>; + def MOVDI2SSrr : S2I<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src), + "movd\t{$src, $dst|$dst, $src}", + [(set FR32:$dst, (bitconvert GR32:$src))], + IIC_SSE_MOVDQ>, Sched<[WriteMove]>; + + def MOVDI2SSrm : S2I<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src), + "movd\t{$src, $dst|$dst, $src}", + [(set FR32:$dst, (bitconvert (loadi32 addr:$src)))], + IIC_SSE_MOVDQ>, Sched<[WriteLoad]>; +} //===---------------------------------------------------------------------===// // Move Packed Doubleword Int to Packed Double Int // -def VMOVPDI2DIrr : VPDI<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src), +def VMOVPDI2DIrr : VS2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src), "movd\t{$src, $dst|$dst, $src}", [(set GR32:$dst, (vector_extract (v4i32 VR128:$src), (iPTR 0)))], IIC_SSE_MOVD_ToGP>, VEX, Sched<[WriteMove]>; -def VMOVPDI2DImr : VPDI<0x7E, MRMDestMem, (outs), +def VMOVPDI2DImr : VS2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR128:$src), "movd\t{$src, $dst|$dst, $src}", [(store (i32 (vector_extract (v4i32 VR128:$src), (iPTR 0))), addr:$dst)], IIC_SSE_MOVDQ>, VEX, Sched<[WriteLoad]>; -def MOVPDI2DIrr : PDI<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src), +def MOVPDI2DIrr : S2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src), "movd\t{$src, $dst|$dst, $src}", [(set GR32:$dst, (vector_extract (v4i32 VR128:$src), (iPTR 0)))], IIC_SSE_MOVD_ToGP>, Sched<[WriteMove]>; -def MOVPDI2DImr : PDI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR128:$src), +def MOVPDI2DImr : S2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR128:$src), "movd\t{$src, $dst|$dst, $src}", [(store (i32 (vector_extract (v4i32 VR128:$src), (iPTR 0))), addr:$dst)], IIC_SSE_MOVDQ>, Sched<[WriteLoad]>; +def : Pat<(v8i32 (X86Vinsert (v8i32 immAllZerosV), GR32:$src2, (iPTR 0))), + (SUBREG_TO_REG (i32 0), (VMOVDI2PDIrr GR32:$src2), sub_xmm)>; + +def : Pat<(v4i64 (X86Vinsert (bc_v4i64 (v8i32 immAllZerosV)), GR64:$src2, (iPTR 0))), + (SUBREG_TO_REG (i32 0), (VMOV64toPQIrr GR64:$src2), sub_xmm)>; + +def : Pat<(v8i32 (X86Vinsert undef, GR32:$src2, (iPTR 0))), + (SUBREG_TO_REG (i32 0), (VMOVDI2PDIrr GR32:$src2), sub_xmm)>; + +def : Pat<(v4i64 (X86Vinsert undef, GR64:$src2, (iPTR 0))), + (SUBREG_TO_REG (i32 0), (VMOV64toPQIrr GR64:$src2), sub_xmm)>; + //===---------------------------------------------------------------------===// // Move Packed Doubleword Int first element to Doubleword Int // let SchedRW = [WriteMove] in { -def VMOVPQIto64rr : VRPDI<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src), - "mov{d|q}\t{$src, $dst|$dst, $src}", +def VMOVPQIto64rr : VRS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins 
VR128:$src), + "movq\t{$src, $dst|$dst, $src}", [(set GR64:$dst, (vector_extract (v2i64 VR128:$src), (iPTR 0)))], IIC_SSE_MOVD_ToGP>, VEX; -def MOVPQIto64rr : RPDI<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src), +def MOVPQIto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src), "mov{d|q}\t{$src, $dst|$dst, $src}", [(set GR64:$dst, (vector_extract (v2i64 VR128:$src), (iPTR 0)))], @@ -4479,121 +4479,112 @@ def MOVPQIto64rr : RPDI<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src), //===---------------------------------------------------------------------===// // Bitcast FR64 <-> GR64 // -let Predicates = [HasAVX] in -def VMOV64toSDrm : S2SI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src), - "vmovq\t{$src, $dst|$dst, $src}", - [(set FR64:$dst, (bitconvert (loadi64 addr:$src)))]>, - VEX, Sched<[WriteLoad]>; -def VMOVSDto64rr : VRPDI<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src), +let isCodeGenOnly = 1 in { + let Predicates = [UseAVX] in + def VMOV64toSDrm : VS2SI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src), + "movq\t{$src, $dst|$dst, $src}", + [(set FR64:$dst, (bitconvert (loadi64 addr:$src)))]>, + VEX, Sched<[WriteLoad]>; + def VMOVSDto64rr : VRS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src), + "movq\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, (bitconvert FR64:$src))], + IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>; + def VMOVSDto64mr : VRS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src), + "movq\t{$src, $dst|$dst, $src}", + [(store (i64 (bitconvert FR64:$src)), addr:$dst)], + IIC_SSE_MOVDQ>, VEX, Sched<[WriteStore]>; + + def MOV64toSDrm : S2SI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src), + "movq\t{$src, $dst|$dst, $src}", + [(set FR64:$dst, (bitconvert (loadi64 addr:$src)))], + IIC_SSE_MOVDQ>, Sched<[WriteLoad]>; + def MOVSDto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src), "mov{d|q}\t{$src, $dst|$dst, $src}", [(set GR64:$dst, (bitconvert FR64:$src))], - IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>; -def VMOVSDto64mr : VRPDI<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src), + IIC_SSE_MOVD_ToGP>, Sched<[WriteMove]>; + def MOVSDto64mr : RS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src), "movq\t{$src, $dst|$dst, $src}", [(store (i64 (bitconvert FR64:$src)), addr:$dst)], - IIC_SSE_MOVDQ>, VEX, Sched<[WriteStore]>; - -def MOV64toSDrm : S2SI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src), - "movq\t{$src, $dst|$dst, $src}", - [(set FR64:$dst, (bitconvert (loadi64 addr:$src)))], - IIC_SSE_MOVDQ>, Sched<[WriteLoad]>; -def MOVSDto64rr : RPDI<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src), - "mov{d|q}\t{$src, $dst|$dst, $src}", - [(set GR64:$dst, (bitconvert FR64:$src))], - IIC_SSE_MOVD_ToGP>, Sched<[WriteMove]>; -def MOVSDto64mr : RPDI<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src), - "movq\t{$src, $dst|$dst, $src}", - [(store (i64 (bitconvert FR64:$src)), addr:$dst)], - IIC_SSE_MOVDQ>, Sched<[WriteStore]>; + IIC_SSE_MOVDQ>, Sched<[WriteStore]>; +} //===---------------------------------------------------------------------===// // Move Scalar Single to Double Int // -def VMOVSS2DIrr : VPDI<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src), - "movd\t{$src, $dst|$dst, $src}", - [(set GR32:$dst, (bitconvert FR32:$src))], - IIC_SSE_MOVD_ToGP>, VEX, Sched<[WriteMove]>; -def VMOVSS2DImr : VPDI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32:$src), - "movd\t{$src, $dst|$dst, $src}", - [(store (i32 (bitconvert FR32:$src)), addr:$dst)], - IIC_SSE_MOVDQ>, VEX, Sched<[WriteStore]>; -def MOVSS2DIrr : 
PDI<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src), - "movd\t{$src, $dst|$dst, $src}", - [(set GR32:$dst, (bitconvert FR32:$src))], - IIC_SSE_MOVD_ToGP>, Sched<[WriteMove]>; -def MOVSS2DImr : PDI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32:$src), - "movd\t{$src, $dst|$dst, $src}", - [(store (i32 (bitconvert FR32:$src)), addr:$dst)], - IIC_SSE_MOVDQ>, Sched<[WriteStore]>; +let isCodeGenOnly = 1 in { + def VMOVSS2DIrr : VS2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src), + "movd\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (bitconvert FR32:$src))], + IIC_SSE_MOVD_ToGP>, VEX, Sched<[WriteMove]>; + def VMOVSS2DImr : VS2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32:$src), + "movd\t{$src, $dst|$dst, $src}", + [(store (i32 (bitconvert FR32:$src)), addr:$dst)], + IIC_SSE_MOVDQ>, VEX, Sched<[WriteStore]>; + def MOVSS2DIrr : S2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src), + "movd\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (bitconvert FR32:$src))], + IIC_SSE_MOVD_ToGP>, Sched<[WriteMove]>; + def MOVSS2DImr : S2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32:$src), + "movd\t{$src, $dst|$dst, $src}", + [(store (i32 (bitconvert FR32:$src)), addr:$dst)], + IIC_SSE_MOVDQ>, Sched<[WriteStore]>; +} //===---------------------------------------------------------------------===// // Patterns and instructions to describe movd/movq to XMM register zero-extends // -let SchedRW = [WriteMove] in { +let isCodeGenOnly = 1, SchedRW = [WriteMove] in { let AddedComplexity = 15 in { -def VMOVZDI2PDIrr : VPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src), - "movd\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (v4i32 (X86vzmovl - (v4i32 (scalar_to_vector GR32:$src)))))], - IIC_SSE_MOVDQ>, VEX; -def VMOVZQI2PQIrr : VPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src), - "mov{d|q}\t{$src, $dst|$dst, $src}", // X86-64 only +def VMOVZQI2PQIrr : VS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src), + "movq\t{$src, $dst|$dst, $src}", // X86-64 only [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))))], IIC_SSE_MOVDQ>, VEX, VEX_W; -} -let AddedComplexity = 15 in { -def MOVZDI2PDIrr : PDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src), - "movd\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (v4i32 (X86vzmovl - (v4i32 (scalar_to_vector GR32:$src)))))], - IIC_SSE_MOVDQ>; -def MOVZQI2PQIrr : RPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src), +def MOVZQI2PQIrr : RS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src), "mov{d|q}\t{$src, $dst|$dst, $src}", // X86-64 only [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))))], IIC_SSE_MOVDQ>; } -} // SchedRW +} // isCodeGenOnly, SchedRW -let AddedComplexity = 20, SchedRW = [WriteLoad] in { -def VMOVZDI2PDIrm : VPDI<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src), - "movd\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, - (v4i32 (X86vzmovl (v4i32 (scalar_to_vector - (loadi32 addr:$src))))))], - IIC_SSE_MOVDQ>, VEX; -def MOVZDI2PDIrm : PDI<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src), - "movd\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, - (v4i32 (X86vzmovl (v4i32 (scalar_to_vector - (loadi32 addr:$src))))))], - IIC_SSE_MOVDQ>; -} // AddedComplexity, SchedRW +let Predicates = [UseAVX] in { + let AddedComplexity = 15 in + def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))), + (VMOVDI2PDIrr GR32:$src)>; -let Predicates = [HasAVX] in { // AVX 128-bit movd/movq instruction write zeros in the high 128-bit part. 
let AddedComplexity = 20 in { + def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))), + (VMOVDI2PDIrm addr:$src)>; def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv4f32 addr:$src)))), - (VMOVZDI2PDIrm addr:$src)>; + (VMOVDI2PDIrm addr:$src)>; def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))), - (VMOVZDI2PDIrm addr:$src)>; + (VMOVDI2PDIrm addr:$src)>; } // Use regular 128-bit instructions to match 256-bit scalar_to_vec+zext. def : Pat<(v8i32 (X86vzmovl (insert_subvector undef, (v4i32 (scalar_to_vector GR32:$src)),(iPTR 0)))), - (SUBREG_TO_REG (i32 0), (VMOVZDI2PDIrr GR32:$src), sub_xmm)>; + (SUBREG_TO_REG (i32 0), (VMOVDI2PDIrr GR32:$src), sub_xmm)>; def : Pat<(v4i64 (X86vzmovl (insert_subvector undef, (v2i64 (scalar_to_vector GR64:$src)),(iPTR 0)))), (SUBREG_TO_REG (i64 0), (VMOVZQI2PQIrr GR64:$src), sub_xmm)>; } -let Predicates = [UseSSE2], AddedComplexity = 20 in { - def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv4f32 addr:$src)))), - (MOVZDI2PDIrm addr:$src)>; - def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))), - (MOVZDI2PDIrm addr:$src)>; +let Predicates = [UseSSE2] in { + let AddedComplexity = 15 in + def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))), + (MOVDI2PDIrr GR32:$src)>; + + let AddedComplexity = 20 in { + def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))), + (MOVDI2PDIrm addr:$src)>; + def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv4f32 addr:$src)))), + (MOVDI2PDIrm addr:$src)>; + def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))), + (MOVDI2PDIrm addr:$src)>; + } } // These are the correct encodings of the instructions so that we know how to @@ -4602,15 +4593,12 @@ let Predicates = [UseSSE2], AddedComplexity = 20 in { def : InstAlias<"movq\t{$src, $dst|$dst, $src}", (MOV64toPQIrr VR128:$dst, GR64:$src), 0>; def : InstAlias<"movq\t{$src, $dst|$dst, $src}", - (MOV64toSDrr FR64:$dst, GR64:$src), 0>; -def : InstAlias<"movq\t{$src, $dst|$dst, $src}", (MOVPQIto64rr GR64:$dst, VR128:$src), 0>; -def : InstAlias<"movq\t{$src, $dst|$dst, $src}", - (MOVSDto64rr GR64:$dst, FR64:$src), 0>; -def : InstAlias<"movq\t{$src, $dst|$dst, $src}", - (VMOVZQI2PQIrr VR128:$dst, GR64:$src), 0>; -def : InstAlias<"movq\t{$src, $dst|$dst, $src}", - (MOVZQI2PQIrr VR128:$dst, GR64:$src), 0>; +// Allow "vmovd" but print "vmovq" since we don't need compatibility for AVX. 
+def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}", + (VMOV64toPQIrr VR128:$dst, GR64:$src), 0>; +def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}", + (VMOVPQIto64rr GR64:$dst, VR128:$src), 0>; //===---------------------------------------------------------------------===// // SSE2 - Move Quadword @@ -4625,7 +4613,7 @@ def VMOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), "vmovq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>, XS, - VEX, Requires<[HasAVX]>; + VEX, Requires<[UseAVX]>; def MOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), "movq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, @@ -4638,12 +4626,12 @@ def MOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), // Move Packed Quadword Int to Quadword Int // let SchedRW = [WriteStore] in { -def VMOVPQI2QImr : VPDI<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src), +def VMOVPQI2QImr : VS2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src), "movq\t{$src, $dst|$dst, $src}", [(store (i64 (vector_extract (v2i64 VR128:$src), (iPTR 0))), addr:$dst)], IIC_SSE_MOVDQ>, VEX; -def MOVPQI2QImr : PDI<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src), +def MOVPQI2QImr : S2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src), "movq\t{$src, $dst|$dst, $src}", [(store (i64 (vector_extract (v2i64 VR128:$src), (iPTR 0))), addr:$dst)], @@ -4653,25 +4641,24 @@ def MOVPQI2QImr : PDI<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src), //===---------------------------------------------------------------------===// // Store / copy lower 64-bits of a XMM register. // -def VMOVLQ128mr : VPDI<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src), +def VMOVLQ128mr : VS2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src), "movq\t{$src, $dst|$dst, $src}", [(int_x86_sse2_storel_dq addr:$dst, VR128:$src)]>, VEX, Sched<[WriteStore]>; -def MOVLQ128mr : PDI<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src), +def MOVLQ128mr : S2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src), "movq\t{$src, $dst|$dst, $src}", [(int_x86_sse2_storel_dq addr:$dst, VR128:$src)], IIC_SSE_MOVDQ>, Sched<[WriteStore]>; -let AddedComplexity = 20 in +let isCodeGenOnly = 1, AddedComplexity = 20 in { def VMOVZQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), "vmovq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 (scalar_to_vector (loadi64 addr:$src))))))], IIC_SSE_MOVDQ>, - XS, VEX, Requires<[HasAVX]>, Sched<[WriteLoad]>; + XS, VEX, Requires<[UseAVX]>, Sched<[WriteLoad]>; -let AddedComplexity = 20 in def MOVZQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), "movq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, @@ -4679,10 +4666,9 @@ def MOVZQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), (loadi64 addr:$src))))))], IIC_SSE_MOVDQ>, XS, Requires<[UseSSE2]>, Sched<[WriteLoad]>; +} -let Predicates = [HasAVX], AddedComplexity = 20 in { - def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))), - (VMOVZQI2PQIrm addr:$src)>; +let Predicates = [UseAVX], AddedComplexity = 20 in { def : Pat<(v2i64 (X86vzmovl (bc_v2i64 (loadv4f32 addr:$src)))), (VMOVZQI2PQIrm addr:$src)>; def : Pat<(v2i64 (X86vzload addr:$src)), @@ -4690,8 +4676,6 @@ let Predicates = [HasAVX], AddedComplexity = 20 in { } let Predicates = [UseSSE2], AddedComplexity = 20 in { - def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))), - (MOVZQI2PQIrm addr:$src)>; def : Pat<(v2i64 (X86vzmovl (bc_v2i64 (loadv4f32 addr:$src)))), 
(MOVZQI2PQIrm addr:$src)>; def : Pat<(v2i64 (X86vzload addr:$src)), (MOVZQI2PQIrm addr:$src)>; @@ -4714,7 +4698,7 @@ def VMOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "vmovq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))], IIC_SSE_MOVQ_RR>, - XS, VEX, Requires<[HasAVX]>; + XS, VEX, Requires<[UseAVX]>; let AddedComplexity = 15 in def MOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "movq\t{$src, $dst|$dst, $src}", @@ -4723,14 +4707,14 @@ def MOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), XS, Requires<[UseSSE2]>; } // SchedRW -let SchedRW = [WriteVecLogicLd] in { +let isCodeGenOnly = 1, SchedRW = [WriteVecLogicLd] in { let AddedComplexity = 20 in def VMOVZPQILo2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), "vmovq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (v2i64 (X86vzmovl (loadv2i64 addr:$src))))], IIC_SSE_MOVDQ>, - XS, VEX, Requires<[HasAVX]>; + XS, VEX, Requires<[UseAVX]>; let AddedComplexity = 20 in { def MOVZPQILo2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), "movq\t{$src, $dst|$dst, $src}", @@ -4739,49 +4723,19 @@ def MOVZPQILo2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), IIC_SSE_MOVDQ>, XS, Requires<[UseSSE2]>; } -} // SchedRW +} // isCodeGenOnly, SchedRW let AddedComplexity = 20 in { - let Predicates = [HasAVX] in { - def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))), - (VMOVZPQILo2PQIrm addr:$src)>; + let Predicates = [UseAVX] in { def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))), (VMOVZPQILo2PQIrr VR128:$src)>; } let Predicates = [UseSSE2] in { - def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))), - (MOVZPQILo2PQIrm addr:$src)>; def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))), (MOVZPQILo2PQIrr VR128:$src)>; } } -// Instructions to match in the assembler -let SchedRW = [WriteMove] in { -def VMOVQs64rr : VPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src), - "movq\t{$src, $dst|$dst, $src}", [], - IIC_SSE_MOVDQ>, VEX, VEX_W; -def VMOVQd64rr : VPDI<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src), - "movq\t{$src, $dst|$dst, $src}", [], - IIC_SSE_MOVDQ>, VEX, VEX_W; -// Recognize "movd" with GR64 destination, but encode as a "movq" -def VMOVQd64rr_alt : VPDI<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src), - "movd\t{$src, $dst|$dst, $src}", [], - IIC_SSE_MOVDQ>, VEX, VEX_W; -} // SchedRW - -// Instructions for the disassembler -// xr = XMM register -// xm = mem64 - -let SchedRW = [WriteMove] in { -let Predicates = [HasAVX] in -def VMOVQxrxr: I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), - "vmovq\t{$src, $dst|$dst, $src}", []>, VEX, XS; -def MOVQxrxr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), - "movq\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVQ_RR>, XS; -} // SchedRW - //===---------------------------------------------------------------------===// // SSE3 - Replicate Single FP - MOVSHDUP and MOVSLDUP //===---------------------------------------------------------------------===// @@ -4800,13 +4754,13 @@ def rm : S3SI<op, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), let Predicates = [HasAVX] in { defm VMOVSHDUP : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup", - v4f32, VR128, memopv4f32, f128mem>, VEX; + v4f32, VR128, loadv4f32, f128mem>, VEX; defm VMOVSLDUP : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup", - v4f32, VR128, memopv4f32, f128mem>, VEX; + v4f32, VR128, loadv4f32, f128mem>, VEX; defm VMOVSHDUPY : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup", - 
v8f32, VR256, memopv8f32, f256mem>, VEX, VEX_L; + v8f32, VR256, loadv8f32, f256mem>, VEX, VEX_L; defm VMOVSLDUPY : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup", - v8f32, VR256, memopv8f32, f256mem>, VEX, VEX_L; + v8f32, VR256, loadv8f32, f256mem>, VEX, VEX_L; } defm MOVSHDUP : sse3_replicate_sfp<0x16, X86Movshdup, "movshdup", v4f32, VR128, memopv4f32, f128mem>; @@ -4816,19 +4770,19 @@ defm MOVSLDUP : sse3_replicate_sfp<0x12, X86Movsldup, "movsldup", v4f32, VR128, let Predicates = [HasAVX] in { def : Pat<(v4i32 (X86Movshdup VR128:$src)), (VMOVSHDUPrr VR128:$src)>; - def : Pat<(v4i32 (X86Movshdup (bc_v4i32 (memopv2i64 addr:$src)))), + def : Pat<(v4i32 (X86Movshdup (bc_v4i32 (loadv2i64 addr:$src)))), (VMOVSHDUPrm addr:$src)>; def : Pat<(v4i32 (X86Movsldup VR128:$src)), (VMOVSLDUPrr VR128:$src)>; - def : Pat<(v4i32 (X86Movsldup (bc_v4i32 (memopv2i64 addr:$src)))), + def : Pat<(v4i32 (X86Movsldup (bc_v4i32 (loadv2i64 addr:$src)))), (VMOVSLDUPrm addr:$src)>; def : Pat<(v8i32 (X86Movshdup VR256:$src)), (VMOVSHDUPYrr VR256:$src)>; - def : Pat<(v8i32 (X86Movshdup (bc_v8i32 (memopv4i64 addr:$src)))), + def : Pat<(v8i32 (X86Movshdup (bc_v8i32 (loadv4i64 addr:$src)))), (VMOVSHDUPYrm addr:$src)>; def : Pat<(v8i32 (X86Movsldup VR256:$src)), (VMOVSLDUPYrr VR256:$src)>; - def : Pat<(v8i32 (X86Movsldup (bc_v8i32 (memopv4i64 addr:$src)))), + def : Pat<(v8i32 (X86Movsldup (bc_v8i32 (loadv4i64 addr:$src)))), (VMOVSLDUPYrm addr:$src)>; } @@ -4882,20 +4836,20 @@ let Predicates = [HasAVX] in { defm MOVDDUP : sse3_replicate_dfp<"movddup">; let Predicates = [HasAVX] in { - def : Pat<(X86Movddup (memopv2f64 addr:$src)), + def : Pat<(X86Movddup (loadv2f64 addr:$src)), (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>; - def : Pat<(X86Movddup (bc_v2f64 (memopv4f32 addr:$src))), + def : Pat<(X86Movddup (bc_v2f64 (loadv4f32 addr:$src))), (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>; - def : Pat<(X86Movddup (bc_v2f64 (memopv2i64 addr:$src))), + def : Pat<(X86Movddup (bc_v2f64 (loadv2i64 addr:$src))), (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>; def : Pat<(X86Movddup (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src))))), (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>; // 256-bit version - def : Pat<(X86Movddup (memopv4f64 addr:$src)), + def : Pat<(X86Movddup (loadv4f64 addr:$src)), (VMOVDDUPYrm addr:$src)>; - def : Pat<(X86Movddup (memopv4i64 addr:$src)), + def : Pat<(X86Movddup (loadv4i64 addr:$src)), (VMOVDDUPYrm addr:$src)>; def : Pat<(X86Movddup (v4i64 (scalar_to_vector (loadi64 addr:$src)))), (VMOVDDUPYrm addr:$src)>; @@ -5097,12 +5051,12 @@ multiclass SS3I_unop_rm_int_y<bits<8> opc, string OpcodeStr, // Helper fragments to match sext vXi1 to vXiY. 
def v16i1sextv16i8 : PatLeaf<(v16i8 (X86pcmpgt (bc_v16i8 (v4i32 immAllZerosV)), VR128:$src))>; -def v8i1sextv8i16 : PatLeaf<(v8i16 (X86vsrai VR128:$src, (i32 15)))>; -def v4i1sextv4i32 : PatLeaf<(v4i32 (X86vsrai VR128:$src, (i32 31)))>; +def v8i1sextv8i16 : PatLeaf<(v8i16 (X86vsrai VR128:$src, (i8 15)))>; +def v4i1sextv4i32 : PatLeaf<(v4i32 (X86vsrai VR128:$src, (i8 31)))>; def v32i1sextv32i8 : PatLeaf<(v32i8 (X86pcmpgt (bc_v32i8 (v8i32 immAllZerosV)), VR256:$src))>; -def v16i1sextv16i16: PatLeaf<(v16i16 (X86vsrai VR256:$src, (i32 15)))>; -def v8i1sextv8i32 : PatLeaf<(v8i32 (X86vsrai VR256:$src, (i32 31)))>; +def v16i1sextv16i16: PatLeaf<(v16i16 (X86vsrai VR256:$src, (i8 15)))>; +def v8i1sextv8i32 : PatLeaf<(v8i32 (X86vsrai VR256:$src, (i8 31)))>; let Predicates = [HasAVX] in { defm VPABSB : SS3I_unop_rm_int<0x1C, "vpabsb", @@ -5258,34 +5212,34 @@ multiclass SS3I_binop_rm_int_y<bits<8> opc, string OpcodeStr, !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR256:$dst, (IntId256 VR256:$src1, - (bitconvert (memopv4i64 addr:$src2))))]>, OpSize; + (bitconvert (loadv4i64 addr:$src2))))]>, OpSize; } let ImmT = NoImm, Predicates = [HasAVX] in { let isCommutable = 0 in { defm VPHADDW : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v8i16, VR128, - memopv2i64, i128mem, + loadv2i64, i128mem, SSE_PHADDSUBW, 0>, VEX_4V; defm VPHADDD : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v4i32, VR128, - memopv2i64, i128mem, + loadv2i64, i128mem, SSE_PHADDSUBD, 0>, VEX_4V; defm VPHSUBW : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v8i16, VR128, - memopv2i64, i128mem, + loadv2i64, i128mem, SSE_PHADDSUBW, 0>, VEX_4V; defm VPHSUBD : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v4i32, VR128, - memopv2i64, i128mem, + loadv2i64, i128mem, SSE_PHADDSUBD, 0>, VEX_4V; defm VPSIGNB : SS3I_binop_rm<0x08, "vpsignb", X86psign, v16i8, VR128, - memopv2i64, i128mem, + loadv2i64, i128mem, SSE_PSIGN, 0>, VEX_4V; defm VPSIGNW : SS3I_binop_rm<0x09, "vpsignw", X86psign, v8i16, VR128, - memopv2i64, i128mem, + loadv2i64, i128mem, SSE_PSIGN, 0>, VEX_4V; defm VPSIGND : SS3I_binop_rm<0x0A, "vpsignd", X86psign, v4i32, VR128, - memopv2i64, i128mem, + loadv2i64, i128mem, SSE_PSIGN, 0>, VEX_4V; defm VPSHUFB : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v16i8, VR128, - memopv2i64, i128mem, + loadv2i64, i128mem, SSE_PSHUFB, 0>, VEX_4V; defm VPHADDSW : SS3I_binop_rm_int<0x03, "vphaddsw", int_x86_ssse3_phadd_sw_128, @@ -5305,28 +5259,28 @@ defm VPMULHRSW : SS3I_binop_rm_int<0x0B, "vpmulhrsw", let ImmT = NoImm, Predicates = [HasAVX2] in { let isCommutable = 0 in { defm VPHADDWY : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v16i16, VR256, - memopv4i64, i256mem, + loadv4i64, i256mem, SSE_PHADDSUBW, 0>, VEX_4V, VEX_L; defm VPHADDDY : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v8i32, VR256, - memopv4i64, i256mem, + loadv4i64, i256mem, SSE_PHADDSUBW, 0>, VEX_4V, VEX_L; defm VPHSUBWY : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v16i16, VR256, - memopv4i64, i256mem, + loadv4i64, i256mem, SSE_PHADDSUBW, 0>, VEX_4V, VEX_L; defm VPHSUBDY : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v8i32, VR256, - memopv4i64, i256mem, + loadv4i64, i256mem, SSE_PHADDSUBW, 0>, VEX_4V, VEX_L; defm VPSIGNBY : SS3I_binop_rm<0x08, "vpsignb", X86psign, v32i8, VR256, - memopv4i64, i256mem, + loadv4i64, i256mem, SSE_PHADDSUBW, 0>, VEX_4V, VEX_L; defm VPSIGNWY : SS3I_binop_rm<0x09, "vpsignw", X86psign, v16i16, VR256, - memopv4i64, i256mem, + loadv4i64, i256mem, SSE_PHADDSUBW, 0>, VEX_4V, VEX_L; defm VPSIGNDY : SS3I_binop_rm<0x0A, "vpsignd", X86psign, v8i32, VR256, - memopv4i64, i256mem, + 
loadv4i64, i256mem, SSE_PHADDSUBW, 0>, VEX_4V, VEX_L; defm VPSHUFBY : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v32i8, VR256, - memopv4i64, i256mem, + loadv4i64, i256mem, SSE_PHADDSUBW, 0>, VEX_4V, VEX_L; defm VPHADDSW : SS3I_binop_rm_int_y<0x03, "vphaddsw", int_x86_avx2_phadd_sw>, VEX_4V, VEX_L; @@ -5384,7 +5338,7 @@ multiclass ssse3_palignr<string asm, bit Is2Addr = 1> { !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), !strconcat(asm, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), - [], IIC_SSE_PALIGNR>, OpSize, Sched<[WriteShuffle]>; + [], IIC_SSE_PALIGNRR>, OpSize, Sched<[WriteShuffle]>; let mayLoad = 1 in def R128rm : SS3AI<0x0F, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2, i8imm:$src3), @@ -5392,7 +5346,7 @@ multiclass ssse3_palignr<string asm, bit Is2Addr = 1> { !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), !strconcat(asm, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), - [], IIC_SSE_PALIGNR>, OpSize, Sched<[WriteShuffleLd, ReadAfterLd]>; + [], IIC_SSE_PALIGNRM>, OpSize, Sched<[WriteShuffleLd, ReadAfterLd]>; } } @@ -5472,28 +5426,29 @@ def MWAITrr : I<0x01, MRM_C9, (outs), (ins), "mwait", TB, Requires<[HasSSE3]>; } // SchedRW -def : InstAlias<"mwait %eax, %ecx", (MWAITrr)>, Requires<[In32BitMode]>; -def : InstAlias<"mwait %rax, %rcx", (MWAITrr)>, Requires<[In64BitMode]>; +def : InstAlias<"mwait\t{%eax, %ecx|ecx, eax}", (MWAITrr)>, Requires<[In32BitMode]>; +def : InstAlias<"mwait\t{%rax, %rcx|rcx, rax}", (MWAITrr)>, Requires<[In64BitMode]>; -def : InstAlias<"monitor %eax, %ecx, %edx", (MONITORrrr)>, +def : InstAlias<"monitor\t{%eax, %ecx, %edx|edx, ecx, eax}", (MONITORrrr)>, Requires<[In32BitMode]>; -def : InstAlias<"monitor %rax, %rcx, %rdx", (MONITORrrr)>, +def : InstAlias<"monitor\t{%rax, %rcx, %rdx|rdx, rcx, rax}", (MONITORrrr)>, Requires<[In64BitMode]>; //===----------------------------------------------------------------------===// // SSE4.1 - Packed Move with Sign/Zero Extend //===----------------------------------------------------------------------===// -multiclass SS41I_binop_rm_int8<bits<8> opc, string OpcodeStr, Intrinsic IntId> { +multiclass SS41I_binop_rm_int8<bits<8> opc, string OpcodeStr, Intrinsic IntId, + OpndItins itins = DEFAULT_ITINS> { def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set VR128:$dst, (IntId VR128:$src))]>, OpSize; + [(set VR128:$dst, (IntId VR128:$src))], itins.rr>, OpSize; def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set VR128:$dst, - (IntId (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))]>, - OpSize; + (IntId (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))], + itins.rm>, OpSize; } multiclass SS41I_binop_rm_int16_y<bits<8> opc, string OpcodeStr, @@ -5504,22 +5459,23 @@ multiclass SS41I_binop_rm_int16_y<bits<8> opc, string OpcodeStr, def Yrm : SS48I<opc, MRMSrcMem, (outs VR256:$dst), (ins i128mem:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set VR256:$dst, (IntId (load addr:$src)))]>, OpSize; + [(set VR256:$dst, (IntId (load addr:$src)))]>, + OpSize; } let Predicates = [HasAVX] in { -defm VPMOVSXBW : SS41I_binop_rm_int8<0x20, "vpmovsxbw", int_x86_sse41_pmovsxbw>, - VEX; -defm VPMOVSXWD : SS41I_binop_rm_int8<0x23, "vpmovsxwd", int_x86_sse41_pmovsxwd>, - VEX; -defm VPMOVSXDQ : SS41I_binop_rm_int8<0x25, "vpmovsxdq", int_x86_sse41_pmovsxdq>, - VEX; -defm VPMOVZXBW : 
SS41I_binop_rm_int8<0x30, "vpmovzxbw", int_x86_sse41_pmovzxbw>, - VEX; -defm VPMOVZXWD : SS41I_binop_rm_int8<0x33, "vpmovzxwd", int_x86_sse41_pmovzxwd>, - VEX; -defm VPMOVZXDQ : SS41I_binop_rm_int8<0x35, "vpmovzxdq", int_x86_sse41_pmovzxdq>, - VEX; +defm VPMOVSXBW : SS41I_binop_rm_int8<0x20, "vpmovsxbw", + int_x86_sse41_pmovsxbw>, VEX; +defm VPMOVSXWD : SS41I_binop_rm_int8<0x23, "vpmovsxwd", + int_x86_sse41_pmovsxwd>, VEX; +defm VPMOVSXDQ : SS41I_binop_rm_int8<0x25, "vpmovsxdq", + int_x86_sse41_pmovsxdq>, VEX; +defm VPMOVZXBW : SS41I_binop_rm_int8<0x30, "vpmovzxbw", + int_x86_sse41_pmovzxbw>, VEX; +defm VPMOVZXWD : SS41I_binop_rm_int8<0x33, "vpmovzxwd", + int_x86_sse41_pmovzxwd>, VEX; +defm VPMOVZXDQ : SS41I_binop_rm_int8<0x35, "vpmovzxdq", + int_x86_sse41_pmovzxdq>, VEX; } let Predicates = [HasAVX2] in { @@ -5537,12 +5493,12 @@ defm VPMOVZXDQ : SS41I_binop_rm_int16_y<0x35, "vpmovzxdq", int_x86_avx2_pmovzxdq>, VEX, VEX_L; } -defm PMOVSXBW : SS41I_binop_rm_int8<0x20, "pmovsxbw", int_x86_sse41_pmovsxbw>; -defm PMOVSXWD : SS41I_binop_rm_int8<0x23, "pmovsxwd", int_x86_sse41_pmovsxwd>; -defm PMOVSXDQ : SS41I_binop_rm_int8<0x25, "pmovsxdq", int_x86_sse41_pmovsxdq>; -defm PMOVZXBW : SS41I_binop_rm_int8<0x30, "pmovzxbw", int_x86_sse41_pmovzxbw>; -defm PMOVZXWD : SS41I_binop_rm_int8<0x33, "pmovzxwd", int_x86_sse41_pmovzxwd>; -defm PMOVZXDQ : SS41I_binop_rm_int8<0x35, "pmovzxdq", int_x86_sse41_pmovzxdq>; +defm PMOVSXBW : SS41I_binop_rm_int8<0x20, "pmovsxbw", int_x86_sse41_pmovsxbw, SSE_INTALU_ITINS_P>; +defm PMOVSXWD : SS41I_binop_rm_int8<0x23, "pmovsxwd", int_x86_sse41_pmovsxwd, SSE_INTALU_ITINS_P>; +defm PMOVSXDQ : SS41I_binop_rm_int8<0x25, "pmovsxdq", int_x86_sse41_pmovsxdq, SSE_INTALU_ITINS_P>; +defm PMOVZXBW : SS41I_binop_rm_int8<0x30, "pmovzxbw", int_x86_sse41_pmovzxbw, SSE_INTALU_ITINS_P>; +defm PMOVZXWD : SS41I_binop_rm_int8<0x33, "pmovzxwd", int_x86_sse41_pmovzxwd, SSE_INTALU_ITINS_P>; +defm PMOVZXDQ : SS41I_binop_rm_int8<0x35, "pmovzxdq", int_x86_sse41_pmovzxdq, SSE_INTALU_ITINS_P>; let Predicates = [HasAVX] in { // Common patterns involving scalar load. 
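The recurring memopvXiY -> loadvXiY substitutions in the VEX-encoded patterns throughout these hunks track the fact that VEX memory operands need not be 16-byte aligned, while the legacy SSE encodings keep the alignment-checked memop fragments. A minimal TableGen sketch of that distinction, using hypothetical fragment names and an alignment test that may not match the exact definitions in X86InstrFragmentsSIMD.td:

  // A plain vector load; under VEX encoding it is safe to fold regardless of alignment.
  def sketch_loadv2i64  : PatFrag<(ops node:$ptr), (v2i64 (load node:$ptr))>;
  // Only loads the legacy SSE encodings tolerate: 16-byte-aligned accesses.
  def sketch_memopv2i64 : PatFrag<(ops node:$ptr), (v2i64 (load node:$ptr)), [{
    return cast<LoadSDNode>(N)->getAlignment() >= 16;
  }]>;

Under that assumption, a pattern written with sketch_loadv2i64 lets the compiler fold any load into a v-prefixed instruction's memory operand, whereas sketch_memopv2i64 restricts folding to aligned accesses for the non-VEX forms, which is why only the VEX/AVX defm lines switch fragments in this diff.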
@@ -5640,32 +5596,39 @@ let Predicates = [HasAVX2] in { (VPMOVZXDQYrr VR128:$src)>; def : Pat<(v8i32 (X86vzmovly (v8i16 VR128:$src))), (VPMOVZXWDYrr VR128:$src)>; + def : Pat<(v16i16 (X86vzmovly (v16i8 VR128:$src))), + (VPMOVZXBWYrr VR128:$src)>; } def : Pat<(v4i64 (X86vsmovl (v4i32 VR128:$src))), (VPMOVSXDQYrr VR128:$src)>; def : Pat<(v8i32 (X86vsmovl (v8i16 VR128:$src))), (VPMOVSXWDYrr VR128:$src)>; + def : Pat<(v16i16 (X86vsmovl (v16i8 VR128:$src))), (VPMOVSXBWYrr VR128:$src)>; } let Predicates = [HasAVX] in { def : Pat<(v2i64 (X86vsmovl (v4i32 VR128:$src))), (VPMOVSXDQrr VR128:$src)>; def : Pat<(v4i32 (X86vsmovl (v8i16 VR128:$src))), (VPMOVSXWDrr VR128:$src)>; + def : Pat<(v8i16 (X86vsmovl (v16i8 VR128:$src))), (VPMOVSXBWrr VR128:$src)>; } let Predicates = [UseSSE41] in { def : Pat<(v2i64 (X86vsmovl (v4i32 VR128:$src))), (PMOVSXDQrr VR128:$src)>; def : Pat<(v4i32 (X86vsmovl (v8i16 VR128:$src))), (PMOVSXWDrr VR128:$src)>; + def : Pat<(v8i16 (X86vsmovl (v16i8 VR128:$src))), (PMOVSXBWrr VR128:$src)>; } -multiclass SS41I_binop_rm_int4<bits<8> opc, string OpcodeStr, Intrinsic IntId> { +multiclass SS41I_binop_rm_int4<bits<8> opc, string OpcodeStr, Intrinsic IntId, + OpndItins itins = DEFAULT_ITINS> { def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set VR128:$dst, (IntId VR128:$src))]>, OpSize; + [(set VR128:$dst, (IntId VR128:$src))], itins.rr>, OpSize; def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set VR128:$dst, - (IntId (bitconvert (v4i32 (scalar_to_vector (loadi32 addr:$src))))))]>, + (IntId (bitconvert (v4i32 (scalar_to_vector (loadi32 addr:$src))))))], + itins.rm>, OpSize; } @@ -5704,10 +5667,14 @@ defm VPMOVZXWQ : SS41I_binop_rm_int8_y<0x34, "vpmovzxwq", int_x86_avx2_pmovzxwq>, VEX, VEX_L; } -defm PMOVSXBD : SS41I_binop_rm_int4<0x21, "pmovsxbd", int_x86_sse41_pmovsxbd>; -defm PMOVSXWQ : SS41I_binop_rm_int4<0x24, "pmovsxwq", int_x86_sse41_pmovsxwq>; -defm PMOVZXBD : SS41I_binop_rm_int4<0x31, "pmovzxbd", int_x86_sse41_pmovzxbd>; -defm PMOVZXWQ : SS41I_binop_rm_int4<0x34, "pmovzxwq", int_x86_sse41_pmovzxwq>; +defm PMOVSXBD : SS41I_binop_rm_int4<0x21, "pmovsxbd", int_x86_sse41_pmovsxbd, + SSE_INTALU_ITINS_P>; +defm PMOVSXWQ : SS41I_binop_rm_int4<0x24, "pmovsxwq", int_x86_sse41_pmovsxwq, + SSE_INTALU_ITINS_P>; +defm PMOVZXBD : SS41I_binop_rm_int4<0x31, "pmovzxbd", int_x86_sse41_pmovzxbd, + SSE_INTALU_ITINS_P>; +defm PMOVZXWQ : SS41I_binop_rm_int4<0x34, "pmovzxwq", int_x86_sse41_pmovzxwq, + SSE_INTALU_ITINS_P>; let Predicates = [HasAVX] in { // Common patterns involving scalar load @@ -5735,7 +5702,8 @@ let Predicates = [UseSSE41] in { (PMOVZXWQrm addr:$src)>; } -multiclass SS41I_binop_rm_int2<bits<8> opc, string OpcodeStr, Intrinsic IntId> { +multiclass SS41I_binop_rm_int2<bits<8> opc, string OpcodeStr, Intrinsic IntId, + OpndItins itins = DEFAULT_ITINS> { def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set VR128:$dst, (IntId VR128:$src))]>, OpSize; @@ -5774,8 +5742,10 @@ defm VPMOVSXBQ : SS41I_binop_rm_int4_y<0x22, "vpmovsxbq", defm VPMOVZXBQ : SS41I_binop_rm_int4_y<0x32, "vpmovzxbq", int_x86_avx2_pmovzxbq>, VEX, VEX_L; } -defm PMOVSXBQ : SS41I_binop_rm_int2<0x22, "pmovsxbq", int_x86_sse41_pmovsxbq>; -defm PMOVZXBQ : SS41I_binop_rm_int2<0x32, "pmovzxbq", int_x86_sse41_pmovzxbq>; +defm PMOVSXBQ : SS41I_binop_rm_int2<0x22, "pmovsxbq", int_x86_sse41_pmovsxbq, + 
SSE_INTALU_ITINS_P>; +defm PMOVZXBQ : SS41I_binop_rm_int2<0x32, "pmovzxbq", int_x86_sse41_pmovzxbq, + SSE_INTALU_ITINS_P>; let Predicates = [HasAVX2] in { def : Pat<(v16i16 (X86vsext (v16i8 VR128:$src))), (VPMOVSXBWYrr VR128:$src)>; @@ -6027,35 +5997,39 @@ let Predicates = [UseSSE41] in { /// SS41I_binop_ext8 - SSE 4.1 extract 8 bits to 32 bit reg or 8 bit mem multiclass SS41I_extract8<bits<8> opc, string OpcodeStr> { - def rr : SS4AIi8<opc, MRMDestReg, (outs GR32:$dst), + def rr : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst), (ins VR128:$src1, i32i8imm:$src2), !strconcat(OpcodeStr, - "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set GR32:$dst, (X86pextrb (v16i8 VR128:$src1), imm:$src2))]>, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set GR32orGR64:$dst, (X86pextrb (v16i8 VR128:$src1), + imm:$src2))]>, OpSize; let neverHasSideEffects = 1, mayStore = 1 in def mr : SS4AIi8<opc, MRMDestMem, (outs), (ins i8mem:$dst, VR128:$src1, i32i8imm:$src2), !strconcat(OpcodeStr, - "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, OpSize; // FIXME: // There's an AssertZext in the way of writing the store pattern // (store (i8 (trunc (X86pextrb (v16i8 VR128:$src1), imm:$src2))), addr:$dst) } -let Predicates = [HasAVX] in { +let Predicates = [HasAVX] in defm VPEXTRB : SS41I_extract8<0x14, "vpextrb">, VEX; - def VPEXTRBrr64 : SS4AIi8<0x14, MRMDestReg, (outs GR64:$dst), - (ins VR128:$src1, i32i8imm:$src2), - "vpextrb\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, OpSize, VEX; -} defm PEXTRB : SS41I_extract8<0x14, "pextrb">; /// SS41I_extract16 - SSE 4.1 extract 16 bits to memory destination multiclass SS41I_extract16<bits<8> opc, string OpcodeStr> { + let isCodeGenOnly = 1, hasSideEffects = 0 in + def rr_REV : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst), + (ins VR128:$src1, i32i8imm:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + []>, OpSize; + let neverHasSideEffects = 1, mayStore = 1 in def mr : SS4AIi8<opc, MRMDestMem, (outs), (ins i16mem:$dst, VR128:$src1, i32i8imm:$src2), @@ -6117,31 +6091,28 @@ defm PEXTRQ : SS41I_extract64<0x16, "pextrq">; /// SS41I_extractf32 - SSE 4.1 extract 32 bits fp value to int reg or memory /// destination -multiclass SS41I_extractf32<bits<8> opc, string OpcodeStr> { - def rr : SS4AIi8<opc, MRMDestReg, (outs GR32:$dst), +multiclass SS41I_extractf32<bits<8> opc, string OpcodeStr, + OpndItins itins = DEFAULT_ITINS> { + def rr : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst), (ins VR128:$src1, i32i8imm:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set GR32:$dst, - (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2))]>, + [(set GR32orGR64:$dst, + (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2))], + itins.rr>, OpSize; def mr : SS4AIi8<opc, MRMDestMem, (outs), (ins f32mem:$dst, VR128:$src1, i32i8imm:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(store (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2), - addr:$dst)]>, OpSize; + addr:$dst)], itins.rm>, OpSize; } let ExeDomain = SSEPackedSingle in { - let Predicates = [HasAVX] in { + let Predicates = [UseAVX] in defm VEXTRACTPS : SS41I_extractf32<0x17, "vextractps">, VEX; - def VEXTRACTPSrr64 : SS4AIi8<0x17, MRMDestReg, (outs GR64:$dst), - (ins VR128:$src1, i32i8imm:$src2), - "vextractps \t{$src2, $src1, $dst|$dst, $src1, $src2}", - []>, OpSize, VEX; - } - defm EXTRACTPS : SS41I_extractf32<0x17, "extractps">; + defm EXTRACTPS : SS41I_extractf32<0x17, 
"extractps", SSE_EXTRACT_ITINS>; } // Also match an EXTRACTPS store when the store is done as f32 instead of i32. @@ -6162,13 +6133,13 @@ def : Pat<(store (f32 (bitconvert (extractelt (bc_v4i32 (v4f32 VR128:$src1)), multiclass SS41I_insert8<bits<8> opc, string asm, bit Is2Addr = 1> { def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst), - (ins VR128:$src1, GR32:$src2, i32i8imm:$src3), + (ins VR128:$src1, GR32orGR64:$src2, i32i8imm:$src3), !if(Is2Addr, !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), !strconcat(asm, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), [(set VR128:$dst, - (X86pinsrb VR128:$src1, GR32:$src2, imm:$src3))]>, OpSize; + (X86pinsrb VR128:$src1, GR32orGR64:$src2, imm:$src3))]>, OpSize; def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, i8mem:$src2, i32i8imm:$src3), !if(Is2Addr, @@ -6241,7 +6212,8 @@ let Constraints = "$src1 = $dst" in // are optimized inserts that won't zero arbitrary elements in the destination // vector. The next one matches the intrinsic and could zero arbitrary elements // in the target vector. -multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1> { +multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1, + OpndItins itins = DEFAULT_ITINS> { def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, u32u8imm:$src3), !if(Is2Addr, @@ -6249,7 +6221,7 @@ multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1> { !strconcat(asm, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), [(set VR128:$dst, - (X86insrtps VR128:$src1, VR128:$src2, imm:$src3))]>, + (X86insrtps VR128:$src1, VR128:$src2, imm:$src3))], itins.rr>, OpSize; def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, f32mem:$src2, u32u8imm:$src3), @@ -6260,14 +6232,14 @@ multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1> { [(set VR128:$dst, (X86insrtps VR128:$src1, (v4f32 (scalar_to_vector (loadf32 addr:$src2))), - imm:$src3))]>, OpSize; + imm:$src3))], itins.rm>, OpSize; } let ExeDomain = SSEPackedSingle in { - let Predicates = [HasAVX] in + let Predicates = [UseAVX] in defm VINSERTPS : SS41I_insertf32<0x21, "vinsertps", 0>, VEX_4V; let Constraints = "$src1 = $dst" in - defm INSERTPS : SS41I_insertf32<0x21, "insertps">; + defm INSERTPS : SS41I_insertf32<0x21, "insertps", 1, SSE_INSERT_ITINS>; } //===----------------------------------------------------------------------===// @@ -6285,7 +6257,8 @@ let ExeDomain = SSEPackedSingle in { (outs RC:$dst), (ins RC:$src1, i32i8imm:$src2), !strconcat(OpcodeStr, "ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set RC:$dst, (V4F32Int RC:$src1, imm:$src2))]>, + [(set RC:$dst, (V4F32Int RC:$src1, imm:$src2))], + IIC_SSE_ROUNDPS_REG>, OpSize; // Vector intrinsic operation, mem @@ -6294,7 +6267,8 @@ let ExeDomain = SSEPackedSingle in { !strconcat(OpcodeStr, "ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set RC:$dst, - (V4F32Int (mem_frag32 addr:$src1),imm:$src2))]>, + (V4F32Int (mem_frag32 addr:$src1),imm:$src2))], + IIC_SSE_ROUNDPS_MEM>, OpSize; } // ExeDomain = SSEPackedSingle @@ -6304,7 +6278,8 @@ let ExeDomain = SSEPackedDouble in { (outs RC:$dst), (ins RC:$src1, i32i8imm:$src2), !strconcat(OpcodeStr, "pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set RC:$dst, (V2F64Int RC:$src1, imm:$src2))]>, + [(set RC:$dst, (V2F64Int RC:$src1, imm:$src2))], + IIC_SSE_ROUNDPS_REG>, OpSize; // Vector intrinsic operation, mem @@ -6313,7 +6288,8 @@ let ExeDomain = SSEPackedDouble in { !strconcat(OpcodeStr, 
"pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set RC:$dst, - (V2F64Int (mem_frag64 addr:$src1),imm:$src2))]>, + (V2F64Int (mem_frag64 addr:$src1),imm:$src2))], + IIC_SSE_ROUNDPS_REG>, OpSize; } // ExeDomain = SSEPackedDouble } @@ -6397,11 +6373,11 @@ let ExeDomain = GenericDomain in { let Predicates = [HasAVX] in { // Intrinsic form defm VROUND : sse41_fp_unop_rm<0x08, 0x09, "vround", f128mem, VR128, - memopv4f32, memopv2f64, + loadv4f32, loadv2f64, int_x86_sse41_round_ps, int_x86_sse41_round_pd>, VEX; defm VROUNDY : sse41_fp_unop_rm<0x08, 0x09, "vround", f256mem, VR256, - memopv8f32, memopv4f64, + loadv8f32, loadv4f64, int_x86_avx_round_ps_256, int_x86_avx_round_pd_256>, VEX, VEX_L; defm VROUND : sse41_fp_binop_rm<0x0A, 0x0B, "vround", @@ -6539,7 +6515,7 @@ def VPTESTrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2), OpSize, VEX; def VPTESTrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2), "vptest\t{$src2, $src1|$src1, $src2}", - [(set EFLAGS,(X86ptest VR128:$src1, (memopv2i64 addr:$src2)))]>, + [(set EFLAGS,(X86ptest VR128:$src1, (loadv2i64 addr:$src2)))]>, OpSize, VEX; def VPTESTYrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR256:$src1, VR256:$src2), @@ -6548,7 +6524,7 @@ def VPTESTYrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR256:$src1, VR256:$src2), OpSize, VEX, VEX_L; def VPTESTYrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR256:$src1, i256mem:$src2), "vptest\t{$src2, $src1|$src1, $src2}", - [(set EFLAGS,(X86ptest VR256:$src1, (memopv4i64 addr:$src2)))]>, + [(set EFLAGS,(X86ptest VR256:$src1, (loadv4i64 addr:$src2)))]>, OpSize, VEX, VEX_L; } @@ -6577,13 +6553,13 @@ multiclass avx_bittest<bits<8> opc, string OpcodeStr, RegisterClass RC, let Defs = [EFLAGS], Predicates = [HasAVX] in { let ExeDomain = SSEPackedSingle in { -defm VTESTPS : avx_bittest<0x0E, "vtestps", VR128, f128mem, memopv4f32, v4f32>; -defm VTESTPSY : avx_bittest<0x0E, "vtestps", VR256, f256mem, memopv8f32, v8f32>, +defm VTESTPS : avx_bittest<0x0E, "vtestps", VR128, f128mem, loadv4f32, v4f32>; +defm VTESTPSY : avx_bittest<0x0E, "vtestps", VR256, f256mem, loadv8f32, v8f32>, VEX_L; } let ExeDomain = SSEPackedDouble in { -defm VTESTPD : avx_bittest<0x0F, "vtestpd", VR128, f128mem, memopv2f64, v2f64>; -defm VTESTPDY : avx_bittest<0x0F, "vtestpd", VR256, f256mem, memopv4f64, v4f64>, +defm VTESTPD : avx_bittest<0x0F, "vtestpd", VR128, f128mem, loadv2f64, v2f64>; +defm VTESTPDY : avx_bittest<0x0F, "vtestpd", VR256, f256mem, loadv4f64, v4f64>, VEX_L; } } @@ -6595,30 +6571,33 @@ defm VTESTPDY : avx_bittest<0x0F, "vtestpd", VR256, f256mem, memopv4f64, v4f64>, let Defs = [EFLAGS], Predicates = [HasPOPCNT] in { def POPCNT16rr : I<0xB8, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src), "popcnt{w}\t{$src, $dst|$dst, $src}", - [(set GR16:$dst, (ctpop GR16:$src)), (implicit EFLAGS)]>, + [(set GR16:$dst, (ctpop GR16:$src)), (implicit EFLAGS)], + IIC_SSE_POPCNT_RR>, OpSize, XS; def POPCNT16rm : I<0xB8, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src), "popcnt{w}\t{$src, $dst|$dst, $src}", [(set GR16:$dst, (ctpop (loadi16 addr:$src))), - (implicit EFLAGS)]>, OpSize, XS; + (implicit EFLAGS)], IIC_SSE_POPCNT_RM>, OpSize, XS; def POPCNT32rr : I<0xB8, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), "popcnt{l}\t{$src, $dst|$dst, $src}", - [(set GR32:$dst, (ctpop GR32:$src)), (implicit EFLAGS)]>, + [(set GR32:$dst, (ctpop GR32:$src)), (implicit EFLAGS)], + IIC_SSE_POPCNT_RR>, XS; def POPCNT32rm : I<0xB8, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), "popcnt{l}\t{$src, $dst|$dst, $src}", [(set GR32:$dst, (ctpop (loadi32 
addr:$src))), - (implicit EFLAGS)]>, XS; + (implicit EFLAGS)], IIC_SSE_POPCNT_RM>, XS; def POPCNT64rr : RI<0xB8, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src), "popcnt{q}\t{$src, $dst|$dst, $src}", - [(set GR64:$dst, (ctpop GR64:$src)), (implicit EFLAGS)]>, + [(set GR64:$dst, (ctpop GR64:$src)), (implicit EFLAGS)], + IIC_SSE_POPCNT_RR>, XS; def POPCNT64rm : RI<0xB8, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), "popcnt{q}\t{$src, $dst|$dst, $src}", [(set GR64:$dst, (ctpop (loadi64 addr:$src))), - (implicit EFLAGS)]>, XS; + (implicit EFLAGS)], IIC_SSE_POPCNT_RM>, XS; } @@ -6646,14 +6625,16 @@ defm PHMINPOSUW : SS41I_unop_rm_int_v16 <0x41, "phminposuw", /// SS41I_binop_rm_int - Simple SSE 4.1 binary operator multiclass SS41I_binop_rm_int<bits<8> opc, string OpcodeStr, - Intrinsic IntId128, bit Is2Addr = 1> { + Intrinsic IntId128, bit Is2Addr = 1, + OpndItins itins = DEFAULT_ITINS> { let isCommutable = 1 in def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), !if(Is2Addr, !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), - [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>, OpSize; + [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))], + itins.rr>, OpSize; def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2), !if(Is2Addr, @@ -6661,7 +6642,8 @@ multiclass SS41I_binop_rm_int<bits<8> opc, string OpcodeStr, !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), [(set VR128:$dst, (IntId128 VR128:$src1, - (bitconvert (memopv2i64 addr:$src2))))]>, OpSize; + (bitconvert (memopv2i64 addr:$src2))))], + itins.rm>, OpSize; } /// SS41I_binop_rm_int_y - Simple SSE 4.1 binary operator @@ -6677,14 +6659,15 @@ multiclass SS41I_binop_rm_int_y<bits<8> opc, string OpcodeStr, !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR256:$dst, (IntId256 VR256:$src1, - (bitconvert (memopv4i64 addr:$src2))))]>, OpSize; + (bitconvert (loadv4i64 addr:$src2))))]>, OpSize; } /// SS48I_binop_rm - Simple SSE41 binary operator. 
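(Aside, not part of the patch: the POPCNT definitions just above and the SS48I_binop_rm multiclass that follows back the scalar population count and the simple SSE4.1 packed min/max operators whose itineraries this hunk annotates. A minimal sketch of the corresponding intrinsics, assuming g++ with -mpopcnt -msse4.1; names and values are illustrative.)

// popcnt_minmax_demo.cpp -- illustrative only.
// Build with: g++ -mpopcnt -msse4.1 popcnt_minmax_demo.cpp
#include <nmmintrin.h>   // _mm_popcnt_u32
#include <smmintrin.h>   // SSE4.1 packed min/max
#include <cstdio>

int main() {
  // POPCNT32rr: count set bits in a 32-bit GPR.
  int bits = _mm_popcnt_u32(0xF0F0u);                 // 8

  // PMAXSD / PMINUD: packed signed max and unsigned min on 4 x i32.
  __m128i a = _mm_setr_epi32(1, -2, 3, -4);
  __m128i b = _mm_setr_epi32(-1, 2, -3, 4);
  __m128i smax = _mm_max_epi32(a, b);                 // {1, 2, 3, 4}
  __m128i umin = _mm_min_epu32(a, b);                 // compares as unsigned

  int out[4];
  _mm_storeu_si128((__m128i *)out, smax);
  printf("popcnt=%d smax0=%d umin0=%d\n", bits, out[0],
         _mm_cvtsi128_si32(umin));
  return 0;
}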
multiclass SS48I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, ValueType OpVT, RegisterClass RC, PatFrag memop_frag, - X86MemOperand x86memop, bit Is2Addr = 1> { + X86MemOperand x86memop, bit Is2Addr = 1, + OpndItins itins = DEFAULT_ITINS> { let isCommutable = 1 in def rr : SS48I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), @@ -6707,21 +6690,21 @@ let Predicates = [HasAVX] in { defm VPACKUSDW : SS41I_binop_rm_int<0x2B, "vpackusdw", int_x86_sse41_packusdw, 0>, VEX_4V; defm VPMINSB : SS48I_binop_rm<0x38, "vpminsb", X86smin, v16i8, VR128, - memopv2i64, i128mem, 0>, VEX_4V; + loadv2i64, i128mem, 0>, VEX_4V; defm VPMINSD : SS48I_binop_rm<0x39, "vpminsd", X86smin, v4i32, VR128, - memopv2i64, i128mem, 0>, VEX_4V; + loadv2i64, i128mem, 0>, VEX_4V; defm VPMINUD : SS48I_binop_rm<0x3B, "vpminud", X86umin, v4i32, VR128, - memopv2i64, i128mem, 0>, VEX_4V; + loadv2i64, i128mem, 0>, VEX_4V; defm VPMINUW : SS48I_binop_rm<0x3A, "vpminuw", X86umin, v8i16, VR128, - memopv2i64, i128mem, 0>, VEX_4V; + loadv2i64, i128mem, 0>, VEX_4V; defm VPMAXSB : SS48I_binop_rm<0x3C, "vpmaxsb", X86smax, v16i8, VR128, - memopv2i64, i128mem, 0>, VEX_4V; + loadv2i64, i128mem, 0>, VEX_4V; defm VPMAXSD : SS48I_binop_rm<0x3D, "vpmaxsd", X86smax, v4i32, VR128, - memopv2i64, i128mem, 0>, VEX_4V; + loadv2i64, i128mem, 0>, VEX_4V; defm VPMAXUD : SS48I_binop_rm<0x3F, "vpmaxud", X86umax, v4i32, VR128, - memopv2i64, i128mem, 0>, VEX_4V; + loadv2i64, i128mem, 0>, VEX_4V; defm VPMAXUW : SS48I_binop_rm<0x3E, "vpmaxuw", X86umax, v8i16, VR128, - memopv2i64, i128mem, 0>, VEX_4V; + loadv2i64, i128mem, 0>, VEX_4V; defm VPMULDQ : SS41I_binop_rm_int<0x28, "vpmuldq", int_x86_sse41_pmuldq, 0>, VEX_4V; } @@ -6731,21 +6714,21 @@ let Predicates = [HasAVX2] in { defm VPACKUSDW : SS41I_binop_rm_int_y<0x2B, "vpackusdw", int_x86_avx2_packusdw>, VEX_4V, VEX_L; defm VPMINSBY : SS48I_binop_rm<0x38, "vpminsb", X86smin, v32i8, VR256, - memopv4i64, i256mem, 0>, VEX_4V, VEX_L; + loadv4i64, i256mem, 0>, VEX_4V, VEX_L; defm VPMINSDY : SS48I_binop_rm<0x39, "vpminsd", X86smin, v8i32, VR256, - memopv4i64, i256mem, 0>, VEX_4V, VEX_L; + loadv4i64, i256mem, 0>, VEX_4V, VEX_L; defm VPMINUDY : SS48I_binop_rm<0x3B, "vpminud", X86umin, v8i32, VR256, - memopv4i64, i256mem, 0>, VEX_4V, VEX_L; + loadv4i64, i256mem, 0>, VEX_4V, VEX_L; defm VPMINUWY : SS48I_binop_rm<0x3A, "vpminuw", X86umin, v16i16, VR256, - memopv4i64, i256mem, 0>, VEX_4V, VEX_L; + loadv4i64, i256mem, 0>, VEX_4V, VEX_L; defm VPMAXSBY : SS48I_binop_rm<0x3C, "vpmaxsb", X86smax, v32i8, VR256, - memopv4i64, i256mem, 0>, VEX_4V, VEX_L; + loadv4i64, i256mem, 0>, VEX_4V, VEX_L; defm VPMAXSDY : SS48I_binop_rm<0x3D, "vpmaxsd", X86smax, v8i32, VR256, - memopv4i64, i256mem, 0>, VEX_4V, VEX_L; + loadv4i64, i256mem, 0>, VEX_4V, VEX_L; defm VPMAXUDY : SS48I_binop_rm<0x3F, "vpmaxud", X86umax, v8i32, VR256, - memopv4i64, i256mem, 0>, VEX_4V, VEX_L; + loadv4i64, i256mem, 0>, VEX_4V, VEX_L; defm VPMAXUWY : SS48I_binop_rm<0x3E, "vpmaxuw", X86umax, v16i16, VR256, - memopv4i64, i256mem, 0>, VEX_4V, VEX_L; + loadv4i64, i256mem, 0>, VEX_4V, VEX_L; defm VPMULDQ : SS41I_binop_rm_int_y<0x28, "vpmuldq", int_x86_avx2_pmul_dq>, VEX_4V, VEX_L; } @@ -6754,22 +6737,23 @@ let Constraints = "$src1 = $dst" in { let isCommutable = 0 in defm PACKUSDW : SS41I_binop_rm_int<0x2B, "packusdw", int_x86_sse41_packusdw>; defm PMINSB : SS48I_binop_rm<0x38, "pminsb", X86smin, v16i8, VR128, - memopv2i64, i128mem>; + memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>; defm PMINSD : SS48I_binop_rm<0x39, "pminsd", X86smin, v4i32, VR128, - 
memopv2i64, i128mem>; + memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>; defm PMINUD : SS48I_binop_rm<0x3B, "pminud", X86umin, v4i32, VR128, - memopv2i64, i128mem>; + memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>; defm PMINUW : SS48I_binop_rm<0x3A, "pminuw", X86umin, v8i16, VR128, - memopv2i64, i128mem>; + memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>; defm PMAXSB : SS48I_binop_rm<0x3C, "pmaxsb", X86smax, v16i8, VR128, - memopv2i64, i128mem>; + memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>; defm PMAXSD : SS48I_binop_rm<0x3D, "pmaxsd", X86smax, v4i32, VR128, - memopv2i64, i128mem>; + memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>; defm PMAXUD : SS48I_binop_rm<0x3F, "pmaxud", X86umax, v4i32, VR128, - memopv2i64, i128mem>; + memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>; defm PMAXUW : SS48I_binop_rm<0x3E, "pmaxuw", X86umax, v8i16, VR128, - memopv2i64, i128mem>; - defm PMULDQ : SS41I_binop_rm_int<0x28, "pmuldq", int_x86_sse41_pmuldq>; + memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>; + defm PMULDQ : SS41I_binop_rm_int<0x28, "pmuldq", int_x86_sse41_pmuldq, + 1, SSE_INTMUL_ITINS_P>; } let Predicates = [HasAVX] in { @@ -6787,15 +6771,16 @@ let Predicates = [HasAVX2] in { let Constraints = "$src1 = $dst" in { defm PMULLD : SS48I_binop_rm<0x40, "pmulld", mul, v4i32, VR128, - memopv2i64, i128mem>; + memopv2i64, i128mem, 1, SSE_PMULLD_ITINS>; defm PCMPEQQ : SS48I_binop_rm<0x29, "pcmpeqq", X86pcmpeq, v2i64, VR128, - memopv2i64, i128mem>; + memopv2i64, i128mem, 1, SSE_INTALUQ_ITINS_P>; } /// SS41I_binop_rmi_int - SSE 4.1 binary operator with 8-bit immediate multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr, Intrinsic IntId, RegisterClass RC, PatFrag memop_frag, - X86MemOperand x86memop, bit Is2Addr = 1> { + X86MemOperand x86memop, bit Is2Addr = 1, + OpndItins itins = DEFAULT_ITINS> { let isCommutable = 1 in def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2, u32u8imm:$src3), @@ -6804,7 +6789,7 @@ multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), - [(set RC:$dst, (IntId RC:$src1, RC:$src2, imm:$src3))]>, + [(set RC:$dst, (IntId RC:$src1, RC:$src2, imm:$src3))], itins.rr>, OpSize; def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2, u32u8imm:$src3), @@ -6815,7 +6800,7 @@ multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), [(set RC:$dst, (IntId RC:$src1, - (bitconvert (memop_frag addr:$src2)), imm:$src3))]>, + (bitconvert (memop_frag addr:$src2)), imm:$src3))], itins.rm>, OpSize; } @@ -6823,40 +6808,40 @@ let Predicates = [HasAVX] in { let isCommutable = 0 in { let ExeDomain = SSEPackedSingle in { defm VBLENDPS : SS41I_binop_rmi_int<0x0C, "vblendps", int_x86_sse41_blendps, - VR128, memopv4f32, f128mem, 0>, VEX_4V; + VR128, loadv4f32, f128mem, 0>, VEX_4V; defm VBLENDPSY : SS41I_binop_rmi_int<0x0C, "vblendps", - int_x86_avx_blend_ps_256, VR256, memopv8f32, + int_x86_avx_blend_ps_256, VR256, loadv8f32, f256mem, 0>, VEX_4V, VEX_L; } let ExeDomain = SSEPackedDouble in { defm VBLENDPD : SS41I_binop_rmi_int<0x0D, "vblendpd", int_x86_sse41_blendpd, - VR128, memopv2f64, f128mem, 0>, VEX_4V; + VR128, loadv2f64, f128mem, 0>, VEX_4V; defm VBLENDPDY : SS41I_binop_rmi_int<0x0D, "vblendpd", - int_x86_avx_blend_pd_256,VR256, memopv4f64, + int_x86_avx_blend_pd_256,VR256, loadv4f64, f256mem, 0>, VEX_4V, VEX_L; } defm VPBLENDW : SS41I_binop_rmi_int<0x0E, "vpblendw", 
int_x86_sse41_pblendw, - VR128, memopv2i64, i128mem, 0>, VEX_4V; + VR128, loadv2i64, i128mem, 0>, VEX_4V; defm VMPSADBW : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_sse41_mpsadbw, - VR128, memopv2i64, i128mem, 0>, VEX_4V; + VR128, loadv2i64, i128mem, 0>, VEX_4V; } let ExeDomain = SSEPackedSingle in defm VDPPS : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_sse41_dpps, - VR128, memopv4f32, f128mem, 0>, VEX_4V; + VR128, loadv4f32, f128mem, 0>, VEX_4V; let ExeDomain = SSEPackedDouble in defm VDPPD : SS41I_binop_rmi_int<0x41, "vdppd", int_x86_sse41_dppd, - VR128, memopv2f64, f128mem, 0>, VEX_4V; + VR128, loadv2f64, f128mem, 0>, VEX_4V; let ExeDomain = SSEPackedSingle in defm VDPPSY : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_avx_dp_ps_256, - VR256, memopv8f32, i256mem, 0>, VEX_4V, VEX_L; + VR256, loadv8f32, i256mem, 0>, VEX_4V, VEX_L; } let Predicates = [HasAVX2] in { let isCommutable = 0 in { defm VPBLENDWY : SS41I_binop_rmi_int<0x0E, "vpblendw", int_x86_avx2_pblendw, - VR256, memopv4i64, i256mem, 0>, VEX_4V, VEX_L; + VR256, loadv4i64, i256mem, 0>, VEX_4V, VEX_L; defm VMPSADBWY : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_avx2_mpsadbw, - VR256, memopv4i64, i256mem, 0>, VEX_4V, VEX_L; + VR256, loadv4i64, i256mem, 0>, VEX_4V, VEX_L; } } @@ -6864,21 +6849,27 @@ let Constraints = "$src1 = $dst" in { let isCommutable = 0 in { let ExeDomain = SSEPackedSingle in defm BLENDPS : SS41I_binop_rmi_int<0x0C, "blendps", int_x86_sse41_blendps, - VR128, memopv4f32, f128mem>; + VR128, memopv4f32, f128mem, + 1, SSE_INTALU_ITINS_P>; let ExeDomain = SSEPackedDouble in defm BLENDPD : SS41I_binop_rmi_int<0x0D, "blendpd", int_x86_sse41_blendpd, - VR128, memopv2f64, f128mem>; + VR128, memopv2f64, f128mem, + 1, SSE_INTALU_ITINS_P>; defm PBLENDW : SS41I_binop_rmi_int<0x0E, "pblendw", int_x86_sse41_pblendw, - VR128, memopv2i64, i128mem>; + VR128, memopv2i64, i128mem, + 1, SSE_INTALU_ITINS_P>; defm MPSADBW : SS41I_binop_rmi_int<0x42, "mpsadbw", int_x86_sse41_mpsadbw, - VR128, memopv2i64, i128mem>; + VR128, memopv2i64, i128mem, + 1, SSE_INTMUL_ITINS_P>; } let ExeDomain = SSEPackedSingle in defm DPPS : SS41I_binop_rmi_int<0x40, "dpps", int_x86_sse41_dpps, - VR128, memopv4f32, f128mem>; + VR128, memopv4f32, f128mem, 1, + SSE_DPPS_ITINS>; let ExeDomain = SSEPackedDouble in defm DPPD : SS41I_binop_rmi_int<0x41, "dppd", int_x86_sse41_dppd, - VR128, memopv2f64, f128mem>; + VR128, memopv2f64, f128mem, 1, + SSE_DPPD_ITINS>; } /// SS41I_quaternary_int_avx - AVX SSE 4.1 with 4 operators @@ -6905,23 +6896,23 @@ multiclass SS41I_quaternary_int_avx<bits<8> opc, string OpcodeStr, let Predicates = [HasAVX] in { let ExeDomain = SSEPackedDouble in { defm VBLENDVPD : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR128, f128mem, - memopv2f64, int_x86_sse41_blendvpd>; + loadv2f64, int_x86_sse41_blendvpd>; defm VBLENDVPDY : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR256, f256mem, - memopv4f64, int_x86_avx_blendv_pd_256>, VEX_L; + loadv4f64, int_x86_avx_blendv_pd_256>, VEX_L; } // ExeDomain = SSEPackedDouble let ExeDomain = SSEPackedSingle in { defm VBLENDVPS : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR128, f128mem, - memopv4f32, int_x86_sse41_blendvps>; + loadv4f32, int_x86_sse41_blendvps>; defm VBLENDVPSY : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR256, f256mem, - memopv8f32, int_x86_avx_blendv_ps_256>, VEX_L; + loadv8f32, int_x86_avx_blendv_ps_256>, VEX_L; } // ExeDomain = SSEPackedSingle defm VPBLENDVB : SS41I_quaternary_int_avx<0x4C, "vpblendvb", VR128, i128mem, - memopv2i64, int_x86_sse41_pblendvb>; + loadv2i64, 
int_x86_sse41_pblendvb>; } let Predicates = [HasAVX2] in { defm VPBLENDVBY : SS41I_quaternary_int_avx<0x4C, "vpblendvb", VR256, i256mem, - memopv4i64, int_x86_avx2_pblendvb>, VEX_L; + loadv4i64, int_x86_avx2_pblendvb>, VEX_L; } let Predicates = [HasAVX] in { @@ -6974,7 +6965,7 @@ let Predicates = [HasAVX] in { let Predicates = [HasAVX2] in { def : Pat<(v32i8 (vselect (v32i8 VR256:$mask), (v32i8 VR256:$src1), (v32i8 VR256:$src2))), - (VPBLENDVBYrr VR256:$src1, VR256:$src2, VR256:$mask)>; + (VPBLENDVBYrr VR256:$src2, VR256:$src1, VR256:$mask)>; def : Pat<(v16i16 (X86Blendi (v16i16 VR256:$src1), (v16i16 VR256:$src2), (imm:$mask))), (VPBLENDWYrri VR256:$src1, VR256:$src2, imm:$mask)>; @@ -6983,13 +6974,14 @@ let Predicates = [HasAVX2] in { /// SS41I_ternary_int - SSE 4.1 ternary operator let Uses = [XMM0], Constraints = "$src1 = $dst" in { multiclass SS41I_ternary_int<bits<8> opc, string OpcodeStr, PatFrag mem_frag, - X86MemOperand x86memop, Intrinsic IntId> { + X86MemOperand x86memop, Intrinsic IntId, + OpndItins itins = DEFAULT_ITINS> { def rr0 : SS48I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), - [(set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0))]>, - OpSize; + [(set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0))], + itins.rr>, OpSize; def rm0 : SS48I<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, x86memop:$src2), @@ -6997,7 +6989,8 @@ let Uses = [XMM0], Constraints = "$src1 = $dst" in { "\t{$src2, $dst|$dst, $src2}"), [(set VR128:$dst, (IntId VR128:$src1, - (bitconvert (mem_frag addr:$src2)), XMM0))]>, OpSize; + (bitconvert (mem_frag addr:$src2)), XMM0))], + itins.rm>, OpSize; } } @@ -7011,17 +7004,17 @@ defm PBLENDVB : SS41I_ternary_int<0x10, "pblendvb", memopv2i64, i128mem, int_x86_sse41_pblendvb>; // Aliases with the implicit xmm0 argument -def : InstAlias<"blendvpd\t{%xmm0, $src2, $dst|$dst, $src2, %xmm0}", +def : InstAlias<"blendvpd\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}", (BLENDVPDrr0 VR128:$dst, VR128:$src2)>; -def : InstAlias<"blendvpd\t{%xmm0, $src2, $dst|$dst, $src2, %xmm0}", +def : InstAlias<"blendvpd\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}", (BLENDVPDrm0 VR128:$dst, f128mem:$src2)>; -def : InstAlias<"blendvps\t{%xmm0, $src2, $dst|$dst, $src2, %xmm0}", +def : InstAlias<"blendvps\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}", (BLENDVPSrr0 VR128:$dst, VR128:$src2)>; -def : InstAlias<"blendvps\t{%xmm0, $src2, $dst|$dst, $src2, %xmm0}", +def : InstAlias<"blendvps\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}", (BLENDVPSrm0 VR128:$dst, f128mem:$src2)>; -def : InstAlias<"pblendvb\t{%xmm0, $src2, $dst|$dst, $src2, %xmm0}", +def : InstAlias<"pblendvb\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}", (PBLENDVBrr0 VR128:$dst, VR128:$src2)>; -def : InstAlias<"pblendvb\t{%xmm0, $src2, $dst|$dst, $src2, %xmm0}", +def : InstAlias<"pblendvb\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}", (PBLENDVBrm0 VR128:$dst, i128mem:$src2)>; let Predicates = [UseSSE41] in { @@ -7094,11 +7087,11 @@ multiclass SS42I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, let Predicates = [HasAVX] in defm VPCMPGTQ : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v2i64, VR128, - memopv2i64, i128mem, 0>, VEX_4V; + loadv2i64, i128mem, 0>, VEX_4V; let Predicates = [HasAVX2] in defm VPCMPGTQY : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v4i64, VR256, - memopv4i64, i256mem, 0>, VEX_4V, VEX_L; + loadv4i64, i256mem, 0>, VEX_4V, VEX_L; let Constraints = "$src1 = $dst" in defm PCMPGTQ : SS42I_binop_rm<0x37, "pcmpgtq", X86pcmpgt, v2i64, VR128, @@ 
-7258,70 +7251,100 @@ let Defs = [ECX, EFLAGS], Uses = [EAX, EDX], neverHasSideEffects = 1 in { // crc intrinsic instruction // This set of instructions are only rm, the only difference is the size // of r and m. +class SS42I_crc32r<bits<8> opc, string asm, RegisterClass RCOut, + RegisterClass RCIn, SDPatternOperator Int> : + SS42FI<opc, MRMSrcReg, (outs RCOut:$dst), (ins RCOut:$src1, RCIn:$src2), + !strconcat(asm, "\t{$src2, $src1|$src1, $src2}"), + [(set RCOut:$dst, (Int RCOut:$src1, RCIn:$src2))], IIC_CRC32_REG>; + +class SS42I_crc32m<bits<8> opc, string asm, RegisterClass RCOut, + X86MemOperand x86memop, SDPatternOperator Int> : + SS42FI<opc, MRMSrcMem, (outs RCOut:$dst), (ins RCOut:$src1, x86memop:$src2), + !strconcat(asm, "\t{$src2, $src1|$src1, $src2}"), + [(set RCOut:$dst, (Int RCOut:$src1, (load addr:$src2)))], + IIC_CRC32_MEM>; + let Constraints = "$src1 = $dst" in { - def CRC32r32m8 : SS42FI<0xF0, MRMSrcMem, (outs GR32:$dst), - (ins GR32:$src1, i8mem:$src2), - "crc32{b} \t{$src2, $src1|$src1, $src2}", - [(set GR32:$dst, - (int_x86_sse42_crc32_32_8 GR32:$src1, - (load addr:$src2)))]>; - def CRC32r32r8 : SS42FI<0xF0, MRMSrcReg, (outs GR32:$dst), - (ins GR32:$src1, GR8:$src2), - "crc32{b} \t{$src2, $src1|$src1, $src2}", - [(set GR32:$dst, - (int_x86_sse42_crc32_32_8 GR32:$src1, GR8:$src2))]>; - def CRC32r32m16 : SS42FI<0xF1, MRMSrcMem, (outs GR32:$dst), - (ins GR32:$src1, i16mem:$src2), - "crc32{w} \t{$src2, $src1|$src1, $src2}", - [(set GR32:$dst, - (int_x86_sse42_crc32_32_16 GR32:$src1, - (load addr:$src2)))]>, - OpSize; - def CRC32r32r16 : SS42FI<0xF1, MRMSrcReg, (outs GR32:$dst), - (ins GR32:$src1, GR16:$src2), - "crc32{w} \t{$src2, $src1|$src1, $src2}", - [(set GR32:$dst, - (int_x86_sse42_crc32_32_16 GR32:$src1, GR16:$src2))]>, - OpSize; - def CRC32r32m32 : SS42FI<0xF1, MRMSrcMem, (outs GR32:$dst), - (ins GR32:$src1, i32mem:$src2), - "crc32{l} \t{$src2, $src1|$src1, $src2}", - [(set GR32:$dst, - (int_x86_sse42_crc32_32_32 GR32:$src1, - (load addr:$src2)))]>; - def CRC32r32r32 : SS42FI<0xF1, MRMSrcReg, (outs GR32:$dst), - (ins GR32:$src1, GR32:$src2), - "crc32{l} \t{$src2, $src1|$src1, $src2}", - [(set GR32:$dst, - (int_x86_sse42_crc32_32_32 GR32:$src1, GR32:$src2))]>; - def CRC32r64m8 : SS42FI<0xF0, MRMSrcMem, (outs GR64:$dst), - (ins GR64:$src1, i8mem:$src2), - "crc32{b} \t{$src2, $src1|$src1, $src2}", - [(set GR64:$dst, - (int_x86_sse42_crc32_64_8 GR64:$src1, - (load addr:$src2)))]>, - REX_W; - def CRC32r64r8 : SS42FI<0xF0, MRMSrcReg, (outs GR64:$dst), - (ins GR64:$src1, GR8:$src2), - "crc32{b} \t{$src2, $src1|$src1, $src2}", - [(set GR64:$dst, - (int_x86_sse42_crc32_64_8 GR64:$src1, GR8:$src2))]>, - REX_W; - def CRC32r64m64 : SS42FI<0xF1, MRMSrcMem, (outs GR64:$dst), - (ins GR64:$src1, i64mem:$src2), - "crc32{q} \t{$src2, $src1|$src1, $src2}", - [(set GR64:$dst, - (int_x86_sse42_crc32_64_64 GR64:$src1, - (load addr:$src2)))]>, - REX_W; - def CRC32r64r64 : SS42FI<0xF1, MRMSrcReg, (outs GR64:$dst), - (ins GR64:$src1, GR64:$src2), - "crc32{q} \t{$src2, $src1|$src1, $src2}", - [(set GR64:$dst, - (int_x86_sse42_crc32_64_64 GR64:$src1, GR64:$src2))]>, - REX_W; + def CRC32r32m8 : SS42I_crc32m<0xF0, "crc32{b}", GR32, i8mem, + int_x86_sse42_crc32_32_8>; + def CRC32r32r8 : SS42I_crc32r<0xF0, "crc32{b}", GR32, GR8, + int_x86_sse42_crc32_32_8>; + def CRC32r32m16 : SS42I_crc32m<0xF1, "crc32{w}", GR32, i16mem, + int_x86_sse42_crc32_32_16>, OpSize; + def CRC32r32r16 : SS42I_crc32r<0xF1, "crc32{w}", GR32, GR16, + int_x86_sse42_crc32_32_16>, OpSize; + def CRC32r32m32 : SS42I_crc32m<0xF1, 
"crc32{l}", GR32, i32mem, + int_x86_sse42_crc32_32_32>; + def CRC32r32r32 : SS42I_crc32r<0xF1, "crc32{l}", GR32, GR32, + int_x86_sse42_crc32_32_32>; + def CRC32r64m64 : SS42I_crc32m<0xF1, "crc32{q}", GR64, i64mem, + int_x86_sse42_crc32_64_64>, REX_W; + def CRC32r64r64 : SS42I_crc32r<0xF1, "crc32{q}", GR64, GR64, + int_x86_sse42_crc32_64_64>, REX_W; + let hasSideEffects = 0 in { + let mayLoad = 1 in + def CRC32r64m8 : SS42I_crc32m<0xF0, "crc32{b}", GR64, i8mem, + null_frag>, REX_W; + def CRC32r64r8 : SS42I_crc32r<0xF0, "crc32{b}", GR64, GR8, + null_frag>, REX_W; + } +} + +//===----------------------------------------------------------------------===// +// SHA-NI Instructions +//===----------------------------------------------------------------------===// + +multiclass SHAI_binop<bits<8> Opc, string OpcodeStr, Intrinsic IntId, + bit UsesXMM0 = 0> { + def rr : I<Opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2), + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + [!if(UsesXMM0, + (set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0)), + (set VR128:$dst, (IntId VR128:$src1, VR128:$src2)))]>, T8; + + def rm : I<Opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, i128mem:$src2), + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + [!if(UsesXMM0, + (set VR128:$dst, (IntId VR128:$src1, + (bc_v4i32 (memopv2i64 addr:$src2)), XMM0)), + (set VR128:$dst, (IntId VR128:$src1, + (bc_v4i32 (memopv2i64 addr:$src2)))))]>, T8; +} + +let Constraints = "$src1 = $dst", Predicates = [HasSHA] in { + def SHA1RNDS4rri : Ii8<0xCC, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, i8imm:$src3), + "sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}", + [(set VR128:$dst, + (int_x86_sha1rnds4 VR128:$src1, VR128:$src2, + (i8 imm:$src3)))]>, TA; + def SHA1RNDS4rmi : Ii8<0xCC, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, i128mem:$src2, i8imm:$src3), + "sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}", + [(set VR128:$dst, + (int_x86_sha1rnds4 VR128:$src1, + (bc_v4i32 (memopv2i64 addr:$src2)), + (i8 imm:$src3)))]>, TA; + + defm SHA1NEXTE : SHAI_binop<0xC8, "sha1nexte", int_x86_sha1nexte>; + defm SHA1MSG1 : SHAI_binop<0xC9, "sha1msg1", int_x86_sha1msg1>; + defm SHA1MSG2 : SHAI_binop<0xCA, "sha1msg2", int_x86_sha1msg2>; + + let Uses=[XMM0] in + defm SHA256RNDS2 : SHAI_binop<0xCB, "sha256rnds2", int_x86_sha256rnds2, 1>; + + defm SHA256MSG1 : SHAI_binop<0xCC, "sha256msg1", int_x86_sha256msg1>; + defm SHA256MSG2 : SHAI_binop<0xCD, "sha256msg2", int_x86_sha256msg2>; } +// Aliases with explicit %xmm0 +def : InstAlias<"sha256rnds2\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}", + (SHA256RNDS2rr VR128:$dst, VR128:$src2)>; +def : InstAlias<"sha256rnds2\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}", + (SHA256RNDS2rm VR128:$dst, i128mem:$src2)>; + //===----------------------------------------------------------------------===// // AES-NI Instructions //===----------------------------------------------------------------------===// @@ -7378,7 +7401,7 @@ let Predicates = [HasAVX, HasAES] in { def VAESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src1), "vaesimc\t{$src1, $dst|$dst, $src1}", - [(set VR128:$dst, (int_x86_aesni_aesimc (memopv2i64 addr:$src1)))]>, + [(set VR128:$dst, (int_x86_aesni_aesimc (loadv2i64 addr:$src1)))]>, OpSize, VEX; } def AESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst), @@ -7405,7 +7428,7 @@ let Predicates = [HasAVX, HasAES] in { (ins i128mem:$src1, i8imm:$src2), "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set VR128:$dst, - 
(int_x86_aesni_aeskeygenassist (memopv2i64 addr:$src1), imm:$src2))]>, + (int_x86_aesni_aeskeygenassist (loadv2i64 addr:$src1), imm:$src2))]>, OpSize, VEX; } def AESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst), @@ -7436,7 +7459,7 @@ def VPCLMULQDQrm : AVXPCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2, i8imm:$src3), "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", [(set VR128:$dst, (int_x86_pclmulqdq VR128:$src1, - (memopv2i64 addr:$src2), imm:$src3))]>; + (loadv2i64 addr:$src2), imm:$src3))]>; // Carry-less Multiplication instructions let Constraints = "$src1 = $dst" in { @@ -7444,13 +7467,15 @@ def PCLMULQDQrr : PCLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i8imm:$src3), "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(set VR128:$dst, - (int_x86_pclmulqdq VR128:$src1, VR128:$src2, imm:$src3))]>; + (int_x86_pclmulqdq VR128:$src1, VR128:$src2, imm:$src3))], + IIC_SSE_PCLMULQDQ_RR>; def PCLMULQDQrm : PCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2, i8imm:$src3), "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(set VR128:$dst, (int_x86_pclmulqdq VR128:$src1, - (memopv2i64 addr:$src2), imm:$src3))]>; + (memopv2i64 addr:$src2), imm:$src3))], + IIC_SSE_PCLMULQDQ_RM>; } // Constraints = "$src1 = $dst" @@ -7581,62 +7606,62 @@ def VINSERTF128rm : AVXAIi8<0x18, MRMSrcMem, (outs VR256:$dst), } let Predicates = [HasAVX] in { -def : Pat<(vinsertf128_insert:$ins (v8f32 VR256:$src1), (v4f32 VR128:$src2), +def : Pat<(vinsert128_insert:$ins (v8f32 VR256:$src1), (v4f32 VR128:$src2), (iPTR imm)), (VINSERTF128rr VR256:$src1, VR128:$src2, - (INSERT_get_vinsertf128_imm VR256:$ins))>; -def : Pat<(vinsertf128_insert:$ins (v4f64 VR256:$src1), (v2f64 VR128:$src2), + (INSERT_get_vinsert128_imm VR256:$ins))>; +def : Pat<(vinsert128_insert:$ins (v4f64 VR256:$src1), (v2f64 VR128:$src2), (iPTR imm)), (VINSERTF128rr VR256:$src1, VR128:$src2, - (INSERT_get_vinsertf128_imm VR256:$ins))>; + (INSERT_get_vinsert128_imm VR256:$ins))>; -def : Pat<(vinsertf128_insert:$ins (v8f32 VR256:$src1), (memopv4f32 addr:$src2), +def : Pat<(vinsert128_insert:$ins (v8f32 VR256:$src1), (loadv4f32 addr:$src2), (iPTR imm)), (VINSERTF128rm VR256:$src1, addr:$src2, - (INSERT_get_vinsertf128_imm VR256:$ins))>; -def : Pat<(vinsertf128_insert:$ins (v4f64 VR256:$src1), (memopv2f64 addr:$src2), + (INSERT_get_vinsert128_imm VR256:$ins))>; +def : Pat<(vinsert128_insert:$ins (v4f64 VR256:$src1), (loadv2f64 addr:$src2), (iPTR imm)), (VINSERTF128rm VR256:$src1, addr:$src2, - (INSERT_get_vinsertf128_imm VR256:$ins))>; + (INSERT_get_vinsert128_imm VR256:$ins))>; } let Predicates = [HasAVX1Only] in { -def : Pat<(vinsertf128_insert:$ins (v4i64 VR256:$src1), (v2i64 VR128:$src2), +def : Pat<(vinsert128_insert:$ins (v4i64 VR256:$src1), (v2i64 VR128:$src2), (iPTR imm)), (VINSERTF128rr VR256:$src1, VR128:$src2, - (INSERT_get_vinsertf128_imm VR256:$ins))>; -def : Pat<(vinsertf128_insert:$ins (v8i32 VR256:$src1), (v4i32 VR128:$src2), + (INSERT_get_vinsert128_imm VR256:$ins))>; +def : Pat<(vinsert128_insert:$ins (v8i32 VR256:$src1), (v4i32 VR128:$src2), (iPTR imm)), (VINSERTF128rr VR256:$src1, VR128:$src2, - (INSERT_get_vinsertf128_imm VR256:$ins))>; -def : Pat<(vinsertf128_insert:$ins (v32i8 VR256:$src1), (v16i8 VR128:$src2), + (INSERT_get_vinsert128_imm VR256:$ins))>; +def : Pat<(vinsert128_insert:$ins (v32i8 VR256:$src1), (v16i8 VR128:$src2), (iPTR imm)), (VINSERTF128rr VR256:$src1, VR128:$src2, - (INSERT_get_vinsertf128_imm 
VR256:$ins))>; -def : Pat<(vinsertf128_insert:$ins (v16i16 VR256:$src1), (v8i16 VR128:$src2), + (INSERT_get_vinsert128_imm VR256:$ins))>; +def : Pat<(vinsert128_insert:$ins (v16i16 VR256:$src1), (v8i16 VR128:$src2), (iPTR imm)), (VINSERTF128rr VR256:$src1, VR128:$src2, - (INSERT_get_vinsertf128_imm VR256:$ins))>; + (INSERT_get_vinsert128_imm VR256:$ins))>; -def : Pat<(vinsertf128_insert:$ins (v4i64 VR256:$src1), (memopv2i64 addr:$src2), +def : Pat<(vinsert128_insert:$ins (v4i64 VR256:$src1), (loadv2i64 addr:$src2), (iPTR imm)), (VINSERTF128rm VR256:$src1, addr:$src2, - (INSERT_get_vinsertf128_imm VR256:$ins))>; -def : Pat<(vinsertf128_insert:$ins (v8i32 VR256:$src1), - (bc_v4i32 (memopv2i64 addr:$src2)), + (INSERT_get_vinsert128_imm VR256:$ins))>; +def : Pat<(vinsert128_insert:$ins (v8i32 VR256:$src1), + (bc_v4i32 (loadv2i64 addr:$src2)), (iPTR imm)), (VINSERTF128rm VR256:$src1, addr:$src2, - (INSERT_get_vinsertf128_imm VR256:$ins))>; -def : Pat<(vinsertf128_insert:$ins (v32i8 VR256:$src1), - (bc_v16i8 (memopv2i64 addr:$src2)), + (INSERT_get_vinsert128_imm VR256:$ins))>; +def : Pat<(vinsert128_insert:$ins (v32i8 VR256:$src1), + (bc_v16i8 (loadv2i64 addr:$src2)), (iPTR imm)), (VINSERTF128rm VR256:$src1, addr:$src2, - (INSERT_get_vinsertf128_imm VR256:$ins))>; -def : Pat<(vinsertf128_insert:$ins (v16i16 VR256:$src1), - (bc_v8i16 (memopv2i64 addr:$src2)), + (INSERT_get_vinsert128_imm VR256:$ins))>; +def : Pat<(vinsert128_insert:$ins (v16i16 VR256:$src1), + (bc_v8i16 (loadv2i64 addr:$src2)), (iPTR imm)), (VINSERTF128rm VR256:$src1, addr:$src2, - (INSERT_get_vinsertf128_imm VR256:$ins))>; + (INSERT_get_vinsert128_imm VR256:$ins))>; } //===----------------------------------------------------------------------===// @@ -7656,59 +7681,59 @@ def VEXTRACTF128mr : AVXAIi8<0x19, MRMDestMem, (outs), // AVX1 patterns let Predicates = [HasAVX] in { -def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)), +def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)), (v4f32 (VEXTRACTF128rr (v8f32 VR256:$src1), - (EXTRACT_get_vextractf128_imm VR128:$ext)))>; -def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)), + (EXTRACT_get_vextract128_imm VR128:$ext)))>; +def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)), (v2f64 (VEXTRACTF128rr (v4f64 VR256:$src1), - (EXTRACT_get_vextractf128_imm VR128:$ext)))>; + (EXTRACT_get_vextract128_imm VR128:$ext)))>; -def : Pat<(alignedstore (v4f32 (vextractf128_extract:$ext (v8f32 VR256:$src1), - (iPTR imm))), addr:$dst), +def : Pat<(store (v4f32 (vextract128_extract:$ext (v8f32 VR256:$src1), + (iPTR imm))), addr:$dst), (VEXTRACTF128mr addr:$dst, VR256:$src1, - (EXTRACT_get_vextractf128_imm VR128:$ext))>; -def : Pat<(alignedstore (v2f64 (vextractf128_extract:$ext (v4f64 VR256:$src1), - (iPTR imm))), addr:$dst), + (EXTRACT_get_vextract128_imm VR128:$ext))>; +def : Pat<(store (v2f64 (vextract128_extract:$ext (v4f64 VR256:$src1), + (iPTR imm))), addr:$dst), (VEXTRACTF128mr addr:$dst, VR256:$src1, - (EXTRACT_get_vextractf128_imm VR128:$ext))>; + (EXTRACT_get_vextract128_imm VR128:$ext))>; } let Predicates = [HasAVX1Only] in { -def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)), +def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)), (v2i64 (VEXTRACTF128rr (v4i64 VR256:$src1), - (EXTRACT_get_vextractf128_imm VR128:$ext)))>; -def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)), + (EXTRACT_get_vextract128_imm VR128:$ext)))>; +def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)), (v4i32 (VEXTRACTF128rr (v8i32 VR256:$src1), 
- (EXTRACT_get_vextractf128_imm VR128:$ext)))>; -def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)), + (EXTRACT_get_vextract128_imm VR128:$ext)))>; +def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)), (v8i16 (VEXTRACTF128rr (v16i16 VR256:$src1), - (EXTRACT_get_vextractf128_imm VR128:$ext)))>; -def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)), + (EXTRACT_get_vextract128_imm VR128:$ext)))>; +def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)), (v16i8 (VEXTRACTF128rr (v32i8 VR256:$src1), - (EXTRACT_get_vextractf128_imm VR128:$ext)))>; + (EXTRACT_get_vextract128_imm VR128:$ext)))>; -def : Pat<(alignedstore (v2i64 (vextractf128_extract:$ext (v4i64 VR256:$src1), +def : Pat<(alignedstore (v2i64 (vextract128_extract:$ext (v4i64 VR256:$src1), (iPTR imm))), addr:$dst), (VEXTRACTF128mr addr:$dst, VR256:$src1, - (EXTRACT_get_vextractf128_imm VR128:$ext))>; -def : Pat<(alignedstore (v4i32 (vextractf128_extract:$ext (v8i32 VR256:$src1), + (EXTRACT_get_vextract128_imm VR128:$ext))>; +def : Pat<(alignedstore (v4i32 (vextract128_extract:$ext (v8i32 VR256:$src1), (iPTR imm))), addr:$dst), (VEXTRACTF128mr addr:$dst, VR256:$src1, - (EXTRACT_get_vextractf128_imm VR128:$ext))>; -def : Pat<(alignedstore (v8i16 (vextractf128_extract:$ext (v16i16 VR256:$src1), + (EXTRACT_get_vextract128_imm VR128:$ext))>; +def : Pat<(alignedstore (v8i16 (vextract128_extract:$ext (v16i16 VR256:$src1), (iPTR imm))), addr:$dst), (VEXTRACTF128mr addr:$dst, VR256:$src1, - (EXTRACT_get_vextractf128_imm VR128:$ext))>; -def : Pat<(alignedstore (v16i8 (vextractf128_extract:$ext (v32i8 VR256:$src1), + (EXTRACT_get_vextract128_imm VR128:$ext))>; +def : Pat<(alignedstore (v16i8 (vextract128_extract:$ext (v32i8 VR256:$src1), (iPTR imm))), addr:$dst), (VEXTRACTF128mr addr:$dst, VR256:$src1, - (EXTRACT_get_vextractf128_imm VR128:$ext))>; + (EXTRACT_get_vextract128_imm VR128:$ext))>; } //===----------------------------------------------------------------------===// @@ -7780,15 +7805,15 @@ multiclass avx_permil<bits<8> opc_rm, bits<8> opc_rmi, string OpcodeStr, let ExeDomain = SSEPackedSingle in { defm VPERMILPS : avx_permil<0x0C, 0x04, "vpermilps", VR128, f128mem, i128mem, - memopv2i64, int_x86_avx_vpermilvar_ps, v4f32>; + loadv2i64, int_x86_avx_vpermilvar_ps, v4f32>; defm VPERMILPSY : avx_permil<0x0C, 0x04, "vpermilps", VR256, f256mem, i256mem, - memopv4i64, int_x86_avx_vpermilvar_ps_256, v8f32>, VEX_L; + loadv4i64, int_x86_avx_vpermilvar_ps_256, v8f32>, VEX_L; } let ExeDomain = SSEPackedDouble in { defm VPERMILPD : avx_permil<0x0D, 0x05, "vpermilpd", VR128, f128mem, i128mem, - memopv2i64, int_x86_avx_vpermilvar_pd, v2f64>; + loadv2i64, int_x86_avx_vpermilvar_pd, v2f64>; defm VPERMILPDY : avx_permil<0x0D, 0x05, "vpermilpd", VR256, f256mem, i256mem, - memopv4i64, int_x86_avx_vpermilvar_pd_256, v4f64>, VEX_L; + loadv4i64, int_x86_avx_vpermilvar_pd_256, v4f64>, VEX_L; } let Predicates = [HasAVX] in { @@ -7796,15 +7821,15 @@ def : Pat<(v8i32 (X86VPermilp VR256:$src1, (i8 imm:$imm))), (VPERMILPSYri VR256:$src1, imm:$imm)>; def : Pat<(v4i64 (X86VPermilp VR256:$src1, (i8 imm:$imm))), (VPERMILPDYri VR256:$src1, imm:$imm)>; -def : Pat<(v8i32 (X86VPermilp (bc_v8i32 (memopv4i64 addr:$src1)), +def : Pat<(v8i32 (X86VPermilp (bc_v8i32 (loadv4i64 addr:$src1)), (i8 imm:$imm))), (VPERMILPSYmi addr:$src1, imm:$imm)>; -def : Pat<(v4i64 (X86VPermilp (memopv4i64 addr:$src1), (i8 imm:$imm))), +def : Pat<(v4i64 (X86VPermilp (loadv4i64 addr:$src1), (i8 imm:$imm))), (VPERMILPDYmi addr:$src1, imm:$imm)>; def : Pat<(v2i64 
(X86VPermilp VR128:$src1, (i8 imm:$imm))), (VPERMILPDri VR128:$src1, imm:$imm)>; -def : Pat<(v2i64 (X86VPermilp (memopv2i64 addr:$src1), (i8 imm:$imm))), +def : Pat<(v2i64 (X86VPermilp (loadv2i64 addr:$src1), (i8 imm:$imm))), (VPERMILPDmi addr:$src1, imm:$imm)>; } @@ -7820,7 +7845,7 @@ def VPERM2F128rr : AVXAIi8<0x06, MRMSrcReg, (outs VR256:$dst), def VPERM2F128rm : AVXAIi8<0x06, MRMSrcMem, (outs VR256:$dst), (ins VR256:$src1, f256mem:$src2, i8imm:$src3), "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", - [(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (memopv8f32 addr:$src2), + [(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (loadv8f32 addr:$src2), (i8 imm:$src3)))]>, VEX_4V, VEX_L; } @@ -7828,7 +7853,7 @@ let Predicates = [HasAVX] in { def : Pat<(v4f64 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))), (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>; def : Pat<(v4f64 (X86VPerm2x128 VR256:$src1, - (memopv4f64 addr:$src2), (i8 imm:$imm))), + (loadv4f64 addr:$src2), (i8 imm:$imm))), (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>; } @@ -7843,16 +7868,16 @@ def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))), (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>; def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1, - (bc_v8i32 (memopv4i64 addr:$src2)), (i8 imm:$imm))), + (bc_v8i32 (loadv4i64 addr:$src2)), (i8 imm:$imm))), (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>; def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1, - (memopv4i64 addr:$src2), (i8 imm:$imm))), + (loadv4i64 addr:$src2), (i8 imm:$imm))), (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>; def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1, - (bc_v32i8 (memopv4i64 addr:$src2)), (i8 imm:$imm))), + (bc_v32i8 (loadv4i64 addr:$src2)), (i8 imm:$imm))), (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>; def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1, - (bc_v16i16 (memopv4i64 addr:$src2)), (i8 imm:$imm))), + (bc_v16i16 (loadv4i64 addr:$src2)), (i8 imm:$imm))), (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>; } @@ -7896,7 +7921,7 @@ multiclass f16c_ps2ph<RegisterClass RC, X86MemOperand x86memop, Intrinsic Int> { TA, OpSize, VEX; } -let Predicates = [HasAVX, HasF16C] in { +let Predicates = [HasF16C] in { defm VCVTPH2PS : f16c_ph2ps<VR128, f64mem, int_x86_vcvtph2ps_128>; defm VCVTPH2PSY : f16c_ph2ps<VR256, f128mem, int_x86_vcvtph2ps_256>, VEX_L; defm VCVTPS2PH : f16c_ps2ph<VR128, f64mem, int_x86_vcvtps2ph_128>; @@ -7930,9 +7955,9 @@ multiclass AVX2_binop_rmi_int<bits<8> opc, string OpcodeStr, let isCommutable = 0 in { defm VPBLENDD : AVX2_binop_rmi_int<0x02, "vpblendd", int_x86_avx2_pblendd_128, - VR128, memopv2i64, i128mem>; + VR128, loadv2i64, i128mem>; defm VPBLENDDY : AVX2_binop_rmi_int<0x02, "vpblendd", int_x86_avx2_pblendd_256, - VR256, memopv4i64, i256mem>, VEX_L; + VR256, loadv4i64, i256mem>, VEX_L; } def : Pat<(v4i32 (X86Blendi (v4i32 VR128:$src1), (v4i32 VR128:$src2), @@ -8110,9 +8135,9 @@ multiclass avx2_perm<bits<8> opc, string OpcodeStr, PatFrag mem_frag, VEX_4V, VEX_L; } -defm VPERMD : avx2_perm<0x36, "vpermd", memopv4i64, v8i32>; +defm VPERMD : avx2_perm<0x36, "vpermd", loadv4i64, v8i32>; let ExeDomain = SSEPackedSingle in -defm VPERMPS : avx2_perm<0x16, "vpermps", memopv8f32, v8f32>; +defm VPERMPS : avx2_perm<0x16, "vpermps", loadv8f32, v8f32>; multiclass avx2_perm_imm<bits<8> opc, string OpcodeStr, PatFrag mem_frag, ValueType OpVT> { @@ -8132,9 +8157,9 @@ multiclass avx2_perm_imm<bits<8> opc, string OpcodeStr, PatFrag mem_frag, (i8 imm:$src2))))]>, VEX, VEX_L; } -defm VPERMQ : 
avx2_perm_imm<0x00, "vpermq", memopv4i64, v4i64>, VEX_W; +defm VPERMQ : avx2_perm_imm<0x00, "vpermq", loadv4i64, v4i64>, VEX_W; let ExeDomain = SSEPackedDouble in -defm VPERMPD : avx2_perm_imm<0x01, "vpermpd", memopv4f64, v4f64>, VEX_W; +defm VPERMPD : avx2_perm_imm<0x01, "vpermpd", loadv4f64, v4f64>, VEX_W; //===----------------------------------------------------------------------===// // VPERM2I128 - Permute Floating-Point Values in 128-bit chunks @@ -8147,7 +8172,7 @@ def VPERM2I128rr : AVX2AIi8<0x46, MRMSrcReg, (outs VR256:$dst), def VPERM2I128rm : AVX2AIi8<0x46, MRMSrcMem, (outs VR256:$dst), (ins VR256:$src1, f256mem:$src2, i8imm:$src3), "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", - [(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (memopv4i64 addr:$src2), + [(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (loadv4i64 addr:$src2), (i8 imm:$src3)))]>, VEX_4V, VEX_L; let Predicates = [HasAVX2] in { @@ -8158,13 +8183,13 @@ def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))), def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))), (VPERM2I128rr VR256:$src1, VR256:$src2, imm:$imm)>; -def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1, (bc_v32i8 (memopv4i64 addr:$src2)), +def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1, (bc_v32i8 (loadv4i64 addr:$src2)), (i8 imm:$imm))), (VPERM2I128rm VR256:$src1, addr:$src2, imm:$imm)>; def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1, - (bc_v16i16 (memopv4i64 addr:$src2)), (i8 imm:$imm))), + (bc_v16i16 (loadv4i64 addr:$src2)), (i8 imm:$imm))), (VPERM2I128rm VR256:$src1, addr:$src2, imm:$imm)>; -def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1, (bc_v8i32 (memopv4i64 addr:$src2)), +def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1, (bc_v8i32 (loadv4i64 addr:$src2)), (i8 imm:$imm))), (VPERM2I128rm VR256:$src1, addr:$src2, imm:$imm)>; } @@ -8186,42 +8211,42 @@ def VINSERTI128rm : AVX2AIi8<0x38, MRMSrcMem, (outs VR256:$dst), } let Predicates = [HasAVX2] in { -def : Pat<(vinsertf128_insert:$ins (v4i64 VR256:$src1), (v2i64 VR128:$src2), +def : Pat<(vinsert128_insert:$ins (v4i64 VR256:$src1), (v2i64 VR128:$src2), (iPTR imm)), (VINSERTI128rr VR256:$src1, VR128:$src2, - (INSERT_get_vinsertf128_imm VR256:$ins))>; -def : Pat<(vinsertf128_insert:$ins (v8i32 VR256:$src1), (v4i32 VR128:$src2), + (INSERT_get_vinsert128_imm VR256:$ins))>; +def : Pat<(vinsert128_insert:$ins (v8i32 VR256:$src1), (v4i32 VR128:$src2), (iPTR imm)), (VINSERTI128rr VR256:$src1, VR128:$src2, - (INSERT_get_vinsertf128_imm VR256:$ins))>; -def : Pat<(vinsertf128_insert:$ins (v32i8 VR256:$src1), (v16i8 VR128:$src2), + (INSERT_get_vinsert128_imm VR256:$ins))>; +def : Pat<(vinsert128_insert:$ins (v32i8 VR256:$src1), (v16i8 VR128:$src2), (iPTR imm)), (VINSERTI128rr VR256:$src1, VR128:$src2, - (INSERT_get_vinsertf128_imm VR256:$ins))>; -def : Pat<(vinsertf128_insert:$ins (v16i16 VR256:$src1), (v8i16 VR128:$src2), + (INSERT_get_vinsert128_imm VR256:$ins))>; +def : Pat<(vinsert128_insert:$ins (v16i16 VR256:$src1), (v8i16 VR128:$src2), (iPTR imm)), (VINSERTI128rr VR256:$src1, VR128:$src2, - (INSERT_get_vinsertf128_imm VR256:$ins))>; + (INSERT_get_vinsert128_imm VR256:$ins))>; -def : Pat<(vinsertf128_insert:$ins (v4i64 VR256:$src1), (memopv2i64 addr:$src2), +def : Pat<(vinsert128_insert:$ins (v4i64 VR256:$src1), (loadv2i64 addr:$src2), (iPTR imm)), (VINSERTI128rm VR256:$src1, addr:$src2, - (INSERT_get_vinsertf128_imm VR256:$ins))>; -def : Pat<(vinsertf128_insert:$ins (v8i32 VR256:$src1), - (bc_v4i32 (memopv2i64 addr:$src2)), + (INSERT_get_vinsert128_imm VR256:$ins))>; +def : 
Pat<(vinsert128_insert:$ins (v8i32 VR256:$src1), + (bc_v4i32 (loadv2i64 addr:$src2)), (iPTR imm)), (VINSERTI128rm VR256:$src1, addr:$src2, - (INSERT_get_vinsertf128_imm VR256:$ins))>; -def : Pat<(vinsertf128_insert:$ins (v32i8 VR256:$src1), - (bc_v16i8 (memopv2i64 addr:$src2)), + (INSERT_get_vinsert128_imm VR256:$ins))>; +def : Pat<(vinsert128_insert:$ins (v32i8 VR256:$src1), + (bc_v16i8 (loadv2i64 addr:$src2)), (iPTR imm)), (VINSERTI128rm VR256:$src1, addr:$src2, - (INSERT_get_vinsertf128_imm VR256:$ins))>; -def : Pat<(vinsertf128_insert:$ins (v16i16 VR256:$src1), - (bc_v8i16 (memopv2i64 addr:$src2)), + (INSERT_get_vinsert128_imm VR256:$ins))>; +def : Pat<(vinsert128_insert:$ins (v16i16 VR256:$src1), + (bc_v8i16 (loadv2i64 addr:$src2)), (iPTR imm)), (VINSERTI128rm VR256:$src1, addr:$src2, - (INSERT_get_vinsertf128_imm VR256:$ins))>; + (INSERT_get_vinsert128_imm VR256:$ins))>; } //===----------------------------------------------------------------------===// @@ -8240,39 +8265,39 @@ def VEXTRACTI128mr : AVX2AIi8<0x39, MRMDestMem, (outs), VEX, VEX_L; let Predicates = [HasAVX2] in { -def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)), +def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)), (v2i64 (VEXTRACTI128rr (v4i64 VR256:$src1), - (EXTRACT_get_vextractf128_imm VR128:$ext)))>; -def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)), + (EXTRACT_get_vextract128_imm VR128:$ext)))>; +def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)), (v4i32 (VEXTRACTI128rr (v8i32 VR256:$src1), - (EXTRACT_get_vextractf128_imm VR128:$ext)))>; -def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)), + (EXTRACT_get_vextract128_imm VR128:$ext)))>; +def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)), (v8i16 (VEXTRACTI128rr (v16i16 VR256:$src1), - (EXTRACT_get_vextractf128_imm VR128:$ext)))>; -def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)), + (EXTRACT_get_vextract128_imm VR128:$ext)))>; +def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)), (v16i8 (VEXTRACTI128rr (v32i8 VR256:$src1), - (EXTRACT_get_vextractf128_imm VR128:$ext)))>; + (EXTRACT_get_vextract128_imm VR128:$ext)))>; -def : Pat<(alignedstore (v2i64 (vextractf128_extract:$ext (v4i64 VR256:$src1), - (iPTR imm))), addr:$dst), +def : Pat<(store (v2i64 (vextract128_extract:$ext (v4i64 VR256:$src1), + (iPTR imm))), addr:$dst), (VEXTRACTI128mr addr:$dst, VR256:$src1, - (EXTRACT_get_vextractf128_imm VR128:$ext))>; -def : Pat<(alignedstore (v4i32 (vextractf128_extract:$ext (v8i32 VR256:$src1), - (iPTR imm))), addr:$dst), + (EXTRACT_get_vextract128_imm VR128:$ext))>; +def : Pat<(store (v4i32 (vextract128_extract:$ext (v8i32 VR256:$src1), + (iPTR imm))), addr:$dst), (VEXTRACTI128mr addr:$dst, VR256:$src1, - (EXTRACT_get_vextractf128_imm VR128:$ext))>; -def : Pat<(alignedstore (v8i16 (vextractf128_extract:$ext (v16i16 VR256:$src1), - (iPTR imm))), addr:$dst), + (EXTRACT_get_vextract128_imm VR128:$ext))>; +def : Pat<(store (v8i16 (vextract128_extract:$ext (v16i16 VR256:$src1), + (iPTR imm))), addr:$dst), (VEXTRACTI128mr addr:$dst, VR256:$src1, - (EXTRACT_get_vextractf128_imm VR128:$ext))>; -def : Pat<(alignedstore (v16i8 (vextractf128_extract:$ext (v32i8 VR256:$src1), - (iPTR imm))), addr:$dst), + (EXTRACT_get_vextract128_imm VR128:$ext))>; +def : Pat<(store (v16i8 (vextract128_extract:$ext (v32i8 VR256:$src1), + (iPTR imm))), addr:$dst), (VEXTRACTI128mr addr:$dst, VR256:$src1, - (EXTRACT_get_vextractf128_imm VR128:$ext))>; + (EXTRACT_get_vextract128_imm VR128:$ext))>; } 
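(Aside, not part of the patch: the VINSERTI128/VEXTRACTI128 patterns above now also match plain `store`, i.e. the extracted 128-bit lane may be written to an unaligned address. A minimal C++ sketch of the lane operations these patterns may select, assuming g++ with -mavx2; the buffer and values are illustrative.)

// lane_demo.cpp -- illustrative only.
// Build with: g++ -mavx2 lane_demo.cpp
#include <immintrin.h>
#include <cstdio>
#include <cstring>

int main() {
  __m256i v  = _mm256_set1_epi32(7);
  __m128i hi = _mm_set1_epi32(42);

  // VINSERTI128: replace the upper 128-bit lane of a 256-bit register.
  __m256i merged = _mm256_inserti128_si256(v, hi, 1);

  // VEXTRACTI128 followed by an unaligned 128-bit store.
  char buf[sizeof(__m128i) + 1];
  _mm_storeu_si128((__m128i *)(buf + 1),              // deliberately misaligned
                   _mm256_extracti128_si256(merged, 1));

  int first;
  memcpy(&first, buf + 1, sizeof first);
  printf("%d\n", first);                              // 42
  return 0;
}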
//===----------------------------------------------------------------------===// @@ -8328,7 +8353,7 @@ multiclass avx2_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode, !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR128:$dst, (vt128 (OpNode VR128:$src1, - (vt128 (bitconvert (memopv2i64 addr:$src2))))))]>, + (vt128 (bitconvert (loadv2i64 addr:$src2))))))]>, VEX_4V; def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src1, VR256:$src2), @@ -8341,7 +8366,7 @@ multiclass avx2_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode, !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR256:$dst, (vt256 (OpNode VR256:$src1, - (vt256 (bitconvert (memopv4i64 addr:$src2))))))]>, + (vt256 (bitconvert (loadv4i64 addr:$src2))))))]>, VEX_4V, VEX_L; } @@ -8367,7 +8392,9 @@ multiclass avx2_gather<bits<8> opc, string OpcodeStr, RegisterClass RC256, []>, VEX_4VOp3, VEX_L; } -let mayLoad = 1, Constraints = "$src1 = $dst, $mask = $mask_wb" in { +let mayLoad = 1, Constraints + = "@earlyclobber $dst,@earlyclobber $mask_wb, $src1 = $dst, $mask = $mask_wb" + in { defm VGATHERDPD : avx2_gather<0x92, "vgatherdpd", VR256, vx64mem, vx64mem>, VEX_W; defm VGATHERQPD : avx2_gather<0x93, "vgatherqpd", VR256, vx64mem, vy64mem>, VEX_W; defm VGATHERDPS : avx2_gather<0x92, "vgatherdps", VR256, vx32mem, vy32mem>; diff --git a/contrib/llvm/lib/Target/X86/X86InstrSVM.td b/contrib/llvm/lib/Target/X86/X86InstrSVM.td index 757dcd0..0191c01 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrSVM.td +++ b/contrib/llvm/lib/Target/X86/X86InstrSVM.td @@ -26,37 +26,37 @@ def CLGI : I<0x01, MRM_DD, (outs), (ins), "clgi", []>, TB; // 0F 01 DE let Uses = [EAX] in -def SKINIT : I<0x01, MRM_DE, (outs), (ins), "skinit\t{%eax|EAX}", []>, TB; +def SKINIT : I<0x01, MRM_DE, (outs), (ins), "skinit\t{%eax|eax}", []>, TB; // 0F 01 D8 let Uses = [EAX] in def VMRUN32 : I<0x01, MRM_D8, (outs), (ins), - "vmrun\t{%eax|EAX}", []>, TB, Requires<[In32BitMode]>; + "vmrun\t{%eax|eax}", []>, TB, Requires<[In32BitMode]>; let Uses = [RAX] in def VMRUN64 : I<0x01, MRM_D8, (outs), (ins), - "vmrun\t{%rax|RAX}", []>, TB, Requires<[In64BitMode]>; + "vmrun\t{%rax|rax}", []>, TB, Requires<[In64BitMode]>; // 0F 01 DA let Uses = [EAX] in def VMLOAD32 : I<0x01, MRM_DA, (outs), (ins), - "vmload\t{%eax|EAX}", []>, TB, Requires<[In32BitMode]>; + "vmload\t{%eax|eax}", []>, TB, Requires<[In32BitMode]>; let Uses = [RAX] in def VMLOAD64 : I<0x01, MRM_DA, (outs), (ins), - "vmload\t{%rax|RAX}", []>, TB, Requires<[In64BitMode]>; + "vmload\t{%rax|rax}", []>, TB, Requires<[In64BitMode]>; // 0F 01 DB let Uses = [EAX] in def VMSAVE32 : I<0x01, MRM_DB, (outs), (ins), - "vmsave\t{%eax|EAX}", []>, TB, Requires<[In32BitMode]>; + "vmsave\t{%eax|eax}", []>, TB, Requires<[In32BitMode]>; let Uses = [RAX] in def VMSAVE64 : I<0x01, MRM_DB, (outs), (ins), - "vmsave\t{%rax|RAX}", []>, TB, Requires<[In64BitMode]>; + "vmsave\t{%rax|rax}", []>, TB, Requires<[In64BitMode]>; // 0F 01 DF let Uses = [EAX, ECX] in def INVLPGA32 : I<0x01, MRM_DF, (outs), (ins), - "invlpga\t{%ecx, %eax|EAX, ECX}", []>, TB, Requires<[In32BitMode]>; + "invlpga\t{%ecx, %eax|eax, ecx}", []>, TB, Requires<[In32BitMode]>; let Uses = [RAX, ECX] in def INVLPGA64 : I<0x01, MRM_DF, (outs), (ins), - "invlpga\t{%ecx, %rax|RAX, ECX}", []>, TB, Requires<[In64BitMode]>; + "invlpga\t{%ecx, %rax|rax, ecx}", []>, TB, Requires<[In64BitMode]>; diff --git a/contrib/llvm/lib/Target/X86/X86InstrShiftRotate.td b/contrib/llvm/lib/Target/X86/X86InstrShiftRotate.td index 
89c1a68..1937770 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrShiftRotate.td +++ b/contrib/llvm/lib/Target/X86/X86InstrShiftRotate.td @@ -18,16 +18,16 @@ let Defs = [EFLAGS] in { let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in { let Uses = [CL] in { def SHL8rCL : I<0xD2, MRM4r, (outs GR8 :$dst), (ins GR8 :$src1), - "shl{b}\t{%cl, $dst|$dst, CL}", + "shl{b}\t{%cl, $dst|$dst, cl}", [(set GR8:$dst, (shl GR8:$src1, CL))], IIC_SR>; def SHL16rCL : I<0xD3, MRM4r, (outs GR16:$dst), (ins GR16:$src1), - "shl{w}\t{%cl, $dst|$dst, CL}", + "shl{w}\t{%cl, $dst|$dst, cl}", [(set GR16:$dst, (shl GR16:$src1, CL))], IIC_SR>, OpSize; def SHL32rCL : I<0xD3, MRM4r, (outs GR32:$dst), (ins GR32:$src1), - "shl{l}\t{%cl, $dst|$dst, CL}", + "shl{l}\t{%cl, $dst|$dst, cl}", [(set GR32:$dst, (shl GR32:$src1, CL))], IIC_SR>; def SHL64rCL : RI<0xD3, MRM4r, (outs GR64:$dst), (ins GR64:$src1), - "shl{q}\t{%cl, $dst|$dst, CL}", + "shl{q}\t{%cl, $dst|$dst, cl}", [(set GR64:$dst, (shl GR64:$src1, CL))], IIC_SR>; } // Uses = [CL] @@ -70,17 +70,17 @@ let SchedRW = [WriteShiftLd, WriteRMW] in { // using CL? let Uses = [CL] in { def SHL8mCL : I<0xD2, MRM4m, (outs), (ins i8mem :$dst), - "shl{b}\t{%cl, $dst|$dst, CL}", + "shl{b}\t{%cl, $dst|$dst, cl}", [(store (shl (loadi8 addr:$dst), CL), addr:$dst)], IIC_SR>; def SHL16mCL : I<0xD3, MRM4m, (outs), (ins i16mem:$dst), - "shl{w}\t{%cl, $dst|$dst, CL}", + "shl{w}\t{%cl, $dst|$dst, cl}", [(store (shl (loadi16 addr:$dst), CL), addr:$dst)], IIC_SR>, OpSize; def SHL32mCL : I<0xD3, MRM4m, (outs), (ins i32mem:$dst), - "shl{l}\t{%cl, $dst|$dst, CL}", + "shl{l}\t{%cl, $dst|$dst, cl}", [(store (shl (loadi32 addr:$dst), CL), addr:$dst)], IIC_SR>; def SHL64mCL : RI<0xD3, MRM4m, (outs), (ins i64mem:$dst), - "shl{q}\t{%cl, $dst|$dst, CL}", + "shl{q}\t{%cl, $dst|$dst, cl}", [(store (shl (loadi64 addr:$dst), CL), addr:$dst)], IIC_SR>; } def SHL8mi : Ii8<0xC0, MRM4m, (outs), (ins i8mem :$dst, i8imm:$src), @@ -124,16 +124,16 @@ def SHL64m1 : RI<0xD1, MRM4m, (outs), (ins i64mem:$dst), let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in { let Uses = [CL] in { def SHR8rCL : I<0xD2, MRM5r, (outs GR8 :$dst), (ins GR8 :$src1), - "shr{b}\t{%cl, $dst|$dst, CL}", + "shr{b}\t{%cl, $dst|$dst, cl}", [(set GR8:$dst, (srl GR8:$src1, CL))], IIC_SR>; def SHR16rCL : I<0xD3, MRM5r, (outs GR16:$dst), (ins GR16:$src1), - "shr{w}\t{%cl, $dst|$dst, CL}", + "shr{w}\t{%cl, $dst|$dst, cl}", [(set GR16:$dst, (srl GR16:$src1, CL))], IIC_SR>, OpSize; def SHR32rCL : I<0xD3, MRM5r, (outs GR32:$dst), (ins GR32:$src1), - "shr{l}\t{%cl, $dst|$dst, CL}", + "shr{l}\t{%cl, $dst|$dst, cl}", [(set GR32:$dst, (srl GR32:$src1, CL))], IIC_SR>; def SHR64rCL : RI<0xD3, MRM5r, (outs GR64:$dst), (ins GR64:$src1), - "shr{q}\t{%cl, $dst|$dst, CL}", + "shr{q}\t{%cl, $dst|$dst, cl}", [(set GR64:$dst, (srl GR64:$src1, CL))], IIC_SR>; } @@ -171,17 +171,17 @@ def SHR64r1 : RI<0xD1, MRM5r, (outs GR64:$dst), (ins GR64:$src1), let SchedRW = [WriteShiftLd, WriteRMW] in { let Uses = [CL] in { def SHR8mCL : I<0xD2, MRM5m, (outs), (ins i8mem :$dst), - "shr{b}\t{%cl, $dst|$dst, CL}", + "shr{b}\t{%cl, $dst|$dst, cl}", [(store (srl (loadi8 addr:$dst), CL), addr:$dst)], IIC_SR>; def SHR16mCL : I<0xD3, MRM5m, (outs), (ins i16mem:$dst), - "shr{w}\t{%cl, $dst|$dst, CL}", + "shr{w}\t{%cl, $dst|$dst, cl}", [(store (srl (loadi16 addr:$dst), CL), addr:$dst)], IIC_SR>, OpSize; def SHR32mCL : I<0xD3, MRM5m, (outs), (ins i32mem:$dst), - "shr{l}\t{%cl, $dst|$dst, CL}", + "shr{l}\t{%cl, $dst|$dst, cl}", [(store (srl (loadi32 addr:$dst), CL), 
addr:$dst)], IIC_SR>; def SHR64mCL : RI<0xD3, MRM5m, (outs), (ins i64mem:$dst), - "shr{q}\t{%cl, $dst|$dst, CL}", + "shr{q}\t{%cl, $dst|$dst, cl}", [(store (srl (loadi64 addr:$dst), CL), addr:$dst)], IIC_SR>; } def SHR8mi : Ii8<0xC0, MRM5m, (outs), (ins i8mem :$dst, i8imm:$src), @@ -224,19 +224,19 @@ def SHR64m1 : RI<0xD1, MRM5m, (outs), (ins i64mem:$dst), let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in { let Uses = [CL] in { def SAR8rCL : I<0xD2, MRM7r, (outs GR8 :$dst), (ins GR8 :$src1), - "sar{b}\t{%cl, $dst|$dst, CL}", + "sar{b}\t{%cl, $dst|$dst, cl}", [(set GR8:$dst, (sra GR8:$src1, CL))], IIC_SR>; def SAR16rCL : I<0xD3, MRM7r, (outs GR16:$dst), (ins GR16:$src1), - "sar{w}\t{%cl, $dst|$dst, CL}", + "sar{w}\t{%cl, $dst|$dst, cl}", [(set GR16:$dst, (sra GR16:$src1, CL))], IIC_SR>, OpSize; def SAR32rCL : I<0xD3, MRM7r, (outs GR32:$dst), (ins GR32:$src1), - "sar{l}\t{%cl, $dst|$dst, CL}", + "sar{l}\t{%cl, $dst|$dst, cl}", [(set GR32:$dst, (sra GR32:$src1, CL))], IIC_SR>; def SAR64rCL : RI<0xD3, MRM7r, (outs GR64:$dst), (ins GR64:$src1), - "sar{q}\t{%cl, $dst|$dst, CL}", + "sar{q}\t{%cl, $dst|$dst, cl}", [(set GR64:$dst, (sra GR64:$src1, CL))], IIC_SR>; } @@ -283,19 +283,19 @@ def SAR64r1 : RI<0xD1, MRM7r, (outs GR64:$dst), (ins GR64:$src1), let SchedRW = [WriteShiftLd, WriteRMW] in { let Uses = [CL] in { def SAR8mCL : I<0xD2, MRM7m, (outs), (ins i8mem :$dst), - "sar{b}\t{%cl, $dst|$dst, CL}", + "sar{b}\t{%cl, $dst|$dst, cl}", [(store (sra (loadi8 addr:$dst), CL), addr:$dst)], IIC_SR>; def SAR16mCL : I<0xD3, MRM7m, (outs), (ins i16mem:$dst), - "sar{w}\t{%cl, $dst|$dst, CL}", + "sar{w}\t{%cl, $dst|$dst, cl}", [(store (sra (loadi16 addr:$dst), CL), addr:$dst)], IIC_SR>, OpSize; def SAR32mCL : I<0xD3, MRM7m, (outs), (ins i32mem:$dst), - "sar{l}\t{%cl, $dst|$dst, CL}", + "sar{l}\t{%cl, $dst|$dst, cl}", [(store (sra (loadi32 addr:$dst), CL), addr:$dst)], IIC_SR>; def SAR64mCL : RI<0xD3, MRM7m, (outs), (ins i64mem:$dst), - "sar{q}\t{%cl, $dst|$dst, CL}", + "sar{q}\t{%cl, $dst|$dst, cl}", [(store (sra (loadi64 addr:$dst), CL), addr:$dst)], IIC_SR>; } @@ -349,7 +349,7 @@ def RCL8ri : Ii8<0xC0, MRM2r, (outs GR8:$dst), (ins GR8:$src1, i8imm:$cnt), "rcl{b}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>; let Uses = [CL] in def RCL8rCL : I<0xD2, MRM2r, (outs GR8:$dst), (ins GR8:$src1), - "rcl{b}\t{%cl, $dst|$dst, CL}", [], IIC_SR>; + "rcl{b}\t{%cl, $dst|$dst, cl}", [], IIC_SR>; def RCL16r1 : I<0xD1, MRM2r, (outs GR16:$dst), (ins GR16:$src1), "rcl{w}\t$dst", [], IIC_SR>, OpSize; @@ -357,7 +357,7 @@ def RCL16ri : Ii8<0xC1, MRM2r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$cnt), "rcl{w}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize; let Uses = [CL] in def RCL16rCL : I<0xD3, MRM2r, (outs GR16:$dst), (ins GR16:$src1), - "rcl{w}\t{%cl, $dst|$dst, CL}", [], IIC_SR>, OpSize; + "rcl{w}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize; def RCL32r1 : I<0xD1, MRM2r, (outs GR32:$dst), (ins GR32:$src1), "rcl{l}\t$dst", [], IIC_SR>; @@ -365,7 +365,7 @@ def RCL32ri : Ii8<0xC1, MRM2r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$cnt), "rcl{l}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>; let Uses = [CL] in def RCL32rCL : I<0xD3, MRM2r, (outs GR32:$dst), (ins GR32:$src1), - "rcl{l}\t{%cl, $dst|$dst, CL}", [], IIC_SR>; + "rcl{l}\t{%cl, $dst|$dst, cl}", [], IIC_SR>; def RCL64r1 : RI<0xD1, MRM2r, (outs GR64:$dst), (ins GR64:$src1), @@ -374,7 +374,7 @@ def RCL64ri : RIi8<0xC1, MRM2r, (outs GR64:$dst), (ins GR64:$src1, i8imm:$cnt), "rcl{q}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>; let Uses = [CL] in def RCL64rCL : RI<0xD3, MRM2r, (outs 
GR64:$dst), (ins GR64:$src1), - "rcl{q}\t{%cl, $dst|$dst, CL}", [], IIC_SR>; + "rcl{q}\t{%cl, $dst|$dst, cl}", [], IIC_SR>; def RCR8r1 : I<0xD0, MRM3r, (outs GR8:$dst), (ins GR8:$src1), @@ -383,7 +383,7 @@ def RCR8ri : Ii8<0xC0, MRM3r, (outs GR8:$dst), (ins GR8:$src1, i8imm:$cnt), "rcr{b}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>; let Uses = [CL] in def RCR8rCL : I<0xD2, MRM3r, (outs GR8:$dst), (ins GR8:$src1), - "rcr{b}\t{%cl, $dst|$dst, CL}", [], IIC_SR>; + "rcr{b}\t{%cl, $dst|$dst, cl}", [], IIC_SR>; def RCR16r1 : I<0xD1, MRM3r, (outs GR16:$dst), (ins GR16:$src1), "rcr{w}\t$dst", [], IIC_SR>, OpSize; @@ -391,7 +391,7 @@ def RCR16ri : Ii8<0xC1, MRM3r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$cnt), "rcr{w}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize; let Uses = [CL] in def RCR16rCL : I<0xD3, MRM3r, (outs GR16:$dst), (ins GR16:$src1), - "rcr{w}\t{%cl, $dst|$dst, CL}", [], IIC_SR>, OpSize; + "rcr{w}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize; def RCR32r1 : I<0xD1, MRM3r, (outs GR32:$dst), (ins GR32:$src1), "rcr{l}\t$dst", [], IIC_SR>; @@ -399,7 +399,7 @@ def RCR32ri : Ii8<0xC1, MRM3r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$cnt), "rcr{l}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>; let Uses = [CL] in def RCR32rCL : I<0xD3, MRM3r, (outs GR32:$dst), (ins GR32:$src1), - "rcr{l}\t{%cl, $dst|$dst, CL}", [], IIC_SR>; + "rcr{l}\t{%cl, $dst|$dst, cl}", [], IIC_SR>; def RCR64r1 : RI<0xD1, MRM3r, (outs GR64:$dst), (ins GR64:$src1), "rcr{q}\t$dst", [], IIC_SR>; @@ -407,7 +407,7 @@ def RCR64ri : RIi8<0xC1, MRM3r, (outs GR64:$dst), (ins GR64:$src1, i8imm:$cnt), "rcr{q}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>; let Uses = [CL] in def RCR64rCL : RI<0xD3, MRM3r, (outs GR64:$dst), (ins GR64:$src1), - "rcr{q}\t{%cl, $dst|$dst, CL}", [], IIC_SR>; + "rcr{q}\t{%cl, $dst|$dst, cl}", [], IIC_SR>; } // Constraints = "$src = $dst" @@ -448,22 +448,22 @@ def RCR64mi : RIi8<0xC1, MRM3m, (outs), (ins i64mem:$dst, i8imm:$cnt), let Uses = [CL] in { def RCL8mCL : I<0xD2, MRM2m, (outs), (ins i8mem:$dst), - "rcl{b}\t{%cl, $dst|$dst, CL}", [], IIC_SR>; + "rcl{b}\t{%cl, $dst|$dst, cl}", [], IIC_SR>; def RCL16mCL : I<0xD3, MRM2m, (outs), (ins i16mem:$dst), - "rcl{w}\t{%cl, $dst|$dst, CL}", [], IIC_SR>, OpSize; + "rcl{w}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize; def RCL32mCL : I<0xD3, MRM2m, (outs), (ins i32mem:$dst), - "rcl{l}\t{%cl, $dst|$dst, CL}", [], IIC_SR>; + "rcl{l}\t{%cl, $dst|$dst, cl}", [], IIC_SR>; def RCL64mCL : RI<0xD3, MRM2m, (outs), (ins i64mem:$dst), - "rcl{q}\t{%cl, $dst|$dst, CL}", [], IIC_SR>; + "rcl{q}\t{%cl, $dst|$dst, cl}", [], IIC_SR>; def RCR8mCL : I<0xD2, MRM3m, (outs), (ins i8mem:$dst), - "rcr{b}\t{%cl, $dst|$dst, CL}", [], IIC_SR>; + "rcr{b}\t{%cl, $dst|$dst, cl}", [], IIC_SR>; def RCR16mCL : I<0xD3, MRM3m, (outs), (ins i16mem:$dst), - "rcr{w}\t{%cl, $dst|$dst, CL}", [], IIC_SR>, OpSize; + "rcr{w}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize; def RCR32mCL : I<0xD3, MRM3m, (outs), (ins i32mem:$dst), - "rcr{l}\t{%cl, $dst|$dst, CL}", [], IIC_SR>; + "rcr{l}\t{%cl, $dst|$dst, cl}", [], IIC_SR>; def RCR64mCL : RI<0xD3, MRM3m, (outs), (ins i64mem:$dst), - "rcr{q}\t{%cl, $dst|$dst, CL}", [], IIC_SR>; + "rcr{q}\t{%cl, $dst|$dst, cl}", [], IIC_SR>; } } // SchedRW } // hasSideEffects = 0 @@ -472,16 +472,16 @@ let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in { // FIXME: provide shorter instructions when imm8 == 1 let Uses = [CL] in { def ROL8rCL : I<0xD2, MRM0r, (outs GR8 :$dst), (ins GR8 :$src1), - "rol{b}\t{%cl, $dst|$dst, CL}", + "rol{b}\t{%cl, $dst|$dst, cl}", [(set GR8:$dst, (rotl GR8:$src1, 
CL))], IIC_SR>; def ROL16rCL : I<0xD3, MRM0r, (outs GR16:$dst), (ins GR16:$src1), - "rol{w}\t{%cl, $dst|$dst, CL}", + "rol{w}\t{%cl, $dst|$dst, cl}", [(set GR16:$dst, (rotl GR16:$src1, CL))], IIC_SR>, OpSize; def ROL32rCL : I<0xD3, MRM0r, (outs GR32:$dst), (ins GR32:$src1), - "rol{l}\t{%cl, $dst|$dst, CL}", + "rol{l}\t{%cl, $dst|$dst, cl}", [(set GR32:$dst, (rotl GR32:$src1, CL))], IIC_SR>; def ROL64rCL : RI<0xD3, MRM0r, (outs GR64:$dst), (ins GR64:$src1), - "rol{q}\t{%cl, $dst|$dst, CL}", + "rol{q}\t{%cl, $dst|$dst, cl}", [(set GR64:$dst, (rotl GR64:$src1, CL))], IIC_SR>; } @@ -525,19 +525,19 @@ def ROL64r1 : RI<0xD1, MRM0r, (outs GR64:$dst), (ins GR64:$src1), let SchedRW = [WriteShiftLd, WriteRMW] in { let Uses = [CL] in { def ROL8mCL : I<0xD2, MRM0m, (outs), (ins i8mem :$dst), - "rol{b}\t{%cl, $dst|$dst, CL}", + "rol{b}\t{%cl, $dst|$dst, cl}", [(store (rotl (loadi8 addr:$dst), CL), addr:$dst)], IIC_SR>; def ROL16mCL : I<0xD3, MRM0m, (outs), (ins i16mem:$dst), - "rol{w}\t{%cl, $dst|$dst, CL}", + "rol{w}\t{%cl, $dst|$dst, cl}", [(store (rotl (loadi16 addr:$dst), CL), addr:$dst)], IIC_SR>, OpSize; def ROL32mCL : I<0xD3, MRM0m, (outs), (ins i32mem:$dst), - "rol{l}\t{%cl, $dst|$dst, CL}", + "rol{l}\t{%cl, $dst|$dst, cl}", [(store (rotl (loadi32 addr:$dst), CL), addr:$dst)], IIC_SR>; def ROL64mCL : RI<0xD3, MRM0m, (outs), (ins i64mem:$dst), - "rol{q}\t{%cl, $dst|$dst, %cl}", + "rol{q}\t{%cl, $dst|$dst, cl}", [(store (rotl (loadi64 addr:$dst), CL), addr:$dst)], IIC_SR>; } @@ -582,16 +582,16 @@ def ROL64m1 : RI<0xD1, MRM0m, (outs), (ins i64mem:$dst), let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in { let Uses = [CL] in { def ROR8rCL : I<0xD2, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1), - "ror{b}\t{%cl, $dst|$dst, CL}", + "ror{b}\t{%cl, $dst|$dst, cl}", [(set GR8:$dst, (rotr GR8:$src1, CL))], IIC_SR>; def ROR16rCL : I<0xD3, MRM1r, (outs GR16:$dst), (ins GR16:$src1), - "ror{w}\t{%cl, $dst|$dst, CL}", + "ror{w}\t{%cl, $dst|$dst, cl}", [(set GR16:$dst, (rotr GR16:$src1, CL))], IIC_SR>, OpSize; def ROR32rCL : I<0xD3, MRM1r, (outs GR32:$dst), (ins GR32:$src1), - "ror{l}\t{%cl, $dst|$dst, CL}", + "ror{l}\t{%cl, $dst|$dst, cl}", [(set GR32:$dst, (rotr GR32:$src1, CL))], IIC_SR>; def ROR64rCL : RI<0xD3, MRM1r, (outs GR64:$dst), (ins GR64:$src1), - "ror{q}\t{%cl, $dst|$dst, CL}", + "ror{q}\t{%cl, $dst|$dst, cl}", [(set GR64:$dst, (rotr GR64:$src1, CL))], IIC_SR>; } @@ -635,19 +635,19 @@ def ROR64r1 : RI<0xD1, MRM1r, (outs GR64:$dst), (ins GR64:$src1), let SchedRW = [WriteShiftLd, WriteRMW] in { let Uses = [CL] in { def ROR8mCL : I<0xD2, MRM1m, (outs), (ins i8mem :$dst), - "ror{b}\t{%cl, $dst|$dst, CL}", + "ror{b}\t{%cl, $dst|$dst, cl}", [(store (rotr (loadi8 addr:$dst), CL), addr:$dst)], IIC_SR>; def ROR16mCL : I<0xD3, MRM1m, (outs), (ins i16mem:$dst), - "ror{w}\t{%cl, $dst|$dst, CL}", + "ror{w}\t{%cl, $dst|$dst, cl}", [(store (rotr (loadi16 addr:$dst), CL), addr:$dst)], IIC_SR>, OpSize; def ROR32mCL : I<0xD3, MRM1m, (outs), (ins i32mem:$dst), - "ror{l}\t{%cl, $dst|$dst, CL}", + "ror{l}\t{%cl, $dst|$dst, cl}", [(store (rotr (loadi32 addr:$dst), CL), addr:$dst)], IIC_SR>; def ROR64mCL : RI<0xD3, MRM1m, (outs), (ins i64mem:$dst), - "ror{q}\t{%cl, $dst|$dst, CL}", + "ror{q}\t{%cl, $dst|$dst, cl}", [(store (rotr (loadi64 addr:$dst), CL), addr:$dst)], IIC_SR>; } @@ -699,35 +699,35 @@ let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in { let Uses = [CL] in { def SHLD16rrCL : I<0xA5, MRMDestReg, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2), - "shld{w}\t{%cl, $src2, $dst|$dst, $src2, CL}", + 
"shld{w}\t{%cl, $src2, $dst|$dst, $src2, cl}", [(set GR16:$dst, (X86shld GR16:$src1, GR16:$src2, CL))], IIC_SHD16_REG_CL>, TB, OpSize; def SHRD16rrCL : I<0xAD, MRMDestReg, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2), - "shrd{w}\t{%cl, $src2, $dst|$dst, $src2, CL}", + "shrd{w}\t{%cl, $src2, $dst|$dst, $src2, cl}", [(set GR16:$dst, (X86shrd GR16:$src1, GR16:$src2, CL))], IIC_SHD16_REG_CL>, TB, OpSize; def SHLD32rrCL : I<0xA5, MRMDestReg, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2), - "shld{l}\t{%cl, $src2, $dst|$dst, $src2, CL}", + "shld{l}\t{%cl, $src2, $dst|$dst, $src2, cl}", [(set GR32:$dst, (X86shld GR32:$src1, GR32:$src2, CL))], IIC_SHD32_REG_CL>, TB; def SHRD32rrCL : I<0xAD, MRMDestReg, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2), - "shrd{l}\t{%cl, $src2, $dst|$dst, $src2, CL}", + "shrd{l}\t{%cl, $src2, $dst|$dst, $src2, cl}", [(set GR32:$dst, (X86shrd GR32:$src1, GR32:$src2, CL))], IIC_SHD32_REG_CL>, TB; def SHLD64rrCL : RI<0xA5, MRMDestReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2), - "shld{q}\t{%cl, $src2, $dst|$dst, $src2, CL}", + "shld{q}\t{%cl, $src2, $dst|$dst, $src2, cl}", [(set GR64:$dst, (X86shld GR64:$src1, GR64:$src2, CL))], IIC_SHD64_REG_CL>, TB; def SHRD64rrCL : RI<0xAD, MRMDestReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2), - "shrd{q}\t{%cl, $src2, $dst|$dst, $src2, CL}", + "shrd{q}\t{%cl, $src2, $dst|$dst, $src2, cl}", [(set GR64:$dst, (X86shrd GR64:$src1, GR64:$src2, CL))], IIC_SHD64_REG_CL>, TB; @@ -782,29 +782,29 @@ def SHRD64rri8 : RIi8<0xAC, MRMDestReg, let SchedRW = [WriteShiftLd, WriteRMW] in { let Uses = [CL] in { def SHLD16mrCL : I<0xA5, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2), - "shld{w}\t{%cl, $src2, $dst|$dst, $src2, CL}", + "shld{w}\t{%cl, $src2, $dst|$dst, $src2, cl}", [(store (X86shld (loadi16 addr:$dst), GR16:$src2, CL), addr:$dst)], IIC_SHD16_MEM_CL>, TB, OpSize; def SHRD16mrCL : I<0xAD, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2), - "shrd{w}\t{%cl, $src2, $dst|$dst, $src2, CL}", + "shrd{w}\t{%cl, $src2, $dst|$dst, $src2, cl}", [(store (X86shrd (loadi16 addr:$dst), GR16:$src2, CL), addr:$dst)], IIC_SHD16_MEM_CL>, TB, OpSize; def SHLD32mrCL : I<0xA5, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2), - "shld{l}\t{%cl, $src2, $dst|$dst, $src2, CL}", + "shld{l}\t{%cl, $src2, $dst|$dst, $src2, cl}", [(store (X86shld (loadi32 addr:$dst), GR32:$src2, CL), addr:$dst)], IIC_SHD32_MEM_CL>, TB; def SHRD32mrCL : I<0xAD, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2), - "shrd{l}\t{%cl, $src2, $dst|$dst, $src2, CL}", + "shrd{l}\t{%cl, $src2, $dst|$dst, $src2, cl}", [(store (X86shrd (loadi32 addr:$dst), GR32:$src2, CL), addr:$dst)], IIC_SHD32_MEM_CL>, TB; def SHLD64mrCL : RI<0xA5, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2), - "shld{q}\t{%cl, $src2, $dst|$dst, $src2, CL}", + "shld{q}\t{%cl, $src2, $dst|$dst, $src2, cl}", [(store (X86shld (loadi64 addr:$dst), GR64:$src2, CL), addr:$dst)], IIC_SHD64_MEM_CL>, TB; def SHRD64mrCL : RI<0xAD, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2), - "shrd{q}\t{%cl, $src2, $dst|$dst, $src2, CL}", + "shrd{q}\t{%cl, $src2, $dst|$dst, $src2, cl}", [(store (X86shrd (loadi64 addr:$dst), GR64:$src2, CL), addr:$dst)], IIC_SHD64_MEM_CL>, TB; } diff --git a/contrib/llvm/lib/Target/X86/X86InstrSystem.td b/contrib/llvm/lib/Target/X86/X86InstrSystem.td index bab3cdd..2196dc3 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrSystem.td +++ b/contrib/llvm/lib/Target/X86/X86InstrSystem.td @@ -77,43 +77,43 @@ def IRET64 : RI<0xcf, RawFrm, (outs), (ins), "iretq", [], IIC_IRET>, let SchedRW = [WriteSystem] 
in { let Defs = [AL], Uses = [DX] in def IN8rr : I<0xEC, RawFrm, (outs), (ins), - "in{b}\t{%dx, %al|AL, DX}", [], IIC_IN_RR>; + "in{b}\t{%dx, %al|al, dx}", [], IIC_IN_RR>; let Defs = [AX], Uses = [DX] in def IN16rr : I<0xED, RawFrm, (outs), (ins), - "in{w}\t{%dx, %ax|AX, DX}", [], IIC_IN_RR>, OpSize; + "in{w}\t{%dx, %ax|ax, dx}", [], IIC_IN_RR>, OpSize; let Defs = [EAX], Uses = [DX] in def IN32rr : I<0xED, RawFrm, (outs), (ins), - "in{l}\t{%dx, %eax|EAX, DX}", [], IIC_IN_RR>; + "in{l}\t{%dx, %eax|eax, dx}", [], IIC_IN_RR>; let Defs = [AL] in def IN8ri : Ii8<0xE4, RawFrm, (outs), (ins i8imm:$port), - "in{b}\t{$port, %al|AL, $port}", [], IIC_IN_RI>; + "in{b}\t{$port, %al|al, $port}", [], IIC_IN_RI>; let Defs = [AX] in def IN16ri : Ii8<0xE5, RawFrm, (outs), (ins i8imm:$port), - "in{w}\t{$port, %ax|AX, $port}", [], IIC_IN_RI>, OpSize; + "in{w}\t{$port, %ax|ax, $port}", [], IIC_IN_RI>, OpSize; let Defs = [EAX] in def IN32ri : Ii8<0xE5, RawFrm, (outs), (ins i8imm:$port), - "in{l}\t{$port, %eax|EAX, $port}", [], IIC_IN_RI>; + "in{l}\t{$port, %eax|eax, $port}", [], IIC_IN_RI>; let Uses = [DX, AL] in def OUT8rr : I<0xEE, RawFrm, (outs), (ins), - "out{b}\t{%al, %dx|DX, AL}", [], IIC_OUT_RR>; + "out{b}\t{%al, %dx|dx, al}", [], IIC_OUT_RR>; let Uses = [DX, AX] in def OUT16rr : I<0xEF, RawFrm, (outs), (ins), - "out{w}\t{%ax, %dx|DX, AX}", [], IIC_OUT_RR>, OpSize; + "out{w}\t{%ax, %dx|dx, ax}", [], IIC_OUT_RR>, OpSize; let Uses = [DX, EAX] in def OUT32rr : I<0xEF, RawFrm, (outs), (ins), - "out{l}\t{%eax, %dx|DX, EAX}", [], IIC_OUT_RR>; + "out{l}\t{%eax, %dx|dx, eax}", [], IIC_OUT_RR>; let Uses = [AL] in def OUT8ir : Ii8<0xE6, RawFrm, (outs), (ins i8imm:$port), - "out{b}\t{%al, $port|$port, AL}", [], IIC_OUT_IR>; + "out{b}\t{%al, $port|$port, al}", [], IIC_OUT_IR>; let Uses = [AX] in def OUT16ir : Ii8<0xE7, RawFrm, (outs), (ins i8imm:$port), - "out{w}\t{%ax, $port|$port, AX}", [], IIC_OUT_IR>, OpSize; + "out{w}\t{%ax, $port|$port, ax}", [], IIC_OUT_IR>, OpSize; let Uses = [EAX] in def OUT32ir : Ii8<0xE7, RawFrm, (outs), (ins i8imm:$port), - "out{l}\t{%eax, $port|$port, EAX}", [], IIC_OUT_IR>; + "out{l}\t{%eax, $port|$port, eax}", [], IIC_OUT_IR>; def IN8 : I<0x6C, RawFrm, (outs), (ins), "ins{b}", [], IIC_INS>; def IN16 : I<0x6D, RawFrm, (outs), (ins), "ins{w}", [], IIC_INS>, OpSize; @@ -248,75 +248,75 @@ def LTRm : I<0x00, MRM3m, (outs), (ins i16mem:$src), "ltr{w}\t$src", [], IIC_LTR>, TB; def PUSHCS16 : I<0x0E, RawFrm, (outs), (ins), - "push{w}\t{%cs|CS}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>, + "push{w}\t{%cs|cs}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>, OpSize; def PUSHCS32 : I<0x0E, RawFrm, (outs), (ins), - "push{l}\t{%cs|CS}", [], IIC_PUSH_CS>, Requires<[In32BitMode]>; + "push{l}\t{%cs|cs}", [], IIC_PUSH_CS>, Requires<[In32BitMode]>; def PUSHSS16 : I<0x16, RawFrm, (outs), (ins), - "push{w}\t{%ss|SS}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>, + "push{w}\t{%ss|ss}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>, OpSize; def PUSHSS32 : I<0x16, RawFrm, (outs), (ins), - "push{l}\t{%ss|SS}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>; + "push{l}\t{%ss|ss}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>; def PUSHDS16 : I<0x1E, RawFrm, (outs), (ins), - "push{w}\t{%ds|DS}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>, + "push{w}\t{%ds|ds}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>, OpSize; def PUSHDS32 : I<0x1E, RawFrm, (outs), (ins), - "push{l}\t{%ds|DS}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>; + "push{l}\t{%ds|ds}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>; def PUSHES16 : I<0x06, RawFrm, (outs), 
(ins), - "push{w}\t{%es|ES}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>, + "push{w}\t{%es|es}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>, OpSize; def PUSHES32 : I<0x06, RawFrm, (outs), (ins), - "push{l}\t{%es|ES}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>; + "push{l}\t{%es|es}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>; def PUSHFS16 : I<0xa0, RawFrm, (outs), (ins), - "push{w}\t{%fs|FS}", [], IIC_PUSH_SR>, OpSize, TB; + "push{w}\t{%fs|fs}", [], IIC_PUSH_SR>, OpSize, TB; def PUSHFS32 : I<0xa0, RawFrm, (outs), (ins), - "push{l}\t{%fs|FS}", [], IIC_PUSH_SR>, TB, Requires<[In32BitMode]>; + "push{l}\t{%fs|fs}", [], IIC_PUSH_SR>, TB, Requires<[In32BitMode]>; def PUSHGS16 : I<0xa8, RawFrm, (outs), (ins), - "push{w}\t{%gs|GS}", [], IIC_PUSH_SR>, OpSize, TB; + "push{w}\t{%gs|gs}", [], IIC_PUSH_SR>, OpSize, TB; def PUSHGS32 : I<0xa8, RawFrm, (outs), (ins), - "push{l}\t{%gs|GS}", [], IIC_PUSH_SR>, TB, Requires<[In32BitMode]>; + "push{l}\t{%gs|gs}", [], IIC_PUSH_SR>, TB, Requires<[In32BitMode]>; def PUSHFS64 : I<0xa0, RawFrm, (outs), (ins), - "push{q}\t{%fs|FS}", [], IIC_PUSH_SR>, TB; + "push{q}\t{%fs|fs}", [], IIC_PUSH_SR>, TB; def PUSHGS64 : I<0xa8, RawFrm, (outs), (ins), - "push{q}\t{%gs|GS}", [], IIC_PUSH_SR>, TB; + "push{q}\t{%gs|gs}", [], IIC_PUSH_SR>, TB; // No "pop cs" instruction. def POPSS16 : I<0x17, RawFrm, (outs), (ins), - "pop{w}\t{%ss|SS}", [], IIC_POP_SR_SS>, + "pop{w}\t{%ss|ss}", [], IIC_POP_SR_SS>, OpSize, Requires<[In32BitMode]>; def POPSS32 : I<0x17, RawFrm, (outs), (ins), - "pop{l}\t{%ss|SS}", [], IIC_POP_SR_SS>, + "pop{l}\t{%ss|ss}", [], IIC_POP_SR_SS>, Requires<[In32BitMode]>; def POPDS16 : I<0x1F, RawFrm, (outs), (ins), - "pop{w}\t{%ds|DS}", [], IIC_POP_SR>, + "pop{w}\t{%ds|ds}", [], IIC_POP_SR>, OpSize, Requires<[In32BitMode]>; def POPDS32 : I<0x1F, RawFrm, (outs), (ins), - "pop{l}\t{%ds|DS}", [], IIC_POP_SR>, + "pop{l}\t{%ds|ds}", [], IIC_POP_SR>, Requires<[In32BitMode]>; def POPES16 : I<0x07, RawFrm, (outs), (ins), - "pop{w}\t{%es|ES}", [], IIC_POP_SR>, + "pop{w}\t{%es|es}", [], IIC_POP_SR>, OpSize, Requires<[In32BitMode]>; def POPES32 : I<0x07, RawFrm, (outs), (ins), - "pop{l}\t{%es|ES}", [], IIC_POP_SR>, + "pop{l}\t{%es|es}", [], IIC_POP_SR>, Requires<[In32BitMode]>; def POPFS16 : I<0xa1, RawFrm, (outs), (ins), - "pop{w}\t{%fs|FS}", [], IIC_POP_SR>, OpSize, TB; + "pop{w}\t{%fs|fs}", [], IIC_POP_SR>, OpSize, TB; def POPFS32 : I<0xa1, RawFrm, (outs), (ins), - "pop{l}\t{%fs|FS}", [], IIC_POP_SR>, TB, Requires<[In32BitMode]>; + "pop{l}\t{%fs|fs}", [], IIC_POP_SR>, TB, Requires<[In32BitMode]>; def POPFS64 : I<0xa1, RawFrm, (outs), (ins), - "pop{q}\t{%fs|FS}", [], IIC_POP_SR>, TB; + "pop{q}\t{%fs|fs}", [], IIC_POP_SR>, TB; def POPGS16 : I<0xa9, RawFrm, (outs), (ins), - "pop{w}\t{%gs|GS}", [], IIC_POP_SR>, OpSize, TB; + "pop{w}\t{%gs|gs}", [], IIC_POP_SR>, OpSize, TB; def POPGS32 : I<0xa9, RawFrm, (outs), (ins), - "pop{l}\t{%gs|GS}", [], IIC_POP_SR>, TB, Requires<[In32BitMode]>; + "pop{l}\t{%gs|gs}", [], IIC_POP_SR>, TB, Requires<[In32BitMode]>; def POPGS64 : I<0xa9, RawFrm, (outs), (ins), - "pop{q}\t{%gs|GS}", [], IIC_POP_SR>, TB; + "pop{q}\t{%gs|gs}", [], IIC_POP_SR>, TB; def LDS16rm : I<0xc5, MRMSrcMem, (outs GR16:$dst), (ins opaque32mem:$src), diff --git a/contrib/llvm/lib/Target/X86/X86InstrTSX.td b/contrib/llvm/lib/Target/X86/X86InstrTSX.td index 363a190..59a6f1e 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrTSX.td +++ b/contrib/llvm/lib/Target/X86/X86InstrTSX.td @@ -37,3 +37,10 @@ def XTEST : I<0x01, MRM_D6, (outs), (ins), def XABORT : Ii8<0xc6, MRM_F8, (outs), (ins 
i8imm:$imm), "xabort\t$imm", [(int_x86_xabort imm:$imm)]>, Requires<[HasRTM]>; + +// HLE prefixes + +def XACQUIRE_PREFIX : I<0xF2, RawFrm, (outs), (ins), "xacquire", []>, Requires<[HasHLE]>; + +def XRELEASE_PREFIX : I<0xF3, RawFrm, (outs), (ins), "xrelease", []>, Requires<[HasHLE]>; + diff --git a/contrib/llvm/lib/Target/X86/X86InstrXOP.td b/contrib/llvm/lib/Target/X86/X86InstrXOP.td index 2aa08fa..2b6ee5c 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrXOP.td +++ b/contrib/llvm/lib/Target/X86/X86InstrXOP.td @@ -20,23 +20,21 @@ multiclass xop2op<bits<8> opc, string OpcodeStr, Intrinsic Int, PatFrag memop> { [(set VR128:$dst, (Int (bitconvert (memop addr:$src))))]>, VEX; } -let isAsmParserOnly = 1 in { - defm VPHSUBWD : xop2op<0xE2, "vphsubwd", int_x86_xop_vphsubwd, memopv2i64>; - defm VPHSUBDQ : xop2op<0xE3, "vphsubdq", int_x86_xop_vphsubdq, memopv2i64>; - defm VPHSUBBW : xop2op<0xE1, "vphsubbw", int_x86_xop_vphsubbw, memopv2i64>; - defm VPHADDWQ : xop2op<0xC7, "vphaddwq", int_x86_xop_vphaddwq, memopv2i64>; - defm VPHADDWD : xop2op<0xC6, "vphaddwd", int_x86_xop_vphaddwd, memopv2i64>; - defm VPHADDUWQ : xop2op<0xD7, "vphadduwq", int_x86_xop_vphadduwq, memopv2i64>; - defm VPHADDUWD : xop2op<0xD6, "vphadduwd", int_x86_xop_vphadduwd, memopv2i64>; - defm VPHADDUDQ : xop2op<0xDB, "vphaddudq", int_x86_xop_vphaddudq, memopv2i64>; - defm VPHADDUBW : xop2op<0xD1, "vphaddubw", int_x86_xop_vphaddubw, memopv2i64>; - defm VPHADDUBQ : xop2op<0xD3, "vphaddubq", int_x86_xop_vphaddubq, memopv2i64>; - defm VPHADDUBD : xop2op<0xD2, "vphaddubd", int_x86_xop_vphaddubd, memopv2i64>; - defm VPHADDDQ : xop2op<0xCB, "vphadddq", int_x86_xop_vphadddq, memopv2i64>; - defm VPHADDBW : xop2op<0xC1, "vphaddbw", int_x86_xop_vphaddbw, memopv2i64>; - defm VPHADDBQ : xop2op<0xC3, "vphaddbq", int_x86_xop_vphaddbq, memopv2i64>; - defm VPHADDBD : xop2op<0xC2, "vphaddbd", int_x86_xop_vphaddbd, memopv2i64>; -} +defm VPHSUBWD : xop2op<0xE2, "vphsubwd", int_x86_xop_vphsubwd, memopv2i64>; +defm VPHSUBDQ : xop2op<0xE3, "vphsubdq", int_x86_xop_vphsubdq, memopv2i64>; +defm VPHSUBBW : xop2op<0xE1, "vphsubbw", int_x86_xop_vphsubbw, memopv2i64>; +defm VPHADDWQ : xop2op<0xC7, "vphaddwq", int_x86_xop_vphaddwq, memopv2i64>; +defm VPHADDWD : xop2op<0xC6, "vphaddwd", int_x86_xop_vphaddwd, memopv2i64>; +defm VPHADDUWQ : xop2op<0xD7, "vphadduwq", int_x86_xop_vphadduwq, memopv2i64>; +defm VPHADDUWD : xop2op<0xD6, "vphadduwd", int_x86_xop_vphadduwd, memopv2i64>; +defm VPHADDUDQ : xop2op<0xDB, "vphaddudq", int_x86_xop_vphaddudq, memopv2i64>; +defm VPHADDUBW : xop2op<0xD1, "vphaddubw", int_x86_xop_vphaddubw, memopv2i64>; +defm VPHADDUBQ : xop2op<0xD3, "vphaddubq", int_x86_xop_vphaddubq, memopv2i64>; +defm VPHADDUBD : xop2op<0xD2, "vphaddubd", int_x86_xop_vphaddubd, memopv2i64>; +defm VPHADDDQ : xop2op<0xCB, "vphadddq", int_x86_xop_vphadddq, memopv2i64>; +defm VPHADDBW : xop2op<0xC1, "vphaddbw", int_x86_xop_vphaddbw, memopv2i64>; +defm VPHADDBQ : xop2op<0xC3, "vphaddbq", int_x86_xop_vphaddbq, memopv2i64>; +defm VPHADDBD : xop2op<0xC2, "vphaddbd", int_x86_xop_vphaddbd, memopv2i64>; // Scalar load 2 addr operand instructions multiclass xop2opsld<bits<8> opc, string OpcodeStr, Intrinsic Int, @@ -49,12 +47,10 @@ multiclass xop2opsld<bits<8> opc, string OpcodeStr, Intrinsic Int, [(set VR128:$dst, (Int (bitconvert mem_cpat:$src)))]>, VEX; } -let isAsmParserOnly = 1 in { - defm VFRCZSS : xop2opsld<0x82, "vfrczss", int_x86_xop_vfrcz_ss, - ssmem, sse_load_f32>; - defm VFRCZSD : xop2opsld<0x83, "vfrczsd", int_x86_xop_vfrcz_sd, - sdmem, sse_load_f64>; -} +defm 
VFRCZSS : xop2opsld<0x82, "vfrczss", int_x86_xop_vfrcz_ss, + ssmem, sse_load_f32>; +defm VFRCZSD : xop2opsld<0x83, "vfrczsd", int_x86_xop_vfrcz_sd, + sdmem, sse_load_f64>; multiclass xop2op128<bits<8> opc, string OpcodeStr, Intrinsic Int, PatFrag memop> { @@ -66,10 +62,8 @@ multiclass xop2op128<bits<8> opc, string OpcodeStr, Intrinsic Int, [(set VR128:$dst, (Int (bitconvert (memop addr:$src))))]>, VEX; } -let isAsmParserOnly = 1 in { - defm VFRCZPS : xop2op128<0x80, "vfrczps", int_x86_xop_vfrcz_ps, memopv4f32>; - defm VFRCZPD : xop2op128<0x81, "vfrczpd", int_x86_xop_vfrcz_pd, memopv2f64>; -} +defm VFRCZPS : xop2op128<0x80, "vfrczps", int_x86_xop_vfrcz_ps, memopv4f32>; +defm VFRCZPD : xop2op128<0x81, "vfrczpd", int_x86_xop_vfrcz_pd, memopv2f64>; multiclass xop2op256<bits<8> opc, string OpcodeStr, Intrinsic Int, PatFrag memop> { @@ -81,12 +75,8 @@ multiclass xop2op256<bits<8> opc, string OpcodeStr, Intrinsic Int, [(set VR256:$dst, (Int (bitconvert (memop addr:$src))))]>, VEX, VEX_L; } -let isAsmParserOnly = 1 in { - defm VFRCZPS : xop2op256<0x80, "vfrczps", int_x86_xop_vfrcz_ps_256, - memopv8f32>; - defm VFRCZPD : xop2op256<0x81, "vfrczpd", int_x86_xop_vfrcz_pd_256, - memopv4f64>; -} +defm VFRCZPS : xop2op256<0x80, "vfrczps", int_x86_xop_vfrcz_ps_256, memopv8f32>; +defm VFRCZPD : xop2op256<0x81, "vfrczpd", int_x86_xop_vfrcz_pd_256, memopv4f64>; multiclass xop3op<bits<8> opc, string OpcodeStr, Intrinsic Int> { def rr : IXOP<opc, MRMSrcReg, (outs VR128:$dst), @@ -107,20 +97,18 @@ multiclass xop3op<bits<8> opc, string OpcodeStr, Intrinsic Int> { VEX_4VOp3; } -let isAsmParserOnly = 1 in { - defm VPSHLW : xop3op<0x95, "vpshlw", int_x86_xop_vpshlw>; - defm VPSHLQ : xop3op<0x97, "vpshlq", int_x86_xop_vpshlq>; - defm VPSHLD : xop3op<0x96, "vpshld", int_x86_xop_vpshld>; - defm VPSHLB : xop3op<0x94, "vpshlb", int_x86_xop_vpshlb>; - defm VPSHAW : xop3op<0x99, "vpshaw", int_x86_xop_vpshaw>; - defm VPSHAQ : xop3op<0x9B, "vpshaq", int_x86_xop_vpshaq>; - defm VPSHAD : xop3op<0x9A, "vpshad", int_x86_xop_vpshad>; - defm VPSHAB : xop3op<0x98, "vpshab", int_x86_xop_vpshab>; - defm VPROTW : xop3op<0x91, "vprotw", int_x86_xop_vprotw>; - defm VPROTQ : xop3op<0x93, "vprotq", int_x86_xop_vprotq>; - defm VPROTD : xop3op<0x92, "vprotd", int_x86_xop_vprotd>; - defm VPROTB : xop3op<0x90, "vprotb", int_x86_xop_vprotb>; -} +defm VPSHLW : xop3op<0x95, "vpshlw", int_x86_xop_vpshlw>; +defm VPSHLQ : xop3op<0x97, "vpshlq", int_x86_xop_vpshlq>; +defm VPSHLD : xop3op<0x96, "vpshld", int_x86_xop_vpshld>; +defm VPSHLB : xop3op<0x94, "vpshlb", int_x86_xop_vpshlb>; +defm VPSHAW : xop3op<0x99, "vpshaw", int_x86_xop_vpshaw>; +defm VPSHAQ : xop3op<0x9B, "vpshaq", int_x86_xop_vpshaq>; +defm VPSHAD : xop3op<0x9A, "vpshad", int_x86_xop_vpshad>; +defm VPSHAB : xop3op<0x98, "vpshab", int_x86_xop_vpshab>; +defm VPROTW : xop3op<0x91, "vprotw", int_x86_xop_vprotw>; +defm VPROTQ : xop3op<0x93, "vprotq", int_x86_xop_vprotq>; +defm VPROTD : xop3op<0x92, "vprotd", int_x86_xop_vprotd>; +defm VPROTB : xop3op<0x90, "vprotb", int_x86_xop_vprotb>; multiclass xop3opimm<bits<8> opc, string OpcodeStr, Intrinsic Int> { def ri : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst), @@ -134,12 +122,10 @@ multiclass xop3opimm<bits<8> opc, string OpcodeStr, Intrinsic Int> { (Int (bitconvert (memopv2i64 addr:$src1)), imm:$src2))]>, VEX; } -let isAsmParserOnly = 1 in { - defm VPROTW : xop3opimm<0xC1, "vprotw", int_x86_xop_vprotwi>; - defm VPROTQ : xop3opimm<0xC3, "vprotq", int_x86_xop_vprotqi>; - defm VPROTD : xop3opimm<0xC2, "vprotd", int_x86_xop_vprotdi>; - defm VPROTB : 
xop3opimm<0xC0, "vprotb", int_x86_xop_vprotbi>; -} +defm VPROTW : xop3opimm<0xC1, "vprotw", int_x86_xop_vprotwi>; +defm VPROTQ : xop3opimm<0xC3, "vprotq", int_x86_xop_vprotqi>; +defm VPROTD : xop3opimm<0xC2, "vprotd", int_x86_xop_vprotdi>; +defm VPROTB : xop3opimm<0xC0, "vprotb", int_x86_xop_vprotbi>; // Instruction where second source can be memory, but third must be register multiclass xop4opm2<bits<8> opc, string OpcodeStr, Intrinsic Int> { @@ -158,20 +144,18 @@ multiclass xop4opm2<bits<8> opc, string OpcodeStr, Intrinsic Int> { VR128:$src3))]>, VEX_4V, VEX_I8IMM; } -let isAsmParserOnly = 1 in { - defm VPMADCSWD : xop4opm2<0xB6, "vpmadcswd", int_x86_xop_vpmadcswd>; - defm VPMADCSSWD : xop4opm2<0xA6, "vpmadcsswd", int_x86_xop_vpmadcsswd>; - defm VPMACSWW : xop4opm2<0x95, "vpmacsww", int_x86_xop_vpmacsww>; - defm VPMACSWD : xop4opm2<0x96, "vpmacswd", int_x86_xop_vpmacswd>; - defm VPMACSSWW : xop4opm2<0x85, "vpmacssww", int_x86_xop_vpmacssww>; - defm VPMACSSWD : xop4opm2<0x86, "vpmacsswd", int_x86_xop_vpmacsswd>; - defm VPMACSSDQL : xop4opm2<0x87, "vpmacssdql", int_x86_xop_vpmacssdql>; - defm VPMACSSDQH : xop4opm2<0x8F, "vpmacssdqh", int_x86_xop_vpmacssdqh>; - defm VPMACSSDD : xop4opm2<0x8E, "vpmacssdd", int_x86_xop_vpmacssdd>; - defm VPMACSDQL : xop4opm2<0x97, "vpmacsdql", int_x86_xop_vpmacsdql>; - defm VPMACSDQH : xop4opm2<0x9F, "vpmacsdqh", int_x86_xop_vpmacsdqh>; - defm VPMACSDD : xop4opm2<0x9E, "vpmacsdd", int_x86_xop_vpmacsdd>; -} +defm VPMADCSWD : xop4opm2<0xB6, "vpmadcswd", int_x86_xop_vpmadcswd>; +defm VPMADCSSWD : xop4opm2<0xA6, "vpmadcsswd", int_x86_xop_vpmadcsswd>; +defm VPMACSWW : xop4opm2<0x95, "vpmacsww", int_x86_xop_vpmacsww>; +defm VPMACSWD : xop4opm2<0x96, "vpmacswd", int_x86_xop_vpmacswd>; +defm VPMACSSWW : xop4opm2<0x85, "vpmacssww", int_x86_xop_vpmacssww>; +defm VPMACSSWD : xop4opm2<0x86, "vpmacsswd", int_x86_xop_vpmacsswd>; +defm VPMACSSDQL : xop4opm2<0x87, "vpmacssdql", int_x86_xop_vpmacssdql>; +defm VPMACSSDQH : xop4opm2<0x8F, "vpmacssdqh", int_x86_xop_vpmacssdqh>; +defm VPMACSSDD : xop4opm2<0x8E, "vpmacssdd", int_x86_xop_vpmacssdd>; +defm VPMACSDQL : xop4opm2<0x97, "vpmacsdql", int_x86_xop_vpmacsdql>; +defm VPMACSDQH : xop4opm2<0x9F, "vpmacsdqh", int_x86_xop_vpmacsdqh>; +defm VPMACSDD : xop4opm2<0x9E, "vpmacsdd", int_x86_xop_vpmacsdd>; // Instruction where second source can be memory, third must be imm8 multiclass xop4opimm<bits<8> opc, string OpcodeStr, Intrinsic Int> { @@ -190,16 +174,14 @@ multiclass xop4opimm<bits<8> opc, string OpcodeStr, Intrinsic Int> { imm:$src3))]>, VEX_4V; } -let isAsmParserOnly = 1 in { - defm VPCOMB : xop4opimm<0xCC, "vpcomb", int_x86_xop_vpcomb>; - defm VPCOMW : xop4opimm<0xCD, "vpcomw", int_x86_xop_vpcomw>; - defm VPCOMD : xop4opimm<0xCE, "vpcomd", int_x86_xop_vpcomd>; - defm VPCOMQ : xop4opimm<0xCF, "vpcomq", int_x86_xop_vpcomq>; - defm VPCOMUB : xop4opimm<0xEC, "vpcomub", int_x86_xop_vpcomub>; - defm VPCOMUW : xop4opimm<0xED, "vpcomuw", int_x86_xop_vpcomuw>; - defm VPCOMUD : xop4opimm<0xEE, "vpcomud", int_x86_xop_vpcomud>; - defm VPCOMUQ : xop4opimm<0xEF, "vpcomuq", int_x86_xop_vpcomuq>; -} +defm VPCOMB : xop4opimm<0xCC, "vpcomb", int_x86_xop_vpcomb>; +defm VPCOMW : xop4opimm<0xCD, "vpcomw", int_x86_xop_vpcomw>; +defm VPCOMD : xop4opimm<0xCE, "vpcomd", int_x86_xop_vpcomd>; +defm VPCOMQ : xop4opimm<0xCF, "vpcomq", int_x86_xop_vpcomq>; +defm VPCOMUB : xop4opimm<0xEC, "vpcomub", int_x86_xop_vpcomub>; +defm VPCOMUW : xop4opimm<0xED, "vpcomuw", int_x86_xop_vpcomuw>; +defm VPCOMUD : xop4opimm<0xEE, "vpcomud", int_x86_xop_vpcomud>; +defm 
VPCOMUQ : xop4opimm<0xEF, "vpcomuq", int_x86_xop_vpcomuq>; // Instruction where either second or third source can be memory multiclass xop4op<bits<8> opc, string OpcodeStr, Intrinsic Int> { @@ -227,10 +209,8 @@ multiclass xop4op<bits<8> opc, string OpcodeStr, Intrinsic Int> { VEX_4V, VEX_I8IMM; } -let isAsmParserOnly = 1 in { - defm VPPERM : xop4op<0xA3, "vpperm", int_x86_xop_vpperm>; - defm VPCMOV : xop4op<0xA2, "vpcmov", int_x86_xop_vpcmov>; -} +defm VPPERM : xop4op<0xA3, "vpperm", int_x86_xop_vpperm>; +defm VPCMOV : xop4op<0xA2, "vpcmov", int_x86_xop_vpcmov>; multiclass xop4op256<bits<8> opc, string OpcodeStr, Intrinsic Int> { def rrY : IXOPi8<opc, MRMSrcReg, (outs VR256:$dst), @@ -257,9 +237,7 @@ multiclass xop4op256<bits<8> opc, string OpcodeStr, Intrinsic Int> { VEX_4V, VEX_I8IMM, VEX_L; } -let isAsmParserOnly = 1 in { - defm VPCMOV : xop4op256<0xA2, "vpcmov", int_x86_xop_vpcmov_256>; -} +defm VPCMOV : xop4op256<0xA2, "vpcmov", int_x86_xop_vpcmov_256>; multiclass xop5op<bits<8> opc, string OpcodeStr, Intrinsic Int128, Intrinsic Int256, PatFrag ld_128, PatFrag ld_256> { diff --git a/contrib/llvm/lib/Target/X86/X86JITInfo.cpp b/contrib/llvm/lib/Target/X86/X86JITInfo.cpp index 44d8cce..e99f2d9 100644 --- a/contrib/llvm/lib/Target/X86/X86JITInfo.cpp +++ b/contrib/llvm/lib/Target/X86/X86JITInfo.cpp @@ -127,7 +127,7 @@ extern "C" { "movaps %xmm6, 96(%rsp)\n" "movaps %xmm7, 112(%rsp)\n" // JIT callee -#ifdef _WIN64 +#if defined(_WIN64) || defined(__CYGWIN__) "subq $32, %rsp\n" "movq %rbp, %rcx\n" // Pass prev frame and return address "movq 8(%rbp), %rdx\n" @@ -339,6 +339,7 @@ extern "C" { /// must locate the start of the stub or call site and pass it into the JIT /// compiler function. extern "C" { +LLVM_ATTRIBUTE_USED // Referenced from inline asm. LLVM_LIBRARY_VISIBILITY void LLVMX86CompilationCallback2(intptr_t *StackPtr, intptr_t RetAddr) { intptr_t *RetAddrLoc = &StackPtr[1]; diff --git a/contrib/llvm/lib/Target/X86/X86MCInstLower.cpp b/contrib/llvm/lib/Target/X86/X86MCInstLower.cpp index a8a9fd8..6649c82 100644 --- a/contrib/llvm/lib/Target/X86/X86MCInstLower.cpp +++ b/contrib/llvm/lib/Target/X86/X86MCInstLower.cpp @@ -17,6 +17,7 @@ #include "X86COFFMachineModuleInfo.h" #include "llvm/ADT/SmallString.h" #include "llvm/CodeGen/MachineModuleInfoImpls.h" +#include "llvm/CodeGen/StackMaps.h" #include "llvm/IR/Type.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" @@ -34,14 +35,12 @@ namespace { /// X86MCInstLower - This class is used to lower an MachineInstr into an MCInst. 
class X86MCInstLower { MCContext &Ctx; - Mangler *Mang; const MachineFunction &MF; const TargetMachine &TM; const MCAsmInfo &MAI; X86AsmPrinter &AsmPrinter; public: - X86MCInstLower(Mangler *mang, const MachineFunction &MF, - X86AsmPrinter &asmprinter); + X86MCInstLower(const MachineFunction &MF, X86AsmPrinter &asmprinter); void Lower(const MachineInstr *MI, MCInst &OutMI) const; @@ -50,13 +49,16 @@ public: private: MachineModuleInfoMachO &getMachOMMI() const; + Mangler *getMang() const { + return AsmPrinter.Mang; + } }; } // end anonymous namespace -X86MCInstLower::X86MCInstLower(Mangler *mang, const MachineFunction &mf, +X86MCInstLower::X86MCInstLower(const MachineFunction &mf, X86AsmPrinter &asmprinter) -: Ctx(mf.getContext()), Mang(mang), MF(mf), TM(mf.getTarget()), +: Ctx(mf.getContext()), MF(mf), TM(mf.getTarget()), MAI(*TM.getMCAsmInfo()), AsmPrinter(asmprinter) {} MachineModuleInfoMachO &X86MCInstLower::getMachOMMI() const { @@ -81,7 +83,7 @@ GetSymbolFromOperand(const MachineOperand &MO) const { MO.getTargetFlags() == X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE) isImplicitlyPrivate = true; - Mang->getNameWithPrefix(Name, GV, isImplicitlyPrivate); + getMang()->getNameWithPrefix(Name, GV, isImplicitlyPrivate); } else if (MO.isSymbol()) { Name += MAI.getGlobalPrefix(); Name += MO.getSymbolName(); @@ -110,7 +112,7 @@ GetSymbolFromOperand(const MachineOperand &MO) const { assert(MO.isGlobal() && "Extern symbol not handled yet"); StubSym = MachineModuleInfoImpl:: - StubValueTy(Mang->getSymbol(MO.getGlobal()), + StubValueTy(AsmPrinter.getSymbol(MO.getGlobal()), !MO.getGlobal()->hasInternalLinkage()); } return Sym; @@ -124,7 +126,7 @@ GetSymbolFromOperand(const MachineOperand &MO) const { assert(MO.isGlobal() && "Extern symbol not handled yet"); StubSym = MachineModuleInfoImpl:: - StubValueTy(Mang->getSymbol(MO.getGlobal()), + StubValueTy(AsmPrinter.getSymbol(MO.getGlobal()), !MO.getGlobal()->hasInternalLinkage()); } return Sym; @@ -140,7 +142,7 @@ GetSymbolFromOperand(const MachineOperand &MO) const { if (MO.isGlobal()) { StubSym = MachineModuleInfoImpl:: - StubValueTy(Mang->getSymbol(MO.getGlobal()), + StubValueTy(AsmPrinter.getSymbol(MO.getGlobal()), !MO.getGlobal()->hasInternalLinkage()); } else { Name.erase(Name.end()-5, Name.end()); @@ -225,32 +227,6 @@ MCOperand X86MCInstLower::LowerSymbolOperand(const MachineOperand &MO, } - -static void lower_subreg32(MCInst *MI, unsigned OpNo) { - // Convert registers in the addr mode according to subreg32. - unsigned Reg = MI->getOperand(OpNo).getReg(); - if (Reg != 0) - MI->getOperand(OpNo).setReg(getX86SubSuperRegister(Reg, MVT::i32)); -} - -static void lower_lea64_32mem(MCInst *MI, unsigned OpNo) { - // Convert registers in the addr mode according to subreg64. - for (unsigned i = 0; i != 4; ++i) { - if (!MI->getOperand(OpNo+i).isReg()) continue; - - unsigned Reg = MI->getOperand(OpNo+i).getReg(); - // LEAs can use RIP-relative addressing, and RIP has no sub/super register. - if (Reg == 0 || Reg == X86::RIP) continue; - - MI->getOperand(OpNo+i).setReg(getX86SubSuperRegister(Reg, MVT::i64)); - } -} - -/// LowerSubReg32_Op0 - Things like MOVZX16rr8 -> MOVZX32rr8. 
-static void LowerSubReg32_Op0(MCInst &OutMI, unsigned NewOpc) { - OutMI.setOpcode(NewOpc); - lower_subreg32(&OutMI, 0); -} /// LowerUnaryToTwoAddr - R = setb -> R = sbb R, R static void LowerUnaryToTwoAddr(MCInst &OutMI, unsigned NewOpc) { OutMI.setOpcode(NewOpc); @@ -280,6 +256,34 @@ static void SimplifyShortImmForm(MCInst &Inst, unsigned Opcode) { Inst.addOperand(Saved); } +/// \brief If a movsx instruction has a shorter encoding for the used register +/// simplify the instruction to use it instead. +static void SimplifyMOVSX(MCInst &Inst) { + unsigned NewOpcode = 0; + unsigned Op0 = Inst.getOperand(0).getReg(), Op1 = Inst.getOperand(1).getReg(); + switch (Inst.getOpcode()) { + default: + llvm_unreachable("Unexpected instruction!"); + case X86::MOVSX16rr8: // movsbw %al, %ax --> cbtw + if (Op0 == X86::AX && Op1 == X86::AL) + NewOpcode = X86::CBW; + break; + case X86::MOVSX32rr16: // movswl %ax, %eax --> cwtl + if (Op0 == X86::EAX && Op1 == X86::AX) + NewOpcode = X86::CWDE; + break; + case X86::MOVSX64rr32: // movslq %eax, %rax --> cltq + if (Op0 == X86::RAX && Op1 == X86::EAX) + NewOpcode = X86::CDQE; + break; + } + + if (NewOpcode != 0) { + Inst = MCInst(); + Inst.setOpcode(NewOpcode); + } +} + /// \brief Simplify things like MOV32rm to MOV32o32a. static void SimplifyShortMoveForm(X86AsmPrinter &Printer, MCInst &Inst, unsigned Opcode) { @@ -376,9 +380,7 @@ void X86MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const { // Handle a few special cases to eliminate operand modifiers. ReSimplify: switch (OutMI.getOpcode()) { - case X86::LEA64_32r: // Handle 'subreg rewriting' for the lea64_32mem operand. - lower_lea64_32mem(&OutMI, 1); - // FALL THROUGH. + case X86::LEA64_32r: case X86::LEA64r: case X86::LEA16r: case X86::LEA32r: @@ -388,23 +390,10 @@ ReSimplify: assert(OutMI.getOperand(1+X86::AddrSegmentReg).getReg() == 0 && "LEA has segment specified!"); break; - case X86::MOVZX64rr32: LowerSubReg32_Op0(OutMI, X86::MOV32rr); break; - case X86::MOVZX64rm32: LowerSubReg32_Op0(OutMI, X86::MOV32rm); break; - case X86::MOV64ri64i32: LowerSubReg32_Op0(OutMI, X86::MOV32ri); break; - case X86::MOVZX64rr8: LowerSubReg32_Op0(OutMI, X86::MOVZX32rr8); break; - case X86::MOVZX64rm8: LowerSubReg32_Op0(OutMI, X86::MOVZX32rm8); break; - case X86::MOVZX64rr16: LowerSubReg32_Op0(OutMI, X86::MOVZX32rr16); break; - case X86::MOVZX64rm16: LowerSubReg32_Op0(OutMI, X86::MOVZX32rm16); break; - case X86::MOV8r0: LowerUnaryToTwoAddr(OutMI, X86::XOR8rr); break; case X86::MOV32r0: LowerUnaryToTwoAddr(OutMI, X86::XOR32rr); break; - case X86::MOV16r0: - LowerSubReg32_Op0(OutMI, X86::MOV32r0); // MOV16r0 -> MOV32r0 - LowerUnaryToTwoAddr(OutMI, X86::XOR32rr); // MOV32r0 -> XOR32rr - break; - case X86::MOV64r0: - LowerSubReg32_Op0(OutMI, X86::MOV32r0); // MOV64r0 -> MOV32r0 - LowerUnaryToTwoAddr(OutMI, X86::XOR32rr); // MOV32r0 -> XOR32rr + case X86::MOV32ri64: + OutMI.setOpcode(X86::MOV32ri); break; // Commute operands to get a smaller encoding by using VEX.R instead of VEX.B @@ -598,16 +587,11 @@ ReSimplify: case X86::XOR32ri: SimplifyShortImmForm(OutMI, X86::XOR32i32); break; case X86::XOR64ri32: SimplifyShortImmForm(OutMI, X86::XOR64i32); break; - case X86::MORESTACK_RET: - OutMI.setOpcode(X86::RET); - break; - - case X86::MORESTACK_RET_RESTORE_R10: - OutMI.setOpcode(X86::MOV64rr); - OutMI.addOperand(MCOperand::CreateReg(X86::R10)); - OutMI.addOperand(MCOperand::CreateReg(X86::RAX)); - - AsmPrinter.OutStreamer.EmitInstruction(MCInstBuilder(X86::RET)); + // Try to shrink some forms of movsx. 
+ case X86::MOVSX16rr8: + case X86::MOVSX32rr16: + case X86::MOVSX64rr32: + SimplifyMOVSX(OutMI); break; } } @@ -691,17 +675,143 @@ static void LowerTlsAddr(MCStreamer &OutStreamer, .addExpr(tlsRef)); } +static std::pair<StackMaps::Location, MachineInstr::const_mop_iterator> +parseMemoryOperand(StackMaps::Location::LocationType LocTy, unsigned Size, + MachineInstr::const_mop_iterator MOI, + MachineInstr::const_mop_iterator MOE) { + + typedef StackMaps::Location Location; + + assert(std::distance(MOI, MOE) >= 5 && "Too few operands to encode mem op."); + + const MachineOperand &Base = *MOI; + const MachineOperand &Scale = *(++MOI); + const MachineOperand &Index = *(++MOI); + const MachineOperand &Disp = *(++MOI); + const MachineOperand &ZeroReg = *(++MOI); + + // Sanity check for supported operand format. + assert(Base.isReg() && + Scale.isImm() && Scale.getImm() == 1 && + Index.isReg() && Index.getReg() == 0 && + Disp.isImm() && ZeroReg.isReg() && (ZeroReg.getReg() == 0) && + "Unsupported x86 memory operand sequence."); + (void)Scale; + (void)Index; + (void)ZeroReg; + + return std::make_pair( + Location(LocTy, Size, Base.getReg(), Disp.getImm()), ++MOI); +} + +std::pair<StackMaps::Location, MachineInstr::const_mop_iterator> +X86AsmPrinter::stackmapOperandParser(MachineInstr::const_mop_iterator MOI, + MachineInstr::const_mop_iterator MOE, + const TargetMachine &TM) { + + typedef StackMaps::Location Location; + + const MachineOperand &MOP = *MOI; + assert(!MOP.isRegMask() && (!MOP.isReg() || !MOP.isImplicit()) && + "Register mask and implicit operands should not be processed."); + + if (MOP.isImm()) { + // Verify anyregcc + // [<def>], <id>, <numBytes>, <target>, <numArgs>, <cc>, ... + + switch (MOP.getImm()) { + default: llvm_unreachable("Unrecognized operand type."); + case StackMaps::DirectMemRefOp: { + unsigned Size = TM.getDataLayout()->getPointerSizeInBits(); + assert((Size % 8) == 0 && "Need pointer size in bytes."); + Size /= 8; + return parseMemoryOperand(StackMaps::Location::Direct, Size, + llvm::next(MOI), MOE); + } + case StackMaps::IndirectMemRefOp: { + ++MOI; + int64_t Size = MOI->getImm(); + assert(Size > 0 && "Need a valid size for indirect memory locations."); + return parseMemoryOperand(StackMaps::Location::Indirect, Size, + llvm::next(MOI), MOE); + } + case StackMaps::ConstantOp: { + ++MOI; + assert(MOI->isImm() && "Expected constant operand."); + int64_t Imm = MOI->getImm(); + return std::make_pair( + Location(Location::Constant, sizeof(int64_t), 0, Imm), ++MOI); + } + } + } + + // Otherwise this is a reg operand. The physical register number will + // ultimately be encoded as a DWARF regno. The stack map also records the size + // of a spill slot that can hold the register content. (The runtime can + // track the actual size of the data type if it needs to.) + assert(MOP.isReg() && "Expected register operand here."); + assert(TargetRegisterInfo::isPhysicalRegister(MOP.getReg()) && + "Virtreg operands should have been rewritten before now."); + const TargetRegisterClass *RC = + TM.getRegisterInfo()->getMinimalPhysRegClass(MOP.getReg()); + assert(!MOP.getSubReg() && "Physical subreg still around."); + return std::make_pair( + Location(Location::Register, RC->getSize(), MOP.getReg(), 0), ++MOI); +} + +// Lower a stackmap of the form: +// <id>, <shadowBytes>, ... +static void LowerSTACKMAP(MCStreamer &OutStreamer, + StackMaps &SM, + const MachineInstr &MI) +{ + unsigned NumNOPBytes = MI.getOperand(1).getImm(); + SM.recordStackMap(MI); + // Emit padding. 
+ // FIXME: These nops ensure that the stackmap's shadow is covered by + // instructions from the same basic block, but the nops should not be + // necessary if instructions from the same block follow the stackmap. + for (unsigned i = 0; i < NumNOPBytes; ++i) + OutStreamer.EmitInstruction(MCInstBuilder(X86::NOOP)); +} + +// Lower a patchpoint of the form: +// [<def>], <id>, <numBytes>, <target>, <numArgs>, <cc>, ... +static void LowerPATCHPOINT(MCStreamer &OutStreamer, + StackMaps &SM, + const MachineInstr &MI) { + SM.recordPatchPoint(MI); + + PatchPointOpers opers(&MI); + unsigned ScratchIdx = opers.getNextScratchIdx(); + unsigned EncodedBytes = 0; + int64_t CallTarget = opers.getMetaOper(PatchPointOpers::TargetPos).getImm(); + if (CallTarget) { + // Emit MOV to materialize the target address and the CALL to target. + // This is encoded with 12-13 bytes, depending on which register is used. + // We conservatively assume that it is 12 bytes and emit in worst case one + // extra NOP byte. + EncodedBytes = 12; + OutStreamer.EmitInstruction(MCInstBuilder(X86::MOV64ri) + .addReg(MI.getOperand(ScratchIdx).getReg()) + .addImm(CallTarget)); + OutStreamer.EmitInstruction(MCInstBuilder(X86::CALL64r) + .addReg(MI.getOperand(ScratchIdx).getReg())); + } + // Emit padding. + unsigned NumBytes = opers.getMetaOper(PatchPointOpers::NBytesPos).getImm(); + assert(NumBytes >= EncodedBytes && + "Patchpoint can't request size less than the length of a call."); + + for (unsigned i = EncodedBytes; i < NumBytes; ++i) + OutStreamer.EmitInstruction(MCInstBuilder(X86::NOOP)); +} + void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { - X86MCInstLower MCInstLowering(Mang, *MF, *this); + X86MCInstLower MCInstLowering(*MF, *this); switch (MI->getOpcode()) { case TargetOpcode::DBG_VALUE: - if (isVerbose() && OutStreamer.hasRawTextSupport()) { - std::string TmpStr; - raw_string_ostream OS(TmpStr); - PrintDebugValueComment(MI, OS); - OutStreamer.EmitRawText(StringRef(OS.str())); - } - return; + llvm_unreachable("Should be handled target independently"); // Emit nothing here but a comment if we can. case X86::Int_MemBarrier: @@ -786,6 +896,24 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { .addExpr(DotExpr)); return; } + + case TargetOpcode::STACKMAP: + return LowerSTACKMAP(OutStreamer, SM, *MI); + + case TargetOpcode::PATCHPOINT: + return LowerPATCHPOINT(OutStreamer, SM, *MI); + + case X86::MORESTACK_RET: + OutStreamer.EmitInstruction(MCInstBuilder(X86::RET)); + return; + + case X86::MORESTACK_RET_RESTORE_R10: + // Return, then restore R10. + OutStreamer.EmitInstruction(MCInstBuilder(X86::RET)); + OutStreamer.EmitInstruction(MCInstBuilder(X86::MOV64rr) + .addReg(X86::R10) + .addReg(X86::RAX)); + return; } MCInst TmpInst; diff --git a/contrib/llvm/lib/Target/X86/X86RegisterInfo.cpp b/contrib/llvm/lib/Target/X86/X86RegisterInfo.cpp index 16886e4..dbda556 100644 --- a/contrib/llvm/lib/Target/X86/X86RegisterInfo.cpp +++ b/contrib/llvm/lib/Target/X86/X86RegisterInfo.cpp @@ -54,15 +54,14 @@ static cl::opt<bool> EnableBasePointer("x86-use-base-pointer", cl::Hidden, cl::init(true), cl::desc("Enable use of a base pointer for complex stack frames")); -X86RegisterInfo::X86RegisterInfo(X86TargetMachine &tm, - const TargetInstrInfo &tii) +X86RegisterInfo::X86RegisterInfo(X86TargetMachine &tm) : X86GenRegisterInfo((tm.getSubtarget<X86Subtarget>().is64Bit() ? 
X86::RIP : X86::EIP), X86_MC::getDwarfRegFlavour(tm.getTargetTriple(), false), X86_MC::getDwarfRegFlavour(tm.getTargetTriple(), true), (tm.getSubtarget<X86Subtarget>().is64Bit() ? X86::RIP : X86::EIP)), - TM(tm), TII(tii) { + TM(tm) { X86_MC::InitLLVM2SEHRegisterMapping(this); // Cache some information. @@ -102,8 +101,8 @@ int X86RegisterInfo::getCompactUnwindRegNum(unsigned RegNum, bool isEH) const { bool X86RegisterInfo::trackLivenessAfterRegAlloc(const MachineFunction &MF) const { - // Only enable when post-RA scheduling is enabled and this is needed. - return TM.getSubtargetImpl()->postRAScheduler(); + // ExeDepsFixer and PostRAScheduler require liveness. + return true; } int @@ -240,8 +239,18 @@ X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { case CallingConv::HiPE: return CSR_NoRegs_SaveList; + case CallingConv::WebKit_JS: + return CSR_64_SaveList; + case CallingConv::AnyReg: + return CSR_MostRegs_64_SaveList; + case CallingConv::Intel_OCL_BI: { bool HasAVX = TM.getSubtarget<X86Subtarget>().hasAVX(); + bool HasAVX512 = TM.getSubtarget<X86Subtarget>().hasAVX512(); + if (HasAVX512 && IsWin64) + return CSR_Win64_Intel_OCL_BI_AVX512_SaveList; + if (HasAVX512 && Is64Bit) + return CSR_64_Intel_OCL_BI_AVX512_SaveList; if (HasAVX && IsWin64) return CSR_Win64_Intel_OCL_BI_AVX_SaveList; if (HasAVX && Is64Bit) @@ -276,8 +285,13 @@ X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { const uint32_t* X86RegisterInfo::getCallPreservedMask(CallingConv::ID CC) const { bool HasAVX = TM.getSubtarget<X86Subtarget>().hasAVX(); + bool HasAVX512 = TM.getSubtarget<X86Subtarget>().hasAVX512(); if (CC == CallingConv::Intel_OCL_BI) { + if (IsWin64 && HasAVX512) + return CSR_Win64_Intel_OCL_BI_AVX512_RegMask; + if (Is64Bit && HasAVX512) + return CSR_64_Intel_OCL_BI_AVX512_RegMask; if (IsWin64 && HasAVX) return CSR_Win64_Intel_OCL_BI_AVX_RegMask; if (Is64Bit && HasAVX) @@ -287,6 +301,8 @@ X86RegisterInfo::getCallPreservedMask(CallingConv::ID CC) const { } if (CC == CallingConv::GHC || CC == CallingConv::HiPE) return CSR_NoRegs_RegMask; + if (CC == CallingConv::WebKit_JS || CC == CallingConv::AnyReg) + return CSR_MostRegs_64_RegMask; if (!Is64Bit) return CSR_32_RegMask; if (CC == CallingConv::Cold) @@ -306,19 +322,19 @@ BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const { const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); // Set the stack-pointer register and its aliases as reserved. - Reserved.set(X86::RSP); - for (MCSubRegIterator I(X86::RSP, this); I.isValid(); ++I) + for (MCSubRegIterator I(X86::RSP, this, /*IncludeSelf=*/true); I.isValid(); + ++I) Reserved.set(*I); // Set the instruction pointer register and its aliases as reserved. - Reserved.set(X86::RIP); - for (MCSubRegIterator I(X86::RIP, this); I.isValid(); ++I) + for (MCSubRegIterator I(X86::RIP, this, /*IncludeSelf=*/true); I.isValid(); + ++I) Reserved.set(*I); // Set the frame-pointer register and its aliases as reserved if needed. 
if (TFI->hasFP(MF)) { - Reserved.set(X86::RBP); - for (MCSubRegIterator I(X86::RBP, this); I.isValid(); ++I) + for (MCSubRegIterator I(X86::RBP, this, /*IncludeSelf=*/true); I.isValid(); + ++I) Reserved.set(*I); } @@ -331,8 +347,8 @@ BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const { "Stack realignment in presence of dynamic allocas is not supported with" "this calling convention."); - Reserved.set(getBaseRegister()); - for (MCSubRegIterator I(getBaseRegister(), this); I.isValid(); ++I) + for (MCSubRegIterator I(getBaseRegister(), this, /*IncludeSelf=*/true); + I.isValid(); ++I) Reserved.set(*I); } @@ -345,14 +361,8 @@ BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const { Reserved.set(X86::GS); // Mark the floating point stack registers as reserved. - Reserved.set(X86::ST0); - Reserved.set(X86::ST1); - Reserved.set(X86::ST2); - Reserved.set(X86::ST3); - Reserved.set(X86::ST4); - Reserved.set(X86::ST5); - Reserved.set(X86::ST6); - Reserved.set(X86::ST7); + for (unsigned n = 0; n != 8; ++n) + Reserved.set(X86::ST0 + n); // Reserve the registers that only exist in 64-bit mode. if (!Is64Bit) { @@ -365,19 +375,20 @@ BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const { for (unsigned n = 0; n != 8; ++n) { // R8, R9, ... - static const uint16_t GPR64[] = { - X86::R8, X86::R9, X86::R10, X86::R11, - X86::R12, X86::R13, X86::R14, X86::R15 - }; - for (MCRegAliasIterator AI(GPR64[n], this, true); AI.isValid(); ++AI) + for (MCRegAliasIterator AI(X86::R8 + n, this, true); AI.isValid(); ++AI) Reserved.set(*AI); // XMM8, XMM9, ... - assert(X86::XMM15 == X86::XMM8+7); for (MCRegAliasIterator AI(X86::XMM8 + n, this, true); AI.isValid(); ++AI) Reserved.set(*AI); } } + if (!Is64Bit || !TM.getSubtarget<X86Subtarget>().hasAVX512()) { + for (unsigned n = 16; n != 32; ++n) { + for (MCRegAliasIterator AI(X86::XMM0 + n, this, true); AI.isValid(); ++AI) + Reserved.set(*AI); + } + } return Reserved; } @@ -407,10 +418,11 @@ bool X86RegisterInfo::hasBasePointer(const MachineFunction &MF) const { } bool X86RegisterInfo::canRealignStack(const MachineFunction &MF) const { + if (MF.getFunction()->hasFnAttribute("no-realign-stack")) + return false; + const MachineFrameInfo *MFI = MF.getFrameInfo(); const MachineRegisterInfo *MRI = &MF.getRegInfo(); - if (!MF.getTarget().Options.RealignStack) - return false; // Stack realignment requires a frame pointer. If we already started // register allocation with frame pointer elimination, it is too late now. @@ -507,14 +519,6 @@ unsigned X86RegisterInfo::getFrameRegister(const MachineFunction &MF) const { return TFI->hasFP(MF) ? 
FramePtr : StackPtr; } -unsigned X86RegisterInfo::getEHExceptionRegister() const { - llvm_unreachable("What is the exception register"); -} - -unsigned X86RegisterInfo::getEHHandlerRegister() const { - llvm_unreachable("What is the exception handler register"); -} - namespace llvm { unsigned getX86SubSuperRegister(unsigned Reg, MVT::SimpleValueType VT, bool High) { @@ -688,4 +692,15 @@ unsigned getX86SubSuperRegister(unsigned Reg, MVT::SimpleValueType VT, } } } + +unsigned get512BitSuperRegister(unsigned Reg) { + if (Reg >= X86::XMM0 && Reg <= X86::XMM31) + return X86::ZMM0 + (Reg - X86::XMM0); + if (Reg >= X86::YMM0 && Reg <= X86::YMM31) + return X86::ZMM0 + (Reg - X86::YMM0); + if (Reg >= X86::ZMM0 && Reg <= X86::ZMM31) + return Reg; + llvm_unreachable("Unexpected SIMD register"); +} + } diff --git a/contrib/llvm/lib/Target/X86/X86RegisterInfo.h b/contrib/llvm/lib/Target/X86/X86RegisterInfo.h index b9d7b8c..22251b2 100644 --- a/contrib/llvm/lib/Target/X86/X86RegisterInfo.h +++ b/contrib/llvm/lib/Target/X86/X86RegisterInfo.h @@ -27,7 +27,6 @@ namespace llvm { class X86RegisterInfo : public X86GenRegisterInfo { public: X86TargetMachine &TM; - const TargetInstrInfo &TII; private: /// Is64Bit - Is the target 64-bits. @@ -56,7 +55,7 @@ private: unsigned BasePtr; public: - X86RegisterInfo(X86TargetMachine &tm, const TargetInstrInfo &tii); + X86RegisterInfo(X86TargetMachine &tm); // FIXME: This should be tablegen'd like getDwarfRegNum is int getSEHRegNum(unsigned i) const; @@ -127,10 +126,6 @@ public: unsigned getBaseRegister() const { return BasePtr; } // FIXME: Move to FrameInfok unsigned getSlotSize() const { return SlotSize; } - - // Exception handling queries. - unsigned getEHExceptionRegister() const; - unsigned getEHHandlerRegister() const; }; // getX86SubSuperRegister - X86 utility function. It returns the sub or super @@ -138,6 +133,9 @@ public: // e.g. getX86SubSuperRegister(X86::EAX, MVT::i16) return X86:AX unsigned getX86SubSuperRegister(unsigned, MVT::SimpleValueType, bool High=false); +//get512BitRegister - X86 utility - returns 512-bit super register +unsigned get512BitSuperRegister(unsigned Reg); + } // End llvm namespace #endif diff --git a/contrib/llvm/lib/Target/X86/X86RegisterInfo.td b/contrib/llvm/lib/Target/X86/X86RegisterInfo.td index be6282a..b802728 100644 --- a/contrib/llvm/lib/Target/X86/X86RegisterInfo.td +++ b/contrib/llvm/lib/Target/X86/X86RegisterInfo.td @@ -21,11 +21,12 @@ class X86Reg<string n, bits<16> Enc, list<Register> subregs = []> : Register<n> // Subregister indices. 
let Namespace = "X86" in { - def sub_8bit : SubRegIndex; - def sub_8bit_hi : SubRegIndex; - def sub_16bit : SubRegIndex; - def sub_32bit : SubRegIndex; - def sub_xmm : SubRegIndex; + def sub_8bit : SubRegIndex<8>; + def sub_8bit_hi : SubRegIndex<8, 8>; + def sub_16bit : SubRegIndex<16>; + def sub_32bit : SubRegIndex<32>; + def sub_xmm : SubRegIndex<128>; + def sub_ymm : SubRegIndex<256>; } //===----------------------------------------------------------------------===// @@ -186,28 +187,53 @@ def XMM12: X86Reg<"xmm12", 12>, DwarfRegNum<[29, -2, -2]>; def XMM13: X86Reg<"xmm13", 13>, DwarfRegNum<[30, -2, -2]>; def XMM14: X86Reg<"xmm14", 14>, DwarfRegNum<[31, -2, -2]>; def XMM15: X86Reg<"xmm15", 15>, DwarfRegNum<[32, -2, -2]>; + +def XMM16: X86Reg<"xmm16", 16>, DwarfRegNum<[60, -2, -2]>; +def XMM17: X86Reg<"xmm17", 17>, DwarfRegNum<[61, -2, -2]>; +def XMM18: X86Reg<"xmm18", 18>, DwarfRegNum<[62, -2, -2]>; +def XMM19: X86Reg<"xmm19", 19>, DwarfRegNum<[63, -2, -2]>; +def XMM20: X86Reg<"xmm20", 20>, DwarfRegNum<[64, -2, -2]>; +def XMM21: X86Reg<"xmm21", 21>, DwarfRegNum<[65, -2, -2]>; +def XMM22: X86Reg<"xmm22", 22>, DwarfRegNum<[66, -2, -2]>; +def XMM23: X86Reg<"xmm23", 23>, DwarfRegNum<[67, -2, -2]>; +def XMM24: X86Reg<"xmm24", 24>, DwarfRegNum<[68, -2, -2]>; +def XMM25: X86Reg<"xmm25", 25>, DwarfRegNum<[69, -2, -2]>; +def XMM26: X86Reg<"xmm26", 26>, DwarfRegNum<[70, -2, -2]>; +def XMM27: X86Reg<"xmm27", 27>, DwarfRegNum<[71, -2, -2]>; +def XMM28: X86Reg<"xmm28", 28>, DwarfRegNum<[72, -2, -2]>; +def XMM29: X86Reg<"xmm29", 29>, DwarfRegNum<[73, -2, -2]>; +def XMM30: X86Reg<"xmm30", 30>, DwarfRegNum<[74, -2, -2]>; +def XMM31: X86Reg<"xmm31", 31>, DwarfRegNum<[75, -2, -2]>; + } // CostPerUse -// YMM Registers, used by AVX instructions +// YMM0-15 registers, used by AVX instructions and +// YMM16-31 registers, used by AVX-512 instructions. let SubRegIndices = [sub_xmm] in { -def YMM0: X86Reg<"ymm0", 0, [XMM0]>, DwarfRegAlias<XMM0>; -def YMM1: X86Reg<"ymm1", 1, [XMM1]>, DwarfRegAlias<XMM1>; -def YMM2: X86Reg<"ymm2", 2, [XMM2]>, DwarfRegAlias<XMM2>; -def YMM3: X86Reg<"ymm3", 3, [XMM3]>, DwarfRegAlias<XMM3>; -def YMM4: X86Reg<"ymm4", 4, [XMM4]>, DwarfRegAlias<XMM4>; -def YMM5: X86Reg<"ymm5", 5, [XMM5]>, DwarfRegAlias<XMM5>; -def YMM6: X86Reg<"ymm6", 6, [XMM6]>, DwarfRegAlias<XMM6>; -def YMM7: X86Reg<"ymm7", 7, [XMM7]>, DwarfRegAlias<XMM7>; -def YMM8: X86Reg<"ymm8", 8, [XMM8]>, DwarfRegAlias<XMM8>; -def YMM9: X86Reg<"ymm9", 9, [XMM9]>, DwarfRegAlias<XMM9>; -def YMM10: X86Reg<"ymm10", 10, [XMM10]>, DwarfRegAlias<XMM10>; -def YMM11: X86Reg<"ymm11", 11, [XMM11]>, DwarfRegAlias<XMM11>; -def YMM12: X86Reg<"ymm12", 12, [XMM12]>, DwarfRegAlias<XMM12>; -def YMM13: X86Reg<"ymm13", 13, [XMM13]>, DwarfRegAlias<XMM13>; -def YMM14: X86Reg<"ymm14", 14, [XMM14]>, DwarfRegAlias<XMM14>; -def YMM15: X86Reg<"ymm15", 15, [XMM15]>, DwarfRegAlias<XMM15>; + foreach Index = 0-31 in { + def YMM#Index : X86Reg<"ymm"#Index, Index, [!cast<X86Reg>("XMM"#Index)]>, + DwarfRegAlias<!cast<X86Reg>("XMM"#Index)>; + } +} + +// ZMM Registers, used by AVX-512 instructions. +let SubRegIndices = [sub_ymm] in { + foreach Index = 0-31 in { + def ZMM#Index : X86Reg<"zmm"#Index, Index, [!cast<X86Reg>("YMM"#Index)]>, + DwarfRegAlias<!cast<X86Reg>("XMM"#Index)>; + } } + // Mask Registers, used by AVX-512 instructions. 
+ def K0 : X86Reg<"k0", 0>, DwarfRegNum<[118, -2, -2]>; + def K1 : X86Reg<"k1", 1>, DwarfRegNum<[119, -2, -2]>; + def K2 : X86Reg<"k2", 2>, DwarfRegNum<[120, -2, -2]>; + def K3 : X86Reg<"k3", 3>, DwarfRegNum<[121, -2, -2]>; + def K4 : X86Reg<"k4", 4>, DwarfRegNum<[122, -2, -2]>; + def K5 : X86Reg<"k5", 5>, DwarfRegNum<[123, -2, -2]>; + def K6 : X86Reg<"k6", 6>, DwarfRegNum<[124, -2, -2]>; + def K7 : X86Reg<"k7", 7>, DwarfRegNum<[125, -2, -2]>; + class STRegister<string n, bits<16> Enc, list<Register> A> : X86Reg<n, Enc> { let Aliases = A; } @@ -421,3 +447,25 @@ def FPCCR : RegisterClass<"X86", [i16], 16, (add FPSW)> { let CopyCost = -1; // Don't allow copying of status registers. let isAllocatable = 0; } + +// AVX-512 vector/mask registers. +def VR512 : RegisterClass<"X86", [v16f32, v8f64, v16i32, v8i64], 512, + (sequence "ZMM%u", 0, 31)>; + +// Scalar AVX-512 floating point registers. +def FR32X : RegisterClass<"X86", [f32], 32, (sequence "XMM%u", 0, 31)>; + +def FR64X : RegisterClass<"X86", [f64], 64, (add FR32X)>; + +// Extended VR128 and VR256 for AVX-512 instructions +def VR128X : RegisterClass<"X86", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], + 128, (add FR32X)>; +def VR256X : RegisterClass<"X86", [v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], + 256, (sequence "YMM%u", 0, 31)>; + +def VK8 : RegisterClass<"X86", [v8i1], 8, (sequence "K%u", 0, 7)>; +def VK16 : RegisterClass<"X86", [v16i1], 16, (add VK8)>; + +def VK8WM : RegisterClass<"X86", [v8i1], 8, (sub VK8, K0)>; +def VK16WM : RegisterClass<"X86", [v16i1], 16, (add VK8WM)>; + diff --git a/contrib/llvm/lib/Target/X86/X86SchedHaswell.td b/contrib/llvm/lib/Target/X86/X86SchedHaswell.td index 84c9203..9748261 100644 --- a/contrib/llvm/lib/Target/X86/X86SchedHaswell.td +++ b/contrib/llvm/lib/Target/X86/X86SchedHaswell.td @@ -16,10 +16,13 @@ def HaswellModel : SchedMachineModel { // All x86 instructions are modeled as a single micro-op, and HW can decode 4 // instructions per cycle. let IssueWidth = 4; - let MinLatency = 0; // 0 = Out-of-order execution. + let MicroOpBufferSize = 192; // Based on the reorder buffer. let LoadLatency = 4; - let ILPWindow = 30; let MispredictPenalty = 16; + + // FIXME: SSE4 and AVX are unimplemented. This flag is set to allow + // the scheduler to assign a default model to unrecognized opcodes. + let CompleteModel = 0; } let SchedModel = HaswellModel in { @@ -50,6 +53,12 @@ def HWPort15 : ProcResGroup<[HWPort1, HWPort5]>; def HWPort015 : ProcResGroup<[HWPort0, HWPort1, HWPort5]>; def HWPort0156: ProcResGroup<[HWPort0, HWPort1, HWPort5, HWPort6]>; +// 60 Entry Unified Scheduler +def HWPortAny : ProcResGroup<[HWPort0, HWPort1, HWPort2, HWPort3, HWPort4, + HWPort5, HWPort6, HWPort7]> { + let BufferSize=60; +} + // Integer division issued on port 0. def HWDivider : ProcResource<1>; @@ -86,6 +95,7 @@ def : WriteRes<WriteZero, []>; defm : HWWriteResPair<WriteALU, HWPort0156, 1>; defm : HWWriteResPair<WriteIMul, HWPort1, 3>; +def : WriteRes<WriteIMulH, []> { let Latency = 3; } defm : HWWriteResPair<WriteShift, HWPort056, 1>; defm : HWWriteResPair<WriteJump, HWPort5, 1>; diff --git a/contrib/llvm/lib/Target/X86/X86SchedSandyBridge.td b/contrib/llvm/lib/Target/X86/X86SchedSandyBridge.td index b36b3ad..3011c6d 100644 --- a/contrib/llvm/lib/Target/X86/X86SchedSandyBridge.td +++ b/contrib/llvm/lib/Target/X86/X86SchedSandyBridge.td @@ -17,10 +17,13 @@ def SandyBridgeModel : SchedMachineModel { // instructions per cycle. // FIXME: Identify instructions that aren't a single fused micro-op. 
let IssueWidth = 4; - let MinLatency = 0; // 0 = Out-of-order execution. + let MicroOpBufferSize = 168; // Based on the reorder buffer. let LoadLatency = 4; - let ILPWindow = 20; let MispredictPenalty = 16; + + // FIXME: SSE4 and AVX are unimplemented. This flag is set to allow + // the scheduler to assign a default model to unrecognized opcodes. + let CompleteModel = 0; } let SchedModel = SandyBridgeModel in { @@ -46,6 +49,11 @@ def SBPort05 : ProcResGroup<[SBPort0, SBPort5]>; def SBPort15 : ProcResGroup<[SBPort1, SBPort5]>; def SBPort015 : ProcResGroup<[SBPort0, SBPort1, SBPort5]>; +// 54 Entry Unified Scheduler +def SBPortAny : ProcResGroup<[SBPort0, SBPort1, SBPort23, SBPort4, SBPort5]> { + let BufferSize=54; +} + // Integer division issued on port 0. def SBDivider : ProcResource<1>; @@ -82,6 +90,7 @@ def : WriteRes<WriteZero, []>; defm : SBWriteResPair<WriteALU, SBPort015, 1>; defm : SBWriteResPair<WriteIMul, SBPort1, 3>; +def : WriteRes<WriteIMulH, []> { let Latency = 3; } defm : SBWriteResPair<WriteShift, SBPort05, 1>; defm : SBWriteResPair<WriteJump, SBPort5, 1>; diff --git a/contrib/llvm/lib/Target/X86/X86Schedule.td b/contrib/llvm/lib/Target/X86/X86Schedule.td index 9fbde88..0556437 100644 --- a/contrib/llvm/lib/Target/X86/X86Schedule.td +++ b/contrib/llvm/lib/Target/X86/X86Schedule.td @@ -42,6 +42,7 @@ multiclass X86SchedWritePair { // Arithmetic. defm WriteALU : X86SchedWritePair; // Simple integer ALU op. defm WriteIMul : X86SchedWritePair; // Integer multiplication. +def WriteIMulH : SchedWrite; // Integer multiplication, high part. defm WriteIDiv : X86SchedWritePair; // Integer division. def WriteLEA : SchedWrite; // LEA instructions can't fold loads. @@ -140,9 +141,12 @@ def IIC_IDIV64 : InstrItinClass; // neg/not/inc/dec def IIC_UNARY_REG : InstrItinClass; def IIC_UNARY_MEM : InstrItinClass; -// add/sub/and/or/xor/adc/sbc/cmp/test +// add/sub/and/or/xor/sbc/cmp/test def IIC_BIN_MEM : InstrItinClass; def IIC_BIN_NONMEM : InstrItinClass; +// adc/sbc +def IIC_BIN_CARRY_MEM : InstrItinClass; +def IIC_BIN_CARRY_NONMEM : InstrItinClass; // shift/rotate def IIC_SR : InstrItinClass; // shift double @@ -249,11 +253,11 @@ def IIC_SSE_INTSH_P_RR : InstrItinClass; def IIC_SSE_INTSH_P_RM : InstrItinClass; def IIC_SSE_INTSH_P_RI : InstrItinClass; -def IIC_SSE_CMPP_RR : InstrItinClass; -def IIC_SSE_CMPP_RM : InstrItinClass; +def IIC_SSE_INTSHDQ_P_RI : InstrItinClass; def IIC_SSE_SHUFP : InstrItinClass; -def IIC_SSE_PSHUF : InstrItinClass; +def IIC_SSE_PSHUF_RI : InstrItinClass; +def IIC_SSE_PSHUF_MI : InstrItinClass; def IIC_SSE_UNPCK : InstrItinClass; @@ -266,10 +270,14 @@ def IIC_SSE_PINSRW : InstrItinClass; def IIC_SSE_PABS_RR : InstrItinClass; def IIC_SSE_PABS_RM : InstrItinClass; -def IIC_SSE_SQRTP_RR : InstrItinClass; -def IIC_SSE_SQRTP_RM : InstrItinClass; -def IIC_SSE_SQRTS_RR : InstrItinClass; -def IIC_SSE_SQRTS_RM : InstrItinClass; +def IIC_SSE_SQRTPS_RR : InstrItinClass; +def IIC_SSE_SQRTPS_RM : InstrItinClass; +def IIC_SSE_SQRTSS_RR : InstrItinClass; +def IIC_SSE_SQRTSS_RM : InstrItinClass; +def IIC_SSE_SQRTPD_RR : InstrItinClass; +def IIC_SSE_SQRTPD_RM : InstrItinClass; +def IIC_SSE_SQRTSD_RR : InstrItinClass; +def IIC_SSE_SQRTSD_RM : InstrItinClass; def IIC_SSE_RCPP_RR : InstrItinClass; def IIC_SSE_RCPP_RM : InstrItinClass; @@ -311,7 +319,8 @@ def IIC_SSE_PSIGN_RM : InstrItinClass; def IIC_SSE_PMADD : InstrItinClass; def IIC_SSE_PMULHRSW : InstrItinClass; -def IIC_SSE_PALIGNR : InstrItinClass; +def IIC_SSE_PALIGNRR : InstrItinClass; +def IIC_SSE_PALIGNRM : InstrItinClass; 
def IIC_SSE_MWAIT : InstrItinClass; def IIC_SSE_MONITOR : InstrItinClass; @@ -487,8 +496,8 @@ def IIC_PUSH_REG : InstrItinClass; def IIC_PUSH_F : InstrItinClass; def IIC_PUSH_A : InstrItinClass; def IIC_BSWAP : InstrItinClass; -def IIC_BSF : InstrItinClass; -def IIC_BSR : InstrItinClass; +def IIC_BIT_SCAN_MEM : InstrItinClass; +def IIC_BIT_SCAN_REG : InstrItinClass; def IIC_MOVS : InstrItinClass; def IIC_STOS : InstrItinClass; def IIC_SCAS : InstrItinClass; @@ -535,6 +544,33 @@ def IIC_BOUND : InstrItinClass; def IIC_ARPL_REG : InstrItinClass; def IIC_ARPL_MEM : InstrItinClass; def IIC_MOVBE : InstrItinClass; +def IIC_AES : InstrItinClass; +def IIC_BLEND_MEM : InstrItinClass; +def IIC_BLEND_NOMEM : InstrItinClass; +def IIC_CBW : InstrItinClass; +def IIC_CRC32_REG : InstrItinClass; +def IIC_CRC32_MEM : InstrItinClass; +def IIC_SSE_DPPD_RR : InstrItinClass; +def IIC_SSE_DPPD_RM : InstrItinClass; +def IIC_SSE_DPPS_RR : InstrItinClass; +def IIC_SSE_DPPS_RM : InstrItinClass; +def IIC_MMX_EMMS : InstrItinClass; +def IIC_SSE_EXTRACTPS_RR : InstrItinClass; +def IIC_SSE_EXTRACTPS_RM : InstrItinClass; +def IIC_SSE_INSERTPS_RR : InstrItinClass; +def IIC_SSE_INSERTPS_RM : InstrItinClass; +def IIC_SSE_MPSADBW_RR : InstrItinClass; +def IIC_SSE_MPSADBW_RM : InstrItinClass; +def IIC_SSE_PMULLD_RR : InstrItinClass; +def IIC_SSE_PMULLD_RM : InstrItinClass; +def IIC_SSE_ROUNDPS_REG : InstrItinClass; +def IIC_SSE_ROUNDPS_MEM : InstrItinClass; +def IIC_SSE_ROUNDPD_REG : InstrItinClass; +def IIC_SSE_ROUNDPD_MEM : InstrItinClass; +def IIC_SSE_POPCNT_RR : InstrItinClass; +def IIC_SSE_POPCNT_RM : InstrItinClass; +def IIC_SSE_PCLMULQDQ_RR : InstrItinClass; +def IIC_SSE_PCLMULQDQ_RM : InstrItinClass; def IIC_NOP : InstrItinClass; @@ -546,8 +582,9 @@ def IIC_NOP : InstrItinClass; // Resources beyond the decoder operate on micro-ops and are bufferred // so adjacent micro-ops don't directly compete. // -// MinLatency=0 indicates that RAW dependencies can be decoded in the -// same cycle. +// MicroOpBufferSize > 1 indicates that RAW dependencies can be +// decoded in the same cycle. The value 32 is a reasonably arbitrary +// number of in-flight instructions. // // HighLatency=10 is optimistic. X86InstrInfo::isHighLatencyDef // indicates high latency opcodes. Alternatively, InstrItinData @@ -555,19 +592,15 @@ def IIC_NOP : InstrItinClass; // latencies. Since these latencies are not used for pipeline hazards, // they do not need to be exact. // -// ILPWindow=10 is an arbitrary threshold that approximates cycles of -// latency hidden by instruction buffers. The actual value is not very -// important but should be zero for inorder and nonzero for OOO processors. -// -// The GenericModel contains no instruciton itineraries. +// The GenericModel contains no instruction itineraries. def GenericModel : SchedMachineModel { let IssueWidth = 4; - let MinLatency = 0; + let MicroOpBufferSize = 32; let LoadLatency = 4; let HighLatency = 10; - let ILPWindow = 10; } include "X86ScheduleAtom.td" include "X86SchedSandyBridge.td" include "X86SchedHaswell.td" +include "X86ScheduleSLM.td" diff --git a/contrib/llvm/lib/Target/X86/X86ScheduleAtom.td b/contrib/llvm/lib/Target/X86/X86ScheduleAtom.td index cce8f1b..ba72f29 100644 --- a/contrib/llvm/lib/Target/X86/X86ScheduleAtom.td +++ b/contrib/llvm/lib/Target/X86/X86ScheduleAtom.td @@ -7,8 +7,8 @@ // //===----------------------------------------------------------------------===// // -// This file defines the itinerary class data for the Intel Atom (Bonnell) -// processors. 
+// This file defines the itinerary class data for the Intel Atom +// in order (Saltwell-32nm/Bonnell-45nm) processors. // //===----------------------------------------------------------------------===// @@ -79,9 +79,12 @@ def AtomItineraries : ProcessorItineraries< // neg/not/inc/dec InstrItinData<IIC_UNARY_REG, [InstrStage<1, [Port0, Port1]>] >, InstrItinData<IIC_UNARY_MEM, [InstrStage<1, [Port0]>] >, - // add/sub/and/or/xor/adc/sbc/cmp/test + // add/sub/and/or/xor/cmp/test InstrItinData<IIC_BIN_NONMEM, [InstrStage<1, [Port0, Port1]>] >, InstrItinData<IIC_BIN_MEM, [InstrStage<1, [Port0]>] >, + // adc/sbc + InstrItinData<IIC_BIN_CARRY_NONMEM, [InstrStage<1, [Port0, Port1]>] >, + InstrItinData<IIC_BIN_CARRY_MEM, [InstrStage<1, [Port0]>] >, // shift/rotate InstrItinData<IIC_SR, [InstrStage<1, [Port0]>] >, // shift double @@ -203,18 +206,23 @@ def AtomItineraries : ProcessorItineraries< InstrItinData<IIC_SSE_INTSH_P_RM, [InstrStage<3, [Port0, Port1]>] >, InstrItinData<IIC_SSE_INTSH_P_RI, [InstrStage<1, [Port0, Port1]>] >, - InstrItinData<IIC_SSE_CMPP_RR, [InstrStage<6, [Port0, Port1]>] >, - InstrItinData<IIC_SSE_CMPP_RM, [InstrStage<7, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_INTSHDQ_P_RI, [InstrStage<1, [Port0, Port1]>] >, InstrItinData<IIC_SSE_SHUFP, [InstrStage<1, [Port0]>] >, - InstrItinData<IIC_SSE_PSHUF, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_SSE_PSHUF_RI, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_SSE_PSHUF_MI, [InstrStage<1, [Port0]>] >, InstrItinData<IIC_SSE_UNPCK, [InstrStage<1, [Port0]>] >, - InstrItinData<IIC_SSE_SQRTP_RR, [InstrStage<13, [Port0, Port1]>] >, - InstrItinData<IIC_SSE_SQRTP_RM, [InstrStage<14, [Port0, Port1]>] >, - InstrItinData<IIC_SSE_SQRTS_RR, [InstrStage<11, [Port0, Port1]>] >, - InstrItinData<IIC_SSE_SQRTS_RM, [InstrStage<12, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_SQRTPS_RR, [InstrStage<70, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_SQRTPS_RM, [InstrStage<70, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_SQRTSS_RR, [InstrStage<34, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_SQRTSS_RM, [InstrStage<34, [Port0, Port1]>] >, + + InstrItinData<IIC_SSE_SQRTPD_RR, [InstrStage<125, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_SQRTPD_RM, [InstrStage<125, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_SQRTSD_RR, [InstrStage<62, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_SQRTSD_RM, [InstrStage<62, [Port0, Port1]>] >, InstrItinData<IIC_SSE_RCPP_RR, [InstrStage<9, [Port0, Port1]>] >, InstrItinData<IIC_SSE_RCPP_RM, [InstrStage<10, [Port0, Port1]>] >, @@ -273,7 +281,8 @@ def AtomItineraries : ProcessorItineraries< InstrItinData<IIC_SSE_PMADD, [InstrStage<5, [Port0]>] >, InstrItinData<IIC_SSE_PMULHRSW, [InstrStage<5, [Port0]>] >, - InstrItinData<IIC_SSE_PALIGNR, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_SSE_PALIGNRR, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_SSE_PALIGNRM, [InstrStage<1, [Port0]>] >, InstrItinData<IIC_SSE_MWAIT, [InstrStage<46, [Port0, Port1]>] >, InstrItinData<IIC_SSE_MONITOR, [InstrStage<45, [Port0, Port1]>] >, @@ -465,8 +474,8 @@ def AtomItineraries : ProcessorItineraries< InstrItinData<IIC_PUSH_A, [InstrStage<8, [Port0, Port1]>] >, InstrItinData<IIC_BSWAP, [InstrStage<1, [Port0]>] >, - InstrItinData<IIC_BSF, [InstrStage<16, [Port0, Port1]>] >, - InstrItinData<IIC_BSR, [InstrStage<16, [Port0, Port1]>] >, + InstrItinData<IIC_BIT_SCAN_MEM, [InstrStage<16, [Port0, Port1]>] >, + InstrItinData<IIC_BIT_SCAN_REG, [InstrStage<16, [Port0, Port1]>] >, InstrItinData<IIC_MOVS, [InstrStage<3, [Port0, Port1]>] >, InstrItinData<IIC_STOS, 
[InstrStage<1, [Port0, Port1]>] >, InstrItinData<IIC_SCAS, [InstrStage<2, [Port0, Port1]>] >, @@ -513,6 +522,8 @@ def AtomItineraries : ProcessorItineraries< InstrItinData<IIC_ARPL_REG, [InstrStage<24, [Port0, Port1]>] >, InstrItinData<IIC_ARPL_MEM, [InstrStage<23, [Port0, Port1]>] >, InstrItinData<IIC_MOVBE, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_CBW, [InstrStage<4, [Port0, Port1]>] >, + InstrItinData<IIC_MMX_EMMS, [InstrStage<5, [Port0, Port1]>] >, InstrItinData<IIC_NOP, [InstrStage<1, [Port0, Port1]>] > ]>; @@ -520,11 +531,9 @@ def AtomItineraries : ProcessorItineraries< // Atom machine model. def AtomModel : SchedMachineModel { let IssueWidth = 2; // Allows 2 instructions per scheduling group. - let MinLatency = 1; // InstrStage cycles overrides MinLatency. - // OperandCycles may be used for expected latency. + let MicroOpBufferSize = 0; // In-order execution, always hide latency. let LoadLatency = 3; // Expected cycles, may be overriden by OperandCycles. let HighLatency = 30;// Expected, may be overriden by OperandCycles. - let ILPWindow = 0; // Always try to hide expected latency. let Itineraries = AtomItineraries; } diff --git a/contrib/llvm/lib/Target/X86/X86ScheduleSLM.td b/contrib/llvm/lib/Target/X86/X86ScheduleSLM.td new file mode 100644 index 0000000..6c2a304 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86ScheduleSLM.td @@ -0,0 +1,668 @@ +//===- X86ScheduleSLM.td - X86 Atom Scheduling Definitions -*- tablegen -*-==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the itinerary class data for the Intel Atom +// (Silvermont) processor. 
+// +//===----------------------------------------------------------------------===// + +def IEC_RSV0 : FuncUnit; +def IEC_RSV1 : FuncUnit; +def FPC_RSV0 : FuncUnit; +def FPC_RSV1 : FuncUnit; +def MEC_RSV : FuncUnit; + + + + + + + + + + + + + + +def SLMItineraries : ProcessorItineraries< + [ IEC_RSV0, IEC_RSV1, FPC_RSV0, FPC_RSV1, MEC_RSV ], + [], [ + // [InstrStage<N, [FPC_RSV0, FPC_RSV1]>] + // [InstrStage<N, [FPC_RSV0, FPC_RSV1], 0>, InstrStage<N, [MEC_RSV]>] + // [InstrStage<N, [IEC_RSV0, IEC_RSV1]>] + // [InstrStage<N, [IEC_RSV0, IEC_RSV1], 0>,InstrStage<N,[MEC_RSV]>] + // + // Default is 1 cycle, IEC_RSV0 or IEC_RSV1 + //InstrItinData<IIC_DEFAULT, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_ALU_MEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1], 0>, + InstrStage<1, [MEC_RSV]>] >, + InstrItinData<IIC_ALU_NONMEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_LEA, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_LEA_16, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + // mul + InstrItinData<IIC_MUL8, [InstrStage<4, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_MUL16_MEM, [InstrStage<4, [IEC_RSV0, IEC_RSV1], 0>, + InstrStage<4, [MEC_RSV]>] >, + InstrItinData<IIC_MUL16_REG, [InstrStage<4, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_MUL32_MEM, [InstrStage<3, [IEC_RSV0, IEC_RSV1], 0>, + InstrStage<3, [MEC_RSV]>] >, + InstrItinData<IIC_MUL32_REG, [InstrStage<3, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_MUL64, [InstrStage<4, [IEC_RSV0, IEC_RSV1]>] >, + // imul by al, ax, eax, rax + InstrItinData<IIC_IMUL8, [InstrStage<6, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_IMUL16_MEM, [InstrStage<6, [IEC_RSV0, IEC_RSV1], 0>, + InstrStage<6, [MEC_RSV]>] >, + InstrItinData<IIC_IMUL16_REG, [InstrStage<6, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_IMUL32_MEM, [InstrStage<6, [IEC_RSV0, IEC_RSV1], 0>, + InstrStage<6, [MEC_RSV]>] >, + InstrItinData<IIC_IMUL32_REG, [InstrStage<6, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_IMUL64, [InstrStage<6, [IEC_RSV0, IEC_RSV1]>] >, + // imul reg by reg|mem + InstrItinData<IIC_IMUL16_RM, [InstrStage<4, [IEC_RSV0, IEC_RSV1], 0>, + InstrStage<4, [MEC_RSV]>] >, + InstrItinData<IIC_IMUL16_RR, [InstrStage<4, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_IMUL32_RM, [InstrStage<3, [IEC_RSV0, IEC_RSV1], 0>, + InstrStage<3, [MEC_RSV]>] >, + InstrItinData<IIC_IMUL32_RR, [InstrStage<3, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_IMUL64_RM, [InstrStage<4, [IEC_RSV0, IEC_RSV1], 0>, + InstrStage<4, [MEC_RSV]>] >, + InstrItinData<IIC_IMUL64_RR, [InstrStage<4, [IEC_RSV0, IEC_RSV1]>] >, + // imul reg = reg/mem * imm + InstrItinData<IIC_IMUL16_RRI, [InstrStage<4, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_IMUL32_RRI, [InstrStage<3, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_IMUL64_RRI, [InstrStage<4, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_IMUL16_RMI, [InstrStage<4, [IEC_RSV0, IEC_RSV1], 0>, + InstrStage<4, [MEC_RSV]>] >, + InstrItinData<IIC_IMUL32_RMI, [InstrStage<3, [IEC_RSV0, IEC_RSV1], 0>, + InstrStage<3, [MEC_RSV]>] >, + InstrItinData<IIC_IMUL64_RMI, [InstrStage<4, [IEC_RSV0, IEC_RSV1], 0>, + InstrStage<4, [MEC_RSV]>] >, + // idiv - min latency + InstrItinData<IIC_IDIV8, [InstrStage<34, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_IDIV16, [InstrStage<35, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_IDIV32, [InstrStage<35, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_IDIV64, [InstrStage<49, [IEC_RSV0, IEC_RSV1]>] >, + // div - min latency + InstrItinData<IIC_DIV8_REG, [InstrStage<25, [IEC_RSV0, IEC_RSV1]>] >, + 
InstrItinData<IIC_DIV8_MEM, [InstrStage<25, [IEC_RSV0, IEC_RSV1], 0>, + InstrStage<25, [MEC_RSV]>] >, + InstrItinData<IIC_DIV16, [InstrStage<26, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_DIV32, [InstrStage<26, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_DIV64, [InstrStage<38, [IEC_RSV0, IEC_RSV1]>] >, + // neg/not/inc/dec + InstrItinData<IIC_UNARY_REG, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_UNARY_MEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1], 0>, + InstrStage<1, [MEC_RSV]>] >, + // add/sub/and/or/xor/adc/sbc/cmp/test + InstrItinData<IIC_BIN_NONMEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_BIN_MEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1], 0>, + InstrStage<1, [MEC_RSV]>] >, + // adc/sbb + InstrItinData<IIC_BIN_CARRY_NONMEM, [InstrStage<2, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_BIN_CARRY_MEM, [InstrStage<2, [IEC_RSV0, IEC_RSV1], 0>, + InstrStage<2, [MEC_RSV]>] >, + // shift/rotate + InstrItinData<IIC_SR, [InstrStage<1, [IEC_RSV0], 0>, + InstrStage<1, [MEC_RSV]>] >, + // shift double + InstrItinData<IIC_SHD16_REG_IM, [InstrStage<2, [IEC_RSV0]>] >, + InstrItinData<IIC_SHD16_REG_CL, [InstrStage<4, [IEC_RSV0]>] >, + InstrItinData<IIC_SHD16_MEM_IM, [InstrStage<2, [IEC_RSV0], 0>, + InstrStage<2, [MEC_RSV]>] >, + InstrItinData<IIC_SHD16_MEM_CL, [InstrStage<4, [IEC_RSV0], 0>, + InstrStage<4, [MEC_RSV]>] >, + InstrItinData<IIC_SHD32_REG_IM, [InstrStage<2, [IEC_RSV0]>] >, + InstrItinData<IIC_SHD32_REG_CL, [InstrStage<4, [IEC_RSV0]>] >, + InstrItinData<IIC_SHD32_MEM_IM, [InstrStage<2, [IEC_RSV0], 0>, + InstrStage<2, [MEC_RSV]>] >, + InstrItinData<IIC_SHD32_MEM_CL, [InstrStage<4, [IEC_RSV0], 0>, + InstrStage<4, [MEC_RSV]>] >, + InstrItinData<IIC_SHD64_REG_IM, [InstrStage<2, [IEC_RSV0]>] >, + InstrItinData<IIC_SHD64_REG_CL, [InstrStage<4, [IEC_RSV0]>] >, + InstrItinData<IIC_SHD64_MEM_IM, [InstrStage<2, [IEC_RSV0], 0>, + InstrStage<2, [MEC_RSV]>] >, + InstrItinData<IIC_SHD64_MEM_CL, [InstrStage<4, [IEC_RSV0], 0>, + InstrStage<4, [MEC_RSV]>] >, + // cmov + InstrItinData<IIC_CMOV16_RM, [InstrStage<2, [IEC_RSV0, IEC_RSV1], 0>, + InstrStage<2, [MEC_RSV]>] >, + InstrItinData<IIC_CMOV16_RR, [InstrStage<2, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_CMOV32_RM, [InstrStage<2, [IEC_RSV0, IEC_RSV1], 0>, + InstrStage<2, [MEC_RSV]>] >, + InstrItinData<IIC_CMOV32_RR, [InstrStage<2, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_CMOV64_RM, [InstrStage<2, [IEC_RSV0, IEC_RSV1], 0>, + InstrStage<2, [MEC_RSV]>] >, + InstrItinData<IIC_CMOV64_RR, [InstrStage<2, [IEC_RSV0, IEC_RSV1]>] >, + // set + InstrItinData<IIC_SET_M, [InstrStage<1, [IEC_RSV0, IEC_RSV1], 0>, + InstrStage<1, [MEC_RSV]>] >, + InstrItinData<IIC_SET_R, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + // jcc + InstrItinData<IIC_Jcc, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + // jcxz/jecxz/jrcxz + InstrItinData<IIC_JCXZ, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + // jmp rel + InstrItinData<IIC_JMP_REL, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + // jmp indirect + InstrItinData<IIC_JMP_REG, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_JMP_MEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1], 0>, + InstrStage<1, [MEC_RSV]>] >, + // jmp far + InstrItinData<IIC_JMP_FAR_MEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1], 0>, + InstrStage<1, [MEC_RSV]>] >, + InstrItinData<IIC_JMP_FAR_PTR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + // loop/loope/loopne + InstrItinData<IIC_LOOP, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_LOOPE, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_LOOPNE, [InstrStage<1, 
[IEC_RSV0, IEC_RSV1]>] >, + // call - all but reg/imm + InstrItinData<IIC_CALL_RI, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_CALL_MEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1], 0>, + InstrStage<1, [MEC_RSV]>] >, + InstrItinData<IIC_CALL_FAR_MEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1], 0>, + InstrStage<1, [MEC_RSV]>] >, + InstrItinData<IIC_CALL_FAR_PTR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + //ret + InstrItinData<IIC_RET, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_RET_IMM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + //sign extension movs + InstrItinData<IIC_MOVSX, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_MOVSX_R16_R8, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_MOVSX_R16_M8, [InstrStage<1, [IEC_RSV0, IEC_RSV1], 0>, + InstrStage<1, [MEC_RSV]>] >, + InstrItinData<IIC_MOVSX_R16_R16, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_MOVSX_R32_R32, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + //zero extension movs + InstrItinData<IIC_MOVZX, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_MOVZX_R16_R8, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_MOVZX_R16_M8, [InstrStage<1, [IEC_RSV0, IEC_RSV1], 0>, + InstrStage<1, [MEC_RSV]>] >, + + InstrItinData<IIC_REP_MOVS, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_REP_STOS, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + + // SSE binary operations + // arithmetic fp scalar + InstrItinData<IIC_SSE_ALU_F32S_RR, [InstrStage<3, [FPC_RSV0, FPC_RSV1]>] >, + InstrItinData<IIC_SSE_ALU_F32S_RM, [InstrStage<3, [FPC_RSV0, FPC_RSV1], 0>, + InstrStage<3, [MEC_RSV]>] >, + InstrItinData<IIC_SSE_ALU_F64S_RR, [InstrStage<3, [FPC_RSV0, FPC_RSV1]>] >, + InstrItinData<IIC_SSE_ALU_F64S_RM, [InstrStage<3, [FPC_RSV0, FPC_RSV1], 0>, + InstrStage<3, [MEC_RSV]>] >, + InstrItinData<IIC_SSE_MUL_F32S_RR, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >, + InstrItinData<IIC_SSE_MUL_F32S_RM, [InstrStage<1, [FPC_RSV0, FPC_RSV1], 0>, + InstrStage<1, [MEC_RSV]>] >, + InstrItinData<IIC_SSE_MUL_F64S_RR, [InstrStage<2, [FPC_RSV0, FPC_RSV1]>] >, + InstrItinData<IIC_SSE_MUL_F64S_RM, [InstrStage<2, [FPC_RSV0, FPC_RSV1], 0>, + InstrStage<2, [MEC_RSV]>] >, + InstrItinData<IIC_SSE_DIV_F32S_RR, [InstrStage<13, [FPC_RSV0, FPC_RSV1]>] >, + InstrItinData<IIC_SSE_DIV_F32S_RM, [InstrStage<13, [FPC_RSV0, FPC_RSV1], 0>, + InstrStage<13, [MEC_RSV]>] >, + InstrItinData<IIC_SSE_DIV_F64S_RR, [InstrStage<13, [FPC_RSV0, FPC_RSV1]>] >, + InstrItinData<IIC_SSE_DIV_F64S_RM, [InstrStage<13, [FPC_RSV0, FPC_RSV1], 0>, + InstrStage<13, [MEC_RSV]>] >, + + InstrItinData<IIC_SSE_COMIS_RR, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >, + InstrItinData<IIC_SSE_COMIS_RM, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >, + + InstrItinData<IIC_SSE_HADDSUB_RR, [InstrStage<6, [FPC_RSV0, FPC_RSV1]>] >, + InstrItinData<IIC_SSE_HADDSUB_RM, [InstrStage<6, [FPC_RSV0, FPC_RSV1], 0>, + InstrStage<6, [MEC_RSV]>] >, + + // arithmetic fp parallel + InstrItinData<IIC_SSE_ALU_F32P_RR, [InstrStage<3, [FPC_RSV0, FPC_RSV1]>] >, + InstrItinData<IIC_SSE_ALU_F32P_RM, [InstrStage<3, [FPC_RSV0, FPC_RSV1], 0>, + InstrStage<3, [MEC_RSV]>] >, + InstrItinData<IIC_SSE_ALU_F64P_RR, [InstrStage<4, [FPC_RSV0, FPC_RSV1]>] >, + InstrItinData<IIC_SSE_ALU_F64P_RM, [InstrStage<4, [FPC_RSV0, FPC_RSV1], 0>, + InstrStage<4, [MEC_RSV]>] >, + InstrItinData<IIC_SSE_MUL_F32P_RR, [InstrStage<2, [FPC_RSV0, FPC_RSV1]>] >, + InstrItinData<IIC_SSE_MUL_F32P_RM, [InstrStage<2, [FPC_RSV0, FPC_RSV1], 0>, + InstrStage<2, [MEC_RSV]>] >, + InstrItinData<IIC_SSE_MUL_F64P_RR, 
[InstrStage<4, [FPC_RSV0, FPC_RSV1]>] >, + InstrItinData<IIC_SSE_MUL_F64P_RM, [InstrStage<4, [FPC_RSV0, FPC_RSV1], 0>, + InstrStage<4, [MEC_RSV]>] >, + InstrItinData<IIC_SSE_DIV_F32P_RR, [InstrStage<27, [FPC_RSV0, FPC_RSV1]>] >, + InstrItinData<IIC_SSE_DIV_F32P_RM, [InstrStage<27, [FPC_RSV0, FPC_RSV1], 0>, + InstrStage<27, [MEC_RSV]>] >, + InstrItinData<IIC_SSE_DIV_F64P_RR, [InstrStage<27, [FPC_RSV0, FPC_RSV1]>] >, + InstrItinData<IIC_SSE_DIV_F64P_RM, [InstrStage<27, [FPC_RSV0, FPC_RSV1], 0>, + InstrStage<27, [MEC_RSV]>] >, + + // bitwise parallel + InstrItinData<IIC_SSE_BIT_P_RR, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >, + InstrItinData<IIC_SSE_BIT_P_RM, [InstrStage<1, [FPC_RSV0, FPC_RSV1], 0>, + InstrStage<1, [MEC_RSV]>] >, + + // arithmetic int parallel + InstrItinData<IIC_SSE_INTALU_P_RR, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >, + InstrItinData<IIC_SSE_INTALU_P_RM, [InstrStage<1, [FPC_RSV0, FPC_RSV1], 0>, + InstrStage<1, [MEC_RSV]>] >, + InstrItinData<IIC_SSE_INTALUQ_P_RR, [InstrStage<4, [FPC_RSV0, FPC_RSV1]>] >, + InstrItinData<IIC_SSE_INTALUQ_P_RM, [InstrStage<4, [FPC_RSV0, FPC_RSV1], 0>, + InstrStage<4, [MEC_RSV]>] >, + + // multiply int parallel + InstrItinData<IIC_SSE_INTMUL_P_RR, [InstrStage<5, [FPC_RSV0]>] >, + InstrItinData<IIC_SSE_INTMUL_P_RM, [InstrStage<5, [FPC_RSV0], 0>, + InstrStage<5, [MEC_RSV]>] >, + + // shift parallel + InstrItinData<IIC_SSE_INTSH_P_RR, [InstrStage<2, [FPC_RSV0]>] >, + InstrItinData<IIC_SSE_INTSH_P_RM, [InstrStage<2, [FPC_RSV0], 0>, + InstrStage<2, [MEC_RSV]>] >, + InstrItinData<IIC_SSE_INTSH_P_RI, [InstrStage<1, [FPC_RSV0]>] >, + + InstrItinData<IIC_SSE_INTSHDQ_P_RI, [InstrStage<1, [FPC_RSV0]>] >, + + InstrItinData<IIC_SSE_SHUFP, [InstrStage<1, [FPC_RSV0]>] >, + InstrItinData<IIC_SSE_PSHUF_RI, [InstrStage<1, [FPC_RSV0]>] >, + InstrItinData<IIC_SSE_PSHUF_MI, [InstrStage<1, [FPC_RSV0], 0>, + InstrStage<1, [MEC_RSV]>] >, + + InstrItinData<IIC_SSE_UNPCK, [InstrStage<1, [FPC_RSV0]>] >, + + InstrItinData<IIC_SSE_SQRTPS_RR, [InstrStage<26, [FPC_RSV0]>] >, + InstrItinData<IIC_SSE_SQRTPS_RM, [InstrStage<26, [FPC_RSV0], 0>, + InstrStage<26, [MEC_RSV]>] >, + InstrItinData<IIC_SSE_SQRTSS_RR, [InstrStage<13, [FPC_RSV0]>] >, + InstrItinData<IIC_SSE_SQRTSS_RM, [InstrStage<13, [FPC_RSV0], 0>, + InstrStage<13, [MEC_RSV]>] >, + InstrItinData<IIC_SSE_SQRTPD_RR, [InstrStage<26, [FPC_RSV0]>] >, + InstrItinData<IIC_SSE_SQRTPD_RM, [InstrStage<26, [FPC_RSV0], 0>, + InstrStage<26, [MEC_RSV]>] >, + InstrItinData<IIC_SSE_SQRTSD_RR, [InstrStage<13, [FPC_RSV0]>] >, + InstrItinData<IIC_SSE_SQRTSD_RM, [InstrStage<13, [FPC_RSV0], 0>, + InstrStage<13, [MEC_RSV]>] >, + + InstrItinData<IIC_SSE_RCPP_RR, [InstrStage<9, [FPC_RSV0]>] >, + InstrItinData<IIC_SSE_RCPP_RM, [InstrStage<9, [FPC_RSV0], 0>, + InstrStage<9, [MEC_RSV]>] >, + InstrItinData<IIC_SSE_RCPS_RR, [InstrStage<4, [FPC_RSV0]>] >, + InstrItinData<IIC_SSE_RCPS_RM, [InstrStage<4, [FPC_RSV0], 0>, + InstrStage<4, [MEC_RSV]>] >, + + InstrItinData<IIC_SSE_MOVMSK, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >, + InstrItinData<IIC_SSE_MASKMOV, [InstrStage<5, [FPC_RSV0, FPC_RSV1]>] >, + + InstrItinData<IIC_SSE_PEXTRW, [InstrStage<4, [FPC_RSV0, FPC_RSV1]>] >, + InstrItinData<IIC_SSE_PINSRW, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >, + + InstrItinData<IIC_SSE_PABS_RR, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >, + InstrItinData<IIC_SSE_PABS_RM, [InstrStage<1, [FPC_RSV0, FPC_RSV1], 0>, + InstrStage<1, [MEC_RSV]>] >, + + InstrItinData<IIC_SSE_MOV_S_RR, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >, + InstrItinData<IIC_SSE_MOV_S_RM, [InstrStage<1, [FPC_RSV0, 
FPC_RSV1], 0>, + InstrStage<1, [MEC_RSV]>] >, + InstrItinData<IIC_SSE_MOV_S_MR, [InstrStage<1, [FPC_RSV0, FPC_RSV1], 0>, + InstrStage<1, [MEC_RSV]>] >, + + InstrItinData<IIC_SSE_MOVA_P_RR, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >, + InstrItinData<IIC_SSE_MOVA_P_RM, [InstrStage<1, [FPC_RSV0, FPC_RSV1], 0>, + InstrStage<1, [MEC_RSV]>] >, + InstrItinData<IIC_SSE_MOVA_P_MR, [InstrStage<1, [FPC_RSV0, FPC_RSV1], 0>, + InstrStage<1, [MEC_RSV]>] >, + + InstrItinData<IIC_SSE_MOVU_P_RR, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >, + InstrItinData<IIC_SSE_MOVU_P_RM, [InstrStage<1, [FPC_RSV0, FPC_RSV1], 0>, + InstrStage<1, [MEC_RSV]>] >, + InstrItinData<IIC_SSE_MOVU_P_MR, [InstrStage<1, [FPC_RSV0, FPC_RSV1], 0>, + InstrStage<1, [MEC_RSV]>] >, + + InstrItinData<IIC_SSE_MOV_LH, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >, + + InstrItinData<IIC_SSE_LDDQU, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >, + + InstrItinData<IIC_SSE_MOVDQ, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >, + InstrItinData<IIC_SSE_MOVD_ToGP, [InstrStage<1, [IEC_RSV0, IEC_RSV1], 0>, + InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >, + InstrItinData<IIC_SSE_MOVQ_RR, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >, + + InstrItinData<IIC_SSE_MOVNT, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + + InstrItinData<IIC_SSE_PREFETCH, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_SSE_PAUSE, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_SSE_LFENCE, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_SSE_MFENCE, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_SSE_SFENCE, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_SSE_LDMXCSR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_SSE_STMXCSR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + + InstrItinData<IIC_SSE_PHADDSUBD_RR, [InstrStage<6, [FPC_RSV0, FPC_RSV1]>] >, + InstrItinData<IIC_SSE_PHADDSUBD_RM, [InstrStage<6, [FPC_RSV0, FPC_RSV1], 0>, + InstrStage<6, [MEC_RSV]>] >, + InstrItinData<IIC_SSE_PHADDSUBSW_RR, [InstrStage<9, [FPC_RSV0, FPC_RSV1]>] >, + InstrItinData<IIC_SSE_PHADDSUBSW_RM, [InstrStage<9, [FPC_RSV0, FPC_RSV1], 0>, + InstrStage<9, [MEC_RSV]>] >, + InstrItinData<IIC_SSE_PHADDSUBW_RR, [InstrStage<9, [FPC_RSV0, FPC_RSV1], 0>, + InstrStage<9, [MEC_RSV]>] >, + InstrItinData<IIC_SSE_PHADDSUBW_RM, [InstrStage<9, [FPC_RSV0, FPC_RSV1], 0>, + InstrStage<9, [MEC_RSV]>] >, + InstrItinData<IIC_SSE_PSHUFB_RR, [InstrStage<5, [FPC_RSV0, FPC_RSV1]>] >, + InstrItinData<IIC_SSE_PSHUFB_RM, [InstrStage<5, [FPC_RSV0, FPC_RSV1], 0>, + InstrStage<5, [MEC_RSV]>] >, + InstrItinData<IIC_SSE_PSIGN_RR, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >, + InstrItinData<IIC_SSE_PSIGN_RM, [InstrStage<1, [FPC_RSV0, FPC_RSV1], 0>, + InstrStage<1, [MEC_RSV]>] >, + + InstrItinData<IIC_SSE_PMADD, [InstrStage<5, [FPC_RSV0, FPC_RSV1]>] >, + InstrItinData<IIC_SSE_PMULHRSW, [InstrStage<5, [FPC_RSV0, FPC_RSV1]>] >, + InstrItinData<IIC_SSE_PALIGNRR, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >, + InstrItinData<IIC_SSE_PALIGNRM, [InstrStage<1, [FPC_RSV0, FPC_RSV1], 0>, + InstrStage<1, [MEC_RSV]>] >, + InstrItinData<IIC_SSE_MWAIT, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_SSE_MONITOR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + + // conversions + // to/from PD ... 
+ InstrItinData<IIC_SSE_CVT_PD_RR, [InstrStage<5, [FPC_RSV0, FPC_RSV1]>] >, + InstrItinData<IIC_SSE_CVT_PD_RM, [InstrStage<5, [FPC_RSV0, FPC_RSV1], 0>, + InstrStage<5, [MEC_RSV]>] >, + // to/from PS except to/from PD and PS2PI + InstrItinData<IIC_SSE_CVT_PS_RR, [InstrStage<4, [FPC_RSV0, FPC_RSV1]>] >, + InstrItinData<IIC_SSE_CVT_PS_RM, [InstrStage<4, [FPC_RSV0, FPC_RSV1], 0>, + InstrStage<4, [MEC_RSV]>] >, + InstrItinData<IIC_SSE_CVT_Scalar_RR, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >, + InstrItinData<IIC_SSE_CVT_Scalar_RM, [InstrStage<1, [FPC_RSV0, FPC_RSV1], 0>, + InstrStage<1, [MEC_RSV]>] >, + InstrItinData<IIC_SSE_CVT_SS2SI32_RR, [InstrStage<4, [FPC_RSV0, FPC_RSV1]>] >, + InstrItinData<IIC_SSE_CVT_SS2SI32_RM, [InstrStage<4, [FPC_RSV0, FPC_RSV1], 0>, + InstrStage<4, [MEC_RSV]>] >, + InstrItinData<IIC_SSE_CVT_SS2SI64_RR, [InstrStage<4, [FPC_RSV0, FPC_RSV1]>] >, + InstrItinData<IIC_SSE_CVT_SS2SI64_RM, [InstrStage<4, [FPC_RSV0, FPC_RSV1], 0>, + InstrStage<4, [MEC_RSV]>] >, + InstrItinData<IIC_SSE_CVT_SD2SI_RR, [InstrStage<4, [FPC_RSV0, FPC_RSV1]>] >, + InstrItinData<IIC_SSE_CVT_SD2SI_RM, [InstrStage<4, [FPC_RSV0, FPC_RSV1], 0>, + InstrStage<4, [MEC_RSV]>] >, + + // MMX MOVs + InstrItinData<IIC_MMX_MOV_MM_RM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_MMX_MOV_REG_MM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_MMX_MOVQ_RM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_MMX_MOVQ_RR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + // other MMX + InstrItinData<IIC_MMX_ALU_RM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_MMX_ALU_RR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_MMX_ALUQ_RM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_MMX_ALUQ_RR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_MMX_PHADDSUBW_RM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_MMX_PHADDSUBW_RR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_MMX_PHADDSUBD_RM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_MMX_PHADDSUBD_RR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_MMX_PMUL, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_MMX_MISC_FUNC_MEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_MMX_MISC_FUNC_REG, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_MMX_PSADBW, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_MMX_SHIFT_RI, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_MMX_SHIFT_RM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_MMX_SHIFT_RR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_MMX_UNPCK_H_RM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_MMX_UNPCK_H_RR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_MMX_UNPCK_L, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_MMX_PCK_RM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_MMX_PCK_RR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_MMX_PSHUF, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_MMX_PEXTR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_MMX_PINSRW, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_MMX_MASKMOV, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + // conversions + // from/to PD + InstrItinData<IIC_MMX_CVT_PD_RR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_MMX_CVT_PD_RM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + // from/to PI + InstrItinData<IIC_MMX_CVT_PS_RR, 
[InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_MMX_CVT_PS_RM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + + InstrItinData<IIC_CMPX_LOCK, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_CMPX_LOCK_8, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_CMPX_LOCK_8B, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_CMPX_LOCK_16B, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + + InstrItinData<IIC_XADD_LOCK_MEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_XADD_LOCK_MEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + + InstrItinData<IIC_FILD, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_FLD, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_FLD80, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + + InstrItinData<IIC_FST, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_FST80, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_FIST, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + + InstrItinData<IIC_FLDZ, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_FUCOM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_FUCOMI, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_FCOMI, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_FNSTSW, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_FNSTCW, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_FLDCW, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_FNINIT, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_FFREE, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_FNCLEX, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_WAIT, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_FXAM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_FNOP, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_FLDL, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_F2XM1, [InstrStage<88, [FPC_RSV0, FPC_RSV1]>] >, + InstrItinData<IIC_FYL2X, [InstrStage<296, [FPC_RSV0, FPC_RSV1]>] >, + InstrItinData<IIC_FPTAN, [InstrStage<281, [FPC_RSV0, FPC_RSV1]>] >, + InstrItinData<IIC_FPATAN, [InstrStage<296, [FPC_RSV0, FPC_RSV1]>] >, + InstrItinData<IIC_FXTRACT, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_FPREM1, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_FPSTP, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_FPREM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_FYL2XP1, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_FSINCOS, [InstrStage<281, [FPC_RSV0, FPC_RSV1]>] >, + InstrItinData<IIC_FRNDINT, [InstrStage<25, [FPC_RSV0, FPC_RSV1]>] >, + InstrItinData<IIC_FSCALE, [InstrStage<74, [FPC_RSV0, FPC_RSV1]>] >, + InstrItinData<IIC_FCOMPP, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_FXSAVE, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_FXRSTOR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_FXCH, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >, + + // System instructions + InstrItinData<IIC_CPUID, [InstrStage<60, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_INT, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_INT3, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_INVD, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_INVLPG, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_IRET, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_HLT, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + 
InstrItinData<IIC_LXS, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_LTR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_RDTSC, [InstrStage<30, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_RSM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_SIDT, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_SGDT, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_SLDT, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_STR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_SWAPGS, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_SYSCALL, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_SYS_ENTER_EXIT, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + + InstrItinData<IIC_IN_RR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_IN_RI, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_OUT_RR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_OUT_IR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_INS, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + + InstrItinData<IIC_MOV_REG_DR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_MOV_DR_REG, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + // worst case for mov REG_CRx + InstrItinData<IIC_MOV_REG_CR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_MOV_CR_REG, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + + InstrItinData<IIC_MOV_REG_SR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_MOV_MEM_SR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_MOV_SR_REG, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_MOV_SR_MEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + // LAR + InstrItinData<IIC_LAR_RM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_LAR_RR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + // LSL + InstrItinData<IIC_LSL_RM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_LSL_RR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + + InstrItinData<IIC_LGDT, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_LIDT, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_LLDT_REG, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_LLDT_MEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + // push control register, segment registers + InstrItinData<IIC_PUSH_CS, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_PUSH_SR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + // pop control register, segment registers + InstrItinData<IIC_POP_SR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_POP_SR_SS, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + // VERR, VERW + InstrItinData<IIC_VERR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_VERW_REG, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_VERW_MEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + // WRMSR, RDMSR + InstrItinData<IIC_WRMSR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_RDMSR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_RDPMC, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + // SMSW, LMSW + InstrItinData<IIC_SMSW, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_LMSW_REG, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_LMSW_MEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + + InstrItinData<IIC_ENTER, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_LEAVE, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + + InstrItinData<IIC_POP_MEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + 
InstrItinData<IIC_POP_REG16, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_POP_REG, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_POP_F, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_POP_FD, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_POP_A, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + + InstrItinData<IIC_PUSH_IMM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_PUSH_MEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_PUSH_REG, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_PUSH_F, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_PUSH_A, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + + InstrItinData<IIC_BSWAP, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_BIT_SCAN_MEM, [InstrStage<10, [IEC_RSV0, IEC_RSV1], 0>, + InstrStage<10, [MEC_RSV]>] >, + InstrItinData<IIC_BIT_SCAN_REG, [InstrStage<10, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_MOVS, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_STOS, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_SCAS, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_CMPS, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_MOV, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_MOV_MEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1], 0>, + InstrStage<1, [MEC_RSV]>] >, + InstrItinData<IIC_AHF, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_BT_MI, [InstrStage<1, [IEC_RSV0, IEC_RSV1], 0>, + InstrStage<1, [MEC_RSV]>] >, + InstrItinData<IIC_BT_MR, [InstrStage<1, [IEC_RSV0, IEC_RSV1], 0>, + InstrStage<1, [MEC_RSV]>] >, + InstrItinData<IIC_BT_RI, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_BT_RR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_BTX_MI, [InstrStage<1, [IEC_RSV0, IEC_RSV1], 0>, + InstrStage<1, [MEC_RSV]>] >, + InstrItinData<IIC_BTX_MR, [InstrStage<1, [IEC_RSV0, IEC_RSV1], 0>, + InstrStage<1, [MEC_RSV]>] >, + InstrItinData<IIC_BTX_RI, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_BTX_RR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_XCHG_REG, [InstrStage<5, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_XCHG_MEM, [InstrStage<5, [IEC_RSV0, IEC_RSV1], 0>, + InstrStage<5, [MEC_RSV]>] >, + InstrItinData<IIC_XADD_REG, [InstrStage<5, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_XADD_MEM, [InstrStage<5, [IEC_RSV0, IEC_RSV1], 0>, + InstrStage<5, [MEC_RSV]>] >, + InstrItinData<IIC_CMPXCHG_MEM, [InstrStage<6, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_CMPXCHG_REG, [InstrStage<6, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_CMPXCHG_MEM8, [InstrStage<6, [IEC_RSV0, IEC_RSV1], 0>, + InstrStage<6, [MEC_RSV]>] >, + InstrItinData<IIC_CMPXCHG_REG8, [InstrStage<6, [IEC_RSV0, IEC_RSV1], 0>, + InstrStage<6, [MEC_RSV]>] >, + InstrItinData<IIC_CMPXCHG_8B, [InstrStage<6, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_CMPXCHG_16B, [InstrStage<6, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_LODS, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_OUTS, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_CLC, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_CLD, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_CLI, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_CMC, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_CLTS, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_STC, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_STI, [InstrStage<1, 
[IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_STD, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_XLAT, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_AAA, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_AAD, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_AAM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_AAS, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_DAA, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_DAS, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_BOUND, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_ARPL_REG, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_ARPL_MEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1], 0>, + InstrStage<1, [MEC_RSV]>] >, + InstrItinData<IIC_MOVBE, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_AES, [InstrStage<8, [FPC_RSV0]>] >, + InstrItinData<IIC_BLEND_NOMEM, [InstrStage<4, [FPC_RSV0, FPC_RSV1]>] >, + InstrItinData<IIC_BLEND_MEM, [InstrStage<4, [FPC_RSV0, FPC_RSV1], 0>, + InstrStage<4, [MEC_RSV]>] >, + InstrItinData<IIC_BIT_SCAN_MEM, [InstrStage<10, [IEC_RSV0, IEC_RSV1], 0>, + InstrStage<10, [MEC_RSV]>] >, + InstrItinData<IIC_BIT_SCAN_REG, [InstrStage<10, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_CBW, [InstrStage<4, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_CRC32_REG, [InstrStage<3, [IEC_RSV0, IEC_RSV1]>] >, + InstrItinData<IIC_CRC32_MEM, [InstrStage<3, [IEC_RSV0, IEC_RSV1], 0>, + InstrStage<3, [MEC_RSV]>] >, + InstrItinData<IIC_SSE_DPPD_RR, [InstrStage<12, [FPC_RSV0, FPC_RSV1]>] >, + InstrItinData<IIC_SSE_DPPD_RM, [InstrStage<12, [FPC_RSV0, FPC_RSV1], 0>, + InstrStage<12, [MEC_RSV]>] >, + InstrItinData<IIC_SSE_DPPS_RR, [InstrStage<15, [FPC_RSV0, FPC_RSV1]>] >, + InstrItinData<IIC_SSE_DPPS_RM, [InstrStage<15, [FPC_RSV0, FPC_RSV1], 0>, + InstrStage<15, [MEC_RSV]>] >, + InstrItinData<IIC_MMX_EMMS, [InstrStage<10, [FPC_RSV0, FPC_RSV1]>] >, + InstrItinData<IIC_SSE_EXTRACTPS_RR, [InstrStage<4, [FPC_RSV0, FPC_RSV1]>] >, + InstrItinData<IIC_SSE_EXTRACTPS_RM, [InstrStage<4, [FPC_RSV0, FPC_RSV1], 0>, + InstrStage<4, [MEC_RSV]>] >, + InstrItinData<IIC_SSE_INSERTPS_RR, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >, + InstrItinData<IIC_SSE_INSERTPS_RM, [InstrStage<1, [FPC_RSV0, FPC_RSV1], 0>, + InstrStage<1, [MEC_RSV]>] >, + InstrItinData<IIC_SSE_MPSADBW_RR, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >, + InstrItinData<IIC_SSE_MPSADBW_RM, [InstrStage<1, [FPC_RSV0, FPC_RSV1], 0>, + InstrStage<1, [MEC_RSV]>] >, + InstrItinData<IIC_SSE_PMULLD_RR, [InstrStage<11, [FPC_RSV0, FPC_RSV1]>] >, + InstrItinData<IIC_SSE_PMULLD_RM, [InstrStage<11, [FPC_RSV0, FPC_RSV1], 0>, + InstrStage<11, [MEC_RSV]>] >, + InstrItinData<IIC_SSE_ROUNDPS_REG, [InstrStage<5, [FPC_RSV0, FPC_RSV1]>] >, + InstrItinData<IIC_SSE_ROUNDPS_MEM, [InstrStage<5, [FPC_RSV0, FPC_RSV1], 0>, + InstrStage<5, [MEC_RSV]>] >, + InstrItinData<IIC_SSE_ROUNDPD_REG, [InstrStage<4, [FPC_RSV0, FPC_RSV1]>] >, + InstrItinData<IIC_SSE_ROUNDPD_MEM, [InstrStage<4, [FPC_RSV0, FPC_RSV1], 0>, + InstrStage<4, [MEC_RSV]>] >, + InstrItinData<IIC_SSE_POPCNT_RR, [InstrStage<4, [IEC_RSV1]>] >, + InstrItinData<IIC_SSE_POPCNT_RM, [InstrStage<4, [IEC_RSV1], 0>, + InstrStage<4, [MEC_RSV]>] >, + InstrItinData<IIC_SSE_PCLMULQDQ_RR, [InstrStage<10, [IEC_RSV1]>] >, + InstrItinData<IIC_SSE_PCLMULQDQ_RM, [InstrStage<10, [IEC_RSV1], 0>, + InstrStage<10, [MEC_RSV]>] >, + + InstrItinData<IIC_NOP, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] > + ]>; + +// Silvermont machine model. 
+def SLMModel : SchedMachineModel { + let IssueWidth = 2; // Allows 2 instructions per scheduling group. + let MinLatency = 1; // InstrStage cycles overrides MinLatency. + // OperandCycles may be used for expected latency. + let LoadLatency = 3; // Expected cycles, may be overriden by OperandCycles. + let HighLatency = 30;// Expected, may be overriden by OperandCycles. + + let Itineraries = SLMItineraries; +} diff --git a/contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp b/contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp index f934fdd..b9c620f 100644 --- a/contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp +++ b/contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp @@ -27,7 +27,7 @@ X86SelectionDAGInfo::~X86SelectionDAGInfo() { } SDValue -X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, DebugLoc dl, +X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl, SDValue Chain, SDValue Dst, SDValue Src, SDValue Size, unsigned Align, @@ -46,8 +46,6 @@ X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, DebugLoc dl, !ConstantSize || ConstantSize->getZExtValue() > Subtarget->getMaxInlineSizeThreshold()) { - SDValue InFlag(0, 0); - // Check to see if there is a specialized entry-point for memory zeroing. ConstantSDNode *V = dyn_cast<ConstantSDNode>(Src); @@ -175,7 +173,7 @@ X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, DebugLoc dl, } SDValue -X86SelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, DebugLoc dl, +X86SelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl, SDValue Chain, SDValue Dst, SDValue Src, SDValue Size, unsigned Align, bool isVolatile, bool AlwaysInline, diff --git a/contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.h b/contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.h index d1d66fe..d728af5 100644 --- a/contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.h +++ b/contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.h @@ -34,7 +34,7 @@ public: ~X86SelectionDAGInfo(); virtual - SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, DebugLoc dl, + SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl, SDValue Chain, SDValue Dst, SDValue Src, SDValue Size, unsigned Align, @@ -42,7 +42,7 @@ public: MachinePointerInfo DstPtrInfo) const; virtual - SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, DebugLoc dl, + SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl, SDValue Chain, SDValue Dst, SDValue Src, SDValue Size, unsigned Align, diff --git a/contrib/llvm/lib/Target/X86/X86Subtarget.cpp b/contrib/llvm/lib/Target/X86/X86Subtarget.cpp index 74da2a9..01353b2 100644 --- a/contrib/llvm/lib/Target/X86/X86Subtarget.cpp +++ b/contrib/llvm/lib/Target/X86/X86Subtarget.cpp @@ -276,20 +276,29 @@ void X86Subtarget::AutoDetectSubtargetFeatures() { (Family == 6 && Model == 0x2F) || // Westmere: Westmere-EX (Family == 6 && Model == 0x2A) || // SandyBridge (Family == 6 && Model == 0x2D) || // SandyBridge: SandyBridge-E* - (Family == 6 && Model == 0x3A))) {// IvyBridge + (Family == 6 && Model == 0x3A) || // IvyBridge + (Family == 6 && Model == 0x3E) || // IvyBridge EP + (Family == 6 && Model == 0x3C) || // Haswell + (Family == 6 && Model == 0x3F) || // ... + (Family == 6 && Model == 0x45) || // ... + (Family == 6 && Model == 0x46))) { // ... IsUAMemFast = true; ToggleFeature(X86::FeatureFastUAMem); } - // Set processor type. Currently only Atom is detected. + // Set processor type. Currently only Atom or Silvermont (SLM) is detected. 
if (Family == 6 && - (Model == 28 || Model == 38 || Model == 39 - || Model == 53 || Model == 54)) { + (Model == 28 || Model == 38 || Model == 39 || + Model == 53 || Model == 54)) { X86ProcFamily = IntelAtom; UseLeaForSP = true; ToggleFeature(X86::FeatureLeaForSP); } + else if (Family == 6 && + (Model == 55 || Model == 74 || Model == 77)) { + X86ProcFamily = IntelSLM; + } unsigned MaxExtLevel; X86_MC::GetCpuIDAndInfo(0x80000000, &MaxExtLevel, &EBX, &ECX, &EDX); @@ -351,14 +360,38 @@ void X86Subtarget::AutoDetectSubtargetFeatures() { HasRTM = true; ToggleFeature(X86::FeatureRTM); } - if (IsIntel && ((EBX >> 19) & 0x1)) { - HasADX = true; - ToggleFeature(X86::FeatureADX); + if (IsIntel && ((EBX >> 16) & 0x1)) { + X86SSELevel = AVX512F; + ToggleFeature(X86::FeatureAVX512); } if (IsIntel && ((EBX >> 18) & 0x1)) { HasRDSEED = true; ToggleFeature(X86::FeatureRDSEED); } + if (IsIntel && ((EBX >> 19) & 0x1)) { + HasADX = true; + ToggleFeature(X86::FeatureADX); + } + if (IsIntel && ((EBX >> 26) & 0x1)) { + HasPFI = true; + ToggleFeature(X86::FeaturePFI); + } + if (IsIntel && ((EBX >> 27) & 0x1)) { + HasERI = true; + ToggleFeature(X86::FeatureERI); + } + if (IsIntel && ((EBX >> 28) & 0x1)) { + HasCDI = true; + ToggleFeature(X86::FeatureCDI); + } + if (IsIntel && ((EBX >> 29) & 0x1)) { + HasSHA = true; + ToggleFeature(X86::FeatureSHA); + } + } + if (IsAMD && ((ECX >> 21) & 0x1)) { + HasTBM = true; + ToggleFeature(X86::FeatureTBM); } } } @@ -416,8 +449,8 @@ void X86Subtarget::resetSubtargetFeatures(StringRef CPU, StringRef FS) { // Make sure 64-bit features are available in 64-bit mode. if (In64BitMode) { - HasX86_64 = true; ToggleFeature(X86::Feature64Bit); - HasCMov = true; ToggleFeature(X86::FeatureCMOV); + if (!HasX86_64) { HasX86_64 = true; ToggleFeature(X86::Feature64Bit); } + if (!HasCMov) { HasCMov = true; ToggleFeature(X86::FeatureCMOV); } if (X86SSELevel < SSE2) { X86SSELevel = SSE2; @@ -429,9 +462,9 @@ void X86Subtarget::resetSubtargetFeatures(StringRef CPU, StringRef FS) { // CPUName may have been set by the CPU detection code. Make sure the // new MCSchedModel is used. 
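Note: the feature-detection hunk above maps CPUID bits straight onto subtarget flags: leaf 7 EBX bit 16 enables AVX-512F, bits 18/19 RDSEED/ADX, bits 26-29 the AVX-512 PF/ER/CD extensions and SHA, and on AMD parts leaf 0x80000001 ECX bit 21 enables TBM. Below is a minimal standalone sketch of the same decoding, assuming the raw EBX/ECX values have already been fetched with the CPUID instruction; the struct and function names are illustrative, not LLVM API.

  #include <cstdint>

  // Illustrative decoder mirroring the bit tests in the hunk above; the two
  // register values are assumed to come from CPUID(EAX=7,ECX=0).EBX and
  // CPUID(EAX=0x80000001).ECX respectively.
  struct DetectedFeatures {
    bool AVX512F, RDSEED, ADX, PFI, ERI, CDI, SHA, TBM;
  };

  static DetectedFeatures decodeFeatureBits(uint32_t Leaf7EBX,
                                            uint32_t ExtLeaf1ECX,
                                            bool IsIntel, bool IsAMD) {
    DetectedFeatures F = DetectedFeatures();
    if (IsIntel) {
      F.AVX512F = (Leaf7EBX >> 16) & 1; // AVX-512 foundation
      F.RDSEED  = (Leaf7EBX >> 18) & 1;
      F.ADX     = (Leaf7EBX >> 19) & 1;
      F.PFI     = (Leaf7EBX >> 26) & 1; // AVX-512 prefetch
      F.ERI     = (Leaf7EBX >> 27) & 1; // AVX-512 exponential/reciprocal
      F.CDI     = (Leaf7EBX >> 28) & 1; // AVX-512 conflict detection
      F.SHA     = (Leaf7EBX >> 29) & 1;
    }
    if (IsAMD)
      F.TBM = (ExtLeaf1ECX >> 21) & 1;  // trailing bit manipulation
    return F;
  }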
- InitMCProcessorInfo(CPUName, FS); + InitCPUSchedModel(CPUName); - if (X86ProcFamily == IntelAtom) + if (X86ProcFamily == IntelAtom || X86ProcFamily == IntelSLM) PostRAScheduler = true; InstrItins = getInstrItineraryForCPU(CPUName); @@ -468,6 +501,7 @@ void X86Subtarget::initializeEnvironment() { HasFMA = false; HasFMA4 = false; HasXOP = false; + HasTBM = false; HasMOVBE = false; HasRDRAND = false; HasF16C = false; @@ -477,7 +511,11 @@ void X86Subtarget::initializeEnvironment() { HasBMI2 = false; HasRTM = false; HasHLE = false; + HasERI = false; + HasCDI = false; + HasPFI = false; HasADX = false; + HasSHA = false; HasPRFCHW = false; HasRDSEED = false; IsBTMemSlow = false; diff --git a/contrib/llvm/lib/Target/X86/X86Subtarget.h b/contrib/llvm/lib/Target/X86/X86Subtarget.h index 01a28d0..dd8c081 100644 --- a/contrib/llvm/lib/Target/X86/X86Subtarget.h +++ b/contrib/llvm/lib/Target/X86/X86Subtarget.h @@ -42,7 +42,7 @@ enum Style { class X86Subtarget : public X86GenSubtargetInfo { protected: enum X86SSEEnum { - NoMMXSSE, MMX, SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42, AVX, AVX2 + NoMMXSSE, MMX, SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42, AVX, AVX2, AVX512F }; enum X863DNowEnum { @@ -50,7 +50,7 @@ protected: }; enum X86ProcFamilyEnum { - Others, IntelAtom + Others, IntelAtom, IntelSLM }; /// X86ProcFamily - X86 processor family: Intel Atom, and others @@ -97,6 +97,9 @@ protected: /// HasXOP - Target has XOP instructions bool HasXOP; + /// HasTBM - Target has TBM instructions. + bool HasTBM; + /// HasMOVBE - True if the processor has the MOVBE instruction. bool HasMOVBE; @@ -127,6 +130,9 @@ protected: /// HasADX - Processor has ADX instructions. bool HasADX; + /// HasSHA - Processor has SHA instructions. + bool HasSHA; + /// HasPRFCHW - Processor has PRFCHW instructions. bool HasPRFCHW; @@ -169,6 +175,15 @@ protected: /// address generation (AG) time. bool LEAUsesAG; + /// Processor has AVX-512 PreFetch Instructions + bool HasPFI; + + /// Processor has AVX-512 Exponential and Reciprocal Instructions + bool HasERI; + + /// Processor has AVX-512 Conflict Detection Instructions + bool HasCDI; + /// stackAlignment - The minimum alignment known to hold of the stack frame on /// entry to the function and which must be maintained by every function. unsigned stackAlignment; @@ -249,6 +264,7 @@ public: bool hasSSE42() const { return X86SSELevel >= SSE42; } bool hasAVX() const { return X86SSELevel >= AVX; } bool hasAVX2() const { return X86SSELevel >= AVX2; } + bool hasAVX512() const { return X86SSELevel >= AVX512F; } bool hasFp256() const { return hasAVX(); } bool hasInt256() const { return hasAVX2(); } bool hasSSE4A() const { return HasSSE4A; } @@ -261,6 +277,7 @@ public: // FIXME: Favor FMA when both are enabled. Is this the right thing to do? 
bool hasFMA4() const { return HasFMA4 && !HasFMA; } bool hasXOP() const { return HasXOP; } + bool hasTBM() const { return HasTBM; } bool hasMOVBE() const { return HasMOVBE; } bool hasRDRAND() const { return HasRDRAND; } bool hasF16C() const { return HasF16C; } @@ -271,6 +288,7 @@ public: bool hasRTM() const { return HasRTM; } bool hasHLE() const { return HasHLE; } bool hasADX() const { return HasADX; } + bool hasSHA() const { return HasSHA; } bool hasPRFCHW() const { return HasPRFCHW; } bool hasRDSEED() const { return HasRDSEED; } bool isBTMemSlow() const { return IsBTMemSlow; } @@ -282,6 +300,9 @@ public: bool padShortFunctions() const { return PadShortFunctions; } bool callRegIndirect() const { return CallRegIndirect; } bool LEAusesAG() const { return LEAUsesAG; } + bool hasCDI() const { return HasCDI; } + bool hasPFI() const { return HasPFI; } + bool hasERI() const { return HasERI; } bool isAtom() const { return X86ProcFamily == IntelAtom; } @@ -298,10 +319,8 @@ public: return (TargetTriple.getEnvironment() == Triple::ELF || TargetTriple.isOSBinFormatELF()); } - bool isTargetLinux() const { return TargetTriple.getOS() == Triple::Linux; } - bool isTargetNaCl() const { - return TargetTriple.getOS() == Triple::NaCl; - } + bool isTargetLinux() const { return TargetTriple.isOSLinux(); } + bool isTargetNaCl() const { return TargetTriple.isOSNaCl(); } bool isTargetNaCl32() const { return isTargetNaCl() && !is64Bit(); } bool isTargetNaCl64() const { return isTargetNaCl() && is64Bit(); } bool isTargetWindows() const { return TargetTriple.getOS() == Triple::Win32; } @@ -314,15 +333,14 @@ public: } bool isTargetEnvMacho() const { return TargetTriple.isEnvironmentMachO(); } + bool isOSWindows() const { return TargetTriple.isOSWindows(); } + bool isTargetWin64() const { - // FIXME: x86_64-cygwin has not been released yet. return In64BitMode && TargetTriple.isOSWindows(); } bool isTargetWin32() const { - // FIXME: Cygwin is included for isTargetWin64 -- should it be included - // here too? - return !In64BitMode && (isTargetMingw() || isTargetWindows()); + return !In64BitMode && (isTargetCygMing() || isTargetWindows()); } bool isPICStyleSet() const { return PICStyle != PICStyles::None; } @@ -367,11 +385,14 @@ public: /// memset with zero passed as the second argument. Otherwise it /// returns null. const char *getBZeroEntry() const; - + /// This function returns true if the target has sincos() routine in its /// compiler runtime or math libraries. bool hasSinCos() const; + /// Enable the MachineScheduler pass for all X86 subtargets. + bool enableMachineScheduler() const LLVM_OVERRIDE { return true; } + /// enablePostRAScheduler - run for Atom optimization. bool enablePostRAScheduler(CodeGenOpt::Level OptLevel, TargetSubtargetInfo::AntiDepBreakMode& Mode, diff --git a/contrib/llvm/lib/Target/X86/X86TargetMachine.cpp b/contrib/llvm/lib/Target/X86/X86TargetMachine.cpp index 00fa47f..ddf580f 100644 --- a/contrib/llvm/lib/Target/X86/X86TargetMachine.cpp +++ b/contrib/llvm/lib/Target/X86/X86TargetMachine.cpp @@ -49,6 +49,7 @@ X86_32TargetMachine::X86_32TargetMachine(const Target &T, StringRef TT, TLInfo(*this), TSInfo(*this), JITInfo(*this) { + initAsmInfo(); } void X86_64TargetMachine::anchor() { } @@ -69,6 +70,7 @@ X86_64TargetMachine::X86_64TargetMachine(const Target &T, StringRef TT, TLInfo(*this), TSInfo(*this), JITInfo(*this) { + initAsmInfo(); } /// X86TargetMachine ctor - Create an X86 target. 
@@ -90,7 +92,7 @@ X86TargetMachine::X86TargetMachine(const Target &T, StringRef TT, } else if (Subtarget.is64Bit()) { // PIC in 64 bit mode is always rip-rel. Subtarget.setPICStyle(PICStyles::RIPRel); - } else if (Subtarget.isTargetCygMing()) { + } else if (Subtarget.isTargetCOFF()) { Subtarget.setPICStyle(PICStyles::None); } else if (Subtarget.isTargetDarwin()) { if (getRelocationModel() == Reloc::PIC_) @@ -112,14 +114,14 @@ X86TargetMachine::X86TargetMachine(const Target &T, StringRef TT, // Command line options for x86 //===----------------------------------------------------------------------===// static cl::opt<bool> -UseVZeroUpper("x86-use-vzeroupper", +UseVZeroUpper("x86-use-vzeroupper", cl::Hidden, cl::desc("Minimize AVX to SSE transition penalty"), cl::init(true)); // Temporary option to control early if-conversion for x86 while adding machine // models. static cl::opt<bool> -X86EarlyIfConv("x86-early-ifcvt", +X86EarlyIfConv("x86-early-ifcvt", cl::Hidden, cl::desc("Enable early if-conversion on X86")); //===----------------------------------------------------------------------===// @@ -130,7 +132,7 @@ void X86TargetMachine::addAnalysisPasses(PassManagerBase &PM) { // Add first the target-independent BasicTTI pass, then our X86 pass. This // allows the X86 pass to delegate to the target independent layer when // appropriate. - PM.add(createBasicTargetTransformInfoPass(getTargetLowering())); + PM.add(createBasicTargetTransformInfoPass(this)); PM.add(createX86TargetTransformInfoPass(this)); } diff --git a/contrib/llvm/lib/Target/X86/X86TargetObjectFile.cpp b/contrib/llvm/lib/Target/X86/X86TargetObjectFile.cpp index 871dacd..086cd4d 100644 --- a/contrib/llvm/lib/Target/X86/X86TargetObjectFile.cpp +++ b/contrib/llvm/lib/Target/X86/X86TargetObjectFile.cpp @@ -25,7 +25,7 @@ getTTypeGlobalReference(const GlobalValue *GV, Mangler *Mang, // On Darwin/X86-64, we can reference dwarf symbols with foo@GOTPCREL+4, which // is an indirect pc-relative reference. if (Encoding & (DW_EH_PE_indirect | DW_EH_PE_pcrel)) { - const MCSymbol *Sym = Mang->getSymbol(GV); + const MCSymbol *Sym = getSymbol(*Mang, GV); const MCExpr *Res = MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_GOTPCREL, getContext()); const MCExpr *Four = MCConstantExpr::Create(4, getContext()); @@ -39,7 +39,7 @@ getTTypeGlobalReference(const GlobalValue *GV, Mangler *Mang, MCSymbol *X86_64MachoTargetObjectFile:: getCFIPersonalitySymbol(const GlobalValue *GV, Mangler *Mang, MachineModuleInfo *MMI) const { - return Mang->getSymbol(GV); + return getSymbol(*Mang, GV); } void @@ -47,3 +47,9 @@ X86LinuxTargetObjectFile::Initialize(MCContext &Ctx, const TargetMachine &TM) { TargetLoweringObjectFileELF::Initialize(Ctx, TM); InitializeELF(TM.Options.UseInitArray); } + +const MCExpr * +X86LinuxTargetObjectFile::getDebugThreadLocalSymbol( + const MCSymbol *Sym) const { + return MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_DTPOFF, getContext()); +} diff --git a/contrib/llvm/lib/Target/X86/X86TargetObjectFile.h b/contrib/llvm/lib/Target/X86/X86TargetObjectFile.h index 9d26d38..79c861d 100644 --- a/contrib/llvm/lib/Target/X86/X86TargetObjectFile.h +++ b/contrib/llvm/lib/Target/X86/X86TargetObjectFile.h @@ -36,6 +36,9 @@ namespace llvm { /// and x86-64. class X86LinuxTargetObjectFile : public TargetLoweringObjectFileELF { virtual void Initialize(MCContext &Ctx, const TargetMachine &TM); + + /// \brief Describe a TLS variable address within debug info. 
+ virtual const MCExpr *getDebugThreadLocalSymbol(const MCSymbol *Sym) const; }; } // end namespace llvm diff --git a/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp index eba9d78..f88a666 100644 --- a/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -33,7 +33,6 @@ void initializeX86TTIPass(PassRegistry &); namespace { class X86TTI : public ImmutablePass, public TargetTransformInfo { - const X86TargetMachine *TM; const X86Subtarget *ST; const X86TargetLowering *TLI; @@ -42,12 +41,12 @@ class X86TTI : public ImmutablePass, public TargetTransformInfo { unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const; public: - X86TTI() : ImmutablePass(ID), TM(0), ST(0), TLI(0) { + X86TTI() : ImmutablePass(ID), ST(0), TLI(0) { llvm_unreachable("This pass cannot be directly constructed"); } X86TTI(const X86TargetMachine *TM) - : ImmutablePass(ID), TM(TM), ST(TM->getSubtargetImpl()), + : ImmutablePass(ID), ST(TM->getSubtargetImpl()), TLI(TM->getTargetLowering()) { initializeX86TTIPass(*PassRegistry::getPassRegistry()); } @@ -101,6 +100,11 @@ public: unsigned Alignment, unsigned AddressSpace) const; + virtual unsigned getAddressComputationCost(Type *PtrTy, bool IsComplex) const; + + virtual unsigned getReductionCost(unsigned Opcode, Type *Ty, + bool IsPairwiseForm) const; + /// @} }; @@ -126,8 +130,8 @@ X86TTI::PopcntSupportKind X86TTI::getPopcntSupport(unsigned TyWidth) const { assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2"); // TODO: Currently the __builtin_popcount() implementation using SSE3 // instructions is inefficient. Once the problem is fixed, we should - // call ST->hasSSE3() instead of ST->hasSSE4(). - return ST->hasSSE41() ? PSK_FastHardware : PSK_Software; + // call ST->hasSSE3() instead of ST->hasPOPCNT(). + return ST->hasPOPCNT() ? PSK_FastHardware : PSK_Software; } unsigned X86TTI::getNumberOfRegisters(bool Vector) const { @@ -173,7 +177,7 @@ unsigned X86TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty, int ISD = TLI->InstructionOpcodeToISD(Opcode); assert(ISD && "Invalid opcode"); - static const CostTblEntry<MVT> AVX2CostTable[] = { + static const CostTblEntry<MVT::SimpleValueType> AVX2CostTable[] = { // Shifts on v4i64/v8i32 on AVX2 is legal even though we declare to // customize them to detect the cases where shift amount is a scalar one. { ISD::SHL, MVT::v4i32, 1 }, @@ -196,17 +200,27 @@ unsigned X86TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty, { ISD::SRA, MVT::v32i8, 32*10 }, // Scalarized. { ISD::SRA, MVT::v16i16, 16*10 }, // Scalarized. { ISD::SRA, MVT::v4i64, 4*10 }, // Scalarized. + + // Vectorizing division is a bad idea. See the SSE2 table for more comments. + { ISD::SDIV, MVT::v32i8, 32*20 }, + { ISD::SDIV, MVT::v16i16, 16*20 }, + { ISD::SDIV, MVT::v8i32, 8*20 }, + { ISD::SDIV, MVT::v4i64, 4*20 }, + { ISD::UDIV, MVT::v32i8, 32*20 }, + { ISD::UDIV, MVT::v16i16, 16*20 }, + { ISD::UDIV, MVT::v8i32, 8*20 }, + { ISD::UDIV, MVT::v4i64, 4*20 }, }; // Look for AVX2 lowering tricks. 
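Note: the cost tables above all follow one pattern: a static array keyed by (ISD opcode, simple value type) is scanned with CostTableLookup and the hit is scaled by the type-legalization factor LT.first; the new SDIV/UDIV rows simply charge a nominal 20 "cycles" per scalar lane so the vectorizers keep division scalar. A rough self-contained sketch of that lookup-and-scale pattern follows, using toy placeholders rather than the real CostTblEntry/MVT types.

  // Toy version of the cost-table pattern used throughout this file.
  struct CostEntry { int Op; int VT; unsigned Cost; };

  // e.g. the division rows above: 20 "cycles" charged per 32-bit lane.
  static const CostEntry Table[] = {
    { /*SDIV*/ 0, /*v8i32*/ 0, 8 * 20 },
    { /*UDIV*/ 1, /*v8i32*/ 0, 8 * 20 },
  };

  static int lookupCost(const CostEntry *Tbl, unsigned Len, int Op, int VT) {
    for (unsigned i = 0; i != Len; ++i)
      if (Tbl[i].Op == Op && Tbl[i].VT == VT)
        return (int)i;
    return -1; // caller falls back to the generic cost model
  }

  static unsigned arithmeticCost(unsigned LegalizationFactor, int Op, int VT,
                                 unsigned FallbackCost) {
    int Idx = lookupCost(Table, sizeof(Table) / sizeof(Table[0]), Op, VT);
    return Idx >= 0 ? LegalizationFactor * Table[Idx].Cost : FallbackCost;
  }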
if (ST->hasAVX2()) { - int Idx = CostTableLookup<MVT>(AVX2CostTable, array_lengthof(AVX2CostTable), - ISD, LT.second); + int Idx = CostTableLookup(AVX2CostTable, ISD, LT.second); if (Idx != -1) return LT.first * AVX2CostTable[Idx].Cost; } - static const CostTblEntry<MVT> SSE2UniformConstCostTable[] = { + static const CostTblEntry<MVT::SimpleValueType> + SSE2UniformConstCostTable[] = { // We don't correctly identify costs of casts because they are marked as // custom. // Constant splats are cheaper for the following instructions. @@ -227,15 +241,13 @@ unsigned X86TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty, if (Op2Info == TargetTransformInfo::OK_UniformConstantValue && ST->hasSSE2()) { - int Idx = CostTableLookup<MVT>(SSE2UniformConstCostTable, - array_lengthof(SSE2UniformConstCostTable), - ISD, LT.second); + int Idx = CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second); if (Idx != -1) return LT.first * SSE2UniformConstCostTable[Idx].Cost; } - static const CostTblEntry<MVT> SSE2CostTable[] = { + static const CostTblEntry<MVT::SimpleValueType> SSE2CostTable[] = { // We don't correctly identify costs of casts because they are marked as // custom. // For some cases, where the shift amount is a scalar we would be able @@ -258,16 +270,30 @@ unsigned X86TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty, { ISD::SRA, MVT::v8i16, 8*10 }, // Scalarized. { ISD::SRA, MVT::v4i32, 4*10 }, // Scalarized. { ISD::SRA, MVT::v2i64, 2*10 }, // Scalarized. + + // It is not a good idea to vectorize division. We have to scalarize it and + // in the process we will often end up having to spilling regular + // registers. The overhead of division is going to dominate most kernels + // anyways so try hard to prevent vectorization of division - it is + // generally a bad idea. Assume somewhat arbitrarily that we have to be able + // to hide "20 cycles" for each lane. + { ISD::SDIV, MVT::v16i8, 16*20 }, + { ISD::SDIV, MVT::v8i16, 8*20 }, + { ISD::SDIV, MVT::v4i32, 4*20 }, + { ISD::SDIV, MVT::v2i64, 2*20 }, + { ISD::UDIV, MVT::v16i8, 16*20 }, + { ISD::UDIV, MVT::v8i16, 8*20 }, + { ISD::UDIV, MVT::v4i32, 4*20 }, + { ISD::UDIV, MVT::v2i64, 2*20 }, }; if (ST->hasSSE2()) { - int Idx = CostTableLookup<MVT>(SSE2CostTable, array_lengthof(SSE2CostTable), - ISD, LT.second); + int Idx = CostTableLookup(SSE2CostTable, ISD, LT.second); if (Idx != -1) return LT.first * SSE2CostTable[Idx].Cost; } - static const CostTblEntry<MVT> AVX1CostTable[] = { + static const CostTblEntry<MVT::SimpleValueType> AVX1CostTable[] = { // We don't have to scalarize unsupported ops. We can issue two half-sized // operations and we only need to extract the upper YMM half. // Two ops + 1 extract + 1 insert = 4. @@ -286,21 +312,19 @@ unsigned X86TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty, // Look for AVX1 lowering tricks. if (ST->hasAVX() && !ST->hasAVX2()) { - int Idx = CostTableLookup<MVT>(AVX1CostTable, array_lengthof(AVX1CostTable), - ISD, LT.second); + int Idx = CostTableLookup(AVX1CostTable, ISD, LT.second); if (Idx != -1) return LT.first * AVX1CostTable[Idx].Cost; } // Custom lowering of vectors. - static const CostTblEntry<MVT> CustomLowered[] = { + static const CostTblEntry<MVT::SimpleValueType> CustomLowered[] = { // A v2i64/v4i64 and multiply is custom lowered as a series of long // multiplies(3), shifts(4) and adds(2). 
{ ISD::MUL, MVT::v2i64, 9 }, { ISD::MUL, MVT::v4i64, 9 }, }; - int Idx = CostTableLookup<MVT>(CustomLowered, array_lengthof(CustomLowered), - ISD, LT.second); + int Idx = CostTableLookup(CustomLowered, ISD, LT.second); if (Idx != -1) return LT.first * CustomLowered[Idx].Cost; @@ -337,7 +361,8 @@ unsigned X86TTI::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) const { std::pair<unsigned, MVT> LTSrc = TLI->getTypeLegalizationCost(Src); std::pair<unsigned, MVT> LTDest = TLI->getTypeLegalizationCost(Dst); - static const TypeConversionCostTblEntry<MVT> SSE2ConvTbl[] = { + static const TypeConversionCostTblEntry<MVT::SimpleValueType> + SSE2ConvTbl[] = { // These are somewhat magic numbers justified by looking at the output of // Intel's IACA, running some kernels and making sure when we take // legalization into account the throughput will be overestimated. @@ -361,9 +386,8 @@ unsigned X86TTI::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) const { }; if (ST->hasSSE2() && !ST->hasAVX()) { - int Idx = ConvertCostTableLookup<MVT>(SSE2ConvTbl, - array_lengthof(SSE2ConvTbl), - ISD, LTDest.second, LTSrc.second); + int Idx = + ConvertCostTableLookup(SSE2ConvTbl, ISD, LTDest.second, LTSrc.second); if (Idx != -1) return LTSrc.first * SSE2ConvTbl[Idx].Cost; } @@ -375,13 +399,17 @@ unsigned X86TTI::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) const { if (!SrcTy.isSimple() || !DstTy.isSimple()) return TargetTransformInfo::getCastInstrCost(Opcode, Dst, Src); - static const TypeConversionCostTblEntry<MVT> AVXConversionTbl[] = { + static const TypeConversionCostTblEntry<MVT::SimpleValueType> + AVXConversionTbl[] = { + { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 1 }, + { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 1 }, { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1 }, { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1 }, { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 1 }, { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 1 }, { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 }, { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 1 }, + { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 2 }, { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i1, 8 }, { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 8 }, @@ -420,9 +448,8 @@ unsigned X86TTI::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) const { }; if (ST->hasAVX()) { - int Idx = ConvertCostTableLookup<MVT>(AVXConversionTbl, - array_lengthof(AVXConversionTbl), - ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()); + int Idx = ConvertCostTableLookup(AVXConversionTbl, ISD, DstTy.getSimpleVT(), + SrcTy.getSimpleVT()); if (Idx != -1) return AVXConversionTbl[Idx].Cost; } @@ -440,7 +467,7 @@ unsigned X86TTI::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, int ISD = TLI->InstructionOpcodeToISD(Opcode); assert(ISD && "Invalid opcode"); - static const CostTblEntry<MVT> SSE42CostTbl[] = { + static const CostTblEntry<MVT::SimpleValueType> SSE42CostTbl[] = { { ISD::SETCC, MVT::v2f64, 1 }, { ISD::SETCC, MVT::v4f32, 1 }, { ISD::SETCC, MVT::v2i64, 1 }, @@ -449,7 +476,7 @@ unsigned X86TTI::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, { ISD::SETCC, MVT::v16i8, 1 }, }; - static const CostTblEntry<MVT> AVX1CostTbl[] = { + static const CostTblEntry<MVT::SimpleValueType> AVX1CostTbl[] = { { ISD::SETCC, MVT::v4f64, 1 }, { ISD::SETCC, MVT::v8f32, 1 }, // AVX1 does not support 8-wide integer compare. 
@@ -459,7 +486,7 @@ unsigned X86TTI::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, { ISD::SETCC, MVT::v32i8, 4 }, }; - static const CostTblEntry<MVT> AVX2CostTbl[] = { + static const CostTblEntry<MVT::SimpleValueType> AVX2CostTbl[] = { { ISD::SETCC, MVT::v4i64, 1 }, { ISD::SETCC, MVT::v8i32, 1 }, { ISD::SETCC, MVT::v16i16, 1 }, @@ -467,19 +494,19 @@ unsigned X86TTI::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, }; if (ST->hasAVX2()) { - int Idx = CostTableLookup<MVT>(AVX2CostTbl, array_lengthof(AVX2CostTbl), ISD, MTy); + int Idx = CostTableLookup(AVX2CostTbl, ISD, MTy); if (Idx != -1) return LT.first * AVX2CostTbl[Idx].Cost; } if (ST->hasAVX()) { - int Idx = CostTableLookup<MVT>(AVX1CostTbl, array_lengthof(AVX1CostTbl), ISD, MTy); + int Idx = CostTableLookup(AVX1CostTbl, ISD, MTy); if (Idx != -1) return LT.first * AVX1CostTbl[Idx].Cost; } if (ST->hasSSE42()) { - int Idx = CostTableLookup<MVT>(SSE42CostTbl, array_lengthof(SSE42CostTbl), ISD, MTy); + int Idx = CostTableLookup(SSE42CostTbl, ISD, MTy); if (Idx != -1) return LT.first * SSE42CostTbl[Idx].Cost; } @@ -511,8 +538,51 @@ unsigned X86TTI::getVectorInstrCost(unsigned Opcode, Type *Val, return TargetTransformInfo::getVectorInstrCost(Opcode, Val, Index); } +unsigned X86TTI::getScalarizationOverhead(Type *Ty, bool Insert, + bool Extract) const { + assert (Ty->isVectorTy() && "Can only scalarize vectors"); + unsigned Cost = 0; + + for (int i = 0, e = Ty->getVectorNumElements(); i < e; ++i) { + if (Insert) + Cost += TopTTI->getVectorInstrCost(Instruction::InsertElement, Ty, i); + if (Extract) + Cost += TopTTI->getVectorInstrCost(Instruction::ExtractElement, Ty, i); + } + + return Cost; +} + unsigned X86TTI::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, unsigned AddressSpace) const { + // Handle non power of two vectors such as <3 x float> + if (VectorType *VTy = dyn_cast<VectorType>(Src)) { + unsigned NumElem = VTy->getVectorNumElements(); + + // Handle a few common cases: + // <3 x float> + if (NumElem == 3 && VTy->getScalarSizeInBits() == 32) + // Cost = 64 bit store + extract + 32 bit store. + return 3; + + // <3 x double> + if (NumElem == 3 && VTy->getScalarSizeInBits() == 64) + // Cost = 128 bit store + unpack + 64 bit store. + return 3; + + // Assume that all other non power-of-two numbers are scalarized. + if (!isPowerOf2_32(NumElem)) { + unsigned Cost = TargetTransformInfo::getMemoryOpCost(Opcode, + VTy->getScalarType(), + Alignment, + AddressSpace); + unsigned SplitCost = getScalarizationOverhead(Src, + Opcode == Instruction::Load, + Opcode==Instruction::Store); + return NumElem * Cost + SplitCost; + } + } + // Legalize the type. std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Src); assert((Opcode == Instruction::Load || Opcode == Instruction::Store) && @@ -528,3 +598,97 @@ unsigned X86TTI::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, return Cost; } + +unsigned X86TTI::getAddressComputationCost(Type *Ty, bool IsComplex) const { + // Address computations in vectorized code with non-consecutive addresses will + // likely result in more instructions compared to scalar code where the + // computation can more often be merged into the index mode. The resulting + // extra micro-ops can significantly decrease throughput. 
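Note: the new getMemoryOpCost path above prices a load or store of a non-power-of-two vector as one scalar memory operation per element plus the insert/extract traffic that getScalarizationOverhead accumulates (with hand-tuned answers for the common <3 x float> and <3 x double> cases). Restated as a small hedged sketch, assuming a flat per-element insert/extract cost; the helper name is made up for illustration.

  // Rough restatement of the scalarization pricing used above.
  static unsigned scalarizedMemOpCost(unsigned NumElem,
                                      unsigned ScalarMemOpCost,
                                      unsigned InsertOrExtractCost) {
    unsigned MemCost = NumElem * ScalarMemOpCost;       // one load/store per lane
    unsigned SplitCost = NumElem * InsertOrExtractCost; // split or rebuild the vector
    // e.g. a <5 x i32> access with unit costs: 5 + 5 = 10.
    return MemCost + SplitCost;
  }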
+ unsigned NumVectorInstToHideOverhead = 10; + + if (Ty->isVectorTy() && IsComplex) + return NumVectorInstToHideOverhead; + + return TargetTransformInfo::getAddressComputationCost(Ty, IsComplex); +} + +unsigned X86TTI::getReductionCost(unsigned Opcode, Type *ValTy, + bool IsPairwise) const { + + std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(ValTy); + + MVT MTy = LT.second; + + int ISD = TLI->InstructionOpcodeToISD(Opcode); + assert(ISD && "Invalid opcode"); + + // We use the Intel Architecture Code Analyzer(IACA) to measure the throughput + // and make it as the cost. + + static const CostTblEntry<MVT::SimpleValueType> SSE42CostTblPairWise[] = { + { ISD::FADD, MVT::v2f64, 2 }, + { ISD::FADD, MVT::v4f32, 4 }, + { ISD::ADD, MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6". + { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.5". + { ISD::ADD, MVT::v8i16, 5 }, + }; + + static const CostTblEntry<MVT::SimpleValueType> AVX1CostTblPairWise[] = { + { ISD::FADD, MVT::v4f32, 4 }, + { ISD::FADD, MVT::v4f64, 5 }, + { ISD::FADD, MVT::v8f32, 7 }, + { ISD::ADD, MVT::v2i64, 1 }, // The data reported by the IACA tool is "1.5". + { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.5". + { ISD::ADD, MVT::v4i64, 5 }, // The data reported by the IACA tool is "4.8". + { ISD::ADD, MVT::v8i16, 5 }, + { ISD::ADD, MVT::v8i32, 5 }, + }; + + static const CostTblEntry<MVT::SimpleValueType> SSE42CostTblNoPairWise[] = { + { ISD::FADD, MVT::v2f64, 2 }, + { ISD::FADD, MVT::v4f32, 4 }, + { ISD::ADD, MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6". + { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.3". + { ISD::ADD, MVT::v8i16, 4 }, // The data reported by the IACA tool is "4.3". + }; + + static const CostTblEntry<MVT::SimpleValueType> AVX1CostTblNoPairWise[] = { + { ISD::FADD, MVT::v4f32, 3 }, + { ISD::FADD, MVT::v4f64, 3 }, + { ISD::FADD, MVT::v8f32, 4 }, + { ISD::ADD, MVT::v2i64, 1 }, // The data reported by the IACA tool is "1.5". + { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "2.8". 
+ { ISD::ADD, MVT::v4i64, 3 }, + { ISD::ADD, MVT::v8i16, 4 }, + { ISD::ADD, MVT::v8i32, 5 }, + }; + + if (IsPairwise) { + if (ST->hasAVX()) { + int Idx = CostTableLookup(AVX1CostTblPairWise, ISD, MTy); + if (Idx != -1) + return LT.first * AVX1CostTblPairWise[Idx].Cost; + } + + if (ST->hasSSE42()) { + int Idx = CostTableLookup(SSE42CostTblPairWise, ISD, MTy); + if (Idx != -1) + return LT.first * SSE42CostTblPairWise[Idx].Cost; + } + } else { + if (ST->hasAVX()) { + int Idx = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy); + if (Idx != -1) + return LT.first * AVX1CostTblNoPairWise[Idx].Cost; + } + + if (ST->hasSSE42()) { + int Idx = CostTableLookup(SSE42CostTblNoPairWise, ISD, MTy); + if (Idx != -1) + return LT.first * SSE42CostTblNoPairWise[Idx].Cost; + } + } + + return TargetTransformInfo::getReductionCost(Opcode, ValTy, IsPairwise); +} + diff --git a/contrib/llvm/lib/Target/X86/X86VZeroUpper.cpp b/contrib/llvm/lib/Target/X86/X86VZeroUpper.cpp index 0f77948..66ae9c2 100644 --- a/contrib/llvm/lib/Target/X86/X86VZeroUpper.cpp +++ b/contrib/llvm/lib/Target/X86/X86VZeroUpper.cpp @@ -105,23 +105,28 @@ FunctionPass *llvm::createX86IssueVZeroUpperPass() { } static bool isYmmReg(unsigned Reg) { - if (Reg >= X86::YMM0 && Reg <= X86::YMM15) - return true; + return (Reg >= X86::YMM0 && Reg <= X86::YMM31); +} - return false; +static bool isZmmReg(unsigned Reg) { + return (Reg >= X86::ZMM0 && Reg <= X86::ZMM31); } static bool checkFnHasLiveInYmm(MachineRegisterInfo &MRI) { for (MachineRegisterInfo::livein_iterator I = MRI.livein_begin(), E = MRI.livein_end(); I != E; ++I) - if (isYmmReg(I->first)) + if (isYmmReg(I->first) || isZmmReg(I->first)) return true; return false; } static bool clobbersAllYmmRegs(const MachineOperand &MO) { - for (unsigned reg = X86::YMM0; reg < X86::YMM15; ++reg) { + for (unsigned reg = X86::YMM0; reg <= X86::YMM31; ++reg) { + if (!MO.clobbersPhysReg(reg)) + return false; + } + for (unsigned reg = X86::ZMM0; reg <= X86::ZMM31; ++reg) { if (!MO.clobbersPhysReg(reg)) return false; } @@ -143,6 +148,25 @@ static bool hasYmmReg(MachineInstr *MI) { return false; } +/// clobbersAnyYmmReg() - Check if any YMM register will be clobbered by this +/// instruction. +static bool clobbersAnyYmmReg(MachineInstr *MI) { + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + const MachineOperand &MO = MI->getOperand(i); + if (!MO.isRegMask()) + continue; + for (unsigned reg = X86::YMM0; reg <= X86::YMM31; ++reg) { + if (MO.clobbersPhysReg(reg)) + return true; + } + for (unsigned reg = X86::ZMM0; reg <= X86::ZMM31; ++reg) { + if (MO.clobbersPhysReg(reg)) + return true; + } + } + return false; +} + /// runOnMachineFunction - Loop over all of the basic blocks, inserting /// vzero upper instructions before function calls. bool VZeroUpperInserter::runOnMachineFunction(MachineFunction &MF) { @@ -226,8 +250,9 @@ bool VZeroUpperInserter::processBasicBlock(MachineFunction &MF, bool BBHasCall = false; for (MachineBasicBlock::iterator I = BB.begin(); I != BB.end(); ++I) { - MachineInstr *MI = I; DebugLoc dl = I->getDebugLoc(); + MachineInstr *MI = I; + bool isControlFlow = MI->isCall() || MI->isReturn(); // Shortcut: don't need to check regular instructions in dirty state. @@ -246,6 +271,14 @@ bool VZeroUpperInserter::processBasicBlock(MachineFunction &MF, if (!isControlFlow) continue; + // If the call won't clobber any YMM register, skip it as well. 
It usually + happens on helper function calls (such as '_chkstk', '_ftol2') where + standard calling convention is not used (RegMask is not used to mark + register clobbered and register usage (def/imp-def/use) is well-defined + and explicitly specified). + if (MI->isCall() && !clobbersAnyYmmReg(MI)) + continue; + BBHasCall = true; // The VZEROUPPER instruction resets the upper 128 bits of all Intel AVX |
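Note: the VZeroUpper change at the end skips inserting VZEROUPPER before calls whose register-mask operands do not clobber any YMM/ZMM register (typical for helpers such as '_chkstk'). Below is a loose standalone model of that test, with a plain preserved-register vector standing in for LLVM's RegMask operand; none of these names are LLVM API.

  #include <vector>

  // Toy model: a register mask records which physical registers survive the
  // call; clobbersPhysReg() plays the role of MachineOperand::clobbersPhysReg().
  struct RegMaskOperand {
    std::vector<bool> Preserved; // Preserved[Reg] == true -> Reg survives the call.
    bool clobbersPhysReg(unsigned Reg) const {
      return Reg >= Preserved.size() || !Preserved[Reg];
    }
  };

  // Returns true if any register in [FirstWideReg, LastWideReg] (think
  // YMM0..ZMM31) may be clobbered by one of the call's register masks; when it
  // returns false the pass can leave the call alone.
  static bool clobbersAnyWideVectorReg(const std::vector<RegMaskOperand> &Masks,
                                       unsigned FirstWideReg,
                                       unsigned LastWideReg) {
    for (unsigned i = 0, e = (unsigned)Masks.size(); i != e; ++i)
      for (unsigned Reg = FirstWideReg; Reg <= LastWideReg; ++Reg)
        if (Masks[i].clobbersPhysReg(Reg))
          return true;
    return false;
  }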