Diffstat (limited to 'contrib/llvm/lib/Target/X86')
82 files changed, 18583 insertions, 8443 deletions
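For orientation, a minimal GNU-as-style sketch (not taken from the diff itself) of assembler syntax the updated X86AsmParser accepts after this change: the new .code16gcc directive, the AVX-512 op-mask/zeroing and {1to<N>} broadcast decorations, and the stricter %rip addressing checks. It assumes an AVX-512-capable target (e.g. llvm-mc -triple x86_64 -mattr=+avx512f); the label and register choices are illustrative only.

        .code16gcc                          # parse as 32-bit, but emit 16-bit code
        movl    %esp, %ebp

        .code64
        vaddps  %zmm2, %zmm1, %zmm0{%k1}{z} # op-mask and {z} marks, accepted in either order
        vaddps  (%rax){1to16}, %zmm1, %zmm0 # {1to<N>} memory-broadcast primitive
        leaq    target(%rip), %rax          # %rip is accepted only as a base register
target:
        ret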
diff --git a/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp index 4e0ad8bf..e692118 100644 --- a/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp +++ b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp @@ -59,6 +59,7 @@ class X86AsmParser : public MCTargetAsmParser { const MCInstrInfo &MII; ParseInstructionInfo *InstInfo; std::unique_ptr<X86AsmInstrumentation> Instrumentation; + bool Code16GCC; private: SMLoc consumeToken() { @@ -68,6 +69,19 @@ private: return Result; } + unsigned MatchInstruction(const OperandVector &Operands, MCInst &Inst, + uint64_t &ErrorInfo, bool matchingInlineAsm, + unsigned VariantID = 0) { + // In Code16GCC mode, match as 32-bit. + if (Code16GCC) + SwitchMode(X86::Mode32Bit); + unsigned rv = MatchInstructionImpl(Operands, Inst, ErrorInfo, + matchingInlineAsm, VariantID); + if (Code16GCC) + SwitchMode(X86::Mode16Bit); + return rv; + } + enum InfixCalculatorTok { IC_OR = 0, IC_XOR, @@ -659,20 +673,15 @@ private: } }; - bool Error(SMLoc L, const Twine &Msg, - ArrayRef<SMRange> Ranges = None, + bool Error(SMLoc L, const Twine &Msg, SMRange Range = None, bool MatchingInlineAsm = false) { MCAsmParser &Parser = getParser(); - if (MatchingInlineAsm) return true; - return Parser.Error(L, Msg, Ranges); - } - - bool ErrorAndEatStatement(SMLoc L, const Twine &Msg, - ArrayRef<SMRange> Ranges = None, - bool MatchingInlineAsm = false) { - MCAsmParser &Parser = getParser(); - Parser.eatToEndOfStatement(); - return Error(L, Msg, Ranges, MatchingInlineAsm); + if (MatchingInlineAsm) { + if (!getLexer().isAtStartOfStatement()) + Parser.eatToEndOfStatement(); + return false; + } + return Parser.Error(L, Msg, Range); } std::nullptr_t ErrorOperand(SMLoc Loc, StringRef Msg) { @@ -698,14 +707,11 @@ private: std::unique_ptr<X86Operand> ParseIntelOperator(unsigned OpKind); std::unique_ptr<X86Operand> ParseIntelSegmentOverride(unsigned SegReg, SMLoc Start, unsigned Size); - std::unique_ptr<X86Operand> - ParseIntelMemOperand(int64_t ImmDisp, SMLoc StartLoc, unsigned Size); std::unique_ptr<X86Operand> ParseRoundingModeOp(SMLoc Start, SMLoc End); bool ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End); - std::unique_ptr<X86Operand> ParseIntelBracExpression(unsigned SegReg, - SMLoc Start, - int64_t ImmDisp, - unsigned Size); + std::unique_ptr<X86Operand> + ParseIntelBracExpression(unsigned SegReg, SMLoc Start, int64_t ImmDisp, + bool isSymbol, unsigned Size); bool ParseIntelIdentifier(const MCExpr *&Val, StringRef &Identifier, InlineAsmIdentifierInfo &Info, bool IsUnevaluatedOperand, SMLoc &End); @@ -716,7 +722,8 @@ private: CreateMemForInlineAsm(unsigned SegReg, const MCExpr *Disp, unsigned BaseReg, unsigned IndexReg, unsigned Scale, SMLoc Start, SMLoc End, unsigned Size, StringRef Identifier, - InlineAsmIdentifierInfo &Info); + InlineAsmIdentifierInfo &Info, + bool AllowBetterSizeMatch = false); bool parseDirectiveEven(SMLoc L); bool ParseDirectiveWord(unsigned Size, SMLoc L); @@ -753,10 +760,17 @@ private: /// Parses AVX512 specific operand primitives: masked registers ({%k<NUM>}, {z}) /// and memory broadcasting ({1to<NUM>}) primitives, updating Operands vector if required. - /// \return \c true if no parsing errors occurred, \c false otherwise. + /// return false if no parsing errors occurred, true otherwise. 
bool HandleAVX512Operand(OperandVector &Operands, const MCParsedAsmOperand &Op); + bool ParseZ(std::unique_ptr<X86Operand> &Z, const SMLoc &StartLoc); + + /// MS-compatibility: + /// Obtain an appropriate size qualifier, when facing its absence, + /// upon AVX512 vector/broadcast memory operand + unsigned AdjustAVX512Mem(unsigned Size, X86Operand* UnsizedMemOpNext); + bool is64BitMode() const { // FIXME: Can tablegen auto-generate this? return getSTI().getFeatureBits()[X86::Mode64Bit]; @@ -802,7 +816,8 @@ private: public: X86AsmParser(const MCSubtargetInfo &sti, MCAsmParser &Parser, const MCInstrInfo &mii, const MCTargetOptions &Options) - : MCTargetAsmParser(Options, sti), MII(mii), InstInfo(nullptr) { + : MCTargetAsmParser(Options, sti), MII(mii), InstInfo(nullptr), + Code16GCC(false) { // Initialize the set of available features. setAvailableFeatures(ComputeAvailableFeatures(getSTI().getFeatureBits())); @@ -833,6 +848,11 @@ static bool CheckBaseRegAndIndexReg(unsigned BaseReg, unsigned IndexReg, // If we have both a base register and an index register make sure they are // both 64-bit or 32-bit registers. // To support VSIB, IndexReg can be 128-bit or 256-bit registers. + + if ((BaseReg == X86::RIP && IndexReg != 0) || (IndexReg == X86::RIP)) { + ErrMsg = "invalid base+index expression"; + return true; + } if (BaseReg != 0 && IndexReg != 0) { if (X86MCRegisterClasses[X86::GR64RegClassID].contains(BaseReg) && (X86MCRegisterClasses[X86::GR16RegClassID].contains(IndexReg) || @@ -907,8 +927,7 @@ bool X86AsmParser::ParseRegister(unsigned &RegNo, if (RegNo == X86::RIZ || X86MCRegisterClasses[X86::GR64RegClassID].contains(RegNo) || X86II::isX86_64NonExtLowByteReg(RegNo) || - X86II::isX86_64ExtendedReg(RegNo) || - X86II::is32ExtendedReg(RegNo)) + X86II::isX86_64ExtendedReg(RegNo)) return Error(StartLoc, "register %" + Tok.getString() + " is only available in 64-bit mode", SMRange(StartLoc, EndLoc)); @@ -992,20 +1011,20 @@ void X86AsmParser::SetFrameRegister(unsigned RegNo) { } std::unique_ptr<X86Operand> X86AsmParser::DefaultMemSIOperand(SMLoc Loc) { - unsigned basereg = - is64BitMode() ? X86::RSI : (is32BitMode() ? X86::ESI : X86::SI); + bool Parse32 = is32BitMode() || Code16GCC; + unsigned Basereg = is64BitMode() ? X86::RSI : (Parse32 ? X86::ESI : X86::SI); const MCExpr *Disp = MCConstantExpr::create(0, getContext()); return X86Operand::CreateMem(getPointerWidth(), /*SegReg=*/0, Disp, - /*BaseReg=*/basereg, /*IndexReg=*/0, /*Scale=*/1, + /*BaseReg=*/Basereg, /*IndexReg=*/0, /*Scale=*/1, Loc, Loc, 0); } std::unique_ptr<X86Operand> X86AsmParser::DefaultMemDIOperand(SMLoc Loc) { - unsigned basereg = - is64BitMode() ? X86::RDI : (is32BitMode() ? X86::EDI : X86::DI); + bool Parse32 = is32BitMode() || Code16GCC; + unsigned Basereg = is64BitMode() ? X86::RDI : (Parse32 ? 
X86::EDI : X86::DI); const MCExpr *Disp = MCConstantExpr::create(0, getContext()); return X86Operand::CreateMem(getPointerWidth(), /*SegReg=*/0, Disp, - /*BaseReg=*/basereg, /*IndexReg=*/0, /*Scale=*/1, + /*BaseReg=*/Basereg, /*IndexReg=*/0, /*Scale=*/1, Loc, Loc, 0); } @@ -1159,7 +1178,7 @@ static unsigned getIntelMemOperandSize(StringRef OpStr) { std::unique_ptr<X86Operand> X86AsmParser::CreateMemForInlineAsm( unsigned SegReg, const MCExpr *Disp, unsigned BaseReg, unsigned IndexReg, unsigned Scale, SMLoc Start, SMLoc End, unsigned Size, StringRef Identifier, - InlineAsmIdentifierInfo &Info) { + InlineAsmIdentifierInfo &Info, bool AllowBetterSizeMatch) { // If we found a decl other than a VarDecl, then assume it is a FuncDecl or // some other label reference. if (isa<MCSymbolRefExpr>(Disp) && Info.OpDecl && !Info.IsVarDecl) { @@ -1188,6 +1207,13 @@ std::unique_ptr<X86Operand> X86AsmParser::CreateMemForInlineAsm( if (Size) InstInfo->AsmRewrites->emplace_back(AOK_SizeDirective, Start, /*Len=*/0, Size); + if (AllowBetterSizeMatch) + // Handle cases where size qualifier is absent, upon an indirect symbol + // reference - e.g. "vaddps zmm1, zmm2, [var]" + // set Size to zero to allow matching mechansim to try and find a better + // size qualifier than our initial guess, based on available variants of + // the given instruction + Size = 0; } } @@ -1271,7 +1297,7 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) { // The period in the dot operator (e.g., [ebx].foo.bar) is parsed as an // identifier. Don't try an parse it as a register. - if (Tok.getString().startswith(".")) + if (PrevTK != AsmToken::Error && Tok.getString().startswith(".")) break; // If we're parsing an immediate expression, we don't expect a '['. @@ -1386,7 +1412,8 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) { std::unique_ptr<X86Operand> X86AsmParser::ParseIntelBracExpression(unsigned SegReg, SMLoc Start, - int64_t ImmDisp, unsigned Size) { + int64_t ImmDisp, bool isSymbol, + unsigned Size) { MCAsmParser &Parser = getParser(); const AsmToken &Tok = Parser.getTok(); SMLoc BracLoc = Tok.getLoc(), End = Tok.getEndLoc(); @@ -1436,6 +1463,21 @@ X86AsmParser::ParseIntelBracExpression(unsigned SegReg, SMLoc Start, Disp = NewDisp; } + if (isSymbol) { + if (SM.getSym()) { + Error(Start, "cannot use more than one symbol in memory operand"); + return nullptr; + } + if (SM.getBaseReg()) { + Error(Start, "cannot use base register with variable reference"); + return nullptr; + } + if (SM.getIndexReg()) { + Error(Start, "cannot use index register with variable reference"); + return nullptr; + } + } + int BaseReg = SM.getBaseReg(); int IndexReg = SM.getIndexReg(); int Scale = SM.getScale(); @@ -1458,7 +1500,8 @@ X86AsmParser::ParseIntelBracExpression(unsigned SegReg, SMLoc Start, InlineAsmIdentifierInfo &Info = SM.getIdentifierInfo(); return CreateMemForInlineAsm(SegReg, Disp, BaseReg, IndexReg, Scale, Start, - End, Size, SM.getSymName(), Info); + End, Size, SM.getSymName(), Info, + isParsingInlineAsm()); } // Inline assembly may use variable names with namespace alias qualifiers. 
@@ -1541,7 +1584,7 @@ X86AsmParser::ParseIntelSegmentOverride(unsigned SegReg, SMLoc Start, } if (getLexer().is(AsmToken::LBrac)) - return ParseIntelBracExpression(SegReg, Start, ImmDisp, Size); + return ParseIntelBracExpression(SegReg, Start, ImmDisp, false, Size); const MCExpr *Val; SMLoc End; @@ -1598,66 +1641,6 @@ X86AsmParser::ParseRoundingModeOp(SMLoc Start, SMLoc End) { } return ErrorOperand(Tok.getLoc(), "unknown token in expression"); } -/// ParseIntelMemOperand - Parse intel style memory operand. -std::unique_ptr<X86Operand> X86AsmParser::ParseIntelMemOperand(int64_t ImmDisp, - SMLoc Start, - unsigned Size) { - MCAsmParser &Parser = getParser(); - const AsmToken &Tok = Parser.getTok(); - SMLoc End; - - // Parse ImmDisp [ BaseReg + Scale*IndexReg + Disp ]. - if (getLexer().is(AsmToken::LBrac)) - return ParseIntelBracExpression(/*SegReg=*/0, Start, ImmDisp, Size); - assert(ImmDisp == 0); - - const MCExpr *Val; - if (!isParsingInlineAsm()) { - if (getParser().parsePrimaryExpr(Val, End)) - return ErrorOperand(Tok.getLoc(), "unknown token in expression"); - - return X86Operand::CreateMem(getPointerWidth(), Val, Start, End, Size); - } - - InlineAsmIdentifierInfo Info; - StringRef Identifier = Tok.getString(); - if (ParseIntelIdentifier(Val, Identifier, Info, - /*Unevaluated=*/false, End)) - return nullptr; - - if (!getLexer().is(AsmToken::LBrac)) - return CreateMemForInlineAsm(/*SegReg=*/0, Val, /*BaseReg=*/0, /*IndexReg=*/0, - /*Scale=*/1, Start, End, Size, Identifier, Info); - - Parser.Lex(); // Eat '[' - - // Parse Identifier [ ImmDisp ] - IntelExprStateMachine SM(/*ImmDisp=*/0, /*StopOnLBrac=*/true, - /*AddImmPrefix=*/false); - if (ParseIntelExpression(SM, End)) - return nullptr; - - if (SM.getSym()) { - Error(Start, "cannot use more than one symbol in memory operand"); - return nullptr; - } - if (SM.getBaseReg()) { - Error(Start, "cannot use base register with variable reference"); - return nullptr; - } - if (SM.getIndexReg()) { - Error(Start, "cannot use index register with variable reference"); - return nullptr; - } - - const MCExpr *Disp = MCConstantExpr::create(SM.getImm(), getContext()); - // BaseReg is non-zero to avoid assertions. In the context of inline asm, - // we're pointing to a local variable in memory, so the base register is - // really the frame or stack pointer. - return X86Operand::CreateMem(getPointerWidth(), /*SegReg=*/0, Disp, - /*BaseReg=*/1, /*IndexReg=*/0, /*Scale=*/1, - Start, End, Size, Identifier, Info.OpDecl); -} /// Parse the '.' operator. bool X86AsmParser::ParseIntelDotOperator(const MCExpr *Disp, @@ -1725,8 +1708,9 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOffsetOfOperator() { // The offset operator will have an 'r' constraint, thus we need to create // register operand to ensure proper matching. Just pick a GPR based on // the size of a pointer. - unsigned RegNo = - is64BitMode() ? X86::RBX : (is32BitMode() ? X86::EBX : X86::BX); + bool Parse32 = is32BitMode() || Code16GCC; + unsigned RegNo = is64BitMode() ? X86::RBX : (Parse32 ? X86::EBX : X86::BX); + return X86Operand::CreateReg(RegNo, Start, End, /*GetAddress=*/true, OffsetOfLoc, Identifier, Info.OpDecl); } @@ -1804,49 +1788,8 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOperand() { Parser.Lex(); // Eat ptr. PtrInOperand = true; } - Start = Tok.getLoc(); - // Immediate. 
- if (getLexer().is(AsmToken::Integer) || getLexer().is(AsmToken::Minus) || - getLexer().is(AsmToken::Tilde) || getLexer().is(AsmToken::LParen)) { - AsmToken StartTok = Tok; - IntelExprStateMachine SM(/*Imm=*/0, /*StopOnLBrac=*/true, - /*AddImmPrefix=*/false); - if (ParseIntelExpression(SM, End)) - return nullptr; - - int64_t Imm = SM.getImm(); - if (isParsingInlineAsm()) { - unsigned Len = Tok.getLoc().getPointer() - Start.getPointer(); - if (StartTok.getString().size() == Len) - // Just add a prefix if this wasn't a complex immediate expression. - InstInfo->AsmRewrites->emplace_back(AOK_ImmPrefix, Start); - else - // Otherwise, rewrite the complex expression as a single immediate. - InstInfo->AsmRewrites->emplace_back(AOK_Imm, Start, Len, Imm); - } - - if (getLexer().isNot(AsmToken::LBrac)) { - // If a directional label (ie. 1f or 2b) was parsed above from - // ParseIntelExpression() then SM.getSym() was set to a pointer to - // to the MCExpr with the directional local symbol and this is a - // memory operand not an immediate operand. - if (SM.getSym()) - return X86Operand::CreateMem(getPointerWidth(), SM.getSym(), Start, End, - Size); - - const MCExpr *ImmExpr = MCConstantExpr::create(Imm, getContext()); - return X86Operand::CreateImm(ImmExpr, Start, End); - } - - // Only positive immediates are valid. - if (Imm < 0) - return ErrorOperand(Start, "expected a positive immediate displacement " - "before bracketed expr."); - - // Parse ImmDisp [ BaseReg + Scale*IndexReg + Disp ]. - return ParseIntelMemOperand(Imm, Start, Size); - } + Start = Tok.getLoc(); // rounding mode token if (getSTI().getFeatureBits()[X86::FeatureAVX512] && @@ -1855,24 +1798,78 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOperand() { // Register. unsigned RegNo = 0; - if (!ParseRegister(RegNo, Start, End)) { + if (getLexer().is(AsmToken::Identifier) && + !ParseRegister(RegNo, Start, End)) { // If this is a segment register followed by a ':', then this is the start // of a segment override, otherwise this is a normal register reference. - // In case it is a normal register and there is ptr in the operand this + // In case it is a normal register and there is ptr in the operand this // is an error - if (getLexer().isNot(AsmToken::Colon)){ - if (PtrInOperand){ + if (RegNo == X86::RIP) + return ErrorOperand(Start, "rip can only be used as a base register"); + if (getLexer().isNot(AsmToken::Colon)) { + if (PtrInOperand) { return ErrorOperand(Start, "expected memory operand after " "'ptr', found register operand instead"); } return X86Operand::CreateReg(RegNo, Start, End); } - return ParseIntelSegmentOverride(/*SegReg=*/RegNo, Start, Size); } - // Memory operand. - return ParseIntelMemOperand(/*Disp=*/0, Start, Size); + // Immediates and Memory + + // Parse [ BaseReg + Scale*IndexReg + Disp ]. 
+ if (getLexer().is(AsmToken::LBrac)) + return ParseIntelBracExpression(/*SegReg=*/0, Start, /*ImmDisp=*/0, false, + Size); + + AsmToken StartTok = Tok; + IntelExprStateMachine SM(/*Imm=*/0, /*StopOnLBrac=*/true, + /*AddImmPrefix=*/false); + if (ParseIntelExpression(SM, End)) + return nullptr; + + bool isSymbol = SM.getSym() && SM.getSym()->getKind() != MCExpr::Constant; + int64_t Imm = SM.getImm(); + if (SM.getSym() && SM.getSym()->getKind() == MCExpr::Constant) + SM.getSym()->evaluateAsAbsolute(Imm); + + if (StartTok.isNot(AsmToken::Identifier) && + StartTok.isNot(AsmToken::String) && isParsingInlineAsm()) { + unsigned Len = Tok.getLoc().getPointer() - Start.getPointer(); + if (StartTok.getString().size() == Len) + // Just add a prefix if this wasn't a complex immediate expression. + InstInfo->AsmRewrites->emplace_back(AOK_ImmPrefix, Start); + else + // Otherwise, rewrite the complex expression as a single immediate. + InstInfo->AsmRewrites->emplace_back(AOK_Imm, Start, Len, Imm); + } + + if (getLexer().isNot(AsmToken::LBrac)) { + // If a directional label (ie. 1f or 2b) was parsed above from + // ParseIntelExpression() then SM.getSym() was set to a pointer to + // to the MCExpr with the directional local symbol and this is a + // memory operand not an immediate operand. + if (isSymbol) { + if (isParsingInlineAsm()) + return CreateMemForInlineAsm(/*SegReg=*/0, SM.getSym(), /*BaseReg=*/0, + /*IndexReg=*/0, + /*Scale=*/1, Start, End, Size, + SM.getSymName(), SM.getIdentifierInfo()); + return X86Operand::CreateMem(getPointerWidth(), SM.getSym(), Start, End, + Size); + } + + const MCExpr *ImmExpr = MCConstantExpr::create(Imm, getContext()); + return X86Operand::CreateImm(ImmExpr, Start, End); + } + + // Only positive immediates are valid. + if (Imm < 0) + return ErrorOperand(Start, "expected a positive immediate displacement " + "before bracketed expr."); + + return ParseIntelBracExpression(/*SegReg=*/0, Start, Imm, isSymbol, Size); } std::unique_ptr<X86Operand> X86AsmParser::ParseATTOperand() { @@ -1891,6 +1888,11 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseATTOperand() { SMRange(Start, End)); return nullptr; } + if (RegNo == X86::RIP) { + Error(Start, "%rip can only be used as a base register", + SMRange(Start, End)); + return nullptr; + } // If this is a segment register followed by a ':', then this is the start // of a memory reference, otherwise this is a normal register reference. @@ -1916,11 +1918,33 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseATTOperand() { SMLoc Start = Parser.getTok().getLoc(), End; if (getSTI().getFeatureBits()[X86::FeatureAVX512]) return ParseRoundingModeOp(Start, End); - return ErrorOperand(Start, "unknown token in expression"); + return ErrorOperand(Start, "Unexpected '{' in expression"); } } } +// true on failure, false otherwise +// If no {z} mark was found - Parser doesn't advance +bool X86AsmParser::ParseZ(std::unique_ptr<X86Operand> &Z, + const SMLoc &StartLoc) { + MCAsmParser &Parser = getParser(); + // Assuming we are just pass the '{' mark, quering the next token + // Searched for {z}, but none was found. 
Return false, as no parsing error was + // encountered + if (!(getLexer().is(AsmToken::Identifier) && + (getLexer().getTok().getIdentifier() == "z"))) + return false; + Parser.Lex(); // Eat z + // Query and eat the '}' mark + if (!getLexer().is(AsmToken::RCurly)) + return Error(getLexer().getLoc(), "Expected } at this point"); + Parser.Lex(); // Eat '}' + // Assign Z with the {z} mark opernad + Z = X86Operand::CreateToken("{z}", StartLoc); + return false; +} + +// true on failure, false otherwise bool X86AsmParser::HandleAVX512Operand(OperandVector &Operands, const MCParsedAsmOperand &Op) { MCAsmParser &Parser = getParser(); @@ -1932,13 +1956,11 @@ bool X86AsmParser::HandleAVX512Operand(OperandVector &Operands, if(getLexer().is(AsmToken::Integer)) { // Parse memory broadcasting ({1to<NUM>}). if (getLexer().getTok().getIntVal() != 1) - return !ErrorAndEatStatement(getLexer().getLoc(), - "Expected 1to<NUM> at this point"); + return TokError("Expected 1to<NUM> at this point"); Parser.Lex(); // Eat "1" of 1to8 if (!getLexer().is(AsmToken::Identifier) || !getLexer().getTok().getIdentifier().startswith("to")) - return !ErrorAndEatStatement(getLexer().getLoc(), - "Expected 1to<NUM> at this point"); + return TokError("Expected 1to<NUM> at this point"); // Recognize only reasonable suffixes. const char *BroadcastPrimitive = StringSwitch<const char*>(getLexer().getTok().getIdentifier()) @@ -1948,46 +1970,57 @@ bool X86AsmParser::HandleAVX512Operand(OperandVector &Operands, .Case("to16", "{1to16}") .Default(nullptr); if (!BroadcastPrimitive) - return !ErrorAndEatStatement(getLexer().getLoc(), - "Invalid memory broadcast primitive."); + return TokError("Invalid memory broadcast primitive."); Parser.Lex(); // Eat "toN" of 1toN if (!getLexer().is(AsmToken::RCurly)) - return !ErrorAndEatStatement(getLexer().getLoc(), - "Expected } at this point"); + return TokError("Expected } at this point"); Parser.Lex(); // Eat "}" Operands.push_back(X86Operand::CreateToken(BroadcastPrimitive, consumedToken)); // No AVX512 specific primitives can pass // after memory broadcasting, so return. - return true; + return false; } else { - // Parse mask register {%k1} - Operands.push_back(X86Operand::CreateToken("{", consumedToken)); - if (std::unique_ptr<X86Operand> Op = ParseOperand()) { - Operands.push_back(std::move(Op)); - if (!getLexer().is(AsmToken::RCurly)) - return !ErrorAndEatStatement(getLexer().getLoc(), - "Expected } at this point"); - Operands.push_back(X86Operand::CreateToken("}", consumeToken())); - - // Parse "zeroing non-masked" semantic {z} - if (getLexer().is(AsmToken::LCurly)) { - Operands.push_back(X86Operand::CreateToken("{z}", consumeToken())); - if (!getLexer().is(AsmToken::Identifier) || - getLexer().getTok().getIdentifier() != "z") - return !ErrorAndEatStatement(getLexer().getLoc(), - "Expected z at this point"); - Parser.Lex(); // Eat the z + // Parse either {k}{z}, {z}{k}, {k} or {z} + // last one have no meaning, but GCC accepts it + // Currently, we're just pass a '{' mark + std::unique_ptr<X86Operand> Z; + if (ParseZ(Z, consumedToken)) + return true; + // Reaching here means that parsing of the allegadly '{z}' mark yielded + // no errors. + // Query for the need of further parsing for a {%k<NUM>} mark + if (!Z || getLexer().is(AsmToken::LCurly)) { + const SMLoc StartLoc = Z ? 
consumeToken() : consumedToken; + // Parse an op-mask register mark ({%k<NUM>}), which is now to be + // expected + if (std::unique_ptr<X86Operand> Op = ParseOperand()) { if (!getLexer().is(AsmToken::RCurly)) - return !ErrorAndEatStatement(getLexer().getLoc(), - "Expected } at this point"); - Parser.Lex(); // Eat the } + return Error(getLexer().getLoc(), "Expected } at this point"); + Operands.push_back(X86Operand::CreateToken("{", StartLoc)); + Operands.push_back(std::move(Op)); + Operands.push_back(X86Operand::CreateToken("}", consumeToken())); + } else + return Error(getLexer().getLoc(), + "Expected an op-mask register at this point"); + // {%k<NUM>} mark is found, inquire for {z} + if (getLexer().is(AsmToken::LCurly) && !Z) { + // Have we've found a parsing error, or found no (expected) {z} mark + // - report an error + if (ParseZ(Z, consumeToken()) || !Z) + return true; + } + // '{z}' on its own is meaningless, hence should be ignored. + // on the contrary - have it been accompanied by a K register, + // allow it. + if (Z) + Operands.push_back(std::move(Z)); } } } } - return true; + return false; } /// ParseMemOperand: segment: disp(basereg, indexreg, scale). The '%ds:' prefix @@ -2077,7 +2110,16 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseMemOperand(unsigned SegReg, // like "1(%eax,,1)", the assembler doesn't. Use "eiz" or "riz" for this. if (getLexer().is(AsmToken::Percent)) { SMLoc L; - if (ParseRegister(IndexReg, L, L)) return nullptr; + if (ParseRegister(IndexReg, L, L)) + return nullptr; + if (BaseReg == X86::RIP) { + Error(IndexLoc, "%rip as base register can not have an index register"); + return nullptr; + } + if (IndexReg == X86::RIP) { + Error(IndexLoc, "%rip is not allowed as an index register"); + return nullptr; + } if (getLexer().isNot(AsmToken::RParen)) { // Parse the scale amount: @@ -2169,6 +2211,20 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, InstInfo = &Info; StringRef PatchedName = Name; + if (Name == "jmp" && isParsingIntelSyntax() && isParsingInlineAsm()) { + StringRef NextTok = Parser.getTok().getString(); + if (NextTok == "short") { + SMLoc NameEndLoc = + NameLoc.getFromPointer(NameLoc.getPointer() + Name.size()); + // Eat the short keyword + Parser.Lex(); + // MS ignores the short keyword, it determines the jmp type based + // on the distance of the label + InstInfo->AsmRewrites->emplace_back(AOK_Skip, NameEndLoc, + NextTok.size() + 1); + } + } + // FIXME: Hack to recognize setneb as setne. 
if (PatchedName.startswith("set") && PatchedName.endswith("b") && PatchedName != "setb" && PatchedName != "setnb") @@ -2321,10 +2377,9 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, while(1) { if (std::unique_ptr<X86Operand> Op = ParseOperand()) { Operands.push_back(std::move(Op)); - if (!HandleAVX512Operand(Operands, *Operands.back())) + if (HandleAVX512Operand(Operands, *Operands.back())) return true; } else { - Parser.eatToEndOfStatement(); return true; } // check for comma and eat it @@ -2340,8 +2395,7 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, isParsingIntelSyntax() && isParsingInlineAsm() && (getLexer().is(AsmToken::LCurly) || getLexer().is(AsmToken::RCurly)); if (getLexer().isNot(AsmToken::EndOfStatement) && !CurlyAsEndOfStatement) - return ErrorAndEatStatement(getLexer().getLoc(), - "unexpected token in argument list"); + return TokError("unexpected token in argument list"); } // Consume the EndOfStatement or the prefix separator Slash @@ -2367,6 +2421,30 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, static_cast<X86Operand &>(*Operands[0]).setTokenValue(Repl); } + // Moving a 32 or 16 bit value into a segment register has the same + // behavior. Modify such instructions to always take shorter form. + if ((Name == "mov" || Name == "movw" || Name == "movl") && + (Operands.size() == 3)) { + X86Operand &Op1 = (X86Operand &)*Operands[1]; + X86Operand &Op2 = (X86Operand &)*Operands[2]; + SMLoc Loc = Op1.getEndLoc(); + if (Op1.isReg() && Op2.isReg() && + X86MCRegisterClasses[X86::SEGMENT_REGRegClassID].contains( + Op2.getReg()) && + (X86MCRegisterClasses[X86::GR16RegClassID].contains(Op1.getReg()) || + X86MCRegisterClasses[X86::GR32RegClassID].contains(Op1.getReg()))) { + // Change instruction name to match new instruction. + if (Name != "mov" && Name[3] == (is16BitMode() ? 'l' : 'w')) { + Name = is16BitMode() ? "movw" : "movl"; + Operands[0] = X86Operand::CreateToken(Name, NameLoc); + } + // Select the correct equivalent 16-/32-bit source register. + unsigned Reg = + getX86SubSuperRegisterOrZero(Op1.getReg(), is16BitMode() ? 16 : 32); + Operands[1] = X86Operand::CreateReg(Reg, Loc, Loc); + } + } + // This is a terrible hack to handle "out[s]?[bwl]? %al, (%dx)" -> // "outb %al, %dx". Out doesn't take a memory form, but this is a widely // documented form in various unofficial manuals, so a lot of code uses it. 
@@ -2472,7 +2550,7 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, (Name == "smov" || Name == "smovb" || Name == "smovw" || Name == "smovl" || Name == "smovd" || Name == "smovq"))) && (Operands.size() == 1 || Operands.size() == 3)) { - if (Name == "movsd" && Operands.size() == 1) + if (Name == "movsd" && Operands.size() == 1 && !isParsingIntelSyntax()) Operands.back() = X86Operand::CreateToken("movsl", NameLoc); AddDefaultSrcDestOperands(TmpOperands, DefaultMemSIOperand(NameLoc), DefaultMemDIOperand(NameLoc)); @@ -2583,7 +2661,6 @@ void X86AsmParser::MatchFPUWaitAlias(SMLoc IDLoc, X86Operand &Op, bool X86AsmParser::ErrorMissingFeature(SMLoc IDLoc, uint64_t ErrorInfo, bool MatchingInlineAsm) { assert(ErrorInfo && "Unknown missing feature!"); - ArrayRef<SMRange> EmptyRanges = None; SmallString<126> Msg; raw_svector_ostream OS(Msg); OS << "instruction requires:"; @@ -2593,7 +2670,7 @@ bool X86AsmParser::ErrorMissingFeature(SMLoc IDLoc, uint64_t ErrorInfo, OS << ' ' << getSubtargetFeatureName(ErrorInfo & Mask); Mask <<= 1; } - return Error(IDLoc, OS.str(), EmptyRanges, MatchingInlineAsm); + return Error(IDLoc, OS.str(), SMRange(), MatchingInlineAsm); } bool X86AsmParser::MatchAndEmitATTInstruction(SMLoc IDLoc, unsigned &Opcode, @@ -2604,7 +2681,7 @@ bool X86AsmParser::MatchAndEmitATTInstruction(SMLoc IDLoc, unsigned &Opcode, assert(!Operands.empty() && "Unexpect empty operand list!"); X86Operand &Op = static_cast<X86Operand &>(*Operands[0]); assert(Op.isToken() && "Leading operand should always be a mnemonic!"); - ArrayRef<SMRange> EmptyRanges = None; + SMRange EmptyRange = None; // First, handle aliases that expand to multiple instructions. MatchFPUWaitAlias(IDLoc, Op, Operands, Out, MatchingInlineAsm); @@ -2613,9 +2690,8 @@ bool X86AsmParser::MatchAndEmitATTInstruction(SMLoc IDLoc, unsigned &Opcode, MCInst Inst; // First, try a direct match. - switch (MatchInstructionImpl(Operands, Inst, - ErrorInfo, MatchingInlineAsm, - isParsingIntelSyntax())) { + switch (MatchInstruction(Operands, Inst, ErrorInfo, MatchingInlineAsm, + isParsingIntelSyntax())) { default: llvm_unreachable("Unexpected match result!"); case Match_Success: // Some instructions need post-processing to, for example, tweak which @@ -2666,8 +2742,8 @@ bool X86AsmParser::MatchAndEmitATTInstruction(SMLoc IDLoc, unsigned &Opcode, for (unsigned I = 0, E = array_lengthof(Match); I != E; ++I) { Tmp.back() = Suffixes[I]; - Match[I] = MatchInstructionImpl(Operands, Inst, ErrorInfoIgnore, - MatchingInlineAsm, isParsingIntelSyntax()); + Match[I] = MatchInstruction(Operands, Inst, ErrorInfoIgnore, + MatchingInlineAsm, isParsingIntelSyntax()); // If this returned as a missing feature failure, remember that. if (Match[I] == Match_MissingFeature) ErrorInfoMissingFeature = ErrorInfoIgnore; @@ -2711,7 +2787,7 @@ bool X86AsmParser::MatchAndEmitATTInstruction(SMLoc IDLoc, unsigned &Opcode, OS << "'" << Base << MatchChars[i] << "'"; } OS << ")"; - Error(IDLoc, OS.str(), EmptyRanges, MatchingInlineAsm); + Error(IDLoc, OS.str(), EmptyRange, MatchingInlineAsm); return true; } @@ -2721,17 +2797,15 @@ bool X86AsmParser::MatchAndEmitATTInstruction(SMLoc IDLoc, unsigned &Opcode, // mnemonic was invalid. if (std::count(std::begin(Match), std::end(Match), Match_MnemonicFail) == 4) { if (!WasOriginallyInvalidOperand) { - ArrayRef<SMRange> Ranges = - MatchingInlineAsm ? 
EmptyRanges : Op.getLocRange(); return Error(IDLoc, "invalid instruction mnemonic '" + Base + "'", - Ranges, MatchingInlineAsm); + Op.getLocRange(), MatchingInlineAsm); } // Recover location info for the operand if we know which was the problem. if (ErrorInfo != ~0ULL) { if (ErrorInfo >= Operands.size()) - return Error(IDLoc, "too few operands for instruction", - EmptyRanges, MatchingInlineAsm); + return Error(IDLoc, "too few operands for instruction", EmptyRange, + MatchingInlineAsm); X86Operand &Operand = (X86Operand &)*Operands[ErrorInfo]; if (Operand.getStartLoc().isValid()) { @@ -2741,7 +2815,7 @@ bool X86AsmParser::MatchAndEmitATTInstruction(SMLoc IDLoc, unsigned &Opcode, } } - return Error(IDLoc, "invalid operand for instruction", EmptyRanges, + return Error(IDLoc, "invalid operand for instruction", EmptyRange, MatchingInlineAsm); } @@ -2758,16 +2832,33 @@ bool X86AsmParser::MatchAndEmitATTInstruction(SMLoc IDLoc, unsigned &Opcode, // operand failure. if (std::count(std::begin(Match), std::end(Match), Match_InvalidOperand) == 1) { - return Error(IDLoc, "invalid operand for instruction", EmptyRanges, + return Error(IDLoc, "invalid operand for instruction", EmptyRange, MatchingInlineAsm); } // If all of these were an outright failure, report it in a useless way. Error(IDLoc, "unknown use of instruction mnemonic without a size suffix", - EmptyRanges, MatchingInlineAsm); + EmptyRange, MatchingInlineAsm); return true; } +unsigned X86AsmParser::AdjustAVX512Mem(unsigned Size, + X86Operand* UnsizedMemOpNext) { + // Check for the existence of an AVX512 platform + if (!getSTI().getFeatureBits()[X86::FeatureAVX512]) + return 0; + // Allow adjusting upon a (x|y|z)mm + if (Size == 512 || Size == 256 || Size == 128) + return Size; + // This is an allegadly broadcasting mem op adjustment, + // allow some more inquiring to validate it + if (Size == 64 || Size == 32) + return UnsizedMemOpNext && UnsizedMemOpNext->isToken() && + UnsizedMemOpNext->getToken().substr(0, 4).equals("{1to") ? Size : 0; + // Do not allow any other type of adjustments + return 0; +} + bool X86AsmParser::MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode, OperandVector &Operands, MCStreamer &Out, @@ -2777,7 +2868,8 @@ bool X86AsmParser::MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode, X86Operand &Op = static_cast<X86Operand &>(*Operands[0]); assert(Op.isToken() && "Leading operand should always be a mnemonic!"); StringRef Mnemonic = Op.getToken(); - ArrayRef<SMRange> EmptyRanges = None; + SMRange EmptyRange = None; + StringRef Base = Op.getToken(); // First, handle aliases that expand to multiple instructions. MatchFPUWaitAlias(IDLoc, Op, Operands, Out, MatchingInlineAsm); @@ -2786,8 +2878,17 @@ bool X86AsmParser::MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode, // Find one unsized memory operand, if present. X86Operand *UnsizedMemOp = nullptr; + // If unsized memory operand was found - obtain following operand. + // For use in AdjustAVX512Mem + X86Operand *UnsizedMemOpNext = nullptr; for (const auto &Op : Operands) { X86Operand *X86Op = static_cast<X86Operand *>(Op.get()); + if (UnsizedMemOp) { + UnsizedMemOpNext = X86Op; + // Have we found an unqualified memory operand, + // break. IA allows only one memory operand. 
+ break; + } if (X86Op->isMemUnsized()) UnsizedMemOp = X86Op; } @@ -2804,26 +2905,58 @@ bool X86AsmParser::MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode, } } + SmallVector<unsigned, 8> Match; + uint64_t ErrorInfoMissingFeature = 0; + + // If unsized push has immediate operand we should default the default pointer + // size for the size. + if (Mnemonic == "push" && Operands.size() == 2) { + auto *X86Op = static_cast<X86Operand *>(Operands[1].get()); + if (X86Op->isImm()) { + // If it's not a constant fall through and let remainder take care of it. + const auto *CE = dyn_cast<MCConstantExpr>(X86Op->getImm()); + unsigned Size = getPointerWidth(); + if (CE && + (isIntN(Size, CE->getValue()) || isUIntN(Size, CE->getValue()))) { + SmallString<16> Tmp; + Tmp += Base; + Tmp += (is64BitMode()) + ? "q" + : (is32BitMode()) ? "l" : (is16BitMode()) ? "w" : " "; + Op.setTokenValue(Tmp); + // Do match in ATT mode to allow explicit suffix usage. + Match.push_back(MatchInstruction(Operands, Inst, ErrorInfo, + MatchingInlineAsm, + false /*isParsingIntelSyntax()*/)); + Op.setTokenValue(Base); + } + } + } + // If an unsized memory operand is present, try to match with each memory // operand size. In Intel assembly, the size is not part of the instruction // mnemonic. - SmallVector<unsigned, 8> Match; - uint64_t ErrorInfoMissingFeature = 0; + unsigned MatchedSize = 0; if (UnsizedMemOp && UnsizedMemOp->isMemUnsized()) { static const unsigned MopSizes[] = {8, 16, 32, 64, 80, 128, 256, 512}; for (unsigned Size : MopSizes) { UnsizedMemOp->Mem.Size = Size; uint64_t ErrorInfoIgnore; unsigned LastOpcode = Inst.getOpcode(); - unsigned M = - MatchInstructionImpl(Operands, Inst, ErrorInfoIgnore, - MatchingInlineAsm, isParsingIntelSyntax()); + unsigned M = MatchInstruction(Operands, Inst, ErrorInfoIgnore, + MatchingInlineAsm, isParsingIntelSyntax()); if (Match.empty() || LastOpcode != Inst.getOpcode()) Match.push_back(M); // If this returned as a missing feature failure, remember that. if (Match.back() == Match_MissingFeature) ErrorInfoMissingFeature = ErrorInfoIgnore; + if (M == Match_Success) + // MS-compatability: + // Adjust AVX512 vector/broadcast memory operand, + // when facing the absence of a size qualifier. + // Match GCC behavior on respective cases. + MatchedSize = AdjustAVX512Mem(Size, UnsizedMemOpNext); } // Restore the size of the unsized memory operand if we modified it. @@ -2835,9 +2968,8 @@ bool X86AsmParser::MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode, // operation. There shouldn't be any ambiguity in our mnemonic table, so try // matching with the unsized operand. if (Match.empty()) { - Match.push_back(MatchInstructionImpl(Operands, Inst, ErrorInfo, - MatchingInlineAsm, - isParsingIntelSyntax())); + Match.push_back(MatchInstruction( + Operands, Inst, ErrorInfo, MatchingInlineAsm, isParsingIntelSyntax())); // If this returned as a missing feature failure, remember that. if (Match.back() == Match_MissingFeature) ErrorInfoMissingFeature = ErrorInfo; @@ -2849,10 +2981,8 @@ bool X86AsmParser::MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode, // If it's a bad mnemonic, all results will be the same. if (Match.back() == Match_MnemonicFail) { - ArrayRef<SMRange> Ranges = - MatchingInlineAsm ? 
EmptyRanges : Op.getLocRange(); return Error(IDLoc, "invalid instruction mnemonic '" + Mnemonic + "'", - Ranges, MatchingInlineAsm); + Op.getLocRange(), MatchingInlineAsm); } // If exactly one matched, then we treat that as a successful match (and the @@ -2861,6 +2991,14 @@ bool X86AsmParser::MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode, unsigned NumSuccessfulMatches = std::count(std::begin(Match), std::end(Match), Match_Success); if (NumSuccessfulMatches == 1) { + if (MatchedSize && isParsingInlineAsm() && isParsingIntelSyntax()) + // MS compatibility - + // Fix the rewrite according to the matched memory size + // MS inline assembly only + for (AsmRewrite &AR : *InstInfo->AsmRewrites) + if ((AR.Loc.getPointer() == UnsizedMemOp->StartLoc.getPointer()) && + (AR.Kind == AOK_SizeDirective)) + AR.Val = MatchedSize; // Some instructions need post-processing to, for example, tweak which // encoding is selected. Loop on it while changes happen so the individual // transformations can chain off each other. @@ -2875,11 +3013,9 @@ bool X86AsmParser::MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode, } else if (NumSuccessfulMatches > 1) { assert(UnsizedMemOp && "multiple matches only possible with unsized memory operands"); - ArrayRef<SMRange> Ranges = - MatchingInlineAsm ? EmptyRanges : UnsizedMemOp->getLocRange(); return Error(UnsizedMemOp->getStartLoc(), "ambiguous operand size for instruction '" + Mnemonic + "\'", - Ranges, MatchingInlineAsm); + UnsizedMemOp->getLocRange(), MatchingInlineAsm); } // If one instruction matched with a missing feature, report this as a @@ -2895,12 +3031,12 @@ bool X86AsmParser::MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode, // operand failure. if (std::count(std::begin(Match), std::end(Match), Match_InvalidOperand) == 1) { - return Error(IDLoc, "invalid operand for instruction", EmptyRanges, + return Error(IDLoc, "invalid operand for instruction", EmptyRange, MatchingInlineAsm); } // If all of these were an outright failure, report it in a useless way. - return Error(IDLoc, "unknown instruction mnemonic", EmptyRanges, + return Error(IDLoc, "unknown instruction mnemonic", EmptyRange, MatchingInlineAsm); } @@ -2945,14 +3081,14 @@ bool X86AsmParser::ParseDirective(AsmToken DirectiveID) { /// parseDirectiveEven /// ::= .even bool X86AsmParser::parseDirectiveEven(SMLoc L) { - const MCSection *Section = getStreamer().getCurrentSection().first; if (getLexer().isNot(AsmToken::EndOfStatement)) { TokError("unexpected token in directive"); return false; } + const MCSection *Section = getStreamer().getCurrentSectionOnly(); if (!Section) { getStreamer().InitSections(false); - Section = getStreamer().getCurrentSection().first; + Section = getStreamer().getCurrentSectionOnly(); } if (Section->UseCodeAlign()) getStreamer().EmitCodeAlignment(2, 0); @@ -3001,12 +3137,21 @@ bool X86AsmParser::ParseDirectiveWord(unsigned Size, SMLoc L) { /// ::= .code16 | .code32 | .code64 bool X86AsmParser::ParseDirectiveCode(StringRef IDVal, SMLoc L) { MCAsmParser &Parser = getParser(); + Code16GCC = false; if (IDVal == ".code16") { Parser.Lex(); if (!is16BitMode()) { SwitchMode(X86::Mode16Bit); getParser().getStreamer().EmitAssemblerFlag(MCAF_Code16); } + } else if (IDVal == ".code16gcc") { + // .code16gcc parses as if in 32-bit mode, but emits code in 16-bit mode. 
+ Parser.Lex(); + Code16GCC = true; + if (!is16BitMode()) { + SwitchMode(X86::Mode16Bit); + getParser().getStreamer().EmitAssemblerFlag(MCAF_Code16); + } } else if (IDVal == ".code32") { Parser.Lex(); if (!is32BitMode()) { @@ -3029,8 +3174,8 @@ bool X86AsmParser::ParseDirectiveCode(StringRef IDVal, SMLoc L) { // Force static initialization. extern "C" void LLVMInitializeX86AsmParser() { - RegisterMCAsmParser<X86AsmParser> X(TheX86_32Target); - RegisterMCAsmParser<X86AsmParser> Y(TheX86_64Target); + RegisterMCAsmParser<X86AsmParser> X(getTheX86_32Target()); + RegisterMCAsmParser<X86AsmParser> Y(getTheX86_64Target()); } #define GET_REGISTER_MATCHER diff --git a/contrib/llvm/lib/Target/X86/AsmParser/X86Operand.h b/contrib/llvm/lib/Target/X86/AsmParser/X86Operand.h index a04c2f5..9db1a84 100644 --- a/contrib/llvm/lib/Target/X86/AsmParser/X86Operand.h +++ b/contrib/llvm/lib/Target/X86/AsmParser/X86Operand.h @@ -192,8 +192,10 @@ struct X86Operand : public MCParsedAsmOperand { bool isImmUnsignedi8() const { if (!isImm()) return false; + // If this isn't a constant expr, just assume it fits and let relaxation + // handle it. const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); - if (!CE) return false; + if (!CE) return true; return isImmUnsignedi8Value(CE->getValue()); } diff --git a/contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp b/contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp index 008dead..0871888 100644 --- a/contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp +++ b/contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp @@ -96,7 +96,7 @@ void llvm::X86Disassembler::Debug(const char *file, unsigned line, dbgs() << file << ":" << line << ": " << s; } -const char *llvm::X86Disassembler::GetInstrName(unsigned Opcode, +StringRef llvm::X86Disassembler::GetInstrName(unsigned Opcode, const void *mii) { const MCInstrInfo *MII = static_cast<const MCInstrInfo *>(mii); return MII->getName(Opcode); @@ -470,10 +470,20 @@ static void translateImmediate(MCInst &mcInst, uint64_t immediate, case X86::VCMPPSZrmi: NewOpc = X86::VCMPPSZrmi_alt; break; case X86::VCMPPSZrri: NewOpc = X86::VCMPPSZrri_alt; break; case X86::VCMPPSZrrib: NewOpc = X86::VCMPPSZrrib_alt; break; - case X86::VCMPSDZrm: NewOpc = X86::VCMPSDZrmi_alt; break; - case X86::VCMPSDZrr: NewOpc = X86::VCMPSDZrri_alt; break; - case X86::VCMPSSZrm: NewOpc = X86::VCMPSSZrmi_alt; break; - case X86::VCMPSSZrr: NewOpc = X86::VCMPSSZrri_alt; break; + case X86::VCMPPDZ128rmi: NewOpc = X86::VCMPPDZ128rmi_alt; break; + case X86::VCMPPDZ128rri: NewOpc = X86::VCMPPDZ128rri_alt; break; + case X86::VCMPPSZ128rmi: NewOpc = X86::VCMPPSZ128rmi_alt; break; + case X86::VCMPPSZ128rri: NewOpc = X86::VCMPPSZ128rri_alt; break; + case X86::VCMPPDZ256rmi: NewOpc = X86::VCMPPDZ256rmi_alt; break; + case X86::VCMPPDZ256rri: NewOpc = X86::VCMPPDZ256rri_alt; break; + case X86::VCMPPSZ256rmi: NewOpc = X86::VCMPPSZ256rmi_alt; break; + case X86::VCMPPSZ256rri: NewOpc = X86::VCMPPSZ256rri_alt; break; + case X86::VCMPSDZrm_Int: NewOpc = X86::VCMPSDZrmi_alt; break; + case X86::VCMPSDZrr_Int: NewOpc = X86::VCMPSDZrri_alt; break; + case X86::VCMPSDZrrb_Int: NewOpc = X86::VCMPSDZrrb_alt; break; + case X86::VCMPSSZrm_Int: NewOpc = X86::VCMPSSZrmi_alt; break; + case X86::VCMPSSZrr_Int: NewOpc = X86::VCMPSSZrri_alt; break; + case X86::VCMPSSZrrb_Int: NewOpc = X86::VCMPSSZrrb_alt; break; } // Switch opcode to the one that doesn't get special printing. 
mcInst.setOpcode(NewOpc); @@ -1066,8 +1076,8 @@ static MCDisassembler *createX86Disassembler(const Target &T, extern "C" void LLVMInitializeX86Disassembler() { // Register the disassembler. - TargetRegistry::RegisterMCDisassembler(TheX86_32Target, + TargetRegistry::RegisterMCDisassembler(getTheX86_32Target(), createX86Disassembler); - TargetRegistry::RegisterMCDisassembler(TheX86_64Target, + TargetRegistry::RegisterMCDisassembler(getTheX86_64Target(), createX86Disassembler); } diff --git a/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp b/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp index b0a150a..ab64d6f 100644 --- a/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp +++ b/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp @@ -825,7 +825,7 @@ static int getIDWithAttrMask(uint16_t* instructionID, * @param orig - The instruction that is not 16-bit * @param equiv - The instruction that is 16-bit */ -static bool is16BitEquivalent(const char* orig, const char* equiv) { +static bool is16BitEquivalent(const char *orig, const char *equiv) { off_t i; for (i = 0;; i++) { @@ -850,7 +850,7 @@ static bool is16BitEquivalent(const char* orig, const char* equiv) { * * @param name - The instruction that is not 16-bit */ -static bool is64Bit(const char* name) { +static bool is64Bit(const char *name) { off_t i; for (i = 0;; ++i) { @@ -1044,9 +1044,9 @@ static int getID(struct InternalInstruction* insn, const void *miiArg) { return 0; } - const char *SpecName = GetInstrName(instructionIDWithREXW, miiArg); + auto SpecName = GetInstrName(instructionIDWithREXW, miiArg); // If not a 64-bit instruction. Switch the opcode. - if (!is64Bit(SpecName)) { + if (!is64Bit(SpecName.data())) { insn->instructionID = instructionIDWithREXW; insn->spec = specifierForUID(instructionIDWithREXW); return 0; @@ -1092,7 +1092,7 @@ static int getID(struct InternalInstruction* insn, const void *miiArg) { const struct InstructionSpecifier *spec; uint16_t instructionIDWithOpsize; - const char *specName, *specWithOpSizeName; + llvm::StringRef specName, specWithOpSizeName; spec = specifierForUID(instructionID); @@ -1112,7 +1112,7 @@ static int getID(struct InternalInstruction* insn, const void *miiArg) { specName = GetInstrName(instructionID, miiArg); specWithOpSizeName = GetInstrName(instructionIDWithOpsize, miiArg); - if (is16BitEquivalent(specName, specWithOpSizeName) && + if (is16BitEquivalent(specName.data(), specWithOpSizeName.data()) && (insn->mode == MODE_16BIT) ^ insn->prefixPresent[0x66]) { insn->instructionID = instructionIDWithOpsize; insn->spec = specifierForUID(instructionIDWithOpsize); diff --git a/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h b/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h index 24d24a2..b07fd0b 100644 --- a/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h +++ b/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h @@ -674,7 +674,7 @@ int decodeInstruction(InternalInstruction *insn, /// \param s The message to print. 
void Debug(const char *file, unsigned line, const char *s); -const char *GetInstrName(unsigned Opcode, const void *mii); +StringRef GetInstrName(unsigned Opcode, const void *mii); } // namespace X86Disassembler } // namespace llvm diff --git a/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp b/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp index 3a5d056..10b7e6f 100644 --- a/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp +++ b/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp @@ -291,6 +291,9 @@ void X86ATTInstPrinter::printMemOffset(const MCInst *MI, unsigned Op, void X86ATTInstPrinter::printU8Imm(const MCInst *MI, unsigned Op, raw_ostream &O) { + if (MI->getOperand(Op).isExpr()) + return printOperand(MI, Op, O); + O << markup("<imm:") << '$' << formatImm(MI->getOperand(Op).getImm() & 0xff) << markup(">"); } diff --git a/contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.cpp b/contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.cpp index f537956..8594add 100644 --- a/contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.cpp +++ b/contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.cpp @@ -255,6 +255,10 @@ static std::string getMaskName(const MCInst *MI, const char *DestName, CASE_MASKZ_UNPCK(UNPCKLPS, r) CASE_MASKZ_SHUF(PALIGNR, r) CASE_MASKZ_SHUF(PALIGNR, m) + CASE_MASKZ_SHUF(ALIGNQ, r) + CASE_MASKZ_SHUF(ALIGNQ, m) + CASE_MASKZ_SHUF(ALIGND, r) + CASE_MASKZ_SHUF(ALIGND, m) CASE_MASKZ_SHUF(SHUFPD, m) CASE_MASKZ_SHUF(SHUFPD, r) CASE_MASKZ_SHUF(SHUFPS, m) @@ -277,6 +281,26 @@ static std::string getMaskName(const MCInst *MI, const char *DestName, CASE_MASKZ_VSHUF(64X2, r) CASE_MASKZ_VSHUF(32X4, m) CASE_MASKZ_VSHUF(32X4, r) + CASE_MASKZ_INS_COMMON(BROADCASTF64X2, Z128, rm) + CASE_MASKZ_INS_COMMON(BROADCASTI64X2, Z128, rm) + CASE_MASKZ_INS_COMMON(BROADCASTF64X2, , rm) + CASE_MASKZ_INS_COMMON(BROADCASTI64X2, , rm) + CASE_MASKZ_INS_COMMON(BROADCASTF64X4, , rm) + CASE_MASKZ_INS_COMMON(BROADCASTI64X4, , rm) + CASE_MASKZ_INS_COMMON(BROADCASTF32X4, Z256, rm) + CASE_MASKZ_INS_COMMON(BROADCASTI32X4, Z256, rm) + CASE_MASKZ_INS_COMMON(BROADCASTF32X4, , rm) + CASE_MASKZ_INS_COMMON(BROADCASTI32X4, , rm) + CASE_MASKZ_INS_COMMON(BROADCASTF32X8, , rm) + CASE_MASKZ_INS_COMMON(BROADCASTI32X8, , rm) + CASE_MASKZ_INS_COMMON(BROADCASTF32X2, Z256, r) + CASE_MASKZ_INS_COMMON(BROADCASTI32X2, Z256, r) + CASE_MASKZ_INS_COMMON(BROADCASTF32X2, Z256, m) + CASE_MASKZ_INS_COMMON(BROADCASTI32X2, Z256, m) + CASE_MASKZ_INS_COMMON(BROADCASTF32X2, Z, r) + CASE_MASKZ_INS_COMMON(BROADCASTI32X2, Z, r) + CASE_MASKZ_INS_COMMON(BROADCASTF32X2, Z, m) + CASE_MASKZ_INS_COMMON(BROADCASTI32X2, Z, m) MaskWithZero = true; MaskRegName = getRegName(MI->getOperand(1).getReg()); break; @@ -320,6 +344,10 @@ static std::string getMaskName(const MCInst *MI, const char *DestName, CASE_MASK_UNPCK(UNPCKLPS, r) CASE_MASK_SHUF(PALIGNR, r) CASE_MASK_SHUF(PALIGNR, m) + CASE_MASK_SHUF(ALIGNQ, r) + CASE_MASK_SHUF(ALIGNQ, m) + CASE_MASK_SHUF(ALIGND, r) + CASE_MASK_SHUF(ALIGND, m) CASE_MASK_SHUF(SHUFPD, m) CASE_MASK_SHUF(SHUFPD, r) CASE_MASK_SHUF(SHUFPS, m) @@ -342,6 +370,26 @@ static std::string getMaskName(const MCInst *MI, const char *DestName, CASE_MASK_VSHUF(64X2, r) CASE_MASK_VSHUF(32X4, m) CASE_MASK_VSHUF(32X4, r) + CASE_MASK_INS_COMMON(BROADCASTF64X2, Z128, rm) + CASE_MASK_INS_COMMON(BROADCASTI64X2, Z128, rm) + CASE_MASK_INS_COMMON(BROADCASTF64X2, , rm) + CASE_MASK_INS_COMMON(BROADCASTI64X2, , rm) + CASE_MASK_INS_COMMON(BROADCASTF64X4, , rm) + CASE_MASK_INS_COMMON(BROADCASTI64X4, , 
rm) + CASE_MASK_INS_COMMON(BROADCASTF32X4, Z256, rm) + CASE_MASK_INS_COMMON(BROADCASTI32X4, Z256, rm) + CASE_MASK_INS_COMMON(BROADCASTF32X4, , rm) + CASE_MASK_INS_COMMON(BROADCASTI32X4, , rm) + CASE_MASK_INS_COMMON(BROADCASTF32X8, , rm) + CASE_MASK_INS_COMMON(BROADCASTI32X8, , rm) + CASE_MASK_INS_COMMON(BROADCASTF32X2, Z256, r) + CASE_MASK_INS_COMMON(BROADCASTI32X2, Z256, r) + CASE_MASK_INS_COMMON(BROADCASTF32X2, Z256, m) + CASE_MASK_INS_COMMON(BROADCASTI32X2, Z256, m) + CASE_MASK_INS_COMMON(BROADCASTF32X2, Z, r) + CASE_MASK_INS_COMMON(BROADCASTI32X2, Z, r) + CASE_MASK_INS_COMMON(BROADCASTF32X2, Z, m) + CASE_MASK_INS_COMMON(BROADCASTI32X2, Z, m) MaskRegName = getRegName(MI->getOperand(2).getReg()); break; } @@ -382,7 +430,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, case X86::VBLENDPDrri: case X86::VBLENDPDYrri: Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. + LLVM_FALLTHROUGH; case X86::BLENDPDrmi: case X86::VBLENDPDrmi: case X86::VBLENDPDYrmi: @@ -398,7 +446,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, case X86::VBLENDPSrri: case X86::VBLENDPSYrri: Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. + LLVM_FALLTHROUGH; case X86::BLENDPSrmi: case X86::VBLENDPSrmi: case X86::VBLENDPSYrmi: @@ -414,7 +462,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, case X86::VPBLENDWrri: case X86::VPBLENDWYrri: Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. + LLVM_FALLTHROUGH; case X86::PBLENDWrmi: case X86::VPBLENDWrmi: case X86::VPBLENDWYrmi: @@ -429,7 +477,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, case X86::VPBLENDDrri: case X86::VPBLENDDYrri: Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. + LLVM_FALLTHROUGH; case X86::VPBLENDDrmi: case X86::VPBLENDDYrmi: if (MI->getOperand(NumOperands - 1).isImm()) @@ -442,12 +490,12 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, case X86::INSERTPSrr: case X86::VINSERTPSrr: - case X86::VINSERTPSzrr: + case X86::VINSERTPSZrr: Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. + LLVM_FALLTHROUGH; case X86::INSERTPSrm: case X86::VINSERTPSrm: - case X86::VINSERTPSzrm: + case X86::VINSERTPSZrm: DestName = getRegName(MI->getOperand(0).getReg()); Src1Name = getRegName(MI->getOperand(1).getReg()); if (MI->getOperand(NumOperands - 1).isImm()) @@ -507,7 +555,8 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, CASE_MOVDUP(MOVSLDUP, r) Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg()); - // FALL THROUGH. + LLVM_FALLTHROUGH; + CASE_MOVDUP(MOVSLDUP, m) DestName = getRegName(MI->getOperand(0).getReg()); DecodeMOVSLDUPMask(getRegOperandVectorVT(MI, MVT::f32, 0), ShuffleMask); @@ -515,7 +564,8 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, CASE_MOVDUP(MOVSHDUP, r) Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg()); - // FALL THROUGH. + LLVM_FALLTHROUGH; + CASE_MOVDUP(MOVSHDUP, m) DestName = getRegName(MI->getOperand(0).getReg()); DecodeMOVSHDUPMask(getRegOperandVectorVT(MI, MVT::f32, 0), ShuffleMask); @@ -523,7 +573,8 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, CASE_MOVDUP(MOVDDUP, r) Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg()); - // FALL THROUGH. 
+ LLVM_FALLTHROUGH; + CASE_MOVDUP(MOVDDUP, m) DestName = getRegName(MI->getOperand(0).getReg()); DecodeMOVDDUPMask(getRegOperandVectorVT(MI, MVT::f64, 0), ShuffleMask); @@ -566,7 +617,8 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, CASE_SHUF(PALIGNR, rri) Src1Name = getRegName(MI->getOperand(NumOperands - 2).getReg()); RegForm = true; - // FALL THROUGH. + LLVM_FALLTHROUGH; + CASE_SHUF(PALIGNR, rmi) Src2Name = getRegName(MI->getOperand(NumOperands-(RegForm?3:7)).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); @@ -576,9 +628,46 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, ShuffleMask); break; + CASE_AVX512_INS_COMMON(ALIGNQ, Z, rri) + CASE_AVX512_INS_COMMON(ALIGNQ, Z256, rri) + CASE_AVX512_INS_COMMON(ALIGNQ, Z128, rri) + Src1Name = getRegName(MI->getOperand(NumOperands - 2).getReg()); + RegForm = true; + LLVM_FALLTHROUGH; + + CASE_AVX512_INS_COMMON(ALIGNQ, Z, rmi) + CASE_AVX512_INS_COMMON(ALIGNQ, Z256, rmi) + CASE_AVX512_INS_COMMON(ALIGNQ, Z128, rmi) + Src2Name = getRegName(MI->getOperand(NumOperands-(RegForm?3:7)).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + if (MI->getOperand(NumOperands - 1).isImm()) + DecodeVALIGNMask(getRegOperandVectorVT(MI, MVT::i64, 0), + MI->getOperand(NumOperands - 1).getImm(), + ShuffleMask); + break; + + CASE_AVX512_INS_COMMON(ALIGND, Z, rri) + CASE_AVX512_INS_COMMON(ALIGND, Z256, rri) + CASE_AVX512_INS_COMMON(ALIGND, Z128, rri) + Src1Name = getRegName(MI->getOperand(NumOperands - 2).getReg()); + RegForm = true; + LLVM_FALLTHROUGH; + + CASE_AVX512_INS_COMMON(ALIGND, Z, rmi) + CASE_AVX512_INS_COMMON(ALIGND, Z256, rmi) + CASE_AVX512_INS_COMMON(ALIGND, Z128, rmi) + Src2Name = getRegName(MI->getOperand(NumOperands-(RegForm?3:7)).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + if (MI->getOperand(NumOperands - 1).isImm()) + DecodeVALIGNMask(getRegOperandVectorVT(MI, MVT::i32, 0), + MI->getOperand(NumOperands - 1).getImm(), + ShuffleMask); + break; + CASE_SHUF(PSHUFD, ri) Src1Name = getRegName(MI->getOperand(NumOperands - 2).getReg()); - // FALL THROUGH. + LLVM_FALLTHROUGH; + CASE_SHUF(PSHUFD, mi) DestName = getRegName(MI->getOperand(0).getReg()); if (MI->getOperand(NumOperands - 1).isImm()) @@ -589,7 +678,8 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, CASE_SHUF(PSHUFHW, ri) Src1Name = getRegName(MI->getOperand(NumOperands - 2).getReg()); - // FALL THROUGH. + LLVM_FALLTHROUGH; + CASE_SHUF(PSHUFHW, mi) DestName = getRegName(MI->getOperand(0).getReg()); if (MI->getOperand(NumOperands - 1).isImm()) @@ -600,7 +690,8 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, CASE_SHUF(PSHUFLW, ri) Src1Name = getRegName(MI->getOperand(NumOperands - 2).getReg()); - // FALL THROUGH. + LLVM_FALLTHROUGH; + CASE_SHUF(PSHUFLW, mi) DestName = getRegName(MI->getOperand(0).getReg()); if (MI->getOperand(NumOperands - 1).isImm()) @@ -611,7 +702,8 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, case X86::MMX_PSHUFWri: Src1Name = getRegName(MI->getOperand(1).getReg()); - // FALL THROUGH. + LLVM_FALLTHROUGH; + case X86::MMX_PSHUFWmi: DestName = getRegName(MI->getOperand(0).getReg()); if (MI->getOperand(NumOperands - 1).isImm()) @@ -622,7 +714,8 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, case X86::PSWAPDrr: Src1Name = getRegName(MI->getOperand(1).getReg()); - // FALL THROUGH. 
+ LLVM_FALLTHROUGH; + case X86::PSWAPDrm: DestName = getRegName(MI->getOperand(0).getReg()); DecodePSWAPMask(MVT::v2i32, ShuffleMask); @@ -632,7 +725,8 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, case X86::MMX_PUNPCKHBWirr: Src2Name = getRegName(MI->getOperand(NumOperands - 1).getReg()); RegForm = true; - // FALL THROUGH. + LLVM_FALLTHROUGH; + CASE_UNPCK(PUNPCKHBW, m) case X86::MMX_PUNPCKHBWirm: Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg()); @@ -644,7 +738,8 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, case X86::MMX_PUNPCKHWDirr: Src2Name = getRegName(MI->getOperand(NumOperands - 1).getReg()); RegForm = true; - // FALL THROUGH. + LLVM_FALLTHROUGH; + CASE_UNPCK(PUNPCKHWD, m) case X86::MMX_PUNPCKHWDirm: Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg()); @@ -656,7 +751,8 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, case X86::MMX_PUNPCKHDQirr: Src2Name = getRegName(MI->getOperand(NumOperands - 1).getReg()); RegForm = true; - // FALL THROUGH. + LLVM_FALLTHROUGH; + CASE_UNPCK(PUNPCKHDQ, m) case X86::MMX_PUNPCKHDQirm: Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg()); @@ -667,7 +763,8 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, CASE_UNPCK(PUNPCKHQDQ, r) Src2Name = getRegName(MI->getOperand(NumOperands - 1).getReg()); RegForm = true; - // FALL THROUGH. + LLVM_FALLTHROUGH; + CASE_UNPCK(PUNPCKHQDQ, m) Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); @@ -678,7 +775,8 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, case X86::MMX_PUNPCKLBWirr: Src2Name = getRegName(MI->getOperand(NumOperands - 1).getReg()); RegForm = true; - // FALL THROUGH. + LLVM_FALLTHROUGH; + CASE_UNPCK(PUNPCKLBW, m) case X86::MMX_PUNPCKLBWirm: Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg()); @@ -690,7 +788,8 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, case X86::MMX_PUNPCKLWDirr: Src2Name = getRegName(MI->getOperand(NumOperands - 1).getReg()); RegForm = true; - // FALL THROUGH. + LLVM_FALLTHROUGH; + CASE_UNPCK(PUNPCKLWD, m) case X86::MMX_PUNPCKLWDirm: Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg()); @@ -702,7 +801,8 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, case X86::MMX_PUNPCKLDQirr: Src2Name = getRegName(MI->getOperand(NumOperands - 1).getReg()); RegForm = true; - // FALL THROUGH. + LLVM_FALLTHROUGH; + CASE_UNPCK(PUNPCKLDQ, m) case X86::MMX_PUNPCKLDQirm: Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg()); @@ -713,7 +813,8 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, CASE_UNPCK(PUNPCKLQDQ, r) Src2Name = getRegName(MI->getOperand(NumOperands - 1).getReg()); RegForm = true; - // FALL THROUGH. + LLVM_FALLTHROUGH; + CASE_UNPCK(PUNPCKLQDQ, m) Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); @@ -723,7 +824,8 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, CASE_SHUF(SHUFPD, rri) Src2Name = getRegName(MI->getOperand(NumOperands - 2).getReg()); RegForm = true; - // FALL THROUGH. 
+ LLVM_FALLTHROUGH; + CASE_SHUF(SHUFPD, rmi) if (MI->getOperand(NumOperands - 1).isImm()) DecodeSHUFPMask(getRegOperandVectorVT(MI, MVT::f64, 0), @@ -736,7 +838,8 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, CASE_SHUF(SHUFPS, rri) Src2Name = getRegName(MI->getOperand(NumOperands - 2).getReg()); RegForm = true; - // FALL THROUGH. + LLVM_FALLTHROUGH; + CASE_SHUF(SHUFPS, rmi) if (MI->getOperand(NumOperands - 1).isImm()) DecodeSHUFPMask(getRegOperandVectorVT(MI, MVT::f32, 0), @@ -749,7 +852,8 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, CASE_VSHUF(64X2, r) Src2Name = getRegName(MI->getOperand(NumOperands - 2).getReg()); RegForm = true; - // FALL THROUGH. + LLVM_FALLTHROUGH; + CASE_VSHUF(64X2, m) decodeVSHUF64x2FamilyMask(getRegOperandVectorVT(MI, MVT::i64, 0), MI->getOperand(NumOperands - 1).getImm(), @@ -761,7 +865,8 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, CASE_VSHUF(32X4, r) Src2Name = getRegName(MI->getOperand(NumOperands - 2).getReg()); RegForm = true; - // FALL THROUGH. + LLVM_FALLTHROUGH; + CASE_VSHUF(32X4, m) decodeVSHUF64x2FamilyMask(getRegOperandVectorVT(MI, MVT::i32, 0), MI->getOperand(NumOperands - 1).getImm(), @@ -773,7 +878,8 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, CASE_UNPCK(UNPCKLPD, r) Src2Name = getRegName(MI->getOperand(NumOperands - 1).getReg()); RegForm = true; - // FALL THROUGH. + LLVM_FALLTHROUGH; + CASE_UNPCK(UNPCKLPD, m) DecodeUNPCKLMask(getRegOperandVectorVT(MI, MVT::f64, 0), ShuffleMask); Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg()); @@ -783,7 +889,8 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, CASE_UNPCK(UNPCKLPS, r) Src2Name = getRegName(MI->getOperand(NumOperands - 1).getReg()); RegForm = true; - // FALL THROUGH. + LLVM_FALLTHROUGH; + CASE_UNPCK(UNPCKLPS, m) DecodeUNPCKLMask(getRegOperandVectorVT(MI, MVT::f32, 0), ShuffleMask); Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg()); @@ -793,7 +900,8 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, CASE_UNPCK(UNPCKHPD, r) Src2Name = getRegName(MI->getOperand(NumOperands - 1).getReg()); RegForm = true; - // FALL THROUGH. + LLVM_FALLTHROUGH; + CASE_UNPCK(UNPCKHPD, m) DecodeUNPCKHMask(getRegOperandVectorVT(MI, MVT::f64, 0), ShuffleMask); Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg()); @@ -803,7 +911,8 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, CASE_UNPCK(UNPCKHPS, r) Src2Name = getRegName(MI->getOperand(NumOperands - 1).getReg()); RegForm = true; - // FALL THROUGH. + LLVM_FALLTHROUGH; + CASE_UNPCK(UNPCKHPS, m) DecodeUNPCKHMask(getRegOperandVectorVT(MI, MVT::f32, 0), ShuffleMask); Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg()); @@ -812,7 +921,8 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, CASE_VPERMILPI(PERMILPS, r) Src1Name = getRegName(MI->getOperand(NumOperands - 2).getReg()); - // FALL THROUGH. + LLVM_FALLTHROUGH; + CASE_VPERMILPI(PERMILPS, m) if (MI->getOperand(NumOperands - 1).isImm()) DecodePSHUFMask(getRegOperandVectorVT(MI, MVT::f32, 0), @@ -823,7 +933,8 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, CASE_VPERMILPI(PERMILPD, r) Src1Name = getRegName(MI->getOperand(NumOperands - 2).getReg()); - // FALL THROUGH. 
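These comment printers all follow one pattern: a Decode* helper expands the control immediate into a vector of element indices, and the printer renders those indices as an assembly comment. A standalone sketch of a PSHUFD-style decode, where each 2-bit field of the immediate selects one of the four dword elements; the helper name and the use of std::vector instead of SmallVectorImpl are mine:

#include <cstdint>
#include <cstdio>
#include <vector>

// PSHUFD-style decode: every 2-bit field of Imm picks one of the 4 dword
// elements of the 128-bit lane, mirroring the shape of the DecodePSHUFMask
// calls above.
static std::vector<int> decodePshufdMask(unsigned Imm) {
  std::vector<int> Mask;
  for (unsigned i = 0; i != 4; ++i)
    Mask.push_back((Imm >> (2 * i)) & 0x3);
  return Mask;
}

int main() {
  // 0x1B = 00 01 10 11b, i.e. reversed element order [3,2,1,0].
  for (int Idx : decodePshufdMask(0x1B))
    std::printf("%d ", Idx); // prints: 3 2 1 0
  std::printf("\n");
  return 0;
}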
+ LLVM_FALLTHROUGH; + CASE_VPERMILPI(PERMILPD, m) if (MI->getOperand(NumOperands - 1).isImm()) DecodePSHUFMask(getRegOperandVectorVT(MI, MVT::f64, 0), @@ -835,7 +946,8 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, case X86::VPERM2F128rr: case X86::VPERM2I128rr: Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. + LLVM_FALLTHROUGH; + case X86::VPERM2F128rm: case X86::VPERM2I128rm: // For instruction comments purpose, assume the 256-bit vector is v4i64. @@ -849,7 +961,8 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, CASE_VPERM(PERMPD, r) Src1Name = getRegName(MI->getOperand(NumOperands - 2).getReg()); - // FALL THROUGH. + LLVM_FALLTHROUGH; + CASE_VPERM(PERMPD, m) if (MI->getOperand(NumOperands - 1).isImm()) DecodeVPERMMask(getRegOperandVectorVT(MI, MVT::f64, 0), @@ -860,7 +973,8 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, CASE_VPERM(PERMQ, r) Src1Name = getRegName(MI->getOperand(NumOperands - 2).getReg()); - // FALL THROUGH. + LLVM_FALLTHROUGH; + CASE_VPERM(PERMQ, m) if (MI->getOperand(NumOperands - 1).isImm()) DecodeVPERMMask(getRegOperandVectorVT(MI, MVT::i64, 0), @@ -874,7 +988,8 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, case X86::VMOVSDZrr: Src2Name = getRegName(MI->getOperand(2).getReg()); Src1Name = getRegName(MI->getOperand(1).getReg()); - // FALL THROUGH. + LLVM_FALLTHROUGH; + case X86::MOVSDrm: case X86::VMOVSDrm: case X86::VMOVSDZrm: @@ -887,7 +1002,8 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, case X86::VMOVSSZrr: Src2Name = getRegName(MI->getOperand(2).getReg()); Src1Name = getRegName(MI->getOperand(1).getReg()); - // FALL THROUGH. + LLVM_FALLTHROUGH; + case X86::MOVSSrm: case X86::VMOVSSrm: case X86::VMOVSSZrm: @@ -901,15 +1017,11 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, case X86::VMOVZPQILo2PQIrr: case X86::VMOVZPQILo2PQIZrr: Src1Name = getRegName(MI->getOperand(1).getReg()); - // FALL THROUGH. 
+ LLVM_FALLTHROUGH; + case X86::MOVQI2PQIrm: - case X86::MOVZQI2PQIrm: - case X86::MOVZPQILo2PQIrm: case X86::VMOVQI2PQIrm: case X86::VMOVQI2PQIZrm: - case X86::VMOVZQI2PQIrm: - case X86::VMOVZPQILo2PQIrm: - case X86::VMOVZPQILo2PQIZrm: DecodeZeroMoveLowMask(MVT::v2i64, ShuffleMask); DestName = getRegName(MI->getOperand(0).getReg()); break; @@ -946,15 +1058,59 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, case X86::VBROADCASTF128: case X86::VBROADCASTI128: + CASE_AVX512_INS_COMMON(BROADCASTF64X2, Z128, rm) + CASE_AVX512_INS_COMMON(BROADCASTI64X2, Z128, rm) DecodeSubVectorBroadcast(MVT::v4f64, MVT::v2f64, ShuffleMask); DestName = getRegName(MI->getOperand(0).getReg()); break; + CASE_AVX512_INS_COMMON(BROADCASTF64X2, , rm) + CASE_AVX512_INS_COMMON(BROADCASTI64X2, , rm) + DecodeSubVectorBroadcast(MVT::v8f64, MVT::v2f64, ShuffleMask); + DestName = getRegName(MI->getOperand(0).getReg()); + break; + CASE_AVX512_INS_COMMON(BROADCASTF64X4, , rm) + CASE_AVX512_INS_COMMON(BROADCASTI64X4, , rm) + DecodeSubVectorBroadcast(MVT::v8f64, MVT::v4f64, ShuffleMask); + DestName = getRegName(MI->getOperand(0).getReg()); + break; + CASE_AVX512_INS_COMMON(BROADCASTF32X4, Z256, rm) + CASE_AVX512_INS_COMMON(BROADCASTI32X4, Z256, rm) + DecodeSubVectorBroadcast(MVT::v8f32, MVT::v4f32, ShuffleMask); + DestName = getRegName(MI->getOperand(0).getReg()); + break; + CASE_AVX512_INS_COMMON(BROADCASTF32X4, , rm) + CASE_AVX512_INS_COMMON(BROADCASTI32X4, , rm) + DecodeSubVectorBroadcast(MVT::v16f32, MVT::v4f32, ShuffleMask); + DestName = getRegName(MI->getOperand(0).getReg()); + break; + CASE_AVX512_INS_COMMON(BROADCASTF32X8, , rm) + CASE_AVX512_INS_COMMON(BROADCASTI32X8, , rm) + DecodeSubVectorBroadcast(MVT::v16f32, MVT::v8f32, ShuffleMask); + DestName = getRegName(MI->getOperand(0).getReg()); + break; + CASE_AVX512_INS_COMMON(BROADCASTF32X2, Z256, r) + CASE_AVX512_INS_COMMON(BROADCASTI32X2, Z256, r) + Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg()); + CASE_AVX512_INS_COMMON(BROADCASTF32X2, Z256, m) + CASE_AVX512_INS_COMMON(BROADCASTI32X2, Z256, m) + DecodeSubVectorBroadcast(MVT::v8f32, MVT::v2f32, ShuffleMask); + DestName = getRegName(MI->getOperand(0).getReg()); + break; + CASE_AVX512_INS_COMMON(BROADCASTF32X2, Z, r) + CASE_AVX512_INS_COMMON(BROADCASTI32X2, Z, r) + Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg()); + CASE_AVX512_INS_COMMON(BROADCASTF32X2, Z, m) + CASE_AVX512_INS_COMMON(BROADCASTI32X2, Z, m) + DecodeSubVectorBroadcast(MVT::v16f32, MVT::v2f32, ShuffleMask); + DestName = getRegName(MI->getOperand(0).getReg()); + break; CASE_PMOVZX(PMOVZXBW, r) CASE_PMOVZX(PMOVZXBD, r) CASE_PMOVZX(PMOVZXBQ, r) Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg()); - // FALL THROUGH. + LLVM_FALLTHROUGH; + CASE_PMOVZX(PMOVZXBW, m) CASE_PMOVZX(PMOVZXBD, m) CASE_PMOVZX(PMOVZXBQ, m) @@ -965,7 +1121,8 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, CASE_PMOVZX(PMOVZXWD, r) CASE_PMOVZX(PMOVZXWQ, r) Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg()); - // FALL THROUGH. + LLVM_FALLTHROUGH; + CASE_PMOVZX(PMOVZXWD, m) CASE_PMOVZX(PMOVZXWQ, m) DecodeZeroExtendMask(MVT::i16, getZeroExtensionResultType(MI), ShuffleMask); @@ -974,7 +1131,8 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, CASE_PMOVZX(PMOVZXDQ, r) Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg()); - // FALL THROUGH. 
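The new BROADCAST[FI]32/64X* cases above all call DecodeSubVectorBroadcast, which builds a mask that tiles the source subvector across the wider destination. A hedged sketch of that mask construction, expressed with plain element counts instead of MVTs:

#include <cstdio>
#include <vector>

// A subvector broadcast repeats an M-element source across an N-element
// destination, so the shuffle mask is just 0..M-1 tiled N/M times.
static std::vector<int> decodeSubVectorBroadcast(int DstElts, int SrcElts) {
  std::vector<int> Mask;
  for (int i = 0; i != DstElts; ++i)
    Mask.push_back(i % SrcElts);
  return Mask;
}

int main() {
  // v8f32 destination built from a v2f32 source (the 256-bit
  // BROADCASTF32X2 case): mask is 0,1,0,1,0,1,0,1.
  for (int Idx : decodeSubVectorBroadcast(8, 2))
    std::printf("%d ", Idx);
  std::printf("\n");
  return 0;
}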
+ LLVM_FALLTHROUGH; + CASE_PMOVZX(PMOVZXDQ, m) DecodeZeroExtendMask(MVT::i32, getZeroExtensionResultType(MI), ShuffleMask); DestName = getRegName(MI->getOperand(0).getReg()); diff --git a/contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.h b/contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.h index 687581b..c6d0d85 100644 --- a/contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.h +++ b/contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.h @@ -16,6 +16,11 @@ #define LLVM_LIB_TARGET_X86_INSTPRINTER_X86INSTCOMMENTS_H namespace llvm { + + enum AsmComments { + AC_EVEX_2_VEX = 0x2 // For instr that was compressed from EVEX to VEX. + }; + class MCInst; class raw_ostream; bool EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, diff --git a/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp b/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp index 879378f..4443edb 100644 --- a/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp +++ b/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp @@ -253,5 +253,8 @@ void X86IntelInstPrinter::printMemOffset(const MCInst *MI, unsigned Op, void X86IntelInstPrinter::printU8Imm(const MCInst *MI, unsigned Op, raw_ostream &O) { + if (MI->getOperand(Op).isExpr()) + return MI->getOperand(Op).getExpr()->print(O, &MAI); + O << formatImm(MI->getOperand(Op).getImm() & 0xff); } diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp index e77a0dc..e83ec9f 100644 --- a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp @@ -76,12 +76,12 @@ class X86AsmBackend : public MCAsmBackend { public: X86AsmBackend(const Target &T, StringRef CPU) : MCAsmBackend(), CPU(CPU), - MaxNopLength((CPU == "slm" || CPU == "lakemont") ? 7 : 15) { + MaxNopLength((CPU == "slm") ? 7 : 15) { HasNopl = CPU != "generic" && CPU != "i386" && CPU != "i486" && CPU != "i586" && CPU != "pentium" && CPU != "pentium-mmx" && CPU != "i686" && CPU != "k6" && CPU != "k6-2" && CPU != "k6-3" && CPU != "geode" && CPU != "winchip-c6" && CPU != "winchip2" && - CPU != "c3" && CPU != "c3-2"; + CPU != "c3" && CPU != "c3-2" && CPU != "lakemont"; } unsigned getNumFixupKinds() const override { @@ -546,8 +546,12 @@ protected: // .cfi_def_cfa_register %rbp // HasFP = true; - assert(MRI.getLLVMRegNum(Inst.getRegister(), true) == - (Is64Bit ? X86::RBP : X86::EBP) && "Invalid frame pointer!"); + + // If the frame pointer is other than esp/rsp, we do not have a way to + // generate a compact unwinding representation, so bail out. + if (MRI.getLLVMRegNum(Inst.getRegister(), true) != + (Is64Bit ? X86::RBP : X86::EBP)) + return 0; // Reset the counts. 
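The X86AsmBackend hunk above stops asserting when the frame pointer is something other than RBP/EBP and instead returns 0, meaning "no compact unwind encoding, fall back to DWARF CFI". A small illustrative sketch of that convention; the Prologue struct and the mode constant are placeholders, not Darwin's full compact-unwind format:

#include <cstdint>
#include <cstdio>

// Illustrative compact-unwind builder: a zero encoding conventionally means
// "cannot encode compactly, use DWARF CFI instead", which is what the hunk
// above now returns instead of asserting.
enum : uint32_t { DEMO_UNWIND_MODE_BP_FRAME = 0x01000000 };

struct Prologue {
  bool UsesFramePointer;
  bool FramePointerIsRBP; // the compact format only describes rbp/ebp frames
};

static uint32_t buildCompactUnwind(const Prologue &P) {
  if (P.UsesFramePointer && !P.FramePointerIsRBP)
    return 0; // unsupported shape: no compact encoding
  if (P.UsesFramePointer)
    return DEMO_UNWIND_MODE_BP_FRAME;
  return 0; // (real code would try the frameless modes here)
}

int main() {
  Prologue P{true, false};
  std::printf("encoding = 0x%08x\n",
              static_cast<unsigned>(buildCompactUnwind(P))); // 0x00000000
  return 0;
}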
memset(SavedRegs, 0, sizeof(SavedRegs)); @@ -837,7 +841,8 @@ public: MCAsmBackend *llvm::createX86_32AsmBackend(const Target &T, const MCRegisterInfo &MRI, const Triple &TheTriple, - StringRef CPU) { + StringRef CPU, + const MCTargetOptions &Options) { if (TheTriple.isOSBinFormatMachO()) return new DarwinX86_32AsmBackend(T, MRI, CPU); @@ -855,7 +860,8 @@ MCAsmBackend *llvm::createX86_32AsmBackend(const Target &T, MCAsmBackend *llvm::createX86_64AsmBackend(const Target &T, const MCRegisterInfo &MRI, const Triple &TheTriple, - StringRef CPU) { + StringRef CPU, + const MCTargetOptions &Options) { if (TheTriple.isOSBinFormatMachO()) { MachO::CPUSubTypeX86 CS = StringSwitch<MachO::CPUSubTypeX86>(TheTriple.getArchName()) diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h index b419517..aab5525 100644 --- a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h @@ -234,88 +234,114 @@ namespace X86II { /// their one register operand added to their opcode. AddRegFrm = 2, - /// MRMDestReg - This form is used for instructions that use the Mod/RM byte - /// to specify a destination, which in this case is a register. - /// - MRMDestReg = 3, - - /// MRMDestMem - This form is used for instructions that use the Mod/RM byte - /// to specify a destination, which in this case is memory. - /// - MRMDestMem = 4, - - /// MRMSrcReg - This form is used for instructions that use the Mod/RM byte - /// to specify a source, which in this case is a register. - /// - MRMSrcReg = 5, - - /// MRMSrcMem - This form is used for instructions that use the Mod/RM byte - /// to specify a source, which in this case is memory. - /// - MRMSrcMem = 6, - /// RawFrmMemOffs - This form is for instructions that store an absolute /// memory offset as an immediate with a possible segment override. - RawFrmMemOffs = 7, + RawFrmMemOffs = 3, /// RawFrmSrc - This form is for instructions that use the source index /// register SI/ESI/RSI with a possible segment override. - RawFrmSrc = 8, + RawFrmSrc = 4, /// RawFrmDst - This form is for instructions that use the destination index /// register DI/EDI/ESI. - RawFrmDst = 9, + RawFrmDst = 5, /// RawFrmSrc - This form is for instructions that use the source index /// register SI/ESI/ERI with a possible segment override, and also the /// destination index register DI/ESI/RDI. - RawFrmDstSrc = 10, + RawFrmDstSrc = 6, /// RawFrmImm8 - This is used for the ENTER instruction, which has two /// immediates, the first of which is a 16-bit immediate (specified by /// the imm encoding) and the second is a 8-bit fixed value. - RawFrmImm8 = 11, + RawFrmImm8 = 7, /// RawFrmImm16 - This is used for CALL FAR instructions, which have two /// immediates, the first of which is a 16 or 32-bit immediate (specified by /// the imm encoding) and the second is a 16-bit fixed value. In the AMD /// manual, this operand is described as pntr16:32 and pntr16:16 - RawFrmImm16 = 12, - - /// MRMX[rm] - The forms are used to represent instructions that use a - /// Mod/RM byte, and don't use the middle field for anything. - MRMXr = 14, MRMXm = 15, + RawFrmImm16 = 8, /// MRM[0-7][rm] - These forms are used to represent instructions that use /// a Mod/RM byte, and use the middle field to hold extended opcode /// information. In the intel manual these are represented as /0, /1, ... /// - // First, instructions that operate on a register r/m operand... 
- MRM0r = 16, MRM1r = 17, MRM2r = 18, MRM3r = 19, // Format /0 /1 /2 /3 - MRM4r = 20, MRM5r = 21, MRM6r = 22, MRM7r = 23, // Format /4 /5 /6 /7 + /// MRMDestMem - This form is used for instructions that use the Mod/RM byte + /// to specify a destination, which in this case is memory. + /// + MRMDestMem = 32, + + /// MRMSrcMem - This form is used for instructions that use the Mod/RM byte + /// to specify a source, which in this case is memory. + /// + MRMSrcMem = 33, + + /// MRMSrcMem4VOp3 - This form is used for instructions that encode + /// operand 3 with VEX.VVVV and load from memory. + /// + MRMSrcMem4VOp3 = 34, + + /// MRMSrcMemOp4 - This form is used for instructions that use the Mod/RM + /// byte to specify the fourth source, which in this case is memory. + /// + MRMSrcMemOp4 = 35, + + /// MRMXm - This form is used for instructions that use the Mod/RM byte + /// to specify a memory source, but doesn't use the middle field. + /// + MRMXm = 39, // Instruction that uses Mod/RM but not the middle field. // Next, instructions that operate on a memory r/m operand... - MRM0m = 24, MRM1m = 25, MRM2m = 26, MRM3m = 27, // Format /0 /1 /2 /3 - MRM4m = 28, MRM5m = 29, MRM6m = 30, MRM7m = 31, // Format /4 /5 /6 /7 - - //// MRM_XX - A mod/rm byte of exactly 0xXX. - MRM_C0 = 32, MRM_C1 = 33, MRM_C2 = 34, MRM_C3 = 35, - MRM_C4 = 36, MRM_C5 = 37, MRM_C6 = 38, MRM_C7 = 39, - MRM_C8 = 40, MRM_C9 = 41, MRM_CA = 42, MRM_CB = 43, - MRM_CC = 44, MRM_CD = 45, MRM_CE = 46, MRM_CF = 47, - MRM_D0 = 48, MRM_D1 = 49, MRM_D2 = 50, MRM_D3 = 51, - MRM_D4 = 52, MRM_D5 = 53, MRM_D6 = 54, MRM_D7 = 55, - MRM_D8 = 56, MRM_D9 = 57, MRM_DA = 58, MRM_DB = 59, - MRM_DC = 60, MRM_DD = 61, MRM_DE = 62, MRM_DF = 63, - MRM_E0 = 64, MRM_E1 = 65, MRM_E2 = 66, MRM_E3 = 67, - MRM_E4 = 68, MRM_E5 = 69, MRM_E6 = 70, MRM_E7 = 71, - MRM_E8 = 72, MRM_E9 = 73, MRM_EA = 74, MRM_EB = 75, - MRM_EC = 76, MRM_ED = 77, MRM_EE = 78, MRM_EF = 79, - MRM_F0 = 80, MRM_F1 = 81, MRM_F2 = 82, MRM_F3 = 83, - MRM_F4 = 84, MRM_F5 = 85, MRM_F6 = 86, MRM_F7 = 87, - MRM_F8 = 88, MRM_F9 = 89, MRM_FA = 90, MRM_FB = 91, - MRM_FC = 92, MRM_FD = 93, MRM_FE = 94, MRM_FF = 95, + MRM0m = 40, MRM1m = 41, MRM2m = 42, MRM3m = 43, // Format /0 /1 /2 /3 + MRM4m = 44, MRM5m = 45, MRM6m = 46, MRM7m = 47, // Format /4 /5 /6 /7 + + /// MRMDestReg - This form is used for instructions that use the Mod/RM byte + /// to specify a destination, which in this case is a register. + /// + MRMDestReg = 48, + + /// MRMSrcReg - This form is used for instructions that use the Mod/RM byte + /// to specify a source, which in this case is a register. + /// + MRMSrcReg = 49, + + /// MRMSrcReg4VOp3 - This form is used for instructions that encode + /// operand 3 with VEX.VVVV and do not load from memory. + /// + MRMSrcReg4VOp3 = 50, + + /// MRMSrcRegOp4 - This form is used for instructions that use the Mod/RM + /// byte to specify the fourth source, which in this case is a register. + /// + MRMSrcRegOp4 = 51, + + /// MRMXr - This form is used for instructions that use the Mod/RM byte + /// to specify a register source, but doesn't use the middle field. + /// + MRMXr = 55, // Instruction that uses Mod/RM but not the middle field. + + // Instructions that operate on a register r/m operand... + MRM0r = 56, MRM1r = 57, MRM2r = 58, MRM3r = 59, // Format /0 /1 /2 /3 + MRM4r = 60, MRM5r = 61, MRM6r = 62, MRM7r = 63, // Format /4 /5 /6 /7 + + /// MRM_XX - A mod/rm byte of exactly 0xXX. 
+ MRM_C0 = 64, MRM_C1 = 65, MRM_C2 = 66, MRM_C3 = 67, + MRM_C4 = 68, MRM_C5 = 69, MRM_C6 = 70, MRM_C7 = 71, + MRM_C8 = 72, MRM_C9 = 73, MRM_CA = 74, MRM_CB = 75, + MRM_CC = 76, MRM_CD = 77, MRM_CE = 78, MRM_CF = 79, + MRM_D0 = 80, MRM_D1 = 81, MRM_D2 = 82, MRM_D3 = 83, + MRM_D4 = 84, MRM_D5 = 85, MRM_D6 = 86, MRM_D7 = 87, + MRM_D8 = 88, MRM_D9 = 89, MRM_DA = 90, MRM_DB = 91, + MRM_DC = 92, MRM_DD = 93, MRM_DE = 94, MRM_DF = 95, + MRM_E0 = 96, MRM_E1 = 97, MRM_E2 = 98, MRM_E3 = 99, + MRM_E4 = 100, MRM_E5 = 101, MRM_E6 = 102, MRM_E7 = 103, + MRM_E8 = 104, MRM_E9 = 105, MRM_EA = 106, MRM_EB = 107, + MRM_EC = 108, MRM_ED = 109, MRM_EE = 110, MRM_EF = 111, + MRM_F0 = 112, MRM_F1 = 113, MRM_F2 = 114, MRM_F3 = 115, + MRM_F4 = 116, MRM_F5 = 117, MRM_F6 = 118, MRM_F7 = 119, + MRM_F8 = 120, MRM_F9 = 121, MRM_FA = 122, MRM_FB = 123, + MRM_FC = 124, MRM_FD = 125, MRM_FE = 126, MRM_FF = 127, FormMask = 127, @@ -403,12 +429,13 @@ namespace X86II { ImmMask = 15 << ImmShift, Imm8 = 1 << ImmShift, Imm8PCRel = 2 << ImmShift, - Imm16 = 3 << ImmShift, - Imm16PCRel = 4 << ImmShift, - Imm32 = 5 << ImmShift, - Imm32PCRel = 6 << ImmShift, - Imm32S = 7 << ImmShift, - Imm64 = 8 << ImmShift, + Imm8Reg = 3 << ImmShift, + Imm16 = 4 << ImmShift, + Imm16PCRel = 5 << ImmShift, + Imm32 = 6 << ImmShift, + Imm32PCRel = 7 << ImmShift, + Imm32S = 8 << ImmShift, + Imm64 = 9 << ImmShift, //===------------------------------------------------------------------===// // FP Instruction Classification... Zero is non-fp instruction. @@ -488,39 +515,15 @@ namespace X86II { VEX_4VShift = VEX_WShift + 1, VEX_4V = 1ULL << VEX_4VShift, - /// VEX_4VOp3 - Similar to VEX_4V, but used on instructions that encode - /// operand 3 with VEX.vvvv. - VEX_4VOp3Shift = VEX_4VShift + 1, - VEX_4VOp3 = 1ULL << VEX_4VOp3Shift, - - /// VEX_I8IMM - Specifies that the last register used in a AVX instruction, - /// must be encoded in the i8 immediate field. This usually happens in - /// instructions with 4 operands. - VEX_I8IMMShift = VEX_4VOp3Shift + 1, - VEX_I8IMM = 1ULL << VEX_I8IMMShift, - /// VEX_L - Stands for a bit in the VEX opcode prefix meaning the current /// instruction uses 256-bit wide registers. This is usually auto detected /// if a VR256 register is used, but some AVX instructions also have this /// field marked when using a f256 memory references. - VEX_LShift = VEX_I8IMMShift + 1, + VEX_LShift = VEX_4VShift + 1, VEX_L = 1ULL << VEX_LShift, - // VEX_LIG - Specifies that this instruction ignores the L-bit in the VEX - // prefix. Usually used for scalar instructions. Needed by disassembler. - VEX_LIGShift = VEX_LShift + 1, - VEX_LIG = 1ULL << VEX_LIGShift, - - // TODO: we should combine VEX_L and VEX_LIG together to form a 2-bit field - // with following encoding: - // - 00 V128 - // - 01 V256 - // - 10 V512 - // - 11 LIG (but, in insn encoding, leave VEX.L and EVEX.L in zeros. - // this will save 1 tsflag bit - // EVEX_K - Set if this instruction requires masking - EVEX_KShift = VEX_LIGShift + 1, + EVEX_KShift = VEX_LShift + 1, EVEX_K = 1ULL << EVEX_KShift, // EVEX_Z - Set if this instruction has EVEX.Z field set. @@ -548,13 +551,8 @@ namespace X86II { Has3DNow0F0FOpcodeShift = CD8_Scale_Shift + 7, Has3DNow0F0FOpcode = 1ULL << Has3DNow0F0FOpcodeShift, - /// MemOp4 - Used to indicate swapping of operand 3 and 4 to be encoded in - /// ModRM or I8IMM. This is used for FMA4 and XOP instructions. 
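The renumbering above keeps every encoding form inside the 7-bit FormMask field of TSFlags, and the former VEX_4VOp3 / VEX_I8IMM / MemOp4 flag bits become dedicated form and immediate-kind values instead. A standalone sketch of how such a packed descriptor word is queried; the field layout and constants below are made up for illustration and do not match the real X86II values:

#include <cassert>
#include <cstdint>

// Illustrative packed descriptor: a 7-bit "form" field in the low bits and
// a 4-bit "immediate kind" field above it, mirroring the FormMask / ImmMask
// idea in X86BaseInfo.h.
namespace Desc {
enum : uint64_t {
  FormMask = 0x7F,               // low 7 bits select the encoding form
  ImmShift = 7,
  ImmMask  = 0xFULL << ImmShift,
  Imm8     = 1ULL << ImmShift,
  Imm8Reg  = 3ULL << ImmShift,   // imm byte whose bits[7:4] name a register
};
} // namespace Desc

static unsigned formOf(uint64_t TSFlags) { return TSFlags & Desc::FormMask; }
static bool hasImm8Reg(uint64_t TSFlags) {
  return (TSFlags & Desc::ImmMask) == Desc::Imm8Reg;
}

int main() {
  uint64_t Flags = 51 /* some form value */ | Desc::Imm8Reg;
  assert(formOf(Flags) == 51);
  assert(hasImm8Reg(Flags));
  return 0;
}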
- MemOp4Shift = Has3DNow0F0FOpcodeShift + 1, - MemOp4 = 1ULL << MemOp4Shift, - /// Explicitly specified rounding control - EVEX_RCShift = MemOp4Shift + 1, + EVEX_RCShift = Has3DNow0F0FOpcodeShift + 1, EVEX_RC = 1ULL << EVEX_RCShift }; @@ -575,7 +573,8 @@ namespace X86II { switch (TSFlags & X86II::ImmMask) { default: llvm_unreachable("Unknown immediate size"); case X86II::Imm8: - case X86II::Imm8PCRel: return 1; + case X86II::Imm8PCRel: + case X86II::Imm8Reg: return 1; case X86II::Imm16: case X86II::Imm16PCRel: return 2; case X86II::Imm32: @@ -595,6 +594,7 @@ namespace X86II { case X86II::Imm32PCRel: return true; case X86II::Imm8: + case X86II::Imm8Reg: case X86II::Imm16: case X86II::Imm32: case X86II::Imm32S: @@ -612,6 +612,7 @@ namespace X86II { return true; case X86II::Imm8: case X86II::Imm8PCRel: + case X86II::Imm8Reg: case X86II::Imm16: case X86II::Imm16PCRel: case X86II::Imm32: @@ -626,26 +627,25 @@ namespace X86II { /// in this instruction. /// If this is a two-address instruction,skip one of the register operands. /// FIXME: This should be handled during MCInst lowering. - inline int getOperandBias(const MCInstrDesc& Desc) + inline unsigned getOperandBias(const MCInstrDesc& Desc) { unsigned NumOps = Desc.getNumOperands(); - unsigned CurOp = 0; if (NumOps > 1 && Desc.getOperandConstraint(1, MCOI::TIED_TO) == 0) - ++CurOp; - else if (NumOps > 3 && Desc.getOperandConstraint(2, MCOI::TIED_TO) == 0 && - Desc.getOperandConstraint(3, MCOI::TIED_TO) == 1) + return 1; + if (NumOps > 3 && Desc.getOperandConstraint(2, MCOI::TIED_TO) == 0 && + Desc.getOperandConstraint(3, MCOI::TIED_TO) == 1) // Special case for AVX-512 GATHER with 2 TIED_TO operands // Skip the first 2 operands: dst, mask_wb - CurOp += 2; - else if (NumOps > 3 && Desc.getOperandConstraint(2, MCOI::TIED_TO) == 0 && - Desc.getOperandConstraint(NumOps - 1, MCOI::TIED_TO) == 1) + return 2; + if (NumOps > 3 && Desc.getOperandConstraint(2, MCOI::TIED_TO) == 0 && + Desc.getOperandConstraint(NumOps - 1, MCOI::TIED_TO) == 1) // Special case for GATHER with 2 TIED_TO operands // Skip the first 2 operands: dst, mask_wb - CurOp += 2; - else if (NumOps > 2 && Desc.getOperandConstraint(NumOps - 2, MCOI::TIED_TO) == 0) + return 2; + if (NumOps > 2 && Desc.getOperandConstraint(NumOps - 2, MCOI::TIED_TO) == 0) // SCATTER - ++CurOp; - return CurOp; + return 1; + return 0; } /// getMemoryOperandNo - The function returns the MCInst operand # for the @@ -658,7 +658,6 @@ namespace X86II { /// inline int getMemoryOperandNo(uint64_t TSFlags) { bool HasVEX_4V = TSFlags & X86II::VEX_4V; - bool HasMemOp4 = TSFlags & X86II::MemOp4; bool HasEVEX_K = TSFlags & X86II::EVEX_K; switch (TSFlags & X86II::FormMask) { @@ -666,8 +665,6 @@ namespace X86II { case X86II::Pseudo: case X86II::RawFrm: case X86II::AddRegFrm: - case X86II::MRMDestReg: - case X86II::MRMSrcReg: case X86II::RawFrmImm8: case X86II::RawFrmImm16: case X86II::RawFrmMemOffs: @@ -680,7 +677,17 @@ namespace X86II { case X86II::MRMSrcMem: // Start from 1, skip any registers encoded in VEX_VVVV or I8IMM, or a // mask register. - return 1 + HasVEX_4V + HasMemOp4 + HasEVEX_K; + return 1 + HasVEX_4V + HasEVEX_K; + case X86II::MRMSrcMem4VOp3: + // Skip registers encoded in reg. + return 1 + HasEVEX_K; + case X86II::MRMSrcMemOp4: + // Skip registers encoded in reg, VEX_VVVV, and I8IMM. 
+ return 3; + case X86II::MRMDestReg: + case X86II::MRMSrcReg: + case X86II::MRMSrcReg4VOp3: + case X86II::MRMSrcRegOp4: case X86II::MRMXr: case X86II::MRM0r: case X86II::MRM1r: case X86II::MRM2r: case X86II::MRM3r: @@ -723,12 +730,9 @@ namespace X86II { /// isX86_64ExtendedReg - Is the MachineOperand a x86-64 extended (r8 or /// higher) register? e.g. r8, xmm8, xmm13, etc. inline bool isX86_64ExtendedReg(unsigned RegNo) { - if ((RegNo >= X86::XMM8 && RegNo <= X86::XMM15) || - (RegNo >= X86::XMM24 && RegNo <= X86::XMM31) || - (RegNo >= X86::YMM8 && RegNo <= X86::YMM15) || - (RegNo >= X86::YMM24 && RegNo <= X86::YMM31) || - (RegNo >= X86::ZMM8 && RegNo <= X86::ZMM15) || - (RegNo >= X86::ZMM24 && RegNo <= X86::ZMM31)) + if ((RegNo >= X86::XMM8 && RegNo <= X86::XMM31) || + (RegNo >= X86::YMM8 && RegNo <= X86::YMM31) || + (RegNo >= X86::ZMM8 && RegNo <= X86::ZMM31)) return true; switch (RegNo) { @@ -743,6 +747,8 @@ namespace X86II { case X86::R12B: case X86::R13B: case X86::R14B: case X86::R15B: case X86::CR8: case X86::CR9: case X86::CR10: case X86::CR11: case X86::CR12: case X86::CR13: case X86::CR14: case X86::CR15: + case X86::DR8: case X86::DR9: case X86::DR10: case X86::DR11: + case X86::DR12: case X86::DR13: case X86::DR14: case X86::DR15: return true; } return false; @@ -761,6 +767,16 @@ namespace X86II { return (reg == X86::SPL || reg == X86::BPL || reg == X86::SIL || reg == X86::DIL); } + + /// isKMasked - Is this a masked instruction. + inline bool isKMasked(uint64_t TSFlags) { + return (TSFlags & X86II::EVEX_K) != 0; + } + + /// isKMergedMasked - Is this a merge masked instruction. + inline bool isKMergeMasked(uint64_t TSFlags) { + return isKMasked(TSFlags) && (TSFlags & X86II::EVEX_Z) == 0; + } } } // end namespace llvm; diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp index b7c56ce..48a1d8f 100644 --- a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp @@ -31,8 +31,7 @@ static cl::opt<AsmWriterFlavorTy> AsmWriterFlavor("x86-asm-syntax", cl::init(ATT), cl::desc("Choose style of code to emit from X86 backend:"), cl::values(clEnumValN(ATT, "att", "Emit AT&T-style assembly"), - clEnumValN(Intel, "intel", "Emit Intel-style assembly"), - clEnumValEnd)); + clEnumValN(Intel, "intel", "Emit Intel-style assembly"))); static cl::opt<bool> MarkedJTDataRegions("mark-data-regions", cl::init(true), diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp index 96c2e81..8045e7c 100644 --- a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp @@ -81,7 +81,8 @@ public: MI.getOperand(OpNum).getReg()); } - bool isX86_64ExtendedReg(const MCInst &MI, unsigned OpNum) const { + // Does this register require a bit to be set in REX prefix. 
+ bool isREXExtendedReg(const MCInst &MI, unsigned OpNum) const { return (getX86RegEncoding(MI, OpNum) >> 3) & 1; } @@ -602,8 +603,6 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, uint64_t Encoding = TSFlags & X86II::EncodingMask; bool HasEVEX_K = TSFlags & X86II::EVEX_K; bool HasVEX_4V = TSFlags & X86II::VEX_4V; - bool HasVEX_4VOp3 = TSFlags & X86II::VEX_4VOp3; - bool HasMemOp4 = TSFlags & X86II::MemOp4; bool HasEVEX_RC = TSFlags & X86II::EVEX_RC; // VEX_R: opcode externsion equivalent to REX.R in @@ -745,11 +744,10 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, // src1(ModR/M), MemAddr // src1(ModR/M), src2(VEX_4V), MemAddr // src1(ModR/M), MemAddr, imm8 - // src1(ModR/M), MemAddr, src2(VEX_I8IMM) + // src1(ModR/M), MemAddr, src2(Imm[7:4]) // // FMA4: - // dst(ModR/M.reg), src1(VEX_4V), src2(ModR/M), src3(VEX_I8IMM) - // dst(ModR/M.reg), src1(VEX_4V), src2(VEX_I8IMM), src3(ModR/M), + // dst(ModR/M.reg), src1(VEX_4V), src2(ModR/M), src3(Imm[7:4]) unsigned RegEnc = getX86RegEncoding(MI, CurOp++); VEX_R = ~(RegEnc >> 3) & 1; EVEX_R2 = ~(RegEnc >> 4) & 1; @@ -770,13 +768,34 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, if (!HasVEX_4V) // Only needed with VSIB which don't use VVVV. EVEX_V2 = ~(IndexRegEnc >> 4) & 1; - if (HasVEX_4VOp3) - // Instruction format for 4VOp3: - // src1(ModR/M), MemAddr, src3(VEX_4V) - // CurOp points to start of the MemoryOperand, - // it skips TIED_TO operands if exist, then increments past src1. - // CurOp + X86::AddrNumOperands will point to src3. - VEX_4V = ~getX86RegEncoding(MI, CurOp + X86::AddrNumOperands) & 0xf; + break; + } + case X86II::MRMSrcMem4VOp3: { + // Instruction format for 4VOp3: + // src1(ModR/M), MemAddr, src3(VEX_4V) + unsigned RegEnc = getX86RegEncoding(MI, CurOp++); + VEX_R = ~(RegEnc >> 3) & 1; + + unsigned BaseRegEnc = getX86RegEncoding(MI, MemOperand + X86::AddrBaseReg); + VEX_B = ~(BaseRegEnc >> 3) & 1; + unsigned IndexRegEnc = getX86RegEncoding(MI, MemOperand+X86::AddrIndexReg); + VEX_X = ~(IndexRegEnc >> 3) & 1; + + VEX_4V = ~getX86RegEncoding(MI, CurOp + X86::AddrNumOperands) & 0xf; + break; + } + case X86II::MRMSrcMemOp4: { + // dst(ModR/M.reg), src1(VEX_4V), src2(Imm[7:4]), src3(ModR/M), + unsigned RegEnc = getX86RegEncoding(MI, CurOp++); + VEX_R = ~(RegEnc >> 3) & 1; + + unsigned VRegEnc = getX86RegEncoding(MI, CurOp++); + VEX_4V = ~VRegEnc & 0xf; + + unsigned BaseRegEnc = getX86RegEncoding(MI, MemOperand + X86::AddrBaseReg); + VEX_B = ~(BaseRegEnc >> 3) & 1; + unsigned IndexRegEnc = getX86RegEncoding(MI, MemOperand+X86::AddrIndexReg); + VEX_X = ~(IndexRegEnc >> 3) & 1; break; } case X86II::MRM0m: case X86II::MRM1m: @@ -803,13 +822,12 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, } case X86II::MRMSrcReg: { // MRMSrcReg instructions forms: - // dst(ModR/M), src1(VEX_4V), src2(ModR/M), src3(VEX_I8IMM) + // dst(ModR/M), src1(VEX_4V), src2(ModR/M), src3(Imm[7:4]) // dst(ModR/M), src1(ModR/M) // dst(ModR/M), src1(ModR/M), imm8 // // FMA4: - // dst(ModR/M.reg), src1(VEX_4V), src2(ModR/M), src3(VEX_I8IMM) - // dst(ModR/M.reg), src1(VEX_4V), src2(VEX_I8IMM), src3(ModR/M), + // dst(ModR/M.reg), src1(VEX_4V), src2(Imm[7:4]), src3(ModR/M), unsigned RegEnc = getX86RegEncoding(MI, CurOp++); VEX_R = ~(RegEnc >> 3) & 1; EVEX_R2 = ~(RegEnc >> 4) & 1; @@ -823,14 +841,10 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, EVEX_V2 = ~(VRegEnc >> 4) & 1; } - if (HasMemOp4) // 
Skip second register source (encoded in I8IMM) - CurOp++; - RegEnc = getX86RegEncoding(MI, CurOp++); VEX_B = ~(RegEnc >> 3) & 1; VEX_X = ~(RegEnc >> 4) & 1; - if (HasVEX_4VOp3) - VEX_4V = ~getX86RegEncoding(MI, CurOp++) & 0xf; + if (EVEX_b) { if (HasEVEX_RC) { unsigned RcOperand = NumOps-1; @@ -841,6 +855,34 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, } break; } + case X86II::MRMSrcReg4VOp3: { + // Instruction format for 4VOp3: + // src1(ModR/M), src2(ModR/M), src3(VEX_4V) + unsigned RegEnc = getX86RegEncoding(MI, CurOp++); + VEX_R = ~(RegEnc >> 3) & 1; + + RegEnc = getX86RegEncoding(MI, CurOp++); + VEX_B = ~(RegEnc >> 3) & 1; + + VEX_4V = ~getX86RegEncoding(MI, CurOp++) & 0xf; + break; + } + case X86II::MRMSrcRegOp4: { + // dst(ModR/M.reg), src1(VEX_4V), src2(Imm[7:4]), src3(ModR/M), + unsigned RegEnc = getX86RegEncoding(MI, CurOp++); + VEX_R = ~(RegEnc >> 3) & 1; + + unsigned VRegEnc = getX86RegEncoding(MI, CurOp++); + VEX_4V = ~VRegEnc & 0xf; + + // Skip second register source (encoded in Imm[7:4]) + ++CurOp; + + RegEnc = getX86RegEncoding(MI, CurOp++); + VEX_B = ~(RegEnc >> 3) & 1; + VEX_X = ~(RegEnc >> 4) & 1; + break; + } case X86II::MRMDestReg: { // MRMDestReg instructions forms: // dst(ModR/M), src(ModR/M) @@ -976,52 +1018,51 @@ uint8_t X86MCCodeEmitter::DetermineREXPrefix(const MCInst &MI, uint64_t TSFlags, unsigned Reg = MO.getReg(); if (Reg == X86::AH || Reg == X86::BH || Reg == X86::CH || Reg == X86::DH) UsesHighByteReg = true; - if (!X86II::isX86_64NonExtLowByteReg(Reg)) continue; - // FIXME: The caller of DetermineREXPrefix slaps this prefix onto anything - // that returns non-zero. - REX |= 0x40; // REX fixed encoding prefix - break; + if (X86II::isX86_64NonExtLowByteReg(Reg)) + // FIXME: The caller of DetermineREXPrefix slaps this prefix onto anything + // that returns non-zero. 
+ REX |= 0x40; // REX fixed encoding prefix } switch (TSFlags & X86II::FormMask) { case X86II::AddRegFrm: - REX |= isX86_64ExtendedReg(MI, CurOp++) << 0; // REX.B + REX |= isREXExtendedReg(MI, CurOp++) << 0; // REX.B break; case X86II::MRMSrcReg: - REX |= isX86_64ExtendedReg(MI, CurOp++) << 2; // REX.R - REX |= isX86_64ExtendedReg(MI, CurOp++) << 0; // REX.B + REX |= isREXExtendedReg(MI, CurOp++) << 2; // REX.R + REX |= isREXExtendedReg(MI, CurOp++) << 0; // REX.B break; case X86II::MRMSrcMem: { - REX |= isX86_64ExtendedReg(MI, CurOp++) << 2; // REX.R - REX |= isX86_64ExtendedReg(MI, MemOperand+X86::AddrBaseReg) << 0; // REX.B - REX |= isX86_64ExtendedReg(MI, MemOperand+X86::AddrIndexReg) << 1; // REX.X + REX |= isREXExtendedReg(MI, CurOp++) << 2; // REX.R + REX |= isREXExtendedReg(MI, MemOperand+X86::AddrBaseReg) << 0; // REX.B + REX |= isREXExtendedReg(MI, MemOperand+X86::AddrIndexReg) << 1; // REX.X CurOp += X86::AddrNumOperands; break; } case X86II::MRMDestReg: - REX |= isX86_64ExtendedReg(MI, CurOp++) << 0; // REX.B - REX |= isX86_64ExtendedReg(MI, CurOp++) << 2; // REX.R + REX |= isREXExtendedReg(MI, CurOp++) << 0; // REX.B + REX |= isREXExtendedReg(MI, CurOp++) << 2; // REX.R break; case X86II::MRMDestMem: - REX |= isX86_64ExtendedReg(MI, MemOperand+X86::AddrBaseReg) << 0; // REX.B - REX |= isX86_64ExtendedReg(MI, MemOperand+X86::AddrIndexReg) << 1; // REX.X + REX |= isREXExtendedReg(MI, MemOperand+X86::AddrBaseReg) << 0; // REX.B + REX |= isREXExtendedReg(MI, MemOperand+X86::AddrIndexReg) << 1; // REX.X CurOp += X86::AddrNumOperands; - REX |= isX86_64ExtendedReg(MI, CurOp++) << 2; // REX.R + REX |= isREXExtendedReg(MI, CurOp++) << 2; // REX.R break; case X86II::MRMXm: case X86II::MRM0m: case X86II::MRM1m: case X86II::MRM2m: case X86II::MRM3m: case X86II::MRM4m: case X86II::MRM5m: case X86II::MRM6m: case X86II::MRM7m: - REX |= isX86_64ExtendedReg(MI, MemOperand+X86::AddrBaseReg) << 0; // REX.B - REX |= isX86_64ExtendedReg(MI, MemOperand+X86::AddrIndexReg) << 1; // REX.X + REX |= isREXExtendedReg(MI, MemOperand+X86::AddrBaseReg) << 0; // REX.B + REX |= isREXExtendedReg(MI, MemOperand+X86::AddrIndexReg) << 1; // REX.X break; case X86II::MRMXr: case X86II::MRM0r: case X86II::MRM1r: case X86II::MRM2r: case X86II::MRM3r: case X86II::MRM4r: case X86II::MRM5r: case X86II::MRM6r: case X86II::MRM7r: - REX |= isX86_64ExtendedReg(MI, CurOp++) << 0; // REX.B + REX |= isREXExtendedReg(MI, CurOp++) << 0; // REX.B break; } if (REX && UsesHighByteReg) @@ -1133,10 +1174,7 @@ encodeInstruction(const MCInst &MI, raw_ostream &OS, // It uses the VEX.VVVV field? bool HasVEX_4V = TSFlags & X86II::VEX_4V; - bool HasVEX_4VOp3 = TSFlags & X86II::VEX_4VOp3; - bool HasMemOp4 = TSFlags & X86II::MemOp4; - bool HasVEX_I8IMM = TSFlags & X86II::VEX_I8IMM; - assert((!HasMemOp4 || HasVEX_I8IMM) && "MemOp4 should imply VEX_I8IMM"); + bool HasVEX_I8Reg = (TSFlags & X86II::ImmMask) == X86II::Imm8Reg; // It uses the EVEX.aaa field? 
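The renamed isREXExtendedReg helper and the switch above compute the REX prefix from bit 3 of each 4-bit register encoding: ModRM.reg spills into REX.R, SIB.index into REX.X, and the ModRM.rm or base register into REX.B. A standalone sketch of that bit bookkeeping; real code obtains the encodings from MCRegisterInfo and only emits the prefix when it is actually needed:

#include <cassert>
#include <cstdint>

// Each x86-64 register encoding is 4 bits; the top bit of each one spills
// into the REX prefix as REX.R, REX.X or REX.B on top of the fixed 0100
// prefix bits.
static uint8_t buildREX(unsigned RegEnc, unsigned IndexEnc, unsigned BaseEnc) {
  uint8_t REX = 0x40;                 // fixed encoding prefix bits
  REX |= ((RegEnc   >> 3) & 1) << 2;  // REX.R
  REX |= ((IndexEnc >> 3) & 1) << 1;  // REX.X
  REX |= ((BaseEnc  >> 3) & 1) << 0;  // REX.B
  return REX;
}

int main() {
  // reg = r9 (encoding 9), index = rax (0), base = r12 (12):
  // REX = 0x40 | R | B = 0x45.
  assert(buildREX(9, 0, 12) == 0x45);
  return 0;
}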
bool HasEVEX_K = TSFlags & X86II::EVEX_K; @@ -1312,21 +1350,42 @@ encodeInstruction(const MCInst &MI, raw_ostream &OS, if (HasVEX_4V) // Skip 1st src (which is encoded in VEX_VVVV) ++SrcRegNum; - if (HasMemOp4) // Capture 2nd src (which is encoded in I8IMM) - I8RegNum = getX86RegEncoding(MI, SrcRegNum++); - EmitRegModRMByte(MI.getOperand(SrcRegNum), GetX86RegNum(MI.getOperand(CurOp)), CurByte, OS); CurOp = SrcRegNum + 1; - if (HasVEX_4VOp3) - ++CurOp; - if (!HasMemOp4 && HasVEX_I8IMM) + if (HasVEX_I8Reg) I8RegNum = getX86RegEncoding(MI, CurOp++); // do not count the rounding control operand if (HasEVEX_RC) --NumOps; break; } + case X86II::MRMSrcReg4VOp3: { + EmitByte(BaseOpcode, CurByte, OS); + unsigned SrcRegNum = CurOp + 1; + + EmitRegModRMByte(MI.getOperand(SrcRegNum), + GetX86RegNum(MI.getOperand(CurOp)), CurByte, OS); + CurOp = SrcRegNum + 1; + ++CurOp; // Encoded in VEX.VVVV + break; + } + case X86II::MRMSrcRegOp4: { + EmitByte(BaseOpcode, CurByte, OS); + unsigned SrcRegNum = CurOp + 1; + + // Skip 1st src (which is encoded in VEX_VVVV) + ++SrcRegNum; + + // Capture 2nd src (which is encoded in Imm[7:4]) + assert(HasVEX_I8Reg && "MRMSrcRegOp4 should imply VEX_I8Reg"); + I8RegNum = getX86RegEncoding(MI, SrcRegNum++); + + EmitRegModRMByte(MI.getOperand(SrcRegNum), + GetX86RegNum(MI.getOperand(CurOp)), CurByte, OS); + CurOp = SrcRegNum + 1; + break; + } case X86II::MRMSrcMem: { unsigned FirstMemOp = CurOp+1; @@ -1336,20 +1395,42 @@ encodeInstruction(const MCInst &MI, raw_ostream &OS, if (HasVEX_4V) ++FirstMemOp; // Skip the register source (which is encoded in VEX_VVVV). - if (HasMemOp4) // Capture second register source (encoded in I8IMM) - I8RegNum = getX86RegEncoding(MI, FirstMemOp++); - EmitByte(BaseOpcode, CurByte, OS); emitMemModRMByte(MI, FirstMemOp, GetX86RegNum(MI.getOperand(CurOp)), TSFlags, Rex, CurByte, OS, Fixups, STI); CurOp = FirstMemOp + X86::AddrNumOperands; - if (HasVEX_4VOp3) - ++CurOp; - if (!HasMemOp4 && HasVEX_I8IMM) + if (HasVEX_I8Reg) I8RegNum = getX86RegEncoding(MI, CurOp++); break; } + case X86II::MRMSrcMem4VOp3: { + unsigned FirstMemOp = CurOp+1; + + EmitByte(BaseOpcode, CurByte, OS); + + emitMemModRMByte(MI, FirstMemOp, GetX86RegNum(MI.getOperand(CurOp)), + TSFlags, Rex, CurByte, OS, Fixups, STI); + CurOp = FirstMemOp + X86::AddrNumOperands; + ++CurOp; // Encoded in VEX.VVVV. + break; + } + case X86II::MRMSrcMemOp4: { + unsigned FirstMemOp = CurOp+1; + + ++FirstMemOp; // Skip the register source (which is encoded in VEX_VVVV). + + // Capture second register source (encoded in Imm[7:4]) + assert(HasVEX_I8Reg && "MRMSrcRegOp4 should imply VEX_I8Reg"); + I8RegNum = getX86RegEncoding(MI, FirstMemOp++); + + EmitByte(BaseOpcode, CurByte, OS); + + emitMemModRMByte(MI, FirstMemOp, GetX86RegNum(MI.getOperand(CurOp)), + TSFlags, Rex, CurByte, OS, Fixups, STI); + CurOp = FirstMemOp + X86::AddrNumOperands; + break; + } case X86II::MRMXr: case X86II::MRM0r: case X86II::MRM1r: @@ -1410,7 +1491,7 @@ encodeInstruction(const MCInst &MI, raw_ostream &OS, break; } - if (HasVEX_I8IMM) { + if (HasVEX_I8Reg) { // The last source register of a 4 operand instruction in AVX is encoded // in bits[7:4] of a immediate byte. 
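With MemOp4 and VEX_I8IMM gone, the "register carried in the immediate" case is keyed off the new Imm8Reg immediate kind and the *Op4 forms: the fourth operand's 4-bit encoding lands in bits [7:4] of the trailing immediate byte. A small sketch of that packing; the helper names and the low-nibble payload argument are illustrative:

#include <cassert>
#include <cstdint>

// Imm8Reg ("is4") encoding: a 4-bit register encoding is stored in bits
// [7:4] of the instruction's final immediate byte, with bits [3:0] left for
// any instruction-specific payload.
static uint8_t packImm8Reg(unsigned RegEnc, uint8_t LowNibble = 0) {
  assert(RegEnc < 16 && "Register encoding out of range");
  return uint8_t((RegEnc << 4) | (LowNibble & 0xF));
}

static unsigned unpackImm8Reg(uint8_t Imm) { return Imm >> 4; }

int main() {
  uint8_t Imm = packImm8Reg(/*xmm13*/ 13);
  assert(Imm == 0xD0);
  assert(unpackImm8Reg(Imm) == 13);
  return 0;
}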
assert(I8RegNum < 16 && "Register encoding out of range"); diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp index 311a8d6..22cb0fa 100644 --- a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp @@ -234,7 +234,7 @@ static MCInstrAnalysis *createX86MCInstrAnalysis(const MCInstrInfo *Info) { // Force static initialization. extern "C" void LLVMInitializeX86TargetMC() { - for (Target *T : {&TheX86_32Target, &TheX86_64Target}) { + for (Target *T : {&getTheX86_32Target(), &getTheX86_64Target()}) { // Register the MC asm info. RegisterMCAsmInfoFn X(*T, createX86MCAsmInfo); @@ -268,9 +268,9 @@ extern "C" void LLVMInitializeX86TargetMC() { } // Register the asm backend. - TargetRegistry::RegisterMCAsmBackend(TheX86_32Target, + TargetRegistry::RegisterMCAsmBackend(getTheX86_32Target(), createX86_32AsmBackend); - TargetRegistry::RegisterMCAsmBackend(TheX86_64Target, + TargetRegistry::RegisterMCAsmBackend(getTheX86_64Target(), createX86_64AsmBackend); } diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h index ca4f0d3..f73e734 100644 --- a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h @@ -27,13 +27,15 @@ class MCObjectWriter; class MCRegisterInfo; class MCSubtargetInfo; class MCRelocationInfo; +class MCTargetOptions; class Target; class Triple; class StringRef; class raw_ostream; class raw_pwrite_stream; -extern Target TheX86_32Target, TheX86_64Target; +Target &getTheX86_32Target(); +Target &getTheX86_64Target(); /// Flavour of dwarf regnumbers /// @@ -69,9 +71,11 @@ MCCodeEmitter *createX86MCCodeEmitter(const MCInstrInfo &MCII, MCContext &Ctx); MCAsmBackend *createX86_32AsmBackend(const Target &T, const MCRegisterInfo &MRI, - const Triple &TT, StringRef CPU); + const Triple &TT, StringRef CPU, + const MCTargetOptions &Options); MCAsmBackend *createX86_64AsmBackend(const Target &T, const MCRegisterInfo &MRI, - const Triple &TT, StringRef CPU); + const Triple &TT, StringRef CPU, + const MCTargetOptions &Options); /// Construct an X86 Windows COFF machine code streamer which will generate /// PE/COFF format object files. 
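The TargetInfo change that follows replaces the global TheX86_32Target/TheX86_64Target objects with accessor functions returning a function-local static, so each Target is constructed on first use and cross-translation-unit static initialization order stops mattering. A generic sketch of that pattern with placeholder names:

#include <cstdio>

// "Construct on first use" accessor, the same shape as getTheX86_32Target()
// and getTheX86_64Target(). The static local is initialized the first time
// the function runs (thread-safe since C++11), so other static initializers
// can call it safely.
struct TargetDesc {
  const char *Name = "uninitialized";
};

static TargetDesc &getTheDemoTarget() {
  static TargetDesc T; // constructed on first call, not at program load
  return T;
}

int main() {
  getTheDemoTarget().Name = "x86-demo";
  std::printf("%s\n", getTheDemoTarget().Name); // prints: x86-demo
  return 0;
}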
diff --git a/contrib/llvm/lib/Target/X86/TargetInfo/X86TargetInfo.cpp b/contrib/llvm/lib/Target/X86/TargetInfo/X86TargetInfo.cpp index fceb083..d2654fc 100644 --- a/contrib/llvm/lib/Target/X86/TargetInfo/X86TargetInfo.cpp +++ b/contrib/llvm/lib/Target/X86/TargetInfo/X86TargetInfo.cpp @@ -11,12 +11,19 @@ #include "llvm/Support/TargetRegistry.h" using namespace llvm; -Target llvm::TheX86_32Target, llvm::TheX86_64Target; +Target &llvm::getTheX86_32Target() { + static Target TheX86_32Target; + return TheX86_32Target; +} +Target &llvm::getTheX86_64Target() { + static Target TheX86_64Target; + return TheX86_64Target; +} extern "C" void LLVMInitializeX86TargetInfo() { - RegisterTarget<Triple::x86, /*HasJIT=*/true> - X(TheX86_32Target, "x86", "32-bit X86: Pentium-Pro and above"); + RegisterTarget<Triple::x86, /*HasJIT=*/true> X( + getTheX86_32Target(), "x86", "32-bit X86: Pentium-Pro and above"); - RegisterTarget<Triple::x86_64, /*HasJIT=*/true> - Y(TheX86_64Target, "x86-64", "64-bit X86: EM64T and AMD64"); + RegisterTarget<Triple::x86_64, /*HasJIT=*/true> Y( + getTheX86_64Target(), "x86-64", "64-bit X86: EM64T and AMD64"); } diff --git a/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.cpp b/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.cpp index 18f7167..1be5aec 100644 --- a/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.cpp +++ b/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.cpp @@ -136,7 +136,7 @@ void DecodePSRLDQMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) { void DecodePALIGNRMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) { unsigned NumElts = VT.getVectorNumElements(); - unsigned Offset = Imm * (VT.getVectorElementType().getSizeInBits() / 8); + unsigned Offset = Imm * (VT.getScalarSizeInBits() / 8); unsigned NumLanes = VT.getSizeInBits() / 128; unsigned NumLaneElts = NumElts / NumLanes; @@ -151,6 +151,16 @@ void DecodePALIGNRMask(MVT VT, unsigned Imm, } } +void DecodeVALIGNMask(MVT VT, unsigned Imm, + SmallVectorImpl<int> &ShuffleMask) { + int NumElts = VT.getVectorNumElements(); + // Not all bits of the immediate are used so mask it. + assert(isPowerOf2_32(NumElts) && "NumElts should be power of 2"); + Imm = Imm & (NumElts - 1); + for (int i = 0; i != NumElts; ++i) + ShuffleMask.push_back(i + Imm); +} + /// DecodePSHUFMask - This decodes the shuffle masks for pshufw, pshufd, and vpermilp*. /// VT indicates the type of the vector allowing it to handle different /// datatypes and vector widths. @@ -538,10 +548,11 @@ void DecodeVPERMIL2PMask(MVT VT, unsigned M2Z, ArrayRef<uint64_t> RawMask, unsigned VecSize = VT.getSizeInBits(); unsigned EltSize = VT.getScalarSizeInBits(); unsigned NumLanes = VecSize / 128; - unsigned NumEltsPerLane = VT.getVectorNumElements() / NumLanes; - assert((VecSize == 128 || VecSize == 256) && - "Unexpected vector size"); + unsigned NumElts = VT.getVectorNumElements(); + unsigned NumEltsPerLane = NumElts / NumLanes; + assert((VecSize == 128 || VecSize == 256) && "Unexpected vector size"); assert((EltSize == 32 || EltSize == 64) && "Unexpected element size"); + assert((NumElts == RawMask.size()) && "Unexpected mask size"); for (unsigned i = 0, e = RawMask.size(); i < e; ++i) { // VPERMIL2 Operation. 
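The new DecodeVALIGNMask above treats the two sources as one concatenated vector shifted right by Imm elements, so destination element i simply maps to index i + Imm; indices of NumElts and above fall into the second half of the concatenation. A worked standalone sketch:

#include <cstdio>
#include <vector>

// VALIGN mask decode, mirroring DecodeVALIGNMask above: only log2(NumElts)
// bits of the immediate are used, and each result element selects index
// i + Imm from the concatenated pair of sources.
static std::vector<int> decodeValignMask(int NumElts, unsigned Imm) {
  Imm &= unsigned(NumElts - 1); // mask off the unused immediate bits
  std::vector<int> Mask;
  for (int i = 0; i != NumElts; ++i)
    Mask.push_back(i + int(Imm));
  return Mask;
}

int main() {
  // 8 dword elements with Imm = 3: indices 3..10 into the concatenation.
  for (int Idx : decodeValignMask(8, 3))
    std::printf("%d ", Idx);
  std::printf("\n");
  return 0;
}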
@@ -562,14 +573,15 @@ void DecodeVPERMIL2PMask(MVT VT, unsigned M2Z, ArrayRef<uint64_t> RawMask, continue; } - unsigned Index = i & ~(NumEltsPerLane - 1); + int Index = i & ~(NumEltsPerLane - 1); if (EltSize == 64) Index += (Selector >> 1) & 0x1; else Index += Selector & 0x3; - unsigned SrcOffset = (Selector >> 2) & 1; - ShuffleMask.push_back((int)(SrcOffset + Index)); + int Src = (Selector >> 2) & 0x1; + Index += Src * NumElts; + ShuffleMask.push_back(Index); } } diff --git a/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.h b/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.h index dc21c19..17619d0 100644 --- a/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.h +++ b/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.h @@ -55,6 +55,8 @@ void DecodePSRLDQMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask); void DecodePALIGNRMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask); +void DecodeVALIGNMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask); + /// Decodes the shuffle masks for pshufd/pshufw/vpermilpd/vpermilps. /// VT indicates the type of the vector allowing it to handle different /// datatypes and vector widths. diff --git a/contrib/llvm/lib/Target/X86/X86.h b/contrib/llvm/lib/Target/X86/X86.h index 23d6c71..2cb80a4 100644 --- a/contrib/llvm/lib/Target/X86/X86.h +++ b/contrib/llvm/lib/Target/X86/X86.h @@ -87,6 +87,13 @@ FunctionPass *createX86ExpandPseudoPass(); FunctionPass *createX86FixupBWInsts(); void initializeFixupBWInstPassPass(PassRegistry &); + +/// This pass replaces EVEX ecnoded of AVX-512 instructiosn by VEX +/// encoding when possible in order to reduce code size. +FunctionPass *createX86EvexToVexInsts(); + +void initializeEvexToVexInstPassPass(PassRegistry &); + } // End llvm namespace #endif diff --git a/contrib/llvm/lib/Target/X86/X86.td b/contrib/llvm/lib/Target/X86/X86.td index 8267a84..83a23d4 100644 --- a/contrib/llvm/lib/Target/X86/X86.td +++ b/contrib/llvm/lib/Target/X86/X86.td @@ -99,6 +99,8 @@ def FeatureSlowBTMem : SubtargetFeature<"slow-bt-mem", "IsBTMemSlow", "true", "Bit testing of memory is slow">; def FeatureSlowSHLD : SubtargetFeature<"slow-shld", "IsSHLDSlow", "true", "SHLD instruction is slow">; +def FeatureSlowPMULLD : SubtargetFeature<"slow-pmulld", "IsPMULLDSlow", "true", + "PMULLD instruction is slow">; // FIXME: This should not apply to CPUs that do not have SSE. 
def FeatureSlowUAMem16 : SubtargetFeature<"slow-unaligned-mem-16", "IsUAMem16Slow", "true", @@ -141,8 +143,8 @@ def FeatureVLX : SubtargetFeature<"avx512vl", "HasVLX", "true", "Enable AVX-512 Vector Length eXtensions", [FeatureAVX512]>; def FeatureVBMI : SubtargetFeature<"avx512vbmi", "HasVBMI", "true", - "Enable AVX-512 Vector Bit Manipulation Instructions", - [FeatureAVX512]>; + "Enable AVX-512 Vector Byte Manipulation Instructions", + [FeatureBWI]>; def FeatureIFMA : SubtargetFeature<"avx512ifma", "HasIFMA", "true", "Enable AVX-512 Integer Fused Multiple-Add", [FeatureAVX512]>; @@ -207,9 +209,9 @@ def FeatureLEAForSP : SubtargetFeature<"lea-sp", "UseLeaForSP", "true", def FeatureSlowDivide32 : SubtargetFeature<"idivl-to-divb", "HasSlowDivide32", "true", "Use 8-bit divide for positive values less than 256">; -def FeatureSlowDivide64 : SubtargetFeature<"idivq-to-divw", +def FeatureSlowDivide64 : SubtargetFeature<"idivq-to-divl", "HasSlowDivide64", "true", - "Use 16-bit divide for positive values less than 65536">; + "Use 32-bit divide for positive values less than 2^32">; def FeaturePadShortFunctions : SubtargetFeature<"pad-short-functions", "PadShortFunctions", "true", "Pad short functions">; @@ -249,6 +251,25 @@ def FeatureSoftFloat def FeatureFastPartialYMMWrite : SubtargetFeature<"fast-partial-ymm-write", "HasFastPartialYMMWrite", "true", "Partial writes to YMM registers are fast">; +// FeatureFastScalarFSQRT should be enabled if scalar FSQRT has shorter latency +// than the corresponding NR code. FeatureFastVectorFSQRT should be enabled if +// vector FSQRT has higher throughput than the corresponding NR code. +// The idea is that throughput bound code is likely to be vectorized, so for +// vectorized code we should care about the throughput of SQRT operations. +// But if the code is scalar that probably means that the code has some kind of +// dependency and we should care more about reducing the latency. +def FeatureFastScalarFSQRT + : SubtargetFeature<"fast-scalar-fsqrt", "HasFastScalarFSQRT", + "true", "Scalar SQRT is fast (disable Newton-Raphson)">; +def FeatureFastVectorFSQRT + : SubtargetFeature<"fast-vector-fsqrt", "HasFastVectorFSQRT", + "true", "Vector SQRT is fast (disable Newton-Raphson)">; +// If lzcnt has equivalent latency/throughput to most simple integer ops, it can +// be used to replace test/set sequences. +def FeatureFastLZCNT + : SubtargetFeature< + "fast-lzcnt", "HasFastLZCNT", "true", + "LZCNT instructions are as fast as most simple integer ops">; //===----------------------------------------------------------------------===// // X86 processors supported. 
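Among the feature changes above, "idivq-to-divw" becomes "idivq-to-divl": the backend now narrows a slow 64-bit divide to a 32-bit divide when the values fit in 32 bits, rather than to a 16-bit one. A hedged, target-independent sketch of what that runtime check buys; the function is mine and uses unsigned math, while the actual transform guards a signed IDIV on positive values:

#include <cassert>
#include <cstdint>

// When a 64-bit dividend and divisor both fit in 32 bits, a 32-bit divide
// produces the same quotient and is substantially cheaper on CPUs that set
// HasSlowDivide64.
static uint64_t udiv64MaybeNarrow(uint64_t A, uint64_t B) {
  if ((A >> 32) == 0 && (B >> 32) == 0)
    return uint32_t(A) / uint32_t(B); // fast 32-bit divide path
  return A / B;                       // full 64-bit divide
}

int main() {
  assert(udiv64MaybeNarrow(100, 7) == 100 / 7);
  assert(udiv64MaybeNarrow(1ULL << 40, 3) == (1ULL << 40) / 3);
  return 0;
}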
@@ -384,6 +405,7 @@ class SilvermontProc<string Name> : ProcessorModel<Name, SLMModel, [ FeatureSlowLEA, FeatureSlowIncDec, FeatureSlowBTMem, + FeatureSlowPMULLD, FeatureLAHFSAHF ]>; def : SilvermontProc<"silvermont">; @@ -439,10 +461,12 @@ def SNBFeatures : ProcessorFeatures<[], [ FeatureCMPXCHG16B, FeaturePOPCNT, FeatureAES, + FeatureSlowDivide64, FeaturePCLMUL, FeatureXSAVE, FeatureXSAVEOPT, - FeatureLAHFSAHF + FeatureLAHFSAHF, + FeatureFastScalarFSQRT ]>; class SandyBridgeProc<string Name> : ProcModel<Name, SandyBridgeModel, @@ -500,7 +524,8 @@ def SKLFeatures : ProcessorFeatures<BDWFeatures.Value, [ FeatureXSAVEC, FeatureXSAVES, FeatureSGX, - FeatureCLFLUSHOPT + FeatureCLFLUSHOPT, + FeatureFastVectorFSQRT ]>; // FIXME: define SKL model @@ -631,6 +656,7 @@ def : ProcessorModel<"btver2", BtVer2Model, [ FeatureF16C, FeatureMOVBE, FeatureLZCNT, + FeatureFastLZCNT, FeaturePOPCNT, FeatureXSAVE, FeatureXSAVEOPT, @@ -729,11 +755,48 @@ def : Proc<"bdver4", [ FeatureTBM, FeatureFMA, FeatureXSAVEOPT, + FeatureSlowSHLD, FeatureFSGSBase, FeatureLAHFSAHF, FeatureMWAITX ]>; +// TODO: The scheduler model falls to BTVER2 model. +// The znver1 model has to be put in place. +// Zen +def: ProcessorModel<"znver1", BtVer2Model, [ + FeatureADX, + FeatureAES, + FeatureAVX2, + FeatureBMI, + FeatureBMI2, + FeatureCLFLUSHOPT, + FeatureCMPXCHG16B, + FeatureF16C, + FeatureFMA, + FeatureFSGSBase, + FeatureFXSR, + FeatureFastLZCNT, + FeatureLAHFSAHF, + FeatureLZCNT, + FeatureMMX, + FeatureMOVBE, + FeatureMWAITX, + FeaturePCLMUL, + FeaturePOPCNT, + FeaturePRFCHW, + FeatureRDRAND, + FeatureRDSEED, + FeatureSHA, + FeatureSMAP, + FeatureSSE4A, + FeatureSlowSHLD, + FeatureX87, + FeatureXSAVE, + FeatureXSAVEC, + FeatureXSAVEOPT, + FeatureXSAVES]>; + def : Proc<"geode", [FeatureX87, FeatureSlowUAMem16, Feature3DNowA]>; def : Proc<"winchip-c6", [FeatureX87, FeatureSlowUAMem16, FeatureMMX]>; diff --git a/contrib/llvm/lib/Target/X86/X86AsmPrinter.cpp b/contrib/llvm/lib/Target/X86/X86AsmPrinter.cpp index 67e51f1..e1825ca 100644 --- a/contrib/llvm/lib/Target/X86/X86AsmPrinter.cpp +++ b/contrib/llvm/lib/Target/X86/X86AsmPrinter.cpp @@ -57,10 +57,10 @@ bool X86AsmPrinter::runOnMachineFunction(MachineFunction &MF) { SetupMachineFunction(MF); if (Subtarget->isTargetCOFF()) { - bool Intrn = MF.getFunction()->hasInternalLinkage(); + bool Local = MF.getFunction()->hasLocalLinkage(); OutStreamer->BeginCOFFSymbolDef(CurrentFnSym); - OutStreamer->EmitCOFFSymbolStorageClass(Intrn ? COFF::IMAGE_SYM_CLASS_STATIC - : COFF::IMAGE_SYM_CLASS_EXTERNAL); + OutStreamer->EmitCOFFSymbolStorageClass( + Local ? COFF::IMAGE_SYM_CLASS_STATIC : COFF::IMAGE_SYM_CLASS_EXTERNAL); OutStreamer->EmitCOFFSymbolType(COFF::IMAGE_SYM_DTYPE_FUNCTION << COFF::SCT_COMPLEX_TYPE_SHIFT); OutStreamer->EndCOFFSymbolDef(); @@ -70,7 +70,7 @@ bool X86AsmPrinter::runOnMachineFunction(MachineFunction &MF) { EmitFunctionBody(); // Emit the XRay table for this function. - EmitXRayTable(); + emitXRayTable(); // We didn't modify anything. 
return false; @@ -627,11 +627,11 @@ void X86AsmPrinter::EmitEndOfAsmFile(Module &M) { raw_string_ostream FlagsOS(Flags); for (const auto &Function : M) - TLOFCOFF.emitLinkerFlagsForGlobal(FlagsOS, &Function, *Mang); + TLOFCOFF.emitLinkerFlagsForGlobal(FlagsOS, &Function); for (const auto &Global : M.globals()) - TLOFCOFF.emitLinkerFlagsForGlobal(FlagsOS, &Global, *Mang); + TLOFCOFF.emitLinkerFlagsForGlobal(FlagsOS, &Global); for (const auto &Alias : M.aliases()) - TLOFCOFF.emitLinkerFlagsForGlobal(FlagsOS, &Alias, *Mang); + TLOFCOFF.emitLinkerFlagsForGlobal(FlagsOS, &Alias); FlagsOS.flush(); @@ -656,6 +656,6 @@ void X86AsmPrinter::EmitEndOfAsmFile(Module &M) { // Force static initialization. extern "C" void LLVMInitializeX86AsmPrinter() { - RegisterAsmPrinter<X86AsmPrinter> X(TheX86_32Target); - RegisterAsmPrinter<X86AsmPrinter> Y(TheX86_64Target); + RegisterAsmPrinter<X86AsmPrinter> X(getTheX86_32Target()); + RegisterAsmPrinter<X86AsmPrinter> Y(getTheX86_64Target()); } diff --git a/contrib/llvm/lib/Target/X86/X86AsmPrinter.h b/contrib/llvm/lib/Target/X86/X86AsmPrinter.h index dcb7b5a..6798253 100644 --- a/contrib/llvm/lib/Target/X86/X86AsmPrinter.h +++ b/contrib/llvm/lib/Target/X86/X86AsmPrinter.h @@ -71,27 +71,6 @@ class LLVM_LIBRARY_VISIBILITY X86AsmPrinter : public AsmPrinter { StackMapShadowTracker SMShadowTracker; - // This describes the kind of sled we're storing in the XRay table. - enum class SledKind : uint8_t { - FUNCTION_ENTER = 0, - FUNCTION_EXIT = 1, - TAIL_CALL = 2, - }; - - // The table will contain these structs that point to the sled, the function - // containing the sled, and what kind of sled (and whether they should always - // be instrumented). - struct XRayFunctionEntry { - const MCSymbol *Sled; - const MCSymbol *Function; - SledKind Kind; - bool AlwaysInstrument; - const class Function *Fn; - }; - - // All the sleds to be emitted. - std::vector<XRayFunctionEntry> Sleds; - // All instructions emitted by the X86AsmPrinter should use this helper // method. // @@ -117,15 +96,13 @@ class LLVM_LIBRARY_VISIBILITY X86AsmPrinter : public AsmPrinter { // function. void EmitXRayTable(); - // Helper function to record a given XRay sled. - void recordSled(MCSymbol *Sled, const MachineInstr &MI, SledKind Kind); public: explicit X86AsmPrinter(TargetMachine &TM, std::unique_ptr<MCStreamer> Streamer) : AsmPrinter(TM, std::move(Streamer)), SM(*this), FM(*this) {} - const char *getPassName() const override { - return "X86 Assembly / Object Emitter"; + StringRef getPassName() const override { + return "X86 Assembly Printer"; } const X86Subtarget &getSubtarget() const { return *Subtarget; } diff --git a/contrib/llvm/lib/Target/X86/X86CallFrameOptimization.cpp b/contrib/llvm/lib/Target/X86/X86CallFrameOptimization.cpp index 8f6fc40..844c66d 100644 --- a/contrib/llvm/lib/Target/X86/X86CallFrameOptimization.cpp +++ b/contrib/llvm/lib/Target/X86/X86CallFrameOptimization.cpp @@ -100,7 +100,7 @@ private: const X86RegisterInfo &RegInfo, DenseSet<unsigned int> &UsedRegs); - const char *getPassName() const override { return "X86 Optimize Call Frame"; } + StringRef getPassName() const override { return "X86 Optimize Call Frame"; } const TargetInstrInfo *TII; const X86FrameLowering *TFL; @@ -134,7 +134,7 @@ bool X86CallFrameOptimization::isLegal(MachineFunction &MF) { // in the compact unwind encoding that Darwin uses. So, bail if there // is a danger of that being generated. 
if (STI->isTargetDarwin() && - (!MF.getMMI().getLandingPads().empty() || + (!MF.getLandingPads().empty() || (MF.getFunction()->needsUnwindTableEntry() && !TFL->hasFP(MF)))) return false; @@ -180,7 +180,7 @@ bool X86CallFrameOptimization::isProfitable(MachineFunction &MF, // This transformation is always a win when we do not expect to have // a reserved call frame. Under other circumstances, it may be either // a win or a loss, and requires a heuristic. - bool CannotReserveFrame = MF.getFrameInfo()->hasVarSizedObjects(); + bool CannotReserveFrame = MF.getFrameInfo().hasVarSizedObjects(); if (CannotReserveFrame) return true; @@ -230,7 +230,7 @@ bool X86CallFrameOptimization::runOnMachineFunction(MachineFunction &MF) { assert(isPowerOf2_32(SlotSize) && "Expect power of 2 stack slot size"); Log2SlotSize = Log2_32(SlotSize); - if (!isLegal(MF)) + if (skipFunction(*MF.getFunction()) || !isLegal(MF)) return false; unsigned FrameSetupOpcode = TII->getCallFrameSetupOpcode(); @@ -345,10 +345,10 @@ void X86CallFrameOptimization::collectCallInfo(MachineFunction &MF, return; } - // For globals in PIC mode, we can have some LEAs here. - // Ignore them, they don't bother us. + // Skip over DEBUG_VALUE. + // For globals in PIC mode, we can have some LEAs here. Skip them as well. // TODO: Extend this to something that covers more cases. - while (I->getOpcode() == X86::LEA32r) + while (I->getOpcode() == X86::LEA32r || I->isDebugValue()) ++I; unsigned StackPtr = RegInfo.getStackRegister(); diff --git a/contrib/llvm/lib/Target/X86/X86CallLowering.cpp b/contrib/llvm/lib/Target/X86/X86CallLowering.cpp new file mode 100644 index 0000000..5ae4962 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86CallLowering.cpp @@ -0,0 +1,46 @@ +//===-- llvm/lib/Target/X86/X86CallLowering.cpp - Call lowering -----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file implements the lowering of LLVM calls to machine code calls for +/// GlobalISel. +/// +//===----------------------------------------------------------------------===// + +#include "X86CallLowering.h" +#include "X86ISelLowering.h" +#include "X86InstrInfo.h" +#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" + +using namespace llvm; + +#ifndef LLVM_BUILD_GLOBAL_ISEL +#error "This shouldn't be built without GISel" +#endif + +X86CallLowering::X86CallLowering(const X86TargetLowering &TLI) + : CallLowering(&TLI) {} + +bool X86CallLowering::lowerReturn(MachineIRBuilder &MIRBuilder, + const Value *Val, unsigned VReg) const { + // TODO: handle functions returning non-void values. + if (Val) + return false; + + MIRBuilder.buildInstr(X86::RET).addImm(0); + + return true; +} + +bool X86CallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder, + const Function &F, + ArrayRef<unsigned> VRegs) const { + // TODO: handle functions with one or more arguments. + return F.arg_empty(); +} diff --git a/contrib/llvm/lib/Target/X86/X86CallLowering.h b/contrib/llvm/lib/Target/X86/X86CallLowering.h new file mode 100644 index 0000000..f2672f0 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86CallLowering.h @@ -0,0 +1,39 @@ +//===-- llvm/lib/Target/X86/X86CallLowering.h - Call lowering -----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. 
See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file describes how to lower LLVM calls to machine code calls. +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_X86_X86CALLLOWERING +#define LLVM_LIB_TARGET_X86_X86CALLLOWERING + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/CodeGen/GlobalISel/CallLowering.h" + +namespace llvm { + +class Function; +class MachineIRBuilder; +class X86TargetLowering; +class Value; + +class X86CallLowering : public CallLowering { +public: + X86CallLowering(const X86TargetLowering &TLI); + + bool lowerReturn(MachineIRBuilder &MIRBuiler, const Value *Val, + unsigned VReg) const override; + + bool lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F, + ArrayRef<unsigned> VRegs) const override; +}; +} // End of namespace llvm; +#endif diff --git a/contrib/llvm/lib/Target/X86/X86CallingConv.cpp b/contrib/llvm/lib/Target/X86/X86CallingConv.cpp new file mode 100644 index 0000000..c96e76b --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86CallingConv.cpp @@ -0,0 +1,208 @@ +//=== X86CallingConv.cpp - X86 Custom Calling Convention Impl -*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the implementation of custom routines for the X86
+// Calling Convention that aren't done by tablegen.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/X86MCTargetDesc.h"
+#include "X86Subtarget.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/IR/CallingConv.h"
+
+namespace llvm {
+
+bool CC_X86_32_RegCall_Assign2Regs(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+ CCValAssign::LocInfo &LocInfo,
+ ISD::ArgFlagsTy &ArgFlags, CCState &State) {
+ // List of GPRs that are available for storing values in the regcall
+ // calling convention.
+ static const MCPhysReg RegList[] = {X86::EAX, X86::ECX, X86::EDX, X86::EDI,
+ X86::ESI};
+
+ // This vector collects all the registers still available for allocation.
+ SmallVector<unsigned, 5> AvailableRegs;
+
+ // Search for the available registers.
+ for (auto Reg : RegList) {
+ if (!State.isAllocated(Reg))
+ AvailableRegs.push_back(Reg);
+ }
+
+ const size_t RequiredGprsUponSplit = 2;
+ if (AvailableRegs.size() < RequiredGprsUponSplit)
+ return false; // Not enough free registers - continue the search.
+
+ // Allocating the available registers.
+ for (unsigned I = 0; I < RequiredGprsUponSplit; I++) {
+
+ // Mark the register as allocated.
+ unsigned Reg = State.AllocateReg(AvailableRegs[I]);
+
+ // Since we previously made sure that 2 registers are available
+ // we expect that a real register number will be returned.
+ assert(Reg && "Expecting a register will be available");
+
+ // Assign the value to the allocated register
+ State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+ }
+
+ // Successfully allocated the registers - stop scanning further rules.
+ return true;
+}
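A minimal standalone sketch of the split the custom routine above arranges (plain C++ for illustration, not the LLVM CCState API; which GPR receives which half is decided later by the lowering code, not here):

  #include <cstdint>
  #include <utility>

  // Illustrative only: a 64-bit __mmask64 value is broken into two 32-bit
  // halves, one per allocated GPR, mirroring the two CCValAssign entries
  // recorded by CC_X86_32_RegCall_Assign2Regs.
  static std::pair<uint32_t, uint32_t> splitMask64(uint64_t Mask) {
    uint32_t Lo = static_cast<uint32_t>(Mask);       // goes in one of the two GPRs
    uint32_t Hi = static_cast<uint32_t>(Mask >> 32); // goes in the other GPR
    return {Lo, Hi};
  }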
+
+static ArrayRef<MCPhysReg> CC_X86_VectorCallGetSSEs(const MVT &ValVT) {
+ if (ValVT.is512BitVector()) {
+ static const MCPhysReg RegListZMM[] = {X86::ZMM0, X86::ZMM1, X86::ZMM2,
+ X86::ZMM3, X86::ZMM4, X86::ZMM5};
+ return makeArrayRef(std::begin(RegListZMM), std::end(RegListZMM));
+ }
+
+ if (ValVT.is256BitVector()) {
+ static const MCPhysReg RegListYMM[] = {X86::YMM0, X86::YMM1, X86::YMM2,
+ X86::YMM3, X86::YMM4, X86::YMM5};
+ return makeArrayRef(std::begin(RegListYMM), std::end(RegListYMM));
+ }
+
+ static const MCPhysReg RegListXMM[] = {X86::XMM0, X86::XMM1, X86::XMM2,
+ X86::XMM3, X86::XMM4, X86::XMM5};
+ return makeArrayRef(std::begin(RegListXMM), std::end(RegListXMM));
+}
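A condensed standalone view of the width-based choice above (illustrative; the strings stand in for the MCPhysReg lists returned by CC_X86_VectorCallGetSSEs):

  // Illustrative only: 512-bit values map to ZMM0-ZMM5, 256-bit to YMM0-YMM5,
  // and everything else (128-bit vectors and scalar FP) to XMM0-XMM5.
  static const char *vectorCallRegisterFile(unsigned SizeInBits) {
    if (SizeInBits == 512)
      return "ZMM0-ZMM5";
    if (SizeInBits == 256)
      return "YMM0-YMM5";
    return "XMM0-XMM5";
  }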
+
+static ArrayRef<MCPhysReg> CC_X86_64_VectorCallGetGPRs() {
+ static const MCPhysReg RegListGPR[] = {X86::RCX, X86::RDX, X86::R8, X86::R9};
+ return makeArrayRef(std::begin(RegListGPR), std::end(RegListGPR));
+}
+
+static bool CC_X86_VectorCallAssignRegister(unsigned &ValNo, MVT &ValVT,
+ MVT &LocVT,
+ CCValAssign::LocInfo &LocInfo,
+ ISD::ArgFlagsTy &ArgFlags,
+ CCState &State) {
+
+ ArrayRef<MCPhysReg> RegList = CC_X86_VectorCallGetSSEs(ValVT);
+ bool Is64bit = static_cast<const X86Subtarget &>(
+ State.getMachineFunction().getSubtarget())
+ .is64Bit();
+
+ for (auto Reg : RegList) {
+ // If the register is not marked as allocated - assign to it.
+ if (!State.isAllocated(Reg)) {
+ unsigned AssigedReg = State.AllocateReg(Reg);
+ assert(AssigedReg == Reg && "Expecting a valid register allocation");
+ State.addLoc(
+ CCValAssign::getReg(ValNo, ValVT, AssigedReg, LocVT, LocInfo));
+ return true;
+ }
+ // If the register is marked as shadow allocated - assign to it.
+ if (Is64bit && State.IsShadowAllocatedReg(Reg)) {
+ State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+ return true;
+ }
+ }
+
+ llvm_unreachable("Clang should ensure that hva marked vectors will have "
+ "an available register.");
+ return false;
+}
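The register scan above can be summarised by a small standalone model (not CCState; registers are indices 0-5 and their state is tracked explicitly):

  #include <array>

  // Illustrative only: take the first free register, or (on 64-bit targets)
  // the first register that was merely shadow-allocated by an earlier
  // argument, in exactly the order CC_X86_VectorCallAssignRegister checks.
  enum class RegState { Free, Shadow, Taken };

  static int pickVectorCallReg(const std::array<RegState, 6> &Regs, bool Is64Bit) {
    for (int Reg = 0; Reg < 6; ++Reg) {
      if (Regs[Reg] == RegState::Free)
        return Reg;                              // genuinely free
      if (Is64Bit && Regs[Reg] == RegState::Shadow)
        return Reg;                              // reuse the shadow slot
    }
    return -1;  // nothing available (the real routine treats this as unreachable)
  }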
+
+bool CC_X86_64_VectorCall(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+ CCValAssign::LocInfo &LocInfo,
+ ISD::ArgFlagsTy &ArgFlags, CCState &State) {
+ // On the second pass, go through the HVAs only.
+ if (ArgFlags.isSecArgPass()) {
+ if (ArgFlags.isHva())
+ return CC_X86_VectorCallAssignRegister(ValNo, ValVT, LocVT, LocInfo,
+ ArgFlags, State);
+ return true;
+ }
+
+ // Process only vector types as defined by vectorcall spec:
+ // "A vector type is either a floating-point type, for example,
+ // a float or double, or an SIMD vector type, for example, __m128 or __m256".
+ if (!(ValVT.isFloatingPoint() ||
+ (ValVT.isVector() && ValVT.getSizeInBits() >= 128))) {
+ // If R9 was already assigned, we are past the fourth argument and, because
+ // this is not an HVA / vector type, we need to allocate a shadow XMM
+ // register.
+ if (State.isAllocated(X86::R9)) {
+ // Assign shadow XMM register.
+ (void)State.AllocateReg(CC_X86_VectorCallGetSSEs(ValVT));
+ }
+
+ return false;
+ }
+
+ if (!ArgFlags.isHva() || ArgFlags.isHvaStart()) {
+ // Assign shadow GPR register.
+ (void)State.AllocateReg(CC_X86_64_VectorCallGetGPRs());
+
+ // Assign XMM register - (shadow for HVA and non-shadow for non HVA).
+ if (unsigned Reg = State.AllocateReg(CC_X86_VectorCallGetSSEs(ValVT))) {
+ // In the vectorcall calling convention, additional shadow stack can be
+ // allocated on top of the basic 32 bytes of Win64.
+ // This happens when the fifth or sixth argument is a vector type or an HVA;
+ // in that case, 8 bytes of shadow stack are allocated for each such argument.
+ if (Reg == X86::XMM4 || Reg == X86::XMM5)
+ State.AllocateStack(8, 8);
+
+ if (!ArgFlags.isHva()) {
+ State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+ return true; // Allocated a register - Stop the search.
+ }
+ }
+ }
+
+ // If this is an HVA - Stop the search,
+ // otherwise continue the search.
+ return ArgFlags.isHva();
+}
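The XMM4/XMM5 rule above amounts to a simple size computation (illustrative, assuming the standard 32-byte Win64 home area):

  // Illustrative only: Win64 always reserves 32 bytes of shadow space; under
  // vectorcall, every argument that lands in XMM4 or XMM5 adds 8 more bytes,
  // matching the AllocateStack(8, 8) calls above.
  static unsigned vectorCallShadowBytes(bool UsedXMM4, bool UsedXMM5) {
    unsigned Bytes = 32;
    if (UsedXMM4)
      Bytes += 8;
    if (UsedXMM5)
      Bytes += 8;
    return Bytes;
  }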
+
+bool CC_X86_32_VectorCall(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+ CCValAssign::LocInfo &LocInfo,
+ ISD::ArgFlagsTy &ArgFlags, CCState &State) {
+ // On the second pass, go through the HVAs only.
+ if (ArgFlags.isSecArgPass()) {
+ if (ArgFlags.isHva())
+ return CC_X86_VectorCallAssignRegister(ValNo, ValVT, LocVT, LocInfo,
+ ArgFlags, State);
+ return true;
+ }
+
+ // Process only vector types as defined by vectorcall spec:
+ // "A vector type is either a floating point type, for example,
+ // a float or double, or an SIMD vector type, for example, __m128 or __m256".
+ if (!(ValVT.isFloatingPoint() ||
+ (ValVT.isVector() && ValVT.getSizeInBits() >= 128))) {
+ return false;
+ }
+
+ if (ArgFlags.isHva())
+ return true; // If this is an HVA - Stop the search.
+
+ // Assign XMM register.
+ if (unsigned Reg = State.AllocateReg(CC_X86_VectorCallGetSSEs(ValVT))) {
+ State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+ return true;
+ }
+
+ // If we did not find an available XMM register for a vector,
+ // pass it indirectly.
+ // This is similar to CCPassIndirect, with the addition of inreg.
+ if (!ValVT.isFloatingPoint()) {
+ LocVT = MVT::i32;
+ LocInfo = CCValAssign::Indirect;
+ ArgFlags.setInReg();
+ }
+
+ return false; // No register was assigned - Continue the search.
+}
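A standalone model of the fallback above (not the LLVM API): when the XMM pool is exhausted, a vector argument is demoted to an i32 pointer passed inreg, while scalar FP values keep their type and simply continue down the delegated convention.

  // Illustrative only: mirrors the tail of CC_X86_32_VectorCall.
  enum class ArgKind { ScalarFP, Vector };

  struct Placement {
    bool InXMM;         // received an XMM register by value
    bool IndirectInReg; // demoted to an inreg pointer instead
  };

  static Placement place32BitVectorCallArg(ArgKind Kind, bool XmmAvailable) {
    if (XmmAvailable)
      return {true, false};                  // by value in XMM0-XMM5
    return {false, Kind == ArgKind::Vector}; // vectors go indirect, scalars fall through
  }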
+
+} // End llvm namespace
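For context, these conventions are selected from source code via calling-convention attributes; a hypothetical declaration (clang's spelling, shown only to indicate how regcall is requested, not which registers any particular argument receives):

  // Hypothetical example: the argument placement is governed by the
  // RC_X86_*_RegCall register classes in X86CallingConv.td together with the
  // custom routines in this file.
  long long __attribute__((regcall)) accumulate(int A, long long B, double C);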
diff --git a/contrib/llvm/lib/Target/X86/X86CallingConv.h b/contrib/llvm/lib/Target/X86/X86CallingConv.h index a08160f..c49a683 100644 --- a/contrib/llvm/lib/Target/X86/X86CallingConv.h +++ b/contrib/llvm/lib/Target/X86/X86CallingConv.h @@ -21,18 +21,32 @@ namespace llvm { -inline bool CC_X86_32_VectorCallIndirect(unsigned &ValNo, MVT &ValVT, - MVT &LocVT, - CCValAssign::LocInfo &LocInfo, - ISD::ArgFlagsTy &ArgFlags, - CCState &State) { - // Similar to CCPassIndirect, with the addition of inreg. - LocVT = MVT::i32; - LocInfo = CCValAssign::Indirect; - ArgFlags.setInReg(); - return false; // Continue the search, but now for i32. -} - +/// When regcall calling convention compiled to 32 bit arch, special treatment +/// is required for 64 bit masks. +/// The value should be assigned to two GPRs. +/// \return true if registers were allocated and false otherwise. +bool CC_X86_32_RegCall_Assign2Regs(unsigned &ValNo, MVT &ValVT, MVT &LocVT, + CCValAssign::LocInfo &LocInfo, + ISD::ArgFlagsTy &ArgFlags, CCState &State); + +/// Vectorcall calling convention has special handling for vector types or +/// HVA for 64 bit arch. +/// For HVAs shadow registers might be allocated on the first pass +/// and actual XMM registers are allocated on the second pass. +/// For vector types, actual XMM registers are allocated on the first pass. +/// \return true if registers were allocated and false otherwise. +bool CC_X86_64_VectorCall(unsigned &ValNo, MVT &ValVT, MVT &LocVT, + CCValAssign::LocInfo &LocInfo, + ISD::ArgFlagsTy &ArgFlags, CCState &State); + +/// Vectorcall calling convention has special handling for vector types or +/// HVA for 32 bit arch. +/// For HVAs actual XMM registers are allocated on the second pass. +/// For vector types, actual XMM registers are allocated on the first pass. +/// \return true if registers were allocated and false otherwise. 
+bool CC_X86_32_VectorCall(unsigned &ValNo, MVT &ValVT, MVT &LocVT, + CCValAssign::LocInfo &LocInfo, + ISD::ArgFlagsTy &ArgFlags, CCState &State); inline bool CC_X86_AnyReg_Error(unsigned &, MVT &, MVT &, CCValAssign::LocInfo &, ISD::ArgFlagsTy &, diff --git a/contrib/llvm/lib/Target/X86/X86CallingConv.td b/contrib/llvm/lib/Target/X86/X86CallingConv.td index 4cb62b5..cf7bc98 100644 --- a/contrib/llvm/lib/Target/X86/X86CallingConv.td +++ b/contrib/llvm/lib/Target/X86/X86CallingConv.td @@ -18,6 +18,179 @@ class CCIfSubtarget<string F, CCAction A> "(State.getMachineFunction().getSubtarget()).", F), A>; +// Register classes for RegCall +class RC_X86_RegCall { + list<Register> GPR_8 = []; + list<Register> GPR_16 = []; + list<Register> GPR_32 = []; + list<Register> GPR_64 = []; + list<Register> FP_CALL = [FP0]; + list<Register> FP_RET = [FP0, FP1]; + list<Register> XMM = []; + list<Register> YMM = []; + list<Register> ZMM = []; +} + +// RegCall register classes for 32 bits +def RC_X86_32_RegCall : RC_X86_RegCall { + let GPR_8 = [AL, CL, DL, DIL, SIL]; + let GPR_16 = [AX, CX, DX, DI, SI]; + let GPR_32 = [EAX, ECX, EDX, EDI, ESI]; + let GPR_64 = [RAX]; ///< Not actually used, but AssignToReg can't handle [] + ///< \todo Fix AssignToReg to enable empty lists + let XMM = [XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7]; + let YMM = [YMM0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7]; + let ZMM = [ZMM0, ZMM1, ZMM2, ZMM3, ZMM4, ZMM5, ZMM6, ZMM7]; +} + +class RC_X86_64_RegCall : RC_X86_RegCall { + let XMM = [XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7, + XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15]; + let YMM = [YMM0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7, + YMM8, YMM9, YMM10, YMM11, YMM12, YMM13, YMM14, YMM15]; + let ZMM = [ZMM0, ZMM1, ZMM2, ZMM3, ZMM4, ZMM5, ZMM6, ZMM7, + ZMM8, ZMM9, ZMM10, ZMM11, ZMM12, ZMM13, ZMM14, ZMM15]; +} + +def RC_X86_64_RegCall_Win : RC_X86_64_RegCall { + let GPR_8 = [AL, CL, DL, DIL, SIL, R8B, R9B, R10B, R11B, R12B, R14B, R15B]; + let GPR_16 = [AX, CX, DX, DI, SI, R8W, R9W, R10W, R11W, R12W, R14W, R15W]; + let GPR_32 = [EAX, ECX, EDX, EDI, ESI, R8D, R9D, R10D, R11D, R12D, R14D, R15D]; + let GPR_64 = [RAX, RCX, RDX, RDI, RSI, R8, R9, R10, R11, R12, R14, R15]; +} + +def RC_X86_64_RegCall_SysV : RC_X86_64_RegCall { + let GPR_8 = [AL, CL, DL, DIL, SIL, R8B, R9B, R12B, R13B, R14B, R15B]; + let GPR_16 = [AX, CX, DX, DI, SI, R8W, R9W, R12W, R13W, R14W, R15W]; + let GPR_32 = [EAX, ECX, EDX, EDI, ESI, R8D, R9D, R12D, R13D, R14D, R15D]; + let GPR_64 = [RAX, RCX, RDX, RDI, RSI, R8, R9, R12, R13, R14, R15]; +} + +// X86-64 Intel regcall calling convention. +multiclass X86_RegCall_base<RC_X86_RegCall RC> { +def CC_#NAME : CallingConv<[ + // Handles byval parameters. + CCIfSubtarget<"is64Bit()", CCIfByVal<CCPassByVal<8, 8>>>, + CCIfByVal<CCPassByVal<4, 4>>, + + // Promote i1/i8/i16 arguments to i32. + CCIfType<[i1, i8, i16], CCPromoteToType<i32>>, + + // Promote v8i1/v16i1/v32i1 arguments to i32. 
+ CCIfType<[v8i1, v16i1, v32i1], CCPromoteToType<i32>>, + + // bool, char, int, enum, long, pointer --> GPR + CCIfType<[i32], CCAssignToReg<RC.GPR_32>>, + + // long long, __int64 --> GPR + CCIfType<[i64], CCAssignToReg<RC.GPR_64>>, + + // __mmask64 (v64i1) --> GPR64 (for x64) or 2 x GPR32 (for IA32) + CCIfType<[v64i1], CCPromoteToType<i64>>, + CCIfSubtarget<"is64Bit()", CCIfType<[i64], + CCAssignToReg<RC.GPR_64>>>, + CCIfSubtarget<"is32Bit()", CCIfType<[i64], + CCCustom<"CC_X86_32_RegCall_Assign2Regs">>>, + + // float, double, float128 --> XMM + // In the case of SSE disabled --> save to stack + CCIfType<[f32, f64, f128], + CCIfSubtarget<"hasSSE1()", CCAssignToReg<RC.XMM>>>, + + // long double --> FP + CCIfType<[f80], CCAssignToReg<RC.FP_CALL>>, + + // __m128, __m128i, __m128d --> XMM + // In the case of SSE disabled --> save to stack + CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], + CCIfSubtarget<"hasSSE1()", CCAssignToReg<RC.XMM>>>, + + // __m256, __m256i, __m256d --> YMM + // In the case of SSE disabled --> save to stack + CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], + CCIfSubtarget<"hasAVX()", CCAssignToReg<RC.YMM>>>, + + // __m512, __m512i, __m512d --> ZMM + // In the case of SSE disabled --> save to stack + CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64], + CCIfSubtarget<"hasAVX512()",CCAssignToReg<RC.ZMM>>>, + + // If no register was found -> assign to stack + + // In 64 bit, assign 64/32 bit values to 8 byte stack + CCIfSubtarget<"is64Bit()", CCIfType<[i32, i64, f32, f64], + CCAssignToStack<8, 8>>>, + + // In 32 bit, assign 64/32 bit values to 8/4 byte stack + CCIfType<[i32, f32], CCAssignToStack<4, 4>>, + CCIfType<[i64, f64], CCAssignToStack<8, 4>>, + + // MMX type gets 8 byte slot in stack , while alignment depends on target + CCIfSubtarget<"is64Bit()", CCIfType<[x86mmx], CCAssignToStack<8, 8>>>, + CCIfType<[x86mmx], CCAssignToStack<8, 4>>, + + // float 128 get stack slots whose size and alignment depends + // on the subtarget. + CCIfType<[f80, f128], CCAssignToStack<0, 0>>, + + // Vectors get 16-byte stack slots that are 16-byte aligned. + CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], + CCAssignToStack<16, 16>>, + + // 256-bit vectors get 32-byte stack slots that are 32-byte aligned. + CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], + CCAssignToStack<32, 32>>, + + // 512-bit vectors get 64-byte stack slots that are 64-byte aligned. + CCIfType<[v16i32, v8i64, v16f32, v8f64], CCAssignToStack<64, 64>> +]>; + +def RetCC_#NAME : CallingConv<[ + // Promote i1, v8i1 arguments to i8. + CCIfType<[i1, v8i1], CCPromoteToType<i8>>, + + // Promote v16i1 arguments to i16. + CCIfType<[v16i1], CCPromoteToType<i16>>, + + // Promote v32i1 arguments to i32. 
+ CCIfType<[v32i1], CCPromoteToType<i32>>, + + // bool, char, int, enum, long, pointer --> GPR + CCIfType<[i8], CCAssignToReg<RC.GPR_8>>, + CCIfType<[i16], CCAssignToReg<RC.GPR_16>>, + CCIfType<[i32], CCAssignToReg<RC.GPR_32>>, + + // long long, __int64 --> GPR + CCIfType<[i64], CCAssignToReg<RC.GPR_64>>, + + // __mmask64 (v64i1) --> GPR64 (for x64) or 2 x GPR32 (for IA32) + CCIfType<[v64i1], CCPromoteToType<i64>>, + CCIfSubtarget<"is64Bit()", CCIfType<[i64], + CCAssignToReg<RC.GPR_64>>>, + CCIfSubtarget<"is32Bit()", CCIfType<[i64], + CCCustom<"CC_X86_32_RegCall_Assign2Regs">>>, + + // long double --> FP + CCIfType<[f80], CCAssignToReg<RC.FP_RET>>, + + // float, double, float128 --> XMM + CCIfType<[f32, f64, f128], + CCIfSubtarget<"hasSSE1()", CCAssignToReg<RC.XMM>>>, + + // __m128, __m128i, __m128d --> XMM + CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], + CCIfSubtarget<"hasSSE1()", CCAssignToReg<RC.XMM>>>, + + // __m256, __m256i, __m256d --> YMM + CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], + CCIfSubtarget<"hasAVX()", CCAssignToReg<RC.YMM>>>, + + // __m512, __m512i, __m512d --> ZMM + CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64], + CCIfSubtarget<"hasAVX512()", CCAssignToReg<RC.ZMM>>> +]>; +} + //===----------------------------------------------------------------------===// // Return Value Calling Conventions //===----------------------------------------------------------------------===// @@ -135,20 +308,12 @@ def RetCC_X86_32_HiPE : CallingConv<[ CCIfType<[i32], CCAssignToReg<[ESI, EBP, EAX, EDX]>> ]>; -// X86-32 HiPE return-value convention. +// X86-32 Vectorcall return-value convention. def RetCC_X86_32_VectorCall : CallingConv<[ - // Vector types are returned in XMM0,XMM1,XMMM2 and XMM3. - CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], + // Floating Point types are returned in XMM0,XMM1,XMMM2 and XMM3. + CCIfType<[f32, f64, f128], CCAssignToReg<[XMM0,XMM1,XMM2,XMM3]>>, - // 256-bit FP vectors - CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], - CCAssignToReg<[YMM0,YMM1,YMM2,YMM3]>>, - - // 512-bit FP vectors - CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64], - CCAssignToReg<[ZMM0,ZMM1,ZMM2,ZMM3]>>, - // Return integers in the standard way. CCDelegateTo<RetCC_X86Common> ]>; @@ -177,6 +342,16 @@ def RetCC_X86_Win64_C : CallingConv<[ CCDelegateTo<RetCC_X86_64_C> ]>; +// X86-64 vectorcall return-value convention. +def RetCC_X86_64_Vectorcall : CallingConv<[ + // Vectorcall calling convention always returns FP values in XMMs. + CCIfType<[f32, f64, f128], + CCAssignToReg<[XMM0, XMM1, XMM2, XMM3]>>, + + // Otherwise, everything is the same as Windows X86-64 C CC. + CCDelegateTo<RetCC_X86_Win64_C> +]>; + // X86-64 HiPE return-value convention. def RetCC_X86_64_HiPE : CallingConv<[ // Promote all types to i64 @@ -196,6 +371,9 @@ def RetCC_X86_64_WebKit_JS : CallingConv<[ ]>; def RetCC_X86_64_Swift : CallingConv<[ + + CCIfSwiftError<CCIfType<[i64], CCAssignToReg<[R12]>>>, + // For integers, ECX, R8D can be used as extra return registers. CCIfType<[i1], CCPromoteToType<i8>>, CCIfType<[i8] , CCAssignToReg<[AL, DL, CL, R8B]>>, @@ -234,6 +412,14 @@ def RetCC_X86_64_HHVM: CallingConv<[ RAX, R10, R11, R13, R14, R15]>> ]>; + +defm X86_32_RegCall : + X86_RegCall_base<RC_X86_32_RegCall>; +defm X86_Win64_RegCall : + X86_RegCall_base<RC_X86_64_RegCall_Win>; +defm X86_SysV64_RegCall : + X86_RegCall_base<RC_X86_64_RegCall_SysV>; + // This is the root return-value convention for the X86-32 backend. 
def RetCC_X86_32 : CallingConv<[ // If FastCC, use RetCC_X86_32_Fast. @@ -241,6 +427,7 @@ def RetCC_X86_32 : CallingConv<[ // If HiPE, use RetCC_X86_32_HiPE. CCIfCC<"CallingConv::HiPE", CCDelegateTo<RetCC_X86_32_HiPE>>, CCIfCC<"CallingConv::X86_VectorCall", CCDelegateTo<RetCC_X86_32_VectorCall>>, + CCIfCC<"CallingConv::X86_RegCall", CCDelegateTo<RetCC_X86_32_RegCall>>, // Otherwise, use RetCC_X86_32_C. CCDelegateTo<RetCC_X86_32_C> @@ -262,9 +449,17 @@ def RetCC_X86_64 : CallingConv<[ CCIfCC<"CallingConv::X86_64_Win64", CCDelegateTo<RetCC_X86_Win64_C>>, CCIfCC<"CallingConv::X86_64_SysV", CCDelegateTo<RetCC_X86_64_C>>, + // Handle Vectorcall CC + CCIfCC<"CallingConv::X86_VectorCall", CCDelegateTo<RetCC_X86_64_Vectorcall>>, + // Handle HHVM calls. CCIfCC<"CallingConv::HHVM", CCDelegateTo<RetCC_X86_64_HHVM>>, + CCIfCC<"CallingConv::X86_RegCall", + CCIfSubtarget<"isTargetWin64()", + CCDelegateTo<RetCC_X86_Win64_RegCall>>>, + CCIfCC<"CallingConv::X86_RegCall", CCDelegateTo<RetCC_X86_SysV64_RegCall>>, + // Mingw64 and native Win64 use Win64 CC CCIfSubtarget<"isTargetWin64()", CCDelegateTo<RetCC_X86_Win64_C>>, @@ -436,18 +631,7 @@ def CC_X86_Win64_C : CallingConv<[ ]>; def CC_X86_Win64_VectorCall : CallingConv<[ - // The first 6 floating point and vector types of 128 bits or less use - // XMM0-XMM5. - CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], - CCAssignToReg<[XMM0, XMM1, XMM2, XMM3, XMM4, XMM5]>>, - - // 256-bit vectors use YMM registers. - CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], - CCAssignToReg<[YMM0, YMM1, YMM2, YMM3, YMM4, YMM5]>>, - - // 512-bit vectors use ZMM registers. - CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64], - CCAssignToReg<[ZMM0, ZMM1, ZMM2, ZMM3, ZMM4, ZMM5]>>, + CCCustom<"CC_X86_64_VectorCall">, // Delegate to fastcall to handle integer types. CCDelegateTo<CC_X86_Win64_C> @@ -657,25 +841,9 @@ def CC_X86_32_FastCall : CallingConv<[ CCDelegateTo<CC_X86_32_Common> ]>; -def CC_X86_32_VectorCall : CallingConv<[ - // The first 6 floating point and vector types of 128 bits or less use - // XMM0-XMM5. - CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], - CCAssignToReg<[XMM0, XMM1, XMM2, XMM3, XMM4, XMM5]>>, - - // 256-bit vectors use YMM registers. - CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], - CCAssignToReg<[YMM0, YMM1, YMM2, YMM3, YMM4, YMM5]>>, - - // 512-bit vectors use ZMM registers. - CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64], - CCAssignToReg<[ZMM0, ZMM1, ZMM2, ZMM3, ZMM4, ZMM5]>>, - - // Otherwise, pass it indirectly. - CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64, - v32i8, v16i16, v8i32, v4i64, v8f32, v4f64, - v64i8, v32i16, v16i32, v8i64, v16f32, v8f64], - CCCustom<"CC_X86_32_VectorCallIndirect">>, +def CC_X86_Win32_VectorCall : CallingConv<[ + // Pass floating point in XMMs + CCCustom<"CC_X86_32_VectorCall">, // Delegate to fastcall to handle integer types. 
CCDelegateTo<CC_X86_32_FastCall> @@ -809,11 +977,12 @@ def CC_X86_32 : CallingConv<[ CCIfCC<"CallingConv::X86_INTR", CCDelegateTo<CC_X86_32_Intr>>, CCIfSubtarget<"isTargetMCU()", CCDelegateTo<CC_X86_32_MCU>>, CCIfCC<"CallingConv::X86_FastCall", CCDelegateTo<CC_X86_32_FastCall>>, - CCIfCC<"CallingConv::X86_VectorCall", CCDelegateTo<CC_X86_32_VectorCall>>, + CCIfCC<"CallingConv::X86_VectorCall", CCDelegateTo<CC_X86_Win32_VectorCall>>, CCIfCC<"CallingConv::X86_ThisCall", CCDelegateTo<CC_X86_32_ThisCall>>, CCIfCC<"CallingConv::Fast", CCDelegateTo<CC_X86_32_FastCC>>, CCIfCC<"CallingConv::GHC", CCDelegateTo<CC_X86_32_GHC>>, CCIfCC<"CallingConv::HiPE", CCDelegateTo<CC_X86_32_HiPE>>, + CCIfCC<"CallingConv::X86_RegCall", CCDelegateTo<CC_X86_32_RegCall>>, // Otherwise, drop to normal X86-32 CC CCDelegateTo<CC_X86_32_C> @@ -830,6 +999,9 @@ def CC_X86_64 : CallingConv<[ CCIfCC<"CallingConv::X86_VectorCall", CCDelegateTo<CC_X86_Win64_VectorCall>>, CCIfCC<"CallingConv::HHVM", CCDelegateTo<CC_X86_64_HHVM>>, CCIfCC<"CallingConv::HHVM_C", CCDelegateTo<CC_X86_64_HHVM_C>>, + CCIfCC<"CallingConv::X86_RegCall", + CCIfSubtarget<"isTargetWin64()", CCDelegateTo<CC_X86_Win64_RegCall>>>, + CCIfCC<"CallingConv::X86_RegCall", CCDelegateTo<CC_X86_SysV64_RegCall>>, CCIfCC<"CallingConv::X86_INTR", CCDelegateTo<CC_X86_64_Intr>>, // Mingw64 and native Win64 use Win64 CC @@ -860,7 +1032,9 @@ def CSR_64_SwiftError : CalleeSavedRegs<(sub CSR_64, R12)>; def CSR_32EHRet : CalleeSavedRegs<(add EAX, EDX, CSR_32)>; def CSR_64EHRet : CalleeSavedRegs<(add RAX, RDX, CSR_64)>; -def CSR_Win64 : CalleeSavedRegs<(add RBX, RBP, RDI, RSI, R12, R13, R14, R15, +def CSR_Win64_NoSSE : CalleeSavedRegs<(add RBX, RBP, RDI, RSI, R12, R13, R14, R15)>; + +def CSR_Win64 : CalleeSavedRegs<(add CSR_Win64_NoSSE, (sequence "XMM%u", 6, 15))>; // The function used by Darwin to obtain the address of a thread-local variable @@ -931,3 +1105,17 @@ def CSR_64_Intel_OCL_BI_AVX512 : CalleeSavedRegs<(add RBX, RDI, RSI, R14, R15, // Only R12 is preserved for PHP calls in HHVM. def CSR_64_HHVM : CalleeSavedRegs<(add R12)>; + +// Register calling convention preserves few GPR and XMM8-15 +def CSR_32_RegCall_NoSSE : CalleeSavedRegs<(add ESI, EDI, EBX, EBP, ESP)>; +def CSR_32_RegCall : CalleeSavedRegs<(add CSR_32_RegCall_NoSSE, + (sequence "XMM%u", 4, 7))>; +def CSR_Win64_RegCall_NoSSE : CalleeSavedRegs<(add RBX, RBP, RSP, + (sequence "R%u", 10, 15))>; +def CSR_Win64_RegCall : CalleeSavedRegs<(add CSR_Win64_RegCall_NoSSE, + (sequence "XMM%u", 8, 15))>; +def CSR_SysV64_RegCall_NoSSE : CalleeSavedRegs<(add RBX, RBP, RSP, + (sequence "R%u", 12, 15))>; +def CSR_SysV64_RegCall : CalleeSavedRegs<(add CSR_SysV64_RegCall_NoSSE, + (sequence "XMM%u", 8, 15))>; + diff --git a/contrib/llvm/lib/Target/X86/X86EvexToVex.cpp b/contrib/llvm/lib/Target/X86/X86EvexToVex.cpp new file mode 100755 index 0000000..bdd1ab5 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86EvexToVex.cpp @@ -0,0 +1,213 @@ +//===----------------------- X86EvexToVex.cpp ----------------------------===// +// Compress EVEX instructions to VEX encoding when possible to reduce code size +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===---------------------------------------------------------------------===// +/// \file +/// This file defines the pass that goes over all AVX-512 instructions which +/// are encoded using the EVEX prefix and if possible replaces them by their +/// corresponding VEX encoding which is usually shorter by 2 bytes. +/// EVEX instructions may be encoded via the VEX prefix when the AVX-512 +/// instruction has a corresponding AVX/AVX2 opcode and when it does not +/// use the xmm or the mask registers or xmm/ymm registers wuith indexes +/// higher than 15. +/// The pass applies code reduction on the generated code for AVX-512 instrs. +/// +//===---------------------------------------------------------------------===// + +#include "InstPrinter/X86InstComments.h" +#include "X86.h" +#include "X86InstrBuilder.h" +#include "X86InstrInfo.h" +#include "X86InstrTablesInfo.h" +#include "X86MachineFunctionInfo.h" +#include "X86Subtarget.h" +#include "X86TargetMachine.h" + +using namespace llvm; + +#define EVEX2VEX_DESC "Compressing EVEX instrs to VEX encoding when possible" +#define EVEX2VEX_NAME "x86-evex-to-vex-compress" + +#define DEBUG_TYPE EVEX2VEX_NAME + +namespace { + +class EvexToVexInstPass : public MachineFunctionPass { + + /// X86EvexToVexCompressTable - Evex to Vex encoding opcode map. + typedef DenseMap<unsigned, uint16_t> EvexToVexTableType; + EvexToVexTableType EvexToVex128Table; + EvexToVexTableType EvexToVex256Table; + + /// For EVEX instructions that can be encoded using VEX encoding, replace + /// them by the VEX encoding in order to reduce size. + bool CompressEvexToVexImpl(MachineInstr &MI) const; + + /// For initializing the hash map tables of all AVX-512 EVEX + /// corresponding to AVX/AVX2 opcodes. + void AddTableEntry(EvexToVexTableType &EvexToVexTable, uint16_t EvexOp, + uint16_t VexOp); + +public: + static char ID; + + StringRef getPassName() const override { return EVEX2VEX_DESC; } + + EvexToVexInstPass() : MachineFunctionPass(ID) { + initializeEvexToVexInstPassPass(*PassRegistry::getPassRegistry()); + + // Initialize the EVEX to VEX 128 table map. + for (X86EvexToVexCompressTableEntry Entry : X86EvexToVex128CompressTable) { + AddTableEntry(EvexToVex128Table, Entry.EvexOpcode, Entry.VexOpcode); + } + + // Initialize the EVEX to VEX 256 table map. + for (X86EvexToVexCompressTableEntry Entry : X86EvexToVex256CompressTable) { + AddTableEntry(EvexToVex256Table, Entry.EvexOpcode, Entry.VexOpcode); + } + } + + /// Loop over all of the basic blocks, replacing EVEX instructions + /// by equivalent VEX instructions when possible for reducing code size. + bool runOnMachineFunction(MachineFunction &MF) override; + + // This pass runs after regalloc and doesn't support VReg operands. + MachineFunctionProperties getRequiredProperties() const override { + return MachineFunctionProperties().set( + MachineFunctionProperties::Property::NoVRegs); + } + +private: + /// Machine instruction info used throughout the class. 
+ const X86InstrInfo *TII; +}; + +char EvexToVexInstPass::ID = 0; +} + +INITIALIZE_PASS(EvexToVexInstPass, EVEX2VEX_NAME, EVEX2VEX_DESC, false, false) + +FunctionPass *llvm::createX86EvexToVexInsts() { + return new EvexToVexInstPass(); +} + +bool EvexToVexInstPass::runOnMachineFunction(MachineFunction &MF) { + TII = MF.getSubtarget<X86Subtarget>().getInstrInfo(); + + const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>(); + if (!ST.hasAVX512()) + return false; + + bool Changed = false; + + /// Go over all basic blocks in function and replace + /// EVEX encoded instrs by VEX encoding when possible. + for (MachineBasicBlock &MBB : MF) { + + // Traverse the basic block. + for (MachineInstr &MI : MBB) + Changed |= CompressEvexToVexImpl(MI); + } + + return Changed; +} + +void EvexToVexInstPass::AddTableEntry(EvexToVexTableType &EvexToVexTable, + uint16_t EvexOp, uint16_t VexOp) { + EvexToVexTable[EvexOp] = VexOp; +} + +// For EVEX instructions that can be encoded using VEX encoding +// replace them by the VEX encoding in order to reduce size. +bool EvexToVexInstPass::CompressEvexToVexImpl(MachineInstr &MI) const { + + // VEX format. + // # of bytes: 0,2,3 1 1 0,1 0,1,2,4 0,1 + // [Prefixes] [VEX] OPCODE ModR/M [SIB] [DISP] [IMM] + // + // EVEX format. + // # of bytes: 4 1 1 1 4 / 1 1 + // [Prefixes] EVEX Opcode ModR/M [SIB] [Disp32] / [Disp8*N] [Immediate] + + const MCInstrDesc &Desc = MI.getDesc(); + + // Check for EVEX instructions only. + if ((Desc.TSFlags & X86II::EncodingMask) != X86II::EVEX) + return false; + + // Check for EVEX instructions with mask or broadcast as in these cases + // the EVEX prefix is needed in order to carry this information + // thus preventing the transformation to VEX encoding. + if (Desc.TSFlags & (X86II::EVEX_K | X86II::EVEX_B)) + return false; + + // Check for non EVEX_V512 instrs only. + // EVEX_V512 instr: bit EVEX_L2 = 1; bit VEX_L = 0. + if ((Desc.TSFlags & X86II::EVEX_L2) && !(Desc.TSFlags & X86II::VEX_L)) + return false; + + // EVEX_V128 instr: bit EVEX_L2 = 0, bit VEX_L = 0. + bool IsEVEX_V128 = + (!(Desc.TSFlags & X86II::EVEX_L2) && !(Desc.TSFlags & X86II::VEX_L)); + + // EVEX_V256 instr: bit EVEX_L2 = 0, bit VEX_L = 1. + bool IsEVEX_V256 = + (!(Desc.TSFlags & X86II::EVEX_L2) && (Desc.TSFlags & X86II::VEX_L)); + + unsigned NewOpc = 0; + + // Check for EVEX_V256 instructions. + if (IsEVEX_V256) { + // Search for opcode in the EvexToVex256 table. + auto It = EvexToVex256Table.find(MI.getOpcode()); + if (It != EvexToVex256Table.end()) + NewOpc = It->second; + } + + // Check for EVEX_V128 or Scalar instructions. + else if (IsEVEX_V128) { + // Search for opcode in the EvexToVex128 table. + auto It = EvexToVex128Table.find(MI.getOpcode()); + if (It != EvexToVex128Table.end()) + NewOpc = It->second; + } + + if (!NewOpc) + return false; + + auto isHiRegIdx = [](unsigned Reg) { + // Check for XMM register with indexes between 16 - 31. + if (Reg >= X86::XMM16 && Reg <= X86::XMM31) + return true; + + // Check for YMM register with indexes between 16 - 31. + if (Reg >= X86::YMM16 && Reg <= X86::YMM31) + return true; + + return false; + }; + + // Check that operands are not ZMM regs or + // XMM/YMM regs with hi indexes between 16 - 31. 
+ for (const MachineOperand &MO : MI.explicit_operands()) { + if (!MO.isReg()) + continue; + + unsigned Reg = MO.getReg(); + + assert (!(Reg >= X86::ZMM0 && Reg <= X86::ZMM31)); + + if (isHiRegIdx(Reg)) + return false; + } + + const MCInstrDesc &MCID = TII->get(NewOpc); + MI.setDesc(MCID); + MI.setAsmPrinterFlag(AC_EVEX_2_VEX); + return true; +} diff --git a/contrib/llvm/lib/Target/X86/X86ExpandPseudo.cpp b/contrib/llvm/lib/Target/X86/X86ExpandPseudo.cpp index 093fed7..985acf9 100644 --- a/contrib/llvm/lib/Target/X86/X86ExpandPseudo.cpp +++ b/contrib/llvm/lib/Target/X86/X86ExpandPseudo.cpp @@ -51,10 +51,10 @@ public: MachineFunctionProperties getRequiredProperties() const override { return MachineFunctionProperties().set( - MachineFunctionProperties::Property::AllVRegsAllocated); + MachineFunctionProperties::Property::NoVRegs); } - const char *getPassName() const override { + StringRef getPassName() const override { return "X86 pseudo instruction expansion pass"; } @@ -94,7 +94,7 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB, assert(MaxTCDelta <= 0 && "MaxTCDelta should never be positive"); // Incoporate the retaddr area. - Offset = StackAdj-MaxTCDelta; + Offset = StackAdj - MaxTCDelta; assert(Offset >= 0 && "Offset should never be negative"); if (Offset) { @@ -106,14 +106,22 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB, // Jump to label or value in register. bool IsWin64 = STI->isTargetWin64(); if (Opcode == X86::TCRETURNdi || Opcode == X86::TCRETURNdi64) { - unsigned Op = (Opcode == X86::TCRETURNdi) - ? X86::TAILJMPd - : (IsWin64 ? X86::TAILJMPd64_REX : X86::TAILJMPd64); + unsigned Op; + switch (Opcode) { + case X86::TCRETURNdi: + Op = X86::TAILJMPd; + break; + default: + // Note: Win64 uses REX prefixes indirect jumps out of functions, but + // not direct ones. + Op = X86::TAILJMPd64; + break; + } MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(Op)); - if (JumpTarget.isGlobal()) + if (JumpTarget.isGlobal()) { MIB.addGlobalAddress(JumpTarget.getGlobal(), JumpTarget.getOffset(), JumpTarget.getTargetFlags()); - else { + } else { assert(JumpTarget.isSymbol()); MIB.addExternalSymbol(JumpTarget.getSymbolName(), JumpTarget.getTargetFlags()); diff --git a/contrib/llvm/lib/Target/X86/X86FastISel.cpp b/contrib/llvm/lib/Target/X86/X86FastISel.cpp index dfe3c80..c890fdd 100644 --- a/contrib/llvm/lib/Target/X86/X86FastISel.cpp +++ b/contrib/llvm/lib/Target/X86/X86FastISel.cpp @@ -170,6 +170,12 @@ private: const MachineInstrBuilder &addFullAddress(const MachineInstrBuilder &MIB, X86AddressMode &AM); + + unsigned fastEmitInst_rrrr(unsigned MachineInstOpcode, + const TargetRegisterClass *RC, unsigned Op0, + bool Op0IsKill, unsigned Op1, bool Op1IsKill, + unsigned Op2, bool Op2IsKill, unsigned Op3, + bool Op3IsKill); }; } // end anonymous namespace. 
@@ -182,18 +188,18 @@ getX86ConditionCode(CmpInst::Predicate Predicate) { default: break; // Floating-point Predicates case CmpInst::FCMP_UEQ: CC = X86::COND_E; break; - case CmpInst::FCMP_OLT: NeedSwap = true; // fall-through + case CmpInst::FCMP_OLT: NeedSwap = true; LLVM_FALLTHROUGH; case CmpInst::FCMP_OGT: CC = X86::COND_A; break; - case CmpInst::FCMP_OLE: NeedSwap = true; // fall-through + case CmpInst::FCMP_OLE: NeedSwap = true; LLVM_FALLTHROUGH; case CmpInst::FCMP_OGE: CC = X86::COND_AE; break; - case CmpInst::FCMP_UGT: NeedSwap = true; // fall-through + case CmpInst::FCMP_UGT: NeedSwap = true; LLVM_FALLTHROUGH; case CmpInst::FCMP_ULT: CC = X86::COND_B; break; - case CmpInst::FCMP_UGE: NeedSwap = true; // fall-through + case CmpInst::FCMP_UGE: NeedSwap = true; LLVM_FALLTHROUGH; case CmpInst::FCMP_ULE: CC = X86::COND_BE; break; case CmpInst::FCMP_ONE: CC = X86::COND_NE; break; case CmpInst::FCMP_UNO: CC = X86::COND_P; break; case CmpInst::FCMP_ORD: CC = X86::COND_NP; break; - case CmpInst::FCMP_OEQ: // fall-through + case CmpInst::FCMP_OEQ: LLVM_FALLTHROUGH; case CmpInst::FCMP_UNE: CC = X86::COND_INVALID; break; // Integer Predicates @@ -229,15 +235,15 @@ getX86SSEConditionCode(CmpInst::Predicate Predicate) { switch (Predicate) { default: llvm_unreachable("Unexpected predicate"); case CmpInst::FCMP_OEQ: CC = 0; break; - case CmpInst::FCMP_OGT: NeedSwap = true; // fall-through + case CmpInst::FCMP_OGT: NeedSwap = true; LLVM_FALLTHROUGH; case CmpInst::FCMP_OLT: CC = 1; break; - case CmpInst::FCMP_OGE: NeedSwap = true; // fall-through + case CmpInst::FCMP_OGE: NeedSwap = true; LLVM_FALLTHROUGH; case CmpInst::FCMP_OLE: CC = 2; break; case CmpInst::FCMP_UNO: CC = 3; break; case CmpInst::FCMP_UNE: CC = 4; break; - case CmpInst::FCMP_ULE: NeedSwap = true; // fall-through + case CmpInst::FCMP_ULE: NeedSwap = true; LLVM_FALLTHROUGH; case CmpInst::FCMP_UGE: CC = 5; break; - case CmpInst::FCMP_ULT: NeedSwap = true; // fall-through + case CmpInst::FCMP_ULT: NeedSwap = true; LLVM_FALLTHROUGH; case CmpInst::FCMP_UGT: CC = 6; break; case CmpInst::FCMP_ORD: CC = 7; break; case CmpInst::FCMP_UEQ: @@ -351,6 +357,8 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM, bool HasSSE41 = Subtarget->hasSSE41(); bool HasAVX = Subtarget->hasAVX(); bool HasAVX2 = Subtarget->hasAVX2(); + bool HasAVX512 = Subtarget->hasAVX512(); + bool HasVLX = Subtarget->hasVLX(); bool IsNonTemporal = MMO && MMO->isNonTemporal(); // Get opcode and regclass of the output for the given load instruction. @@ -378,7 +386,7 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM, break; case MVT::f32: if (X86ScalarSSEf32) { - Opc = HasAVX ? X86::VMOVSSrm : X86::MOVSSrm; + Opc = HasAVX512 ? X86::VMOVSSZrm : HasAVX ? X86::VMOVSSrm : X86::MOVSSrm; RC = &X86::FR32RegClass; } else { Opc = X86::LD_Fp32m; @@ -387,7 +395,7 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM, break; case MVT::f64: if (X86ScalarSSEf64) { - Opc = HasAVX ? X86::VMOVSDrm : X86::MOVSDrm; + Opc = HasAVX512 ? X86::VMOVSDZrm : HasAVX ? X86::VMOVSDrm : X86::MOVSDrm; RC = &X86::FR64RegClass; } else { Opc = X86::LD_Fp64m; @@ -399,20 +407,26 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM, return false; case MVT::v4f32: if (IsNonTemporal && Alignment >= 16 && HasSSE41) - Opc = HasAVX ? X86::VMOVNTDQArm : X86::MOVNTDQArm; + Opc = HasVLX ? X86::VMOVNTDQAZ128rm : + HasAVX ? X86::VMOVNTDQArm : X86::MOVNTDQArm; else if (Alignment >= 16) - Opc = HasAVX ? X86::VMOVAPSrm : X86::MOVAPSrm; + Opc = HasVLX ? 
X86::VMOVAPSZ128rm : + HasAVX ? X86::VMOVAPSrm : X86::MOVAPSrm; else - Opc = HasAVX ? X86::VMOVUPSrm : X86::MOVUPSrm; + Opc = HasVLX ? X86::VMOVUPSZ128rm : + HasAVX ? X86::VMOVUPSrm : X86::MOVUPSrm; RC = &X86::VR128RegClass; break; case MVT::v2f64: if (IsNonTemporal && Alignment >= 16 && HasSSE41) - Opc = HasAVX ? X86::VMOVNTDQArm : X86::MOVNTDQArm; + Opc = HasVLX ? X86::VMOVNTDQAZ128rm : + HasAVX ? X86::VMOVNTDQArm : X86::MOVNTDQArm; else if (Alignment >= 16) - Opc = HasAVX ? X86::VMOVAPDrm : X86::MOVAPDrm; + Opc = HasVLX ? X86::VMOVAPDZ128rm : + HasAVX ? X86::VMOVAPDrm : X86::MOVAPDrm; else - Opc = HasAVX ? X86::VMOVUPDrm : X86::MOVUPDrm; + Opc = HasVLX ? X86::VMOVUPDZ128rm : + HasAVX ? X86::VMOVUPDrm : X86::MOVUPDrm; RC = &X86::VR128RegClass; break; case MVT::v4i32: @@ -420,27 +434,34 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM, case MVT::v8i16: case MVT::v16i8: if (IsNonTemporal && Alignment >= 16) - Opc = HasAVX ? X86::VMOVNTDQArm : X86::MOVNTDQArm; + Opc = HasVLX ? X86::VMOVNTDQAZ128rm : + HasAVX ? X86::VMOVNTDQArm : X86::MOVNTDQArm; else if (Alignment >= 16) - Opc = HasAVX ? X86::VMOVDQArm : X86::MOVDQArm; + Opc = HasVLX ? X86::VMOVDQA64Z128rm : + HasAVX ? X86::VMOVDQArm : X86::MOVDQArm; else - Opc = HasAVX ? X86::VMOVDQUrm : X86::MOVDQUrm; + Opc = HasVLX ? X86::VMOVDQU64Z128rm : + HasAVX ? X86::VMOVDQUrm : X86::MOVDQUrm; RC = &X86::VR128RegClass; break; case MVT::v8f32: assert(HasAVX); if (IsNonTemporal && Alignment >= 32 && HasAVX2) - Opc = X86::VMOVNTDQAYrm; + Opc = HasVLX ? X86::VMOVNTDQAZ256rm : X86::VMOVNTDQAYrm; + else if (Alignment >= 32) + Opc = HasVLX ? X86::VMOVAPSZ256rm : X86::VMOVAPSYrm; else - Opc = (Alignment >= 32) ? X86::VMOVAPSYrm : X86::VMOVUPSYrm; + Opc = HasVLX ? X86::VMOVUPSZ256rm : X86::VMOVUPSYrm; RC = &X86::VR256RegClass; break; case MVT::v4f64: assert(HasAVX); if (IsNonTemporal && Alignment >= 32 && HasAVX2) Opc = X86::VMOVNTDQAYrm; + else if (Alignment >= 32) + Opc = HasVLX ? X86::VMOVAPDZ256rm : X86::VMOVAPDYrm; else - Opc = (Alignment >= 32) ? X86::VMOVAPDYrm : X86::VMOVUPDYrm; + Opc = HasVLX ? X86::VMOVUPDZ256rm : X86::VMOVUPDYrm; RC = &X86::VR256RegClass; break; case MVT::v8i32: @@ -450,12 +471,14 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM, assert(HasAVX); if (IsNonTemporal && Alignment >= 32 && HasAVX2) Opc = X86::VMOVNTDQAYrm; + else if (Alignment >= 32) + Opc = HasVLX ? X86::VMOVDQA64Z256rm : X86::VMOVDQAYrm; else - Opc = (Alignment >= 32) ? X86::VMOVDQAYrm : X86::VMOVDQUYrm; + Opc = HasVLX ? X86::VMOVDQU64Z256rm : X86::VMOVDQUYrm; RC = &X86::VR256RegClass; break; case MVT::v16f32: - assert(Subtarget->hasAVX512()); + assert(HasAVX512); if (IsNonTemporal && Alignment >= 64) Opc = X86::VMOVNTDQAZrm; else @@ -463,7 +486,7 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM, RC = &X86::VR512RegClass; break; case MVT::v8f64: - assert(Subtarget->hasAVX512()); + assert(HasAVX512); if (IsNonTemporal && Alignment >= 64) Opc = X86::VMOVNTDQAZrm; else @@ -474,7 +497,7 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM, case MVT::v16i32: case MVT::v32i16: case MVT::v64i8: - assert(Subtarget->hasAVX512()); + assert(HasAVX512); // Note: There are a lot more choices based on type with AVX-512, but // there's really no advantage when the load isn't masked. 
if (IsNonTemporal && Alignment >= 64) @@ -504,6 +527,8 @@ bool X86FastISel::X86FastEmitStore(EVT VT, unsigned ValReg, bool ValIsKill, bool HasSSE2 = Subtarget->hasSSE2(); bool HasSSE4A = Subtarget->hasSSE4A(); bool HasAVX = Subtarget->hasAVX(); + bool HasAVX512 = Subtarget->hasAVX512(); + bool HasVLX = Subtarget->hasVLX(); bool IsNonTemporal = MMO && MMO->isNonTemporal(); // Get opcode and regclass of the output for the given store instruction. @@ -518,8 +543,8 @@ bool X86FastISel::X86FastEmitStore(EVT VT, unsigned ValReg, bool ValIsKill, TII.get(X86::AND8ri), AndResult) .addReg(ValReg, getKillRegState(ValIsKill)).addImm(1); ValReg = AndResult; + LLVM_FALLTHROUGH; // handle i1 as i8. } - // FALLTHROUGH, handling i1 as i8. case MVT::i8: Opc = X86::MOV8mr; break; case MVT::i16: Opc = X86::MOV16mr; break; case MVT::i32: @@ -534,7 +559,8 @@ bool X86FastISel::X86FastEmitStore(EVT VT, unsigned ValReg, bool ValIsKill, if (IsNonTemporal && HasSSE4A) Opc = X86::MOVNTSS; else - Opc = HasAVX ? X86::VMOVSSmr : X86::MOVSSmr; + Opc = HasAVX512 ? X86::VMOVSSZmr : + HasAVX ? X86::VMOVSSmr : X86::MOVSSmr; } else Opc = X86::ST_Fp32m; break; @@ -543,27 +569,34 @@ bool X86FastISel::X86FastEmitStore(EVT VT, unsigned ValReg, bool ValIsKill, if (IsNonTemporal && HasSSE4A) Opc = X86::MOVNTSD; else - Opc = HasAVX ? X86::VMOVSDmr : X86::MOVSDmr; + Opc = HasAVX512 ? X86::VMOVSDZmr : + HasAVX ? X86::VMOVSDmr : X86::MOVSDmr; } else Opc = X86::ST_Fp64m; break; case MVT::v4f32: if (Aligned) { if (IsNonTemporal) - Opc = HasAVX ? X86::VMOVNTPSmr : X86::MOVNTPSmr; + Opc = HasVLX ? X86::VMOVNTPSZ128mr : + HasAVX ? X86::VMOVNTPSmr : X86::MOVNTPSmr; else - Opc = HasAVX ? X86::VMOVAPSmr : X86::MOVAPSmr; + Opc = HasVLX ? X86::VMOVAPSZ128mr : + HasAVX ? X86::VMOVAPSmr : X86::MOVAPSmr; } else - Opc = HasAVX ? X86::VMOVUPSmr : X86::MOVUPSmr; + Opc = HasVLX ? X86::VMOVUPSZ128mr : + HasAVX ? X86::VMOVUPSmr : X86::MOVUPSmr; break; case MVT::v2f64: if (Aligned) { if (IsNonTemporal) - Opc = HasAVX ? X86::VMOVNTPDmr : X86::MOVNTPDmr; + Opc = HasVLX ? X86::VMOVNTPDZ128mr : + HasAVX ? X86::VMOVNTPDmr : X86::MOVNTPDmr; else - Opc = HasAVX ? X86::VMOVAPDmr : X86::MOVAPDmr; + Opc = HasVLX ? X86::VMOVAPDZ128mr : + HasAVX ? X86::VMOVAPDmr : X86::MOVAPDmr; } else - Opc = HasAVX ? X86::VMOVUPDmr : X86::MOVUPDmr; + Opc = HasVLX ? X86::VMOVUPDZ128mr : + HasAVX ? X86::VMOVUPDmr : X86::MOVUPDmr; break; case MVT::v4i32: case MVT::v2i64: @@ -571,45 +604,57 @@ bool X86FastISel::X86FastEmitStore(EVT VT, unsigned ValReg, bool ValIsKill, case MVT::v16i8: if (Aligned) { if (IsNonTemporal) - Opc = HasAVX ? X86::VMOVNTDQmr : X86::MOVNTDQmr; + Opc = HasVLX ? X86::VMOVNTDQZ128mr : + HasAVX ? X86::VMOVNTDQmr : X86::MOVNTDQmr; else - Opc = HasAVX ? X86::VMOVDQAmr : X86::MOVDQAmr; + Opc = HasVLX ? X86::VMOVDQA64Z128mr : + HasAVX ? X86::VMOVDQAmr : X86::MOVDQAmr; } else - Opc = HasAVX ? X86::VMOVDQUmr : X86::MOVDQUmr; + Opc = HasVLX ? X86::VMOVDQU64Z128mr : + HasAVX ? X86::VMOVDQUmr : X86::MOVDQUmr; break; case MVT::v8f32: assert(HasAVX); - if (Aligned) - Opc = IsNonTemporal ? X86::VMOVNTPSYmr : X86::VMOVAPSYmr; - else - Opc = X86::VMOVUPSYmr; + if (Aligned) { + if (IsNonTemporal) + Opc = HasVLX ? X86::VMOVNTPSZ256mr : X86::VMOVNTPSYmr; + else + Opc = HasVLX ? X86::VMOVAPSZ256mr : X86::VMOVAPSYmr; + } else + Opc = HasVLX ? X86::VMOVUPSZ256mr : X86::VMOVUPSYmr; break; case MVT::v4f64: assert(HasAVX); if (Aligned) { - Opc = IsNonTemporal ? X86::VMOVNTPDYmr : X86::VMOVAPDYmr; + if (IsNonTemporal) + Opc = HasVLX ? 
X86::VMOVNTPDZ256mr : X86::VMOVNTPDYmr; + else + Opc = HasVLX ? X86::VMOVAPDZ256mr : X86::VMOVAPDYmr; } else - Opc = X86::VMOVUPDYmr; + Opc = HasVLX ? X86::VMOVUPDZ256mr : X86::VMOVUPDYmr; break; case MVT::v8i32: case MVT::v4i64: case MVT::v16i16: case MVT::v32i8: assert(HasAVX); - if (Aligned) - Opc = IsNonTemporal ? X86::VMOVNTDQYmr : X86::VMOVDQAYmr; - else - Opc = X86::VMOVDQUYmr; + if (Aligned) { + if (IsNonTemporal) + Opc = HasVLX ? X86::VMOVNTDQZ256mr : X86::VMOVNTDQYmr; + else + Opc = HasVLX ? X86::VMOVDQA64Z256mr : X86::VMOVDQAYmr; + } else + Opc = HasVLX ? X86::VMOVDQU64Z256mr : X86::VMOVDQUYmr; break; case MVT::v16f32: - assert(Subtarget->hasAVX512()); + assert(HasAVX512); if (Aligned) Opc = IsNonTemporal ? X86::VMOVNTPSZmr : X86::VMOVAPSZmr; else Opc = X86::VMOVUPSZmr; break; case MVT::v8f64: - assert(Subtarget->hasAVX512()); + assert(HasAVX512); if (Aligned) { Opc = IsNonTemporal ? X86::VMOVNTPDZmr : X86::VMOVAPDZmr; } else @@ -619,7 +664,7 @@ bool X86FastISel::X86FastEmitStore(EVT VT, unsigned ValReg, bool ValIsKill, case MVT::v16i32: case MVT::v32i16: case MVT::v64i8: - assert(Subtarget->hasAVX512()); + assert(HasAVX512); // Note: There are a lot more choices based on type with AVX-512, but // there's really no advantage when the store isn't masked. if (Aligned) @@ -659,7 +704,9 @@ bool X86FastISel::X86FastEmitStore(EVT VT, const Value *Val, bool Signed = true; switch (VT.getSimpleVT().SimpleTy) { default: break; - case MVT::i1: Signed = false; // FALLTHROUGH to handle as i8. + case MVT::i1: + Signed = false; + LLVM_FALLTHROUGH; // Handle as i8. case MVT::i8: Opc = X86::MOV8mi; break; case MVT::i16: Opc = X86::MOV16mi; break; case MVT::i32: Opc = X86::MOV32mi; break; @@ -895,7 +942,7 @@ redo_gep: for (User::const_op_iterator i = U->op_begin() + 1, e = U->op_end(); i != e; ++i, ++GTI) { const Value *Op = *i; - if (StructType *STy = dyn_cast<StructType>(*GTI)) { + if (StructType *STy = GTI.getStructTypeOrNull()) { const StructLayout *SL = DL.getStructLayout(STy); Disp += SL->getElementOffset(cast<ConstantInt>(Op)->getZExtValue()); continue; @@ -1454,11 +1501,11 @@ bool X86FastISel::X86SelectCmp(const Instruction *I) { } // FCMP_OEQ and FCMP_UNE cannot be checked with a single instruction. - static unsigned SETFOpcTable[2][3] = { + static const uint16_t SETFOpcTable[2][3] = { { X86::SETEr, X86::SETNPr, X86::AND8rr }, { X86::SETNEr, X86::SETPr, X86::OR8rr } }; - unsigned *SETFOpc = nullptr; + const uint16_t *SETFOpc = nullptr; switch (Predicate) { default: break; case CmpInst::FCMP_OEQ: SETFOpc = &SETFOpcTable[0][0]; break; @@ -1511,7 +1558,7 @@ bool X86FastISel::X86SelectZExt(const Instruction *I) { // Handle zero-extension from i1 to i8, which is common. MVT SrcVT = TLI.getSimpleValueType(DL, I->getOperand(0)->getType()); - if (SrcVT.SimpleTy == MVT::i1) { + if (SrcVT == MVT::i1) { // Set the high bits to zero. 
ResultReg = fastEmitZExtFromI1(MVT::i8, ResultReg, /*TODO: Kill=*/false); SrcVT = MVT::i8; @@ -1601,7 +1648,8 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) { switch (Predicate) { default: break; case CmpInst::FCMP_OEQ: - std::swap(TrueMBB, FalseMBB); // fall-through + std::swap(TrueMBB, FalseMBB); + LLVM_FALLTHROUGH; case CmpInst::FCMP_UNE: NeedExtraBranch = true; Predicate = CmpInst::FCMP_ONE; @@ -1651,6 +1699,7 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) { if (TestOpc) { unsigned OpReg = getRegForValue(TI->getOperand(0)); if (OpReg == 0) return false; + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TestOpc)) .addReg(OpReg).addImm(1); @@ -1688,8 +1737,17 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) { unsigned OpReg = getRegForValue(BI->getCondition()); if (OpReg == 0) return false; + // In case OpReg is a K register, COPY to a GPR + if (MRI.getRegClass(OpReg) == &X86::VK1RegClass) { + unsigned KOpReg = OpReg; + OpReg = createResultReg(&X86::GR8RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), OpReg) + .addReg(KOpReg); + } BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TEST8ri)) - .addReg(OpReg).addImm(1); + .addReg(OpReg) + .addImm(1); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::JNE_1)) .addMBB(TrueMBB); finishCondBranch(BI->getParent(), TrueMBB, FalseMBB); @@ -1875,15 +1933,15 @@ bool X86FastISel::X86SelectDivRem(const Instruction *I) { // Copy the zero into the appropriate sub/super/identical physical // register. Unfortunately the operations needed are not uniform enough // to fit neatly into the table above. - if (VT.SimpleTy == MVT::i16) { + if (VT == MVT::i16) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Copy), TypeEntry.HighInReg) .addReg(Zero32, 0, X86::sub_16bit); - } else if (VT.SimpleTy == MVT::i32) { + } else if (VT == MVT::i32) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Copy), TypeEntry.HighInReg) .addReg(Zero32); - } else if (VT.SimpleTy == MVT::i64) { + } else if (VT == MVT::i64) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::SUBREG_TO_REG), TypeEntry.HighInReg) .addImm(0).addReg(Zero32).addImm(X86::sub_32bit); @@ -1953,11 +2011,11 @@ bool X86FastISel::X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I) { CmpInst::Predicate Predicate = optimizeCmpPredicate(CI); // FCMP_OEQ and FCMP_UNE cannot be checked with a single instruction. 
- static unsigned SETFOpcTable[2][3] = { + static const uint16_t SETFOpcTable[2][3] = { { X86::SETNPr, X86::SETEr , X86::TEST8rr }, { X86::SETPr, X86::SETNEr, X86::OR8rr } }; - unsigned *SETFOpc = nullptr; + const uint16_t *SETFOpc = nullptr; switch (Predicate) { default: break; case CmpInst::FCMP_OEQ: @@ -2023,8 +2081,17 @@ bool X86FastISel::X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I) { return false; bool CondIsKill = hasTrivialKill(Cond); + // In case OpReg is a K register, COPY to a GPR + if (MRI.getRegClass(CondReg) == &X86::VK1RegClass) { + unsigned KCondReg = CondReg; + CondReg = createResultReg(&X86::GR8RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), CondReg) + .addReg(KCondReg, getKillRegState(CondIsKill)); + } BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TEST8ri)) - .addReg(CondReg, getKillRegState(CondIsKill)).addImm(1); + .addReg(CondReg, getKillRegState(CondIsKill)) + .addImm(1); } const Value *LHS = I->getOperand(1); @@ -2087,12 +2154,12 @@ bool X86FastISel::X86FastEmitSSESelect(MVT RetVT, const Instruction *I) { std::swap(CmpLHS, CmpRHS); // Choose the SSE instruction sequence based on data type (float or double). - static unsigned OpcTable[2][4] = { - { X86::CMPSSrr, X86::FsANDPSrr, X86::FsANDNPSrr, X86::FsORPSrr }, - { X86::CMPSDrr, X86::FsANDPDrr, X86::FsANDNPDrr, X86::FsORPDrr } + static const uint16_t OpcTable[2][4] = { + { X86::CMPSSrr, X86::ANDPSrr, X86::ANDNPSrr, X86::ORPSrr }, + { X86::CMPSDrr, X86::ANDPDrr, X86::ANDNPDrr, X86::ORPDrr } }; - unsigned *Opc = nullptr; + const uint16_t *Opc = nullptr; switch (RetVT.SimpleTy) { default: return false; case MVT::f32: Opc = &OpcTable[0][0]; break; @@ -2119,9 +2186,36 @@ bool X86FastISel::X86FastEmitSSESelect(MVT RetVT, const Instruction *I) { const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT); unsigned ResultReg; - - if (Subtarget->hasAVX()) { - const TargetRegisterClass *FR32 = &X86::FR32RegClass; + + if (Subtarget->hasAVX512()) { + // If we have AVX512 we can use a mask compare and masked movss/sd. + const TargetRegisterClass *VR128X = &X86::VR128XRegClass; + const TargetRegisterClass *VK1 = &X86::VK1RegClass; + + unsigned CmpOpcode = + (RetVT == MVT::f32) ? X86::VCMPSSZrr : X86::VCMPSDZrr; + unsigned CmpReg = fastEmitInst_rri(CmpOpcode, VK1, CmpLHSReg, CmpLHSIsKill, + CmpRHSReg, CmpRHSIsKill, CC); + + // Need an IMPLICIT_DEF for the input that is used to generate the upper + // bits of the result register since its not based on any of the inputs. + unsigned ImplicitDefReg = createResultReg(VR128X); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::IMPLICIT_DEF), ImplicitDefReg); + + // Place RHSReg is the passthru of the masked movss/sd operation and put + // LHS in the input. The mask input comes from the compare. + unsigned MovOpcode = + (RetVT == MVT::f32) ? X86::VMOVSSZrrk : X86::VMOVSDZrrk; + unsigned MovReg = fastEmitInst_rrrr(MovOpcode, VR128X, RHSReg, RHSIsKill, + CmpReg, true, ImplicitDefReg, true, + LHSReg, LHSIsKill); + + ResultReg = createResultReg(RC); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), ResultReg).addReg(MovReg); + + } else if (Subtarget->hasAVX()) { const TargetRegisterClass *VR128 = &X86::VR128RegClass; // If we have AVX, create 1 blendv instead of 3 logic instructions. @@ -2130,11 +2224,11 @@ bool X86FastISel::X86FastEmitSSESelect(MVT RetVT, const Instruction *I) { // instructions as the AND/ANDN/OR sequence due to register moves, so // don't bother. 
unsigned CmpOpcode = - (RetVT.SimpleTy == MVT::f32) ? X86::VCMPSSrr : X86::VCMPSDrr; + (RetVT == MVT::f32) ? X86::VCMPSSrr : X86::VCMPSDrr; unsigned BlendOpcode = - (RetVT.SimpleTy == MVT::f32) ? X86::VBLENDVPSrr : X86::VBLENDVPDrr; - - unsigned CmpReg = fastEmitInst_rri(CmpOpcode, FR32, CmpLHSReg, CmpLHSIsKill, + (RetVT == MVT::f32) ? X86::VBLENDVPSrr : X86::VBLENDVPDrr; + + unsigned CmpReg = fastEmitInst_rri(CmpOpcode, RC, CmpLHSReg, CmpLHSIsKill, CmpRHSReg, CmpRHSIsKill, CC); unsigned VBlendReg = fastEmitInst_rrr(BlendOpcode, VR128, RHSReg, RHSIsKill, LHSReg, LHSIsKill, CmpReg, true); @@ -2142,14 +2236,18 @@ bool X86FastISel::X86FastEmitSSESelect(MVT RetVT, const Instruction *I) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY), ResultReg).addReg(VBlendReg); } else { + const TargetRegisterClass *VR128 = &X86::VR128RegClass; unsigned CmpReg = fastEmitInst_rri(Opc[0], RC, CmpLHSReg, CmpLHSIsKill, CmpRHSReg, CmpRHSIsKill, CC); - unsigned AndReg = fastEmitInst_rr(Opc[1], RC, CmpReg, /*IsKill=*/false, + unsigned AndReg = fastEmitInst_rr(Opc[1], VR128, CmpReg, /*IsKill=*/false, LHSReg, LHSIsKill); - unsigned AndNReg = fastEmitInst_rr(Opc[2], RC, CmpReg, /*IsKill=*/true, + unsigned AndNReg = fastEmitInst_rr(Opc[2], VR128, CmpReg, /*IsKill=*/true, RHSReg, RHSIsKill); - ResultReg = fastEmitInst_rr(Opc[3], RC, AndNReg, /*IsKill=*/true, - AndReg, /*IsKill=*/true); + unsigned OrReg = fastEmitInst_rr(Opc[3], VR128, AndNReg, /*IsKill=*/true, + AndReg, /*IsKill=*/true); + ResultReg = createResultReg(RC); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), ResultReg).addReg(OrReg); } updateValueMap(I, ResultReg); return true; @@ -2195,8 +2293,18 @@ bool X86FastISel::X86FastEmitPseudoSelect(MVT RetVT, const Instruction *I) { if (CondReg == 0) return false; bool CondIsKill = hasTrivialKill(Cond); + + // In case OpReg is a K register, COPY to a GPR + if (MRI.getRegClass(CondReg) == &X86::VK1RegClass) { + unsigned KCondReg = CondReg; + CondReg = createResultReg(&X86::GR8RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), CondReg) + .addReg(KCondReg, getKillRegState(CondIsKill)); + } BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TEST8ri)) - .addReg(CondReg, getKillRegState(CondIsKill)).addImm(1); + .addReg(CondReg, getKillRegState(CondIsKill)) + .addImm(1); } const Value *LHS = I->getOperand(1); @@ -2522,8 +2630,8 @@ bool X86FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) { // This needs to be set before we call getPtrSizedFrameRegister, otherwise // we get the wrong frame register. 
- MachineFrameInfo *MFI = MF->getFrameInfo(); - MFI->setFrameAddressIsTaken(true); + MachineFrameInfo &MFI = MF->getFrameInfo(); + MFI.setFrameAddressIsTaken(true); const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); unsigned FrameReg = RegInfo->getPtrSizedFrameRegister(*MF); @@ -2698,7 +2806,9 @@ bool X86FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) { const Function *Callee = II->getCalledFunction(); auto *Ty = cast<StructType>(Callee->getReturnType()); Type *RetTy = Ty->getTypeAtIndex(0U); - Type *CondTy = Ty->getTypeAtIndex(1); + assert(Ty->getTypeAtIndex(1)->isIntegerTy() && + Ty->getTypeAtIndex(1)->getScalarSizeInBits() == 1 && + "Overflow value expected to be an i1"); MVT VT; if (!isTypeLegal(RetTy, VT)) @@ -2808,7 +2918,8 @@ bool X86FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) { if (!ResultReg) return false; - unsigned ResultReg2 = FuncInfo.CreateRegs(CondTy); + // Assign to a GPR since the overflow return value is lowered to a SETcc. + unsigned ResultReg2 = createResultReg(&X86::GR8RegClass); assert((ResultReg+1) == ResultReg2 && "Nonconsecutive result registers."); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CondOpc), ResultReg2); @@ -2966,7 +3077,7 @@ bool X86FastISel::fastLowerArguments() { default: llvm_unreachable("Unexpected value type."); case MVT::i32: SrcReg = GPR32ArgRegs[GPRIdx++]; break; case MVT::i64: SrcReg = GPR64ArgRegs[GPRIdx++]; break; - case MVT::f32: // fall-through + case MVT::f32: LLVM_FALLTHROUGH; case MVT::f64: SrcReg = XMMArgRegs[FPRIdx++]; break; } unsigned DstReg = FuncInfo.MF->addLiveIn(SrcReg, RC); @@ -3140,7 +3251,7 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) { assert(VA.getLocVT().isInteger() && !VA.getLocVT().isVector() && "Unexpected extend"); - if (ArgVT.SimpleTy == MVT::i1) + if (ArgVT == MVT::i1) return false; bool Emitted = X86FastEmitExtend(ISD::SIGN_EXTEND, VA.getLocVT(), ArgReg, @@ -3154,7 +3265,7 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) { "Unexpected extend"); // Handle zero-extension from i1 to i8, which is common. - if (ArgVT.SimpleTy == MVT::i1) { + if (ArgVT == MVT::i1) { // Set the high bits to zero. 
ArgReg = fastEmitZExtFromI1(MVT::i8, ArgReg, /*TODO: Kill=*/false);
      ArgVT = MVT::i8;
@@ -3456,8 +3567,14 @@ X86FastISel::fastSelectInstruction(const Instruction *I) {
    if (!SrcVT.isSimple() || !DstVT.isSimple())
      return false;
-    if (!SrcVT.is128BitVector() &&
-        !(Subtarget->hasAVX() && SrcVT.is256BitVector()))
+    MVT SVT = SrcVT.getSimpleVT();
+    MVT DVT = DstVT.getSimpleVT();
+
+    if (!SVT.is128BitVector() &&
+        !(Subtarget->hasAVX() && SVT.is256BitVector()) &&
+        !(Subtarget->hasAVX512() && SVT.is512BitVector() &&
+          (Subtarget->hasBWI() || (SVT.getScalarSizeInBits() >= 32 &&
+                                   DVT.getScalarSizeInBits() >= 32))))
      return false;
    unsigned Reg = getRegForValue(I->getOperand(0));
@@ -3505,7 +3622,7 @@ unsigned X86FastISel::X86MaterializeInt(const ConstantInt *CI, MVT VT) {
  unsigned Opc = 0;
  switch (VT.SimpleTy) {
  default: llvm_unreachable("Unexpected value type");
-  case MVT::i1: VT = MVT::i8; // fall-through
+  case MVT::i1: VT = MVT::i8; LLVM_FALLTHROUGH;
  case MVT::i8: Opc = X86::MOV8ri; break;
  case MVT::i16: Opc = X86::MOV16ri; break;
  case MVT::i32: Opc = X86::MOV32ri; break;
@@ -3775,6 +3892,38 @@ bool X86FastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo,
  return true;
}
+unsigned X86FastISel::fastEmitInst_rrrr(unsigned MachineInstOpcode,
+                                        const TargetRegisterClass *RC,
+                                        unsigned Op0, bool Op0IsKill,
+                                        unsigned Op1, bool Op1IsKill,
+                                        unsigned Op2, bool Op2IsKill,
+                                        unsigned Op3, bool Op3IsKill) {
+  const MCInstrDesc &II = TII.get(MachineInstOpcode);
+
+  unsigned ResultReg = createResultReg(RC);
+  Op0 = constrainOperandRegClass(II, Op0, II.getNumDefs());
+  Op1 = constrainOperandRegClass(II, Op1, II.getNumDefs() + 1);
+  Op2 = constrainOperandRegClass(II, Op2, II.getNumDefs() + 2);
+  Op3 = constrainOperandRegClass(II, Op3, II.getNumDefs() + 3);
+
+  if (II.getNumDefs() >= 1)
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, ResultReg)
+        .addReg(Op0, getKillRegState(Op0IsKill))
+        .addReg(Op1, getKillRegState(Op1IsKill))
+        .addReg(Op2, getKillRegState(Op2IsKill))
+        .addReg(Op3, getKillRegState(Op3IsKill));
+  else {
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II)
+        .addReg(Op0, getKillRegState(Op0IsKill))
+        .addReg(Op1, getKillRegState(Op1IsKill))
+        .addReg(Op2, getKillRegState(Op2IsKill))
+        .addReg(Op3, getKillRegState(Op3IsKill));
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(TargetOpcode::COPY), ResultReg).addReg(II.ImplicitDefs[0]);
+  }
+  return ResultReg;
+}
+
 namespace llvm {
  FastISel *X86::createFastISel(FunctionLoweringInfo &funcInfo,
diff --git a/contrib/llvm/lib/Target/X86/X86FixupBWInsts.cpp b/contrib/llvm/lib/Target/X86/X86FixupBWInsts.cpp
index 90e758d..8bde4bf 100644
--- a/contrib/llvm/lib/Target/X86/X86FixupBWInsts.cpp
+++ b/contrib/llvm/lib/Target/X86/X86FixupBWInsts.cpp
@@ -66,8 +66,6 @@ using namespace llvm;
 #define DEBUG_TYPE FIXUPBW_NAME
 
 // Option to allow this optimization pass to have fine-grained control.
-// This is turned off by default so as not to affect a large number of
-// existing lit tests.
static cl::opt<bool> FixupBWInsts("fixup-byte-word-insts", cl::desc("Change byte and word instructions to larger sizes"), @@ -104,9 +102,7 @@ class FixupBWInstPass : public MachineFunctionPass { public: static char ID; - const char *getPassName() const override { - return FIXUPBW_DESC; - } + StringRef getPassName() const override { return FIXUPBW_DESC; } FixupBWInstPass() : MachineFunctionPass(ID) { initializeFixupBWInstPassPass(*PassRegistry::getPassRegistry()); @@ -125,7 +121,7 @@ public: MachineFunctionProperties getRequiredProperties() const override { return MachineFunctionProperties().set( - MachineFunctionProperties::Property::AllVRegsAllocated); + MachineFunctionProperties::Property::NoVRegs); } private: @@ -158,7 +154,7 @@ bool FixupBWInstPass::runOnMachineFunction(MachineFunction &MF) { TII = MF.getSubtarget<X86Subtarget>().getInstrInfo(); OptForSize = MF.getFunction()->optForSize(); MLI = &getAnalysis<MachineLoopInfo>(); - LiveRegs.init(&TII->getRegisterInfo()); + LiveRegs.init(TII->getRegisterInfo()); DEBUG(dbgs() << "Start X86FixupBWInsts\n";); diff --git a/contrib/llvm/lib/Target/X86/X86FixupLEAs.cpp b/contrib/llvm/lib/Target/X86/X86FixupLEAs.cpp index 013ee24..1209591 100644 --- a/contrib/llvm/lib/Target/X86/X86FixupLEAs.cpp +++ b/contrib/llvm/lib/Target/X86/X86FixupLEAs.cpp @@ -40,7 +40,7 @@ class FixupLEAPass : public MachineFunctionPass { /// where appropriate. bool processBasicBlock(MachineFunction &MF, MachineFunction::iterator MFI); - const char *getPassName() const override { return "X86 LEA Fixup"; } + StringRef getPassName() const override { return "X86 LEA Fixup"; } /// \brief Given a machine register, look for the instruction /// which writes it in the current basic block. If found, @@ -95,7 +95,7 @@ public: // This pass runs after regalloc and doesn't support VReg operands. MachineFunctionProperties getRequiredProperties() const override { return MachineFunctionProperties().set( - MachineFunctionProperties::Property::AllVRegsAllocated); + MachineFunctionProperties::Property::NoVRegs); } private: diff --git a/contrib/llvm/lib/Target/X86/X86FixupSetCC.cpp b/contrib/llvm/lib/Target/X86/X86FixupSetCC.cpp index fb317da..a86eb99 100644 --- a/contrib/llvm/lib/Target/X86/X86FixupSetCC.cpp +++ b/contrib/llvm/lib/Target/X86/X86FixupSetCC.cpp @@ -39,7 +39,7 @@ class X86FixupSetCCPass : public MachineFunctionPass { public: X86FixupSetCCPass() : MachineFunctionPass(ID) {} - const char *getPassName() const override { return "X86 Fixup SetCC"; } + StringRef getPassName() const override { return "X86 Fixup SetCC"; } bool runOnMachineFunction(MachineFunction &MF) override; @@ -99,7 +99,8 @@ bool X86FixupSetCCPass::isSetCCr(unsigned Opcode) { MachineInstr * X86FixupSetCCPass::findFlagsImpDef(MachineBasicBlock *MBB, MachineBasicBlock::reverse_iterator MI) { - auto MBBStart = MBB->instr_rend(); + // FIXME: Should this be instr_rend(), and MI be reverse_instr_iterator? 
+ auto MBBStart = MBB->rend(); for (int i = 0; (i < SearchBound) && (MI != MBBStart); ++i, ++MI) for (auto &Op : MI->implicit_operands()) if ((Op.getReg() == X86::EFLAGS) && (Op.isDef())) diff --git a/contrib/llvm/lib/Target/X86/X86FloatingPoint.cpp b/contrib/llvm/lib/Target/X86/X86FloatingPoint.cpp index 55c1bff..a5489b9 100644 --- a/contrib/llvm/lib/Target/X86/X86FloatingPoint.cpp +++ b/contrib/llvm/lib/Target/X86/X86FloatingPoint.cpp @@ -78,10 +78,10 @@ namespace { MachineFunctionProperties getRequiredProperties() const override { return MachineFunctionProperties().set( - MachineFunctionProperties::Property::AllVRegsAllocated); + MachineFunctionProperties::Property::NoVRegs); } - const char *getPassName() const override { return "X86 FP Stackifier"; } + StringRef getPassName() const override { return "X86 FP Stackifier"; } private: const TargetInstrInfo *TII; // Machine instruction info. @@ -206,6 +206,13 @@ namespace { RegMap[Reg] = StackTop++; } + // popReg - Pop a register from the stack. + void popReg() { + if (StackTop == 0) + report_fatal_error("Cannot pop empty stack!"); + RegMap[Stack[--StackTop]] = ~0; // Update state + } + bool isAtTop(unsigned RegNo) const { return getSlot(RegNo) == StackTop-1; } void moveToTop(unsigned RegNo, MachineBasicBlock::iterator I) { DebugLoc dl = I == MBB->end() ? DebugLoc() : I->getDebugLoc(); @@ -326,9 +333,28 @@ bool FPS::runOnMachineFunction(MachineFunction &MF) { // Process the function in depth first order so that we process at least one // of the predecessors for every reachable block in the function. - SmallPtrSet<MachineBasicBlock*, 8> Processed; + df_iterator_default_set<MachineBasicBlock*> Processed; MachineBasicBlock *Entry = &MF.front(); + LiveBundle &Bundle = + LiveBundles[Bundles->getBundle(Entry->getNumber(), false)]; + + // In regcall convention, some FP registers may not be passed through + // the stack, so they will need to be assigned to the stack first + if ((Entry->getParent()->getFunction()->getCallingConv() == + CallingConv::X86_RegCall) && (Bundle.Mask && !Bundle.FixCount)) { + // In the register calling convention, up to one FP argument could be + // saved in the first FP register. + // If bundle.mask is non-zero and Bundle.FixCount is zero, it means + // that the FP registers contain arguments. + // The actual value is passed in FP0. + // Here we fix the stack and mark FP0 as pre-assigned register. + assert((Bundle.Mask & 0xFE) == 0 && + "Only FP0 could be passed as an argument"); + Bundle.FixCount = 1; + Bundle.FixStack[0] = 0; + } + bool Changed = false; for (MachineBasicBlock *BB : depth_first_ext(Entry, Processed)) Changed |= processBasicBlock(MF, *BB); @@ -791,9 +817,8 @@ void FPS::popStackAfter(MachineBasicBlock::iterator &I) { MachineInstr &MI = *I; const DebugLoc &dl = MI.getDebugLoc(); ASSERT_SORTED(PopTable); - if (StackTop == 0) - report_fatal_error("Cannot pop empty stack!"); - RegMap[Stack[--StackTop]] = ~0; // Update state + + popReg(); // Check to see if there is a popping version of this instruction... 
int Opcode = Lookup(PopTable, I->getOpcode()); @@ -929,6 +954,7 @@ void FPS::shuffleStackTop(const unsigned char *FixStack, void FPS::handleCall(MachineBasicBlock::iterator &I) { unsigned STReturns = 0; + const MachineFunction* MF = I->getParent()->getParent(); for (const auto &MO : I->operands()) { if (!MO.isReg()) @@ -937,7 +963,10 @@ void FPS::handleCall(MachineBasicBlock::iterator &I) { unsigned R = MO.getReg() - X86::FP0; if (R < 8) { - assert(MO.isDef() && MO.isImplicit()); + if (MF->getFunction()->getCallingConv() != CallingConv::X86_RegCall) { + assert(MO.isDef() && MO.isImplicit()); + } + STReturns |= 1 << R; } } @@ -945,9 +974,15 @@ void FPS::handleCall(MachineBasicBlock::iterator &I) { unsigned N = countTrailingOnes(STReturns); // FP registers used for function return must be consecutive starting at - // FP0. + // FP0 assert(STReturns == 0 || (isMask_32(STReturns) && N <= 2)); + // Reset the FP Stack - It is required because of possible leftovers from + // passed arguments. The caller should assume that the FP stack is + // returned empty (unless the callee returns values on FP stack). + while (StackTop > 0) + popReg(); + for (unsigned I = 0; I < N; ++I) pushReg(N - I - 1); } diff --git a/contrib/llvm/lib/Target/X86/X86FrameLowering.cpp b/contrib/llvm/lib/Target/X86/X86FrameLowering.cpp index 03d9256..cd69044 100644 --- a/contrib/llvm/lib/Target/X86/X86FrameLowering.cpp +++ b/contrib/llvm/lib/Target/X86/X86FrameLowering.cpp @@ -50,7 +50,7 @@ X86FrameLowering::X86FrameLowering(const X86Subtarget &STI, } bool X86FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const { - return !MF.getFrameInfo()->hasVarSizedObjects() && + return !MF.getFrameInfo().hasVarSizedObjects() && !MF.getInfo<X86MachineFunctionInfo>()->getHasPushSequences(); } @@ -74,7 +74,7 @@ X86FrameLowering::canSimplifyCallFramePseudos(const MachineFunction &MF) const { // when there are no stack objects. bool X86FrameLowering::needsFrameIndexResolution(const MachineFunction &MF) const { - return MF.getFrameInfo()->hasStackObjects() || + return MF.getFrameInfo().hasStackObjects() || MF.getInfo<X86MachineFunctionInfo>()->getHasPushSequences(); } @@ -82,17 +82,15 @@ X86FrameLowering::needsFrameIndexResolution(const MachineFunction &MF) const { /// pointer register. This is true if the function has variable sized allocas /// or if frame pointer elimination is disabled. 
bool X86FrameLowering::hasFP(const MachineFunction &MF) const { - const MachineFrameInfo *MFI = MF.getFrameInfo(); - const MachineModuleInfo &MMI = MF.getMMI(); - + const MachineFrameInfo &MFI = MF.getFrameInfo(); return (MF.getTarget().Options.DisableFramePointerElim(MF) || TRI->needsStackRealignment(MF) || - MFI->hasVarSizedObjects() || - MFI->isFrameAddressTaken() || MFI->hasOpaqueSPAdjustment() || + MFI.hasVarSizedObjects() || + MFI.isFrameAddressTaken() || MFI.hasOpaqueSPAdjustment() || MF.getInfo<X86MachineFunctionInfo>()->getForceFramePointer() || - MMI.callsUnwindInit() || MMI.hasEHFunclets() || MMI.callsEHReturn() || - MFI->hasStackMap() || MFI->hasPatchPoint() || - MFI->hasCopyImplyingStackAdjustment()); + MF.callsUnwindInit() || MF.hasEHFunclets() || MF.callsEHReturn() || + MFI.hasStackMap() || MFI.hasPatchPoint() || + MFI.hasCopyImplyingStackAdjustment()); } static unsigned getSUBriOpcode(unsigned IsLP64, int64_t Imm) { @@ -151,13 +149,15 @@ static unsigned findDeadCallerSavedReg(MachineBasicBlock &MBB, bool Is64Bit) { const MachineFunction *MF = MBB.getParent(); const Function *F = MF->getFunction(); - if (!F || MF->getMMI().callsEHReturn()) + if (!F || MF->callsEHReturn()) return 0; const TargetRegisterClass &AvailableRegs = *TRI->getGPRsForTailCall(*MF); - unsigned Opc = MBBI->getOpcode(); - switch (Opc) { + if (MBBI == MBB.end()) + return 0; + + switch (MBBI->getOpcode()) { default: return 0; case TargetOpcode::PATCHABLE_RET: case X86::RET: @@ -373,6 +373,10 @@ int X86FrameLowering::mergeSPUpdates(MachineBasicBlock &MBB, MachineBasicBlock::iterator PI = doMergeWithPrevious ? std::prev(MBBI) : MBBI; MachineBasicBlock::iterator NI = doMergeWithPrevious ? nullptr : std::next(MBBI); + PI = skipDebugInstructionsBackward(PI, MBB.begin()); + if (NI != nullptr) + NI = skipDebugInstructionsForward(NI, MBB.end()); + unsigned Opc = PI->getOpcode(); int Offset = 0; @@ -416,7 +420,7 @@ void X86FrameLowering::BuildCFI(MachineBasicBlock &MBB, const DebugLoc &DL, const MCCFIInstruction &CFIInst) const { MachineFunction &MF = *MBB.getParent(); - unsigned CFIIndex = MF.getMMI().addFrameInst(CFIInst); + unsigned CFIIndex = MF.addFrameInst(CFIInst); BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) .addCFIIndex(CFIIndex); } @@ -425,18 +429,18 @@ void X86FrameLowering::emitCalleeSavedFrameMoves( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL) const { MachineFunction &MF = *MBB.getParent(); - MachineFrameInfo *MFI = MF.getFrameInfo(); + MachineFrameInfo &MFI = MF.getFrameInfo(); MachineModuleInfo &MMI = MF.getMMI(); const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo(); // Add callee saved registers to move list. - const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo(); + const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo(); if (CSI.empty()) return; // Calculate offsets. 
for (std::vector<CalleeSavedInfo>::const_iterator I = CSI.begin(), E = CSI.end(); I != E; ++I) { - int64_t Offset = MFI->getObjectOffset(I->getFrameIdx()); + int64_t Offset = MFI.getObjectOffset(I->getFrameIdx()); unsigned Reg = I->getReg(); unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true); @@ -445,20 +449,19 @@ void X86FrameLowering::emitCalleeSavedFrameMoves( } } -MachineInstr *X86FrameLowering::emitStackProbe(MachineFunction &MF, - MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, - const DebugLoc &DL, - bool InProlog) const { +void X86FrameLowering::emitStackProbe(MachineFunction &MF, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + const DebugLoc &DL, bool InProlog) const { const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>(); if (STI.isTargetWindowsCoreCLR()) { if (InProlog) { - return emitStackProbeInlineStub(MF, MBB, MBBI, DL, true); + emitStackProbeInlineStub(MF, MBB, MBBI, DL, true); } else { - return emitStackProbeInline(MF, MBB, MBBI, DL, false); + emitStackProbeInline(MF, MBB, MBBI, DL, false); } } else { - return emitStackProbeCall(MF, MBB, MBBI, DL, InProlog); + emitStackProbeCall(MF, MBB, MBBI, DL, InProlog); } } @@ -479,17 +482,19 @@ void X86FrameLowering::inlineStackProbe(MachineFunction &MF, assert(!ChkStkStub->isBundled() && "Not expecting bundled instructions here"); MachineBasicBlock::iterator MBBI = std::next(ChkStkStub->getIterator()); - assert(std::prev(MBBI).operator==(ChkStkStub) && - "MBBI expected after __chkstk_stub."); + assert(std::prev(MBBI) == ChkStkStub && + "MBBI expected after __chkstk_stub."); DebugLoc DL = PrologMBB.findDebugLoc(MBBI); emitStackProbeInline(MF, PrologMBB, MBBI, DL, true); ChkStkStub->eraseFromParent(); } } -MachineInstr *X86FrameLowering::emitStackProbeInline( - MachineFunction &MF, MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, const DebugLoc &DL, bool InProlog) const { +void X86FrameLowering::emitStackProbeInline(MachineFunction &MF, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + const DebugLoc &DL, + bool InProlog) const { const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>(); assert(STI.is64Bit() && "different expansion needed for 32 bit"); assert(STI.isTargetWindowsCoreCLR() && "custom expansion expects CoreCLR"); @@ -612,7 +617,7 @@ MachineInstr *X86FrameLowering::emitStackProbeInline( // lowest touched page on the stack, not the point at which the OS // will cause an overflow exception, so this is just an optimization // to avoid unnecessarily touching pages that are below the current - // SP but already commited to the stack by the OS. + // SP but already committed to the stack by the OS. BuildMI(&MBB, DL, TII.get(X86::MOV64rm), LimitReg) .addReg(0) .addImm(1) @@ -699,13 +704,13 @@ MachineInstr *X86FrameLowering::emitStackProbeInline( } // Possible TODO: physreg liveness for InProlog case. 
- - return &*ContinueMBBI; } -MachineInstr *X86FrameLowering::emitStackProbeCall( - MachineFunction &MF, MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, const DebugLoc &DL, bool InProlog) const { +void X86FrameLowering::emitStackProbeCall(MachineFunction &MF, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + const DebugLoc &DL, + bool InProlog) const { bool IsLargeCodeModel = MF.getTarget().getCodeModel() == CodeModel::Large; unsigned CallOp; @@ -763,11 +768,9 @@ MachineInstr *X86FrameLowering::emitStackProbeCall( for (++ExpansionMBBI; ExpansionMBBI != MBBI; ++ExpansionMBBI) ExpansionMBBI->setFlag(MachineInstr::FrameSetup); } - - return &*MBBI; } -MachineInstr *X86FrameLowering::emitStackProbeInlineStub( +void X86FrameLowering::emitStackProbeInlineStub( MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, bool InProlog) const { @@ -775,8 +778,6 @@ MachineInstr *X86FrameLowering::emitStackProbeInlineStub( BuildMI(MBB, MBBI, DL, TII.get(X86::CALLpcrel32)) .addExternalSymbol("__chkstk_stub"); - - return &*MBBI; } static unsigned calculateSetFPREG(uint64_t SPAdjust) { @@ -793,11 +794,11 @@ static unsigned calculateSetFPREG(uint64_t SPAdjust) { // have a call out. Otherwise just make sure we have some alignment - we'll // go with the minimum SlotSize. uint64_t X86FrameLowering::calculateMaxStackAlign(const MachineFunction &MF) const { - const MachineFrameInfo *MFI = MF.getFrameInfo(); - uint64_t MaxAlign = MFI->getMaxAlignment(); // Desired stack alignment. + const MachineFrameInfo &MFI = MF.getFrameInfo(); + uint64_t MaxAlign = MFI.getMaxAlignment(); // Desired stack alignment. unsigned StackAlign = getStackAlignment(); if (MF.getFunction()->hasFnAttribute("stackrealign")) { - if (MFI->hasCalls()) + if (MFI.hasCalls()) MaxAlign = (StackAlign > MaxAlign) ? StackAlign : MaxAlign; else if (MaxAlign < SlotSize) MaxAlign = SlotSize; @@ -909,18 +910,18 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, assert(&STI == &MF.getSubtarget<X86Subtarget>() && "MF used frame lowering for wrong subtarget"); MachineBasicBlock::iterator MBBI = MBB.begin(); - MachineFrameInfo *MFI = MF.getFrameInfo(); + MachineFrameInfo &MFI = MF.getFrameInfo(); const Function *Fn = MF.getFunction(); MachineModuleInfo &MMI = MF.getMMI(); X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); uint64_t MaxAlign = calculateMaxStackAlign(MF); // Desired stack alignment. - uint64_t StackSize = MFI->getStackSize(); // Number of bytes to allocate. + uint64_t StackSize = MFI.getStackSize(); // Number of bytes to allocate. bool IsFunclet = MBB.isEHFuncletEntry(); EHPersonality Personality = EHPersonality::Unknown; if (Fn->hasPersonalityFn()) Personality = classifyEHPersonality(Fn->getPersonalityFn()); bool FnHasClrFunclet = - MMI.hasEHFunclets() && Personality == EHPersonality::CoreCLR; + MF.hasEHFunclets() && Personality == EHPersonality::CoreCLR; bool IsClrFunclet = IsFunclet && FnHasClrFunclet; bool HasFP = hasFP(MF); bool IsWin64CC = STI.isCallingConvWin64(Fn->getCallingConv()); @@ -933,6 +934,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, STI.isTarget64BitILP32() ? getX86SubSuperRegister(FramePtr, 64) : FramePtr; unsigned BasePtr = TRI->getBaseRegister(); + bool HasWinCFI = false; // Debug location must be unknown since the first debug location is used // to determine the end of the prologue. @@ -964,16 +966,16 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, // push and pop from the stack. 
if (Is64Bit && !Fn->hasFnAttribute(Attribute::NoRedZone) && !TRI->needsStackRealignment(MF) && - !MFI->hasVarSizedObjects() && // No dynamic alloca. - !MFI->adjustsStack() && // No calls. - !IsWin64CC && // Win64 has no Red Zone - !MFI->hasCopyImplyingStackAdjustment() && // Don't push and pop. - !MF.shouldSplitStack()) { // Regular stack + !MFI.hasVarSizedObjects() && // No dynamic alloca. + !MFI.adjustsStack() && // No calls. + !IsWin64CC && // Win64 has no Red Zone + !MFI.hasCopyImplyingStackAdjustment() && // Don't push and pop. + !MF.shouldSplitStack()) { // Regular stack uint64_t MinSize = X86FI->getCalleeSavedFrameSize(); if (HasFP) MinSize += SlotSize; X86FI->setUsesRedZone(MinSize > 0 || StackSize > 0); StackSize = std::max(MinSize, StackSize > 128 ? StackSize - 128 : 0); - MFI->setStackSize(StackSize); + MFI.setStackSize(StackSize); } // Insert stack pointer adjustment for later moving of return addr. Only @@ -1037,9 +1039,9 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, // guaranteed to be the last slot by processFunctionBeforeFrameFinalized. // Update the frame offset adjustment. if (!IsFunclet) - MFI->setOffsetAdjustment(-NumBytes); + MFI.setOffsetAdjustment(-NumBytes); else - assert(MFI->getOffsetAdjustment() == -(int)NumBytes && + assert(MFI.getOffsetAdjustment() == -(int)NumBytes && "should calculate same local variable offset for funclets"); // Save EBP/RBP into the appropriate stack slot. @@ -1061,6 +1063,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, } if (NeedsWinCFI) { + HasWinCFI = true; BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_PushReg)) .addImm(FramePtr) .setMIFlag(MachineInstr::FrameSetup); @@ -1122,6 +1125,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, } if (NeedsWinCFI) { + HasWinCFI = true; BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_PushReg)).addImm(Reg).setMIFlag( MachineInstr::FrameSetup); } @@ -1207,10 +1211,12 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, emitSPUpdate(MBB, MBBI, -(int64_t)NumBytes, /*InEpilogue=*/false); } - if (NeedsWinCFI && NumBytes) + if (NeedsWinCFI && NumBytes) { + HasWinCFI = true; BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_StackAlloc)) .addImm(NumBytes) .setMIFlag(MachineInstr::FrameSetup); + } int SEHFrameOffset = 0; unsigned SPOrEstablisher; @@ -1257,6 +1263,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, // If this is not a funclet, emit the CFI describing our frame pointer. if (NeedsWinCFI && !IsFunclet) { + HasWinCFI = true; BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_SetFrame)) .addImm(FramePtr) .addImm(SEHFrameOffset) @@ -1293,6 +1300,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, int Offset = getFrameIndexReference(MF, FI, IgnoredFrameReg); Offset += SEHFrameOffset; + HasWinCFI = true; BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_SaveXMM)) .addImm(Reg) .addImm(Offset) @@ -1302,7 +1310,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, } } - if (NeedsWinCFI) + if (NeedsWinCFI && HasWinCFI) BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_EndPrologue)) .setMIFlag(MachineInstr::FrameSetup); @@ -1394,13 +1402,16 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, if (Fn->getCallingConv() == CallingConv::X86_INTR) BuildMI(MBB, MBBI, DL, TII.get(X86::CLD)) .setMIFlag(MachineInstr::FrameSetup); + + // At this point we know if the function has WinCFI or not. 
+ MF.setHasWinCFI(HasWinCFI); } bool X86FrameLowering::canUseLEAForSPInEpilogue( const MachineFunction &MF) const { - // We can't use LEA instructions for adjusting the stack pointer if this is a - // leaf function in the Win64 ABI. Only ADD instructions may be used to - // deallocate the stack. + // We can't use LEA instructions for adjusting the stack pointer if we don't + // have a frame pointer in the Win64 ABI. Only ADD instructions may be used + // to deallocate the stack. // This means that we can use LEA for SP in two situations: // 1. We *aren't* using the Win64 ABI which means we are free to use LEA. // 2. We *have* a frame pointer which means we are permitted to use LEA. @@ -1457,7 +1468,7 @@ X86FrameLowering::getWinEHFuncletFrameSize(const MachineFunction &MF) const { UsedSize = getPSPSlotOffsetFromSP(MF) + SlotSize; } else { // Other funclets just need enough stack for outgoing call arguments. - UsedSize = MF.getFrameInfo()->getMaxCallFrameSize(); + UsedSize = MF.getFrameInfo().getMaxCallFrameSize(); } // RBP is not included in the callee saved register block. After pushing RBP, // everything is 16 byte aligned. Everything we allocate before an outgoing @@ -1477,10 +1488,12 @@ static bool isTailCallOpcode(unsigned Opc) { void X86FrameLowering::emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const { - const MachineFrameInfo *MFI = MF.getFrameInfo(); + const MachineFrameInfo &MFI = MF.getFrameInfo(); X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator(); - unsigned RetOpcode = MBBI->getOpcode(); + Optional<unsigned> RetOpcode; + if (MBBI != MBB.end()) + RetOpcode = MBBI->getOpcode(); DebugLoc DL; if (MBBI != MBB.end()) DL = MBBI->getDebugLoc(); @@ -1493,16 +1506,16 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, bool IsWin64Prologue = MF.getTarget().getMCAsmInfo()->usesWindowsCFI(); bool NeedsWinCFI = IsWin64Prologue && MF.getFunction()->needsUnwindTableEntry(); - bool IsFunclet = isFuncletReturnInstr(*MBBI); + bool IsFunclet = MBBI == MBB.end() ? false : isFuncletReturnInstr(*MBBI); MachineBasicBlock *TargetMBB = nullptr; // Get the number of bytes to allocate from the FrameInfo. - uint64_t StackSize = MFI->getStackSize(); + uint64_t StackSize = MFI.getStackSize(); uint64_t MaxAlign = calculateMaxStackAlign(MF); unsigned CSSize = X86FI->getCalleeSavedFrameSize(); uint64_t NumBytes = 0; - if (MBBI->getOpcode() == X86::CATCHRET) { + if (RetOpcode && *RetOpcode == X86::CATCHRET) { // SEH shouldn't use catchret. assert(!isAsynchronousEHPersonality( classifyEHPersonality(MF.getFunction()->getPersonalityFn())) && @@ -1516,7 +1529,7 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::POP64r : X86::POP32r), MachineFramePtr) .setMIFlag(MachineInstr::FrameDestroy); - } else if (MBBI->getOpcode() == X86::CLEANUPRET) { + } else if (RetOpcode && *RetOpcode == X86::CLEANUPRET) { NumBytes = getWinEHFuncletFrameSize(MF); assert(hasFP(MF) && "EH funclets without FP not yet implemented"); BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::POP64r : X86::POP32r), @@ -1541,19 +1554,22 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, } uint64_t SEHStackAllocAmt = NumBytes; + MachineBasicBlock::iterator FirstCSPop = MBBI; // Skip the callee-saved pop instructions. 
while (MBBI != MBB.begin()) { MachineBasicBlock::iterator PI = std::prev(MBBI); unsigned Opc = PI->getOpcode(); - if ((Opc != X86::POP32r || !PI->getFlag(MachineInstr::FrameDestroy)) && - (Opc != X86::POP64r || !PI->getFlag(MachineInstr::FrameDestroy)) && - Opc != X86::DBG_VALUE && !PI->isTerminator()) - break; + if (Opc != X86::DBG_VALUE && !PI->isTerminator()) { + if ((Opc != X86::POP32r || !PI->getFlag(MachineInstr::FrameDestroy)) && + (Opc != X86::POP64r || !PI->getFlag(MachineInstr::FrameDestroy))) + break; + FirstCSPop = PI; + } --MBBI; } - MachineBasicBlock::iterator FirstCSPop = MBBI; + MBBI = FirstCSPop; if (TargetMBB) { // Fill EAX/RAX with the address of the target block. @@ -1581,14 +1597,14 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, // If there is an ADD32ri or SUB32ri of ESP immediately before this // instruction, merge the two instructions. - if (NumBytes || MFI->hasVarSizedObjects()) + if (NumBytes || MFI.hasVarSizedObjects()) NumBytes += mergeSPUpdates(MBB, MBBI, true); // If dynamic alloca is used, then reset esp to point to the last callee-saved // slot before popping them off! Same applies for the case, when stack was // realigned. Don't do this if this was a funclet epilogue, since the funclets // will not do realignment or dynamic stack allocation. - if ((TRI->needsStackRealignment(MF) || MFI->hasVarSizedObjects()) && + if ((TRI->needsStackRealignment(MF) || MFI.hasVarSizedObjects()) && !IsFunclet) { if (TRI->needsStackRealignment(MF)) MBBI = FirstCSPop; @@ -1626,10 +1642,10 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, // into the epilogue. To cope with that, we insert an epilogue marker here, // then replace it with a 'nop' if it ends up immediately after a CALL in the // final emitted code. - if (NeedsWinCFI) + if (NeedsWinCFI && MF.hasWinCFI()) BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_Epilogue)); - if (!isTailCallOpcode(RetOpcode)) { + if (!RetOpcode || !isTailCallOpcode(*RetOpcode)) { // Add the return addr area delta back since we are not tail calling. int Offset = -1 * X86FI->getTCReturnAddrDelta(); assert(Offset >= 0 && "TCDelta should never be positive"); @@ -1649,7 +1665,7 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, // (probably?) it should be moved into here. int X86FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI, unsigned &FrameReg) const { - const MachineFrameInfo *MFI = MF.getFrameInfo(); + const MachineFrameInfo &MFI = MF.getFrameInfo(); // We can't calculate offset from frame pointer if the stack is realigned, // so enforce usage of stack/base pointer. The base pointer is used when we @@ -1665,16 +1681,16 @@ int X86FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI, // object. // We need to factor in additional offsets applied during the prologue to the // frame, base, and stack pointer depending on which is used. - int Offset = MFI->getObjectOffset(FI) - getOffsetOfLocalArea(); + int Offset = MFI.getObjectOffset(FI) - getOffsetOfLocalArea(); const X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); unsigned CSSize = X86FI->getCalleeSavedFrameSize(); - uint64_t StackSize = MFI->getStackSize(); + uint64_t StackSize = MFI.getStackSize(); bool HasFP = hasFP(MF); bool IsWin64Prologue = MF.getTarget().getMCAsmInfo()->usesWindowsCFI(); int64_t FPDelta = 0; if (IsWin64Prologue) { - assert(!MFI->hasCalls() || (StackSize % 16) == 8); + assert(!MFI.hasCalls() || (StackSize % 16) == 8); // Calculate required stack adjustment. 
uint64_t FrameSize = StackSize - SlotSize; @@ -1692,7 +1708,7 @@ int X86FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI, // restricted Win64 prologue. // Add FPDelta to all offsets below that go through the frame pointer. FPDelta = FrameSize - SEHFrameOffset; - assert((!MFI->hasCalls() || (FPDelta % 16) == 0) && + assert((!MFI.hasCalls() || (FPDelta % 16) == 0) && "FPDelta isn't aligned per the Win64 ABI!"); } @@ -1703,7 +1719,7 @@ int X86FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI, // Skip the saved EBP. return Offset + SlotSize + FPDelta; } else { - assert((-(Offset + StackSize)) % MFI->getObjectAlignment(FI) == 0); + assert((-(Offset + StackSize)) % MFI.getObjectAlignment(FI) == 0); return Offset + StackSize; } } else if (TRI->needsStackRealignment(MF)) { @@ -1711,7 +1727,7 @@ int X86FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI, // Skip the saved EBP. return Offset + SlotSize + FPDelta; } else { - assert((-(Offset + StackSize)) % MFI->getObjectAlignment(FI) == 0); + assert((-(Offset + StackSize)) % MFI.getObjectAlignment(FI) == 0); return Offset + StackSize; } // FIXME: Support tail calls @@ -1736,9 +1752,9 @@ X86FrameLowering::getFrameIndexReferencePreferSP(const MachineFunction &MF, int FI, unsigned &FrameReg, bool IgnoreSPUpdates) const { - const MachineFrameInfo *MFI = MF.getFrameInfo(); + const MachineFrameInfo &MFI = MF.getFrameInfo(); // Does not include any dynamic realign. - const uint64_t StackSize = MFI->getStackSize(); + const uint64_t StackSize = MFI.getStackSize(); // LLVM arranges the stack as follows: // ... // ARG2 @@ -1772,7 +1788,7 @@ X86FrameLowering::getFrameIndexReferencePreferSP(const MachineFunction &MF, // answer we give is relative to the SP after the prologue, and not the // SP in the middle of the function. - if (MFI->isFixedObjectIndex(FI) && TRI->needsStackRealignment(MF) && + if (MFI.isFixedObjectIndex(FI) && TRI->needsStackRealignment(MF) && !STI.isTargetWin64()) return getFrameIndexReference(MF, FI, FrameReg); @@ -1804,7 +1820,7 @@ X86FrameLowering::getFrameIndexReferencePreferSP(const MachineFunction &MF, // // A is the incoming stack pointer. // (B - A) is the local area offset (-8 for x86-64) [1] - // (C - A) is the Offset returned by MFI->getObjectOffset for Obj0 [2] + // (C - A) is the Offset returned by MFI.getObjectOffset for Obj0 [2] // // |(E - B)| is the StackSize (absolute value, positive). For a // stack that grown down, this works out to be (B - E). [3] @@ -1817,7 +1833,7 @@ X86FrameLowering::getFrameIndexReferencePreferSP(const MachineFunction &MF, // // Get the Offset from the StackPointer - int Offset = MFI->getObjectOffset(FI) - getOffsetOfLocalArea(); + int Offset = MFI.getObjectOffset(FI) - getOffsetOfLocalArea(); return Offset + StackSize; } @@ -1825,7 +1841,7 @@ X86FrameLowering::getFrameIndexReferencePreferSP(const MachineFunction &MF, bool X86FrameLowering::assignCalleeSavedSpillSlots( MachineFunction &MF, const TargetRegisterInfo *TRI, std::vector<CalleeSavedInfo> &CSI) const { - MachineFrameInfo *MFI = MF.getFrameInfo(); + MachineFrameInfo &MFI = MF.getFrameInfo(); X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); unsigned CalleeSavedFrameSize = 0; @@ -1834,7 +1850,7 @@ bool X86FrameLowering::assignCalleeSavedSpillSlots( if (hasFP(MF)) { // emitPrologue always spills frame register the first thing. 
SpillSlotOffset -= SlotSize; - MFI->CreateFixedSpillStackObject(SlotSize, SpillSlotOffset); + MFI.CreateFixedSpillStackObject(SlotSize, SpillSlotOffset); // Since emitPrologue and emitEpilogue will handle spilling and restoring of // the frame register, we can delete it from CSI list and not have to worry @@ -1858,7 +1874,7 @@ bool X86FrameLowering::assignCalleeSavedSpillSlots( SpillSlotOffset -= SlotSize; CalleeSavedFrameSize += SlotSize; - int SlotIndex = MFI->CreateFixedSpillStackObject(SlotSize, SpillSlotOffset); + int SlotIndex = MFI.CreateFixedSpillStackObject(SlotSize, SpillSlotOffset); CSI[i - 1].setFrameIdx(SlotIndex); } @@ -1876,9 +1892,9 @@ bool X86FrameLowering::assignCalleeSavedSpillSlots( // spill into slot SpillSlotOffset -= RC->getSize(); int SlotIndex = - MFI->CreateFixedSpillStackObject(RC->getSize(), SpillSlotOffset); + MFI.CreateFixedSpillStackObject(RC->getSize(), SpillSlotOffset); CSI[i - 1].setFrameIdx(SlotIndex); - MFI->ensureMaxAlignment(RC->getAlignment()); + MFI.ensureMaxAlignment(RC->getAlignment()); } return true; @@ -1957,7 +1973,7 @@ bool X86FrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB, if (CSI.empty()) return false; - if (isFuncletReturnInstr(*MI) && STI.isOSWindows()) { + if (MI != MBB.end() && isFuncletReturnInstr(*MI) && STI.isOSWindows()) { // Don't restore CSRs in 32-bit EH funclets. Matches // spillCalleeSavedRegisters. if (STI.is32Bit()) @@ -2005,7 +2021,7 @@ void X86FrameLowering::determineCalleeSaves(MachineFunction &MF, RegScavenger *RS) const { TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS); - MachineFrameInfo *MFI = MF.getFrameInfo(); + MachineFrameInfo &MFI = MF.getFrameInfo(); X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); int64_t TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta(); @@ -2020,7 +2036,7 @@ void X86FrameLowering::determineCalleeSaves(MachineFunction &MF, // ... // } // [EBP] - MFI->CreateFixedObject(-TailCallReturnAddrDelta, + MFI.CreateFixedObject(-TailCallReturnAddrDelta, TailCallReturnAddrDelta - SlotSize, true); } @@ -2029,8 +2045,8 @@ void X86FrameLowering::determineCalleeSaves(MachineFunction &MF, SavedRegs.set(TRI->getBaseRegister()); // Allocate a spill slot for EBP if we have a base pointer and EH funclets. - if (MF.getMMI().hasEHFunclets()) { - int FI = MFI->CreateSpillStackObject(SlotSize, SlotSize); + if (MF.hasEHFunclets()) { + int FI = MFI.CreateSpillStackObject(SlotSize, SlotSize); X86FI->setHasSEHFramePtrSave(true); X86FI->setSEHFramePtrSaveIndex(FI); } @@ -2091,7 +2107,7 @@ static const uint64_t kSplitStackAvailable = 256; void X86FrameLowering::adjustForSegmentedStacks( MachineFunction &MF, MachineBasicBlock &PrologueMBB) const { - MachineFrameInfo *MFI = MF.getFrameInfo(); + MachineFrameInfo &MFI = MF.getFrameInfo(); uint64_t StackSize; unsigned TlsReg, TlsOffset; DebugLoc DL; @@ -2114,7 +2130,7 @@ void X86FrameLowering::adjustForSegmentedStacks( // Eventually StackSize will be calculated by a link-time pass; which will // also decide whether checking code needs to be injected into this particular // prologue. 
- StackSize = MFI->getStackSize(); + StackSize = MFI.getStackSize(); // Do not generate a prologue for functions with a stack of size zero if (StackSize == 0) @@ -2360,7 +2376,7 @@ static unsigned getHiPELiteral( /// if( temp0 < SP_LIMIT(P) ) goto IncStack else goto OldStart void X86FrameLowering::adjustForHiPEPrologue( MachineFunction &MF, MachineBasicBlock &PrologueMBB) const { - MachineFrameInfo *MFI = MF.getFrameInfo(); + MachineFrameInfo &MFI = MF.getFrameInfo(); DebugLoc DL; // To support shrink-wrapping we would need to insert the new blocks @@ -2380,7 +2396,7 @@ void X86FrameLowering::adjustForHiPEPrologue( const unsigned Guaranteed = HipeLeafWords * SlotSize; unsigned CallerStkArity = MF.getFunction()->arg_size() > CCRegisteredArgs ? MF.getFunction()->arg_size() - CCRegisteredArgs : 0; - unsigned MaxStack = MFI->getStackSize() + CallerStkArity*SlotSize + SlotSize; + unsigned MaxStack = MFI.getStackSize() + CallerStkArity*SlotSize + SlotSize; assert(STI.isTargetLinux() && "HiPE prologue is only supported on Linux operating systems."); @@ -2392,7 +2408,7 @@ void X86FrameLowering::adjustForHiPEPrologue( // b) outgoing on-stack parameter areas, and // c) the minimum stack space this function needs to make available for the // functions it calls (a tunable ABI property). - if (MFI->hasCalls()) { + if (MFI.hasCalls()) { unsigned MoreStackForCalls = 0; for (auto &MBB : MF) { @@ -2574,6 +2590,7 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, uint64_t Amount = !reserveCallFrame ? I->getOperand(0).getImm() : 0; uint64_t InternalAmt = (isDestroy || Amount) ? I->getOperand(1).getImm() : 0; I = MBB.erase(I); + auto InsertPos = skipDebugInstructionsForward(I, MBB.end()); if (!reserveCallFrame) { // If the stack pointer can be changed after prologue, turn the @@ -2599,12 +2616,11 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, // GNU_ARGS_SIZE. // TODO: We don't need to reset this between subsequent functions, // if it didn't change. - bool HasDwarfEHHandlers = !WindowsCFI && - !MF.getMMI().getLandingPads().empty(); + bool HasDwarfEHHandlers = !WindowsCFI && !MF.getLandingPads().empty(); if (HasDwarfEHHandlers && !isDestroy && MF.getInfo<X86MachineFunctionInfo>()->getHasPushSequences()) - BuildCFI(MBB, I, DL, + BuildCFI(MBB, InsertPos, DL, MCCFIInstruction::createGnuArgsSize(nullptr, Amount)); if (Amount == 0) @@ -2618,7 +2634,7 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, // If this is a callee-pop calling convention, emit a CFA adjust for // the amount the callee popped. if (isDestroy && InternalAmt && DwarfCFI && !hasFP(MF)) - BuildCFI(MBB, I, DL, + BuildCFI(MBB, InsertPos, DL, MCCFIInstruction::createAdjustCfaOffset(nullptr, -InternalAmt)); // Add Amount to SP to destroy a frame, or subtract to setup. @@ -2629,13 +2645,13 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, // Merge with any previous or following adjustment instruction. Note: the // instructions merged with here do not have CFI, so their stack // adjustments do not feed into CfaAdjustment. 
- StackAdjustment += mergeSPUpdates(MBB, I, true); - StackAdjustment += mergeSPUpdates(MBB, I, false); + StackAdjustment += mergeSPUpdates(MBB, InsertPos, true); + StackAdjustment += mergeSPUpdates(MBB, InsertPos, false); if (StackAdjustment) { if (!(Fn->optForMinSize() && - adjustStackWithPops(MBB, I, DL, StackAdjustment))) - BuildStackAdjustment(MBB, I, DL, StackAdjustment, + adjustStackWithPops(MBB, InsertPos, DL, StackAdjustment))) + BuildStackAdjustment(MBB, InsertPos, DL, StackAdjustment, /*InEpilogue=*/false); } } @@ -2651,8 +2667,9 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, // TODO: When not using precise CFA, we also need to adjust for the // InternalAmt here. if (CfaAdjustment) { - BuildCFI(MBB, I, DL, MCCFIInstruction::createAdjustCfaOffset( - nullptr, CfaAdjustment)); + BuildCFI(MBB, InsertPos, DL, + MCCFIInstruction::createAdjustCfaOffset(nullptr, + CfaAdjustment)); } } @@ -2728,12 +2745,12 @@ MachineBasicBlock::iterator X86FrameLowering::restoreWin32EHStackPointers( unsigned BasePtr = TRI->getBaseRegister(); WinEHFuncInfo &FuncInfo = *MF.getWinEHFuncInfo(); X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); - MachineFrameInfo *MFI = MF.getFrameInfo(); + MachineFrameInfo &MFI = MF.getFrameInfo(); // FIXME: Don't set FrameSetup flag in catchret case. int FI = FuncInfo.EHRegNodeFrameIndex; - int EHRegSize = MFI->getObjectSize(FI); + int EHRegSize = MFI.getObjectSize(FI); if (RestoreSP) { // MOV32rm -EHRegSize(%ebp), %esp @@ -2850,7 +2867,7 @@ struct X86FrameSortingComparator { // of uses and size of object in order to minimize code size. void X86FrameLowering::orderFrameObjects( const MachineFunction &MF, SmallVectorImpl<int> &ObjectsToAllocate) const { - const MachineFrameInfo *MFI = MF.getFrameInfo(); + const MachineFrameInfo &MFI = MF.getFrameInfo(); // Don't waste time if there's nothing to do. if (ObjectsToAllocate.empty()) @@ -2861,16 +2878,16 @@ void X86FrameLowering::orderFrameObjects( // it easier to index into when we're counting "uses" down below. // We want to be able to easily/cheaply access an object by simply // indexing into it, instead of having to search for it every time. - std::vector<X86FrameSortingObject> SortingObjects(MFI->getObjectIndexEnd()); + std::vector<X86FrameSortingObject> SortingObjects(MFI.getObjectIndexEnd()); // Walk the objects we care about and mark them as such in our working // struct. for (auto &Obj : ObjectsToAllocate) { SortingObjects[Obj].IsValid = true; SortingObjects[Obj].ObjectIndex = Obj; - SortingObjects[Obj].ObjectAlignment = MFI->getObjectAlignment(Obj); + SortingObjects[Obj].ObjectAlignment = MFI.getObjectAlignment(Obj); // Set the size. - int ObjectSize = MFI->getObjectSize(Obj); + int ObjectSize = MFI.getObjectSize(Obj); if (ObjectSize == 0) // Variable size. Just use 4. SortingObjects[Obj].ObjectSize = 4; @@ -2890,7 +2907,7 @@ void X86FrameLowering::orderFrameObjects( int Index = MO.getIndex(); // Check to see if it falls within our range, and is tagged // to require ordering. - if (Index >= 0 && Index < MFI->getObjectIndexEnd() && + if (Index >= 0 && Index < MFI.getObjectIndexEnd() && SortingObjects[Index].IsValid) SortingObjects[Index].ObjectNumUses++; } @@ -2938,7 +2955,7 @@ void X86FrameLowering::processFunctionBeforeFrameFinalized( // If this function isn't doing Win64-style C++ EH, we don't need to do // anything. 
const Function *Fn = MF.getFunction(); - if (!STI.is64Bit() || !MF.getMMI().hasEHFunclets() || + if (!STI.is64Bit() || !MF.hasEHFunclets() || classifyEHPersonality(Fn->getPersonalityFn()) != EHPersonality::MSVC_CXX) return; @@ -2947,21 +2964,21 @@ void X86FrameLowering::processFunctionBeforeFrameFinalized( // object, so that we can allocate a slot immediately following it. If there // were no fixed objects, use offset -SlotSize, which is immediately after the // return address. Fixed objects have negative frame indices. - MachineFrameInfo *MFI = MF.getFrameInfo(); + MachineFrameInfo &MFI = MF.getFrameInfo(); WinEHFuncInfo &EHInfo = *MF.getWinEHFuncInfo(); int64_t MinFixedObjOffset = -SlotSize; - for (int I = MFI->getObjectIndexBegin(); I < 0; ++I) - MinFixedObjOffset = std::min(MinFixedObjOffset, MFI->getObjectOffset(I)); + for (int I = MFI.getObjectIndexBegin(); I < 0; ++I) + MinFixedObjOffset = std::min(MinFixedObjOffset, MFI.getObjectOffset(I)); for (WinEHTryBlockMapEntry &TBME : EHInfo.TryBlockMap) { for (WinEHHandlerType &H : TBME.HandlerArray) { int FrameIndex = H.CatchObj.FrameIndex; if (FrameIndex != INT_MAX) { // Ensure alignment. - unsigned Align = MFI->getObjectAlignment(FrameIndex); + unsigned Align = MFI.getObjectAlignment(FrameIndex); MinFixedObjOffset -= std::abs(MinFixedObjOffset) % Align; - MinFixedObjOffset -= MFI->getObjectSize(FrameIndex); - MFI->setObjectOffset(FrameIndex, MinFixedObjOffset); + MinFixedObjOffset -= MFI.getObjectSize(FrameIndex); + MFI.setObjectOffset(FrameIndex, MinFixedObjOffset); } } } @@ -2970,7 +2987,7 @@ void X86FrameLowering::processFunctionBeforeFrameFinalized( MinFixedObjOffset -= std::abs(MinFixedObjOffset) % 8; int64_t UnwindHelpOffset = MinFixedObjOffset - SlotSize; int UnwindHelpFI = - MFI->CreateFixedObject(SlotSize, UnwindHelpOffset, /*Immutable=*/false); + MFI.CreateFixedObject(SlotSize, UnwindHelpOffset, /*Immutable=*/false); EHInfo.UnwindHelpFrameIdx = UnwindHelpFI; // Store -2 into UnwindHelp on function entry. We have to scan forwards past diff --git a/contrib/llvm/lib/Target/X86/X86FrameLowering.h b/contrib/llvm/lib/Target/X86/X86FrameLowering.h index 4a01014..e1b04d6 100644 --- a/contrib/llvm/lib/Target/X86/X86FrameLowering.h +++ b/contrib/llvm/lib/Target/X86/X86FrameLowering.h @@ -49,11 +49,10 @@ public: /// Emit target stack probe code. This is required for all /// large stack allocations on Windows. The caller is required to materialize - /// the number of bytes to probe in RAX/EAX. Returns instruction just - /// after the expansion. - MachineInstr *emitStackProbe(MachineFunction &MF, MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, - const DebugLoc &DL, bool InProlog) const; + /// the number of bytes to probe in RAX/EAX. + void emitStackProbe(MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, const DebugLoc &DL, + bool InProlog) const; /// Replace a StackProbe inline-stub with the actual probe code inline. void inlineStackProbe(MachineFunction &MF, @@ -179,22 +178,19 @@ private: uint64_t calculateMaxStackAlign(const MachineFunction &MF) const; /// Emit target stack probe as a call to a helper function - MachineInstr *emitStackProbeCall(MachineFunction &MF, MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, - const DebugLoc &DL, bool InProlog) const; + void emitStackProbeCall(MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, const DebugLoc &DL, + bool InProlog) const; /// Emit target stack probe as an inline sequence. 
- MachineInstr *emitStackProbeInline(MachineFunction &MF, - MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, - const DebugLoc &DL, bool InProlog) const; + void emitStackProbeInline(MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + const DebugLoc &DL, bool InProlog) const; /// Emit a stub to later inline the target stack probe. - MachineInstr *emitStackProbeInlineStub(MachineFunction &MF, - MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, - const DebugLoc &DL, - bool InProlog) const; + void emitStackProbeInlineStub(MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + const DebugLoc &DL, bool InProlog) const; /// Aligns the stack pointer by ANDing it with -MaxAlign. void BuildStackAlignAND(MachineBasicBlock &MBB, diff --git a/contrib/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/contrib/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp index 7d53b3d..8ab4c06 100644 --- a/contrib/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/contrib/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -24,6 +24,7 @@ #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/SelectionDAGISel.h" +#include "llvm/IR/ConstantRange.h" #include "llvm/IR/Function.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Intrinsics.h" @@ -165,7 +166,7 @@ namespace { : SelectionDAGISel(tm, OptLevel), OptForSize(false), OptForMinSize(false) {} - const char *getPassName() const override { + StringRef getPassName() const override { return "X86 DAG->DAG Instruction Selection"; } @@ -182,16 +183,6 @@ namespace { void PreprocessISelDAG() override; - inline bool immSext8(SDNode *N) const { - return isInt<8>(cast<ConstantSDNode>(N)->getSExtValue()); - } - - // True if the 64-bit immediate fits in a 32-bit sign-extended field. - inline bool i64immSExt32(SDNode *N) const { - uint64_t v = cast<ConstantSDNode>(N)->getZExtValue(); - return (int64_t)v == (int32_t)v; - } - // Include the pieces autogenerated from the target description. #include "X86GenDAGISel.inc" @@ -228,6 +219,7 @@ namespace { SDValue &Index, SDValue &Disp, SDValue &Segment, SDValue &NodeWithChain); + bool selectRelocImm(SDValue N, SDValue &Op); bool tryFoldLoad(SDNode *P, SDValue N, SDValue &Base, SDValue &Scale, @@ -1234,7 +1226,7 @@ bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM, case ISD::UMUL_LOHI: // A mul_lohi where we need the low part can be folded as a plain multiply. if (N.getResNo() != 0) break; - // FALL THROUGH + LLVM_FALLTHROUGH; case ISD::MUL: case X86ISD::MUL_IMM: // X*[3,5,9] -> X+X*[2,4,8] @@ -1435,7 +1427,7 @@ bool X86DAGToDAGISel::selectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base, SDLoc DL(N); Base = Mgs->getBasePtr(); Index = Mgs->getIndex(); - unsigned ScalarSize = Mgs->getValue().getValueType().getScalarSizeInBits(); + unsigned ScalarSize = Mgs->getValue().getScalarValueSizeInBits(); Scale = getI8Imm(ScalarSize/8, DL); // If Base is 0, the whole address is in index and the Scale is 1 @@ -1512,16 +1504,39 @@ bool X86DAGToDAGISel::selectScalarSSELoad(SDNode *Root, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment, SDValue &PatternNodeWithChain) { - if (N.getOpcode() == ISD::SCALAR_TO_VECTOR) { + // We can allow a full vector load here since narrowing a load is ok. 
+ if (ISD::isNON_EXTLoad(N.getNode())) { + PatternNodeWithChain = N; + if (IsProfitableToFold(PatternNodeWithChain, N.getNode(), Root) && + IsLegalToFold(PatternNodeWithChain, *N->use_begin(), Root, OptLevel)) { + LoadSDNode *LD = cast<LoadSDNode>(PatternNodeWithChain); + return selectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp, + Segment); + } + } + + // We can also match the special zero extended load opcode. + if (N.getOpcode() == X86ISD::VZEXT_LOAD) { + PatternNodeWithChain = N; + if (IsProfitableToFold(PatternNodeWithChain, N.getNode(), Root) && + IsLegalToFold(PatternNodeWithChain, *N->use_begin(), Root, OptLevel)) { + auto *MI = cast<MemIntrinsicSDNode>(PatternNodeWithChain); + return selectAddr(MI, MI->getBasePtr(), Base, Scale, Index, Disp, + Segment); + } + } + + // Need to make sure that the SCALAR_TO_VECTOR and load are both only used + // once. Otherwise the load might get duplicated and the chain output of the + // duplicate load will not be observed by all dependencies. + if (N.getOpcode() == ISD::SCALAR_TO_VECTOR && N.getNode()->hasOneUse()) { PatternNodeWithChain = N.getOperand(0); if (ISD::isNON_EXTLoad(PatternNodeWithChain.getNode()) && - PatternNodeWithChain.hasOneUse() && - IsProfitableToFold(N.getOperand(0), N.getNode(), Root) && - IsLegalToFold(N.getOperand(0), N.getNode(), Root, OptLevel)) { + IsProfitableToFold(PatternNodeWithChain, N.getNode(), Root) && + IsLegalToFold(PatternNodeWithChain, N.getNode(), Root, OptLevel)) { LoadSDNode *LD = cast<LoadSDNode>(PatternNodeWithChain); - if (!selectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp, Segment)) - return false; - return true; + return selectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp, + Segment); } } @@ -1530,18 +1545,18 @@ bool X86DAGToDAGISel::selectScalarSSELoad(SDNode *Root, if (N.getOpcode() == X86ISD::VZEXT_MOVL && N.getNode()->hasOneUse() && // Check to see if the top elements are all zeros (or bitcast of zeros). N.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR && - N.getOperand(0).getNode()->hasOneUse() && - ISD::isNON_EXTLoad(N.getOperand(0).getOperand(0).getNode()) && - N.getOperand(0).getOperand(0).hasOneUse() && - IsProfitableToFold(N.getOperand(0), N.getNode(), Root) && - IsLegalToFold(N.getOperand(0), N.getNode(), Root, OptLevel)) { - // Okay, this is a zero extending load. Fold it. - LoadSDNode *LD = cast<LoadSDNode>(N.getOperand(0).getOperand(0)); - if (!selectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp, Segment)) - return false; - PatternNodeWithChain = SDValue(LD, 0); - return true; + N.getOperand(0).getNode()->hasOneUse()) { + PatternNodeWithChain = N.getOperand(0).getOperand(0); + if (ISD::isNON_EXTLoad(PatternNodeWithChain.getNode()) && + IsProfitableToFold(PatternNodeWithChain, N.getNode(), Root) && + IsLegalToFold(PatternNodeWithChain, N.getNode(), Root, OptLevel)) { + // Okay, this is a zero extending load. Fold it. 
+ LoadSDNode *LD = cast<LoadSDNode>(PatternNodeWithChain); + return selectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp, + Segment); + } } + return false; } @@ -1563,16 +1578,21 @@ bool X86DAGToDAGISel::selectMOV64Imm32(SDValue N, SDValue &Imm) { "Unexpected node type for MOV32ri64"); N = N.getOperand(0); - if (N->getOpcode() != ISD::TargetConstantPool && - N->getOpcode() != ISD::TargetJumpTable && - N->getOpcode() != ISD::TargetGlobalAddress && - N->getOpcode() != ISD::TargetExternalSymbol && - N->getOpcode() != ISD::MCSymbol && - N->getOpcode() != ISD::TargetBlockAddress) + // At least GNU as does not accept 'movl' for TPOFF relocations. + // FIXME: We could use 'movl' when we know we are targeting MC. + if (N->getOpcode() == ISD::TargetGlobalTLSAddress) return false; Imm = N; - return TM.getCodeModel() == CodeModel::Small; + if (N->getOpcode() != ISD::TargetGlobalAddress) + return TM.getCodeModel() == CodeModel::Small; + + Optional<ConstantRange> CR = + cast<GlobalAddressSDNode>(N)->getGlobal()->getAbsoluteSymbolRange(); + if (!CR) + return TM.getCodeModel() == CodeModel::Small; + + return CR->getUnsignedMax().ult(1ull << 32); } bool X86DAGToDAGISel::selectLEA64_32Addr(SDValue N, SDValue &Base, @@ -1704,6 +1724,48 @@ bool X86DAGToDAGISel::selectTLSADDRAddr(SDValue N, SDValue &Base, return true; } +bool X86DAGToDAGISel::selectRelocImm(SDValue N, SDValue &Op) { + if (auto *CN = dyn_cast<ConstantSDNode>(N)) { + Op = CurDAG->getTargetConstant(CN->getAPIntValue(), SDLoc(CN), + N.getValueType()); + return true; + } + + // Keep track of the original value type and whether this value was + // truncated. If we see a truncation from pointer type to VT that truncates + // bits that are known to be zero, we can use a narrow reference. + EVT VT = N.getValueType(); + bool WasTruncated = false; + if (N.getOpcode() == ISD::TRUNCATE) { + WasTruncated = true; + N = N.getOperand(0); + } + + if (N.getOpcode() != X86ISD::Wrapper) + return false; + + // We can only use non-GlobalValues as immediates if they were not truncated, + // as we do not have any range information. If we have a GlobalValue and the + // address was not truncated, we can select it as an operand directly. + unsigned Opc = N.getOperand(0)->getOpcode(); + if (Opc != ISD::TargetGlobalAddress || !WasTruncated) { + Op = N.getOperand(0); + // We can only select the operand directly if we didn't have to look past a + // truncate. + return !WasTruncated; + } + + // Check that the global's range fits into VT. + auto *GA = cast<GlobalAddressSDNode>(N.getOperand(0)); + Optional<ConstantRange> CR = GA->getGlobal()->getAbsoluteSymbolRange(); + if (!CR || CR->getUnsignedMax().uge(1ull << VT.getSizeInBits())) + return false; + + // Okay, we can use a narrow reference. + Op = CurDAG->getTargetGlobalAddress(GA->getGlobal(), SDLoc(N), VT, + GA->getOffset(), GA->getTargetFlags()); + return true; +} bool X86DAGToDAGISel::tryFoldLoad(SDNode *P, SDValue N, SDValue &Base, SDValue &Scale, @@ -2700,7 +2762,7 @@ SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID, case InlineAsm::Constraint_i: // FIXME: It seems strange that 'i' is needed here since it's supposed to // be an immediate and not a memory constraint. - // Fallthrough. + LLVM_FALLTHROUGH; case InlineAsm::Constraint_o: // offsetable ?? case InlineAsm::Constraint_v: // not offsetable ?? 
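Aside, not part of the patch: the new TargetGlobalAddress paths in selectMOV64Imm32 and selectRelocImm above only narrow a symbol reference when the global's absolute symbol range fits the destination width. As an illustrative sketch (fitsUnsigned is an invented helper, not an LLVM API), the test boils down to:

    #include <cstdint>

    // A value known to lie in [0, UMax] can live in an N-bit zero-extended
    // field exactly when UMax fits in N bits.
    static bool fitsUnsigned(uint64_t UMax, unsigned NumBits) {
      return NumBits >= 64 || UMax < (uint64_t{1} << NumBits);
    }
    // selectMOV64Imm32 asks this question for N == 32; selectRelocImm asks it
    // for the bit width of the (possibly truncated) destination type.
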
case InlineAsm::Constraint_m: // memory diff --git a/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp b/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp index f499e56..08fe2ba 100644 --- a/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -17,6 +17,7 @@ #include "X86CallingConv.h" #include "X86FrameLowering.h" #include "X86InstrBuilder.h" +#include "X86IntrinsicsInfo.h" #include "X86MachineFunctionInfo.h" #include "X86ShuffleDecodeConstantPool.h" #include "X86TargetMachine.h" @@ -53,10 +54,10 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" #include "llvm/Target/TargetOptions.h" -#include "X86IntrinsicsInfo.h" +#include <algorithm> #include <bitset> -#include <numeric> #include <cctype> +#include <numeric> using namespace llvm; #define DEBUG_TYPE "x86-isel" @@ -96,15 +97,16 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister()); - // Bypass expensive divides on Atom when compiling with O2. + // Bypass expensive divides and use cheaper ones. if (TM.getOptLevel() >= CodeGenOpt::Default) { if (Subtarget.hasSlowDivide32()) addBypassSlowDiv(32, 8); if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit()) - addBypassSlowDiv(64, 16); + addBypassSlowDiv(64, 32); } - if (Subtarget.isTargetKnownWindowsMSVC()) { + if (Subtarget.isTargetKnownWindowsMSVC() || + Subtarget.isTargetWindowsItanium()) { // Setup Windows compiler runtime calls. setLibcallName(RTLIB::SDIV_I64, "_alldiv"); setLibcallName(RTLIB::UDIV_I64, "_aulldiv"); @@ -286,7 +288,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::UDIV, VT, Expand); setOperationAction(ISD::SREM, VT, Expand); setOperationAction(ISD::UREM, VT, Expand); + } + for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) { + if (VT == MVT::i64 && !Subtarget.is64Bit()) + continue; // Add/Sub overflow ops with MVT::Glues are lowered to EFLAGS dependences. setOperationAction(ISD::ADDC, VT, Custom); setOperationAction(ISD::ADDE, VT, Custom); @@ -349,7 +355,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // Special handling for half-precision floating point conversions. // If we don't have F16C support, then lower half float conversions // into library calls. - if (Subtarget.useSoftFloat() || !Subtarget.hasF16C()) { + if (Subtarget.useSoftFloat() || + (!Subtarget.hasF16C() && !Subtarget.hasAVX512())) { setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand); setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand); } @@ -484,8 +491,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, if (!Subtarget.useSoftFloat() && X86ScalarSSEf64) { // f32 and f64 use SSE. // Set up the FP register classes. - addRegisterClass(MVT::f32, &X86::FR32RegClass); - addRegisterClass(MVT::f64, &X86::FR64RegClass); + addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass + : &X86::FR32RegClass); + addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass + : &X86::FR64RegClass); for (auto VT : { MVT::f32, MVT::f64 }) { // Use ANDPD to simulate FABS. @@ -514,7 +523,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, } else if (UseX87 && X86ScalarSSEf32) { // Use SSE for f32, x87 for f64. // Set up the FP register classes. - addRegisterClass(MVT::f32, &X86::FR32RegClass); + addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? 
&X86::FR32XRegClass + : &X86::FR32RegClass); addRegisterClass(MVT::f64, &X86::RFP64RegClass); // Use ANDPS to simulate FABS. @@ -590,14 +600,14 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::UNDEF, MVT::f80, Expand); setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand); { - APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended); + APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended()); addLegalFPImmediate(TmpFlt); // FLD0 TmpFlt.changeSign(); addLegalFPImmediate(TmpFlt); // FLD0/FCHS bool ignored; APFloat TmpFlt2(+1.0); - TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven, + TmpFlt2.convert(APFloat::x87DoubleExtended(), APFloat::rmNearestTiesToEven, &ignored); addLegalFPImmediate(TmpFlt2); // FLD1 TmpFlt2.changeSign(); @@ -717,10 +727,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, } if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) { - addRegisterClass(MVT::v4f32, &X86::VR128RegClass); + addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass + : &X86::VR128RegClass); setOperationAction(ISD::FNEG, MVT::v4f32, Custom); setOperationAction(ISD::FABS, MVT::v4f32, Custom); + setOperationAction(ISD::FCOPYSIGN, MVT::v4f32, Custom); setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom); setOperationAction(ISD::VSELECT, MVT::v4f32, Custom); @@ -730,14 +742,19 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, } if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) { - addRegisterClass(MVT::v2f64, &X86::VR128RegClass); + addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass + : &X86::VR128RegClass); // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM // registers cannot be used even for integer operations. - addRegisterClass(MVT::v16i8, &X86::VR128RegClass); - addRegisterClass(MVT::v8i16, &X86::VR128RegClass); - addRegisterClass(MVT::v4i32, &X86::VR128RegClass); - addRegisterClass(MVT::v2i64, &X86::VR128RegClass); + addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass + : &X86::VR128RegClass); + addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass + : &X86::VR128RegClass); + addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass + : &X86::VR128RegClass); + addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass + : &X86::VR128RegClass); setOperationAction(ISD::MUL, MVT::v16i8, Custom); setOperationAction(ISD::MUL, MVT::v4i32, Custom); @@ -751,6 +768,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::MUL, MVT::v8i16, Legal); setOperationAction(ISD::FNEG, MVT::v2f64, Custom); setOperationAction(ISD::FABS, MVT::v2f64, Custom); + setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Custom); setOperationAction(ISD::SMAX, MVT::v8i16, Legal); setOperationAction(ISD::UMAX, MVT::v16i8, Legal); @@ -776,7 +794,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::CTTZ, MVT::v16i8, Custom); setOperationAction(ISD::CTTZ, MVT::v8i16, Custom); setOperationAction(ISD::CTTZ, MVT::v4i32, Custom); - // ISD::CTTZ v2i64 - scalarization is faster. + setOperationAction(ISD::CTTZ, MVT::v2i64, Custom); // Custom lower build_vector, vector_shuffle, and extract_vector_elt. 
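Aside, not part of the patch: several FCOPYSIGN actions are switched to Custom in this hunk. For orientation, the scalar form of that lowering is the usual sign-bit masking trick, and the vector version applies essentially the same two masks lane-wise. A minimal sketch:

    #include <cstdint>
    #include <cstring>

    // copysign(X, Y) as pure bit arithmetic: keep the magnitude bits of X
    // and take the sign bit from Y.
    static float copysignViaMasks(float X, float Y) {
      uint32_t XBits, YBits;
      std::memcpy(&XBits, &X, sizeof(float));
      std::memcpy(&YBits, &Y, sizeof(float));
      uint32_t RBits = (XBits & 0x7fffffffu) | (YBits & 0x80000000u);
      float R;
      std::memcpy(&R, &RBits, sizeof(float));
      return R;
    }
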
for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) { @@ -828,16 +846,17 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SELECT, MVT::v2i64, Custom); setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal); - setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal); + setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom); + setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal); setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::v4i8, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom); - // As there is no 64-bit GPR available, we need build a special custom - // sequence to convert from v2i32 to v2f32. - if (!Subtarget.is64Bit()) - setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom); + setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom); + + // Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion. + setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom); setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom); setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom); @@ -872,8 +891,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::BITREVERSE, MVT::v16i8, Custom); setOperationAction(ISD::CTLZ, MVT::v16i8, Custom); setOperationAction(ISD::CTLZ, MVT::v8i16, Custom); - // ISD::CTLZ v4i32 - scalarization is faster. - // ISD::CTLZ v2i64 - scalarization is faster. + setOperationAction(ISD::CTLZ, MVT::v4i32, Custom); + setOperationAction(ISD::CTLZ, MVT::v2i64, Custom); } if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) { @@ -946,12 +965,18 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, if (!Subtarget.useSoftFloat() && Subtarget.hasFp256()) { bool HasInt256 = Subtarget.hasInt256(); - addRegisterClass(MVT::v32i8, &X86::VR256RegClass); - addRegisterClass(MVT::v16i16, &X86::VR256RegClass); - addRegisterClass(MVT::v8i32, &X86::VR256RegClass); - addRegisterClass(MVT::v8f32, &X86::VR256RegClass); - addRegisterClass(MVT::v4i64, &X86::VR256RegClass); - addRegisterClass(MVT::v4f64, &X86::VR256RegClass); + addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass + : &X86::VR256RegClass); + addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass + : &X86::VR256RegClass); + addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass + : &X86::VR256RegClass); + addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass + : &X86::VR256RegClass); + addRegisterClass(MVT::v4i64, Subtarget.hasVLX() ? &X86::VR256XRegClass + : &X86::VR256RegClass); + addRegisterClass(MVT::v4f64, Subtarget.hasVLX() ? &X86::VR256XRegClass + : &X86::VR256RegClass); for (auto VT : { MVT::v8f32, MVT::v4f64 }) { setOperationAction(ISD::FFLOOR, VT, Legal); @@ -961,6 +986,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FNEARBYINT, VT, Legal); setOperationAction(ISD::FNEG, VT, Custom); setOperationAction(ISD::FABS, VT, Custom); + setOperationAction(ISD::FCOPYSIGN, VT, Custom); } // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted @@ -1011,16 +1037,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) { setOperationAction(ISD::CTPOP, VT, Custom); setOperationAction(ISD::CTTZ, VT, Custom); - } - - // ISD::CTLZ v8i32/v4i64 - scalarization is faster without AVX2 - // as we end up splitting the 256-bit vectors. 
- for (auto VT : { MVT::v32i8, MVT::v16i16 }) setOperationAction(ISD::CTLZ, VT, Custom); - - if (HasInt256) - for (auto VT : { MVT::v8i32, MVT::v4i64 }) - setOperationAction(ISD::CTLZ, VT, Custom); + } if (Subtarget.hasAnyFMA()) { for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32, @@ -1171,12 +1189,14 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FNEG, VT, Custom); setOperationAction(ISD::FABS, VT, Custom); setOperationAction(ISD::FMA, VT, Legal); + setOperationAction(ISD::FCOPYSIGN, VT, Custom); } setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal); setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal); setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal); setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal); + setOperationAction(ISD::FP_TO_UINT, MVT::v2i32, Custom); setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal); setOperationAction(ISD::SINT_TO_FP, MVT::v8i1, Custom); setOperationAction(ISD::SINT_TO_FP, MVT::v16i1, Custom); @@ -1216,10 +1236,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal); setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal); } else { - setOperationAction(ISD::MLOAD, MVT::v8i32, Custom); - setOperationAction(ISD::MLOAD, MVT::v8f32, Custom); - setOperationAction(ISD::MSTORE, MVT::v8i32, Custom); - setOperationAction(ISD::MSTORE, MVT::v8f32, Custom); + for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64, + MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) { + setOperationAction(ISD::MLOAD, VT, Custom); + setOperationAction(ISD::MSTORE, VT, Custom); + } } setOperationAction(ISD::TRUNCATE, MVT::i1, Custom); setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom); @@ -1230,18 +1251,23 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::VSELECT, MVT::v16i1, Expand); if (Subtarget.hasDQI()) { setOperationAction(ISD::SINT_TO_FP, MVT::v8i64, Legal); + setOperationAction(ISD::SINT_TO_FP, MVT::v4i64, Legal); + setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Legal); setOperationAction(ISD::UINT_TO_FP, MVT::v8i64, Legal); + setOperationAction(ISD::UINT_TO_FP, MVT::v4i64, Legal); + setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Legal); setOperationAction(ISD::FP_TO_SINT, MVT::v8i64, Legal); + setOperationAction(ISD::FP_TO_SINT, MVT::v4i64, Legal); + setOperationAction(ISD::FP_TO_SINT, MVT::v2i64, Legal); setOperationAction(ISD::FP_TO_UINT, MVT::v8i64, Legal); + setOperationAction(ISD::FP_TO_UINT, MVT::v4i64, Legal); + setOperationAction(ISD::FP_TO_UINT, MVT::v2i64, Legal); + if (Subtarget.hasVLX()) { - setOperationAction(ISD::SINT_TO_FP, MVT::v4i64, Legal); - setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Legal); - setOperationAction(ISD::UINT_TO_FP, MVT::v4i64, Legal); - setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Legal); - setOperationAction(ISD::FP_TO_SINT, MVT::v4i64, Legal); - setOperationAction(ISD::FP_TO_SINT, MVT::v2i64, Legal); - setOperationAction(ISD::FP_TO_UINT, MVT::v4i64, Legal); - setOperationAction(ISD::FP_TO_UINT, MVT::v2i64, Legal); + // Fast v2f32 SINT_TO_FP( v2i32 ) custom conversion. 
+ setOperationAction(ISD::SINT_TO_FP, MVT::v2f32, Custom); + setOperationAction(ISD::FP_TO_SINT, MVT::v2f32, Custom); + setOperationAction(ISD::FP_TO_UINT, MVT::v2f32, Custom); } } if (Subtarget.hasVLX()) { @@ -1250,11 +1276,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal); setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal); setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal); - setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal); setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal); setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal); setOperationAction(ISD::ZERO_EXTEND, MVT::v4i32, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::v2i64, Custom); + setOperationAction(ISD::SIGN_EXTEND, MVT::v4i32, Custom); + setOperationAction(ISD::SIGN_EXTEND, MVT::v2i64, Custom); // FIXME. This commands are available on SSE/AVX2, add relevant patterns. setLoadExtAction(ISD::EXTLOAD, MVT::v8i32, MVT::v8i8, Legal); @@ -1281,10 +1308,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SIGN_EXTEND, MVT::v16i8, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v8i16, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom); - if (Subtarget.hasDQI()) { - setOperationAction(ISD::SIGN_EXTEND, MVT::v4i32, Custom); - setOperationAction(ISD::SIGN_EXTEND, MVT::v2i64, Custom); - } + for (auto VT : { MVT::v16f32, MVT::v8f64 }) { setOperationAction(ISD::FFLOOR, VT, Legal); setOperationAction(ISD::FCEIL, VT, Legal); @@ -1293,6 +1317,13 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FNEARBYINT, VT, Legal); } + setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i64, Custom); + setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v16i32, Custom); + + // Without BWI we need to use custom lowering to handle MVT::v64i8 input. + setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v64i8, Custom); + setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, MVT::v64i8, Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f64, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i64, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f32, Custom); @@ -1339,13 +1370,17 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SRL, VT, Custom); setOperationAction(ISD::SHL, VT, Custom); setOperationAction(ISD::SRA, VT, Custom); - setOperationAction(ISD::AND, VT, Legal); - setOperationAction(ISD::OR, VT, Legal); - setOperationAction(ISD::XOR, VT, Legal); setOperationAction(ISD::CTPOP, VT, Custom); setOperationAction(ISD::CTTZ, VT, Custom); } + // Need to promote to 64-bit even though we have 32-bit masked instructions + // because the IR optimizers rearrange bitcasts around logic ops leaving + // too many variations to handle if we don't promote them. 
+ setOperationPromotedToType(ISD::AND, MVT::v16i32, MVT::v8i64); + setOperationPromotedToType(ISD::OR, MVT::v16i32, MVT::v8i64); + setOperationPromotedToType(ISD::XOR, MVT::v16i32, MVT::v8i64); + if (Subtarget.hasCDI()) { setOperationAction(ISD::CTLZ, MVT::v8i64, Legal); setOperationAction(ISD::CTLZ, MVT::v16i32, Legal); @@ -1377,12 +1412,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, } // Subtarget.hasCDI() if (Subtarget.hasDQI()) { - if (Subtarget.hasVLX()) { - setOperationAction(ISD::MUL, MVT::v2i64, Legal); - setOperationAction(ISD::MUL, MVT::v4i64, Legal); - } + // NonVLX sub-targets extend 128/256 vectors to use the 512 version. + setOperationAction(ISD::MUL, MVT::v2i64, Legal); + setOperationAction(ISD::MUL, MVT::v4i64, Legal); setOperationAction(ISD::MUL, MVT::v8i64, Legal); } + // Custom lower several nodes. for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64, MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) { @@ -1413,6 +1448,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::MSCATTER, VT, Custom); } for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32 }) { + setOperationPromotedToType(ISD::LOAD, VT, MVT::v8i64); setOperationPromotedToType(ISD::SELECT, VT, MVT::v8i64); } }// has AVX-512 @@ -1447,6 +1483,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i8, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i16, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i8, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i1, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i1, Custom); setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32i16, Custom); setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v64i8, Custom); setOperationAction(ISD::SELECT, MVT::v32i1, Custom); @@ -1486,10 +1524,13 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::UMIN, MVT::v64i8, Legal); setOperationAction(ISD::UMIN, MVT::v32i16, Legal); + setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v32i16, Custom); + setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal); - setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal); - if (Subtarget.hasVLX()) + if (Subtarget.hasVLX()) { + setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal); setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal); + } LegalizeAction Action = Subtarget.hasVLX() ? 
Legal : Custom; for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) { @@ -1532,35 +1573,25 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, addRegisterClass(MVT::v4i1, &X86::VK4RegClass); addRegisterClass(MVT::v2i1, &X86::VK2RegClass); - setOperationAction(ISD::ADD, MVT::v2i1, Expand); - setOperationAction(ISD::ADD, MVT::v4i1, Expand); - setOperationAction(ISD::SUB, MVT::v2i1, Expand); - setOperationAction(ISD::SUB, MVT::v4i1, Expand); - setOperationAction(ISD::MUL, MVT::v2i1, Expand); - setOperationAction(ISD::MUL, MVT::v4i1, Expand); - - setOperationAction(ISD::TRUNCATE, MVT::v2i1, Custom); - setOperationAction(ISD::TRUNCATE, MVT::v4i1, Custom); - setOperationAction(ISD::SETCC, MVT::v4i1, Custom); - setOperationAction(ISD::SETCC, MVT::v2i1, Custom); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i1, Custom); + for (auto VT : { MVT::v2i1, MVT::v4i1 }) { + setOperationAction(ISD::ADD, VT, Expand); + setOperationAction(ISD::SUB, VT, Expand); + setOperationAction(ISD::MUL, VT, Expand); + setOperationAction(ISD::VSELECT, VT, Expand); + + setOperationAction(ISD::TRUNCATE, VT, Custom); + setOperationAction(ISD::SETCC, VT, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); + setOperationAction(ISD::SELECT, VT, Custom); + setOperationAction(ISD::BUILD_VECTOR, VT, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); + } + setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i1, Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i1, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8i1, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4i1, Custom); - setOperationAction(ISD::SELECT, MVT::v4i1, Custom); - setOperationAction(ISD::SELECT, MVT::v2i1, Custom); - setOperationAction(ISD::BUILD_VECTOR, MVT::v4i1, Custom); - setOperationAction(ISD::BUILD_VECTOR, MVT::v2i1, Custom); - setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i1, Custom); - setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i1, Custom); - setOperationAction(ISD::VSELECT, MVT::v2i1, Expand); - setOperationAction(ISD::VSELECT, MVT::v4i1, Expand); - - for (auto VT : { MVT::v4i32, MVT::v8i32 }) { - setOperationAction(ISD::AND, VT, Legal); - setOperationAction(ISD::OR, VT, Legal); - setOperationAction(ISD::XOR, VT, Legal); - } for (auto VT : { MVT::v2i64, MVT::v4i64 }) { setOperationAction(ISD::SMAX, VT, Legal); @@ -1629,7 +1660,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // is. We should promote the value to 64-bits to solve this. // This is what the CRT headers do - `fmodf` is an inline header // function casting to f64 and calling `fmod`. 
- if (Subtarget.is32Bit() && Subtarget.isTargetKnownWindowsMSVC()) + if (Subtarget.is32Bit() && (Subtarget.isTargetKnownWindowsMSVC() || + Subtarget.isTargetWindowsItanium())) for (ISD::NodeType Op : {ISD::FCEIL, ISD::FCOS, ISD::FEXP, ISD::FFLOOR, ISD::FREM, ISD::FLOG, ISD::FLOG10, ISD::FPOW, ISD::FSIN}) @@ -1953,9 +1985,11 @@ X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI, case MVT::f32: case MVT::f64: case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64: case MVT::v4f32: case MVT::v2f64: - case MVT::v32i8: case MVT::v8i32: case MVT::v4i64: case MVT::v8f32: - case MVT::v4f64: - RRC = &X86::VR128RegClass; + case MVT::v32i8: case MVT::v16i16: case MVT::v8i32: case MVT::v4i64: + case MVT::v8f32: case MVT::v4f64: + case MVT::v64i8: case MVT::v32i16: case MVT::v16i32: case MVT::v8i64: + case MVT::v16f32: case MVT::v8f64: + RRC = &X86::VR128XRegClass; break; } return std::make_pair(RRC, Cost); @@ -2019,6 +2053,9 @@ Value *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const { } Value *X86TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const { + if (Subtarget.getTargetTriple().isOSContiki()) + return getDefaultSafeStackPointerLocation(IRB, false); + if (!Subtarget.isTargetAndroid()) return TargetLowering::getSafeStackPointerLocation(IRB); @@ -2062,6 +2099,58 @@ const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const { return ScratchRegs; } +/// Lowers masks values (v*i1) to the local register values +/// \returns DAG node after lowering to register type +static SDValue lowerMasksToReg(const SDValue &ValArg, const EVT &ValLoc, + const SDLoc &Dl, SelectionDAG &DAG) { + EVT ValVT = ValArg.getValueType(); + + if ((ValVT == MVT::v8i1 && (ValLoc == MVT::i8 || ValLoc == MVT::i32)) || + (ValVT == MVT::v16i1 && (ValLoc == MVT::i16 || ValLoc == MVT::i32))) { + // Two stage lowering might be required + // bitcast: v8i1 -> i8 / v16i1 -> i16 + // anyextend: i8 -> i32 / i16 -> i32 + EVT TempValLoc = ValVT == MVT::v8i1 ? 
MVT::i8 : MVT::i16; + SDValue ValToCopy = DAG.getBitcast(TempValLoc, ValArg); + if (ValLoc == MVT::i32) + ValToCopy = DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValToCopy); + return ValToCopy; + } else if ((ValVT == MVT::v32i1 && ValLoc == MVT::i32) || + (ValVT == MVT::v64i1 && ValLoc == MVT::i64)) { + // One stage lowering is required + // bitcast: v32i1 -> i32 / v64i1 -> i64 + return DAG.getBitcast(ValLoc, ValArg); + } else + return DAG.getNode(ISD::SIGN_EXTEND, Dl, ValLoc, ValArg); +} + +/// Breaks v64i1 value into two registers and adds the new node to the DAG +static void Passv64i1ArgInRegs( + const SDLoc &Dl, SelectionDAG &DAG, SDValue Chain, SDValue &Arg, + SmallVector<std::pair<unsigned, SDValue>, 8> &RegsToPass, CCValAssign &VA, + CCValAssign &NextVA, const X86Subtarget &Subtarget) { + assert((Subtarget.hasBWI() || Subtarget.hasBMI()) && + "Expected AVX512BW or AVX512BMI target!"); + assert(Subtarget.is32Bit() && "Expecting 32 bit target"); + assert(Arg.getValueType() == MVT::i64 && "Expecting 64 bit value"); + assert(VA.isRegLoc() && NextVA.isRegLoc() && + "The value should reside in two registers"); + + // Before splitting the value we cast it to i64 + Arg = DAG.getBitcast(MVT::i64, Arg); + + // Splitting the value into two i32 types + SDValue Lo, Hi; + Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg, + DAG.getConstant(0, Dl, MVT::i32)); + Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg, + DAG.getConstant(1, Dl, MVT::i32)); + + // Attach the two i32 types into corresponding registers + RegsToPass.push_back(std::make_pair(VA.getLocReg(), Lo)); + RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), Hi)); +} + SDValue X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, @@ -2086,10 +2175,11 @@ X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, MVT::i32)); // Copy the result values into the output registers. - for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) { - CCValAssign &VA = RVLocs[i]; + for (unsigned I = 0, OutsIndex = 0, E = RVLocs.size(); I != E; + ++I, ++OutsIndex) { + CCValAssign &VA = RVLocs[I]; assert(VA.isRegLoc() && "Can only return in registers!"); - SDValue ValToCopy = OutVals[i]; + SDValue ValToCopy = OutVals[OutsIndex]; EVT ValVT = ValToCopy.getValueType(); // Promote values to the appropriate types. 
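Aside, not part of the patch: Passv64i1ArgInRegs above (and getv64i1Argument further down in this file's diff) move a v64i1 mask between a single 64-bit value and a pair of 32-bit registers on 32-bit targets. A plain-integer sketch of the split and of the inverse join:

    #include <cstdint>
    #include <utility>

    // The low half travels in the first register (VA), the high half in the
    // second (NextVA); the receiving side bitcasts each half to v32i1 and
    // concatenates them back into a v64i1.
    static std::pair<uint32_t, uint32_t> splitMask64(uint64_t Mask) {
      return {static_cast<uint32_t>(Mask), static_cast<uint32_t>(Mask >> 32)};
    }

    static uint64_t joinMask64(uint32_t Lo, uint32_t Hi) {
      return (static_cast<uint64_t>(Hi) << 32) | Lo;
    }
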
@@ -2099,7 +2189,7 @@ X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy); else if (VA.getLocInfo() == CCValAssign::AExt) { if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1) - ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy); + ValToCopy = lowerMasksToReg(ValToCopy, VA.getLocVT(), dl, DAG); else ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy); } @@ -2152,9 +2242,27 @@ X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, } } - Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag); - Flag = Chain.getValue(1); - RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); + SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass; + + if (VA.needsCustom()) { + assert(VA.getValVT() == MVT::v64i1 && + "Currently the only custom case is when we split v64i1 to 2 regs"); + + Passv64i1ArgInRegs(dl, DAG, Chain, ValToCopy, RegsToPass, VA, RVLocs[++I], + Subtarget); + + assert(2 == RegsToPass.size() && + "Expecting two registers after Pass64BitArgInRegs"); + } else { + RegsToPass.push_back(std::make_pair(VA.getLocReg(), ValToCopy)); + } + + // Add nodes to the DAG and add the values into the RetOps list + for (auto &Reg : RegsToPass) { + Chain = DAG.getCopyToReg(Chain, dl, Reg.first, Reg.second, Flag); + Flag = Chain.getValue(1); + RetOps.push_back(DAG.getRegister(Reg.first, Reg.second.getValueType())); + } } // Swift calling convention does not require we copy the sret argument @@ -2282,6 +2390,98 @@ EVT X86TargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT, return VT.bitsLT(MinVT) ? MinVT : VT; } +/// Reads two 32 bit registers and creates a 64 bit mask value. +/// \param VA The current 32 bit value that need to be assigned. +/// \param NextVA The next 32 bit value that need to be assigned. +/// \param Root The parent DAG node. +/// \param [in,out] InFlag Represents SDvalue in the parent DAG node for +/// glue purposes. In the case the DAG is already using +/// physical register instead of virtual, we should glue +/// our new SDValue to InFlag SDvalue. +/// \return a new SDvalue of size 64bit. +static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA, + SDValue &Root, SelectionDAG &DAG, + const SDLoc &Dl, const X86Subtarget &Subtarget, + SDValue *InFlag = nullptr) { + assert((Subtarget.hasBWI()) && "Expected AVX512BW target!"); + assert(Subtarget.is32Bit() && "Expecting 32 bit target"); + assert(VA.getValVT() == MVT::v64i1 && + "Expecting first location of 64 bit width type"); + assert(NextVA.getValVT() == VA.getValVT() && + "The locations should have the same type"); + assert(VA.isRegLoc() && NextVA.isRegLoc() && + "The values should reside in two registers"); + + SDValue Lo, Hi; + unsigned Reg; + SDValue ArgValueLo, ArgValueHi; + + MachineFunction &MF = DAG.getMachineFunction(); + const TargetRegisterClass *RC = &X86::GR32RegClass; + + // Read a 32 bit value from the registers + if (nullptr == InFlag) { + // When no physical register is present, + // create an intermediate virtual register + Reg = MF.addLiveIn(VA.getLocReg(), RC); + ArgValueLo = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32); + Reg = MF.addLiveIn(NextVA.getLocReg(), RC); + ArgValueHi = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32); + } else { + // When a physical register is available read the value from it and glue + // the reads together. 
+ ArgValueLo = + DAG.getCopyFromReg(Root, Dl, VA.getLocReg(), MVT::i32, *InFlag); + *InFlag = ArgValueLo.getValue(2); + ArgValueHi = + DAG.getCopyFromReg(Root, Dl, NextVA.getLocReg(), MVT::i32, *InFlag); + *InFlag = ArgValueHi.getValue(2); + } + + // Convert the i32 type into v32i1 type + Lo = DAG.getBitcast(MVT::v32i1, ArgValueLo); + + // Convert the i32 type into v32i1 type + Hi = DAG.getBitcast(MVT::v32i1, ArgValueHi); + + // Concantenate the two values together + return DAG.getNode(ISD::CONCAT_VECTORS, Dl, MVT::v64i1, Lo, Hi); +} + +/// The function will lower a register of various sizes (8/16/32/64) +/// to a mask value of the expected size (v8i1/v16i1/v32i1/v64i1) +/// \returns a DAG node contains the operand after lowering to mask type. +static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT, + const EVT &ValLoc, const SDLoc &Dl, + SelectionDAG &DAG) { + SDValue ValReturned = ValArg; + + if (ValVT == MVT::v64i1) { + // In 32 bit machine, this case is handled by getv64i1Argument + assert(ValLoc == MVT::i64 && "Expecting only i64 locations"); + // In 64 bit machine, There is no need to truncate the value only bitcast + } else { + MVT maskLen; + switch (ValVT.getSimpleVT().SimpleTy) { + case MVT::v8i1: + maskLen = MVT::i8; + break; + case MVT::v16i1: + maskLen = MVT::i16; + break; + case MVT::v32i1: + maskLen = MVT::i32; + break; + default: + llvm_unreachable("Expecting a vector of i1 types"); + } + + ValReturned = DAG.getNode(ISD::TRUNCATE, Dl, maskLen, ValReturned); + } + + return DAG.getBitcast(ValVT, ValReturned); +} + /// Lower the result values of a call into the /// appropriate copies out of appropriate physical registers. /// @@ -2298,13 +2498,14 @@ SDValue X86TargetLowering::LowerCallResult( CCInfo.AnalyzeCallResult(Ins, RetCC_X86); // Copy all of the result registers out of their specified physreg. - for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) { - CCValAssign &VA = RVLocs[i]; + for (unsigned I = 0, InsIndex = 0, E = RVLocs.size(); I != E; + ++I, ++InsIndex) { + CCValAssign &VA = RVLocs[I]; EVT CopyVT = VA.getLocVT(); // If this is x86-64, and we disabled SSE, we can't return FP values if ((CopyVT == MVT::f32 || CopyVT == MVT::f64 || CopyVT == MVT::f128) && - ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget.hasSSE1())) { + ((Is64Bit || Ins[InsIndex].Flags.isInReg()) && !Subtarget.hasSSE1())) { report_fatal_error("SSE register return with SSE disabled"); } @@ -2319,19 +2520,34 @@ SDValue X86TargetLowering::LowerCallResult( RoundAfterCopy = (CopyVT != VA.getLocVT()); } - Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), - CopyVT, InFlag).getValue(1); - SDValue Val = Chain.getValue(0); + SDValue Val; + if (VA.needsCustom()) { + assert(VA.getValVT() == MVT::v64i1 && + "Currently the only custom case is when we split v64i1 to 2 regs"); + Val = + getv64i1Argument(VA, RVLocs[++I], Chain, DAG, dl, Subtarget, &InFlag); + } else { + Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), CopyVT, InFlag) + .getValue(1); + Val = Chain.getValue(0); + InFlag = Chain.getValue(2); + } if (RoundAfterCopy) Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val, // This truncation won't change the value. 
DAG.getIntPtrConstant(1, dl)); - if (VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1) - Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val); + if (VA.isExtInLoc() && (VA.getValVT().getScalarType() == MVT::i1)) { + if (VA.getValVT().isVector() && + ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) || + (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) { + // promoting a mask type (v*i1) into a register of type i64/i32/i16/i8 + Val = lowerRegToMasks(Val, VA.getValVT(), VA.getLocVT(), dl, DAG); + } else + Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val); + } - InFlag = Chain.getValue(2); InVals.push_back(Val); } @@ -2399,7 +2615,8 @@ static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst, /// Return true if the calling convention is one that we can guarantee TCO for. static bool canGuaranteeTCO(CallingConv::ID CC) { return (CC == CallingConv::Fast || CC == CallingConv::GHC || - CC == CallingConv::HiPE || CC == CallingConv::HHVM); + CC == CallingConv::X86_RegCall || CC == CallingConv::HiPE || + CC == CallingConv::HHVM); } /// Return true if we might ever do TCO for calls with this calling convention. @@ -2445,7 +2662,7 @@ X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv, const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl, SelectionDAG &DAG, const CCValAssign &VA, - MachineFrameInfo *MFI, unsigned i) const { + MachineFrameInfo &MFI, unsigned i) const { // Create the nodes corresponding to a load from this parameter slot. ISD::ArgFlagsTy Flags = Ins[i].Flags; bool AlwaysUseMutable = shouldGuaranteeTCO( @@ -2454,9 +2671,11 @@ X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv, EVT ValVT; // If value is passed by pointer we have address passed instead of the value - // itself. - bool ExtendedInMem = VA.isExtInLoc() && - VA.getValVT().getScalarType() == MVT::i1; + // itself. No need to extend if the mask value and location share the same + // absolute size. + bool ExtendedInMem = + VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1 && + VA.getValVT().getSizeInBits() != VA.getLocVT().getSizeInBits(); if (VA.getLocInfo() == CCValAssign::Indirect || ExtendedInMem) ValVT = VA.getLocVT(); @@ -2483,26 +2702,26 @@ X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv, if (Flags.isByVal()) { unsigned Bytes = Flags.getByValSize(); if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects. - int FI = MFI->CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable); + int FI = MFI.CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable); // Adjust SP offset of interrupt parameter. if (CallConv == CallingConv::X86_INTR) { - MFI->setObjectOffset(FI, Offset); + MFI.setObjectOffset(FI, Offset); } return DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); } else { - int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8, - VA.getLocMemOffset(), isImmutable); + int FI = MFI.CreateFixedObject(ValVT.getSizeInBits()/8, + VA.getLocMemOffset(), isImmutable); // Set SExt or ZExt flag. if (VA.getLocInfo() == CCValAssign::ZExt) { - MFI->setObjectZExt(FI, true); + MFI.setObjectZExt(FI, true); } else if (VA.getLocInfo() == CCValAssign::SExt) { - MFI->setObjectSExt(FI, true); + MFI.setObjectSExt(FI, true); } // Adjust SP offset of interrupt parameter. 
if (CallConv == CallingConv::X86_INTR) { - MFI->setObjectOffset(FI, Offset); + MFI.setObjectOffset(FI, Offset); } SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); @@ -2562,6 +2781,13 @@ static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF, return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit)); } +static bool isSortedByValueNo(const SmallVectorImpl<CCValAssign> &ArgLocs) { + return std::is_sorted(ArgLocs.begin(), ArgLocs.end(), + [](const CCValAssign &A, const CCValAssign &B) -> bool { + return A.getValNo() < B.getValNo(); + }); +} + SDValue X86TargetLowering::LowerFormalArguments( SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl, @@ -2576,12 +2802,13 @@ SDValue X86TargetLowering::LowerFormalArguments( Fn->getName() == "main") FuncInfo->setForceFramePointer(true); - MachineFrameInfo *MFI = MF.getFrameInfo(); + MachineFrameInfo &MFI = MF.getFrameInfo(); bool Is64Bit = Subtarget.is64Bit(); bool IsWin64 = Subtarget.isCallingConvWin64(CallConv); - assert(!(isVarArg && canGuaranteeTCO(CallConv)) && - "Var args not supported with calling convention fastcc, ghc or hipe"); + assert( + !(isVarArg && canGuaranteeTCO(CallConv)) && + "Var args not supported with calling conv' regcall, fastcc, ghc or hipe"); if (CallConv == CallingConv::X86_INTR) { bool isLegal = Ins.size() == 1 || @@ -2595,59 +2822,78 @@ SDValue X86TargetLowering::LowerFormalArguments( SmallVector<CCValAssign, 16> ArgLocs; CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext()); - // Allocate shadow area for Win64 + // Allocate shadow area for Win64. if (IsWin64) CCInfo.AllocateStack(32, 8); - CCInfo.AnalyzeFormalArguments(Ins, CC_X86); + CCInfo.AnalyzeArguments(Ins, CC_X86); + + // In vectorcall calling convention a second pass is required for the HVA + // types. + if (CallingConv::X86_VectorCall == CallConv) { + CCInfo.AnalyzeArgumentsSecondPass(Ins, CC_X86); + } + + // The next loop assumes that the locations are in the same order of the + // input arguments. + if (!isSortedByValueNo(ArgLocs)) + llvm_unreachable("Argument Location list must be sorted before lowering"); - unsigned LastVal = ~0U; SDValue ArgValue; - for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { - CCValAssign &VA = ArgLocs[i]; - // TODO: If an arg is passed in two places (e.g. reg and stack), skip later - // places. 
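Aside, not part of the patch: the argument-lowering loops added in this hunk assume the CCValAssign locations arrive ordered by original value number, which is what isSortedByValueNo above verifies. A self-contained analogue using std::is_sorted:

    #include <algorithm>
    #include <vector>

    struct Loc { unsigned ValNo; };  // stand-in for CCValAssign::getValNo()

    static bool sortedByValNo(const std::vector<Loc> &Locs) {
      return std::is_sorted(Locs.begin(), Locs.end(),
                            [](const Loc &A, const Loc &B) { return A.ValNo < B.ValNo; });
    }
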
- assert(VA.getValNo() != LastVal && - "Don't support value assigned to multiple locs yet"); - (void)LastVal; - LastVal = VA.getValNo(); + for (unsigned I = 0, InsIndex = 0, E = ArgLocs.size(); I != E; + ++I, ++InsIndex) { + assert(InsIndex < Ins.size() && "Invalid Ins index"); + CCValAssign &VA = ArgLocs[I]; if (VA.isRegLoc()) { EVT RegVT = VA.getLocVT(); - const TargetRegisterClass *RC; - if (RegVT == MVT::i32) - RC = &X86::GR32RegClass; - else if (Is64Bit && RegVT == MVT::i64) - RC = &X86::GR64RegClass; - else if (RegVT == MVT::f32) - RC = &X86::FR32RegClass; - else if (RegVT == MVT::f64) - RC = &X86::FR64RegClass; - else if (RegVT == MVT::f128) - RC = &X86::FR128RegClass; - else if (RegVT.is512BitVector()) - RC = &X86::VR512RegClass; - else if (RegVT.is256BitVector()) - RC = &X86::VR256RegClass; - else if (RegVT.is128BitVector()) - RC = &X86::VR128RegClass; - else if (RegVT == MVT::x86mmx) - RC = &X86::VR64RegClass; - else if (RegVT == MVT::i1) - RC = &X86::VK1RegClass; - else if (RegVT == MVT::v8i1) - RC = &X86::VK8RegClass; - else if (RegVT == MVT::v16i1) - RC = &X86::VK16RegClass; - else if (RegVT == MVT::v32i1) - RC = &X86::VK32RegClass; - else if (RegVT == MVT::v64i1) - RC = &X86::VK64RegClass; - else - llvm_unreachable("Unknown argument type!"); + if (VA.needsCustom()) { + assert( + VA.getValVT() == MVT::v64i1 && + "Currently the only custom case is when we split v64i1 to 2 regs"); + + // v64i1 values, in regcall calling convention, that are + // compiled to 32 bit arch, are splited up into two registers. + ArgValue = + getv64i1Argument(VA, ArgLocs[++I], Chain, DAG, dl, Subtarget); + } else { + const TargetRegisterClass *RC; + if (RegVT == MVT::i32) + RC = &X86::GR32RegClass; + else if (Is64Bit && RegVT == MVT::i64) + RC = &X86::GR64RegClass; + else if (RegVT == MVT::f32) + RC = Subtarget.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass; + else if (RegVT == MVT::f64) + RC = Subtarget.hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass; + else if (RegVT == MVT::f80) + RC = &X86::RFP80RegClass; + else if (RegVT == MVT::f128) + RC = &X86::FR128RegClass; + else if (RegVT.is512BitVector()) + RC = &X86::VR512RegClass; + else if (RegVT.is256BitVector()) + RC = Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass; + else if (RegVT.is128BitVector()) + RC = Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass; + else if (RegVT == MVT::x86mmx) + RC = &X86::VR64RegClass; + else if (RegVT == MVT::i1) + RC = &X86::VK1RegClass; + else if (RegVT == MVT::v8i1) + RC = &X86::VK8RegClass; + else if (RegVT == MVT::v16i1) + RC = &X86::VK16RegClass; + else if (RegVT == MVT::v32i1) + RC = &X86::VK32RegClass; + else if (RegVT == MVT::v64i1) + RC = &X86::VK64RegClass; + else + llvm_unreachable("Unknown argument type!"); - unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); - ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT); + unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); + ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT); + } // If this is an 8 or 16-bit value, it is really passed promoted to 32 // bits. Insert an assert[sz]ext to capture this, then truncate to the @@ -2665,12 +2911,19 @@ SDValue X86TargetLowering::LowerFormalArguments( // Handle MMX values passed in XMM regs. 
if (RegVT.isVector() && VA.getValVT().getScalarType() != MVT::i1) ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue); - else + else if (VA.getValVT().isVector() && + VA.getValVT().getScalarType() == MVT::i1 && + ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) || + (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) { + // Promoting a mask type (v*i1) into a register of type i64/i32/i16/i8 + ArgValue = lowerRegToMasks(ArgValue, VA.getValVT(), RegVT, dl, DAG); + } else ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue); } } else { assert(VA.isMemLoc()); - ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, i); + ArgValue = + LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, InsIndex); } // If value is passed via pointer - do a load. @@ -2681,7 +2934,7 @@ SDValue X86TargetLowering::LowerFormalArguments( InVals.push_back(ArgValue); } - for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { + for (unsigned I = 0, E = Ins.size(); I != E; ++I) { // Swift calling convention does not require we copy the sret argument // into %rax/%eax for the return. We don't set SRetReturnReg for Swift. if (CallConv == CallingConv::Swift) @@ -2691,14 +2944,14 @@ SDValue X86TargetLowering::LowerFormalArguments( // sret argument into %rax/%eax (depending on ABI) for the return. Save // the argument into a virtual register so that we can access it from the // return points. - if (Ins[i].Flags.isSRet()) { + if (Ins[I].Flags.isSRet()) { unsigned Reg = FuncInfo->getSRetReturnReg(); if (!Reg) { MVT PtrTy = getPointerTy(DAG.getDataLayout()); Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy)); FuncInfo->setSRetReturnReg(Reg); } - SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[i]); + SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[I]); Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain); break; } @@ -2713,11 +2966,10 @@ SDValue X86TargetLowering::LowerFormalArguments( // If the function takes variable number of arguments, make a frame index for // the start of the first vararg value... for expansion of llvm.va_start. We // can skip this if there are no va_start calls. - if (MFI->hasVAStart() && + if (MFI.hasVAStart() && (Is64Bit || (CallConv != CallingConv::X86_FastCall && CallConv != CallingConv::X86_ThisCall))) { - FuncInfo->setVarArgsFrameIndex( - MFI->CreateFixedObject(1, StackSize, true)); + FuncInfo->setVarArgsFrameIndex(MFI.CreateFixedObject(1, StackSize, true)); } // Figure out if XMM registers are in use. @@ -2727,7 +2979,7 @@ SDValue X86TargetLowering::LowerFormalArguments( // 64-bit calling conventions support varargs and register parameters, so we // have to do extra work to spill them in the prologue. - if (Is64Bit && isVarArg && MFI->hasVAStart()) { + if (Is64Bit && isVarArg && MFI.hasVAStart()) { // Find the first unallocated argument registers. ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget); ArrayRef<MCPhysReg> ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget); @@ -2760,7 +3012,7 @@ SDValue X86TargetLowering::LowerFormalArguments( // for the return address. int HomeOffset = TFI.getOffsetOfLocalArea() + 8; FuncInfo->setRegSaveFrameIndex( - MFI->CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false)); + MFI.CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false)); // Fixup to set vararg frame on shadow area (4 x i64). 
if (NumIntRegs < 4) FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex()); @@ -2770,7 +3022,7 @@ SDValue X86TargetLowering::LowerFormalArguments( // they may be loaded by dereferencing the result of va_next. FuncInfo->setVarArgsGPOffset(NumIntRegs * 8); FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16); - FuncInfo->setRegSaveFrameIndex(MFI->CreateStackObject( + FuncInfo->setRegSaveFrameIndex(MFI.CreateStackObject( ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false)); } @@ -2810,7 +3062,7 @@ SDValue X86TargetLowering::LowerFormalArguments( Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps); } - if (isVarArg && MFI->hasMustTailInVarArgFunc()) { + if (isVarArg && MFI.hasMustTailInVarArgFunc()) { // Find the largest legal vector type. MVT VecVT = MVT::Other; // FIXME: Only some x86_32 calling conventions support AVX512. @@ -2889,7 +3141,7 @@ SDValue X86TargetLowering::LowerFormalArguments( // same, so the size of funclets' (mostly empty) frames is dictated by // how far this slot is from the bottom (since they allocate just enough // space to accommodate holding this slot at the correct offset). - int PSPSymFI = MFI->CreateStackObject(8, 8, /*isSS=*/false); + int PSPSymFI = MFI.CreateStackObject(8, 8, /*isSS=*/false); EHInfo->PSPSymFrameIdx = PSPSymFI; } } @@ -2938,7 +3190,7 @@ static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF, if (!FPDiff) return Chain; // Calculate the new stack slot for the return address. int NewReturnAddrFI = - MF.getFrameInfo()->CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize, + MF.getFrameInfo().CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize, false); SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT); Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx, @@ -3029,11 +3281,17 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SmallVector<CCValAssign, 16> ArgLocs; CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext()); - // Allocate shadow area for Win64 + // Allocate shadow area for Win64. if (IsWin64) CCInfo.AllocateStack(32, 8); - CCInfo.AnalyzeCallOperands(Outs, CC_X86); + CCInfo.AnalyzeArguments(Outs, CC_X86); + + // In vectorcall calling convention a second pass is required for the HVA + // types. + if (CallingConv::X86_VectorCall == CallConv) { + CCInfo.AnalyzeArgumentsSecondPass(Outs, CC_X86); + } // Get a count of how many bytes are to be pushed on the stack. unsigned NumBytes = CCInfo.getAlignedCallFrameSize(); @@ -3088,18 +3346,25 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SmallVector<SDValue, 8> MemOpChains; SDValue StackPtr; + // The next loop assumes that the locations are in the same order of the + // input arguments. + if (!isSortedByValueNo(ArgLocs)) + llvm_unreachable("Argument Location list must be sorted before lowering"); + // Walk the register/memloc assignments, inserting copies/loads. In the case // of tail call optimization arguments are handle later. const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); - for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { + for (unsigned I = 0, OutIndex = 0, E = ArgLocs.size(); I != E; + ++I, ++OutIndex) { + assert(OutIndex < Outs.size() && "Invalid Out index"); // Skip inalloca arguments, they have already been written. 
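Aside, not part of the patch: the varargs code above sizes the 64-bit register save area and records where the next free GPR and XMM slots sit. The arithmetic, pulled out as a standalone sketch (struct and helper names are invented for illustration):

    // GPR slots are 8 bytes and XMM slots 16 bytes; these expressions mirror
    // setVarArgsGPOffset / setVarArgsFPOffset and the CreateStackObject size.
    struct SaveAreaLayout {
      unsigned GPOffset;   // offset of the first unused GPR slot
      unsigned FPOffset;   // offset of the first unused XMM slot
      unsigned TotalSize;  // bytes reserved for the whole save area
    };

    static SaveAreaLayout layoutSaveArea(unsigned NumGPRs, unsigned NumXMMs,
                                         unsigned UsedGPRs, unsigned UsedXMMs) {
      return {UsedGPRs * 8, NumGPRs * 8 + UsedXMMs * 16,
              NumGPRs * 8 + NumXMMs * 16};
    }
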
- ISD::ArgFlagsTy Flags = Outs[i].Flags; + ISD::ArgFlagsTy Flags = Outs[OutIndex].Flags; if (Flags.isInAlloca()) continue; - CCValAssign &VA = ArgLocs[i]; + CCValAssign &VA = ArgLocs[I]; EVT RegVT = VA.getLocVT(); - SDValue Arg = OutVals[i]; + SDValue Arg = OutVals[OutIndex]; bool isByVal = Flags.isByVal(); // Promote the value if needed. @@ -3115,7 +3380,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, case CCValAssign::AExt: if (Arg.getValueType().isVector() && Arg.getValueType().getVectorElementType() == MVT::i1) - Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg); + Arg = lowerMasksToReg(Arg, RegVT, dl, DAG); else if (RegVT.is128BitVector()) { // Special case: passing MMX values in XMM registers. Arg = DAG.getBitcast(MVT::i64, Arg); @@ -3139,7 +3404,13 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, } } - if (VA.isRegLoc()) { + if (VA.needsCustom()) { + assert(VA.getValVT() == MVT::v64i1 && + "Currently the only custom case is when we split v64i1 to 2 regs"); + // Split v64i1 value into two registers + Passv64i1ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++I], + Subtarget); + } else if (VA.isRegLoc()) { RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); if (isVarArg && IsWin64) { // Win64 ABI requires argument XMM reg to be copied to the corresponding @@ -3239,20 +3510,32 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SmallVector<SDValue, 8> MemOpChains2; SDValue FIN; int FI = 0; - for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { - CCValAssign &VA = ArgLocs[i]; - if (VA.isRegLoc()) + for (unsigned I = 0, OutsIndex = 0, E = ArgLocs.size(); I != E; + ++I, ++OutsIndex) { + CCValAssign &VA = ArgLocs[I]; + + if (VA.isRegLoc()) { + if (VA.needsCustom()) { + assert((CallConv == CallingConv::X86_RegCall) && + "Expecting custome case only in regcall calling convention"); + // This means that we are in special case where one argument was + // passed through two register locations - Skip the next location + ++I; + } + continue; + } + assert(VA.isMemLoc()); - SDValue Arg = OutVals[i]; - ISD::ArgFlagsTy Flags = Outs[i].Flags; + SDValue Arg = OutVals[OutsIndex]; + ISD::ArgFlagsTy Flags = Outs[OutsIndex].Flags; // Skip inalloca arguments. They don't require any work. if (Flags.isInAlloca()) continue; // Create frame index. int32_t Offset = VA.getLocMemOffset()+FPDiff; uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8; - FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true); + FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true); FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); if (Flags.isByVal()) { @@ -3391,7 +3674,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // This isn't right, although it's probably harmless on x86; liveouts // should be computed from returns not tail calls. Consider a void // function making a tail call to a function returning int. - MF.getFrameInfo()->setHasTailCall(); + MF.getFrameInfo().setHasTailCall(); return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops); } @@ -3493,9 +3776,9 @@ X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize, /// same position (relatively) of the caller's incoming argument stack. 
static bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags, - MachineFrameInfo *MFI, const MachineRegisterInfo *MRI, + MachineFrameInfo &MFI, const MachineRegisterInfo *MRI, const X86InstrInfo *TII, const CCValAssign &VA) { - unsigned Bytes = Arg.getValueType().getSizeInBits() / 8; + unsigned Bytes = Arg.getValueSizeInBits() / 8; for (;;) { // Look through nodes that don't alter the bits of the incoming value. @@ -3558,22 +3841,22 @@ bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags, return false; assert(FI != INT_MAX); - if (!MFI->isFixedObjectIndex(FI)) + if (!MFI.isFixedObjectIndex(FI)) return false; - if (Offset != MFI->getObjectOffset(FI)) + if (Offset != MFI.getObjectOffset(FI)) return false; - if (VA.getLocVT().getSizeInBits() > Arg.getValueType().getSizeInBits()) { + if (VA.getLocVT().getSizeInBits() > Arg.getValueSizeInBits()) { // If the argument location is wider than the argument type, check that any // extension flags match. - if (Flags.isZExt() != MFI->isObjectZExt(FI) || - Flags.isSExt() != MFI->isObjectSExt(FI)) { + if (Flags.isZExt() != MFI.isObjectZExt(FI) || + Flags.isSExt() != MFI.isObjectSExt(FI)) { return false; } } - return Bytes == MFI->getObjectSize(FI); + return Bytes == MFI.getObjectSize(FI); } /// Check whether the call is eligible for tail call optimization. Targets @@ -3700,7 +3983,7 @@ bool X86TargetLowering::IsEligibleForTailCallOptimization( if (CCInfo.getNextStackOffset()) { // Check if the arguments are already laid out in the right way as // the caller's fixed stack objects. - MachineFrameInfo *MFI = MF.getFrameInfo(); + MachineFrameInfo &MFI = MF.getFrameInfo(); const MachineRegisterInfo *MRI = &MF.getRegInfo(); const X86InstrInfo *TII = Subtarget.getInstrInfo(); for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { @@ -3787,6 +4070,14 @@ static bool MayFoldIntoStore(SDValue Op) { return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin()); } +static bool MayFoldIntoZeroExtend(SDValue Op) { + if (Op.hasOneUse()) { + unsigned Opcode = Op.getNode()->use_begin()->getOpcode(); + return (ISD::ZERO_EXTEND == Opcode); + } + return false; +} + static bool isTargetShuffle(unsigned Opcode) { switch(Opcode) { default: return false; @@ -3821,6 +4112,7 @@ static bool isTargetShuffle(unsigned Opcode) { case X86ISD::VPPERM: case X86ISD::VPERMV: case X86ISD::VPERMV3: + case X86ISD::VPERMIV3: case X86ISD::VZEXT_MOVL: return true; } @@ -3829,41 +4121,18 @@ static bool isTargetShuffle(unsigned Opcode) { static bool isTargetShuffleVariableMask(unsigned Opcode) { switch (Opcode) { default: return false; + // Target Shuffles. case X86ISD::PSHUFB: case X86ISD::VPERMILPV: + case X86ISD::VPERMIL2: + case X86ISD::VPPERM: + case X86ISD::VPERMV: + case X86ISD::VPERMV3: + case X86ISD::VPERMIV3: + return true; + // 'Faux' Target Shuffles. 
+ case ISD::AND: return true; - } -} - -static SDValue getTargetShuffleNode(unsigned Opc, const SDLoc &dl, MVT VT, - SDValue V1, unsigned TargetMask, - SelectionDAG &DAG) { - switch(Opc) { - default: llvm_unreachable("Unknown x86 shuffle node"); - case X86ISD::PSHUFD: - case X86ISD::PSHUFHW: - case X86ISD::PSHUFLW: - case X86ISD::VPERMILPI: - case X86ISD::VPERMI: - return DAG.getNode(Opc, dl, VT, V1, - DAG.getConstant(TargetMask, dl, MVT::i8)); - } -} - -static SDValue getTargetShuffleNode(unsigned Opc, const SDLoc &dl, MVT VT, - SDValue V1, SDValue V2, SelectionDAG &DAG) { - switch(Opc) { - default: llvm_unreachable("Unknown x86 shuffle node"); - case X86ISD::MOVLHPS: - case X86ISD::MOVLHPD: - case X86ISD::MOVHLPS: - case X86ISD::MOVLPS: - case X86ISD::MOVLPD: - case X86ISD::MOVSS: - case X86ISD::MOVSD: - case X86ISD::UNPCKL: - case X86ISD::UNPCKH: - return DAG.getNode(Opc, dl, VT, V1, V2); } } @@ -3876,9 +4145,9 @@ SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const { if (ReturnAddrIndex == 0) { // Set up a frame object for the return address. unsigned SlotSize = RegInfo->getSlotSize(); - ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize, - -(int64_t)SlotSize, - false); + ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject(SlotSize, + -(int64_t)SlotSize, + false); FuncInfo->setRAIndex(ReturnAddrIndex); } @@ -3974,7 +4243,7 @@ static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) { /// Do a one-to-one translation of a ISD::CondCode to the X86-specific /// condition code, returning the condition code and the LHS/RHS of the /// comparison to make. -static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL, +static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL, bool isFP, SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) { if (!isFP) { @@ -4175,6 +4444,10 @@ bool X86TargetLowering::isCheapToSpeculateCtlz() const { return Subtarget.hasLZCNT(); } +bool X86TargetLowering::isCtlzFast() const { + return Subtarget.hasFastLZCNT(); +} + bool X86TargetLowering::hasAndNotCompare(SDValue Y) const { if (!Subtarget.hasBMI()) return false; @@ -4187,11 +4460,21 @@ bool X86TargetLowering::hasAndNotCompare(SDValue Y) const { return true; } +/// Val is the undef sentinel value or equal to the specified value. +static bool isUndefOrEqual(int Val, int CmpVal) { + return ((Val == SM_SentinelUndef) || (Val == CmpVal)); +} + +/// Val is either the undef or zero sentinel value. +static bool isUndefOrZero(int Val) { + return ((Val == SM_SentinelUndef) || (Val == SM_SentinelZero)); +} + /// Return true if every element in Mask, beginning -/// from position Pos and ending in Pos+Size is undef. +/// from position Pos and ending in Pos+Size is the undef sentinel value. static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) { for (unsigned i = Pos, e = Pos + Size; i != e; ++i) - if (0 <= Mask[i]) + if (Mask[i] != SM_SentinelUndef) return false; return true; } @@ -4199,7 +4482,7 @@ static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) { /// Return true if Val is undef or if its value falls within the /// specified range (L, H]. 
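The isUndefOrEqual/isUndefOrZero/isUndefInRange helpers above operate on shuffle masks whose lanes may carry the SM_SentinelUndef or SM_SentinelZero markers instead of a real index. A self-contained sketch of the same predicates over plain ints; the -1/-2 sentinel encoding mirrors the X86 shuffle-decode utilities and is an assumption of this sketch, as are the helper names:

    #include <vector>

    // Assumed sentinel encoding: -1 = undef lane, -2 = zeroed lane.
    enum { SentinelUndef = -1, SentinelZero = -2 };

    static bool isUndefOrEqualTo(int Val, int CmpVal) {
      return Val == SentinelUndef || Val == CmpVal;
    }

    static bool isUndefOrZeroVal(int Val) {
      return Val == SentinelUndef || Val == SentinelZero;
    }

    // True if every mask element in [Pos, Pos+Size) is the undef sentinel.
    static bool allUndefInRange(const std::vector<int> &Mask, unsigned Pos,
                                unsigned Size) {
      for (unsigned i = Pos, e = Pos + Size; i != e; ++i)
        if (Mask[i] != SentinelUndef)
          return false;
      return true;
    }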
static bool isUndefOrInRange(int Val, int Low, int Hi) { - return (Val < 0) || (Val >= Low && Val < Hi); + return (Val == SM_SentinelUndef) || (Val >= Low && Val < Hi); } /// Return true if every element in Mask is undef or if its value @@ -4212,14 +4495,19 @@ static bool isUndefOrInRange(ArrayRef<int> Mask, return true; } -/// Val is either less than zero (undef) or equal to the specified value. -static bool isUndefOrEqual(int Val, int CmpVal) { - return (Val < 0 || Val == CmpVal); +/// Return true if Val is undef, zero or if its value falls within the +/// specified range (L, H]. +static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) { + return isUndefOrZero(Val) || (Val >= Low && Val < Hi); } -/// Val is either the undef or zero sentinel value. -static bool isUndefOrZero(int Val) { - return (Val == SM_SentinelUndef || Val == SM_SentinelZero); +/// Return true if every element in Mask is undef, zero or if its value +/// falls within the specified range (L, H]. +static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) { + for (int M : Mask) + if (!isUndefOrZeroOrInRange(M, Low, Hi)) + return false; + return true; } /// Return true if every element in Mask, beginning @@ -4244,6 +4532,100 @@ static bool isSequentialOrUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos, return true; } +/// Return true if every element in Mask, beginning +/// from position Pos and ending in Pos+Size is undef or is zero. +static bool isUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos, + unsigned Size) { + for (unsigned i = Pos, e = Pos + Size; i != e; ++i) + if (!isUndefOrZero(Mask[i])) + return false; + return true; +} + +/// \brief Helper function to test whether a shuffle mask could be +/// simplified by widening the elements being shuffled. +/// +/// Appends the mask for wider elements in WidenedMask if valid. Otherwise +/// leaves it in an unspecified state. +/// +/// NOTE: This must handle normal vector shuffle masks and *target* vector +/// shuffle masks. The latter have the special property of a '-2' representing +/// a zero-ed lane of a vector. +static bool canWidenShuffleElements(ArrayRef<int> Mask, + SmallVectorImpl<int> &WidenedMask) { + WidenedMask.assign(Mask.size() / 2, 0); + for (int i = 0, Size = Mask.size(); i < Size; i += 2) { + // If both elements are undef, its trivial. + if (Mask[i] == SM_SentinelUndef && Mask[i + 1] == SM_SentinelUndef) { + WidenedMask[i / 2] = SM_SentinelUndef; + continue; + } + + // Check for an undef mask and a mask value properly aligned to fit with + // a pair of values. If we find such a case, use the non-undef mask's value. + if (Mask[i] == SM_SentinelUndef && Mask[i + 1] >= 0 && + Mask[i + 1] % 2 == 1) { + WidenedMask[i / 2] = Mask[i + 1] / 2; + continue; + } + if (Mask[i + 1] == SM_SentinelUndef && Mask[i] >= 0 && Mask[i] % 2 == 0) { + WidenedMask[i / 2] = Mask[i] / 2; + continue; + } + + // When zeroing, we need to spread the zeroing across both lanes to widen. + if (Mask[i] == SM_SentinelZero || Mask[i + 1] == SM_SentinelZero) { + if ((Mask[i] == SM_SentinelZero || Mask[i] == SM_SentinelUndef) && + (Mask[i + 1] == SM_SentinelZero || Mask[i + 1] == SM_SentinelUndef)) { + WidenedMask[i / 2] = SM_SentinelZero; + continue; + } + return false; + } + + // Finally check if the two mask values are adjacent and aligned with + // a pair. 
+ if (Mask[i] != SM_SentinelUndef && Mask[i] % 2 == 0 && + Mask[i] + 1 == Mask[i + 1]) { + WidenedMask[i / 2] = Mask[i] / 2; + continue; + } + + // Otherwise we can't safely widen the elements used in this shuffle. + return false; + } + assert(WidenedMask.size() == Mask.size() / 2 && + "Incorrect size of mask after widening the elements!"); + + return true; +} + +/// Helper function to scale a shuffle or target shuffle mask, replacing each +/// mask index with the scaled sequential indices for an equivalent narrowed +/// mask. This is the reverse process to canWidenShuffleElements, but can always +/// succeed. +static void scaleShuffleMask(int Scale, ArrayRef<int> Mask, + SmallVectorImpl<int> &ScaledMask) { + assert(0 < Scale && "Unexpected scaling factor"); + int NumElts = Mask.size(); + ScaledMask.assign(NumElts * Scale, -1); + + for (int i = 0; i != NumElts; ++i) { + int M = Mask[i]; + + // Repeat sentinel values in every mask element. + if (M < 0) { + for (int s = 0; s != Scale; ++s) + ScaledMask[(Scale * i) + s] = M; + continue; + } + + // Scale mask element and increment across each mask element. + for (int s = 0; s != Scale; ++s) + ScaledMask[(Scale * i) + s] = (Scale * M) + s; + } +} + /// Return true if the specified EXTRACT_SUBVECTOR operand specifies a vector /// extract that is suitable for instruction that extract 128 or 256 bit vectors static bool isVEXTRACTIndex(SDNode *N, unsigned vecWidth) { @@ -4256,7 +4638,7 @@ static bool isVEXTRACTIndex(SDNode *N, unsigned vecWidth) { cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue(); MVT VT = N->getSimpleValueType(0); - unsigned ElSize = VT.getVectorElementType().getSizeInBits(); + unsigned ElSize = VT.getScalarSizeInBits(); bool Result = (Index * ElSize) % vecWidth == 0; return Result; @@ -4274,7 +4656,7 @@ static bool isVINSERTIndex(SDNode *N, unsigned vecWidth) { cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue(); MVT VT = N->getSimpleValueType(0); - unsigned ElSize = VT.getVectorElementType().getSizeInBits(); + unsigned ElSize = VT.getScalarSizeInBits(); bool Result = (Index * ElSize) % vecWidth == 0; return Result; @@ -4388,6 +4770,46 @@ static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG, return ConstsNode; } +static SDValue getConstVector(ArrayRef<APInt> Bits, SmallBitVector &Undefs, + MVT VT, SelectionDAG &DAG, const SDLoc &dl) { + assert(Bits.size() == Undefs.size() && "Unequal constant and undef arrays"); + SmallVector<SDValue, 32> Ops; + bool Split = false; + + MVT ConstVecVT = VT; + unsigned NumElts = VT.getVectorNumElements(); + bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64); + if (!In64BitMode && VT.getVectorElementType() == MVT::i64) { + ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2); + Split = true; + } + + MVT EltVT = ConstVecVT.getVectorElementType(); + for (unsigned i = 0, e = Bits.size(); i != e; ++i) { + if (Undefs[i]) { + Ops.append(Split ? 
2 : 1, DAG.getUNDEF(EltVT)); + continue; + } + const APInt &V = Bits[i]; + assert(V.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes"); + if (Split) { + Ops.push_back(DAG.getConstant(V.trunc(32), dl, EltVT)); + Ops.push_back(DAG.getConstant(V.lshr(32).trunc(32), dl, EltVT)); + } else if (EltVT == MVT::f32) { + APFloat FV(APFloat::IEEEsingle(), V); + Ops.push_back(DAG.getConstantFP(FV, dl, EltVT)); + } else if (EltVT == MVT::f64) { + APFloat FV(APFloat::IEEEdouble(), V); + Ops.push_back(DAG.getConstantFP(FV, dl, EltVT)); + } else { + Ops.push_back(DAG.getConstant(V, dl, EltVT)); + } + } + + SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops); + return DAG.getBitcast(VT, ConstsNode); +} + /// Returns a vector of specified type with all zero elements. static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget, SelectionDAG &DAG, const SDLoc &dl) { @@ -4416,8 +4838,6 @@ static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget, static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &dl, unsigned vectorWidth) { - assert((vectorWidth == 128 || vectorWidth == 256) && - "Unsupported vector width"); EVT VT = Vec.getValueType(); EVT ElVT = VT.getVectorElementType(); unsigned Factor = VT.getSizeInBits()/vectorWidth; @@ -4438,8 +4858,8 @@ static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, // If the input is a buildvector just emit a smaller one. if (Vec.getOpcode() == ISD::BUILD_VECTOR) - return DAG.getNode(ISD::BUILD_VECTOR, - dl, ResultVT, makeArrayRef(Vec->op_begin() + IdxVal, ElemsPerChunk)); + return DAG.getNode(ISD::BUILD_VECTOR, dl, ResultVT, + makeArrayRef(Vec->op_begin() + IdxVal, ElemsPerChunk)); SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx); @@ -4694,29 +5114,35 @@ static SDValue getOnesVector(EVT VT, const X86Subtarget &Subtarget, return DAG.getBitcast(VT, Vec); } +/// Generate unpacklo/unpackhi shuffle mask. +static void createUnpackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask, bool Lo, + bool Unary) { + assert(Mask.empty() && "Expected an empty shuffle mask vector"); + int NumElts = VT.getVectorNumElements(); + int NumEltsInLane = 128 / VT.getScalarSizeInBits(); + + for (int i = 0; i < NumElts; ++i) { + unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane; + int Pos = (i % NumEltsInLane) / 2 + LaneStart; + Pos += (Unary ? 0 : NumElts * (i % 2)); + Pos += (Lo ? 0 : NumEltsInLane / 2); + Mask.push_back(Pos); + } +} + /// Returns a vector_shuffle node for an unpackl operation. static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, MVT VT, SDValue V1, SDValue V2) { - assert(VT.is128BitVector() && "Expected a 128-bit vector type"); - unsigned NumElems = VT.getVectorNumElements(); - SmallVector<int, 8> Mask(NumElems); - for (unsigned i = 0, e = NumElems/2; i != e; ++i) { - Mask[i * 2] = i; - Mask[i * 2 + 1] = i + NumElems; - } + SmallVector<int, 8> Mask; + createUnpackShuffleMask(VT, Mask, /* Lo = */ true, /* Unary = */ false); return DAG.getVectorShuffle(VT, dl, V1, V2, Mask); } /// Returns a vector_shuffle node for an unpackh operation. 
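canWidenShuffleElements, scaleShuffleMask and the new createUnpackShuffleMask are pure index arithmetic, so their behaviour is easy to check outside of SelectionDAG. A standalone sketch (illustrative names, std::vector<int> masks, negative values as sentinels) reproducing the scaling and unpack-mask construction; scaling [0, 2] by 2 gives [0, 1, 4, 5], and the two-input v4i32 unpacklo mask comes out as [0, 4, 1, 5]:

    #include <cassert>
    #include <vector>

    // Mirror of scaleShuffleMask: each index becomes Scale consecutive narrowed
    // indices; sentinel (negative) values are simply repeated.
    static std::vector<int> scaleMask(int Scale, const std::vector<int> &Mask) {
      assert(Scale > 0 && "Unexpected scaling factor");
      std::vector<int> Scaled(Mask.size() * Scale, -1);
      for (int i = 0, e = (int)Mask.size(); i != e; ++i)
        for (int s = 0; s != Scale; ++s)
          Scaled[Scale * i + s] = Mask[i] < 0 ? Mask[i] : Scale * Mask[i] + s;
      return Scaled;
    }

    // Mirror of createUnpackShuffleMask for NumElts elements with NumEltsInLane
    // elements per 128-bit lane.
    static std::vector<int> unpackMask(int NumElts, int NumEltsInLane, bool Lo,
                                       bool Unary) {
      std::vector<int> Mask;
      for (int i = 0; i < NumElts; ++i) {
        int LaneStart = (i / NumEltsInLane) * NumEltsInLane;
        int Pos = (i % NumEltsInLane) / 2 + LaneStart;
        Pos += (Unary ? 0 : NumElts * (i % 2));
        Pos += (Lo ? 0 : NumEltsInLane / 2);
        Mask.push_back(Pos);
      }
      return Mask;
    }

    int main() {
      assert((scaleMask(2, {0, 2}) == std::vector<int>{0, 1, 4, 5}));
      // v4i32 two-input unpacklo interleaves the low halves: [0, 4, 1, 5].
      assert((unpackMask(4, 4, /*Lo=*/true, /*Unary=*/false) ==
              std::vector<int>{0, 4, 1, 5}));
      return 0;
    }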
static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, MVT VT, SDValue V1, SDValue V2) { - assert(VT.is128BitVector() && "Expected a 128-bit vector type"); - unsigned NumElems = VT.getVectorNumElements(); - SmallVector<int, 8> Mask(NumElems); - for (unsigned i = 0, Half = NumElems/2; i != Half; ++i) { - Mask[i * 2] = i + Half; - Mask[i * 2 + 1] = i + NumElems + Half; - } + SmallVector<int, 8> Mask; + createUnpackShuffleMask(VT, Mask, /* Lo = */ false, /* Unary = */ false); return DAG.getVectorShuffle(VT, dl, V1, V2, Mask); } @@ -4745,6 +5171,135 @@ static SDValue peekThroughBitcasts(SDValue V) { return V; } +static SDValue peekThroughOneUseBitcasts(SDValue V) { + while (V.getNode() && V.getOpcode() == ISD::BITCAST && + V.getOperand(0).hasOneUse()) + V = V.getOperand(0); + return V; +} + +static const Constant *getTargetConstantFromNode(SDValue Op) { + Op = peekThroughBitcasts(Op); + + auto *Load = dyn_cast<LoadSDNode>(Op); + if (!Load) + return nullptr; + + SDValue Ptr = Load->getBasePtr(); + if (Ptr->getOpcode() == X86ISD::Wrapper || + Ptr->getOpcode() == X86ISD::WrapperRIP) + Ptr = Ptr->getOperand(0); + + auto *CNode = dyn_cast<ConstantPoolSDNode>(Ptr); + if (!CNode || CNode->isMachineConstantPoolEntry()) + return nullptr; + + return dyn_cast<Constant>(CNode->getConstVal()); +} + +// Extract raw constant bits from constant pools. +static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits, + SmallBitVector &UndefElts, + SmallVectorImpl<APInt> &EltBits) { + assert(UndefElts.empty() && "Expected an empty UndefElts vector"); + assert(EltBits.empty() && "Expected an empty EltBits vector"); + + Op = peekThroughBitcasts(Op); + + EVT VT = Op.getValueType(); + unsigned SizeInBits = VT.getSizeInBits(); + assert((SizeInBits % EltSizeInBits) == 0 && "Can't split constant!"); + unsigned NumElts = SizeInBits / EltSizeInBits; + + // Extract all the undef/constant element data and pack into single bitsets. + APInt UndefBits(SizeInBits, 0); + APInt MaskBits(SizeInBits, 0); + + // Split the undef/constant single bitset data into the target elements. + auto SplitBitData = [&]() { + UndefElts = SmallBitVector(NumElts, false); + EltBits.resize(NumElts, APInt(EltSizeInBits, 0)); + + for (unsigned i = 0; i != NumElts; ++i) { + APInt UndefEltBits = UndefBits.lshr(i * EltSizeInBits); + UndefEltBits = UndefEltBits.zextOrTrunc(EltSizeInBits); + + // Only treat an element as UNDEF if all bits are UNDEF, otherwise + // treat it as zero. + if (UndefEltBits.isAllOnesValue()) { + UndefElts[i] = true; + continue; + } + + APInt Bits = MaskBits.lshr(i * EltSizeInBits); + Bits = Bits.zextOrTrunc(EltSizeInBits); + EltBits[i] = Bits.getZExtValue(); + } + return true; + }; + + auto ExtractConstantBits = [SizeInBits](const Constant *Cst, APInt &Mask, + APInt &Undefs) { + if (!Cst) + return false; + unsigned CstSizeInBits = Cst->getType()->getPrimitiveSizeInBits(); + if (isa<UndefValue>(Cst)) { + Mask = APInt::getNullValue(SizeInBits); + Undefs = APInt::getLowBitsSet(SizeInBits, CstSizeInBits); + return true; + } + if (auto *CInt = dyn_cast<ConstantInt>(Cst)) { + Mask = CInt->getValue().zextOrTrunc(SizeInBits); + Undefs = APInt::getNullValue(SizeInBits); + return true; + } + if (auto *CFP = dyn_cast<ConstantFP>(Cst)) { + Mask = CFP->getValueAPF().bitcastToAPInt().zextOrTrunc(SizeInBits); + Undefs = APInt::getNullValue(SizeInBits); + return true; + } + return false; + }; + + // Extract constant bits from constant pool vector. 
+ if (auto *Cst = getTargetConstantFromNode(Op)) { + Type *CstTy = Cst->getType(); + if (!CstTy->isVectorTy() || (SizeInBits != CstTy->getPrimitiveSizeInBits())) + return false; + + unsigned CstEltSizeInBits = CstTy->getScalarSizeInBits(); + for (unsigned i = 0, e = CstTy->getVectorNumElements(); i != e; ++i) { + APInt Bits, Undefs; + if (!ExtractConstantBits(Cst->getAggregateElement(i), Bits, Undefs)) + return false; + MaskBits |= Bits.shl(i * CstEltSizeInBits); + UndefBits |= Undefs.shl(i * CstEltSizeInBits); + } + + return SplitBitData(); + } + + // Extract constant bits from a broadcasted constant pool scalar. + if (Op.getOpcode() == X86ISD::VBROADCAST && + EltSizeInBits <= Op.getScalarValueSizeInBits()) { + if (auto *Broadcast = getTargetConstantFromNode(Op.getOperand(0))) { + APInt Bits, Undefs; + if (ExtractConstantBits(Broadcast, Bits, Undefs)) { + unsigned NumBroadcastBits = Op.getScalarValueSizeInBits(); + unsigned NumBroadcastElts = SizeInBits / NumBroadcastBits; + for (unsigned i = 0; i != NumBroadcastElts; ++i) { + MaskBits |= Bits.shl(i * NumBroadcastBits); + UndefBits |= Undefs.shl(i * NumBroadcastBits); + } + return SplitBitData(); + } + } + } + + return false; +} + +// TODO: Merge more of this with getTargetConstantBitsFromNode. static bool getTargetShuffleMaskIndices(SDValue MaskNode, unsigned MaskEltSizeInBits, SmallVectorImpl<uint64_t> &RawMask) { @@ -4752,6 +5307,7 @@ static bool getTargetShuffleMaskIndices(SDValue MaskNode, MVT VT = MaskNode.getSimpleValueType(); assert(VT.isVector() && "Can't produce a non-vector with a build_vector!"); + unsigned NumMaskElts = VT.getSizeInBits() / MaskEltSizeInBits; // Split an APInt element into MaskEltSizeInBits sized pieces and // insert into the shuffle mask. @@ -4783,17 +5339,20 @@ static bool getTargetShuffleMaskIndices(SDValue MaskNode, if (MaskNode.getOpcode() == X86ISD::VZEXT_MOVL && MaskNode.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR) { - - // TODO: Handle (MaskEltSizeInBits % VT.getScalarSizeInBits()) == 0 - if ((VT.getScalarSizeInBits() % MaskEltSizeInBits) != 0) - return false; - unsigned ElementSplit = VT.getScalarSizeInBits() / MaskEltSizeInBits; - SDValue MaskOp = MaskNode.getOperand(0).getOperand(0); if (auto *CN = dyn_cast<ConstantSDNode>(MaskOp)) { - SplitElementToMask(CN->getAPIntValue()); - RawMask.append((VT.getVectorNumElements() - 1) * ElementSplit, 0); - return true; + if ((MaskEltSizeInBits % VT.getScalarSizeInBits()) == 0) { + RawMask.push_back(CN->getZExtValue()); + RawMask.append(NumMaskElts - 1, 0); + return true; + } + + if ((VT.getScalarSizeInBits() % MaskEltSizeInBits) == 0) { + unsigned ElementSplit = VT.getScalarSizeInBits() / MaskEltSizeInBits; + SplitElementToMask(CN->getAPIntValue()); + RawMask.append((VT.getVectorNumElements() - 1) * ElementSplit, 0); + return true; + } } return false; } @@ -4803,8 +5362,8 @@ static bool getTargetShuffleMaskIndices(SDValue MaskNode, // We can always decode if the buildvector is all zero constants, // but can't use isBuildVectorAllZeros as it might contain UNDEFs. 
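The constant-pool paths of getTargetConstantBitsFromNode above first flatten the vector constant into one bit pattern plus an undef-coverage pattern and only then re-split at the requested element width, treating an element as undef only when every one of its bits is undef. A rough standalone sketch of that re-splitting step, capped at 64 bits so plain integers can stand in for APInt (names are illustrative):

    #include <cstdint>
    #include <vector>

    struct SplitResult {
      std::vector<bool> UndefElts;
      std::vector<uint64_t> EltBits;
    };

    // Split flat constant/undef bit patterns into EltSizeInBits-wide elements.
    // Partially-undef elements are reported as defined, with undef bits as zero.
    static SplitResult splitConstantBits(uint64_t MaskBits, uint64_t UndefBits,
                                         unsigned SizeInBits,
                                         unsigned EltSizeInBits) {
      SplitResult R;
      uint64_t EltMask =
          (EltSizeInBits == 64) ? ~0ULL : ((1ULL << EltSizeInBits) - 1);
      for (unsigned i = 0; i != SizeInBits / EltSizeInBits; ++i) {
        uint64_t Undef = (UndefBits >> (i * EltSizeInBits)) & EltMask;
        bool AllUndef = (Undef == EltMask);
        R.UndefElts.push_back(AllUndef);
        R.EltBits.push_back(
            AllUndef ? 0 : (MaskBits >> (i * EltSizeInBits)) & EltMask);
      }
      return R;
    }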
- if (llvm::all_of(MaskNode->ops(), X86::isZeroNode)) { - RawMask.append(VT.getSizeInBits() / MaskEltSizeInBits, 0); + if (all_of(MaskNode->ops(), X86::isZeroNode)) { + RawMask.append(NumMaskElts, 0); return true; } @@ -4824,25 +5383,6 @@ static bool getTargetShuffleMaskIndices(SDValue MaskNode, return true; } -static const Constant *getTargetShuffleMaskConstant(SDValue MaskNode) { - MaskNode = peekThroughBitcasts(MaskNode); - - auto *MaskLoad = dyn_cast<LoadSDNode>(MaskNode); - if (!MaskLoad) - return nullptr; - - SDValue Ptr = MaskLoad->getBasePtr(); - if (Ptr->getOpcode() == X86ISD::Wrapper || - Ptr->getOpcode() == X86ISD::WrapperRIP) - Ptr = Ptr->getOperand(0); - - auto *MaskCP = dyn_cast<ConstantPoolSDNode>(Ptr); - if (!MaskCP || MaskCP->isMachineConstantPoolEntry()) - return nullptr; - - return dyn_cast<Constant>(MaskCP->getConstVal()); -} - /// Calculates the shuffle mask corresponding to the target-specific opcode. /// If the mask could be calculated, returns it in \p Mask, returns the shuffle /// operands in \p Ops, and returns true. @@ -4896,6 +5436,9 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero, assert(VT.getScalarType() == MVT::i8 && "Byte vector expected"); ImmN = N->getOperand(N->getNumOperands()-1); DecodePALIGNRMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); + IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); + Ops.push_back(N->getOperand(1)); + Ops.push_back(N->getOperand(0)); break; case X86ISD::VSHLDQ: assert(VT.getScalarType() == MVT::i8 && "Byte vector expected"); @@ -4947,7 +5490,7 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero, DecodeVPERMILPMask(VT, RawMask, Mask); break; } - if (auto *C = getTargetShuffleMaskConstant(MaskNode)) { + if (auto *C = getTargetConstantFromNode(MaskNode)) { DecodeVPERMILPMask(C, MaskEltSize, Mask); break; } @@ -4961,7 +5504,7 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero, DecodePSHUFBMask(RawMask, Mask); break; } - if (auto *C = getTargetShuffleMaskConstant(MaskNode)) { + if (auto *C = getTargetConstantFromNode(MaskNode)) { DecodePSHUFBMask(C, Mask); break; } @@ -5010,7 +5553,7 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero, DecodeVPERMIL2PMask(VT, CtrlImm, RawMask, Mask); break; } - if (auto *C = getTargetShuffleMaskConstant(MaskNode)) { + if (auto *C = getTargetConstantFromNode(MaskNode)) { DecodeVPERMIL2PMask(C, CtrlImm, MaskEltSize, Mask); break; } @@ -5025,7 +5568,7 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero, DecodeVPPERMMask(RawMask, Mask); break; } - if (auto *C = getTargetShuffleMaskConstant(MaskNode)) { + if (auto *C = getTargetConstantFromNode(MaskNode)) { DecodeVPPERMMask(C, Mask); break; } @@ -5042,8 +5585,8 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero, DecodeVPERMVMask(RawMask, Mask); break; } - if (auto *C = getTargetShuffleMaskConstant(MaskNode)) { - DecodeVPERMVMask(C, VT, Mask); + if (auto *C = getTargetConstantFromNode(MaskNode)) { + DecodeVPERMVMask(C, MaskEltSize, Mask); break; } return false; @@ -5054,8 +5597,22 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero, Ops.push_back(N->getOperand(0)); Ops.push_back(N->getOperand(2)); SDValue MaskNode = N->getOperand(1); - if (auto *C = getTargetShuffleMaskConstant(MaskNode)) { - DecodeVPERMV3Mask(C, VT, Mask); + unsigned MaskEltSize = VT.getScalarSizeInBits(); + if (auto *C = getTargetConstantFromNode(MaskNode)) { + 
DecodeVPERMV3Mask(C, MaskEltSize, Mask); + break; + } + return false; + } + case X86ISD::VPERMIV3: { + IsUnary = IsFakeUnary = N->getOperand(1) == N->getOperand(2); + // Unlike most shuffle nodes, VPERMIV3's mask operand is the first one. + Ops.push_back(N->getOperand(1)); + Ops.push_back(N->getOperand(2)); + SDValue MaskNode = N->getOperand(0); + unsigned MaskEltSize = VT.getScalarSizeInBits(); + if (auto *C = getTargetConstantFromNode(MaskNode)) { + DecodeVPERMV3Mask(C, MaskEltSize, Mask); break; } return false; @@ -5069,7 +5626,7 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero, // Check if we're getting a shuffle mask with zero'd elements. if (!AllowSentinelZero) - if (llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; })) + if (any_of(Mask, [](int M) { return M == SM_SentinelZero; })) return false; // If we have a fake unary shuffle, the shuffle mask is spread across two @@ -5101,8 +5658,9 @@ static bool setTargetShuffleZeroElements(SDValue N, bool IsUnary; if (!isTargetShuffle(N.getOpcode())) return false; - if (!getTargetShuffleMask(N.getNode(), N.getSimpleValueType(), true, Ops, - Mask, IsUnary)) + + MVT VT = N.getSimpleValueType(); + if (!getTargetShuffleMask(N.getNode(), VT, true, Ops, Mask, IsUnary)) return false; SDValue V1 = Ops[0]; @@ -5164,9 +5722,94 @@ static bool setTargetShuffleZeroElements(SDValue N, } } + assert(VT.getVectorNumElements() == Mask.size() && + "Different mask size from vector size!"); return true; } +// Attempt to decode ops that could be represented as a shuffle mask. +// The decoded shuffle mask may contain a different number of elements to the +// destination value type. +static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask, + SmallVectorImpl<SDValue> &Ops) { + Mask.clear(); + Ops.clear(); + + MVT VT = N.getSimpleValueType(); + unsigned NumElts = VT.getVectorNumElements(); + unsigned NumSizeInBits = VT.getSizeInBits(); + unsigned NumBitsPerElt = VT.getScalarSizeInBits(); + assert((NumBitsPerElt % 8) == 0 && (NumSizeInBits % 8) == 0 && + "Expected byte aligned value types"); + + unsigned Opcode = N.getOpcode(); + switch (Opcode) { + case ISD::AND: { + // Attempt to decode as a per-byte mask. + SmallBitVector UndefElts; + SmallVector<APInt, 32> EltBits; + if (!getTargetConstantBitsFromNode(N.getOperand(1), 8, UndefElts, EltBits)) + return false; + for (int i = 0, e = (int)EltBits.size(); i != e; ++i) { + if (UndefElts[i]) { + Mask.push_back(SM_SentinelUndef); + continue; + } + uint64_t ByteBits = EltBits[i].getZExtValue(); + if (ByteBits != 0 && ByteBits != 255) + return false; + Mask.push_back(ByteBits == 0 ? SM_SentinelZero : i); + } + Ops.push_back(N.getOperand(0)); + return true; + } + case X86ISD::VSHLI: + case X86ISD::VSRLI: { + uint64_t ShiftVal = N.getConstantOperandVal(1); + // Out of range bit shifts are guaranteed to be zero. + if (NumBitsPerElt <= ShiftVal) { + Mask.append(NumElts, SM_SentinelZero); + return true; + } + + // We can only decode 'whole byte' bit shifts as shuffles. + if ((ShiftVal % 8) != 0) + break; + + uint64_t ByteShift = ShiftVal / 8; + unsigned NumBytes = NumSizeInBits / 8; + unsigned NumBytesPerElt = NumBitsPerElt / 8; + Ops.push_back(N.getOperand(0)); + + // Clear mask to all zeros and insert the shifted byte indices. 
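The shift cases of getFauxShuffleMask model a whole-byte VSHLI/VSRLI as a byte shuffle: the mask starts out all-zero and only the surviving byte positions of each element are filled in. A small standalone sketch of the VSRLI direction, using -2 as the zero sentinel; for a v2i64 right shift by 16 bits (byte shift 2) it produces {2,3,4,5,6,7,-2,-2, 10,11,12,13,14,15,-2,-2}:

    #include <vector>

    // Illustrative mirror of the logical-right-shift case: result byte k of each
    // element reads source byte k + ByteShift, and the top ByteShift bytes of the
    // element stay at the zero sentinel.
    static std::vector<int> srliByteMask(unsigned NumBytes,
                                         unsigned NumBytesPerElt,
                                         unsigned ByteShift) {
      std::vector<int> Mask(NumBytes, -2 /* zero sentinel */);
      for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt)
        for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
          Mask[i + j - ByteShift] = i + j;
      return Mask;
    }
    // srliByteMask(16, 8, 2) == {2,3,4,5,6,7,-2,-2, 10,11,12,13,14,15,-2,-2}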
+ Mask.append(NumBytes, SM_SentinelZero); + + if (X86ISD::VSHLI == Opcode) { + for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt) + for (unsigned j = ByteShift; j != NumBytesPerElt; ++j) + Mask[i + j] = i + j - ByteShift; + } else { + for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt) + for (unsigned j = ByteShift; j != NumBytesPerElt; ++j) + Mask[i + j - ByteShift] = i + j; + } + return true; + } + case X86ISD::VZEXT: { + // TODO - add support for VPMOVZX with smaller input vector types. + SDValue Src = N.getOperand(0); + MVT SrcVT = Src.getSimpleValueType(); + if (NumSizeInBits != SrcVT.getSizeInBits()) + break; + DecodeZeroExtendMask(SrcVT.getScalarType(), VT, Mask); + Ops.push_back(Src); + return true; + } + } + + return false; +} + /// Calls setTargetShuffleZeroElements to resolve a target shuffle mask's inputs /// and set the SM_SentinelUndef and SM_SentinelZero values. Then check the /// remaining input indices in case we now have a unary shuffle and adjust the @@ -5176,14 +5819,14 @@ static bool resolveTargetShuffleInputs(SDValue Op, SDValue &Op0, SDValue &Op1, SmallVectorImpl<int> &Mask) { SmallVector<SDValue, 2> Ops; if (!setTargetShuffleZeroElements(Op, Mask, Ops)) - return false; + if (!getFauxShuffleMask(Op, Mask, Ops)) + return false; int NumElts = Mask.size(); - bool Op0InUse = std::any_of(Mask.begin(), Mask.end(), [NumElts](int Idx) { + bool Op0InUse = any_of(Mask, [NumElts](int Idx) { return 0 <= Idx && Idx < NumElts; }); - bool Op1InUse = std::any_of(Mask.begin(), Mask.end(), - [NumElts](int Idx) { return NumElts <= Idx; }); + bool Op1InUse = any_of(Mask, [NumElts](int Idx) { return NumElts <= Idx; }); Op0 = Op0InUse ? Ops[0] : SDValue(); Op1 = Op1InUse ? Ops[1] : SDValue(); @@ -5523,15 +6166,15 @@ static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl, unsigned RequiredAlign = VT.getSizeInBits()/8; SDValue Chain = LD->getChain(); // Make sure the stack object alignment is at least 16 or 32. - MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); + MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) { - if (MFI->isFixedObjectIndex(FI)) { + if (MFI.isFixedObjectIndex(FI)) { // Can't change the alignment. FIXME: It's possible to compute // the exact stack offset and reference FI + adjust offset instead. // If someone *really* cares about this. That's the way to implement it. return SDValue(); } else { - MFI->setObjectAlignment(FI, RequiredAlign); + MFI.setObjectAlignment(FI, RequiredAlign); } } @@ -5697,11 +6340,13 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts, int LoadSize = (1 + LastLoadedElt - FirstLoadedElt) * LDBaseVT.getStoreSizeInBits(); - // VZEXT_LOAD - consecutive load/undefs followed by zeros/undefs. - if (IsConsecutiveLoad && FirstLoadedElt == 0 && LoadSize == 64 && + // VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs. + if (IsConsecutiveLoad && FirstLoadedElt == 0 && + (LoadSize == 32 || LoadSize == 64) && ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) { - MVT VecSVT = VT.isFloatingPoint() ? MVT::f64 : MVT::i64; - MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / 64); + MVT VecSVT = VT.isFloatingPoint() ? 
MVT::getFloatingPointVT(LoadSize) + : MVT::getIntegerVT(LoadSize); + MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSize); if (TLI.isTypeLegal(VecVT)) { SDVTList Tys = DAG.getVTList(VecVT, MVT::Other); SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() }; @@ -5728,31 +6373,53 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts, } } - // VZEXT_MOVL - consecutive 32-bit load/undefs followed by zeros/undefs. - if (IsConsecutiveLoad && FirstLoadedElt == 0 && LoadSize == 32 && - ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) { - MVT VecSVT = VT.isFloatingPoint() ? MVT::f32 : MVT::i32; - MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / 32); - if (TLI.isTypeLegal(VecVT)) { - SDValue V = LastLoadedElt != 0 ? CreateLoad(VecSVT, LDBase) - : DAG.getBitcast(VecSVT, EltBase); - V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, V); - V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, VecVT, V); - return DAG.getBitcast(VT, V); - } + return SDValue(); +} + +static Constant *getConstantVector(MVT VT, APInt SplatValue, + unsigned SplatBitSize, LLVMContext &C) { + unsigned ScalarSize = VT.getScalarSizeInBits(); + unsigned NumElm = SplatBitSize / ScalarSize; + + SmallVector<Constant *, 32> ConstantVec; + for (unsigned i = 0; i < NumElm; i++) { + APInt Val = SplatValue.lshr(ScalarSize * i).trunc(ScalarSize); + Constant *Const; + if (VT.isFloatingPoint()) { + assert((ScalarSize == 32 || ScalarSize == 64) && + "Unsupported floating point scalar size"); + if (ScalarSize == 32) + Const = ConstantFP::get(Type::getFloatTy(C), Val.bitsToFloat()); + else + Const = ConstantFP::get(Type::getDoubleTy(C), Val.bitsToDouble()); + } else + Const = Constant::getIntegerValue(Type::getIntNTy(C, ScalarSize), Val); + ConstantVec.push_back(Const); } + return ConstantVector::get(ArrayRef<Constant *>(ConstantVec)); +} - return SDValue(); +static bool isUseOfShuffle(SDNode *N) { + for (auto *U : N->uses()) { + if (isTargetShuffle(U->getOpcode())) + return true; + if (U->getOpcode() == ISD::BITCAST) // Ignore bitcasts + return isUseOfShuffle(U); + } + return false; } /// Attempt to use the vbroadcast instruction to generate a splat value for the /// following cases: -/// 1. A splat BUILD_VECTOR which uses a single scalar load, or a constant. +/// 1. A splat BUILD_VECTOR which uses: +/// a. A single scalar load, or a constant. +/// b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>). /// 2. A splat shuffle which uses a scalar_to_vector node which comes from /// a scalar load, or a constant. +/// /// The VBROADCAST node is returned when a pattern is found, /// or SDValue() otherwise. -static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget &Subtarget, +static SDValue LowerVectorBroadcast(BuildVectorSDNode *BVOp, const X86Subtarget &Subtarget, SelectionDAG &DAG) { // VBROADCAST requires AVX. // TODO: Splats could be generated for non-AVX CPUs using SSE @@ -5760,81 +6427,103 @@ static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget &Subtarget, if (!Subtarget.hasAVX()) return SDValue(); - MVT VT = Op.getSimpleValueType(); - SDLoc dl(Op); + MVT VT = BVOp->getSimpleValueType(0); + SDLoc dl(BVOp); assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) && "Unsupported vector type for broadcast."); - SDValue Ld; - bool ConstSplatVal; - - switch (Op.getOpcode()) { - default: - // Unknown pattern found. 
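Besides single-scalar splats, the reworked LowerVectorBroadcast now recognizes build_vectors whose constants form a repeating block (e.g. <0,1,0,1,...>) and loads that block once from the constant pool for a broadcast. A rough standalone sketch of the detection idea only, over 32-bit lane values instead of BuildVectorSDNode::isConstantSplat and APInt (illustrative name):

    #include <vector>

    // Returns the number of leading lanes that repeat across the whole vector,
    // or 0 if the constants do not form a repeating block. E.g. {0,1,0,1} -> 2,
    // which would correspond to broadcasting one 64-bit constant-pool entry.
    static unsigned repeatedBlockLanes(const std::vector<unsigned> &Lanes) {
      unsigned N = Lanes.size();
      for (unsigned Block = 1; Block < N; Block *= 2) {
        bool Repeats = true;
        for (unsigned i = Block; i != N && Repeats; ++i)
          Repeats = (Lanes[i] == Lanes[i % Block]);
        if (Repeats)
          return Block;
      }
      return 0;
    }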
- return SDValue(); - - case ISD::BUILD_VECTOR: { - auto *BVOp = cast<BuildVectorSDNode>(Op.getNode()); - BitVector UndefElements; - SDValue Splat = BVOp->getSplatValue(&UndefElements); - - // We need a splat of a single value to use broadcast, and it doesn't - // make any sense if the value is only in one element of the vector. - if (!Splat || (VT.getVectorNumElements() - UndefElements.count()) <= 1) + BitVector UndefElements; + SDValue Ld = BVOp->getSplatValue(&UndefElements); + + // We need a splat of a single value to use broadcast, and it doesn't + // make any sense if the value is only in one element of the vector. + if (!Ld || (VT.getVectorNumElements() - UndefElements.count()) <= 1) { + APInt SplatValue, Undef; + unsigned SplatBitSize; + bool HasUndef; + // Check if this is a repeated constant pattern suitable for broadcasting. + if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) && + SplatBitSize > VT.getScalarSizeInBits() && + SplatBitSize < VT.getSizeInBits()) { + // Avoid replacing with broadcast when it's a use of a shuffle + // instruction to preserve the present custom lowering of shuffles. + if (isUseOfShuffle(BVOp) || BVOp->hasOneUse()) return SDValue(); - - Ld = Splat; - ConstSplatVal = (Ld.getOpcode() == ISD::Constant || - Ld.getOpcode() == ISD::ConstantFP); - - // Make sure that all of the users of a non-constant load are from the - // BUILD_VECTOR node. - if (!ConstSplatVal && !BVOp->isOnlyUserOf(Ld.getNode())) - return SDValue(); - break; - } - - case ISD::VECTOR_SHUFFLE: { - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); - - // Shuffles must have a splat mask where the first element is - // broadcasted. - if ((!SVOp->isSplat()) || SVOp->getMaskElt(0) != 0) - return SDValue(); - - SDValue Sc = Op.getOperand(0); - if (Sc.getOpcode() != ISD::SCALAR_TO_VECTOR && - Sc.getOpcode() != ISD::BUILD_VECTOR) { - - if (!Subtarget.hasInt256()) - return SDValue(); - - // Use the register form of the broadcast instruction available on AVX2. - if (VT.getSizeInBits() >= 256) - Sc = extract128BitVector(Sc, 0, DAG, dl); - return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Sc); + // replace BUILD_VECTOR with broadcast of the repeated constants. + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + LLVMContext *Ctx = DAG.getContext(); + MVT PVT = TLI.getPointerTy(DAG.getDataLayout()); + if (Subtarget.hasAVX()) { + if (SplatBitSize <= 64 && Subtarget.hasAVX2() && + !(SplatBitSize == 64 && Subtarget.is32Bit())) { + // Splatted value can fit in one INTEGER constant in constant pool. + // Load the constant and broadcast it. + MVT CVT = MVT::getIntegerVT(SplatBitSize); + Type *ScalarTy = Type::getIntNTy(*Ctx, SplatBitSize); + Constant *C = Constant::getIntegerValue(ScalarTy, SplatValue); + SDValue CP = DAG.getConstantPool(C, PVT); + unsigned Repeat = VT.getSizeInBits() / SplatBitSize; + + unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment(); + Ld = DAG.getLoad( + CVT, dl, DAG.getEntryNode(), CP, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), + Alignment); + SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl, + MVT::getVectorVT(CVT, Repeat), Ld); + return DAG.getBitcast(VT, Brdcst); + } else if (SplatBitSize == 32 || SplatBitSize == 64) { + // Splatted value can fit in one FLOAT constant in constant pool. + // Load the constant and broadcast it. + // AVX have support for 32 and 64 bit broadcast for floats only. + // No 64bit integer in 32bit subtarget. 
+ MVT CVT = MVT::getFloatingPointVT(SplatBitSize); + Constant *C = SplatBitSize == 32 + ? ConstantFP::get(Type::getFloatTy(*Ctx), + SplatValue.bitsToFloat()) + : ConstantFP::get(Type::getDoubleTy(*Ctx), + SplatValue.bitsToDouble()); + SDValue CP = DAG.getConstantPool(C, PVT); + unsigned Repeat = VT.getSizeInBits() / SplatBitSize; + + unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment(); + Ld = DAG.getLoad( + CVT, dl, DAG.getEntryNode(), CP, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), + Alignment); + SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl, + MVT::getVectorVT(CVT, Repeat), Ld); + return DAG.getBitcast(VT, Brdcst); + } else if (SplatBitSize > 64) { + // Load the vector of constants and broadcast it. + MVT CVT = VT.getScalarType(); + Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize, + *Ctx); + SDValue VCP = DAG.getConstantPool(VecC, PVT); + unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits(); + unsigned Alignment = cast<ConstantPoolSDNode>(VCP)->getAlignment(); + Ld = DAG.getLoad( + MVT::getVectorVT(CVT, NumElm), dl, DAG.getEntryNode(), VCP, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), + Alignment); + SDValue Brdcst = DAG.getNode(X86ISD::SUBV_BROADCAST, dl, VT, Ld); + return DAG.getBitcast(VT, Brdcst); + } } - - Ld = Sc.getOperand(0); - ConstSplatVal = (Ld.getOpcode() == ISD::Constant || - Ld.getOpcode() == ISD::ConstantFP); - - // The scalar_to_vector node and the suspected - // load node must have exactly one user. - // Constants may have multiple users. - - // AVX-512 has register version of the broadcast - bool hasRegVer = Subtarget.hasAVX512() && VT.is512BitVector() && - Ld.getValueType().getSizeInBits() >= 32; - if (!ConstSplatVal && ((!Sc.hasOneUse() || !Ld.hasOneUse()) && - !hasRegVer)) - return SDValue(); - break; } + return SDValue(); } - unsigned ScalarSize = Ld.getValueType().getSizeInBits(); + bool ConstSplatVal = + (Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP); + + // Make sure that all of the users of a non-constant load are from the + // BUILD_VECTOR node. + if (!ConstSplatVal && !BVOp->isOnlyUserOf(Ld.getNode())) + return SDValue(); + + unsigned ScalarSize = Ld.getValueSizeInBits(); bool IsGE256 = (VT.getSizeInBits() >= 256); // When optimizing for size, generate up to 5 extra bytes for a broadcast @@ -6025,8 +6714,7 @@ static SDValue ConvertI1VectorToInteger(SDValue Op, SelectionDAG &DAG) { Immediate |= cast<ConstantSDNode>(In)->getZExtValue() << idx; } SDLoc dl(Op); - MVT VT = - MVT::getIntegerVT(std::max((int)Op.getValueType().getSizeInBits(), 8)); + MVT VT = MVT::getIntegerVT(std::max((int)Op.getValueSizeInBits(), 8)); return DAG.getConstant(Immediate, dl, VT); } // Lower BUILD_VECTOR operation for v8i1 and v16i1 types. @@ -6273,23 +6961,24 @@ static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1, return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI); } -/// Try to fold a build_vector that performs an 'addsub' to an X86ISD::ADDSUB -/// node. -static SDValue LowerToAddSub(const BuildVectorSDNode *BV, - const X86Subtarget &Subtarget, SelectionDAG &DAG) { +/// Returns true iff \p BV builds a vector with the result equivalent to +/// the result of ADDSUB operation. +/// If true is returned then the operands of ADDSUB = Opnd0 +- Opnd1 operation +/// are written to the parameters \p Opnd0 and \p Opnd1. 
+static bool isAddSub(const BuildVectorSDNode *BV, + const X86Subtarget &Subtarget, SelectionDAG &DAG, + SDValue &Opnd0, SDValue &Opnd1) { + MVT VT = BV->getSimpleValueType(0); if ((!Subtarget.hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) && - (!Subtarget.hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64))) - return SDValue(); + (!Subtarget.hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)) && + (!Subtarget.hasAVX512() || (VT != MVT::v16f32 && VT != MVT::v8f64))) + return false; - SDLoc DL(BV); unsigned NumElts = VT.getVectorNumElements(); SDValue InVec0 = DAG.getUNDEF(VT); SDValue InVec1 = DAG.getUNDEF(VT); - assert((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v4f32 || - VT == MVT::v2f64) && "build_vector with an invalid type found!"); - // Odd-numbered elements in the input build vector are obtained from // adding two integer/float elements. // Even-numbered elements in the input build vector are obtained from @@ -6311,7 +7000,7 @@ static SDValue LowerToAddSub(const BuildVectorSDNode *BV, // Early exit if we found an unexpected opcode. if (Opcode != ExpectedOpcode) - return SDValue(); + return false; SDValue Op0 = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); @@ -6324,11 +7013,11 @@ static SDValue LowerToAddSub(const BuildVectorSDNode *BV, !isa<ConstantSDNode>(Op0.getOperand(1)) || !isa<ConstantSDNode>(Op1.getOperand(1)) || Op0.getOperand(1) != Op1.getOperand(1)) - return SDValue(); + return false; unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue(); if (I0 != i) - return SDValue(); + return false; // We found a valid add/sub node. Update the information accordingly. if (i & 1) @@ -6340,39 +7029,118 @@ static SDValue LowerToAddSub(const BuildVectorSDNode *BV, if (InVec0.isUndef()) { InVec0 = Op0.getOperand(0); if (InVec0.getSimpleValueType() != VT) - return SDValue(); + return false; } if (InVec1.isUndef()) { InVec1 = Op1.getOperand(0); if (InVec1.getSimpleValueType() != VT) - return SDValue(); + return false; } // Make sure that operands in input to each add/sub node always // come from a same pair of vectors. if (InVec0 != Op0.getOperand(0)) { if (ExpectedOpcode == ISD::FSUB) - return SDValue(); + return false; // FADD is commutable. Try to commute the operands // and then test again. std::swap(Op0, Op1); if (InVec0 != Op0.getOperand(0)) - return SDValue(); + return false; } if (InVec1 != Op1.getOperand(0)) - return SDValue(); + return false; // Update the pair of expected opcodes. std::swap(ExpectedOpcode, NextExpectedOpcode); } // Don't try to fold this build_vector into an ADDSUB if the inputs are undef. - if (AddFound && SubFound && !InVec0.isUndef() && !InVec1.isUndef()) - return DAG.getNode(X86ISD::ADDSUB, DL, VT, InVec0, InVec1); + if (!AddFound || !SubFound || InVec0.isUndef() || InVec1.isUndef()) + return false; - return SDValue(); + Opnd0 = InVec0; + Opnd1 = InVec1; + return true; +} + +/// Returns true if is possible to fold MUL and an idiom that has already been +/// recognized as ADDSUB(\p Opnd0, \p Opnd1) into FMADDSUB(x, y, \p Opnd1). +/// If (and only if) true is returned, the operands of FMADDSUB are written to +/// parameters \p Opnd0, \p Opnd1, \p Opnd2. +/// +/// Prior to calling this function it should be known that there is some +/// SDNode that potentially can be replaced with an X86ISD::ADDSUB operation +/// using \p Opnd0 and \p Opnd1 as operands. Also, this method is called +/// before replacement of such SDNode with ADDSUB operation. Thus the number +/// of \p Opnd0 uses is expected to be equal to 2. 
+/// For example, this function may be called for the following IR: +/// %AB = fmul fast <2 x double> %A, %B +/// %Sub = fsub fast <2 x double> %AB, %C +/// %Add = fadd fast <2 x double> %AB, %C +/// %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add, +/// <2 x i32> <i32 0, i32 3> +/// There is a def for %Addsub here, which potentially can be replaced by +/// X86ISD::ADDSUB operation: +/// %Addsub = X86ISD::ADDSUB %AB, %C +/// and such ADDSUB can further be replaced with FMADDSUB: +/// %Addsub = FMADDSUB %A, %B, %C. +/// +/// The main reason why this method is called before the replacement of the +/// recognized ADDSUB idiom with ADDSUB operation is that such replacement +/// is illegal sometimes. E.g. 512-bit ADDSUB is not available, while 512-bit +/// FMADDSUB is. +static bool isFMAddSub(const X86Subtarget &Subtarget, SelectionDAG &DAG, + SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2) { + if (Opnd0.getOpcode() != ISD::FMUL || Opnd0->use_size() != 2 || + !Subtarget.hasAnyFMA()) + return false; + + // FIXME: These checks must match the similar ones in + // DAGCombiner::visitFADDForFMACombine. It would be good to have one + // function that would answer if it is Ok to fuse MUL + ADD to FMADD + // or MUL + ADDSUB to FMADDSUB. + const TargetOptions &Options = DAG.getTarget().Options; + bool AllowFusion = + (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath); + if (!AllowFusion) + return false; + + Opnd2 = Opnd1; + Opnd1 = Opnd0.getOperand(1); + Opnd0 = Opnd0.getOperand(0); + + return true; +} + +/// Try to fold a build_vector that performs an 'addsub' or 'fmaddsub' operation +/// accordingly to X86ISD::ADDSUB or X86ISD::FMADDSUB node. +static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { + SDValue Opnd0, Opnd1; + if (!isAddSub(BV, Subtarget, DAG, Opnd0, Opnd1)) + return SDValue(); + + MVT VT = BV->getSimpleValueType(0); + SDLoc DL(BV); + + // Try to generate X86ISD::FMADDSUB node here. + SDValue Opnd2; + if (isFMAddSub(Subtarget, DAG, Opnd0, Opnd1, Opnd2)) + return DAG.getNode(X86ISD::FMADDSUB, DL, VT, Opnd0, Opnd1, Opnd2); + + // Do not generate X86ISD::ADDSUB node for 512-bit types even though + // the ADDSUB idiom has been successfully recognized. There are no known + // X86 targets with 512-bit ADDSUB instructions! + // 512-bit ADDSUB idiom recognition was needed only as part of FMADDSUB idiom + // recognition. + if (VT.is512BitVector()) + return SDValue(); + + return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1); } /// Lower BUILD_VECTOR to a horizontal add/sub operation if possible. @@ -6510,17 +7278,18 @@ static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV, /// NOTE: Its not in our interest to start make a general purpose vectorizer /// from this, but enough scalar bit operations are created from the later /// legalization + scalarization stages to need basic support. -static SDValue lowerBuildVectorToBitOp(SDValue Op, SelectionDAG &DAG) { +static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op, + SelectionDAG &DAG) { SDLoc DL(Op); - MVT VT = Op.getSimpleValueType(); + MVT VT = Op->getSimpleValueType(0); unsigned NumElems = VT.getVectorNumElements(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); // Check that all elements have the same opcode. // TODO: Should we allow UNDEFS and if so how many? 
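The isAddSub/isFMAddSub recognition above matches a build_vector whose even lanes come from an FSUB and whose odd lanes come from an FADD, optionally fusing a shared FMUL operand. A scalar sketch of the lane-wise semantics being matched (arrays stand in for vector registers; names are illustrative):

    #include <array>

    // ADDSUB semantics: even lanes subtract, odd lanes add.
    static std::array<double, 4> addsub(const std::array<double, 4> &A,
                                        const std::array<double, 4> &B) {
      std::array<double, 4> R;
      for (unsigned i = 0; i != 4; ++i)
        R[i] = (i & 1) ? A[i] + B[i] : A[i] - B[i];
      return R;
    }

    // FMADDSUB semantics: the same alternation, with the multiply fused in.
    static std::array<double, 4> fmaddsub(const std::array<double, 4> &A,
                                          const std::array<double, 4> &B,
                                          const std::array<double, 4> &C) {
      std::array<double, 4> R;
      for (unsigned i = 0; i != 4; ++i)
        R[i] = (i & 1) ? A[i] * B[i] + C[i] : A[i] * B[i] - C[i];
      return R;
    }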
- unsigned Opcode = Op.getOperand(0).getOpcode(); + unsigned Opcode = Op->getOperand(0).getOpcode(); for (unsigned i = 1; i < NumElems; ++i) - if (Opcode != Op.getOperand(i).getOpcode()) + if (Opcode != Op->getOperand(i).getOpcode()) return SDValue(); // TODO: We may be able to add support for other Ops (ADD/SUB + shifts). @@ -6600,13 +7369,13 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { return VectorConstant; BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode()); - if (SDValue AddSub = LowerToAddSub(BV, Subtarget, DAG)) + if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, Subtarget, DAG)) return AddSub; if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG)) return HorizontalOp; - if (SDValue Broadcast = LowerVectorBroadcast(Op, Subtarget, DAG)) + if (SDValue Broadcast = LowerVectorBroadcast(BV, Subtarget, DAG)) return Broadcast; - if (SDValue BitOp = lowerBuildVectorToBitOp(Op, DAG)) + if (SDValue BitOp = lowerBuildVectorToBitOp(BV, DAG)) return BitOp; unsigned EVTBits = ExtVT.getSizeInBits(); @@ -6673,12 +7442,8 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 || (ExtVT == MVT::i64 && Subtarget.is64Bit())) { - if (VT.is512BitVector()) { - SDValue ZeroVec = getZeroVector(VT, Subtarget, DAG, dl); - return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, ZeroVec, - Item, DAG.getIntPtrConstant(0, dl)); - } - assert((VT.is128BitVector() || VT.is256BitVector()) && + assert((VT.is128BitVector() || VT.is256BitVector() || + VT.is512BitVector()) && "Expected an SSE value type!"); Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector. @@ -7088,6 +7853,7 @@ static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT, RepeatedMask.assign(LaneSize, -1); int Size = Mask.size(); for (int i = 0; i < Size; ++i) { + assert(Mask[i] == SM_SentinelUndef || Mask[i] >= 0); if (Mask[i] < 0) continue; if ((Mask[i] % Size) / LaneSize != i / LaneSize) @@ -7122,26 +7888,40 @@ is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask, return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask); } -static void scaleShuffleMask(int Scale, ArrayRef<int> Mask, - SmallVectorImpl<int> &ScaledMask) { - assert(0 < Scale && "Unexpected scaling factor"); - int NumElts = Mask.size(); - ScaledMask.assign(NumElts * Scale, -1); - - for (int i = 0; i != NumElts; ++i) { - int M = Mask[i]; - - // Repeat sentinel values in every mask element. - if (M < 0) { - for (int s = 0; s != Scale; ++s) - ScaledMask[(Scale * i) + s] = M; +/// Test whether a target shuffle mask is equivalent within each sub-lane. +/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero. +static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT, + ArrayRef<int> Mask, + SmallVectorImpl<int> &RepeatedMask) { + int LaneSize = LaneSizeInBits / VT.getScalarSizeInBits(); + RepeatedMask.assign(LaneSize, SM_SentinelUndef); + int Size = Mask.size(); + for (int i = 0; i < Size; ++i) { + assert(isUndefOrZero(Mask[i]) || (Mask[i] >= 0)); + if (Mask[i] == SM_SentinelUndef) + continue; + if (Mask[i] == SM_SentinelZero) { + if (!isUndefOrZero(RepeatedMask[i % LaneSize])) + return false; + RepeatedMask[i % LaneSize] = SM_SentinelZero; continue; } + if ((Mask[i] % Size) / LaneSize != i / LaneSize) + // This entry crosses lanes, so there is no way to model this shuffle. 
+ return false; - // Scale mask element and increment across each mask element. - for (int s = 0; s != Scale; ++s) - ScaledMask[(Scale * i) + s] = (Scale * M) + s; + // Ok, handle the in-lane shuffles by detecting if and when they repeat. + // Adjust second vector indices to start at LaneSize instead of Size. + int LocalM = + Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + LaneSize; + if (RepeatedMask[i % LaneSize] == SM_SentinelUndef) + // This is the first non-undef entry in this slot of a 128-bit lane. + RepeatedMask[i % LaneSize] = LocalM; + else if (RepeatedMask[i % LaneSize] != LocalM) + // Found a mismatch with the repeated mask. + return false; } + return true; } /// \brief Checks whether a shuffle mask is equivalent to an explicit list of @@ -7251,7 +8031,7 @@ static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask, bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode()); bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode()); - int VectorSizeInBits = V1.getValueType().getSizeInBits(); + int VectorSizeInBits = V1.getValueSizeInBits(); int ScalarSizeInBits = VectorSizeInBits / Mask.size(); assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size"); @@ -7309,11 +8089,42 @@ static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask, return Zeroable; } -/// Try to lower a shuffle with a single PSHUFB of V1. -/// This is only possible if V2 is unused (at all, or only for zero elements). +// The Shuffle result is as follow: +// 0*a[0]0*a[1]...0*a[n] , n >=0 where a[] elements in a ascending order. +// Each Zeroable's element correspond to a particular Mask's element. +// As described in computeZeroableShuffleElements function. +// +// The function looks for a sub-mask that the nonzero elements are in +// increasing order. If such sub-mask exist. The function returns true. +static bool isNonZeroElementsInOrder(const SmallBitVector Zeroable, + ArrayRef<int> Mask,const EVT &VectorType, + bool &IsZeroSideLeft) { + int NextElement = -1; + // Check if the Mask's nonzero elements are in increasing order. + for (int i = 0, e = Zeroable.size(); i < e; i++) { + // Checks if the mask's zeros elements are built from only zeros. + if (Mask[i] == -1) + return false; + if (Zeroable[i]) + continue; + // Find the lowest non zero element + if (NextElement == -1) { + NextElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0; + IsZeroSideLeft = NextElement != 0; + } + // Exit if the mask's non zero elements are not in increasing order. + if (NextElement != Mask[i]) + return false; + NextElement++; + } + return true; +} + +/// Try to lower a shuffle with a single PSHUFB of V1 or V2. static SDValue lowerVectorShuffleWithPSHUFB(const SDLoc &DL, MVT VT, ArrayRef<int> Mask, SDValue V1, SDValue V2, + const SmallBitVector &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { int Size = Mask.size(); @@ -7325,12 +8136,11 @@ static SDValue lowerVectorShuffleWithPSHUFB(const SDLoc &DL, MVT VT, (Subtarget.hasAVX2() && VT.is256BitVector()) || (Subtarget.hasBWI() && VT.is512BitVector())); - SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); - SmallVector<SDValue, 64> PSHUFBMask(NumBytes); // Sign bit set in i8 mask means zero element. 
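The PSHUFB lowering above encodes each destination byte as either a source byte index or a control byte with the sign bit set, which the hardware turns into a zero. A scalar model of the 128-bit PSHUFB semantics being targeted (illustrative helper, not the DAG node):

    #include <array>
    #include <cstdint>

    // Each destination byte copies Src[Ctl & 0x0F], unless the control byte has
    // its sign bit (0x80) set, in which case the destination byte is zeroed.
    static std::array<uint8_t, 16> pshufb(const std::array<uint8_t, 16> &Src,
                                          const std::array<uint8_t, 16> &Ctl) {
      std::array<uint8_t, 16> Dst{};
      for (unsigned i = 0; i != 16; ++i)
        Dst[i] = (Ctl[i] & 0x80) ? 0 : Src[Ctl[i] & 0x0F];
      return Dst;
    }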
SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8); + SDValue V; for (int i = 0; i < NumBytes; ++i) { int M = Mask[i / NumEltBytes]; if (M < 0) { @@ -7341,9 +8151,13 @@ static SDValue lowerVectorShuffleWithPSHUFB(const SDLoc &DL, MVT VT, PSHUFBMask[i] = ZeroMask; continue; } - // Only allow V1. - if (M >= Size) + + // We can only use a single input of V1 or V2. + SDValue SrcV = (M >= Size ? V2 : V1); + if (V && V != SrcV) return SDValue(); + V = SrcV; + M %= Size; // PSHUFB can't cross lanes, ensure this doesn't happen. if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize)) @@ -7353,33 +8167,66 @@ static SDValue lowerVectorShuffleWithPSHUFB(const SDLoc &DL, MVT VT, M = M * NumEltBytes + (i % NumEltBytes); PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8); } + assert(V && "Failed to find a source input"); MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes); return DAG.getBitcast( - VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V1), + VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V), DAG.getBuildVector(I8VT, DL, PSHUFBMask))); } +static SDValue getMaskNode(SDValue Mask, MVT MaskVT, + const X86Subtarget &Subtarget, SelectionDAG &DAG, + const SDLoc &dl); + +// Function convertBitVectorToUnsigned - The function gets SmallBitVector +// as argument and convert him to unsigned. +// The output of the function is not(zeroable) +static unsigned convertBitVectorToUnsiged(const SmallBitVector &Zeroable) { + unsigned convertBit = 0; + for (int i = 0, e = Zeroable.size(); i < e; i++) + convertBit |= !(Zeroable[i]) << i; + return convertBit; +} + +// X86 has dedicated shuffle that can be lowered to VEXPAND +static SDValue lowerVectorShuffleToEXPAND(const SDLoc &DL, MVT VT, + const SmallBitVector &Zeroable, + ArrayRef<int> Mask, SDValue &V1, + SDValue &V2, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + bool IsLeftZeroSide = true; + if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(), + IsLeftZeroSide)) + return SDValue(); + unsigned VEXPANDMask = convertBitVectorToUnsiged(Zeroable); + MVT IntegerType = + MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8)); + SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType); + unsigned NumElts = VT.getVectorNumElements(); + assert((NumElts == 4 || NumElts == 8 || NumElts == 16) && + "Unexpected number of vector elements"); + SDValue VMask = getMaskNode(MaskNode, MVT::getVectorVT(MVT::i1, NumElts), + Subtarget, DAG, DL); + SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL); + SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1; + return DAG.getNode(ISD::VSELECT, DL, VT, VMask, + DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector), + ZeroVector); +} + // X86 has dedicated unpack instructions that can handle specific blend // operations: UNPCKH and UNPCKL. 
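For the VEXPAND path above, the zeroable lane set is folded into an AVX-512 writemask immediate: a lane gets a 1 exactly when it must receive an expanded (non-zero) element. A minimal sketch of that conversion, mirroring convertBitVectorToUnsiged with a std::vector<bool> in place of SmallBitVector:

    #include <vector>

    // Writemask bit i is the complement of Zeroable[i].
    // e.g. Zeroable = {0,1,0,1} -> 0b0101 = 5: lanes 0 and 2 take expanded values.
    static unsigned expandWriteMask(const std::vector<bool> &Zeroable) {
      unsigned MaskImm = 0;
      for (unsigned i = 0, e = Zeroable.size(); i != e; ++i)
        MaskImm |= (Zeroable[i] ? 0u : 1u) << i;
      return MaskImm;
    }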
static SDValue lowerVectorShuffleWithUNPCK(const SDLoc &DL, MVT VT, ArrayRef<int> Mask, SDValue V1, SDValue V2, SelectionDAG &DAG) { - int NumElts = VT.getVectorNumElements(); - int NumEltsInLane = 128 / VT.getScalarSizeInBits(); - SmallVector<int, 8> Unpckl(NumElts); - SmallVector<int, 8> Unpckh(NumElts); - - for (int i = 0; i < NumElts; ++i) { - unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane; - int LoPos = (i % NumEltsInLane) / 2 + LaneStart + NumElts * (i % 2); - int HiPos = LoPos + NumEltsInLane / 2; - Unpckl[i] = LoPos; - Unpckh[i] = HiPos; - } - + SmallVector<int, 8> Unpckl; + createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false); if (isShuffleEquivalent(V1, V2, Mask, Unpckl)) return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2); + + SmallVector<int, 8> Unpckh; + createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, /* Unary = */ false); if (isShuffleEquivalent(V1, V2, Mask, Unpckh)) return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2); @@ -7401,19 +8248,14 @@ static SDValue lowerVectorShuffleWithUNPCK(const SDLoc &DL, MVT VT, /// one of the inputs being zeroable. static SDValue lowerVectorShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask, + const SmallBitVector &Zeroable, SelectionDAG &DAG) { + assert(!VT.isFloatingPoint() && "Floating point types are not supported"); MVT EltVT = VT.getVectorElementType(); - int NumEltBits = EltVT.getSizeInBits(); - MVT IntEltVT = MVT::getIntegerVT(NumEltBits); - SDValue Zero = DAG.getConstant(0, DL, IntEltVT); - SDValue AllOnes = DAG.getConstant(APInt::getAllOnesValue(NumEltBits), DL, - IntEltVT); - if (EltVT.isFloatingPoint()) { - Zero = DAG.getBitcast(EltVT, Zero); - AllOnes = DAG.getBitcast(EltVT, AllOnes); - } + SDValue Zero = DAG.getConstant(0, DL, EltVT); + SDValue AllOnes = + DAG.getConstant(APInt::getAllOnesValue(EltVT.getSizeInBits()), DL, EltVT); SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero); - SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); SDValue V; for (int i = 0, Size = Mask.size(); i < Size; ++i) { if (Zeroable[i]) @@ -7431,10 +8273,7 @@ static SDValue lowerVectorShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1, return SDValue(); // No non-zeroable elements! SDValue VMask = DAG.getBuildVector(VT, DL, VMaskOps); - V = DAG.getNode(VT.isFloatingPoint() - ? (unsigned) X86ISD::FAND : (unsigned) ISD::AND, - DL, VT, V, VMask); - return V; + return DAG.getNode(ISD::AND, DL, VT, V, VMask); } /// \brief Try to emit a blend instruction for a shuffle using bit math. @@ -7476,12 +8315,12 @@ static SDValue lowerVectorShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1, /// that the shuffle mask is a blend, or convertible into a blend with zero. static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Original, + const SmallBitVector &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode()); bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode()); SmallVector<int, 8> Mask(Original.begin(), Original.end()); - SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); bool ForceV1Zero = false, ForceV2Zero = false; // Attempt to generate the binary blend mask. 
If an input is zero then @@ -7540,7 +8379,7 @@ static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1, case MVT::v4i64: case MVT::v8i32: assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!"); - // FALLTHROUGH + LLVM_FALLTHROUGH; case MVT::v2i64: case MVT::v4i32: // If we have AVX2 it is faster to use VPBLENDD when the shuffle fits into @@ -7556,7 +8395,7 @@ static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1, VT, DAG.getNode(X86ISD::BLENDI, DL, BlendVT, V1, V2, DAG.getConstant(BlendMask, DL, MVT::i8))); } - // FALLTHROUGH + LLVM_FALLTHROUGH; case MVT::v8i16: { // For integer shuffles we need to expand the mask and cast the inputs to // v8i16s prior to blending. @@ -7582,15 +8421,16 @@ static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1, return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2, DAG.getConstant(BlendMask, DL, MVT::i8)); } + LLVM_FALLTHROUGH; } - // FALLTHROUGH case MVT::v16i8: case MVT::v32i8: { assert((VT.is128BitVector() || Subtarget.hasAVX2()) && "256-bit byte-blends require AVX2 support!"); // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB. - if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, DAG)) + if (SDValue Masked = + lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, DAG)) return Masked; // Scale the blend by the number of bytes per element. @@ -7704,32 +8544,12 @@ static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(const SDLoc &DL, return DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask); } -/// \brief Try to lower a vector shuffle as a byte rotation. -/// -/// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary -/// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use -/// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will -/// try to generically lower a vector shuffle through such an pattern. It -/// does not check for the profitability of lowering either as PALIGNR or -/// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form. -/// This matches shuffle vectors that look like: -/// -/// v8i16 [11, 12, 13, 14, 15, 0, 1, 2] +/// \brief Try to lower a vector shuffle as a rotation. /// -/// Essentially it concatenates V1 and V2, shifts right by some number of -/// elements, and takes the low elements as the result. Note that while this is -/// specified as a *right shift* because x86 is little-endian, it is a *left -/// rotate* of the vector lanes. -static SDValue lowerVectorShuffleAsByteRotate(const SDLoc &DL, MVT VT, - SDValue V1, SDValue V2, - ArrayRef<int> Mask, - const X86Subtarget &Subtarget, - SelectionDAG &DAG) { - assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!"); - +/// This is used for support PALIGNR for SSSE3 or VALIGND/Q for AVX512. +static int matchVectorShuffleAsRotate(SDValue &V1, SDValue &V2, + ArrayRef<int> Mask) { int NumElts = Mask.size(); - int NumLanes = VT.getSizeInBits() / 128; - int NumLaneElts = NumElts / NumLanes; // We need to detect various ways of spelling a rotation: // [11, 12, 13, 14, 15, 0, 1, 2] @@ -7740,51 +8560,46 @@ static SDValue lowerVectorShuffleAsByteRotate(const SDLoc &DL, MVT VT, // [-1, 4, 5, 6, -1, -1, -1, -1] int Rotation = 0; SDValue Lo, Hi; - for (int l = 0; l < NumElts; l += NumLaneElts) { - for (int i = 0; i < NumLaneElts; ++i) { - if (Mask[l + i] < 0) - continue; - - // Get the mod-Size index and lane correct it. 
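lowerVectorShuffleAsBlend, which now receives the precomputed Zeroable vector, succeeds only when every result element stays in its own lane and merely chooses between V1 and V2; zeroable lanes can be satisfied by whichever operand is forced to an all-zeros vector. The per-element decisions are packed into the immediate that VPBLENDD/BLENDI consume. A standalone sketch of that matching step, with an invented helper name and the zero-forcing left out:

    #include <vector>

    // Try to express Mask as an element-wise blend of A and B (0..N-1 = A,
    // N..2N-1 = B, negative = undef).  On success, bit i of BlendImm is set
    // when result element i comes from B.
    bool matchBlendImmediate(const std::vector<int> &Mask, unsigned &BlendImm) {
      const int Size = (int)Mask.size();
      BlendImm = 0;
      for (int i = 0; i < Size; ++i) {
        int M = Mask[i];
        if (M < 0)
          continue;                // undef: either source works
        if (M == i)
          continue;                // element i of A, bit stays clear
        if (M == i + Size) {
          BlendImm |= 1u << i;     // element i of B
          continue;
        }
        return false;              // element would move across lanes
      }
      return true;
    }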
- int LaneIdx = (Mask[l + i] % NumElts) - l; - // Make sure it was in this lane. - if (LaneIdx < 0 || LaneIdx >= NumLaneElts) - return SDValue(); + for (int i = 0; i < NumElts; ++i) { + int M = Mask[i]; + assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) && + "Unexpected mask index."); + if (M < 0) + continue; - // Determine where a rotated vector would have started. - int StartIdx = i - LaneIdx; - if (StartIdx == 0) - // The identity rotation isn't interesting, stop. - return SDValue(); + // Determine where a rotated vector would have started. + int StartIdx = i - (M % NumElts); + if (StartIdx == 0) + // The identity rotation isn't interesting, stop. + return -1; - // If we found the tail of a vector the rotation must be the missing - // front. If we found the head of a vector, it must be how much of the - // head. - int CandidateRotation = StartIdx < 0 ? -StartIdx : NumLaneElts - StartIdx; + // If we found the tail of a vector the rotation must be the missing + // front. If we found the head of a vector, it must be how much of the + // head. + int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx; - if (Rotation == 0) - Rotation = CandidateRotation; - else if (Rotation != CandidateRotation) - // The rotations don't match, so we can't match this mask. - return SDValue(); + if (Rotation == 0) + Rotation = CandidateRotation; + else if (Rotation != CandidateRotation) + // The rotations don't match, so we can't match this mask. + return -1; - // Compute which value this mask is pointing at. - SDValue MaskV = Mask[l + i] < NumElts ? V1 : V2; - - // Compute which of the two target values this index should be assigned - // to. This reflects whether the high elements are remaining or the low - // elements are remaining. - SDValue &TargetV = StartIdx < 0 ? Hi : Lo; - - // Either set up this value if we've not encountered it before, or check - // that it remains consistent. - if (!TargetV) - TargetV = MaskV; - else if (TargetV != MaskV) - // This may be a rotation, but it pulls from the inputs in some - // unsupported interleaving. - return SDValue(); - } + // Compute which value this mask is pointing at. + SDValue MaskV = M < NumElts ? V1 : V2; + + // Compute which of the two target values this index should be assigned + // to. This reflects whether the high elements are remaining or the low + // elements are remaining. + SDValue &TargetV = StartIdx < 0 ? Hi : Lo; + + // Either set up this value if we've not encountered it before, or check + // that it remains consistent. + if (!TargetV) + TargetV = MaskV; + else if (TargetV != MaskV) + // This may be a rotation, but it pulls from the inputs in some + // unsupported interleaving. + return -1; } // Check that we successfully analyzed the mask, and normalize the results. @@ -7795,23 +8610,75 @@ static SDValue lowerVectorShuffleAsByteRotate(const SDLoc &DL, MVT VT, else if (!Hi) Hi = Lo; + V1 = Lo; + V2 = Hi; + + return Rotation; +} + +/// \brief Try to lower a vector shuffle as a byte rotation. +/// +/// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary +/// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use +/// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will +/// try to generically lower a vector shuffle through such an pattern. It +/// does not check for the profitability of lowering either as PALIGNR or +/// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form. 
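The hunk above splits the rotation detection out of the byte-rotate lowering so that the AVX-512 VALIGND/VALIGNQ path can reuse it. The detection itself is pure mask arithmetic: every defined element must imply the same rotation amount, and the low and high parts of the rotated window must each come from a single input. Rewritten as a self-contained sketch (the SDValue bookkeeping is replaced by small integers):

    #include <vector>

    // If Mask selects a rotated window of the concatenation (Lo, Hi), return
    // the rotation amount in elements and report which logical input (0 or 1)
    // supplies the low and high parts; return -1 if no consistent rotation
    // exists.  Mask entries: 0..N-1 first input, N..2N-1 second, <0 undef.
    int matchRotation(const std::vector<int> &Mask, int &LoInput, int &HiInput) {
      const int NumElts = (int)Mask.size();
      int Rotation = 0;
      LoInput = HiInput = -1;
      for (int i = 0; i < NumElts; ++i) {
        int M = Mask[i];
        if (M < 0)
          continue;
        int StartIdx = i - (M % NumElts);
        if (StartIdx == 0)
          return -1;                       // identity rotation, not interesting
        int Candidate = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
        if (Rotation == 0)
          Rotation = Candidate;
        else if (Rotation != Candidate)
          return -1;                       // inconsistent rotation amounts
        int Input = M < NumElts ? 0 : 1;
        int &Target = StartIdx < 0 ? HiInput : LoInput;
        if (Target < 0)
          Target = Input;
        else if (Target != Input)
          return -1;                       // unsupported interleaving
      }
      if (Rotation == 0)
        return -1;                         // all-undef mask
      if (LoInput < 0)
        LoInput = HiInput;
      else if (HiInput < 0)
        HiInput = LoInput;
      return Rotation;
    }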
+/// This matches shuffle vectors that look like: +/// +/// v8i16 [11, 12, 13, 14, 15, 0, 1, 2] +/// +/// Essentially it concatenates V1 and V2, shifts right by some number of +/// elements, and takes the low elements as the result. Note that while this is +/// specified as a *right shift* because x86 is little-endian, it is a *left +/// rotate* of the vector lanes. +static int matchVectorShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2, + ArrayRef<int> Mask) { + // Don't accept any shuffles with zero elements. + if (any_of(Mask, [](int M) { return M == SM_SentinelZero; })) + return -1; + + // PALIGNR works on 128-bit lanes. + SmallVector<int, 16> RepeatedMask; + if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask)) + return -1; + + int Rotation = matchVectorShuffleAsRotate(V1, V2, RepeatedMask); + if (Rotation <= 0) + return -1; + + // PALIGNR rotates bytes, so we need to scale the + // rotation based on how many bytes are in the vector lane. + int NumElts = RepeatedMask.size(); + int Scale = 16 / NumElts; + return Rotation * Scale; +} + +static SDValue lowerVectorShuffleAsByteRotate(const SDLoc &DL, MVT VT, + SDValue V1, SDValue V2, + ArrayRef<int> Mask, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { + assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!"); + + SDValue Lo = V1, Hi = V2; + int ByteRotation = matchVectorShuffleAsByteRotate(VT, Lo, Hi, Mask); + if (ByteRotation <= 0) + return SDValue(); + // Cast the inputs to i8 vector of correct length to match PALIGNR or // PSLLDQ/PSRLDQ. - MVT ByteVT = MVT::getVectorVT(MVT::i8, 16 * NumLanes); + MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8); Lo = DAG.getBitcast(ByteVT, Lo); Hi = DAG.getBitcast(ByteVT, Hi); - // The actual rotate instruction rotates bytes, so we need to scale the - // rotation based on how many bytes are in the vector lane. - int Scale = 16 / NumLaneElts; - // SSSE3 targets can use the palignr instruction. if (Subtarget.hasSSSE3()) { assert((!VT.is512BitVector() || Subtarget.hasBWI()) && "512-bit PALIGNR requires BWI instructions"); return DAG.getBitcast( VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi, - DAG.getConstant(Rotation * Scale, DL, MVT::i8))); + DAG.getConstant(ByteRotation, DL, MVT::i8))); } assert(VT.is128BitVector() && @@ -7822,8 +8689,8 @@ static SDValue lowerVectorShuffleAsByteRotate(const SDLoc &DL, MVT VT, "SSE2 rotate lowering only needed for v16i8!"); // Default SSE2 implementation - int LoByteShift = 16 - Rotation * Scale; - int HiByteShift = Rotation * Scale; + int LoByteShift = 16 - ByteRotation; + int HiByteShift = ByteRotation; SDValue LoShift = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo, DAG.getConstant(LoByteShift, DL, MVT::i8)); @@ -7833,6 +8700,37 @@ static SDValue lowerVectorShuffleAsByteRotate(const SDLoc &DL, MVT VT, DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift)); } +/// \brief Try to lower a vector shuffle as a dword/qword rotation. +/// +/// AVX512 has a VALIGND/VALIGNQ instructions that will do an arbitrary +/// rotation of the concatenation of two vectors; This routine will +/// try to generically lower a vector shuffle through such an pattern. +/// +/// Essentially it concatenates V1 and V2, shifts right by some number of +/// elements, and takes the low elements as the result. Note that while this is +/// specified as a *right shift* because x86 is little-endian, it is a *left +/// rotate* of the vector lanes. 
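PALIGNR rotates bytes within each 128-bit lane, so once the element-level rotation has been validated as lane-repeating it only needs to be scaled by the element width; the pre-SSSE3 fallback reproduces the same result from a PSLLDQ, a PSRLDQ and an OR. The arithmetic, as a tiny sketch with illustrative names:

    // Scale an element rotation (within a 128-bit lane of NumLaneElts
    // elements) to the byte amount PALIGNR expects, and derive the byte-shift
    // amounts used by the SSE2 PSLLDQ/PSRLDQ/POR fallback.
    struct ByteRotate {
      int PalignrImm;   // immediate for PALIGNR
      int LoByteShift;  // VSHLDQ amount applied to the low input
      int HiByteShift;  // VSRLDQ amount applied to the high input
    };

    ByteRotate scaleRotation(int EltRotation, int NumLaneElts) {
      const int Scale = 16 / NumLaneElts;  // bytes per element in a 128-bit lane
      const int ByteAmt = EltRotation * Scale;
      return {ByteAmt, 16 - ByteAmt, ByteAmt};
    }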
+static SDValue lowerVectorShuffleAsRotate(const SDLoc &DL, MVT VT, + SDValue V1, SDValue V2, + ArrayRef<int> Mask, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { + assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) && + "Only 32-bit and 64-bit elements are supported!"); + + // 128/256-bit vectors are only supported with VLX. + assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector())) + && "VLX required for 128/256-bit vectors"); + + SDValue Lo = V1, Hi = V2; + int Rotation = matchVectorShuffleAsRotate(Lo, Hi, Mask); + if (Rotation <= 0) + return SDValue(); + + return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi, + DAG.getConstant(Rotation, DL, MVT::i8)); +} + /// \brief Try to lower a vector shuffle as a bit shift (shifts in zeros). /// /// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and @@ -7856,14 +8754,13 @@ static SDValue lowerVectorShuffleAsByteRotate(const SDLoc &DL, MVT VT, /// [ 5, 6, 7, zz, zz, zz, zz, zz] /// [ -1, 5, 6, 7, zz, zz, zz, zz] /// [ 1, 2, -1, -1, -1, -1, zz, zz] -static SDValue lowerVectorShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1, - SDValue V2, ArrayRef<int> Mask, - const X86Subtarget &Subtarget, - SelectionDAG &DAG) { - SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); - +static int matchVectorShuffleAsShift(MVT &ShiftVT, unsigned &Opcode, + unsigned ScalarSizeInBits, + ArrayRef<int> Mask, int MaskOffset, + const SmallBitVector &Zeroable, + const X86Subtarget &Subtarget) { int Size = Mask.size(); - assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size"); + unsigned SizeInBits = Size * ScalarSizeInBits; auto CheckZeros = [&](int Shift, int Scale, bool Left) { for (int i = 0; i < Size; i += Scale) @@ -7874,37 +8771,30 @@ static SDValue lowerVectorShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1, return true; }; - auto MatchShift = [&](int Shift, int Scale, bool Left, SDValue V) { + auto MatchShift = [&](int Shift, int Scale, bool Left) { for (int i = 0; i != Size; i += Scale) { unsigned Pos = Left ? i + Shift : i; unsigned Low = Left ? i : i + Shift; unsigned Len = Scale - Shift; - if (!isSequentialOrUndefInRange(Mask, Pos, Len, - Low + (V == V1 ? 0 : Size))) - return SDValue(); + if (!isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset)) + return -1; } - int ShiftEltBits = VT.getScalarSizeInBits() * Scale; + int ShiftEltBits = ScalarSizeInBits * Scale; bool ByteShift = ShiftEltBits > 64; - unsigned OpCode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI) - : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI); - int ShiftAmt = Shift * VT.getScalarSizeInBits() / (ByteShift ? 8 : 1); + Opcode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI) + : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI); + int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1); // Normalize the scale for byte shifts to still produce an i64 element // type. Scale = ByteShift ? Scale / 2 : Scale; // We need to round trip through the appropriate type for the shift. - MVT ShiftSVT = MVT::getIntegerVT(VT.getScalarSizeInBits() * Scale); - MVT ShiftVT = ByteShift ? 
MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8) - : MVT::getVectorVT(ShiftSVT, Size / Scale); - assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) && - "Illegal integer vector type"); - V = DAG.getBitcast(ShiftVT, V); - - V = DAG.getNode(OpCode, DL, ShiftVT, V, - DAG.getConstant(ShiftAmt, DL, MVT::i8)); - return DAG.getBitcast(VT, V); + MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale); + ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8) + : MVT::getVectorVT(ShiftSVT, Size / Scale); + return (int)ShiftAmt; }; // SSE/AVX supports logical shifts up to 64-bit integers - so we can just @@ -7913,29 +8803,64 @@ static SDValue lowerVectorShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1, // their width within the elements of the larger integer vector. Test each // multiple to see if we can find a match with the moved element indices // and that the shifted in elements are all zeroable. - unsigned MaxWidth = (VT.is512BitVector() && !Subtarget.hasBWI() ? 64 : 128); - for (int Scale = 2; Scale * VT.getScalarSizeInBits() <= MaxWidth; Scale *= 2) + unsigned MaxWidth = ((SizeInBits == 512) && !Subtarget.hasBWI() ? 64 : 128); + for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2) for (int Shift = 1; Shift != Scale; ++Shift) for (bool Left : {true, false}) - if (CheckZeros(Shift, Scale, Left)) - for (SDValue V : {V1, V2}) - if (SDValue Match = MatchShift(Shift, Scale, Left, V)) - return Match; + if (CheckZeros(Shift, Scale, Left)) { + int ShiftAmt = MatchShift(Shift, Scale, Left); + if (0 < ShiftAmt) + return ShiftAmt; + } // no match - return SDValue(); + return -1; +} + +static SDValue lowerVectorShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1, + SDValue V2, ArrayRef<int> Mask, + const SmallBitVector &Zeroable, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { + int Size = Mask.size(); + assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size"); + + MVT ShiftVT; + SDValue V = V1; + unsigned Opcode; + + // Try to match shuffle against V1 shift. + int ShiftAmt = matchVectorShuffleAsShift( + ShiftVT, Opcode, VT.getScalarSizeInBits(), Mask, 0, Zeroable, Subtarget); + + // If V1 failed, try to match shuffle against V2 shift. + if (ShiftAmt < 0) { + ShiftAmt = + matchVectorShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(), + Mask, Size, Zeroable, Subtarget); + V = V2; + } + + if (ShiftAmt < 0) + return SDValue(); + + assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) && + "Illegal integer vector type"); + V = DAG.getBitcast(ShiftVT, V); + V = DAG.getNode(Opcode, DL, ShiftVT, V, + DAG.getConstant(ShiftAmt, DL, MVT::i8)); + return DAG.getBitcast(VT, V); } /// \brief Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ. static SDValue lowerVectorShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask, + const SmallBitVector &Zeroable, SelectionDAG &DAG) { - SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); - assert(!Zeroable.all() && "Fully zeroable shuffle mask"); - int Size = Mask.size(); int HalfSize = Size / 2; assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size"); + assert(!Zeroable.all() && "Fully zeroable shuffle mask"); // Upper half must be undefined. if (!isUndefInRange(Mask, HalfSize, HalfSize)) @@ -8111,8 +9036,10 @@ static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend( InputV = ShuffleOffset(InputV); // For 256-bit vectors, we only need the lower (128-bit) input half. 
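matchVectorShuffleAsShift now returns a shift amount and the vector type to shift in, instead of building nodes, so the caller can test the mask first against V1 (MaskOffset 0) and then against V2 (MaskOffset Size). For each candidate (Shift, Scale, Left) triple the test is that the positions vacated by the shift are zeroable and the remaining positions form a sequential run into the chosen input. A compact standalone sketch of that per-candidate check; the zero-position handling is a reconstruction, and the caller is assumed to pass a Scale dividing the mask size with Shift < Scale:

    #include <vector>

    // Check one shift candidate: Mask entries are element indices with the
    // chosen input occupying [MaskOffset, MaskOffset + Size); Zeroable marks
    // lanes that may legally become zero.
    bool matchesShift(const std::vector<int> &Mask,
                      const std::vector<bool> &Zeroable, int MaskOffset,
                      int Shift, int Scale, bool Left) {
      const int Size = (int)Mask.size();
      for (int i = 0; i < Size; i += Scale) {
        // Positions the shift fills with zeros must be zeroable.
        for (int j = 0; j < Shift; ++j)
          if (!Zeroable[i + (Left ? j : Scale - 1 - j)])
            return false;
        // The surviving positions must be sequential (or undef).
        int Pos = Left ? i + Shift : i;
        int Low = (Left ? i : i + Shift) + MaskOffset;
        for (int j = 0, e = Scale - Shift; j < e; ++j) {
          int M = Mask[Pos + j];
          if (M >= 0 && M != Low + j)
            return false;
        }
      }
      return true;
    }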
- if (VT.is256BitVector()) - InputV = extract128BitVector(InputV, 0, DAG, DL); + // For 512-bit vectors, we only need the lower input half or quarter. + if (VT.getSizeInBits() > 128) + InputV = extractSubVector(InputV, 0, DAG, DL, + std::max(128, (int)VT.getSizeInBits() / Scale)); InputV = DAG.getNode(X86ISD::VZEXT, DL, ExtVT, InputV); return DAG.getBitcast(VT, InputV); @@ -8231,9 +9158,8 @@ static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend( /// are both incredibly common and often quite performance sensitive. static SDValue lowerVectorShuffleAsZeroOrAnyExtend( const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask, - const X86Subtarget &Subtarget, SelectionDAG &DAG) { - SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); - + const SmallBitVector &Zeroable, const X86Subtarget &Subtarget, + SelectionDAG &DAG) { int Bits = VT.getSizeInBits(); int NumLanes = Bits / 128; int NumElements = VT.getVectorNumElements(); @@ -8388,14 +9314,14 @@ static bool isShuffleFoldableLoad(SDValue V) { /// across all subtarget feature sets. static SDValue lowerVectorShuffleAsElementInsertion( const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask, - const X86Subtarget &Subtarget, SelectionDAG &DAG) { - SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); + const SmallBitVector &Zeroable, const X86Subtarget &Subtarget, + SelectionDAG &DAG) { MVT ExtVT = VT; MVT EltVT = VT.getVectorElementType(); - int V2Index = std::find_if(Mask.begin(), Mask.end(), - [&Mask](int M) { return M >= (int)Mask.size(); }) - - Mask.begin(); + int V2Index = + find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) - + Mask.begin(); bool IsV1Zeroable = true; for (int i = 0, Size = Mask.size(); i < Size; ++i) if (i != V2Index && !Zeroable[i]) { @@ -8709,6 +9635,13 @@ static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT, V = DAG.getBitcast(SrcVT, V); } + // 32-bit targets need to load i64 as a f64 and then bitcast the result. + if (!Subtarget.is64Bit() && SrcVT == MVT::i64) { + V = DAG.getBitcast(MVT::f64, V); + unsigned NumBroadcastElts = BroadcastVT.getVectorNumElements(); + BroadcastVT = MVT::getVectorVT(MVT::f64, NumBroadcastElts); + } + return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V)); } @@ -8726,71 +9659,93 @@ static bool matchVectorShuffleAsInsertPS(SDValue &V1, SDValue &V2, assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!"); assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!"); assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); - unsigned ZMask = 0; - int V1DstIndex = -1; - int V2DstIndex = -1; - bool V1UsedInPlace = false; - for (int i = 0; i < 4; ++i) { - // Synthesize a zero mask from the zeroable elements (includes undefs). - if (Zeroable[i]) { - ZMask |= 1 << i; - continue; - } + // Attempt to match INSERTPS with one element from VA or VB being + // inserted into VA (or undef). If successful, V1, V2 and InsertPSMask + // are updated. + auto matchAsInsertPS = [&](SDValue VA, SDValue VB, + ArrayRef<int> CandidateMask) { + unsigned ZMask = 0; + int VADstIndex = -1; + int VBDstIndex = -1; + bool VAUsedInPlace = false; + + for (int i = 0; i < 4; ++i) { + // Synthesize a zero mask from the zeroable elements (includes undefs). + if (Zeroable[i]) { + ZMask |= 1 << i; + continue; + } - // Flag if we use any V1 inputs in place. - if (i == Mask[i]) { - V1UsedInPlace = true; - continue; + // Flag if we use any VA inputs in place. 
+ if (i == CandidateMask[i]) { + VAUsedInPlace = true; + continue; + } + + // We can only insert a single non-zeroable element. + if (VADstIndex >= 0 || VBDstIndex >= 0) + return false; + + if (CandidateMask[i] < 4) { + // VA input out of place for insertion. + VADstIndex = i; + } else { + // VB input for insertion. + VBDstIndex = i; + } } - // We can only insert a single non-zeroable element. - if (V1DstIndex >= 0 || V2DstIndex >= 0) + // Don't bother if we have no (non-zeroable) element for insertion. + if (VADstIndex < 0 && VBDstIndex < 0) return false; - if (Mask[i] < 4) { - // V1 input out of place for insertion. - V1DstIndex = i; + // Determine element insertion src/dst indices. The src index is from the + // start of the inserted vector, not the start of the concatenated vector. + unsigned VBSrcIndex = 0; + if (VADstIndex >= 0) { + // If we have a VA input out of place, we use VA as the V2 element + // insertion and don't use the original V2 at all. + VBSrcIndex = CandidateMask[VADstIndex]; + VBDstIndex = VADstIndex; + VB = VA; } else { - // V2 input for insertion. - V2DstIndex = i; + VBSrcIndex = CandidateMask[VBDstIndex] - 4; } - } - // Don't bother if we have no (non-zeroable) element for insertion. - if (V1DstIndex < 0 && V2DstIndex < 0) - return false; + // If no V1 inputs are used in place, then the result is created only from + // the zero mask and the V2 insertion - so remove V1 dependency. + if (!VAUsedInPlace) + VA = DAG.getUNDEF(MVT::v4f32); - // Determine element insertion src/dst indices. The src index is from the - // start of the inserted vector, not the start of the concatenated vector. - unsigned V2SrcIndex = 0; - if (V1DstIndex >= 0) { - // If we have a V1 input out of place, we use V1 as the V2 element insertion - // and don't use the original V2 at all. - V2SrcIndex = Mask[V1DstIndex]; - V2DstIndex = V1DstIndex; - V2 = V1; - } else { - V2SrcIndex = Mask[V2DstIndex] - 4; - } + // Update V1, V2 and InsertPSMask accordingly. + V1 = VA; + V2 = VB; - // If no V1 inputs are used in place, then the result is created only from - // the zero mask and the V2 insertion - so remove V1 dependency. - if (!V1UsedInPlace) - V1 = DAG.getUNDEF(MVT::v4f32); + // Insert the V2 element into the desired position. + InsertPSMask = VBSrcIndex << 6 | VBDstIndex << 4 | ZMask; + assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!"); + return true; + }; - // Insert the V2 element into the desired position. - InsertPSMask = V2SrcIndex << 6 | V2DstIndex << 4 | ZMask; - assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!"); - return true; + if (matchAsInsertPS(V1, V2, Mask)) + return true; + + // Commute and try again. + SmallVector<int, 4> CommutedMask(Mask.begin(), Mask.end()); + ShuffleVectorSDNode::commuteMask(CommutedMask); + if (matchAsInsertPS(V2, V1, CommutedMask)) + return true; + + return false; } static SDValue lowerVectorShuffleAsInsertPS(const SDLoc &DL, SDValue V1, SDValue V2, ArrayRef<int> Mask, + const SmallBitVector &Zeroable, SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!"); - SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); // Attempt to match the insertps pattern. unsigned InsertPSMask; @@ -8922,6 +9877,7 @@ static SDValue lowerVectorShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT, /// it is better to avoid lowering through this for integer vectors where /// possible. 
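The reworked INSERTPS matcher above tries the mask as given and then a commuted copy, and in either case produces the standard INSERTPS control byte: bits 7:6 select the element of the inserted (second) operand, bits 5:4 the destination lane, and the low four bits zero destination lanes. A one-line encoder plus a worked value, with an invented helper name:

    #include <cstdint>

    // Encode an INSERTPS control byte: take element SrcIdx of the second
    // source, write it to element DstIdx of the result, and zero every result
    // element whose bit is set in the low four bits (ZMask).
    uint8_t insertPSImm(unsigned SrcIdx, unsigned DstIdx, unsigned ZMask) {
      return uint8_t((SrcIdx << 6) | (DstIdx << 4) | (ZMask & 0xF));
    }

    // Example: insert element 2 of V2 into lane 1 and zero lane 3:
    //   insertPSImm(2, 1, 1u << 3) == 0x98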
static SDValue lowerV2F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, + const SmallBitVector &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { @@ -8946,8 +9902,11 @@ static SDValue lowerV2F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, DAG.getConstant(SHUFPDMask, DL, MVT::i8)); } - return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V1, - DAG.getConstant(SHUFPDMask, DL, MVT::i8)); + return DAG.getNode( + X86ISD::SHUFP, DL, MVT::v2f64, + Mask[0] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1, + Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1, + DAG.getConstant(SHUFPDMask, DL, MVT::i8)); } assert(Mask[0] >= 0 && Mask[0] < 2 && "Non-canonicalized blend!"); assert(Mask[1] >= 2 && "Non-canonicalized blend!"); @@ -8955,14 +9914,14 @@ static SDValue lowerV2F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, // If we have a single input, insert that into V1 if we can do so cheaply. if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) { if (SDValue Insertion = lowerVectorShuffleAsElementInsertion( - DL, MVT::v2f64, V1, V2, Mask, Subtarget, DAG)) + DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Insertion; // Try inverting the insertion since for v2 masks it is easy to do and we // can't reliably sort the mask one way or the other. int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2), Mask[1] < 0 ? -1 : (Mask[1] ^ 2)}; if (SDValue Insertion = lowerVectorShuffleAsElementInsertion( - DL, MVT::v2f64, V2, V1, InverseMask, Subtarget, DAG)) + DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG)) return Insertion; } @@ -8980,7 +9939,7 @@ static SDValue lowerV2F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, if (Subtarget.hasSSE41()) if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask, - Subtarget, DAG)) + Zeroable, Subtarget, DAG)) return Blend; // Use dedicated unpack instructions for masks that match their pattern. @@ -9000,6 +9959,7 @@ static SDValue lowerV2F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, /// it falls back to the floating point shuffle operation with appropriate bit /// casting. static SDValue lowerV2I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, + const SmallBitVector &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { @@ -9052,19 +10012,19 @@ static SDValue lowerV2I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, // Try to use shift instructions. if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask, - Subtarget, DAG)) + Zeroable, Subtarget, DAG)) return Shift; // When loading a scalar and then shuffling it into a vector we can often do // the insertion cheaply. if (SDValue Insertion = lowerVectorShuffleAsElementInsertion( - DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG)) + DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Insertion; // Try inverting the insertion since for v2 masks it is easy to do and we // can't reliably sort the mask one way or the other. 
int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2}; if (SDValue Insertion = lowerVectorShuffleAsElementInsertion( - DL, MVT::v2i64, V2, V1, InverseMask, Subtarget, DAG)) + DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG)) return Insertion; // We have different paths for blend lowering, but they all must use the @@ -9072,7 +10032,7 @@ static SDValue lowerV2I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, bool IsBlendSupported = Subtarget.hasSSE41(); if (IsBlendSupported) if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask, - Subtarget, DAG)) + Zeroable, Subtarget, DAG)) return Blend; // Use dedicated unpack instructions for masks that match their pattern. @@ -9139,9 +10099,7 @@ static SDValue lowerVectorShuffleWithSHUFPS(const SDLoc &DL, MVT VT, int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; }); if (NumV2Elements == 1) { - int V2Index = - std::find_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }) - - Mask.begin(); + int V2Index = find_if(Mask, [](int M) { return M >= 4; }) - Mask.begin(); // Compute the index adjacent to V2Index and in the same half by toggling // the low bit. @@ -9220,6 +10178,7 @@ static SDValue lowerVectorShuffleWithSHUFPS(const SDLoc &DL, MVT VT, /// domain crossing penalties, as these are sufficient to implement all v4f32 /// shuffles. static SDValue lowerV4F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, + const SmallBitVector &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { @@ -9262,17 +10221,18 @@ static SDValue lowerV4F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, // when the V2 input is targeting element 0 of the mask -- that is the fast // case here. if (NumV2Elements == 1 && Mask[0] >= 4) - if (SDValue V = lowerVectorShuffleAsElementInsertion(DL, MVT::v4f32, V1, V2, - Mask, Subtarget, DAG)) + if (SDValue V = lowerVectorShuffleAsElementInsertion( + DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG)) return V; if (Subtarget.hasSSE41()) { if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask, - Subtarget, DAG)) + Zeroable, Subtarget, DAG)) return Blend; // Use INSERTPS if we can complete the shuffle efficiently. - if (SDValue V = lowerVectorShuffleAsInsertPS(DL, V1, V2, Mask, DAG)) + if (SDValue V = + lowerVectorShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG)) return V; if (!isSingleSHUFPSMask(Mask)) @@ -9301,6 +10261,7 @@ static SDValue lowerV4F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, /// We try to handle these with integer-domain shuffles where we can, but for /// blends we use the floating point domain blend instructions. static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, + const SmallBitVector &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { @@ -9311,8 +10272,8 @@ static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, // Whenever we can lower this as a zext, that instruction is strictly faster // than any alternative. It also allows us to fold memory operands into the // shuffle in many cases. - if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2, - Mask, Subtarget, DAG)) + if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend( + DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) return ZExt; int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; }); @@ -9341,13 +10302,13 @@ static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, // Try to use shift instructions. 
if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask, - Subtarget, DAG)) + Zeroable, Subtarget, DAG)) return Shift; // There are special ways we can lower some single-element blends. if (NumV2Elements == 1) - if (SDValue V = lowerVectorShuffleAsElementInsertion(DL, MVT::v4i32, V1, V2, - Mask, Subtarget, DAG)) + if (SDValue V = lowerVectorShuffleAsElementInsertion( + DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) return V; // We have different paths for blend lowering, but they all must use the @@ -9355,11 +10316,11 @@ static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, bool IsBlendSupported = Subtarget.hasSSE41(); if (IsBlendSupported) if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask, - Subtarget, DAG)) + Zeroable, Subtarget, DAG)) return Blend; - if (SDValue Masked = - lowerVectorShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask, DAG)) + if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask, + Zeroable, DAG)) return Masked; // Use dedicated unpack instructions for masks that match their pattern. @@ -9374,26 +10335,31 @@ static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG)) return Rotate; - // If we have direct support for blends, we should lower by decomposing into - // a permute. That will be faster than the domain cross. - if (IsBlendSupported) - return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i32, V1, V2, - Mask, DAG); - - // Try to lower by permuting the inputs into an unpack instruction. - if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(DL, MVT::v4i32, V1, - V2, Mask, DAG)) - return Unpack; + // Assume that a single SHUFPS is faster than an alternative sequence of + // multiple instructions (even if the CPU has a domain penalty). + // If some CPU is harmed by the domain switch, we can fix it in a later pass. + if (!isSingleSHUFPSMask(Mask)) { + // If we have direct support for blends, we should lower by decomposing into + // a permute. That will be faster than the domain cross. + if (IsBlendSupported) + return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i32, V1, V2, + Mask, DAG); + + // Try to lower by permuting the inputs into an unpack instruction. + if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack( + DL, MVT::v4i32, V1, V2, Mask, DAG)) + return Unpack; + } // We implement this with SHUFPS because it can blend from two vectors. // Because we're going to eventually use SHUFPS, we use SHUFPS even to build // up the inputs, bypassing domain shift penalties that we would encur if we // directly used PSHUFD on Nehalem and older. For newer chips, this isn't // relevant. - return DAG.getBitcast( - MVT::v4i32, - DAG.getVectorShuffle(MVT::v4f32, DL, DAG.getBitcast(MVT::v4f32, V1), - DAG.getBitcast(MVT::v4f32, V2), Mask)); + SDValue CastV1 = DAG.getBitcast(MVT::v4f32, V1); + SDValue CastV2 = DAG.getBitcast(MVT::v4f32, V2); + SDValue ShufPS = DAG.getVectorShuffle(MVT::v4f32, DL, CastV1, CastV2, Mask); + return DAG.getBitcast(MVT::v4i32, ShufPS); } /// \brief Lowering of single-input v8i16 shuffles is the cornerstone of SSE2 @@ -9551,18 +10517,15 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle( auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord, ArrayRef<int> Inputs) { int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot. 
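The new v4i32 heuristic above prefers a single cross-domain SHUFPS (through v4f32 bitcasts) over a longer integer sequence whenever the mask fits one SHUFPS. The fit condition is essentially that each half of the result draws from a single input, since SHUFPS selects its two low result elements from the first operand and its two high result elements from the second. A simplified stand-in for that check (not the isSingleSHUFPSMask implementation itself):

    #include <vector>

    // True if a 4-element two-input mask (0..3 = A, 4..7 = B, <0 = undef) can
    // be handled by one SHUFPS: result elements 0-1 must come from a single
    // input and elements 2-3 from a single input (operands may be commuted).
    bool fitsSingleShufps(const std::vector<int> &Mask) {
      auto sameSource = [](int M0, int M1) {
        return M0 < 0 || M1 < 0 || (M0 < 4) == (M1 < 4);
      };
      return Mask.size() == 4 && sameSource(Mask[0], Mask[1]) &&
             sameSource(Mask[2], Mask[3]);
    }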
- bool IsFixIdxInput = std::find(Inputs.begin(), Inputs.end(), - PinnedIdx ^ 1) != Inputs.end(); + bool IsFixIdxInput = is_contained(Inputs, PinnedIdx ^ 1); // Determine whether the free index is in the flipped dword or the // unflipped dword based on where the pinned index is. We use this bit // in an xor to conditionally select the adjacent dword. int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord)); - bool IsFixFreeIdxInput = std::find(Inputs.begin(), Inputs.end(), - FixFreeIdx) != Inputs.end(); + bool IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx); if (IsFixIdxInput == IsFixFreeIdxInput) FixFreeIdx += 1; - IsFixFreeIdxInput = std::find(Inputs.begin(), Inputs.end(), - FixFreeIdx) != Inputs.end(); + IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx); assert(IsFixIdxInput != IsFixFreeIdxInput && "We need to be changing the number of flipped inputs!"); int PSHUFHalfMask[] = {0, 1, 2, 3}; @@ -9734,9 +10697,8 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle( // by inputs being moved and *staying* in that half. if (IncomingInputs.size() == 1) { if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) { - int InputFixed = std::find(std::begin(SourceHalfMask), - std::end(SourceHalfMask), -1) - - std::begin(SourceHalfMask) + SourceOffset; + int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) + + SourceOffset; SourceHalfMask[InputFixed - SourceOffset] = IncomingInputs[0] - SourceOffset; std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0], @@ -9868,8 +10830,8 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle( /// blend if only one input is used. static SDValue lowerVectorShuffleAsBlendOfPSHUFBs( const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask, - SelectionDAG &DAG, bool &V1InUse, bool &V2InUse) { - SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); + const SmallBitVector &Zeroable, SelectionDAG &DAG, bool &V1InUse, + bool &V2InUse) { SDValue V1Mask[16]; SDValue V2Mask[16]; V1InUse = false; @@ -9929,6 +10891,7 @@ static SDValue lowerVectorShuffleAsBlendOfPSHUFBs( /// halves of the inputs separately (making them have relatively few inputs) /// and then concatenate them. static SDValue lowerV8I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, + const SmallBitVector &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { @@ -9939,7 +10902,7 @@ static SDValue lowerV8I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, // Whenever we can lower this as a zext, that instruction is strictly faster // than any alternative. if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend( - DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG)) + DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG)) return ZExt; int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; }); @@ -9952,7 +10915,7 @@ static SDValue lowerV8I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, // Try to use shift instructions. if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask, - Subtarget, DAG)) + Zeroable, Subtarget, DAG)) return Shift; // Use dedicated unpack instructions for masks that match their pattern. @@ -9978,18 +10941,19 @@ static SDValue lowerV8I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, // Try to use shift instructions. if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask, - Subtarget, DAG)) + Zeroable, Subtarget, DAG)) return Shift; // See if we can use SSE4A Extraction / Insertion. 
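lowerVectorShuffleAsBlendOfPSHUFBs, which now takes the precomputed Zeroable vector, builds one byte control per input: any byte that should come from the other input, or that may be zero, gets 0x80 so that input's PSHUFB writes zero there, and the two PSHUFB results are ORed together. The control construction as a standalone sketch (illustrative helper, byte-level mask assumed):

    #include <cstdint>
    #include <vector>

    // Build the two PSHUFB controls for a 16-byte two-input shuffle
    // (0..15 = A, 16..31 = B, <0 = undef).  Zeroable bytes stay 0x80 in both
    // controls; OR-ing PSHUFB(A, AMask) with PSHUFB(B, BMask) yields the
    // shuffled (and zero-filled) result.
    void blendOfPshufbControls(const std::vector<int> &Mask,
                               const std::vector<bool> &Zeroable,
                               std::vector<uint8_t> &AMask,
                               std::vector<uint8_t> &BMask) {
      AMask.assign(16, 0x80);
      BMask.assign(16, 0x80);
      for (int i = 0; i < 16; ++i) {
        int M = Mask[i];
        if (M < 0 || Zeroable[i])
          continue;                    // both controls keep 0x80, zero byte
        if (M < 16)
          AMask[i] = uint8_t(M);       // byte comes from A
        else
          BMask[i] = uint8_t(M - 16);  // byte comes from B
      }
    }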
if (Subtarget.hasSSE4A()) - if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask, DAG)) + if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask, + Zeroable, DAG)) return V; // There are special ways we can lower some single-element blends. if (NumV2Inputs == 1) - if (SDValue V = lowerVectorShuffleAsElementInsertion(DL, MVT::v8i16, V1, V2, - Mask, Subtarget, DAG)) + if (SDValue V = lowerVectorShuffleAsElementInsertion( + DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG)) return V; // We have different paths for blend lowering, but they all must use the @@ -9997,11 +10961,11 @@ static SDValue lowerV8I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, bool IsBlendSupported = Subtarget.hasSSE41(); if (IsBlendSupported) if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask, - Subtarget, DAG)) + Zeroable, Subtarget, DAG)) return Blend; - if (SDValue Masked = - lowerVectorShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask, DAG)) + if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask, + Zeroable, DAG)) return Masked; // Use dedicated unpack instructions for masks that match their pattern. @@ -10027,14 +10991,14 @@ static SDValue lowerV8I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, // can both shuffle and set up the inefficient blend. if (!IsBlendSupported && Subtarget.hasSSSE3()) { bool V1InUse, V2InUse; - return lowerVectorShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask, DAG, - V1InUse, V2InUse); + return lowerVectorShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask, + Zeroable, DAG, V1InUse, V2InUse); } // We can always bit-blend if we have to so the fallback strategy is to // decompose into single-input permutes and blends. return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i16, V1, V2, - Mask, DAG); + Mask, DAG); } /// \brief Check whether a compaction lowering can be done by dropping even @@ -10111,6 +11075,7 @@ static int canLowerByDroppingEvenElements(ArrayRef<int> Mask, /// the existing lowering for v8i16 blends on each half, finally PACK-ing them /// back together. static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, + const SmallBitVector &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { @@ -10120,7 +11085,7 @@ static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, // Try to use shift instructions. if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask, - Subtarget, DAG)) + Zeroable, Subtarget, DAG)) return Shift; // Try to use byte rotation instructions. @@ -10130,12 +11095,13 @@ static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, // Try to use a zext lowering. if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend( - DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG)) + DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG)) return ZExt; // See if we can use SSE4A Extraction / Insertion. 
if (Subtarget.hasSSE4A()) - if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask, DAG)) + if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask, + Zeroable, DAG)) return V; int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; }); @@ -10238,8 +11204,8 @@ static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, return V; } - if (SDValue Masked = - lowerVectorShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask, DAG)) + if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask, + Zeroable, DAG)) return Masked; // Use dedicated unpack instructions for masks that match their pattern. @@ -10265,15 +11231,15 @@ static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, bool V2InUse = false; SDValue PSHUFB = lowerVectorShuffleAsBlendOfPSHUFBs( - DL, MVT::v16i8, V1, V2, Mask, DAG, V1InUse, V2InUse); + DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse); // If both V1 and V2 are in use and we can use a direct blend or an unpack, // do so. This avoids using them to handle blends-with-zero which is // important as a single pshufb is significantly faster for that. if (V1InUse && V2InUse) { if (Subtarget.hasSSE41()) - if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i8, V1, V2, - Mask, Subtarget, DAG)) + if (SDValue Blend = lowerVectorShuffleAsBlend( + DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Blend; // We can use an unpack to do the blending rather than an or in some @@ -10294,8 +11260,8 @@ static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, // There are special ways we can lower some single-element blends. if (NumV2Elements == 1) - if (SDValue V = lowerVectorShuffleAsElementInsertion(DL, MVT::v16i8, V1, V2, - Mask, Subtarget, DAG)) + if (SDValue V = lowerVectorShuffleAsElementInsertion( + DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG)) return V; if (SDValue BitBlend = @@ -10349,22 +11315,18 @@ static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, // with a pack. SDValue V = V1; - int LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1}; - int HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1}; + std::array<int, 8> LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}}; + std::array<int, 8> HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}}; for (int i = 0; i < 16; ++i) if (Mask[i] >= 0) (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i]; - SDValue Zero = getZeroVector(MVT::v8i16, Subtarget, DAG, DL); - SDValue VLoHalf, VHiHalf; // Check if any of the odd lanes in the v16i8 are used. If not, we can mask // them out and avoid using UNPCK{L,H} to extract the elements of V as // i16s. - if (std::none_of(std::begin(LoBlendMask), std::end(LoBlendMask), - [](int M) { return M >= 0 && M % 2 == 1; }) && - std::none_of(std::begin(HiBlendMask), std::end(HiBlendMask), - [](int M) { return M >= 0 && M % 2 == 1; })) { + if (none_of(LoBlendMask, [](int M) { return M >= 0 && M % 2 == 1; }) && + none_of(HiBlendMask, [](int M) { return M >= 0 && M % 2 == 1; })) { // Use a mask to drop the high bytes. VLoHalf = DAG.getBitcast(MVT::v8i16, V); VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf, @@ -10383,6 +11345,8 @@ static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, } else { // Otherwise just unpack the low half of V into VLoHalf and the high half into // VHiHalf so that we can blend them as i16s. 
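The general v16i8 fallback above widens the problem to words: the 16-entry byte mask is split into low and high 8-entry word-shuffle masks, and if no odd source byte is referenced the zero extension of V can be done with a cheap AND mask instead of unpacking against zero. The split and the odd-byte test, mirrored as a standalone sketch:

    #include <array>
    #include <vector>

    // Split a single-input 16-entry byte shuffle into the two 8-entry word
    // shuffles used after zero-extending the low/high byte halves, and report
    // whether any odd source byte is referenced.
    void splitByteShuffle(const std::vector<int> &Mask,
                          std::array<int, 8> &LoMask,
                          std::array<int, 8> &HiMask, bool &UsesOddBytes) {
      LoMask.fill(-1);
      HiMask.fill(-1);
      UsesOddBytes = false;
      for (int i = 0; i < 16; ++i) {
        int M = Mask[i];
        if (M < 0)
          continue;
        (i < 8 ? LoMask[i] : HiMask[i % 8]) = M;
        UsesOddBytes |= (M % 2 == 1);
      }
    }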
+ SDValue Zero = getZeroVector(MVT::v16i8, Subtarget, DAG, DL); + VLoHalf = DAG.getBitcast( MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero)); VHiHalf = DAG.getBitcast( @@ -10401,83 +11365,28 @@ static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, /// dispatches to the lowering routines accordingly. static SDValue lower128BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT, SDValue V1, SDValue V2, + const SmallBitVector &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { switch (VT.SimpleTy) { case MVT::v2i64: - return lowerV2I64VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG); + return lowerV2I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v2f64: - return lowerV2F64VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG); + return lowerV2F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v4i32: - return lowerV4I32VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG); + return lowerV4I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v4f32: - return lowerV4F32VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG); + return lowerV4F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v8i16: - return lowerV8I16VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG); + return lowerV8I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v16i8: - return lowerV16I8VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG); + return lowerV16I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); default: llvm_unreachable("Unimplemented!"); } } -/// \brief Helper function to test whether a shuffle mask could be -/// simplified by widening the elements being shuffled. -/// -/// Appends the mask for wider elements in WidenedMask if valid. Otherwise -/// leaves it in an unspecified state. -/// -/// NOTE: This must handle normal vector shuffle masks and *target* vector -/// shuffle masks. The latter have the special property of a '-2' representing -/// a zero-ed lane of a vector. -static bool canWidenShuffleElements(ArrayRef<int> Mask, - SmallVectorImpl<int> &WidenedMask) { - WidenedMask.assign(Mask.size() / 2, 0); - for (int i = 0, Size = Mask.size(); i < Size; i += 2) { - // If both elements are undef, its trivial. - if (Mask[i] == SM_SentinelUndef && Mask[i + 1] == SM_SentinelUndef) { - WidenedMask[i/2] = SM_SentinelUndef; - continue; - } - - // Check for an undef mask and a mask value properly aligned to fit with - // a pair of values. If we find such a case, use the non-undef mask's value. - if (Mask[i] == SM_SentinelUndef && Mask[i + 1] >= 0 && Mask[i + 1] % 2 == 1) { - WidenedMask[i/2] = Mask[i + 1] / 2; - continue; - } - if (Mask[i + 1] == SM_SentinelUndef && Mask[i] >= 0 && Mask[i] % 2 == 0) { - WidenedMask[i/2] = Mask[i] / 2; - continue; - } - - // When zeroing, we need to spread the zeroing across both lanes to widen. - if (Mask[i] == SM_SentinelZero || Mask[i + 1] == SM_SentinelZero) { - if ((Mask[i] == SM_SentinelZero || Mask[i] == SM_SentinelUndef) && - (Mask[i + 1] == SM_SentinelZero || Mask[i + 1] == SM_SentinelUndef)) { - WidenedMask[i/2] = SM_SentinelZero; - continue; - } - return false; - } - - // Finally check if the two mask values are adjacent and aligned with - // a pair. - if (Mask[i] != SM_SentinelUndef && Mask[i] % 2 == 0 && Mask[i] + 1 == Mask[i + 1]) { - WidenedMask[i/2] = Mask[i] / 2; - continue; - } - - // Otherwise we can't safely widen the elements used in this shuffle. 
- return false; - } - assert(WidenedMask.size() == Mask.size() / 2 && - "Incorrect size of mask after widening the elements!"); - - return true; -} - /// \brief Generic routine to split vector shuffle into half-sized shuffles. /// /// This routine just extracts two subvectors, shuffles them independently, and @@ -10712,15 +11621,20 @@ static SDValue lowerVectorShuffleAsLanePermuteAndBlend(const SDLoc &DL, MVT VT, /// \brief Handle lowering 2-lane 128-bit shuffles. static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask, + const SmallBitVector &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { + SmallVector<int, 4> WidenedMask; + if (!canWidenShuffleElements(Mask, WidenedMask)) + return SDValue(); + // TODO: If minimizing size and one of the inputs is a zero vector and the // the zero vector has only one use, we could use a VPERM2X128 to save the // instruction bytes needed to explicitly generate the zero vector. // Blends are faster and handle all the non-lane-crossing cases. if (SDValue Blend = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask, - Subtarget, DAG)) + Zeroable, Subtarget, DAG)) return Blend; bool IsV1Zero = ISD::isBuildVectorAllZeros(V1.getNode()); @@ -10761,15 +11675,10 @@ static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1, // [6] - ignore // [7] - zero high half of destination - int MaskLO = Mask[0]; - if (MaskLO == SM_SentinelUndef) - MaskLO = Mask[1] == SM_SentinelUndef ? 0 : Mask[1]; - - int MaskHI = Mask[2]; - if (MaskHI == SM_SentinelUndef) - MaskHI = Mask[3] == SM_SentinelUndef ? 0 : Mask[3]; + int MaskLO = WidenedMask[0] < 0 ? 0 : WidenedMask[0]; + int MaskHI = WidenedMask[1] < 0 ? 0 : WidenedMask[1]; - unsigned PermMask = MaskLO / 2 | (MaskHI / 2) << 4; + unsigned PermMask = MaskLO | (MaskHI << 4); // If either input is a zero vector, replace it with an undef input. // Shuffle mask values < 4 are selecting elements of V1. @@ -10778,16 +11687,16 @@ static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1, // selecting the zero vector and setting the zero mask bit. if (IsV1Zero) { V1 = DAG.getUNDEF(VT); - if (MaskLO < 4) + if (MaskLO < 2) PermMask = (PermMask & 0xf0) | 0x08; - if (MaskHI < 4) + if (MaskHI < 2) PermMask = (PermMask & 0x0f) | 0x80; } if (IsV2Zero) { V2 = DAG.getUNDEF(VT); - if (MaskLO >= 4) + if (MaskLO >= 2) PermMask = (PermMask & 0xf0) | 0x08; - if (MaskHI >= 4) + if (MaskHI >= 2) PermMask = (PermMask & 0x0f) | 0x80; } @@ -11178,35 +12087,65 @@ static SDValue lowerShuffleAsRepeatedMaskAndLanePermute( SubLaneMask); } -static SDValue lowerVectorShuffleWithSHUFPD(const SDLoc &DL, MVT VT, - ArrayRef<int> Mask, SDValue V1, - SDValue V2, SelectionDAG &DAG) { +static bool matchVectorShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2, + unsigned &ShuffleImm, + ArrayRef<int> Mask) { + int NumElts = VT.getVectorNumElements(); + assert(VT.getScalarType() == MVT::f64 && + (NumElts == 2 || NumElts == 4 || NumElts == 8) && + "Unexpected data type for VSHUFPD"); // Mask for V8F64: 0/1, 8/9, 2/3, 10/11, 4/5, .. // Mask for V4F64; 0/1, 4/5, 2/3, 6/7.. 
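lowerV2X128VectorShuffle now bails out early unless the element mask widens to a two-entry mask over 128-bit halves, and then forms the VPERM2X128 immediate from the widened values directly: the low nibble selects the half (0-3 over the concatenated inputs) for the low destination half, the high nibble for the high half, and bits 3 and 7 zero a half when the corresponding input is known to be all zeros. A sketch of just the immediate, with an invented helper name:

    // Widened mask entries select 128-bit halves of the concatenation
    // (V1.lo = 0, V1.hi = 1, V2.lo = 2, V2.hi = 3); negative entries are undef
    // and default to half 0.  ZeroLoHalf/ZeroHiHalf fold a known-zero input
    // into the zeroing bits of the immediate.
    unsigned vperm2x128Imm(int WidenedLo, int WidenedHi, bool ZeroLoHalf,
                           bool ZeroHiHalf) {
      unsigned Imm = unsigned(WidenedLo < 0 ? 0 : WidenedLo) |
                     (unsigned(WidenedHi < 0 ? 0 : WidenedHi) << 4);
      if (ZeroLoHalf)
        Imm = (Imm & 0xF0) | 0x08;
      if (ZeroHiHalf)
        Imm = (Imm & 0x0F) | 0x80;
      return Imm;
    }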
- assert(VT.getScalarSizeInBits() == 64 && "Unexpected data type for VSHUFPD"); - int NumElts = VT.getVectorNumElements(); + ShuffleImm = 0; bool ShufpdMask = true; bool CommutableMask = true; - unsigned Immediate = 0; for (int i = 0; i < NumElts; ++i) { - if (Mask[i] < 0) + if (Mask[i] == SM_SentinelUndef) continue; + if (Mask[i] < 0) + return false; int Val = (i & 6) + NumElts * (i & 1); - int CommutVal = (i & 0xe) + NumElts * ((i & 1)^1); - if (Mask[i] < Val || Mask[i] > Val + 1) + int CommutVal = (i & 0xe) + NumElts * ((i & 1) ^ 1); + if (Mask[i] < Val || Mask[i] > Val + 1) ShufpdMask = false; - if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1) + if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1) CommutableMask = false; - Immediate |= (Mask[i] % 2) << i; + ShuffleImm |= (Mask[i] % 2) << i; } + if (ShufpdMask) - return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2, - DAG.getConstant(Immediate, DL, MVT::i8)); - if (CommutableMask) - return DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1, - DAG.getConstant(Immediate, DL, MVT::i8)); - return SDValue(); + return true; + if (CommutableMask) { + std::swap(V1, V2); + return true; + } + + return false; +} + +static SDValue lowerVectorShuffleWithSHUFPD(const SDLoc &DL, MVT VT, + ArrayRef<int> Mask, SDValue V1, + SDValue V2, SelectionDAG &DAG) { + unsigned Immediate = 0; + if (!matchVectorShuffleWithSHUFPD(VT, V1, V2, Immediate, Mask)) + return SDValue(); + + return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2, + DAG.getConstant(Immediate, DL, MVT::i8)); +} + +static SDValue lowerVectorShuffleWithPERMV(const SDLoc &DL, MVT VT, + ArrayRef<int> Mask, SDValue V1, + SDValue V2, SelectionDAG &DAG) { + MVT MaskEltVT = MVT::getIntegerVT(VT.getScalarSizeInBits()); + MVT MaskVecVT = MVT::getVectorVT(MaskEltVT, VT.getVectorNumElements()); + + SDValue MaskNode = getConstVector(Mask, MaskVecVT, DAG, DL, true); + if (V2.isUndef()) + return DAG.getNode(X86ISD::VPERMV, DL, VT, MaskNode, V1); + + return DAG.getNode(X86ISD::VPERMV3, DL, VT, V1, MaskNode, V2); } /// \brief Handle lowering of 4-lane 64-bit floating point shuffles. @@ -11214,6 +12153,7 @@ static SDValue lowerVectorShuffleWithSHUFPD(const SDLoc &DL, MVT VT, /// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2 /// isn't available. static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, + const SmallBitVector &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { @@ -11221,11 +12161,9 @@ static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!"); assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); - SmallVector<int, 4> WidenedMask; - if (canWidenShuffleElements(Mask, WidenedMask)) - if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4f64, V1, V2, Mask, - Subtarget, DAG)) - return V; + if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4f64, V1, V2, Mask, + Zeroable, Subtarget, DAG)) + return V; if (V2.isUndef()) { // Check for being able to broadcast a single element. @@ -11268,7 +12206,7 @@ static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, return V; if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask, - Subtarget, DAG)) + Zeroable, Subtarget, DAG)) return Blend; // Check if the blend happens to exactly fit that of SHUFPD. @@ -11280,7 +12218,7 @@ static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, // the results into the target lanes. 
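matchVectorShuffleWithSHUFPD generalizes the two-lane case to v4f64 and v8f64: result element i may be either element of its 128-bit pair, taken from V1 when i is even and from V2 when i is odd (Val = (i & 6) + NumElts * (i & 1) in the code), and bit i of the immediate records which element of the pair was chosen. If only the commuted form matches, the operands are swapped. The same matching, rewritten as a standalone sketch that treats every negative entry as undef:

    #include <vector>

    // Match a two-input f64 shuffle (0..N-1 = A, N..2N-1 = B, <0 = undef)
    // against SHUFPD and compute its immediate.  NeedCommute is set when only
    // the operand-swapped form matches.
    bool matchShufpd(const std::vector<int> &Mask, unsigned &Imm,
                     bool &NeedCommute) {
      const int NumElts = (int)Mask.size();
      bool Direct = true, Commuted = true;
      Imm = 0;
      for (int i = 0; i < NumElts; ++i) {
        int M = Mask[i];
        if (M < 0)
          continue;
        int Val = (i & 6) + NumElts * (i & 1);
        int CommutVal = (i & 0xe) + NumElts * ((i & 1) ^ 1);
        if (M < Val || M > Val + 1)
          Direct = false;
        if (M < CommutVal || M > CommutVal + 1)
          Commuted = false;
        Imm |= unsigned(M % 2) << i;
      }
      NeedCommute = !Direct && Commuted;
      return Direct || Commuted;
    }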
if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute( DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG)) - return V; + return V; // Try to simplify this by merging 128-bit lanes to enable a lane-based // shuffle. However, if we have AVX2 and either inputs are already in place, @@ -11291,6 +12229,11 @@ static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, if (SDValue Result = lowerVectorShuffleByMerging128BitLanes( DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG)) return Result; + // If we have VLX support, we can use VEXPAND. + if (Subtarget.hasVLX()) + if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v4f64, Zeroable, Mask, + V1, V2, DAG, Subtarget)) + return V; // If we have AVX2 then we always want to lower with a blend because an v4 we // can fully permute the elements. @@ -11307,6 +12250,7 @@ static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, /// This routine is only called when we have AVX2 and thus a reasonable /// instruction set for v4i64 shuffling.. static SDValue lowerV4I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, + const SmallBitVector &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { @@ -11315,14 +12259,12 @@ static SDValue lowerV4I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!"); - SmallVector<int, 4> WidenedMask; - if (canWidenShuffleElements(Mask, WidenedMask)) - if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4i64, V1, V2, Mask, - Subtarget, DAG)) - return V; + if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4i64, V1, V2, Mask, + Zeroable, Subtarget, DAG)) + return V; if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask, - Subtarget, DAG)) + Zeroable, Subtarget, DAG)) return Blend; // Check for being able to broadcast a single element. @@ -11352,9 +12294,25 @@ static SDValue lowerV4I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, // Try to use shift instructions. if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask, - Subtarget, DAG)) + Zeroable, Subtarget, DAG)) return Shift; + // If we have VLX support, we can use VALIGN or VEXPAND. + if (Subtarget.hasVLX()) { + if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v4i64, V1, V2, + Mask, Subtarget, DAG)) + return Rotate; + + if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v4i64, Zeroable, Mask, + V1, V2, DAG, Subtarget)) + return V; + } + + // Try to use PALIGNR. + if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v4i64, V1, V2, + Mask, Subtarget, DAG)) + return Rotate; + // Use dedicated unpack instructions for masks that match their pattern. if (SDValue V = lowerVectorShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG)) @@ -11364,8 +12322,8 @@ static SDValue lowerV4I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, // shuffle. However, if we have AVX2 and either inputs are already in place, // we will be able to shuffle even across lanes the other input in a single // instruction so skip this pattern. 
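The new VEXPAND paths for v4f64 and v4i64 (and the wider types further down) feed the expand instruction a write-mask that is simply the complement of the zeroable lanes, which is what convertBitVectorToUnsiged earlier in this diff computes. As a minimal sketch:

    #include <vector>

    // Pack not(Zeroable) into an integer write-mask: bit i is set for lanes
    // that receive an expanded element, clear for lanes that stay zero.
    unsigned expandWriteMask(const std::vector<bool> &Zeroable) {
      unsigned Bits = 0;
      for (unsigned i = 0, e = (unsigned)Zeroable.size(); i != e; ++i)
        Bits |= unsigned(!Zeroable[i]) << i;
      return Bits;
    }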
- if (!(Subtarget.hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) || - isShuffleMaskInputInPlace(1, Mask)))) + if (!isShuffleMaskInputInPlace(0, Mask) && + !isShuffleMaskInputInPlace(1, Mask)) if (SDValue Result = lowerVectorShuffleByMerging128BitLanes( DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG)) return Result; @@ -11380,6 +12338,7 @@ static SDValue lowerV4I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, /// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2 /// isn't available. static SDValue lowerV8F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, + const SmallBitVector &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { @@ -11388,7 +12347,7 @@ static SDValue lowerV8F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask, - Subtarget, DAG)) + Zeroable, Subtarget, DAG)) return Blend; // Check for being able to broadcast a single element. @@ -11432,17 +12391,12 @@ static SDValue lowerV8F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, // If we have a single input shuffle with different shuffle patterns in the // two 128-bit lanes use the variable mask to VPERMILPS. if (V2.isUndef()) { - SDValue VPermMask[8]; - for (int i = 0; i < 8; ++i) - VPermMask[i] = Mask[i] < 0 ? DAG.getUNDEF(MVT::i32) - : DAG.getConstant(Mask[i], DL, MVT::i32); + SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true); if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask)) - return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, - DAG.getBuildVector(MVT::v8i32, DL, VPermMask)); + return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, VPermMask); if (Subtarget.hasAVX2()) - return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, - DAG.getBuildVector(MVT::v8i32, DL, VPermMask), V1); + return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1); // Otherwise, fall back. return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v8f32, V1, V2, Mask, @@ -11454,6 +12408,11 @@ static SDValue lowerV8F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, if (SDValue Result = lowerVectorShuffleByMerging128BitLanes( DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG)) return Result; + // If we have VLX support, we can use VEXPAND. + if (Subtarget.hasVLX()) + if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8f32, Zeroable, Mask, + V1, V2, DAG, Subtarget)) + return V; // If we have AVX2 then we always want to lower with a blend because at v8 we // can fully permute the elements. @@ -11470,6 +12429,7 @@ static SDValue lowerV8F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, /// This routine is only called when we have AVX2 and thus a reasonable /// instruction set for v8i32 shuffling.. static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, + const SmallBitVector &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { @@ -11481,12 +12441,12 @@ static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, // Whenever we can lower this as a zext, that instruction is strictly faster // than any alternative. It also allows us to fold memory operands into the // shuffle in many cases. 
- if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, - Mask, Subtarget, DAG)) + if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend( + DL, MVT::v8i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) return ZExt; if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask, - Subtarget, DAG)) + Zeroable, Subtarget, DAG)) return Blend; // Check for being able to broadcast a single element. @@ -11498,7 +12458,9 @@ static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, // efficient instructions that mirror the shuffles across the two 128-bit // lanes. SmallVector<int, 4> RepeatedMask; - if (is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask)) { + bool Is128BitLaneRepeatedShuffle = + is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask); + if (Is128BitLaneRepeatedShuffle) { assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!"); if (V2.isUndef()) return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1, @@ -11512,16 +12474,27 @@ static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, // Try to use shift instructions. if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask, - Subtarget, DAG)) + Zeroable, Subtarget, DAG)) return Shift; + // If we have VLX support, we can use VALIGN or EXPAND. + if (Subtarget.hasVLX()) { + if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v8i32, V1, V2, + Mask, Subtarget, DAG)) + return Rotate; + + if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8i32, Zeroable, Mask, + V1, V2, DAG, Subtarget)) + return V; + } + // Try to use byte rotation instructions. if (SDValue Rotate = lowerVectorShuffleAsByteRotate( DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG)) return Rotate; // Try to create an in-lane repeating shuffle mask and then shuffle the - // the results into the target lanes. + // results into the target lanes. if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute( DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG)) return V; @@ -11529,12 +12502,19 @@ static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, // If the shuffle patterns aren't repeated but it is a single input, directly // generate a cross-lane VPERMD instruction. if (V2.isUndef()) { - SDValue VPermMask[8]; - for (int i = 0; i < 8; ++i) - VPermMask[i] = Mask[i] < 0 ? DAG.getUNDEF(MVT::i32) - : DAG.getConstant(Mask[i], DL, MVT::i32); - return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, - DAG.getBuildVector(MVT::v8i32, DL, VPermMask), V1); + SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true); + return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1); + } + + // Assume that a single SHUFPS is faster than an alternative sequence of + // multiple instructions (even if the CPU has a domain penalty). + // If some CPU is harmed by the domain switch, we can fix it in a later pass. + if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) { + SDValue CastV1 = DAG.getBitcast(MVT::v8f32, V1); + SDValue CastV2 = DAG.getBitcast(MVT::v8f32, V2); + SDValue ShufPS = lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, + CastV1, CastV2, DAG); + return DAG.getBitcast(MVT::v8i32, ShufPS); } // Try to simplify this by merging 128-bit lanes to enable a lane-based @@ -11553,6 +12533,7 @@ static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, /// This routine is only called when we have AVX2 and thus a reasonable /// instruction set for v16i16 shuffling.. 
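The new v8i32 block above intentionally crosses the integer/float domain: it bitcasts both inputs to v8f32, emits a single SHUFPS, and bitcasts back, betting that one cross-domain instruction beats a longer integer sequence. The same idea at the intrinsics level (a sketch with an illustrative name, for a mask that repeats {0, 2, 4, 6} in each 128-bit lane; the casts compile to nothing):

#include <immintrin.h>

__m256i evenElements(__m256i A, __m256i B) {
  __m256 FA = _mm256_castsi256_ps(A);
  __m256 FB = _mm256_castsi256_ps(B);
  __m256 R  = _mm256_shuffle_ps(FA, FB, _MM_SHUFFLE(2, 0, 2, 0));
  return _mm256_castps_si256(R);             // per lane: {a0, a2, b0, b2}
}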
static SDValue lowerV16I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, + const SmallBitVector &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { @@ -11564,8 +12545,8 @@ static SDValue lowerV16I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, // Whenever we can lower this as a zext, that instruction is strictly faster // than any alternative. It also allows us to fold memory operands into the // shuffle in many cases. - if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v16i16, V1, V2, - Mask, Subtarget, DAG)) + if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend( + DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG)) return ZExt; // Check for being able to broadcast a single element. @@ -11574,7 +12555,7 @@ static SDValue lowerV16I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, return Broadcast; if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask, - Subtarget, DAG)) + Zeroable, Subtarget, DAG)) return Blend; // Use dedicated unpack instructions for masks that match their pattern. @@ -11584,7 +12565,7 @@ static SDValue lowerV16I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, // Try to use shift instructions. if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask, - Subtarget, DAG)) + Zeroable, Subtarget, DAG)) return Shift; // Try to use byte rotation instructions. @@ -11615,10 +12596,14 @@ static SDValue lowerV16I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, } } - if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(DL, MVT::v16i16, Mask, V1, - V2, Subtarget, DAG)) + if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB( + DL, MVT::v16i16, Mask, V1, V2, Zeroable, Subtarget, DAG)) return PSHUFB; + // AVX512BWVL can lower to VPERMW. + if (Subtarget.hasBWI() && Subtarget.hasVLX()) + return lowerVectorShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, DAG); + // Try to simplify this by merging 128-bit lanes to enable a lane-based // shuffle. if (SDValue Result = lowerVectorShuffleByMerging128BitLanes( @@ -11634,6 +12619,7 @@ static SDValue lowerV16I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, /// This routine is only called when we have AVX2 and thus a reasonable /// instruction set for v32i8 shuffling.. static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, + const SmallBitVector &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { @@ -11645,8 +12631,8 @@ static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, // Whenever we can lower this as a zext, that instruction is strictly faster // than any alternative. It also allows us to fold memory operands into the // shuffle in many cases. - if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v32i8, V1, V2, - Mask, Subtarget, DAG)) + if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend( + DL, MVT::v32i8, V1, V2, Mask, Zeroable, Subtarget, DAG)) return ZExt; // Check for being able to broadcast a single element. @@ -11655,7 +12641,7 @@ static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, return Broadcast; if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask, - Subtarget, DAG)) + Zeroable, Subtarget, DAG)) return Blend; // Use dedicated unpack instructions for masks that match their pattern. @@ -11665,7 +12651,7 @@ static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, // Try to use shift instructions. 
if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask, - Subtarget, DAG)) + Zeroable, Subtarget, DAG)) return Shift; // Try to use byte rotation instructions. @@ -11685,8 +12671,8 @@ static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2, Mask, DAG); - if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(DL, MVT::v32i8, Mask, V1, - V2, Subtarget, DAG)) + if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB( + DL, MVT::v32i8, Mask, V1, V2, Zeroable, Subtarget, DAG)) return PSHUFB; // Try to simplify this by merging 128-bit lanes to enable a lane-based @@ -11706,6 +12692,7 @@ static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, /// together based on the available instructions. static SDValue lower256BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT, SDValue V1, SDValue V2, + const SmallBitVector &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { // If we have a single input to the zero element, insert that into V1 if we @@ -11715,7 +12702,7 @@ static SDValue lower256BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, if (NumV2Elements == 1 && Mask[0] >= NumElts) if (SDValue Insertion = lowerVectorShuffleAsElementInsertion( - DL, VT, V1, V2, Mask, Subtarget, DAG)) + DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Insertion; // Handle special cases where the lower or upper half is UNDEF. @@ -11734,7 +12721,8 @@ static SDValue lower256BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, if (ElementBits < 32) { // No floating point type available, if we can't use the bit operations // for masking/blending then decompose into 128-bit vectors. - if (SDValue V = lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, DAG)) + if (SDValue V = + lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, DAG)) return V; if (SDValue V = lowerVectorShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG)) return V; @@ -11750,17 +12738,17 @@ static SDValue lower256BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, switch (VT.SimpleTy) { case MVT::v4f64: - return lowerV4F64VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG); + return lowerV4F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v4i64: - return lowerV4I64VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG); + return lowerV4I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v8f32: - return lowerV8F32VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG); + return lowerV8F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v8i32: - return lowerV8I32VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG); + return lowerV8I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v16i16: - return lowerV16I16VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG); + return lowerV16I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v32i8: - return lowerV32I8VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG); + return lowerV32I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); default: llvm_unreachable("Not a valid 256-bit x86 vector type!"); @@ -11782,57 +12770,81 @@ static SDValue lowerV4X128VectorShuffle(const SDLoc &DL, MVT VT, if (!canWidenShuffleElements(Mask, WidenedMask)) return SDValue(); + // Check for patterns which can be matched with a single insert of a 256-bit + // subvector. 
+ bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask, + {0, 1, 2, 3, 0, 1, 2, 3}); + if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask, + {0, 1, 2, 3, 8, 9, 10, 11})) { + MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4); + SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1, + DAG.getIntPtrConstant(0, DL)); + SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, + OnlyUsesV1 ? V1 : V2, + DAG.getIntPtrConstant(0, DL)); + return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV); + } + + assert(WidenedMask.size() == 4); + + // See if this is an insertion of the lower 128-bits of V2 into V1. + bool IsInsert = true; + int V2Index = -1; + for (int i = 0; i < 4; ++i) { + assert(WidenedMask[i] >= -1); + if (WidenedMask[i] < 0) + continue; + + // Make sure all V1 subvectors are in place. + if (WidenedMask[i] < 4) { + if (WidenedMask[i] != i) { + IsInsert = false; + break; + } + } else { + // Make sure we only have a single V2 index and its the lowest 128-bits. + if (V2Index >= 0 || WidenedMask[i] != 4) { + IsInsert = false; + break; + } + V2Index = i; + } + } + if (IsInsert && V2Index >= 0) { + MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2); + SDValue Subvec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2, + DAG.getIntPtrConstant(0, DL)); + return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL); + } + + // Try to lower to to vshuf64x2/vshuf32x4. SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)}; + unsigned PermMask = 0; // Insure elements came from the same Op. - int MaxOp1Index = VT.getVectorNumElements()/2 - 1; - for (int i = 0, Size = WidenedMask.size(); i < Size; ++i) { - if (WidenedMask[i] == SM_SentinelZero) - return SDValue(); - if (WidenedMask[i] == SM_SentinelUndef) + for (int i = 0; i < 4; ++i) { + assert(WidenedMask[i] >= -1); + if (WidenedMask[i] < 0) continue; - SDValue Op = WidenedMask[i] > MaxOp1Index ? V2 : V1; - unsigned OpIndex = (i < Size/2) ? 0 : 1; + SDValue Op = WidenedMask[i] >= 4 ? V2 : V1; + unsigned OpIndex = i / 2; if (Ops[OpIndex].isUndef()) Ops[OpIndex] = Op; else if (Ops[OpIndex] != Op) return SDValue(); - } - - // Form a 128-bit permutation. - // Convert the 64-bit shuffle mask selection values into 128-bit selection - // bits defined by a vshuf64x2 instruction's immediate control byte. - unsigned PermMask = 0, Imm = 0; - unsigned ControlBitsNum = WidenedMask.size() / 2; - for (int i = 0, Size = WidenedMask.size(); i < Size; ++i) { - // Use first element in place of undef mask. - Imm = (WidenedMask[i] == SM_SentinelUndef) ? 0 : WidenedMask[i]; - PermMask |= (Imm % WidenedMask.size()) << (i * ControlBitsNum); + // Convert the 128-bit shuffle mask selection values into 128-bit selection + // bits defined by a vshuf64x2 instruction's immediate control byte. 
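The control byte assembled above packs two bits per 128-bit destination slot. A plain-C++ sketch of just that packing (illustrative name; the real code additionally checks that each half of the result reads from a single operand via Ops[]):

#include <array>
#include <cstdint>

uint8_t shuf128Immediate(const std::array<int, 4> &WidenedMask) {
  uint8_t PermMask = 0;
  for (int i = 0; i < 4; ++i)
    if (WidenedMask[i] >= 0)                 // undef slots leave their bits 0
      PermMask |= (WidenedMask[i] % 4) << (i * 2);
  return PermMask;  // e.g. {0, 1, 4, 5} (low halves of V1 then V2) -> 0x44
}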
+ PermMask |= (WidenedMask[i] % 4) << (i * 2); } return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1], DAG.getConstant(PermMask, DL, MVT::i8)); } -static SDValue lowerVectorShuffleWithPERMV(const SDLoc &DL, MVT VT, - ArrayRef<int> Mask, SDValue V1, - SDValue V2, SelectionDAG &DAG) { - - assert(VT.getScalarSizeInBits() >= 16 && "Unexpected data type for PERMV"); - - MVT MaskEltVT = MVT::getIntegerVT(VT.getScalarSizeInBits()); - MVT MaskVecVT = MVT::getVectorVT(MaskEltVT, VT.getVectorNumElements()); - - SDValue MaskNode = getConstVector(Mask, MaskVecVT, DAG, DL, true); - if (V2.isUndef()) - return DAG.getNode(X86ISD::VPERMV, DL, VT, MaskNode, V1); - - return DAG.getNode(X86ISD::VPERMV3, DL, VT, V1, MaskNode, V2); -} - /// \brief Handle lowering of 8-lane 64-bit floating point shuffles. static SDValue lowerV8F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, + const SmallBitVector &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { @@ -11875,11 +12887,16 @@ static SDValue lowerV8F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, lowerVectorShuffleWithSHUFPD(DL, MVT::v8f64, Mask, V1, V2, DAG)) return Op; + if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8f64, Zeroable, Mask, V1, + V2, DAG, Subtarget)) + return V; + return lowerVectorShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, DAG); } /// \brief Handle lowering of 16-lane 32-bit floating point shuffles. static SDValue lowerV16F32VectorShuffle(SDLoc DL, ArrayRef<int> Mask, + const SmallBitVector &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { @@ -11911,12 +12928,17 @@ static SDValue lowerV16F32VectorShuffle(SDLoc DL, ArrayRef<int> Mask, // Otherwise, fall back to a SHUFPS sequence. return lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG); } + // If we have AVX512F support, we can use VEXPAND. + if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v16f32, Zeroable, Mask, + V1, V2, DAG, Subtarget)) + return V; return lowerVectorShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, DAG); } /// \brief Handle lowering of 8-lane 64-bit integer shuffles. static SDValue lowerV8I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, + const SmallBitVector &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { @@ -11951,18 +12973,33 @@ static SDValue lowerV8I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, // Try to use shift instructions. if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask, - Subtarget, DAG)) + Zeroable, Subtarget, DAG)) return Shift; + // Try to use VALIGN. + if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v8i64, V1, V2, + Mask, Subtarget, DAG)) + return Rotate; + + // Try to use PALIGNR. + if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i64, V1, V2, + Mask, Subtarget, DAG)) + return Rotate; + if (SDValue Unpck = lowerVectorShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG)) return Unpck; + // If we have AVX512F support, we can use VEXPAND. + if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8i64, Zeroable, Mask, V1, + V2, DAG, Subtarget)) + return V; return lowerVectorShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, DAG); } /// \brief Handle lowering of 16-lane 32-bit integer shuffles. 
static SDValue lowerV16I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, + const SmallBitVector &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { @@ -11970,11 +13007,20 @@ static SDValue lowerV16I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!"); assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!"); + // Whenever we can lower this as a zext, that instruction is strictly faster + // than any alternative. It also allows us to fold memory operands into the + // shuffle in many cases. + if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend( + DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) + return ZExt; + // If the shuffle mask is repeated in each 128-bit lane we can use more // efficient instructions that mirror the shuffles across the four 128-bit // lanes. SmallVector<int, 4> RepeatedMask; - if (is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask)) { + bool Is128BitLaneRepeatedShuffle = + is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask); + if (Is128BitLaneRepeatedShuffle) { assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!"); if (V2.isUndef()) return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1, @@ -11988,20 +13034,40 @@ static SDValue lowerV16I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, // Try to use shift instructions. if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask, - Subtarget, DAG)) + Zeroable, Subtarget, DAG)) return Shift; + // Try to use VALIGN. + if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v16i32, V1, V2, + Mask, Subtarget, DAG)) + return Rotate; + // Try to use byte rotation instructions. if (Subtarget.hasBWI()) if (SDValue Rotate = lowerVectorShuffleAsByteRotate( DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG)) return Rotate; + // Assume that a single SHUFPS is faster than using a permv shuffle. + // If some CPU is harmed by the domain switch, we can fix it in a later pass. + if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) { + SDValue CastV1 = DAG.getBitcast(MVT::v16f32, V1); + SDValue CastV2 = DAG.getBitcast(MVT::v16f32, V2); + SDValue ShufPS = lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, + CastV1, CastV2, DAG); + return DAG.getBitcast(MVT::v16i32, ShufPS); + } + // If we have AVX512F support, we can use VEXPAND. + if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v16i32, Zeroable, Mask, + V1, V2, DAG, Subtarget)) + return V; + return lowerVectorShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, DAG); } /// \brief Handle lowering of 32-lane 16-bit integer shuffles. static SDValue lowerV32I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, + const SmallBitVector &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { @@ -12010,6 +13076,13 @@ static SDValue lowerV32I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!"); assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!"); + // Whenever we can lower this as a zext, that instruction is strictly faster + // than any alternative. It also allows us to fold memory operands into the + // shuffle in many cases. + if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend( + DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG)) + return ZExt; + // Use dedicated unpack instructions for masks that match their pattern. 
if (SDValue V = lowerVectorShuffleWithUNPCK(DL, MVT::v32i16, Mask, V1, V2, DAG)) @@ -12017,7 +13090,7 @@ static SDValue lowerV32I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, // Try to use shift instructions. if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask, - Subtarget, DAG)) + Zeroable, Subtarget, DAG)) return Shift; // Try to use byte rotation instructions. @@ -12041,6 +13114,7 @@ static SDValue lowerV32I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, /// \brief Handle lowering of 64-lane 8-bit integer shuffles. static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, + const SmallBitVector &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { @@ -12049,6 +13123,13 @@ static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!"); assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!"); + // Whenever we can lower this as a zext, that instruction is strictly faster + // than any alternative. It also allows us to fold memory operands into the + // shuffle in many cases. + if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend( + DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG)) + return ZExt; + // Use dedicated unpack instructions for masks that match their pattern. if (SDValue V = lowerVectorShuffleWithUNPCK(DL, MVT::v64i8, Mask, V1, V2, DAG)) @@ -12056,7 +13137,7 @@ static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, // Try to use shift instructions. if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask, - Subtarget, DAG)) + Zeroable, Subtarget, DAG)) return Shift; // Try to use byte rotation instructions. @@ -12064,10 +13145,20 @@ static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG)) return Rotate; - if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(DL, MVT::v64i8, Mask, V1, - V2, Subtarget, DAG)) + if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB( + DL, MVT::v64i8, Mask, V1, V2, Zeroable, Subtarget, DAG)) return PSHUFB; + // VBMI can use VPERMV/VPERMV3 byte shuffles. + if (Subtarget.hasVBMI()) + return lowerVectorShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, DAG); + + // Try to create an in-lane repeating shuffle mask and then shuffle the + // the results into the target lanes. + if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute( + DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG)) + return V; + // FIXME: Implement direct support for this type! return splitAndLowerVectorShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG); } @@ -12079,11 +13170,22 @@ static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, /// together based on the available instructions. static SDValue lower512BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT, SDValue V1, SDValue V2, + const SmallBitVector &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(Subtarget.hasAVX512() && "Cannot lower 512-bit vectors w/ basic ISA!"); + // If we have a single input to the zero element, insert that into V1 if we + // can do so cheaply. 
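The VBMI fall-through added above relies on the byte-granular permutes that AVX-512 VBMI provides. A minimal intrinsics sketch of the two forms it maps to (requires a VBMI target; illustrative names):

#include <immintrin.h>

// Single-source byte permute (VPERMB): Dst[i] = V[Idx[i] & 63].
__m512i permBytes(__m512i V, __m512i Idx) {
  return _mm512_permutexvar_epi8(Idx, V);
}

// Two-source byte permute (VPERMT2B/VPERMI2B): bit 6 of each index picks A or B.
__m512i permBytes2(__m512i A, __m512i Idx, __m512i B) {
  return _mm512_permutex2var_epi8(A, Idx, B);
}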
+ int NumElts = Mask.size(); + int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; }); + + if (NumV2Elements == 1 && Mask[0] >= NumElts) + if (SDValue Insertion = lowerVectorShuffleAsElementInsertion( + DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG)) + return Insertion; + // Check for being able to broadcast a single element. if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, VT, V1, V2, Mask, Subtarget, DAG)) @@ -12095,17 +13197,17 @@ static SDValue lower512BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, // the requisite ISA extensions for that element type are available. switch (VT.SimpleTy) { case MVT::v8f64: - return lowerV8F64VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG); + return lowerV8F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v16f32: - return lowerV16F32VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG); + return lowerV16F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v8i64: - return lowerV8I64VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG); + return lowerV8I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v16i32: - return lowerV16I32VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG); + return lowerV16I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v32i16: - return lowerV32I16VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG); + return lowerV32I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v64i8: - return lowerV64I8VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG); + return lowerV64I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); default: llvm_unreachable("Not a valid 512-bit x86 vector type!"); @@ -12161,9 +13263,81 @@ static SDValue lower1BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, V2 = getOnesVector(ExtVT, Subtarget, DAG, DL); else V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2); - return DAG.getNode(ISD::TRUNCATE, DL, VT, - DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask)); + + SDValue Shuffle = DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask); + // i1 was sign extended we can use X86ISD::CVT2MASK. + int NumElems = VT.getVectorNumElements(); + if ((Subtarget.hasBWI() && (NumElems >= 32)) || + (Subtarget.hasDQI() && (NumElems < 32))) + return DAG.getNode(X86ISD::CVT2MASK, DL, VT, Shuffle); + + return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle); } + +/// Helper function that returns true if the shuffle mask should be +/// commuted to improve canonicalization. +static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) { + int NumElements = Mask.size(); + + int NumV1Elements = 0, NumV2Elements = 0; + for (int M : Mask) + if (M < 0) + continue; + else if (M < NumElements) + ++NumV1Elements; + else + ++NumV2Elements; + + // Commute the shuffle as needed such that more elements come from V1 than + // V2. This allows us to match the shuffle pattern strictly on how many + // elements come from V1 without handling the symmetric cases. + if (NumV2Elements > NumV1Elements) + return true; + + assert(NumV1Elements > 0 && "No V1 indices"); + + if (NumV2Elements == 0) + return false; + + // When the number of V1 and V2 elements are the same, try to minimize the + // number of uses of V2 in the low half of the vector. When that is tied, + // ensure that the sum of indices for V1 is equal to or lower than the sum + // indices for V2. When those are equal, try to ensure that the number of odd + // indices for V1 is lower than the number of odd indices for V2. 
+ if (NumV1Elements == NumV2Elements) { + int LowV1Elements = 0, LowV2Elements = 0; + for (int M : Mask.slice(0, NumElements / 2)) + if (M >= NumElements) + ++LowV2Elements; + else if (M >= 0) + ++LowV1Elements; + if (LowV2Elements > LowV1Elements) + return true; + if (LowV2Elements == LowV1Elements) { + int SumV1Indices = 0, SumV2Indices = 0; + for (int i = 0, Size = Mask.size(); i < Size; ++i) + if (Mask[i] >= NumElements) + SumV2Indices += i; + else if (Mask[i] >= 0) + SumV1Indices += i; + if (SumV2Indices < SumV1Indices) + return true; + if (SumV2Indices == SumV1Indices) { + int NumV1OddIndices = 0, NumV2OddIndices = 0; + for (int i = 0, Size = Mask.size(); i < Size; ++i) + if (Mask[i] >= NumElements) + NumV2OddIndices += i % 2; + else if (Mask[i] >= 0) + NumV1OddIndices += i % 2; + if (NumV2OddIndices < NumV1OddIndices) + return true; + } + } + } + + return false; +} + /// \brief Top-level lowering for x86 vector shuffles. /// /// This handles decomposition, canonicalization, and lowering of all x86 @@ -12209,6 +13383,12 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget, return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask); } + // Check for illegal shuffle mask element index values. + int MaskUpperLimit = Mask.size() * (V2IsUndef ? 1 : 2); (void)MaskUpperLimit; + assert(llvm::all_of(Mask, + [&](int M) { return -1 <= M && M < MaskUpperLimit; }) && + "Out of bounds shuffle index"); + // We actually see shuffles that are entirely re-arrangements of a set of // zero inputs. This mostly happens while decomposing complex shuffles into // simple ones. Directly lower these as a buildvector of zeros. @@ -12237,69 +13417,22 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget, } } - int NumV1Elements = 0, NumUndefElements = 0, NumV2Elements = 0; - for (int M : Mask) - if (M < 0) - ++NumUndefElements; - else if (M < NumElements) - ++NumV1Elements; - else - ++NumV2Elements; - - // Commute the shuffle as needed such that more elements come from V1 than - // V2. This allows us to match the shuffle pattern strictly on how many - // elements come from V1 without handling the symmetric cases. - if (NumV2Elements > NumV1Elements) + // Commute the shuffle if it will improve canonicalization. + if (canonicalizeShuffleMaskWithCommute(Mask)) return DAG.getCommutedVectorShuffle(*SVOp); - assert(NumV1Elements > 0 && "No V1 indices"); - assert((NumV2Elements > 0 || V2IsUndef) && "V2 not undef, but not used"); - - // When the number of V1 and V2 elements are the same, try to minimize the - // number of uses of V2 in the low half of the vector. When that is tied, - // ensure that the sum of indices for V1 is equal to or lower than the sum - // indices for V2. When those are equal, try to ensure that the number of odd - // indices for V1 is lower than the number of odd indices for V2. 
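A quick worked check of the tie-breaking above: for the 4-element mask {6, 1, 7, 2}, V1 and V2 each supply two elements, and the low half {6, 1} also ties at one apiece; the V2 indices sum to 0 + 2 = 2 while the V1 indices sum to 1 + 3 = 4, so the function returns true and the caller commutes the shuffle.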
- if (NumV1Elements == NumV2Elements) { - int LowV1Elements = 0, LowV2Elements = 0; - for (int M : Mask.slice(0, NumElements / 2)) - if (M >= NumElements) - ++LowV2Elements; - else if (M >= 0) - ++LowV1Elements; - if (LowV2Elements > LowV1Elements) - return DAG.getCommutedVectorShuffle(*SVOp); - if (LowV2Elements == LowV1Elements) { - int SumV1Indices = 0, SumV2Indices = 0; - for (int i = 0, Size = Mask.size(); i < Size; ++i) - if (Mask[i] >= NumElements) - SumV2Indices += i; - else if (Mask[i] >= 0) - SumV1Indices += i; - if (SumV2Indices < SumV1Indices) - return DAG.getCommutedVectorShuffle(*SVOp); - if (SumV2Indices == SumV1Indices) { - int NumV1OddIndices = 0, NumV2OddIndices = 0; - for (int i = 0, Size = Mask.size(); i < Size; ++i) - if (Mask[i] >= NumElements) - NumV2OddIndices += i % 2; - else if (Mask[i] >= 0) - NumV1OddIndices += i % 2; - if (NumV2OddIndices < NumV1OddIndices) - return DAG.getCommutedVectorShuffle(*SVOp); - } - } - } - // For each vector width, delegate to a specialized lowering routine. if (VT.is128BitVector()) - return lower128BitVectorShuffle(DL, Mask, VT, V1, V2, Subtarget, DAG); + return lower128BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, + DAG); if (VT.is256BitVector()) - return lower256BitVectorShuffle(DL, Mask, VT, V1, V2, Subtarget, DAG); + return lower256BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, + DAG); if (VT.is512BitVector()) - return lower512BitVectorShuffle(DL, Mask, VT, V1, V2, Subtarget, DAG); + return lower512BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, + DAG); if (Is1BitVector) return lower1BitVectorShuffle(DL, Mask, VT, V1, V2, Subtarget, DAG); @@ -12392,21 +13525,6 @@ static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) { return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); } - if (VT.getSizeInBits() == 16) { - // If Idx is 0, it's cheaper to do a move instead of a pextrw. - if (isNullConstant(Op.getOperand(1))) - return DAG.getNode( - ISD::TRUNCATE, dl, MVT::i16, - DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, - DAG.getBitcast(MVT::v4i32, Op.getOperand(0)), - Op.getOperand(1))); - SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, - Op.getOperand(0), Op.getOperand(1)); - SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract, - DAG.getValueType(VT)); - return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); - } - if (VT == MVT::f32) { // EXTRACTPS outputs to a GPR32 register which will require a movd to copy // the result back to FR32 register. It's only worth matching if the @@ -12432,6 +13550,7 @@ static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) { if (isa<ConstantSDNode>(Op.getOperand(1))) return Op; } + return SDValue(); } @@ -12460,7 +13579,8 @@ X86TargetLowering::ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const } unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); - if (!Subtarget.hasDQI() && (VecVT.getVectorNumElements() <= 8)) { + if ((!Subtarget.hasDQI() && (VecVT.getVectorNumElements() == 8)) || + (VecVT.getVectorNumElements() < 8)) { // Use kshiftlw/rw instruction. 
VecVT = MVT::v16i1; Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT, @@ -12469,8 +13589,9 @@ X86TargetLowering::ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const DAG.getIntPtrConstant(0, dl)); } unsigned MaxSift = VecVT.getVectorNumElements() - 1; - Vec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, Vec, - DAG.getConstant(MaxSift - IdxVal, dl, MVT::i8)); + if (MaxSift - IdxVal) + Vec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, Vec, + DAG.getConstant(MaxSift - IdxVal, dl, MVT::i8)); Vec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, Vec, DAG.getConstant(MaxSift, dl, MVT::i8)); return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i1, Vec, @@ -12491,10 +13612,10 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, if (!isa<ConstantSDNode>(Idx)) { if (VecVT.is512BitVector() || (VecVT.is256BitVector() && Subtarget.hasInt256() && - VecVT.getVectorElementType().getSizeInBits() == 32)) { + VecVT.getScalarSizeInBits() == 32)) { MVT MaskEltVT = - MVT::getIntegerVT(VecVT.getVectorElementType().getSizeInBits()); + MVT::getIntegerVT(VecVT.getScalarSizeInBits()); MVT MaskVT = MVT::getVectorVT(MaskEltVT, VecVT.getSizeInBits() / MaskEltVT.getSizeInBits()); @@ -12531,26 +13652,31 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, assert(VecVT.is128BitVector() && "Unexpected vector length"); - if (Subtarget.hasSSE41()) - if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG)) - return Res; - MVT VT = Op.getSimpleValueType(); - // TODO: handle v16i8. + if (VT.getSizeInBits() == 16) { - if (IdxVal == 0) + // If IdxVal is 0, it's cheaper to do a move instead of a pextrw, unless + // we're going to zero extend the register or fold the store (SSE41 only). + if (IdxVal == 0 && !MayFoldIntoZeroExtend(Op) && + !(Subtarget.hasSSE41() && MayFoldIntoStore(Op))) return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, DAG.getBitcast(MVT::v4i32, Vec), Idx)); // Transform it so it match pextrw which produces a 32-bit result. - MVT EltVT = MVT::i32; - SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EltVT, Vec, Idx); - SDValue Assert = DAG.getNode(ISD::AssertZext, dl, EltVT, Extract, + SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, + Op.getOperand(0), Op.getOperand(1)); + SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract, DAG.getValueType(VT)); return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); } + if (Subtarget.hasSSE41()) + if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG)) + return Res; + + // TODO: handle v16i8. + if (VT.getSizeInBits() == 32) { if (IdxVal == 0) return Op; @@ -12604,12 +13730,46 @@ X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const { unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Elt); - if (IdxVal) + unsigned NumElems = VecVT.getVectorNumElements(); + + if(Vec.isUndef()) { + if (IdxVal) + EltInVec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, EltInVec, + DAG.getConstant(IdxVal, dl, MVT::i8)); + return EltInVec; + } + + // Insertion of one bit into first or last position + // can be done with two SHIFTs + OR. + if (IdxVal == 0 ) { + // EltInVec already at correct index and other bits are 0. + // Clean the first bit in source vector. 
+ Vec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, Vec, + DAG.getConstant(1 , dl, MVT::i8)); + Vec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, Vec, + DAG.getConstant(1, dl, MVT::i8)); + + return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec); + } + if (IdxVal == NumElems -1) { + // Move the bit to the last position inside the vector. EltInVec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, EltInVec, DAG.getConstant(IdxVal, dl, MVT::i8)); - if (Vec.isUndef()) - return EltInVec; - return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec); + // Clean the last bit in the source vector. + Vec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, Vec, + DAG.getConstant(1, dl, MVT::i8)); + Vec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, Vec, + DAG.getConstant(1 , dl, MVT::i8)); + + return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec); + } + + // Use shuffle to insert element. + SmallVector<int, 64> MaskVec(NumElems); + for (unsigned i = 0; i != NumElems; ++i) + MaskVec[i] = (i == IdxVal) ? NumElems : i; + + return DAG.getVectorShuffle(VecVT, dl, Vec, EltInVec, MaskVec); } SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, @@ -12764,10 +13924,6 @@ static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) { return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl); } - if (OpVT == MVT::v1i64 && - Op.getOperand(0).getValueType() == MVT::i64) - return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op.getOperand(0)); - SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0)); assert(OpVT.is128BitVector() && "Expected an SSE type!"); return DAG.getBitcast( @@ -12779,25 +13935,32 @@ static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) { // upper bits of a vector. static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { + assert(Subtarget.hasAVX() && "EXTRACT_SUBVECTOR requires AVX"); + SDLoc dl(Op); SDValue In = Op.getOperand(0); SDValue Idx = Op.getOperand(1); unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); - MVT ResVT = Op.getSimpleValueType(); - MVT InVT = In.getSimpleValueType(); + MVT ResVT = Op.getSimpleValueType(); - if (Subtarget.hasFp256()) { - if (ResVT.is128BitVector() && - (InVT.is256BitVector() || InVT.is512BitVector()) && - isa<ConstantSDNode>(Idx)) { - return extract128BitVector(In, IdxVal, DAG, dl); - } - if (ResVT.is256BitVector() && InVT.is512BitVector() && - isa<ConstantSDNode>(Idx)) { - return extract256BitVector(In, IdxVal, DAG, dl); - } - } - return SDValue(); + assert((In.getSimpleValueType().is256BitVector() || + In.getSimpleValueType().is512BitVector()) && + "Can only extract from 256-bit or 512-bit vectors"); + + if (ResVT.is128BitVector()) + return extract128BitVector(In, IdxVal, DAG, dl); + if (ResVT.is256BitVector()) + return extract256BitVector(In, IdxVal, DAG, dl); + + llvm_unreachable("Unimplemented!"); +} + +static bool areOnlyUsersOf(SDNode *N, ArrayRef<SDValue> ValidUsers) { + for (SDNode::use_iterator I = N->use_begin(), E = N->use_end(); I != E; ++I) + if (llvm::all_of(ValidUsers, + [&I](SDValue V) { return V.getNode() != *I; })) + return false; + return true; } // Lower a node with an INSERT_SUBVECTOR opcode. This may result in a @@ -12805,58 +13968,97 @@ static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget, // the upper bits of a vector. 
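The i1-vector insert and extract rewrites above are plain shift games on the mask register; the DAG code expresses them with VSHLI/VSRLI on the k-register type. The same arithmetic on a scalar 16-bit mask, as a standalone sketch (illustrative names):

#include <cstdint>

// Extract bit Idx: shift it up to the top bit, then down to bit 0. The new
// guard in the patch skips the left shift when the bit is already topmost.
uint16_t extractMaskBit(uint16_t K, unsigned Idx) {
  return (uint16_t)((uint16_t)(K << (15 - Idx)) >> 15);
}

// Insert Bit at position 0: clear bit 0 with a shift pair, then OR it in.
uint16_t insertMaskBit0(uint16_t K, uint16_t Bit) {
  K = (uint16_t)((uint16_t)(K >> 1) << 1);
  return (uint16_t)(K | Bit);
}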
static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { - if (!Subtarget.hasAVX()) - return SDValue(); + assert(Subtarget.hasAVX() && "INSERT_SUBVECTOR requires AVX"); SDLoc dl(Op); SDValue Vec = Op.getOperand(0); SDValue SubVec = Op.getOperand(1); SDValue Idx = Op.getOperand(2); - if (!isa<ConstantSDNode>(Idx)) - return SDValue(); - unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); MVT OpVT = Op.getSimpleValueType(); MVT SubVecVT = SubVec.getSimpleValueType(); - // Fold two 16-byte subvector loads into one 32-byte load: - // (insert_subvector (insert_subvector undef, (load addr), 0), - // (load addr + 16), Elts/2) + if (OpVT.getVectorElementType() == MVT::i1) + return insert1BitVector(Op, DAG, Subtarget); + + assert((OpVT.is256BitVector() || OpVT.is512BitVector()) && + "Can only insert into 256-bit or 512-bit vectors"); + + // Fold two 16-byte or 32-byte subvector loads into one 32-byte or 64-byte + // load: + // (insert_subvector (insert_subvector undef, (load16 addr), 0), + // (load16 addr + 16), Elts/2) // --> load32 addr + // or: + // (insert_subvector (insert_subvector undef, (load32 addr), 0), + // (load32 addr + 32), Elts/2) + // --> load64 addr + // or a 16-byte or 32-byte broadcast: + // (insert_subvector (insert_subvector undef, (load16 addr), 0), + // (load16 addr), Elts/2) + // --> X86SubVBroadcast(load16 addr) + // or: + // (insert_subvector (insert_subvector undef, (load32 addr), 0), + // (load32 addr), Elts/2) + // --> X86SubVBroadcast(load32 addr) if ((IdxVal == OpVT.getVectorNumElements() / 2) && Vec.getOpcode() == ISD::INSERT_SUBVECTOR && - OpVT.is256BitVector() && SubVecVT.is128BitVector()) { + OpVT.getSizeInBits() == SubVecVT.getSizeInBits() * 2) { auto *Idx2 = dyn_cast<ConstantSDNode>(Vec.getOperand(2)); if (Idx2 && Idx2->getZExtValue() == 0) { + SDValue SubVec2 = Vec.getOperand(1); // If needed, look through bitcasts to get to the load. - SDValue SubVec2 = peekThroughBitcasts(Vec.getOperand(1)); - if (auto *FirstLd = dyn_cast<LoadSDNode>(SubVec2)) { + if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(SubVec2))) { bool Fast; unsigned Alignment = FirstLd->getAlignment(); unsigned AS = FirstLd->getAddressSpace(); const X86TargetLowering *TLI = Subtarget.getTargetLowering(); if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), OpVT, AS, Alignment, &Fast) && Fast) { - SDValue Ops[] = { SubVec2, SubVec }; + SDValue Ops[] = {SubVec2, SubVec}; if (SDValue Ld = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG, false)) return Ld; } } + // If lower/upper loads are the same and the only users of the load, then + // lower to a VBROADCASTF128/VBROADCASTI128/etc. + if (auto *Ld = dyn_cast<LoadSDNode>(peekThroughOneUseBitcasts(SubVec2))) { + if (SubVec2 == SubVec && ISD::isNormalLoad(Ld) && + areOnlyUsersOf(SubVec2.getNode(), {Op, Vec})) { + return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT, SubVec); + } + } + // If this is subv_broadcast insert into both halves, use a larger + // subv_broadcast. 
+ if (SubVec.getOpcode() == X86ISD::SUBV_BROADCAST && SubVec == SubVec2) { + return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT, + SubVec.getOperand(0)); + } } } - if ((OpVT.is256BitVector() || OpVT.is512BitVector()) && - SubVecVT.is128BitVector()) + if (SubVecVT.is128BitVector()) return insert128BitVector(Vec, SubVec, IdxVal, DAG, dl); - if (OpVT.is512BitVector() && SubVecVT.is256BitVector()) + if (SubVecVT.is256BitVector()) return insert256BitVector(Vec, SubVec, IdxVal, DAG, dl); - if (OpVT.getVectorElementType() == MVT::i1) - return insert1BitVector(Op, DAG, Subtarget); + llvm_unreachable("Unimplemented!"); +} - return SDValue(); +// Returns the appropriate wrapper opcode for a global reference. +unsigned X86TargetLowering::getGlobalWrapperKind(const GlobalValue *GV) const { + // References to absolute symbols are never PC-relative. + if (GV && GV->isAbsoluteSymbolRef()) + return X86ISD::Wrapper; + + CodeModel::Model M = getTargetMachine().getCodeModel(); + if (Subtarget.isPICStyleRIPRel() && + (M == CodeModel::Small || M == CodeModel::Kernel)) + return X86ISD::WrapperRIP; + + return X86ISD::Wrapper; } // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as @@ -12872,18 +14074,12 @@ X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const { // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the // global base reg. unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr); - unsigned WrapperKind = X86ISD::Wrapper; - CodeModel::Model M = DAG.getTarget().getCodeModel(); - - if (Subtarget.isPICStyleRIPRel() && - (M == CodeModel::Small || M == CodeModel::Kernel)) - WrapperKind = X86ISD::WrapperRIP; auto PtrVT = getPointerTy(DAG.getDataLayout()); SDValue Result = DAG.getTargetConstantPool( CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(), OpFlag); SDLoc DL(CP); - Result = DAG.getNode(WrapperKind, DL, PtrVT, Result); + Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result); // With PIC, the address is actually $g + Offset. if (OpFlag) { Result = @@ -12900,17 +14096,11 @@ SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const { // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the // global base reg. unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr); - unsigned WrapperKind = X86ISD::Wrapper; - CodeModel::Model M = DAG.getTarget().getCodeModel(); - - if (Subtarget.isPICStyleRIPRel() && - (M == CodeModel::Small || M == CodeModel::Kernel)) - WrapperKind = X86ISD::WrapperRIP; auto PtrVT = getPointerTy(DAG.getDataLayout()); SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag); SDLoc DL(JT); - Result = DAG.getNode(WrapperKind, DL, PtrVT, Result); + Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result); // With PIC, the address is actually $g + Offset. if (OpFlag) @@ -12929,18 +14119,12 @@ X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const { // global base reg. 
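The broadcast fold above recognizes that inserting the same 16-byte load into both halves of a 256-bit vector is nothing more than a 128-bit subvector broadcast (the 32-byte/512-bit case follows the same pattern). An intrinsics-level illustration of the equivalence, assuming an AVX2 target (illustrative names; the combine itself matches DAG loads, not intrinsics):

#include <immintrin.h>

__m256i sameLoadInBothHalves(const void *P) {
  __m128i V  = _mm_loadu_si128((const __m128i *)P);
  __m256i Lo = _mm256_castsi128_si256(V);        // low half = V
  return _mm256_inserti128_si256(Lo, V, 1);      // high half = V as well
}

// What the fold produces instead: a single VBROADCASTI128 from memory.
__m256i asBroadcast(const void *P) {
  return _mm256_broadcastsi128_si256(_mm_loadu_si128((const __m128i *)P));
}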
const Module *Mod = DAG.getMachineFunction().getFunction()->getParent(); unsigned char OpFlag = Subtarget.classifyGlobalReference(nullptr, *Mod); - unsigned WrapperKind = X86ISD::Wrapper; - CodeModel::Model M = DAG.getTarget().getCodeModel(); - - if (Subtarget.isPICStyleRIPRel() && - (M == CodeModel::Small || M == CodeModel::Kernel)) - WrapperKind = X86ISD::WrapperRIP; auto PtrVT = getPointerTy(DAG.getDataLayout()); SDValue Result = DAG.getTargetExternalSymbol(Sym, PtrVT, OpFlag); SDLoc DL(Op); - Result = DAG.getNode(WrapperKind, DL, PtrVT, Result); + Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result); // With PIC, the address is actually $g + Offset. if (isPositionIndependent() && !Subtarget.is64Bit()) { @@ -12963,18 +14147,12 @@ X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const { // Create the TargetBlockAddressAddress node. unsigned char OpFlags = Subtarget.classifyBlockAddressReference(); - CodeModel::Model M = DAG.getTarget().getCodeModel(); const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress(); int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset(); SDLoc dl(Op); auto PtrVT = getPointerTy(DAG.getDataLayout()); SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags); - - if (Subtarget.isPICStyleRIPRel() && - (M == CodeModel::Small || M == CodeModel::Kernel)) - Result = DAG.getNode(X86ISD::WrapperRIP, dl, PtrVT, Result); - else - Result = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, Result); + Result = DAG.getNode(getGlobalWrapperKind(), dl, PtrVT, Result); // With PIC, the address is actually $g + Offset. if (isGlobalRelativeToPICBase(OpFlags)) { @@ -13003,11 +14181,7 @@ SDValue X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, OpFlags); } - if (Subtarget.isPICStyleRIPRel() && - (M == CodeModel::Small || M == CodeModel::Kernel)) - Result = DAG.getNode(X86ISD::WrapperRIP, dl, PtrVT, Result); - else - Result = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, Result); + Result = DAG.getNode(getGlobalWrapperKind(GV), dl, PtrVT, Result); // With PIC, the address is actually $g + Offset. if (isGlobalRelativeToPICBase(OpFlags)) { @@ -13041,7 +14215,7 @@ static SDValue GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA, SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg, unsigned char OperandFlags, bool LocalDynamic = false) { - MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); + MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); SDLoc dl(GA); SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, @@ -13061,8 +14235,8 @@ GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA, } // TLSADDR will be codegen'ed as call. Inform MFI that function has calls. - MFI->setAdjustsStack(true); - MFI->setHasCalls(true); + MFI.setAdjustsStack(true); + MFI.setHasCalls(true); SDValue Flag = Chain.getValue(1); return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag); @@ -13097,7 +14271,7 @@ static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA, SDLoc dl(GA); // Get the start address of the TLS block for this module. 
- X86MachineFunctionInfo* MFI = DAG.getMachineFunction() + X86MachineFunctionInfo *MFI = DAG.getMachineFunction() .getInfo<X86MachineFunctionInfo>(); MFI->incNumLocalDynamicTLSAccesses(); @@ -13251,8 +14425,8 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { Chain.getValue(1), DL); // TLSCALL will be codegen'ed as call. Inform MFI that function has calls. - MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); - MFI->setAdjustsStack(true); + MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); + MFI.setAdjustsStack(true); // And our return value (tls address) is in the standard call return value // location. @@ -13395,9 +14569,9 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (SrcVT.isVector()) { if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) { - return DAG.getNode(X86ISD::CVTDQ2PD, dl, VT, + return DAG.getNode(X86ISD::CVTSI2P, dl, VT, DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src, - DAG.getUNDEF(SrcVT))); + DAG.getUNDEF(SrcVT))); } if (SrcVT.getVectorElementType() == MVT::i1) { if (SrcVT == MVT::v2i1 && TLI.isTypeLegal(SrcVT)) @@ -13433,7 +14607,7 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, unsigned Size = SrcVT.getSizeInBits()/8; MachineFunction &MF = DAG.getMachineFunction(); auto PtrVT = getPointerTy(MF.getDataLayout()); - int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size, false); + int SSFI = MF.getFrameInfo().CreateStackObject(Size, Size, false); SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT); SDValue Chain = DAG.getStore( DAG.getEntryNode(), dl, ValueToStore, StackSlot, @@ -13479,8 +14653,8 @@ SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, // shouldn't be necessary except that RFP cannot be live across // multiple blocks. When stackifier is fixed, they can be uncoupled. 
MachineFunction &MF = DAG.getMachineFunction(); - unsigned SSFISize = Op.getValueType().getSizeInBits()/8; - int SSFI = MF.getFrameInfo()->CreateStackObject(SSFISize, SSFISize, false); + unsigned SSFISize = Op.getValueSizeInBits()/8; + int SSFI = MF.getFrameInfo().CreateStackObject(SSFISize, SSFISize, false); auto PtrVT = getPointerTy(MF.getDataLayout()); SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT); Tys = DAG.getVTList(MVT::Other); @@ -13528,10 +14702,10 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op, SmallVector<Constant*,2> CV1; CV1.push_back( - ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble, + ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(), APInt(64, 0x4330000000000000ULL)))); CV1.push_back( - ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble, + ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(), APInt(64, 0x4530000000000000ULL)))); Constant *C1 = ConstantVector::get(CV1); SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, 16); @@ -13560,8 +14734,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op, Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub); } else { SDValue S2F = DAG.getBitcast(MVT::v4i32, Sub); - SDValue Shuffle = getTargetShuffleNode(X86ISD::PSHUFD, dl, MVT::v4i32, - S2F, 0x4E, DAG); + SDValue Shuffle = DAG.getVectorShuffle(MVT::v4i32, dl, S2F, S2F, {2,3,0,1}); Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, DAG.getBitcast(MVT::v2f64, Shuffle), Sub); } @@ -13617,6 +14790,41 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op, return Sub; } +static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, SelectionDAG &DAG, + const X86Subtarget &Subtarget, SDLoc &DL) { + if (Op.getSimpleValueType() != MVT::v2f64) + return SDValue(); + + SDValue N0 = Op.getOperand(0); + assert(N0.getSimpleValueType() == MVT::v2i32 && "Unexpected input type"); + + // Legalize to v4i32 type. + N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0, + DAG.getUNDEF(MVT::v2i32)); + + if (Subtarget.hasAVX512()) + return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, N0); + + // Same implementation as VectorLegalizer::ExpandUINT_TO_FLOAT, + // but using v2i32 to v2f64 with X86ISD::CVTSI2P. + SDValue HalfWord = DAG.getConstant(16, DL, MVT::v4i32); + SDValue HalfWordMask = DAG.getConstant(0x0000FFFF, DL, MVT::v4i32); + + // Two to the power of half-word-size. + SDValue TWOHW = DAG.getConstantFP(1 << 16, DL, MVT::v2f64); + + // Clear upper part of LO, lower HI. + SDValue HI = DAG.getNode(ISD::SRL, DL, MVT::v4i32, N0, HalfWord); + SDValue LO = DAG.getNode(ISD::AND, DL, MVT::v4i32, N0, HalfWordMask); + + SDValue fHI = DAG.getNode(X86ISD::CVTSI2P, DL, MVT::v2f64, HI); + fHI = DAG.getNode(ISD::FMUL, DL, MVT::v2f64, fHI, TWOHW); + SDValue fLO = DAG.getNode(X86ISD::CVTSI2P, DL, MVT::v2f64, LO); + + // Add the two halves. + return DAG.getNode(ISD::FADD, DL, MVT::v2f64, fHI, fLO); +} + static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { // The algorithm is the following: @@ -13699,7 +14907,7 @@ static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG, // Create the vector constant for -(0x1.0p39f + 0x1.0p23f). 
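The 0x4330.../0x4530... constants in LowerUINT_TO_FP_i64 above are the classic magic-number conversion: they are the IEEE-754 bit patterns of 2^52 and 2^84, so ORing the low and high 32-bit halves of the integer into their mantissas and subtracting the bases rebuilds the value with a single rounding step at the end. A scalar C++17 sketch of that identity (illustrative name; the DAG code performs the equivalent steps on <2 x double> using the constant-pool vectors shown above):

#include <cstdint>
#include <cstring>

double u64ToDoubleMagic(uint64_t X) {
  uint64_t LoBits = (X & 0xFFFFFFFFu) | 0x4330000000000000ULL; // 2^52 + lo
  uint64_t HiBits = (X >> 32)         | 0x4530000000000000ULL; // 2^84 + hi*2^32
  double Lo, Hi;
  std::memcpy(&Lo, &LoBits, sizeof(double));
  std::memcpy(&Hi, &HiBits, sizeof(double));
  // Both subtractions are exact; only the final addition rounds.
  return (Hi - 0x1p84) + (Lo - 0x1p52);
}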
SDValue VecCstFAdd = DAG.getConstantFP( - APFloat(APFloat::IEEEsingle, APInt(32, 0xD3000080)), DL, VecFloatVT); + APFloat(APFloat::IEEEsingle(), APInt(32, 0xD3000080)), DL, VecFloatVT); // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f); SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High); @@ -13714,29 +14922,31 @@ static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG, SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op, SelectionDAG &DAG) const { SDValue N0 = Op.getOperand(0); - MVT SVT = N0.getSimpleValueType(); + MVT SrcVT = N0.getSimpleValueType(); SDLoc dl(Op); - if (SVT.getVectorElementType() == MVT::i1) { - if (SVT == MVT::v2i1) + if (SrcVT.getVectorElementType() == MVT::i1) { + if (SrcVT == MVT::v2i1) return DAG.getNode(ISD::UINT_TO_FP, dl, Op.getValueType(), DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, N0)); - MVT IntegerVT = MVT::getVectorVT(MVT::i32, SVT.getVectorNumElements()); + MVT IntegerVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements()); return DAG.getNode(ISD::UINT_TO_FP, dl, Op.getValueType(), DAG.getNode(ISD::ZERO_EXTEND, dl, IntegerVT, N0)); } - switch (SVT.SimpleTy) { + switch (SrcVT.SimpleTy) { default: llvm_unreachable("Custom UINT_TO_FP is not supported!"); case MVT::v4i8: case MVT::v4i16: case MVT::v8i8: case MVT::v8i16: { - MVT NVT = MVT::getVectorVT(MVT::i32, SVT.getVectorNumElements()); + MVT NVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements()); return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, N0)); } + case MVT::v2i32: + return lowerUINT_TO_FP_v2i32(Op, DAG, Subtarget, dl); case MVT::v4i32: case MVT::v8i32: return lowerUINT_TO_FP_vXi32(Op, DAG, Subtarget); @@ -13754,15 +14964,15 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, SDLoc dl(Op); auto PtrVT = getPointerTy(DAG.getDataLayout()); - if (Op.getSimpleValueType().isVector()) - return lowerUINT_TO_FP_vec(Op, DAG); - // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform // the optimization here. if (DAG.SignBitIsZero(N0)) return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0); + if (Op.getSimpleValueType().isVector()) + return lowerUINT_TO_FP_vec(Op, DAG); + MVT SrcVT = N0.getSimpleValueType(); MVT DstVT = Op.getSimpleValueType(); @@ -13903,7 +15113,7 @@ X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, // stack slot. MachineFunction &MF = DAG.getMachineFunction(); unsigned MemSize = DstTy.getSizeInBits()/8; - int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false); + int SSFI = MF.getFrameInfo().CreateStackObject(MemSize, MemSize, false); SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT); unsigned Opc; @@ -13935,15 +15145,15 @@ X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, // For X87 we'd like to use the smallest FP type for this constant, but // for DAG type consistency we have to match the FP operand type. - APFloat Thresh(APFloat::IEEEsingle, APInt(32, 0x5f000000)); + APFloat Thresh(APFloat::IEEEsingle(), APInt(32, 0x5f000000)); LLVM_ATTRIBUTE_UNUSED APFloat::opStatus Status = APFloat::opOK; bool LosesInfo = false; if (TheVT == MVT::f64) // The rounding mode is irrelevant as the conversion should be exact. 
- Status = Thresh.convert(APFloat::IEEEdouble, APFloat::rmNearestTiesToEven, + Status = Thresh.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven, &LosesInfo); else if (TheVT == MVT::f80) - Status = Thresh.convert(APFloat::x87DoubleExtended, + Status = Thresh.convert(APFloat::x87DoubleExtended(), APFloat::rmNearestTiesToEven, &LosesInfo); assert(Status == APFloat::opOK && !LosesInfo && @@ -13981,7 +15191,7 @@ X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, MachineMemOperand::MOLoad, MemSize, MemSize); Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, DstTy, MMO); Chain = Value.getValue(1); - SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false); + SSFI = MF.getFrameInfo().CreateStackObject(MemSize, MemSize, false); StackSlot = DAG.getFrameIndex(SSFI, PtrVT); } @@ -14084,14 +15294,14 @@ static SDValue LowerZERO_EXTEND_AVX512(SDValue Op, SDValue In = Op->getOperand(0); MVT InVT = In.getSimpleValueType(); SDLoc DL(Op); - unsigned int NumElts = VT.getVectorNumElements(); - if (NumElts != 8 && NumElts != 16 && !Subtarget.hasBWI()) - return SDValue(); + unsigned NumElts = VT.getVectorNumElements(); - if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1) + if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1 && + (NumElts == 8 || NumElts == 16 || Subtarget.hasBWI())) return DAG.getNode(X86ISD::VZEXT, DL, VT, In); - assert(InVT.getVectorElementType() == MVT::i1); + if (InVT.getVectorElementType() != MVT::i1) + return SDValue(); // Extend VT if the target is 256 or 128bit vector and VLX is not supported. MVT ExtVT = VT; @@ -14137,6 +15347,85 @@ static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget, return SDValue(); } +/// Helper to recursively truncate vector elements in half with PACKSS. +/// It makes use of the fact that vector comparison results will be all-zeros +/// or all-ones to use (vXi8 PACKSS(vYi16, vYi16)) instead of matching types. +/// AVX2 (Int256) sub-targets require extra shuffling as the PACKSS operates +/// within each 128-bit lane. +static SDValue truncateVectorCompareWithPACKSS(EVT DstVT, SDValue In, + const SDLoc &DL, + SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + // Requires SSE2 but AVX512 has fast truncate. + if (!Subtarget.hasSSE2() || Subtarget.hasAVX512()) + return SDValue(); + + EVT SrcVT = In.getValueType(); + + // No truncation required, we might get here due to recursive calls. + if (SrcVT == DstVT) + return In; + + // We only support vector truncation to 128bits or greater from a + // 256bits or greater source. + if ((DstVT.getSizeInBits() % 128) != 0) + return SDValue(); + if ((SrcVT.getSizeInBits() % 256) != 0) + return SDValue(); + + unsigned NumElems = SrcVT.getVectorNumElements(); + assert(DstVT.getVectorNumElements() == NumElems && "Illegal truncation"); + assert(SrcVT.getSizeInBits() > DstVT.getSizeInBits() && "Illegal truncation"); + + EVT PackedSVT = + EVT::getIntegerVT(*DAG.getContext(), SrcVT.getScalarSizeInBits() / 2); + + // Extract lower/upper subvectors. + unsigned NumSubElts = NumElems / 2; + unsigned SrcSizeInBits = SrcVT.getSizeInBits(); + SDValue Lo = extractSubVector(In, 0 * NumSubElts, DAG, DL, SrcSizeInBits / 2); + SDValue Hi = extractSubVector(In, 1 * NumSubElts, DAG, DL, SrcSizeInBits / 2); + + // 256bit -> 128bit truncate - PACKSS lower/upper 128-bit subvectors. 
+ if (SrcVT.is256BitVector()) { + Lo = DAG.getBitcast(MVT::v8i16, Lo); + Hi = DAG.getBitcast(MVT::v8i16, Hi); + SDValue Res = DAG.getNode(X86ISD::PACKSS, DL, MVT::v16i8, Lo, Hi); + return DAG.getBitcast(DstVT, Res); + } + + // AVX2: 512bit -> 256bit truncate - PACKSS lower/upper 256-bit subvectors. + // AVX2: 512bit -> 128bit truncate - PACKSS(PACKSS, PACKSS). + if (SrcVT.is512BitVector() && Subtarget.hasInt256()) { + Lo = DAG.getBitcast(MVT::v16i16, Lo); + Hi = DAG.getBitcast(MVT::v16i16, Hi); + SDValue Res = DAG.getNode(X86ISD::PACKSS, DL, MVT::v32i8, Lo, Hi); + + // 256-bit PACKSS(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)), + // so we need to shuffle to get ((LO0,HI0),(LO1,HI1)). + Res = DAG.getBitcast(MVT::v4i64, Res); + Res = DAG.getVectorShuffle(MVT::v4i64, DL, Res, Res, {0, 2, 1, 3}); + + if (DstVT.is256BitVector()) + return DAG.getBitcast(DstVT, Res); + + // If 512bit -> 128bit truncate another stage. + EVT PackedVT = EVT::getVectorVT(*DAG.getContext(), PackedSVT, NumElems); + Res = DAG.getBitcast(PackedVT, Res); + return truncateVectorCompareWithPACKSS(DstVT, Res, DL, DAG, Subtarget); + } + + // Recursively pack lower/upper subvectors, concat result and pack again. + assert(SrcVT.getSizeInBits() >= 512 && "Expected 512-bit vector or greater"); + EVT PackedVT = EVT::getVectorVT(*DAG.getContext(), PackedSVT, NumElems / 2); + Lo = truncateVectorCompareWithPACKSS(PackedVT, Lo, DL, DAG, Subtarget); + Hi = truncateVectorCompareWithPACKSS(PackedVT, Hi, DL, DAG, Subtarget); + + PackedVT = EVT::getVectorVT(*DAG.getContext(), PackedSVT, NumElems); + SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, PackedVT, Lo, Hi); + return truncateVectorCompareWithPACKSS(DstVT, Res, DL, DAG, Subtarget); +} + static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { @@ -14203,6 +15492,22 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { DAG.getNode(X86ISD::VSEXT, DL, MVT::v16i32, In)); return DAG.getNode(X86ISD::VTRUNC, DL, VT, In); } + + // Truncate with PACKSS if we are truncating a vector comparison result. + // TODO: We should be able to support other operations as long as we + // we are saturating+packing zero/all bits only. + auto IsPackableComparison = [](SDValue V) { + unsigned Opcode = V.getOpcode(); + return (Opcode == X86ISD::PCMPGT || Opcode == X86ISD::PCMPEQ || + Opcode == X86ISD::CMPP); + }; + + if (IsPackableComparison(In) || (In.getOpcode() == ISD::CONCAT_VECTORS && + all_of(In->ops(), IsPackableComparison))) { + if (SDValue V = truncateVectorCompareWithPACKSS(VT, In, DL, DAG, Subtarget)) + return V; + } + if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) { // On AVX2, v4i64 -> v4i32 becomes VPERMD. if (Subtarget.hasInt256()) { @@ -14299,30 +15604,31 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { DAG.getIntPtrConstant(0, DL)); } -SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op, - SelectionDAG &DAG) const { - assert(!Op.getSimpleValueType().isVector()); +SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) const { + bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT; - std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, - /*IsSigned=*/ true, /*IsReplace=*/ false); - SDValue FIST = Vals.first, StackSlot = Vals.second; - // If FP_TO_INTHelper failed, the node is actually supposed to be Legal. - if (!FIST.getNode()) - return Op; + MVT VT = Op.getSimpleValueType(); - if (StackSlot.getNode()) - // Load the result. 
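truncateVectorCompareWithPACKSS only fires on comparison results because signed saturation preserves the two lane values a compare can produce, all-zeros and all-ones, while it would clobber anything else. A scalar model of one PACKSSWB lane (not part of the patch; helper name is illustrative):

#include <algorithm>
#include <cassert>
#include <cstdint>

// Scalar model of one PACKSSWB lane: signed saturation of an i16 to an i8.
static int8_t PackSS16To8(int16_t V) {
  return static_cast<int8_t>(std::min<int>(std::max<int>(V, -128), 127));
}

int main() {
  // Vector compares produce 0x0000 or 0xFFFF lanes; both survive the
  // saturating narrow unchanged, so a PACKSS tree can halve the element
  // width repeatedly without recomputing the mask.
  assert(PackSS16To8(0) == 0);
  assert(PackSS16To8(int16_t(-1)) == -1);
  // An arbitrary value saturates, which is why the combine is restricted to
  // all-zeros/all-ones inputs (the IsPackableComparison check above).
  assert(PackSS16To8(300) == 127);
  return 0;
}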
- return DAG.getLoad(Op.getValueType(), SDLoc(Op), FIST, StackSlot, - MachinePointerInfo()); + if (VT.isVector()) { + assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL!"); + SDValue Src = Op.getOperand(0); + SDLoc dl(Op); + if (VT == MVT::v2i64 && Src.getSimpleValueType() == MVT::v2f32) { + return DAG.getNode(IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI, + dl, VT, + DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src, + DAG.getUNDEF(MVT::v2f32))); + } - // The node is the result. - return FIST; -} + return SDValue(); + } + + assert(!VT.isVector()); -SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op, - SelectionDAG &DAG) const { std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, - /*IsSigned=*/ false, /*IsReplace=*/ false); + IsSigned, /*IsReplace=*/ false); SDValue FIST = Vals.first, StackSlot = Vals.second; // If FP_TO_INTHelper failed, the node is actually supposed to be Legal. if (!FIST.getNode()) @@ -14330,8 +15636,7 @@ SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op, if (StackSlot.getNode()) // Load the result. - return DAG.getLoad(Op.getValueType(), SDLoc(Op), FIST, StackSlot, - MachinePointerInfo()); + return DAG.getLoad(VT, SDLoc(Op), FIST, StackSlot, MachinePointerInfo()); // The node is the result. return FIST; @@ -14376,17 +15681,14 @@ static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) { MVT LogicVT; MVT EltVT; - unsigned NumElts; if (VT.isVector()) { LogicVT = VT; EltVT = VT.getVectorElementType(); - NumElts = VT.getVectorNumElements(); } else if (IsF128) { // SSE instructions are used for optimized f128 logical operations. LogicVT = MVT::f128; EltVT = VT; - NumElts = 1; } else { // There are no scalar bitwise logical SSE/AVX instructions, so we // generate a 16-byte vector constant and logic op even for the scalar case. @@ -14394,22 +15696,16 @@ static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) { // the logic op, so it can save (~4 bytes) on code size. LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32; EltVT = VT; - NumElts = (VT == MVT::f64) ? 2 : 4; } unsigned EltBits = EltVT.getSizeInBits(); - LLVMContext *Context = DAG.getContext(); // For FABS, mask is 0x7f...; for FNEG, mask is 0x80... APInt MaskElt = IsFABS ? APInt::getSignedMaxValue(EltBits) : APInt::getSignBit(EltBits); - Constant *C = ConstantInt::get(*Context, MaskElt); - C = ConstantVector::getSplat(NumElts, C); - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - SDValue CPIdx = DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout())); - unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment(); - SDValue Mask = DAG.getLoad( - LogicVT, dl, DAG.getEntryNode(), CPIdx, - MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Alignment); + const fltSemantics &Sem = + EltVT == MVT::f64 ? APFloat::IEEEdouble() : + (IsF128 ? APFloat::IEEEquad() : APFloat::IEEEsingle()); + SDValue Mask = DAG.getConstantFP(APFloat(Sem, MaskElt), dl, LogicVT); SDValue Op0 = Op.getOperand(0); bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS); @@ -14429,92 +15725,73 @@ static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) { } static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) { - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - LLVMContext *Context = DAG.getContext(); - SDValue Op0 = Op.getOperand(0); - SDValue Op1 = Op.getOperand(1); + SDValue Mag = Op.getOperand(0); + SDValue Sign = Op.getOperand(1); SDLoc dl(Op); + + // If the sign operand is smaller, extend it first. 
MVT VT = Op.getSimpleValueType(); - MVT SrcVT = Op1.getSimpleValueType(); - bool IsF128 = (VT == MVT::f128); + if (Sign.getSimpleValueType().bitsLT(VT)) + Sign = DAG.getNode(ISD::FP_EXTEND, dl, VT, Sign); - // If second operand is smaller, extend it first. - if (SrcVT.bitsLT(VT)) { - Op1 = DAG.getNode(ISD::FP_EXTEND, dl, VT, Op1); - SrcVT = VT; - } // And if it is bigger, shrink it first. - if (SrcVT.bitsGT(VT)) { - Op1 = DAG.getNode(ISD::FP_ROUND, dl, VT, Op1, DAG.getIntPtrConstant(1, dl)); - SrcVT = VT; - } + if (Sign.getSimpleValueType().bitsGT(VT)) + Sign = DAG.getNode(ISD::FP_ROUND, dl, VT, Sign, DAG.getIntPtrConstant(1, dl)); // At this point the operands and the result should have the same // type, and that won't be f80 since that is not custom lowered. - assert((VT == MVT::f64 || VT == MVT::f32 || IsF128) && + bool IsF128 = (VT == MVT::f128); + assert((VT == MVT::f64 || VT == MVT::f32 || VT == MVT::f128 || + VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v4f32 || + VT == MVT::v8f32 || VT == MVT::v8f64 || VT == MVT::v16f32) && "Unexpected type in LowerFCOPYSIGN"); + MVT EltVT = VT.getScalarType(); const fltSemantics &Sem = - VT == MVT::f64 ? APFloat::IEEEdouble : - (IsF128 ? APFloat::IEEEquad : APFloat::IEEEsingle); - const unsigned SizeInBits = VT.getSizeInBits(); + EltVT == MVT::f64 ? APFloat::IEEEdouble() + : (IsF128 ? APFloat::IEEEquad() : APFloat::IEEEsingle()); + + // Perform all scalar logic operations as 16-byte vectors because there are no + // scalar FP logic instructions in SSE. + // TODO: This isn't necessary. If we used scalar types, we might avoid some + // unnecessary splats, but we might miss load folding opportunities. Should + // this decision be based on OptimizeForSize? + bool IsFakeVector = !VT.isVector() && !IsF128; + MVT LogicVT = VT; + if (IsFakeVector) + LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32; - SmallVector<Constant *, 4> CV( - VT == MVT::f64 ? 2 : (IsF128 ? 1 : 4), - ConstantFP::get(*Context, APFloat(Sem, APInt(SizeInBits, 0)))); + // The mask constants are automatically splatted for vector types. + unsigned EltSizeInBits = VT.getScalarSizeInBits(); + SDValue SignMask = DAG.getConstantFP( + APFloat(Sem, APInt::getSignBit(EltSizeInBits)), dl, LogicVT); + SDValue MagMask = DAG.getConstantFP( + APFloat(Sem, ~APInt::getSignBit(EltSizeInBits)), dl, LogicVT); // First, clear all bits but the sign bit from the second operand (sign). - CV[0] = ConstantFP::get(*Context, - APFloat(Sem, APInt::getHighBitsSet(SizeInBits, 1))); - Constant *C = ConstantVector::get(CV); - auto PtrVT = TLI.getPointerTy(DAG.getDataLayout()); - SDValue CPIdx = DAG.getConstantPool(C, PtrVT, 16); - - // Perform all logic operations as 16-byte vectors because there are no - // scalar FP logic instructions in SSE. This allows load folding of the - // constants into the logic instructions. - MVT LogicVT = (VT == MVT::f64) ? MVT::v2f64 : (IsF128 ? MVT::f128 : MVT::v4f32); - SDValue Mask1 = - DAG.getLoad(LogicVT, dl, DAG.getEntryNode(), CPIdx, - MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), - /* Alignment = */ 16); - if (!IsF128) - Op1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Op1); - SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Op1, Mask1); + if (IsFakeVector) + Sign = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Sign); + SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Sign, SignMask); // Next, clear the sign bit from the first operand (magnitude). - // If it's a constant, we can clear it here. 
- if (ConstantFPSDNode *Op0CN = dyn_cast<ConstantFPSDNode>(Op0)) { + // TODO: If we had general constant folding for FP logic ops, this check + // wouldn't be necessary. + SDValue MagBits; + if (ConstantFPSDNode *Op0CN = dyn_cast<ConstantFPSDNode>(Mag)) { APFloat APF = Op0CN->getValueAPF(); - // If the magnitude is a positive zero, the sign bit alone is enough. - if (APF.isPosZero()) - return IsF128 ? SignBit : - DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcVT, SignBit, - DAG.getIntPtrConstant(0, dl)); APF.clearSign(); - CV[0] = ConstantFP::get(*Context, APF); + MagBits = DAG.getConstantFP(APF, dl, LogicVT); } else { - CV[0] = ConstantFP::get( - *Context, - APFloat(Sem, APInt::getLowBitsSet(SizeInBits, SizeInBits - 1))); - } - C = ConstantVector::get(CV); - CPIdx = DAG.getConstantPool(C, PtrVT, 16); - SDValue Val = - DAG.getLoad(LogicVT, dl, DAG.getEntryNode(), CPIdx, - MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), - /* Alignment = */ 16); - // If the magnitude operand wasn't a constant, we need to AND out the sign. - if (!isa<ConstantFPSDNode>(Op0)) { - if (!IsF128) - Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Op0); - Val = DAG.getNode(X86ISD::FAND, dl, LogicVT, Op0, Val); + // If the magnitude operand wasn't a constant, we need to AND out the sign. + if (IsFakeVector) + Mag = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Mag); + MagBits = DAG.getNode(X86ISD::FAND, dl, LogicVT, Mag, MagMask); } + // OR the magnitude value with the sign bit. - Val = DAG.getNode(X86ISD::FOR, dl, LogicVT, Val, SignBit); - return IsF128 ? Val : - DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcVT, Val, - DAG.getIntPtrConstant(0, dl)); + SDValue Or = DAG.getNode(X86ISD::FOR, dl, LogicVT, MagBits, SignBit); + return !IsFakeVector ? Or : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Or, + DAG.getIntPtrConstant(0, dl)); } static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) { @@ -14741,6 +16018,12 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl, } } + // Sometimes flags can be set either with an AND or with an SRL/SHL + // instruction. SRL/SHL variant should be preferred for masks longer than this + // number of bits. + const int ShiftToAndMaxMaskWidth = 32; + const bool ZeroCheck = (X86CC == X86::COND_E || X86CC == X86::COND_NE); + // NOTICE: In the code below we use ArithOp to hold the arithmetic operation // which may be the result of a CAST. We use the variable 'Op', which is the // non-casted variable when we check for possible users. @@ -14764,7 +16047,7 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl, goto default_case; if (ConstantSDNode *C = - dyn_cast<ConstantSDNode>(ArithOp.getNode()->getOperand(1))) { + dyn_cast<ConstantSDNode>(ArithOp.getOperand(1))) { // An add of one will be selected as an INC. if (C->isOne() && !Subtarget.slowIncDec()) { Opcode = X86ISD::INC; @@ -14789,7 +16072,7 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl, // If we have a constant logical shift that's only used in a comparison // against zero turn it into an equivalent AND. This allows turning it into // a TEST instruction later. 
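The rewritten LowerFCOPYSIGN keeps the sign bit of the sign operand, keeps everything but the sign bit of the magnitude, and ORs the two (the SignMask/MagMask plus X86ISD::FAND/FOR above). A standalone scalar sketch of the same bit manipulation (not part of the patch; assumes IEEE-754 binary64):

#include <cassert>
#include <cmath>
#include <cstdint>
#include <cstring>

static double CopySignByMask(double Mag, double Sign) {
  const uint64_t SignMask = 0x8000000000000000ULL; // APInt::getSignBit(64)
  uint64_t MagBits, SignBits;
  std::memcpy(&MagBits, &Mag, sizeof(Mag));
  std::memcpy(&SignBits, &Sign, sizeof(Sign));
  // (Mag & ~SignMask) | (Sign & SignMask)
  uint64_t Result = (MagBits & ~SignMask) | (SignBits & SignMask);
  double Out;
  std::memcpy(&Out, &Result, sizeof(Out));
  return Out;
}

int main() {
  assert(CopySignByMask(3.0, -1.0) == std::copysign(3.0, -1.0));
  assert(CopySignByMask(-3.0, 1.0) == 3.0);
  return 0;
}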
- if ((X86CC == X86::COND_E || X86CC == X86::COND_NE) && Op->hasOneUse() && + if (ZeroCheck && Op->hasOneUse() && isa<ConstantSDNode>(Op->getOperand(1)) && !hasNonFlagsUse(Op)) { EVT VT = Op.getValueType(); unsigned BitWidth = VT.getSizeInBits(); @@ -14799,7 +16082,7 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl, APInt Mask = ArithOp.getOpcode() == ISD::SRL ? APInt::getHighBitsSet(BitWidth, BitWidth - ShAmt) : APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt); - if (!Mask.isSignedIntN(32)) // Avoid large immediates. + if (!Mask.isSignedIntN(ShiftToAndMaxMaskWidth)) break; Op = DAG.getNode(ISD::AND, dl, VT, Op->getOperand(0), DAG.getConstant(Mask, dl, VT)); @@ -14808,20 +16091,61 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl, case ISD::AND: // If the primary 'and' result isn't used, don't bother using X86ISD::AND, - // because a TEST instruction will be better. + // because a TEST instruction will be better. However, AND should be + // preferred if the instruction can be combined into ANDN. if (!hasNonFlagsUse(Op)) { SDValue Op0 = ArithOp->getOperand(0); SDValue Op1 = ArithOp->getOperand(1); EVT VT = ArithOp.getValueType(); bool isAndn = isBitwiseNot(Op0) || isBitwiseNot(Op1); bool isLegalAndnType = VT == MVT::i32 || VT == MVT::i64; + bool isProperAndn = isAndn && isLegalAndnType && Subtarget.hasBMI(); + + // If we cannot select an ANDN instruction, check if we can replace + // AND+IMM64 with a shift before giving up. This is possible for masks + // like 0xFF000000 or 0x00FFFFFF and if we care only about the zero flag. + if (!isProperAndn) { + if (!ZeroCheck) + break; + + assert(!isa<ConstantSDNode>(Op0) && "AND node isn't canonicalized"); + auto *CN = dyn_cast<ConstantSDNode>(Op1); + if (!CN) + break; + + const APInt &Mask = CN->getAPIntValue(); + if (Mask.isSignedIntN(ShiftToAndMaxMaskWidth)) + break; // Prefer TEST instruction. + + unsigned BitWidth = Mask.getBitWidth(); + unsigned LeadingOnes = Mask.countLeadingOnes(); + unsigned TrailingZeros = Mask.countTrailingZeros(); + + if (LeadingOnes + TrailingZeros == BitWidth) { + assert(TrailingZeros < VT.getSizeInBits() && + "Shift amount should be less than the type width"); + MVT ShTy = getScalarShiftAmountTy(DAG.getDataLayout(), VT); + SDValue ShAmt = DAG.getConstant(TrailingZeros, dl, ShTy); + Op = DAG.getNode(ISD::SRL, dl, VT, Op0, ShAmt); + break; + } + + unsigned LeadingZeros = Mask.countLeadingZeros(); + unsigned TrailingOnes = Mask.countTrailingOnes(); + + if (LeadingZeros + TrailingOnes == BitWidth) { + assert(LeadingZeros < VT.getSizeInBits() && + "Shift amount should be less than the type width"); + MVT ShTy = getScalarShiftAmountTy(DAG.getDataLayout(), VT); + SDValue ShAmt = DAG.getConstant(LeadingZeros, dl, ShTy); + Op = DAG.getNode(ISD::SHL, dl, VT, Op0, ShAmt); + break; + } - // But if we can combine this into an ANDN operation, then create an AND - // now and allow it to be pattern matched into an ANDN. 
- if (!Subtarget.hasBMI() || !isAndn || !isLegalAndnType) break; + } } - // FALL THROUGH + LLVM_FALLTHROUGH; case ISD::SUB: case ISD::OR: case ISD::XOR: @@ -14839,7 +16163,7 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl, case ISD::XOR: Opcode = X86ISD::XOR; break; case ISD::AND: Opcode = X86ISD::AND; break; case ISD::OR: { - if (!NeedTruncation && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) { + if (!NeedTruncation && ZeroCheck) { if (SDValue EFLAGS = LowerVectorAllZeroTest(Op, Subtarget, DAG)) return EFLAGS; } @@ -14968,14 +16292,27 @@ SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp, return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl); } +/// Check if replacement of SQRT with RSQRT should be disabled. +bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const { + EVT VT = Op.getValueType(); + + // We never want to use both SQRT and RSQRT instructions for the same input. + if (DAG.getNodeIfExists(X86ISD::FRSQRT, DAG.getVTList(VT), Op)) + return false; + + if (VT.isVector()) + return Subtarget.hasFastVectorFSQRT(); + return Subtarget.hasFastScalarFSQRT(); +} + /// The minimum architected relative accuracy is 2^-12. We need one /// Newton-Raphson step to have a good float result (24 bits of precision). -SDValue X86TargetLowering::getRsqrtEstimate(SDValue Op, - DAGCombinerInfo &DCI, - unsigned &RefinementSteps, - bool &UseOneConstNR) const { +SDValue X86TargetLowering::getSqrtEstimate(SDValue Op, + SelectionDAG &DAG, int Enabled, + int &RefinementSteps, + bool &UseOneConstNR, + bool Reciprocal) const { EVT VT = Op.getValueType(); - const char *RecipOp; // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps. // TODO: Add support for AVX512 (v16f32). @@ -14984,30 +16321,24 @@ SDValue X86TargetLowering::getRsqrtEstimate(SDValue Op, // instructions: convert to single, rsqrtss, convert back to double, refine // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA // along with FMA, this could be a throughput win. - if (VT == MVT::f32 && Subtarget.hasSSE1()) - RecipOp = "sqrtf"; - else if ((VT == MVT::v4f32 && Subtarget.hasSSE1()) || - (VT == MVT::v8f32 && Subtarget.hasAVX())) - RecipOp = "vec-sqrtf"; - else - return SDValue(); - - TargetRecip Recips = DCI.DAG.getTarget().Options.Reciprocals; - if (!Recips.isEnabled(RecipOp)) - return SDValue(); + if ((VT == MVT::f32 && Subtarget.hasSSE1()) || + (VT == MVT::v4f32 && Subtarget.hasSSE1()) || + (VT == MVT::v8f32 && Subtarget.hasAVX())) { + if (RefinementSteps == ReciprocalEstimate::Unspecified) + RefinementSteps = 1; - RefinementSteps = Recips.getRefinementSteps(RecipOp); - UseOneConstNR = false; - return DCI.DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op); + UseOneConstNR = false; + return DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op); + } + return SDValue(); } /// The minimum architected relative accuracy is 2^-12. We need one /// Newton-Raphson step to have a good float result (24 bits of precision). -SDValue X86TargetLowering::getRecipEstimate(SDValue Op, - DAGCombinerInfo &DCI, - unsigned &RefinementSteps) const { +SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG, + int Enabled, + int &RefinementSteps) const { EVT VT = Op.getValueType(); - const char *RecipOp; // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps. // TODO: Add support for AVX512 (v16f32). 
@@ -15016,20 +16347,22 @@ SDValue X86TargetLowering::getRecipEstimate(SDValue Op, // 15 instructions: convert to single, rcpss, convert back to double, refine // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA // along with FMA, this could be a throughput win. - if (VT == MVT::f32 && Subtarget.hasSSE1()) - RecipOp = "divf"; - else if ((VT == MVT::v4f32 && Subtarget.hasSSE1()) || - (VT == MVT::v8f32 && Subtarget.hasAVX())) - RecipOp = "vec-divf"; - else - return SDValue(); - TargetRecip Recips = DCI.DAG.getTarget().Options.Reciprocals; - if (!Recips.isEnabled(RecipOp)) - return SDValue(); + if ((VT == MVT::f32 && Subtarget.hasSSE1()) || + (VT == MVT::v4f32 && Subtarget.hasSSE1()) || + (VT == MVT::v8f32 && Subtarget.hasAVX())) { + // Enable estimate codegen with 1 refinement step for vector division. + // Scalar division estimates are disabled because they break too much + // real-world code. These defaults are intended to match GCC behavior. + if (VT == MVT::f32 && Enabled == ReciprocalEstimate::Unspecified) + return SDValue(); + + if (RefinementSteps == ReciprocalEstimate::Unspecified) + RefinementSteps = 1; - RefinementSteps = Recips.getRefinementSteps(RecipOp); - return DCI.DAG.getNode(X86ISD::FRCP, SDLoc(Op), VT, Op); + return DAG.getNode(X86ISD::FRCP, SDLoc(Op), VT, Op); + } + return SDValue(); } /// If we have at least two divisions that use the same divisor, convert to @@ -15042,9 +16375,46 @@ unsigned X86TargetLowering::combineRepeatedFPDivisors() const { return 2; } +/// Helper for creating a X86ISD::SETCC node. +static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl, + SelectionDAG &DAG) { + return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, + DAG.getConstant(Cond, dl, MVT::i8), EFLAGS); +} + +/// Create a BT (Bit Test) node - Test bit \p BitNo in \p Src and set condition +/// according to equal/not-equal condition code \p CC. +static SDValue getBitTestCondition(SDValue Src, SDValue BitNo, ISD::CondCode CC, + const SDLoc &dl, SelectionDAG &DAG) { + // If Src is i8, promote it to i32 with any_extend. There is no i8 BT + // instruction. Since the shift amount is in-range-or-undefined, we know + // that doing a bittest on the i32 value is ok. We extend to i32 because + // the encoding for the i16 version is larger than the i32 version. + // Also promote i16 to i32 for performance / code size reason. + if (Src.getValueType() == MVT::i8 || Src.getValueType() == MVT::i16) + Src = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Src); + + // See if we can use the 32-bit instruction instead of the 64-bit one for a + // shorter encoding. Since the former takes the modulo 32 of BitNo and the + // latter takes the modulo 64, this is only valid if the 5th bit of BitNo is + // known to be zero. + if (Src.getValueType() == MVT::i64 && + DAG.MaskedValueIsZero(BitNo, APInt(BitNo.getValueSizeInBits(), 32))) + Src = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src); + + // If the operand types disagree, extend the shift amount to match. Since + // BT ignores high bits (like shifts) we can use anyextend. + if (Src.getValueType() != BitNo.getValueType()) + BitNo = DAG.getNode(ISD::ANY_EXTEND, dl, Src.getValueType(), BitNo); + + SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, Src, BitNo); + X86::CondCode Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B; + return getSETCC(Cond, BT, dl , DAG); +} + /// Result of 'and' is compared against zero. Change to a BT node if possible. 
-SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC, - const SDLoc &dl, SelectionDAG &DAG) const { +static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC, + const SDLoc &dl, SelectionDAG &DAG) { SDValue Op0 = And.getOperand(0); SDValue Op1 = And.getOperand(1); if (Op0.getOpcode() == ISD::TRUNCATE) @@ -15087,27 +16457,35 @@ SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC, } } - if (LHS.getNode()) { - // If LHS is i8, promote it to i32 with any_extend. There is no i8 BT - // instruction. Since the shift amount is in-range-or-undefined, we know - // that doing a bittest on the i32 value is ok. We extend to i32 because - // the encoding for the i16 version is larger than the i32 version. - // Also promote i16 to i32 for performance / code size reason. - if (LHS.getValueType() == MVT::i8 || - LHS.getValueType() == MVT::i16) - LHS = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS); + if (LHS.getNode()) + return getBitTestCondition(LHS, RHS, CC, dl, DAG); - // If the operand types disagree, extend the shift amount to match. Since - // BT ignores high bits (like shifts) we can use anyextend. - if (LHS.getValueType() != RHS.getValueType()) - RHS = DAG.getNode(ISD::ANY_EXTEND, dl, LHS.getValueType(), RHS); + return SDValue(); +} - SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, LHS, RHS); - X86::CondCode Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B; - return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, - DAG.getConstant(Cond, dl, MVT::i8), BT); - } +// Convert (truncate (srl X, N) to i1) to (bt X, N) +static SDValue LowerTruncateToBT(SDValue Op, ISD::CondCode CC, + const SDLoc &dl, SelectionDAG &DAG) { + + assert(Op.getOpcode() == ISD::TRUNCATE && Op.getValueType() == MVT::i1 && + "Expected TRUNCATE to i1 node"); + if (Op.getOperand(0).getOpcode() != ISD::SRL) + return SDValue(); + + SDValue ShiftRight = Op.getOperand(0); + return getBitTestCondition(ShiftRight.getOperand(0), ShiftRight.getOperand(1), + CC, dl, DAG); +} + +/// Result of 'and' or 'trunc to i1' is compared against zero. +/// Change to a BT node if possible. 
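The refactored getBitTestCondition/LowerAndToBT/LowerTruncateToBT all rest on one fact: BT sets CF to bit (BitNo mod width) of the source, so (X & (1 << N)) != 0, (X >> N) & 1, and a trunc-to-i1 of a shift are the same single-bit read. A standalone scalar model (not part of the patch; helper name is illustrative):

#include <cassert>
#include <cstdint>

// Scalar model of a 64-bit BT: CF = bit (BitNo mod 64) of Src.
static bool BitTest64(uint64_t Src, uint64_t BitNo) {
  return (Src >> (BitNo % 64)) & 1;
}

int main() {
  uint64_t X = 0x00F0000000000001ULL;
  for (unsigned N = 0; N < 64; ++N) {
    assert(BitTest64(X, N) == ((X & (1ULL << N)) != 0)); // and+cmp form
    assert(BitTest64(X, N) == (((X >> N) & 1ULL) != 0)); // srl/trunc form
  }
  // The patch also narrows a 64-bit BT to 32-bit when bit 5 of BitNo is known
  // zero, because BT32 reduces the bit index modulo 32 instead of 64.
  assert(BitTest64(X, 4) == (((static_cast<uint32_t>(X) >> (4 % 32)) & 1) != 0));
  return 0;
}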
+SDValue X86TargetLowering::LowerToBT(SDValue Op, ISD::CondCode CC, + const SDLoc &dl, SelectionDAG &DAG) const { + if (Op.getOpcode() == ISD::AND) + return LowerAndToBT(Op, CC, dl, DAG); + if (Op.getOpcode() == ISD::TRUNCATE && Op.getValueType() == MVT::i1) + return LowerTruncateToBT(Op, CC, dl, DAG); return SDValue(); } @@ -15132,19 +16510,19 @@ static int translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0, case ISD::SETOEQ: case ISD::SETEQ: SSECC = 0; break; case ISD::SETOGT: - case ISD::SETGT: Swap = true; // Fallthrough + case ISD::SETGT: Swap = true; LLVM_FALLTHROUGH; case ISD::SETLT: case ISD::SETOLT: SSECC = 1; break; case ISD::SETOGE: - case ISD::SETGE: Swap = true; // Fallthrough + case ISD::SETGE: Swap = true; LLVM_FALLTHROUGH; case ISD::SETLE: case ISD::SETOLE: SSECC = 2; break; case ISD::SETUO: SSECC = 3; break; case ISD::SETUNE: case ISD::SETNE: SSECC = 4; break; - case ISD::SETULE: Swap = true; // Fallthrough + case ISD::SETULE: Swap = true; LLVM_FALLTHROUGH; case ISD::SETUGE: SSECC = 5; break; - case ISD::SETULT: Swap = true; // Fallthrough + case ISD::SETULT: Swap = true; LLVM_FALLTHROUGH; case ISD::SETUGT: SSECC = 6; break; case ISD::SETO: SSECC = 7; break; case ISD::SETUEQ: @@ -15250,12 +16628,12 @@ static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) { case ISD::SETNE: SSECC = 4; break; case ISD::SETEQ: Opc = X86ISD::PCMPEQM; break; case ISD::SETUGT: SSECC = 6; Unsigned = true; break; - case ISD::SETLT: Swap = true; //fall-through + case ISD::SETLT: Swap = true; LLVM_FALLTHROUGH; case ISD::SETGT: Opc = X86ISD::PCMPGTM; break; case ISD::SETULT: SSECC = 1; Unsigned = true; break; case ISD::SETUGE: SSECC = 5; Unsigned = true; break; //NLT case ISD::SETGE: Swap = true; SSECC = 2; break; // LE + swap - case ISD::SETULE: Unsigned = true; //fall-through + case ISD::SETULE: Unsigned = true; LLVM_FALLTHROUGH; case ISD::SETLE: SSECC = 2; break; } @@ -15414,7 +16792,7 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget, // In this case use SSE compare bool UseAVX512Inst = (OpVT.is512BitVector() || - OpVT.getVectorElementType().getSizeInBits() >= 32 || + OpVT.getScalarSizeInBits() >= 32 || (Subtarget.hasBWI() && Subtarget.hasVLX())); if (UseAVX512Inst) @@ -15638,15 +17016,12 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { // Lower (X & (1 << N)) == 0 to BT(X, N). // Lower ((X >>u N) & 1) != 0 to BT(X, N). // Lower ((X >>s N) & 1) != 0 to BT(X, N). - if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && - isNullConstant(Op1) && + // Lower (trunc (X >> N) to i1) to BT(X, N). 
+ if (Op0.hasOneUse() && isNullConstant(Op1) && (CC == ISD::SETEQ || CC == ISD::SETNE)) { if (SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG)) { - if (VT == MVT::i1) { - NewSetCC = DAG.getNode(ISD::AssertZext, dl, MVT::i8, NewSetCC, - DAG.getValueType(MVT::i1)); + if (VT == MVT::i1) return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, NewSetCC); - } return NewSetCC; } } @@ -15665,14 +17040,9 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { return Op0; CCode = X86::GetOppositeBranchCondition(CCode); - SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, - DAG.getConstant(CCode, dl, MVT::i8), - Op0.getOperand(1)); - if (VT == MVT::i1) { - SetCC = DAG.getNode(ISD::AssertZext, dl, MVT::i8, SetCC, - DAG.getValueType(MVT::i1)); + SDValue SetCC = getSETCC(CCode, Op0.getOperand(1), dl, DAG); + if (VT == MVT::i1) return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC); - } return SetCC; } } @@ -15687,20 +17057,16 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { } } - bool isFP = Op1.getSimpleValueType().isFloatingPoint(); - unsigned X86CC = TranslateX86CC(CC, dl, isFP, Op0, Op1, DAG); + bool IsFP = Op1.getSimpleValueType().isFloatingPoint(); + X86::CondCode X86CC = TranslateX86CC(CC, dl, IsFP, Op0, Op1, DAG); if (X86CC == X86::COND_INVALID) return SDValue(); SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, dl, DAG); EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG); - SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, - DAG.getConstant(X86CC, dl, MVT::i8), EFLAGS); - if (VT == MVT::i1) { - SetCC = DAG.getNode(ISD::AssertZext, dl, MVT::i8, SetCC, - DAG.getValueType(MVT::i1)); + SDValue SetCC = getSETCC(X86CC, EFLAGS, dl, DAG); + if (VT == MVT::i1) return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC); - } return SetCC; } @@ -15717,34 +17083,23 @@ SDValue X86TargetLowering::LowerSETCCE(SDValue Op, SelectionDAG &DAG) const { assert(Carry.getOpcode() != ISD::CARRY_FALSE); SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32); SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry); - SDValue SetCC = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, - DAG.getConstant(CC, DL, MVT::i8), Cmp.getValue(1)); - if (Op.getSimpleValueType() == MVT::i1) { - SetCC = DAG.getNode(ISD::AssertZext, DL, MVT::i8, SetCC, - DAG.getValueType(MVT::i1)); + SDValue SetCC = getSETCC(CC, Cmp.getValue(1), DL, DAG); + if (Op.getSimpleValueType() == MVT::i1) return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC); - } return SetCC; } /// Return true if opcode is a X86 logical comparison. static bool isX86LogicalCmp(SDValue Op) { - unsigned Opc = Op.getNode()->getOpcode(); + unsigned Opc = Op.getOpcode(); if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI || Opc == X86ISD::SAHF) return true; if (Op.getResNo() == 1 && - (Opc == X86ISD::ADD || - Opc == X86ISD::SUB || - Opc == X86ISD::ADC || - Opc == X86ISD::SBB || - Opc == X86ISD::SMUL || - Opc == X86ISD::UMUL || - Opc == X86ISD::INC || - Opc == X86ISD::DEC || - Opc == X86ISD::OR || - Opc == X86ISD::XOR || - Opc == X86ISD::AND)) + (Opc == X86ISD::ADD || Opc == X86ISD::SUB || Opc == X86ISD::ADC || + Opc == X86ISD::SBB || Opc == X86ISD::SMUL || Opc == X86ISD::UMUL || + Opc == X86ISD::INC || Opc == X86ISD::DEC || Opc == X86ISD::OR || + Opc == X86ISD::XOR || Opc == X86ISD::AND)) return true; if (Op.getResNo() == 2 && Opc == X86ISD::UMUL) @@ -15753,27 +17108,18 @@ static bool isX86LogicalCmp(SDValue Op) { return false; } -/// Returns the "condition" node, that may be wrapped with "truncate". 
-/// Like this: (i1 (trunc (i8 X86ISD::SETCC))). -static SDValue getCondAfterTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) { +static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) { if (V.getOpcode() != ISD::TRUNCATE) - return V; + return false; SDValue VOp0 = V.getOperand(0); - if (VOp0.getOpcode() == ISD::AssertZext && - V.getValueSizeInBits() == - cast<VTSDNode>(VOp0.getOperand(1))->getVT().getSizeInBits()) - return VOp0.getOperand(0); - unsigned InBits = VOp0.getValueSizeInBits(); unsigned Bits = V.getValueSizeInBits(); - if (DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits))) - return V.getOperand(0); - return V; + return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits)); } SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { - bool addTest = true; + bool AddTest = true; SDValue Cond = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); SDValue Op2 = Op.getOperand(2); @@ -15794,9 +17140,10 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { if (SSECC != 8) { if (Subtarget.hasAVX512()) { - SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, MVT::i1, CondOp0, CondOp1, - DAG.getConstant(SSECC, DL, MVT::i8)); - return DAG.getNode(X86ISD::SELECT, DL, VT, Cmp, Op1, Op2); + SDValue Cmp = DAG.getNode(X86ISD::FSETCCM, DL, MVT::i1, CondOp0, + CondOp1, DAG.getConstant(SSECC, DL, MVT::i8)); + return DAG.getNode(VT.isVector() ? X86ISD::SELECT : X86ISD::SELECTS, + DL, VT, Cmp, Op1, Op2); } SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1, @@ -15840,6 +17187,11 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { } } + // AVX512 fallback is to lower selects of scalar floats to masked moves. + if (Cond.getValueType() == MVT::i1 && (VT == MVT::f64 || VT == MVT::f32) && + Subtarget.hasAVX512()) + return DAG.getNode(X86ISD::SELECTS, DL, VT, Cond, Op1, Op2); + if (VT.isVector() && VT.getVectorElementType() == MVT::i1) { SDValue Op1Scalar; if (ISD::isBuildVectorOfConstantSDNodes(Op1.getNode())) @@ -15875,8 +17227,14 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { } if (Cond.getOpcode() == ISD::SETCC) { - if (SDValue NewCond = LowerSETCC(Cond, DAG)) + if (SDValue NewCond = LowerSETCC(Cond, DAG)) { Cond = NewCond; + // If the condition was updated, it's possible that the operands of the + // select were also updated (for example, EmitTest has a RAUW). Refresh + // the local references to the select operands in case they got stale. + Op1 = Op.getOperand(1); + Op2 = Op.getOperand(2); + } } // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y @@ -15953,7 +17311,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) || Opc == X86ISD::BT) { // FIXME Cond = Cmp; - addTest = false; + AddTest = false; } } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO || CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO || @@ -15987,12 +17345,13 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { Cond = X86Op.getValue(1); CC = DAG.getConstant(X86Cond, DL, MVT::i8); - addTest = false; + AddTest = false; } - if (addTest) { + if (AddTest) { // Look past the truncate if the high bits are known zero. - Cond = getCondAfterTruncWithZeroHighBitsInput(Cond, DAG); + if (isTruncWithZeroHighBitsInput(Cond, DAG)) + Cond = Cond.getOperand(0); // We know the result of AND is compared against zero. Try to match // it to BT. 
@@ -16000,12 +17359,12 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { if (SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, DL, DAG)) { CC = NewSetCC.getOperand(0); Cond = NewSetCC.getOperand(1); - addTest = false; + AddTest = false; } } } - if (addTest) { + if (AddTest) { CC = DAG.getConstant(X86::COND_NE, DL, MVT::i8); Cond = EmitTest(Cond, X86::COND_NE, DL, DAG); } @@ -16077,34 +17436,44 @@ static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op, VTElt.getSizeInBits() >= 32)))) return DAG.getNode(X86ISD::VSEXT, dl, VT, In); - unsigned int NumElts = VT.getVectorNumElements(); - - if (NumElts != 8 && NumElts != 16 && !Subtarget.hasBWI()) - return SDValue(); + unsigned NumElts = VT.getVectorNumElements(); - if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1) { + if (VT.is512BitVector() && InVTElt != MVT::i1 && + (NumElts == 8 || NumElts == 16 || Subtarget.hasBWI())) { if (In.getOpcode() == X86ISD::VSEXT || In.getOpcode() == X86ISD::VZEXT) return DAG.getNode(In.getOpcode(), dl, VT, In.getOperand(0)); return DAG.getNode(X86ISD::VSEXT, dl, VT, In); } - assert (InVT.getVectorElementType() == MVT::i1 && "Unexpected vector type"); - MVT ExtVT = NumElts == 8 ? MVT::v8i64 : MVT::v16i32; - SDValue NegOne = - DAG.getConstant(APInt::getAllOnesValue(ExtVT.getScalarSizeInBits()), dl, - ExtVT); - SDValue Zero = - DAG.getConstant(APInt::getNullValue(ExtVT.getScalarSizeInBits()), dl, ExtVT); + if (InVTElt != MVT::i1) + return SDValue(); + + MVT ExtVT = VT; + if (!VT.is512BitVector() && !Subtarget.hasVLX()) + ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts); + + SDValue V; + if (Subtarget.hasDQI()) { + V = DAG.getNode(X86ISD::VSEXT, dl, ExtVT, In); + assert(!VT.is512BitVector() && "Unexpected vector type"); + } else { + SDValue NegOne = getOnesVector(ExtVT, Subtarget, DAG, dl); + SDValue Zero = getZeroVector(ExtVT, Subtarget, DAG, dl); + V = DAG.getNode(ISD::VSELECT, dl, ExtVT, In, NegOne, Zero); + if (ExtVT == VT) + return V; + } - SDValue V = DAG.getNode(ISD::VSELECT, dl, ExtVT, In, NegOne, Zero); - if (VT.is512BitVector()) - return V; return DAG.getNode(X86ISD::VTRUNC, dl, VT, V); } -static SDValue LowerSIGN_EXTEND_VECTOR_INREG(SDValue Op, - const X86Subtarget &Subtarget, - SelectionDAG &DAG) { +// Lowering for SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG. +// For sign extend this needs to handle all vector sizes and SSE4.1 and +// non-SSE4.1 targets. For zero extend this should only handle inputs of +// MVT::v64i8 when BWI is not supported, but AVX512 is. +static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { SDValue In = Op->getOperand(0); MVT VT = Op->getSimpleValueType(0); MVT InVT = In.getSimpleValueType(); @@ -16119,20 +17488,33 @@ static SDValue LowerSIGN_EXTEND_VECTOR_INREG(SDValue Op, if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8) return SDValue(); if (!(VT.is128BitVector() && Subtarget.hasSSE2()) && - !(VT.is256BitVector() && Subtarget.hasInt256())) + !(VT.is256BitVector() && Subtarget.hasInt256()) && + !(VT.is512BitVector() && Subtarget.hasAVX512())) return SDValue(); SDLoc dl(Op); // For 256-bit vectors, we only need the lower (128-bit) half of the input. - if (VT.is256BitVector()) - In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, - MVT::getVectorVT(InSVT, InVT.getVectorNumElements() / 2), - In, DAG.getIntPtrConstant(0, dl)); + // For 512-bit vectors, we need 128-bits or 256-bits. 
+ if (VT.getSizeInBits() > 128) { + // Input needs to be at least the same number of elements as output, and + // at least 128-bits. + int InSize = InSVT.getSizeInBits() * VT.getVectorNumElements(); + In = extractSubVector(In, 0, DAG, dl, std::max(InSize, 128)); + } + + assert((Op.getOpcode() != ISD::ZERO_EXTEND_VECTOR_INREG || + InVT == MVT::v64i8) && "Zero extend only for v64i8 input!"); // SSE41 targets can use the pmovsx* instructions directly. + unsigned ExtOpc = Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG ? + X86ISD::VSEXT : X86ISD::VZEXT; if (Subtarget.hasSSE41()) - return DAG.getNode(X86ISD::VSEXT, dl, VT, In); + return DAG.getNode(ExtOpc, dl, VT, In); + + // We should only get here for sign extend. + assert(Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG && + "Unexpected opcode!"); // pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI. SDValue Curr = In; @@ -16150,7 +17532,7 @@ static SDValue LowerSIGN_EXTEND_VECTOR_INREG(SDValue Op, SDValue SignExt = Curr; if (CurrVT != InVT) { unsigned SignExtShift = - CurrVT.getVectorElementType().getSizeInBits() - InSVT.getSizeInBits(); + CurrVT.getScalarSizeInBits() - InSVT.getSizeInBits(); SignExt = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr, DAG.getConstant(SignExtShift, dl, MVT::i8)); } @@ -16211,7 +17593,7 @@ static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget, SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, Undef, ShufMask2); MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), - VT.getVectorNumElements()/2); + VT.getVectorNumElements() / 2); OpLo = DAG.getNode(X86ISD::VSEXT, dl, HalfVT, OpLo); OpHi = DAG.getNode(X86ISD::VSEXT, dl, HalfVT, OpHi); @@ -16643,7 +18025,7 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { case X86::COND_B: // These can only come from an arithmetic instruction with overflow, // e.g. SADDO, UADDO. - Cond = Cond.getNode()->getOperand(1); + Cond = Cond.getOperand(1); addTest = false; break; } @@ -16828,11 +18210,11 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { if (addTest) { // Look pass the truncate if the high bits are known zero. - Cond = getCondAfterTruncWithZeroHighBitsInput(Cond, DAG); + if (isTruncWithZeroHighBitsInput(Cond, DAG)) + Cond = Cond.getOperand(0); - // We know the result of AND is compared against zero. Try to match - // it to BT. - if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) { + // We know the result is compared against zero. Try to match it to BT. + if (Cond.hasOneUse()) { if (SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG)) { CC = NewSetCC.getOperand(0); Cond = NewSetCC.getOperand(1); @@ -17000,7 +18382,7 @@ SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { assert(Subtarget.is64Bit() && "LowerVAARG only handles 64-bit va_arg!"); - assert(Op.getNode()->getNumOperands() == 4); + assert(Op.getNumOperands() == 4); MachineFunction &MF = DAG.getMachineFunction(); if (Subtarget.isCallingConvWin64(MF.getFunction()->getCallingConv())) @@ -17161,6 +18543,7 @@ static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT, /// constant. Takes immediate version of shift as input. 
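The pre-SSE4.1 path in LowerEXTEND_VECTOR_INREG unpacks the narrow element into the top of a wider lane and then VSRAIs by (wide - narrow) bits, letting the arithmetic shift replicate the sign. A standalone scalar model (not part of the patch; assumes the usual two's-complement arithmetic right shift, guaranteed since C++20):

#include <cassert>
#include <cstdint>

// i8 -> i32 sign extension via a shift pair: place the byte in the top byte
// of the lane, then shift right arithmetically by SignExtShift = 32 - 8.
static int32_t SignExtend8To32ViaShifts(uint8_t Byte) {
  int32_t Widened = static_cast<int32_t>(static_cast<uint32_t>(Byte) << 24);
  return Widened >> 24;
}

int main() {
  assert(SignExtend8To32ViaShifts(0x7F) == 127);
  assert(SignExtend8To32ViaShifts(0x80) == -128);
  assert(SignExtend8To32ViaShifts(0xFF) == -1);
  return 0;
}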
static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT, SDValue SrcOp, SDValue ShAmt, + const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT SVT = ShAmt.getSimpleValueType(); assert((SVT == MVT::i32 || SVT == MVT::i64) && "Unexpected value type!"); @@ -17178,27 +18561,32 @@ static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT, case X86ISD::VSRAI: Opc = X86ISD::VSRA; break; } - const X86Subtarget &Subtarget = - static_cast<const X86Subtarget &>(DAG.getSubtarget()); - if (Subtarget.hasSSE41() && ShAmt.getOpcode() == ISD::ZERO_EXTEND && - ShAmt.getOperand(0).getSimpleValueType() == MVT::i16) { - // Let the shuffle legalizer expand this shift amount node. - SDValue Op0 = ShAmt.getOperand(0); - Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(Op0), MVT::v8i16, Op0); - ShAmt = getShuffleVectorZeroOrUndef(Op0, 0, true, Subtarget, DAG); + // Need to build a vector containing shift amount. + // SSE/AVX packed shifts only use the lower 64-bit of the shift count. + // +=================+============+=======================================+ + // | ShAmt is | HasSSE4.1? | Construct ShAmt vector as | + // +=================+============+=======================================+ + // | i64 | Yes, No | Use ShAmt as lowest elt | + // | i32 | Yes | zero-extend in-reg | + // | (i32 zext(i16)) | Yes | zero-extend in-reg | + // | i16/i32 | No | v4i32 build_vector(ShAmt, 0, ud, ud)) | + // +=================+============+=======================================+ + + if (SVT == MVT::i64) + ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v2i64, ShAmt); + else if (Subtarget.hasSSE41() && ShAmt.getOpcode() == ISD::ZERO_EXTEND && + ShAmt.getOperand(0).getSimpleValueType() == MVT::i16) { + ShAmt = ShAmt.getOperand(0); + ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v8i16, ShAmt); + ShAmt = DAG.getNode(X86ISD::VZEXT, SDLoc(ShAmt), MVT::v2i64, ShAmt); + } else if (Subtarget.hasSSE41() && + ShAmt.getOpcode() == ISD::EXTRACT_VECTOR_ELT) { + ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v4i32, ShAmt); + ShAmt = DAG.getNode(X86ISD::VZEXT, SDLoc(ShAmt), MVT::v2i64, ShAmt); } else { - // Need to build a vector containing shift amount. - // SSE/AVX packed shifts only use the lower 64-bit of the shift count. - SmallVector<SDValue, 4> ShOps; - ShOps.push_back(ShAmt); - if (SVT == MVT::i32) { - ShOps.push_back(DAG.getConstant(0, dl, SVT)); - ShOps.push_back(DAG.getUNDEF(SVT)); - } - ShOps.push_back(DAG.getUNDEF(SVT)); - - MVT BVT = SVT == MVT::i32 ? MVT::v4i32 : MVT::v2i64; - ShAmt = DAG.getBuildVector(BVT, dl, ShOps); + SmallVector<SDValue, 4> ShOps = {ShAmt, DAG.getConstant(0, dl, SVT), + DAG.getUNDEF(SVT), DAG.getUNDEF(SVT)}; + ShAmt = DAG.getBuildVector(MVT::v4i32, dl, ShOps); } // The return type has to be a 128-bit type with the same element @@ -17290,7 +18678,7 @@ static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask, case X86ISD::VTRUNC: case X86ISD::VTRUNCS: case X86ISD::VTRUNCUS: - case ISD::FP_TO_FP16: + case X86ISD::CVTPS2PH: // We can't use ISD::VSELECT here because it is not always "Legal" // for the destination type. 
For example vpmovqb require only AVX512 // and vselect that can operate on byte element type require BWI @@ -17321,7 +18709,8 @@ static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask, // The mask should be of type MVT::i1 SDValue IMask = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Mask); - if (Op.getOpcode() == X86ISD::FSETCC) + if (Op.getOpcode() == X86ISD::FSETCCM || + Op.getOpcode() == X86ISD::FSETCCM_RND) return DAG.getNode(ISD::AND, dl, VT, Op, IMask); if (Op.getOpcode() == X86ISD::VFPCLASS || Op.getOpcode() == X86ISD::VFPCLASSS) @@ -17329,7 +18718,7 @@ static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask, if (PreservedSrc.isUndef()) PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl); - return DAG.getNode(X86ISD::SELECT, dl, VT, IMask, Op, PreservedSrc); + return DAG.getNode(X86ISD::SELECTS, dl, VT, IMask, Op, PreservedSrc); } static int getSEHRegistrationNodeSize(const Function *Fn) { @@ -17395,6 +18784,15 @@ static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn, static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { + // Helper to detect if the operand is CUR_DIRECTION rounding mode. + auto isRoundModeCurDirection = [](SDValue Rnd) { + if (!isa<ConstantSDNode>(Rnd)) + return false; + + unsigned Round = cast<ConstantSDNode>(Rnd)->getZExtValue(); + return Round == X86::STATIC_ROUNDING::CUR_DIRECTION; + }; + SDLoc dl(Op); unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); MVT VT = Op.getSimpleValueType(); @@ -17406,9 +18804,6 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget case INTR_TYPE_2OP: return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2)); - case INTR_TYPE_2OP_IMM8: - return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1), - DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(2))); case INTR_TYPE_3OP: return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); @@ -17420,7 +18815,7 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget SDValue PassThru = Op.getOperand(2); SDValue Mask = Op.getOperand(3); SDValue RoundingMode; - // We allways add rounding mode to the Node. + // We always add rounding mode to the Node. // If the rounding mode is not specified, we add the // "current direction" mode. 
if (Op.getNumOperands() == 4) @@ -17428,13 +18823,7 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32); else RoundingMode = Op.getOperand(4); - unsigned IntrWithRoundingModeOpcode = IntrData->Opc1; - if (IntrWithRoundingModeOpcode != 0) - if (cast<ConstantSDNode>(RoundingMode)->getZExtValue() != - X86::STATIC_ROUNDING::CUR_DIRECTION) - return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, - dl, Op.getValueType(), Src, RoundingMode), - Mask, PassThru, Subtarget, DAG); + assert(IntrData->Opc1 == 0 && "Unexpected second opcode!"); return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src, RoundingMode), Mask, PassThru, Subtarget, DAG); @@ -17449,8 +18838,7 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget unsigned IntrWithRoundingModeOpcode = IntrData->Opc1; if (IntrWithRoundingModeOpcode != 0) { SDValue Rnd = Op.getOperand(4); - unsigned Round = cast<ConstantSDNode>(Rnd)->getZExtValue(); - if (Round != X86::STATIC_ROUNDING::CUR_DIRECTION) { + if (!isRoundModeCurDirection(Rnd)) { return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(), Src, Rnd), @@ -17478,8 +18866,7 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget // (2) With rounding mode and sae - 7 operands. if (Op.getNumOperands() == 6) { SDValue Sae = Op.getOperand(5); - unsigned Opc = IntrData->Opc1 ? IntrData->Opc1 : IntrData->Opc0; - return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, + return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2, Sae), Mask, Src0, Subtarget, DAG); } @@ -17506,8 +18893,7 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget unsigned IntrWithRoundingModeOpcode = IntrData->Opc1; if (IntrWithRoundingModeOpcode != 0) { SDValue Rnd = Op.getOperand(5); - unsigned Round = cast<ConstantSDNode>(Rnd)->getZExtValue(); - if (Round != X86::STATIC_ROUNDING::CUR_DIRECTION) { + if (!isRoundModeCurDirection(Rnd)) { return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(), Src1, Src2, Rnd), @@ -17564,12 +18950,11 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget else Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32); return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, - Src1, Src2, Imm, Rnd), - Mask, PassThru, Subtarget, DAG); + Src1, Src2, Imm, Rnd), + Mask, PassThru, Subtarget, DAG); } case INTR_TYPE_3OP_IMM8_MASK: - case INTR_TYPE_3OP_MASK: - case INSERT_SUBVEC: { + case INTR_TYPE_3OP_MASK: { SDValue Src1 = Op.getOperand(1); SDValue Src2 = Op.getOperand(2); SDValue Src3 = Op.getOperand(3); @@ -17578,13 +18963,6 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget if (IntrData->Type == INTR_TYPE_3OP_IMM8_MASK) Src3 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src3); - else if (IntrData->Type == INSERT_SUBVEC) { - // imm should be adapted to ISD::INSERT_SUBVECTOR behavior - assert(isa<ConstantSDNode>(Src3) && "Expected a ConstantSDNode here!"); - unsigned Imm = cast<ConstantSDNode>(Src3)->getZExtValue(); - Imm *= Src2.getSimpleValueType().getVectorNumElements(); - Src3 = DAG.getTargetConstant(Imm, dl, MVT::i32); - } // We specify 2 possible opcodes for intrinsics with rounding modes. 
// First, we check if the intrinsic may have non-default rounding mode, @@ -17592,8 +18970,7 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget unsigned IntrWithRoundingModeOpcode = IntrData->Opc1; if (IntrWithRoundingModeOpcode != 0) { SDValue Rnd = Op.getOperand(6); - unsigned Round = cast<ConstantSDNode>(Rnd)->getZExtValue(); - if (Round != X86::STATIC_ROUNDING::CUR_DIRECTION) { + if (!isRoundModeCurDirection(Rnd)) { return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(), Src1, Src2, Src3, Rnd), @@ -17616,19 +18993,21 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget } case VPERM_3OP_MASKZ: case VPERM_3OP_MASK:{ + MVT VT = Op.getSimpleValueType(); // Src2 is the PassThru SDValue Src1 = Op.getOperand(1); - SDValue Src2 = Op.getOperand(2); + // PassThru needs to be the same type as the destination in order + // to pattern match correctly. + SDValue Src2 = DAG.getBitcast(VT, Op.getOperand(2)); SDValue Src3 = Op.getOperand(3); SDValue Mask = Op.getOperand(4); - MVT VT = Op.getSimpleValueType(); SDValue PassThru = SDValue(); // set PassThru element if (IntrData->Type == VPERM_3OP_MASKZ) PassThru = getZeroVector(VT, Subtarget, DAG, dl); else - PassThru = DAG.getBitcast(VT, Src2); + PassThru = Src2; // Swap Src1 and Src2 in the node creation return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, @@ -17660,8 +19039,7 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget unsigned IntrWithRoundingModeOpcode = IntrData->Opc1; if (IntrWithRoundingModeOpcode != 0) { SDValue Rnd = Op.getOperand(5); - if (cast<ConstantSDNode>(Rnd)->getZExtValue() != - X86::STATIC_ROUNDING::CUR_DIRECTION) + if (!isRoundModeCurDirection(Rnd)) return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(), Src1, Src2, Src3, Rnd), @@ -17713,6 +19091,35 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget Src1, Src2, Src3, Src4), Mask, PassThru, Subtarget, DAG); } + case CVTPD2PS: + // ISD::FP_ROUND has a second argument that indicates if the truncation + // does not change the value. Set it to 0 since it can change. + return DAG.getNode(IntrData->Opc0, dl, VT, Op.getOperand(1), + DAG.getIntPtrConstant(0, dl)); + case CVTPD2PS_MASK: { + SDValue Src = Op.getOperand(1); + SDValue PassThru = Op.getOperand(2); + SDValue Mask = Op.getOperand(3); + // We add rounding mode to the Node when + // - RM Opcode is specified and + // - RM is not "current direction". + unsigned IntrWithRoundingModeOpcode = IntrData->Opc1; + if (IntrWithRoundingModeOpcode != 0) { + SDValue Rnd = Op.getOperand(4); + if (!isRoundModeCurDirection(Rnd)) { + return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, + dl, Op.getValueType(), + Src, Rnd), + Mask, PassThru, Subtarget, DAG); + } + } + assert(IntrData->Opc0 == ISD::FP_ROUND && "Unexpected opcode!"); + // ISD::FP_ROUND has a second argument that indicates if the truncation + // does not change the value. Set it to 0 since it can change. 
+ return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src, + DAG.getIntPtrConstant(0, dl)), + Mask, PassThru, Subtarget, DAG); + } case FPCLASS: { // FPclass intrinsics with mask SDValue Src1 = Op.getOperand(1); @@ -17738,7 +19145,7 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::i1, Src1, Imm); SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask, DAG.getTargetConstant(0, dl, MVT::i1), Subtarget, DAG); - return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i8, FPclassMask); + return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, FPclassMask); } case CMP_MASK: case CMP_MASK_CC: { @@ -17765,8 +19172,7 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget // (IntrData->Opc1 != 0), then we check the rounding mode operand. if (IntrData->Opc1 != 0) { SDValue Rnd = Op.getOperand(5); - if (cast<ConstantSDNode>(Rnd)->getZExtValue() != - X86::STATIC_ROUNDING::CUR_DIRECTION) + if (!isRoundModeCurDirection(Rnd)) Cmp = DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1), Op.getOperand(2), CC, Rnd); } @@ -17798,8 +19204,7 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget SDValue Cmp; if (IntrData->Opc1 != 0) { SDValue Rnd = Op.getOperand(5); - if (cast<ConstantSDNode>(Rnd)->getZExtValue() != - X86::STATIC_ROUNDING::CUR_DIRECTION) + if (!isRoundModeCurDirection(Rnd)) Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::i1, Src1, Src2, CC, Rnd); } //default rounding mode @@ -17822,39 +19227,29 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget SDValue SetCC; switch (CC) { case ISD::SETEQ: { // (ZF = 0 and PF = 0) - SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, - DAG.getConstant(X86::COND_E, dl, MVT::i8), Comi); - SDValue SetNP = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, - DAG.getConstant(X86::COND_NP, dl, MVT::i8), - Comi); + SetCC = getSETCC(X86::COND_E, Comi, dl, DAG); + SDValue SetNP = getSETCC(X86::COND_NP, Comi, dl, DAG); SetCC = DAG.getNode(ISD::AND, dl, MVT::i8, SetCC, SetNP); break; } case ISD::SETNE: { // (ZF = 1 or PF = 1) - SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, - DAG.getConstant(X86::COND_NE, dl, MVT::i8), Comi); - SDValue SetP = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, - DAG.getConstant(X86::COND_P, dl, MVT::i8), - Comi); + SetCC = getSETCC(X86::COND_NE, Comi, dl, DAG); + SDValue SetP = getSETCC(X86::COND_P, Comi, dl, DAG); SetCC = DAG.getNode(ISD::OR, dl, MVT::i8, SetCC, SetP); break; } case ISD::SETGT: // (CF = 0 and ZF = 0) - SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, - DAG.getConstant(X86::COND_A, dl, MVT::i8), Comi); + SetCC = getSETCC(X86::COND_A, Comi, dl, DAG); break; case ISD::SETLT: { // The condition is opposite to GT. Swap the operands. - SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, - DAG.getConstant(X86::COND_A, dl, MVT::i8), InvComi); + SetCC = getSETCC(X86::COND_A, InvComi, dl, DAG); break; } case ISD::SETGE: // CF = 0 - SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, - DAG.getConstant(X86::COND_AE, dl, MVT::i8), Comi); + SetCC = getSETCC(X86::COND_AE, Comi, dl, DAG); break; case ISD::SETLE: // The condition is opposite to GE. Swap the operands. 
- SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, - DAG.getConstant(X86::COND_AE, dl, MVT::i8), InvComi); + SetCC = getSETCC(X86::COND_AE, InvComi, dl, DAG); break; default: llvm_unreachable("Unexpected illegal condition!"); @@ -17868,19 +19263,19 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget SDValue Sae = Op.getOperand(4); SDValue FCmp; - if (cast<ConstantSDNode>(Sae)->getZExtValue() == - X86::STATIC_ROUNDING::CUR_DIRECTION) - FCmp = DAG.getNode(X86ISD::FSETCC, dl, MVT::i1, LHS, RHS, + if (isRoundModeCurDirection(Sae)) + FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::i1, LHS, RHS, DAG.getConstant(CondVal, dl, MVT::i8)); else - FCmp = DAG.getNode(X86ISD::FSETCC, dl, MVT::i1, LHS, RHS, + FCmp = DAG.getNode(X86ISD::FSETCCM_RND, dl, MVT::i1, LHS, RHS, DAG.getConstant(CondVal, dl, MVT::i8), Sae); // AnyExt just uses KMOVW %kreg, %r32; ZeroExt emits "and $1, %reg" return DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, FCmp); } case VSHIFT: return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(), - Op.getOperand(1), Op.getOperand(2), DAG); + Op.getOperand(1), Op.getOperand(2), Subtarget, + DAG); case COMPRESS_EXPAND_IN_REG: { SDValue Mask = Op.getOperand(3); SDValue DataToCompress = Op.getOperand(1); @@ -18027,14 +19422,15 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget case Intrinsic::x86_avx_vtestc_pd_256: case Intrinsic::x86_avx_vtestnzc_pd_256: { bool IsTestPacked = false; - unsigned X86CC; + X86::CondCode X86CC; switch (IntNo) { default: llvm_unreachable("Bad fallthrough in Intrinsic lowering."); case Intrinsic::x86_avx_vtestz_ps: case Intrinsic::x86_avx_vtestz_pd: case Intrinsic::x86_avx_vtestz_ps_256: case Intrinsic::x86_avx_vtestz_pd_256: - IsTestPacked = true; // Fallthrough + IsTestPacked = true; + LLVM_FALLTHROUGH; case Intrinsic::x86_sse41_ptestz: case Intrinsic::x86_avx_ptestz_256: // ZF = 1 @@ -18044,7 +19440,8 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget case Intrinsic::x86_avx_vtestc_pd: case Intrinsic::x86_avx_vtestc_ps_256: case Intrinsic::x86_avx_vtestc_pd_256: - IsTestPacked = true; // Fallthrough + IsTestPacked = true; + LLVM_FALLTHROUGH; case Intrinsic::x86_sse41_ptestc: case Intrinsic::x86_avx_ptestc_256: // CF = 1 @@ -18054,7 +19451,8 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget case Intrinsic::x86_avx_vtestnzc_pd: case Intrinsic::x86_avx_vtestnzc_ps_256: case Intrinsic::x86_avx_vtestnzc_pd_256: - IsTestPacked = true; // Fallthrough + IsTestPacked = true; + LLVM_FALLTHROUGH; case Intrinsic::x86_sse41_ptestnzc: case Intrinsic::x86_avx_ptestnzc_256: // ZF and CF = 0 @@ -18066,18 +19464,17 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget SDValue RHS = Op.getOperand(2); unsigned TestOpc = IsTestPacked ? X86ISD::TESTP : X86ISD::PTEST; SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS); - SDValue CC = DAG.getConstant(X86CC, dl, MVT::i8); - SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test); + SDValue SetCC = getSETCC(X86CC, Test, dl, DAG); return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); } case Intrinsic::x86_avx512_kortestz_w: case Intrinsic::x86_avx512_kortestc_w: { - unsigned X86CC = (IntNo == Intrinsic::x86_avx512_kortestz_w)? X86::COND_E: X86::COND_B; + X86::CondCode X86CC = + (IntNo == Intrinsic::x86_avx512_kortestz_w) ? 
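Many of the rewrites in this hunk (the COMI comparisons above and the PTEST/KORTEST cases that follow) go through a new getSETCC helper instead of building the X86ISD::SETCC node inline. Its body is not shown here; judging from the expressions it replaces, it is presumably a thin wrapper along these lines:

    // Sketch: produce an i8 X86ISD::SETCC of the given condition over EFLAGS.
    static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS,
                            const SDLoc &dl, SelectionDAG &DAG) {
      return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
                         DAG.getConstant(Cond, dl, MVT::i8), EFLAGS);
    }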
X86::COND_E : X86::COND_B; SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1)); SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2)); - SDValue CC = DAG.getConstant(X86CC, dl, MVT::i8); SDValue Test = DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS); - SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test); + SDValue SetCC = getSETCC(X86CC, Test, dl, DAG); return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); } @@ -18092,7 +19489,7 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget case Intrinsic::x86_sse42_pcmpistriz128: case Intrinsic::x86_sse42_pcmpestriz128: { unsigned Opcode; - unsigned X86CC; + X86::CondCode X86CC; switch (IntNo) { default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. case Intrinsic::x86_sse42_pcmpistria128: @@ -18139,9 +19536,7 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end()); SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps); - SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, - DAG.getConstant(X86CC, dl, MVT::i8), - SDValue(PCMP.getNode(), 1)); + SDValue SetCC = getSETCC(X86CC, SDValue(PCMP.getNode(), 1), dl, DAG); return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); } @@ -18267,6 +19662,51 @@ static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, return SDValue(Res, 0); } +/// Handles the lowering of builtin intrinsic that return the value +/// of the extended control register. +static void getExtendedControlRegister(SDNode *N, const SDLoc &DL, + SelectionDAG &DAG, + const X86Subtarget &Subtarget, + SmallVectorImpl<SDValue> &Results) { + assert(N->getNumOperands() == 3 && "Unexpected number of operands!"); + SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); + SDValue LO, HI; + + // The ECX register is used to select the index of the XCR register to + // return. + SDValue Chain = + DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX, N->getOperand(2)); + SDNode *N1 = DAG.getMachineNode(X86::XGETBV, DL, Tys, Chain); + Chain = SDValue(N1, 0); + + // Reads the content of XCR and returns it in registers EDX:EAX. + if (Subtarget.is64Bit()) { + LO = DAG.getCopyFromReg(Chain, DL, X86::RAX, MVT::i64, SDValue(N1, 1)); + HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64, + LO.getValue(2)); + } else { + LO = DAG.getCopyFromReg(Chain, DL, X86::EAX, MVT::i32, SDValue(N1, 1)); + HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32, + LO.getValue(2)); + } + Chain = HI.getValue(1); + + if (Subtarget.is64Bit()) { + // Merge the two 32-bit values into a 64-bit one.. + SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI, + DAG.getConstant(32, DL, MVT::i8)); + Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp)); + Results.push_back(Chain); + return; + } + + // Use a buildpair to merge the two 32-bit values into a 64-bit one. + SDValue Ops[] = { LO, HI }; + SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops); + Results.push_back(Pair); + Results.push_back(Chain); +} + /// Handles the lowering of builtin intrinsics that read performance monitor /// counters (x86_rdpmc). static void getReadPerformanceCounter(SDNode *N, const SDLoc &DL, @@ -18413,6 +19853,33 @@ static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG) { return Chain; } +/// Emit Truncating Store with signed or unsigned saturation. 
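getExtendedControlRegister copies the XCR index into ECX, emits XGETBV, and reads the result back from EDX:EAX (RDX:RAX on 64-bit targets). The 64-bit merge it performs is the usual shift-and-or; a hypothetical scalar equivalent, just to make the data flow concrete:

    #include <cstdint>

    // mergeEdxEax is an illustrative name, not part of the patch: combine the
    // two 32-bit halves returned in EDX:EAX into one 64-bit XCR value.
    static uint64_t mergeEdxEax(uint32_t Lo, uint32_t Hi) {
      return (static_cast<uint64_t>(Hi) << 32) | Lo;
    }

On 32-bit targets the lowering keeps the halves and emits an ISD::BUILD_PAIR instead, since no 64-bit GPR is available.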
+static SDValue +EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl, SDValue Val, + SDValue Ptr, EVT MemVT, MachineMemOperand *MMO, + SelectionDAG &DAG) { + + SDVTList VTs = DAG.getVTList(MVT::Other); + SDValue Undef = DAG.getUNDEF(Ptr.getValueType()); + SDValue Ops[] = { Chain, Val, Ptr, Undef }; + return SignedSat ? + DAG.getTargetMemSDNode<TruncSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO) : + DAG.getTargetMemSDNode<TruncUSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO); +} + +/// Emit Masked Truncating Store with signed or unsigned saturation. +static SDValue +EmitMaskedTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl, + SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT, + MachineMemOperand *MMO, SelectionDAG &DAG) { + + SDVTList VTs = DAG.getVTList(MVT::Other); + SDValue Ops[] = { Chain, Ptr, Mask, Val }; + return SignedSat ? + DAG.getTargetMemSDNode<MaskedTruncSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO) : + DAG.getTargetMemSDNode<MaskedTruncUSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO); +} + static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); @@ -18429,8 +19896,8 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget, IntNo == llvm::Intrinsic::x86_flags_write_u64) { // We need a frame pointer because this will get lowered to a PUSH/POP // sequence. - MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); - MFI->setHasCopyImplyingStackAdjustment(true); + MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); + MFI.setHasCopyImplyingStackAdjustment(true); // Don't do anything here, we will expand these intrinsics out later // during ExpandISelPseudos in EmitInstrWithCustomInserter. return SDValue(); @@ -18509,13 +19976,18 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget, getReadPerformanceCounter(Op.getNode(), dl, DAG, Subtarget, Results); return DAG.getMergeValues(Results, dl); } + // Get Extended Control Register. + case XGETBV: { + SmallVector<SDValue, 2> Results; + getExtendedControlRegister(Op.getNode(), dl, DAG, Subtarget, Results); + return DAG.getMergeValues(Results, dl); + } // XTEST intrinsics. 
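EmitTruncSStore and EmitMaskedTruncSStore only wrap the operands into the new TruncSStore/TruncUSStore memory nodes; the saturation itself is performed by whatever instructions the nodes eventually select to (the VPMOVS*/VPMOVUS* family on AVX-512). As a rough per-element model of the two flavours, under the assumption of i32 source lanes and an i8 memory type:

    #include <algorithm>
    #include <cstdint>

    // Hypothetical scalar reading of a signed saturating truncate-to-i8.
    static int8_t truncSatSigned8(int32_t V) {
      return static_cast<int8_t>(std::min(std::max(V, -128), 127));
    }

    // Hypothetical scalar reading of an unsigned saturating truncate-to-i8.
    static uint8_t truncSatUnsigned8(uint32_t V) {
      return static_cast<uint8_t>(std::min<uint32_t>(V, 255u));
    }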
case XTEST: { SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other); SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0)); - SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, - DAG.getConstant(X86::COND_NE, dl, MVT::i8), - InTrans); + + SDValue SetCC = getSETCC(X86::COND_NE, InTrans, dl, DAG); SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC); return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Ret, SDValue(InTrans.getNode(), 1)); @@ -18530,9 +20002,7 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget, Op.getOperand(4), GenCF.getValue(1)); SDValue Store = DAG.getStore(Op.getOperand(0), dl, Res.getValue(0), Op.getOperand(5), MachinePointerInfo()); - SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, - DAG.getConstant(X86::COND_B, dl, MVT::i8), - Res.getValue(1)); + SDValue SetCC = getSETCC(X86::COND_B, Res.getValue(1), dl, DAG); SDValue Results[] = { SetCC, Store }; return DAG.getMergeValues(Results, dl); } @@ -18550,11 +20020,12 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget, return DAG.getStore(Chain, dl, DataToCompress, Addr, MemIntr->getMemOperand()); - SDValue Compressed = - getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, DataToCompress), - Mask, DAG.getUNDEF(VT), Subtarget, DAG); - return DAG.getStore(Chain, dl, Compressed, Addr, - MemIntr->getMemOperand()); + MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); + SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); + + return DAG.getMaskedStore(Chain, dl, DataToCompress, Addr, VMask, VT, + MemIntr->getMemOperand(), + false /* truncating */, true /* compressing */); } case TRUNCATE_TO_MEM_VI8: case TRUNCATE_TO_MEM_VI16: @@ -18567,18 +20038,39 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget, MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op); assert(MemIntr && "Expected MemIntrinsicSDNode!"); - EVT VT = MemIntr->getMemoryVT(); + EVT MemVT = MemIntr->getMemoryVT(); - if (isAllOnesConstant(Mask)) // return just a truncate store - return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, VT, - MemIntr->getMemOperand()); + uint16_t TruncationOp = IntrData->Opc0; + switch (TruncationOp) { + case X86ISD::VTRUNC: { + if (isAllOnesConstant(Mask)) // return just a truncate store + return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, MemVT, + MemIntr->getMemOperand()); - MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); - SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); + MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements()); + SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); - return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, VMask, VT, - MemIntr->getMemOperand(), true); + return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, VMask, MemVT, + MemIntr->getMemOperand(), true /* truncating */); + } + case X86ISD::VTRUNCUS: + case X86ISD::VTRUNCS: { + bool IsSigned = (TruncationOp == X86ISD::VTRUNCS); + if (isAllOnesConstant(Mask)) + return EmitTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr, MemVT, + MemIntr->getMemOperand(), DAG); + + MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements()); + SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); + + return EmitMaskedTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr, + VMask, MemVT, MemIntr->getMemOperand(), DAG); + } + default: + llvm_unreachable("Unsupported truncstore 
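The COMPRESS_TO_MEM path above now lowers to DAG.getMaskedStore with the compressing flag set rather than compressing in a register and storing the whole vector. Compressing semantics differ from a plain masked store in where the active elements land; a small scalar model (the names are illustrative only):

    #include <cstdint>

    // compressStore: active elements are packed contiguously from the base
    // address; inactive elements consume no memory. A plain masked store
    // would instead keep every element at its original lane offset.
    static void compressStore(const int32_t *Src, const bool *Mask,
                              unsigned NumElts, int32_t *Dst) {
      unsigned Out = 0;
      for (unsigned i = 0; i != NumElts; ++i)
        if (Mask[i])
          Dst[Out++] = Src[i];
    }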
intrinsic"); + } } + case EXPAND_FROM_MEM: { SDValue Mask = Op.getOperand(4); SDValue PassThru = Op.getOperand(3); @@ -18589,24 +20081,24 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget, MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op); assert(MemIntr && "Expected MemIntrinsicSDNode!"); - SDValue DataToExpand = DAG.getLoad(VT, dl, Chain, Addr, - MemIntr->getMemOperand()); + if (isAllOnesConstant(Mask)) // Return a regular (unmasked) vector load. + return DAG.getLoad(VT, dl, Chain, Addr, MemIntr->getMemOperand()); + if (X86::isZeroNode(Mask)) + return DAG.getUNDEF(VT); - if (isAllOnesConstant(Mask)) // return just a load - return DataToExpand; - - SDValue Results[] = { - getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, DataToExpand), - Mask, PassThru, Subtarget, DAG), Chain}; - return DAG.getMergeValues(Results, dl); + MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); + SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); + return DAG.getMaskedLoad(VT, dl, Chain, Addr, VMask, PassThru, VT, + MemIntr->getMemOperand(), ISD::NON_EXTLOAD, + true /* expanding */); } } } SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const { - MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); - MFI->setReturnAddressIsTaken(true); + MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); + MFI.setReturnAddressIsTaken(true); if (verifyReturnAddressArgumentIsConstant(Op, DAG)) return SDValue(); @@ -18630,14 +20122,20 @@ SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op, MachinePointerInfo()); } +SDValue X86TargetLowering::LowerADDROFRETURNADDR(SDValue Op, + SelectionDAG &DAG) const { + DAG.getMachineFunction().getFrameInfo().setReturnAddressIsTaken(true); + return getReturnAddressFrameIndex(DAG); +} + SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); - MachineFrameInfo *MFI = MF.getFrameInfo(); + MachineFrameInfo &MFI = MF.getFrameInfo(); X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); EVT VT = Op.getValueType(); - MFI->setFrameAddressIsTaken(true); + MFI.setFrameAddressIsTaken(true); if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) { // Depth > 0 makes no sense on targets which use Windows unwind codes. It @@ -18647,7 +20145,7 @@ SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { if (!FrameAddrIndex) { // Set up a frame object for the return address. 
unsigned SlotSize = RegInfo->getSlotSize(); - FrameAddrIndex = MF.getFrameInfo()->CreateFixedObject( + FrameAddrIndex = MF.getFrameInfo().CreateFixedObject( SlotSize, /*Offset=*/0, /*IsImmutable=*/false); FuncInfo->setFAIndex(FrameAddrIndex); } @@ -18965,7 +20463,7 @@ SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op, SDLoc DL(Op); // Save FP Control Word to stack slot - int SSFI = MF.getFrameInfo()->CreateStackObject(2, StackAlignment, false); + int SSFI = MF.getFrameInfo().CreateStackObject(2, StackAlignment, false); SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout())); @@ -19083,7 +20581,7 @@ static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL, SmallVector<SDValue, 64> LUTVec; for (int i = 0; i < NumBytes; ++i) LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8)); - SDValue InRegLUT = DAG.getNode(ISD::BUILD_VECTOR, DL, CurrVT, LUTVec); + SDValue InRegLUT = DAG.getBuildVector(CurrVT, DL, LUTVec); // Begin by bitcasting the input to byte vector, then split those bytes // into lo/hi nibbles and use the PSHUFB LUT to perform CLTZ on each of them. @@ -19444,43 +20942,63 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget, assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) && "Only know how to lower V2I64/V4I64/V8I64 multiply"); + // 32-bit vector types used for MULDQ/MULUDQ. + MVT MulVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32); + + // MULDQ returns the 64-bit result of the signed multiplication of the lower + // 32-bits. We can lower with this if the sign bits stretch that far. + if (Subtarget.hasSSE41() && DAG.ComputeNumSignBits(A) > 32 && + DAG.ComputeNumSignBits(B) > 32) { + return DAG.getNode(X86ISD::PMULDQ, dl, VT, DAG.getBitcast(MulVT, A), + DAG.getBitcast(MulVT, B)); + } + // Ahi = psrlqi(a, 32); // Bhi = psrlqi(b, 32); // // AloBlo = pmuludq(a, b); // AloBhi = pmuludq(a, Bhi); // AhiBlo = pmuludq(Ahi, b); + // + // Hi = psllqi(AloBhi + AhiBlo, 32); + // return AloBlo + Hi; + APInt LowerBitsMask = APInt::getLowBitsSet(64, 32); + bool ALoIsZero = DAG.MaskedValueIsZero(A, LowerBitsMask); + bool BLoIsZero = DAG.MaskedValueIsZero(B, LowerBitsMask); + + APInt UpperBitsMask = APInt::getHighBitsSet(64, 32); + bool AHiIsZero = DAG.MaskedValueIsZero(A, UpperBitsMask); + bool BHiIsZero = DAG.MaskedValueIsZero(B, UpperBitsMask); - // AloBhi = psllqi(AloBhi, 32); - // AhiBlo = psllqi(AhiBlo, 32); - // return AloBlo + AloBhi + AhiBlo; + // Bit cast to 32-bit vectors for MULUDQ. + SDValue Alo = DAG.getBitcast(MulVT, A); + SDValue Blo = DAG.getBitcast(MulVT, B); - SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG); - SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG); + SDValue Zero = getZeroVector(VT, Subtarget, DAG, dl); - SDValue AhiBlo = Ahi; - SDValue AloBhi = Bhi; - // Bit cast to 32-bit vectors for MULUDQ - MVT MulVT = (VT == MVT::v2i64) ? MVT::v4i32 : - (VT == MVT::v4i64) ? MVT::v8i32 : MVT::v16i32; - A = DAG.getBitcast(MulVT, A); - B = DAG.getBitcast(MulVT, B); - Ahi = DAG.getBitcast(MulVT, Ahi); - Bhi = DAG.getBitcast(MulVT, Bhi); + // Only multiply lo/hi halves that aren't known to be zero. + SDValue AloBlo = Zero; + if (!ALoIsZero && !BLoIsZero) + AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Alo, Blo); - SDValue AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B); - // After shifting right const values the result may be all-zero. 
- if (!ISD::isBuildVectorAllZeros(Ahi.getNode())) { - AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B); - AhiBlo = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, AhiBlo, 32, DAG); + SDValue AloBhi = Zero; + if (!ALoIsZero && !BHiIsZero) { + SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG); + Bhi = DAG.getBitcast(MulVT, Bhi); + AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Alo, Bhi); } - if (!ISD::isBuildVectorAllZeros(Bhi.getNode())) { - AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi); - AloBhi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, AloBhi, 32, DAG); + + SDValue AhiBlo = Zero; + if (!AHiIsZero && !BLoIsZero) { + SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG); + Ahi = DAG.getBitcast(MulVT, Ahi); + AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, Blo); } - SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi); - return DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo); + SDValue Hi = DAG.getNode(ISD::ADD, dl, VT, AloBhi, AhiBlo); + Hi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Hi, 32, DAG); + + return DAG.getNode(ISD::ADD, dl, VT, AloBlo, Hi); } static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget, @@ -19905,7 +21423,8 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, // Special case in 32-bit mode, where i64 is expanded into high and low parts. if (!Subtarget.is64Bit() && !Subtarget.hasXOP() && - (VT == MVT::v2i64 || (Subtarget.hasInt256() && VT == MVT::v4i64))) { + (VT == MVT::v2i64 || (Subtarget.hasInt256() && VT == MVT::v4i64) || + (Subtarget.hasAVX512() && VT == MVT::v8i64))) { // Peek through any splat that was introduced for i64 shift vectorization. int SplatIndex = -1; @@ -20018,7 +21537,7 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG, else if (EltVT.bitsLT(MVT::i32)) BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt); - return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, DAG); + return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, Subtarget, DAG); } } @@ -20147,7 +21666,7 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget, } // If possible, lower this shift as a sequence of two shifts by - // constant plus a MOVSS/MOVSD instead of scalarizing it. + // constant plus a MOVSS/MOVSD/PBLEND instead of scalarizing it. // Example: // (v4i32 (srl A, (build_vector < X, Y, Y, Y>))) // @@ -20167,7 +21686,7 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget, SDValue Amt2 = (VT == MVT::v4i32) ? Amt->getOperand(1) : Amt->getOperand(2); // See if it is possible to replace this node with a sequence of - // two shifts followed by a MOVSS/MOVSD + // two shifts followed by a MOVSS/MOVSD/PBLEND. if (VT == MVT::v4i32) { // Check if it is legal to use a MOVSS. CanBeSimplified = Amt2 == Amt->getOperand(2) && @@ -20199,21 +21718,21 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget, if (CanBeSimplified && isa<ConstantSDNode>(Amt1) && isa<ConstantSDNode>(Amt2)) { - // Replace this node with two shifts followed by a MOVSS/MOVSD. + // Replace this node with two shifts followed by a MOVSS/MOVSD/PBLEND. 
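The rewritten LowerMUL decomposes each 64-bit lane product into three 32x32->64 PMULUDQ multiplies, now skipping any partial product whose inputs are known to be zero and, with SSE4.1, using PMULDQ directly when both operands are sign-extended 32-bit values. A scalar reference for the general decomposition (the function name is mine, not the patch's):

    #include <cstdint>

    // Low 64 bits of A*B built only from 32x32->64 unsigned multiplies,
    // mirroring the AloBlo/AloBhi/AhiBlo terms in the vector code above.
    static uint64_t mul64ViaPmuludq(uint64_t A, uint64_t B) {
      uint64_t ALo = A & 0xffffffffu, AHi = A >> 32;
      uint64_t BLo = B & 0xffffffffu, BHi = B >> 32;
      uint64_t AloBlo = ALo * BLo;                    // pmuludq(a, b)
      uint64_t Hi = (ALo * BHi + AHi * BLo) << 32;    // psllq(sum, 32)
      return AloBlo + Hi;                             // low half of A*B
    }

The AHi*BHi term is never needed because it only contributes to bits 64 and above.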
MVT CastVT = MVT::v4i32; SDValue Splat1 = - DAG.getConstant(cast<ConstantSDNode>(Amt1)->getAPIntValue(), dl, VT); + DAG.getConstant(cast<ConstantSDNode>(Amt1)->getAPIntValue(), dl, VT); SDValue Shift1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat1); SDValue Splat2 = - DAG.getConstant(cast<ConstantSDNode>(Amt2)->getAPIntValue(), dl, VT); + DAG.getConstant(cast<ConstantSDNode>(Amt2)->getAPIntValue(), dl, VT); SDValue Shift2 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat2); - if (TargetOpcode == X86ISD::MOVSD) - CastVT = MVT::v2i64; SDValue BitCast1 = DAG.getBitcast(CastVT, Shift1); SDValue BitCast2 = DAG.getBitcast(CastVT, Shift2); - SDValue Result = getTargetShuffleNode(TargetOpcode, dl, CastVT, BitCast2, - BitCast1, DAG); - return DAG.getBitcast(VT, Result); + if (TargetOpcode == X86ISD::MOVSD) + return DAG.getBitcast(VT, DAG.getVectorShuffle(CastVT, dl, BitCast1, + BitCast2, {0, 1, 6, 7})); + return DAG.getBitcast(VT, DAG.getVectorShuffle(CastVT, dl, BitCast1, + BitCast2, {0, 5, 6, 7})); } } @@ -20264,15 +21783,44 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget, return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7}); } + // It's worth extending once and using the vXi16/vXi32 shifts for smaller + // types, but without AVX512 the extra overheads to get from vXi8 to vXi32 + // make the existing SSE solution better. + if ((Subtarget.hasInt256() && VT == MVT::v8i16) || + (Subtarget.hasAVX512() && VT == MVT::v16i16) || + (Subtarget.hasAVX512() && VT == MVT::v16i8) || + (Subtarget.hasBWI() && VT == MVT::v32i8)) { + MVT EvtSVT = (VT == MVT::v32i8 ? MVT::i16 : MVT::i32); + MVT ExtVT = MVT::getVectorVT(EvtSVT, VT.getVectorNumElements()); + unsigned ExtOpc = + Op.getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; + R = DAG.getNode(ExtOpc, dl, ExtVT, R); + Amt = DAG.getNode(ISD::ANY_EXTEND, dl, ExtVT, Amt); + return DAG.getNode(ISD::TRUNCATE, dl, VT, + DAG.getNode(Op.getOpcode(), dl, ExtVT, R, Amt)); + } + if (VT == MVT::v16i8 || - (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP())) { + (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) || + (VT == MVT::v64i8 && Subtarget.hasBWI())) { MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2); unsigned ShiftOpcode = Op->getOpcode(); auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) { - // On SSE41 targets we make use of the fact that VSELECT lowers - // to PBLENDVB which selects bytes based just on the sign bit. - if (Subtarget.hasSSE41()) { + if (VT.is512BitVector()) { + // On AVX512BW targets we make use of the fact that VSELECT lowers + // to a masked blend which selects bytes based just on the sign bit + // extracted to a mask. + MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); + V0 = DAG.getBitcast(VT, V0); + V1 = DAG.getBitcast(VT, V1); + Sel = DAG.getBitcast(VT, Sel); + Sel = DAG.getNode(X86ISD::CVT2MASK, dl, MaskVT, Sel); + return DAG.getBitcast(SelVT, + DAG.getNode(ISD::VSELECT, dl, VT, Sel, V0, V1)); + } else if (Subtarget.hasSSE41()) { + // On SSE41 targets we make use of the fact that VSELECT lowers + // to PBLENDVB which selects bytes based just on the sign bit. 
V0 = DAG.getBitcast(VT, V0); V1 = DAG.getBitcast(VT, V1); Sel = DAG.getBitcast(VT, Sel); @@ -20372,19 +21920,6 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget, } } - // It's worth extending once and using the v8i32 shifts for 16-bit types, but - // the extra overheads to get from v16i8 to v8i32 make the existing SSE - // solution better. - if (Subtarget.hasInt256() && VT == MVT::v8i16) { - MVT ExtVT = MVT::v8i32; - unsigned ExtOpc = - Op.getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; - R = DAG.getNode(ExtOpc, dl, ExtVT, R); - Amt = DAG.getNode(ISD::ANY_EXTEND, dl, ExtVT, Amt); - return DAG.getNode(ISD::TRUNCATE, dl, VT, - DAG.getNode(Op.getOpcode(), dl, ExtVT, R, Amt)); - } - if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) { MVT ExtVT = MVT::v8i32; SDValue Z = getZeroVector(VT, Subtarget, DAG, dl); @@ -20519,7 +22054,7 @@ static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) { SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); unsigned BaseOp = 0; - unsigned Cond = 0; + X86::CondCode Cond; SDLoc DL(Op); switch (Op.getOpcode()) { default: llvm_unreachable("Unknown ovf instruction!"); @@ -20567,16 +22102,11 @@ static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) { MVT::i32); SDValue Sum = DAG.getNode(X86ISD::UMUL, DL, VTs, LHS, RHS); - SDValue SetCC = - DAG.getNode(X86ISD::SETCC, DL, MVT::i8, - DAG.getConstant(X86::COND_O, DL, MVT::i32), - SDValue(Sum.getNode(), 2)); + SDValue SetCC = getSETCC(X86::COND_O, SDValue(Sum.getNode(), 2), DL, DAG); - if (N->getValueType(1) == MVT::i1) { - SetCC = DAG.getNode(ISD::AssertZext, DL, MVT::i8, SetCC, - DAG.getValueType(MVT::i1)); + if (N->getValueType(1) == MVT::i1) SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC); - } + return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC); } } @@ -20585,16 +22115,11 @@ static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) { SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32); SDValue Sum = DAG.getNode(BaseOp, DL, VTs, LHS, RHS); - SDValue SetCC = - DAG.getNode(X86ISD::SETCC, DL, MVT::i8, - DAG.getConstant(Cond, DL, MVT::i32), - SDValue(Sum.getNode(), 1)); + SDValue SetCC = getSETCC(Cond, SDValue(Sum.getNode(), 1), DL, DAG); - if (N->getValueType(1) == MVT::i1) { - SetCC = DAG.getNode(ISD::AssertZext, DL, MVT::i8, SetCC, - DAG.getValueType(MVT::i1)); + if (N->getValueType(1) == MVT::i1) SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC); - } + return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC); } @@ -20790,9 +22315,7 @@ static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget, DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1)); SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS, MVT::i32, cpOut.getValue(2)); - SDValue Success = DAG.getNode(X86ISD::SETCC, DL, Op->getValueType(1), - DAG.getConstant(X86::COND_E, DL, MVT::i8), - EFLAGS); + SDValue Success = getSETCC(X86::COND_E, EFLAGS, DL, DAG); DAG.ReplaceAllUsesOfValueWith(Op.getValue(0), cpOut); DAG.ReplaceAllUsesOfValueWith(Op.getValue(1), Success); @@ -20898,8 +22421,9 @@ static SDValue LowerHorizontalByteSum(SDValue V, MVT VT, // two v2i64 vectors which concatenated are the 4 population counts. We can // then use PACKUSWB to shrink and concatenate them into a v4i32 again. 
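For the vXi8 cases the shift amount has only three meaningful bits, so the lowering applies shifts of 4, 2 and 1 and selects between shifted and unshifted values with PBLENDVB (or, on AVX-512BW as added above, a mask-register blend) keyed on the sign bit of the amount. A scalar analogue of that control flow, with a hypothetical name:

    #include <cstdint>

    // shlByteVarAmount: consume the 3-bit amount one bit at a time; each
    // conditional corresponds to one sign-bit-driven blend in the vector code.
    static uint8_t shlByteVarAmount(uint8_t R, uint8_t Amt) {
      if (Amt & 4) R = static_cast<uint8_t>(R << 4);
      if (Amt & 2) R = static_cast<uint8_t>(R << 2);
      if (Amt & 1) R = static_cast<uint8_t>(R << 1);
      return R;
    }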
SDValue Zeros = getZeroVector(VT, Subtarget, DAG, DL); - SDValue Low = DAG.getNode(X86ISD::UNPCKL, DL, VT, V, Zeros); - SDValue High = DAG.getNode(X86ISD::UNPCKH, DL, VT, V, Zeros); + SDValue V32 = DAG.getBitcast(VT, V); + SDValue Low = DAG.getNode(X86ISD::UNPCKL, DL, VT, V32, Zeros); + SDValue High = DAG.getNode(X86ISD::UNPCKH, DL, VT, V32, Zeros); // Do the horizontal sums into two v2i64s. Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL); @@ -21054,6 +22578,8 @@ static SDValue LowerVectorCTPOPBitmath(SDValue Op, const SDLoc &DL, DAG); } +// Please ensure that any codegen change from LowerVectorCTPOP is reflected in +// updated cost models in X86TTIImpl::getIntrinsicInstrCost. static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); @@ -21260,8 +22786,7 @@ static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG, AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode()); RHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), RHS); return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS, - RHS, AN->getMemOperand(), AN->getOrdering(), - AN->getSynchScope()); + RHS, AN->getMemOperand()); } assert(Opc == ISD::ATOMIC_LOAD_ADD && "Used AtomicRMW ops other than Add should have been expanded!"); @@ -21292,9 +22817,7 @@ static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) { cast<AtomicSDNode>(Node)->getMemoryVT(), Node->getOperand(0), Node->getOperand(1), Node->getOperand(2), - cast<AtomicSDNode>(Node)->getMemOperand(), - cast<AtomicSDNode>(Node)->getOrdering(), - cast<AtomicSDNode>(Node)->getSynchScope()); + cast<AtomicSDNode>(Node)->getMemOperand()); return Swap.getValue(1); } // Other atomic stores have a simple pattern. @@ -21534,26 +23057,48 @@ static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget, SDValue Mask = N->getMask(); SDLoc dl(Op); + assert((!N->isExpandingLoad() || Subtarget.hasAVX512()) && + "Expanding masked load is supported on AVX-512 target only!"); + + assert((!N->isExpandingLoad() || ScalarVT.getSizeInBits() >= 32) && + "Expanding masked load is supported for 32 and 64-bit types only!"); + + // 4x32, 4x64 and 2x64 vectors of non-expanding loads are legal regardless of + // VLX. These types for exp-loads are handled here. + if (!N->isExpandingLoad() && VT.getVectorNumElements() <= 4) + return Op; + assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() && "Cannot lower masked load op."); - assert(((ScalarVT == MVT::i32 || ScalarVT == MVT::f32) || + assert((ScalarVT.getSizeInBits() >= 32 || (Subtarget.hasBWI() && (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) && "Unsupported masked load op."); // This operation is legal for targets with VLX, but without // VLX the vector should be widened to 512 bit - unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits(); + unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits(); MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec); - MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec); SDValue Src0 = N->getSrc0(); Src0 = ExtendToType(Src0, WideDataVT, DAG); + + // Mask element has to be i1. 
+ MVT MaskEltTy = Mask.getSimpleValueType().getScalarType(); + assert((MaskEltTy == MVT::i1 || VT.getVectorNumElements() <= 4) && + "We handle 4x32, 4x64 and 2x64 vectors only in this casse"); + + MVT WideMaskVT = MVT::getVectorVT(MaskEltTy, NumEltsInWideVec); + Mask = ExtendToType(Mask, WideMaskVT, DAG, true); + if (MaskEltTy != MVT::i1) + Mask = DAG.getNode(ISD::TRUNCATE, dl, + MVT::getVectorVT(MVT::i1, NumEltsInWideVec), Mask); SDValue NewLoad = DAG.getMaskedLoad(WideDataVT, dl, N->getChain(), N->getBasePtr(), Mask, Src0, N->getMemoryVT(), N->getMemOperand(), - N->getExtensionType()); + N->getExtensionType(), + N->isExpandingLoad()); SDValue Exract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, NewLoad.getValue(0), @@ -21571,10 +23116,20 @@ static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget, SDValue Mask = N->getMask(); SDLoc dl(Op); + assert((!N->isCompressingStore() || Subtarget.hasAVX512()) && + "Expanding masked load is supported on AVX-512 target only!"); + + assert((!N->isCompressingStore() || ScalarVT.getSizeInBits() >= 32) && + "Expanding masked load is supported for 32 and 64-bit types only!"); + + // 4x32 and 2x64 vectors of non-compressing stores are legal regardless to VLX. + if (!N->isCompressingStore() && VT.getVectorNumElements() <= 4) + return Op; + assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() && "Cannot lower masked store op."); - assert(((ScalarVT == MVT::i32 || ScalarVT == MVT::f32) || + assert((ScalarVT.getSizeInBits() >= 32 || (Subtarget.hasBWI() && (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) && "Unsupported masked store op."); @@ -21583,12 +23138,22 @@ static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget, // VLX the vector should be widened to 512 bit unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits(); MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec); - MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec); + + // Mask element has to be i1. 
+ MVT MaskEltTy = Mask.getSimpleValueType().getScalarType(); + assert((MaskEltTy == MVT::i1 || VT.getVectorNumElements() <= 4) && + "We handle 4x32, 4x64 and 2x64 vectors only in this casse"); + + MVT WideMaskVT = MVT::getVectorVT(MaskEltTy, NumEltsInWideVec); + DataToStore = ExtendToType(DataToStore, WideDataVT, DAG); Mask = ExtendToType(Mask, WideMaskVT, DAG, true); + if (MaskEltTy != MVT::i1) + Mask = DAG.getNode(ISD::TRUNCATE, dl, + MVT::getVectorVT(MVT::i1, NumEltsInWideVec), Mask); return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(), Mask, N->getMemoryVT(), N->getMemOperand(), - N->isTruncatingStore()); + N->isTruncatingStore(), N->isCompressingStore()); } static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget, @@ -21734,10 +23299,11 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG); case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, Subtarget, DAG); case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, Subtarget, DAG); + case ISD::ZERO_EXTEND_VECTOR_INREG: case ISD::SIGN_EXTEND_VECTOR_INREG: - return LowerSIGN_EXTEND_VECTOR_INREG(Op, Subtarget, DAG); - case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG); - case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG); + return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG); + case ISD::FP_TO_SINT: + case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, Subtarget, DAG); case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG); case ISD::LOAD: return LowerExtendedLoad(Op, Subtarget, DAG); case ISD::FABS: @@ -21756,6 +23322,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::INTRINSIC_VOID: case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG); case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); + case ISD::ADDROFRETURNADDR: return LowerADDROFRETURNADDR(Op, DAG); case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); case ISD::FRAME_TO_ARGS_OFFSET: return LowerFRAME_TO_ARGS_OFFSET(Op, DAG); @@ -21830,7 +23397,7 @@ void X86TargetLowering::LowerOperationWrapper(SDNode *N, // In some cases (LowerSINT_TO_FP for example) Res has more result values // than original node, chain should be dropped(last value). for (unsigned I = 0, E = N->getNumValues(); I != E; ++I) - Results.push_back(Res.getValue(I)); + Results.push_back(Res.getValue(I)); } /// Replace a node with an illegal result type with a new node built out of @@ -21851,9 +23418,9 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, auto InVTSize = InVT.getSizeInBits(); const unsigned RegSize = (InVTSize > 128) ? ((InVTSize > 256) ? 512 : 256) : 128; - assert((!Subtarget.hasAVX512() || RegSize < 512) && - "512-bit vector requires AVX512"); - assert((!Subtarget.hasAVX2() || RegSize < 256) && + assert((Subtarget.hasBWI() || RegSize < 512) && + "512-bit vector requires AVX512BW"); + assert((Subtarget.hasAVX2() || RegSize < 256) && "256-bit vector requires AVX2"); auto ElemVT = InVT.getVectorElementType(); @@ -21888,13 +23455,6 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS)); return; } - case ISD::SIGN_EXTEND_INREG: - case ISD::ADDC: - case ISD::ADDE: - case ISD::SUBC: - case ISD::SUBE: - // We don't want to expand or promote these. 
- return; case ISD::SDIV: case ISD::UDIV: case ISD::SREM: @@ -21909,6 +23469,36 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, case ISD::FP_TO_UINT: { bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT; + if (N->getValueType(0) == MVT::v2i32) { + assert((IsSigned || Subtarget.hasAVX512()) && + "Can only handle signed conversion without AVX512"); + assert(Subtarget.hasSSE2() && "Requires at least SSE2!"); + SDValue Src = N->getOperand(0); + if (Src.getValueType() == MVT::v2f64) { + SDValue Idx = DAG.getIntPtrConstant(0, dl); + SDValue Res = DAG.getNode(IsSigned ? X86ISD::CVTTP2SI + : X86ISD::CVTTP2UI, + dl, MVT::v4i32, Src); + Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, Idx); + Results.push_back(Res); + return; + } + if (Src.getValueType() == MVT::v2f32) { + SDValue Idx = DAG.getIntPtrConstant(0, dl); + SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src, + DAG.getUNDEF(MVT::v2f32)); + Res = DAG.getNode(IsSigned ? ISD::FP_TO_SINT + : ISD::FP_TO_UINT, dl, MVT::v4i32, Res); + Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, Idx); + Results.push_back(Res); + return; + } + + // The FP_TO_INTHelper below only handles f32/f64/f80 scalar inputs, + // so early out here. + return; + } + std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true); SDValue FIST = Vals.first, StackSlot = Vals.second; @@ -21923,13 +23513,28 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, } return; } + case ISD::SINT_TO_FP: { + assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL!"); + SDValue Src = N->getOperand(0); + if (N->getValueType(0) != MVT::v2f32 || Src.getValueType() != MVT::v2i64) + return; + Results.push_back(DAG.getNode(X86ISD::CVTSI2P, dl, MVT::v4f32, Src)); + return; + } case ISD::UINT_TO_FP: { assert(Subtarget.hasSSE2() && "Requires at least SSE2!"); - if (N->getOperand(0).getValueType() != MVT::v2i32 || - N->getValueType(0) != MVT::v2f32) + EVT VT = N->getValueType(0); + if (VT != MVT::v2f32) return; - SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, - N->getOperand(0)); + SDValue Src = N->getOperand(0); + EVT SrcVT = Src.getValueType(); + if (Subtarget.hasDQI() && Subtarget.hasVLX() && SrcVT == MVT::v2i64) { + Results.push_back(DAG.getNode(X86ISD::CVTUI2P, dl, MVT::v4f32, Src)); + return; + } + if (SrcVT != MVT::v2i32) + return; + SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, Src); SDValue VBias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl, MVT::v2f64); SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn, @@ -21967,6 +23572,9 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, Results); case Intrinsic::x86_rdpmc: return getReadPerformanceCounter(N, dl, DAG, Subtarget, Results); + + case Intrinsic::x86_xgetbv: + return getExtendedControlRegister(N, dl, DAG, Subtarget, Results); } } case ISD::INTRINSIC_WO_CHAIN: { @@ -22052,9 +23660,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS, MVT::i32, cpOutH.getValue(2)); - SDValue Success = - DAG.getNode(X86ISD::SETCC, dl, MVT::i8, - DAG.getConstant(X86::COND_E, dl, MVT::i8), EFLAGS); + SDValue Success = getSETCC(X86::COND_E, EFLAGS, dl, DAG); Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1)); Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF)); @@ -22143,6 +23749,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::SETCC: return "X86ISD::SETCC"; 
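The UINT_TO_FP path above keeps the long-standing bias trick for v2i32 sources: zero-extend to i64, OR into the bit pattern of 2^52, reinterpret as double, and subtract 2^52, after which the vector code rounds down to v2f32. A scalar version of the same trick, with an illustrative function name:

    #include <cstdint>
    #include <cstring>

    // u32ToDouble: 0x4330000000000000 is the IEEE-754 encoding of 2^52, so
    // OR-ing a 32-bit value into its mantissa yields exactly 2^52 + x.
    static double u32ToDouble(uint32_t X) {
      uint64_t Bits = 0x4330000000000000ULL | X;
      double D;
      std::memcpy(&D, &Bits, sizeof(D));
      return D - 4503599627370496.0;   // subtract 2^52, exact
    }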
case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY"; case X86ISD::FSETCC: return "X86ISD::FSETCC"; + case X86ISD::FSETCCM: return "X86ISD::FSETCCM"; + case X86ISD::FSETCCM_RND: return "X86ISD::FSETCCM_RND"; case X86ISD::CMOV: return "X86ISD::CMOV"; case X86ISD::BRCOND: return "X86ISD::BRCOND"; case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG"; @@ -22215,11 +23823,17 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::VTRUNC: return "X86ISD::VTRUNC"; case X86ISD::VTRUNCS: return "X86ISD::VTRUNCS"; case X86ISD::VTRUNCUS: return "X86ISD::VTRUNCUS"; + case X86ISD::VTRUNCSTORES: return "X86ISD::VTRUNCSTORES"; + case X86ISD::VTRUNCSTOREUS: return "X86ISD::VTRUNCSTOREUS"; + case X86ISD::VMTRUNCSTORES: return "X86ISD::VMTRUNCSTORES"; + case X86ISD::VMTRUNCSTOREUS: return "X86ISD::VMTRUNCSTOREUS"; case X86ISD::VINSERT: return "X86ISD::VINSERT"; case X86ISD::VFPEXT: return "X86ISD::VFPEXT"; + case X86ISD::VFPEXT_RND: return "X86ISD::VFPEXT_RND"; + case X86ISD::VFPEXTS_RND: return "X86ISD::VFPEXTS_RND"; case X86ISD::VFPROUND: return "X86ISD::VFPROUND"; - case X86ISD::CVTDQ2PD: return "X86ISD::CVTDQ2PD"; - case X86ISD::CVTUDQ2PD: return "X86ISD::CVTUDQ2PD"; + case X86ISD::VFPROUND_RND: return "X86ISD::VFPROUND_RND"; + case X86ISD::VFPROUNDS_RND: return "X86ISD::VFPROUNDS_RND"; case X86ISD::CVT2MASK: return "X86ISD::CVT2MASK"; case X86ISD::VSHLDQ: return "X86ISD::VSHLDQ"; case X86ISD::VSRLDQ: return "X86ISD::VSRLDQ"; @@ -22332,27 +23946,43 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::FNMSUB_RND: return "X86ISD::FNMSUB_RND"; case X86ISD::FMADDSUB_RND: return "X86ISD::FMADDSUB_RND"; case X86ISD::FMSUBADD_RND: return "X86ISD::FMSUBADD_RND"; + case X86ISD::FMADDS1_RND: return "X86ISD::FMADDS1_RND"; + case X86ISD::FNMADDS1_RND: return "X86ISD::FNMADDS1_RND"; + case X86ISD::FMSUBS1_RND: return "X86ISD::FMSUBS1_RND"; + case X86ISD::FNMSUBS1_RND: return "X86ISD::FNMSUBS1_RND"; + case X86ISD::FMADDS3_RND: return "X86ISD::FMADDS3_RND"; + case X86ISD::FNMADDS3_RND: return "X86ISD::FNMADDS3_RND"; + case X86ISD::FMSUBS3_RND: return "X86ISD::FMSUBS3_RND"; + case X86ISD::FNMSUBS3_RND: return "X86ISD::FNMSUBS3_RND"; case X86ISD::VPMADD52H: return "X86ISD::VPMADD52H"; case X86ISD::VPMADD52L: return "X86ISD::VPMADD52L"; case X86ISD::VRNDSCALE: return "X86ISD::VRNDSCALE"; + case X86ISD::VRNDSCALES: return "X86ISD::VRNDSCALES"; case X86ISD::VREDUCE: return "X86ISD::VREDUCE"; + case X86ISD::VREDUCES: return "X86ISD::VREDUCES"; case X86ISD::VGETMANT: return "X86ISD::VGETMANT"; + case X86ISD::VGETMANTS: return "X86ISD::VGETMANTS"; case X86ISD::PCMPESTRI: return "X86ISD::PCMPESTRI"; case X86ISD::PCMPISTRI: return "X86ISD::PCMPISTRI"; case X86ISD::XTEST: return "X86ISD::XTEST"; case X86ISD::COMPRESS: return "X86ISD::COMPRESS"; case X86ISD::EXPAND: return "X86ISD::EXPAND"; case X86ISD::SELECT: return "X86ISD::SELECT"; + case X86ISD::SELECTS: return "X86ISD::SELECTS"; case X86ISD::ADDSUB: return "X86ISD::ADDSUB"; case X86ISD::RCP28: return "X86ISD::RCP28"; + case X86ISD::RCP28S: return "X86ISD::RCP28S"; case X86ISD::EXP2: return "X86ISD::EXP2"; case X86ISD::RSQRT28: return "X86ISD::RSQRT28"; + case X86ISD::RSQRT28S: return "X86ISD::RSQRT28S"; case X86ISD::FADD_RND: return "X86ISD::FADD_RND"; case X86ISD::FSUB_RND: return "X86ISD::FSUB_RND"; case X86ISD::FMUL_RND: return "X86ISD::FMUL_RND"; case X86ISD::FDIV_RND: return "X86ISD::FDIV_RND"; case X86ISD::FSQRT_RND: return "X86ISD::FSQRT_RND"; + case X86ISD::FSQRTS_RND: return "X86ISD::FSQRTS_RND"; 
case X86ISD::FGETEXP_RND: return "X86ISD::FGETEXP_RND"; + case X86ISD::FGETEXPS_RND: return "X86ISD::FGETEXPS_RND"; case X86ISD::SCALEF: return "X86ISD::SCALEF"; case X86ISD::SCALEFS: return "X86ISD::SCALEFS"; case X86ISD::ADDS: return "X86ISD::ADDS"; @@ -22361,13 +23991,27 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::MULHRS: return "X86ISD::MULHRS"; case X86ISD::SINT_TO_FP_RND: return "X86ISD::SINT_TO_FP_RND"; case X86ISD::UINT_TO_FP_RND: return "X86ISD::UINT_TO_FP_RND"; - case X86ISD::FP_TO_SINT_RND: return "X86ISD::FP_TO_SINT_RND"; - case X86ISD::FP_TO_UINT_RND: return "X86ISD::FP_TO_UINT_RND"; + case X86ISD::CVTTP2SI: return "X86ISD::CVTTP2SI"; + case X86ISD::CVTTP2UI: return "X86ISD::CVTTP2UI"; + case X86ISD::CVTTP2SI_RND: return "X86ISD::CVTTP2SI_RND"; + case X86ISD::CVTTP2UI_RND: return "X86ISD::CVTTP2UI_RND"; + case X86ISD::CVTTS2SI_RND: return "X86ISD::CVTTS2SI_RND"; + case X86ISD::CVTTS2UI_RND: return "X86ISD::CVTTS2UI_RND"; + case X86ISD::CVTSI2P: return "X86ISD::CVTSI2P"; + case X86ISD::CVTUI2P: return "X86ISD::CVTUI2P"; case X86ISD::VFPCLASS: return "X86ISD::VFPCLASS"; case X86ISD::VFPCLASSS: return "X86ISD::VFPCLASSS"; case X86ISD::MULTISHIFT: return "X86ISD::MULTISHIFT"; - case X86ISD::SCALAR_FP_TO_SINT_RND: return "X86ISD::SCALAR_FP_TO_SINT_RND"; - case X86ISD::SCALAR_FP_TO_UINT_RND: return "X86ISD::SCALAR_FP_TO_UINT_RND"; + case X86ISD::SCALAR_SINT_TO_FP_RND: return "X86ISD::SCALAR_SINT_TO_FP_RND"; + case X86ISD::SCALAR_UINT_TO_FP_RND: return "X86ISD::SCALAR_UINT_TO_FP_RND"; + case X86ISD::CVTPS2PH: return "X86ISD::CVTPS2PH"; + case X86ISD::CVTPH2PS: return "X86ISD::CVTPH2PS"; + case X86ISD::CVTP2SI: return "X86ISD::CVTP2SI"; + case X86ISD::CVTP2UI: return "X86ISD::CVTP2UI"; + case X86ISD::CVTP2SI_RND: return "X86ISD::CVTP2SI_RND"; + case X86ISD::CVTP2UI_RND: return "X86ISD::CVTP2UI_RND"; + case X86ISD::CVTS2SI_RND: return "X86ISD::CVTS2SI_RND"; + case X86ISD::CVTS2UI_RND: return "X86ISD::CVTS2UI_RND"; } return nullptr; } @@ -24031,11 +25675,10 @@ X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, MachineBasicBlock *BB) const { DebugLoc DL = MI.getDebugLoc(); MachineFunction *MF = BB->getParent(); - MachineModuleInfo *MMI = &MF->getMMI(); - MachineFrameInfo *MFI = MF->getFrameInfo(); + MachineFrameInfo &MFI = MF->getFrameInfo(); MachineRegisterInfo *MRI = &MF->getRegInfo(); const TargetInstrInfo *TII = Subtarget.getInstrInfo(); - int FI = MFI->getFunctionContextIndex(); + int FI = MFI.getFunctionContextIndex(); // Get a mapping of the call site numbers to all of the landing pads they're // associated with. @@ -24055,10 +25698,10 @@ X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, break; } - if (!MMI->hasCallSiteLandingPad(Sym)) + if (!MF->hasCallSiteLandingPad(Sym)) continue; - for (unsigned CSI : MMI->getCallSiteLandingPad(Sym)) { + for (unsigned CSI : MF->getCallSiteLandingPad(Sym)) { CallSiteNumToLPad[CSI].push_back(&MBB); MaxCSNum = std::max(MaxCSNum, CSI); } @@ -24208,173 +25851,18 @@ X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, return BB; } -// Replace 213-type (isel default) FMA3 instructions with 231-type for -// accumulator loops. Writing back to the accumulator allows the coalescer -// to remove extra copies in the loop. -// FIXME: Do this on AVX512. We don't support 231 variants yet (PR23937). 
-MachineBasicBlock * -X86TargetLowering::emitFMA3Instr(MachineInstr &MI, - MachineBasicBlock *MBB) const { - MachineOperand &AddendOp = MI.getOperand(3); - - // Bail out early if the addend isn't a register - we can't switch these. - if (!AddendOp.isReg()) - return MBB; - - MachineFunction &MF = *MBB->getParent(); - MachineRegisterInfo &MRI = MF.getRegInfo(); - - // Check whether the addend is defined by a PHI: - assert(MRI.hasOneDef(AddendOp.getReg()) && "Multiple defs in SSA?"); - MachineInstr &AddendDef = *MRI.def_instr_begin(AddendOp.getReg()); - if (!AddendDef.isPHI()) - return MBB; - - // Look for the following pattern: - // loop: - // %addend = phi [%entry, 0], [%loop, %result] - // ... - // %result<tied1> = FMA213 %m2<tied0>, %m1, %addend - - // Replace with: - // loop: - // %addend = phi [%entry, 0], [%loop, %result] - // ... - // %result<tied1> = FMA231 %addend<tied0>, %m1, %m2 - - for (unsigned i = 1, e = AddendDef.getNumOperands(); i < e; i += 2) { - assert(AddendDef.getOperand(i).isReg()); - MachineOperand PHISrcOp = AddendDef.getOperand(i); - MachineInstr &PHISrcInst = *MRI.def_instr_begin(PHISrcOp.getReg()); - if (&PHISrcInst == &MI) { - // Found a matching instruction. - unsigned NewFMAOpc = 0; - switch (MI.getOpcode()) { - case X86::VFMADDPDr213r: - NewFMAOpc = X86::VFMADDPDr231r; - break; - case X86::VFMADDPSr213r: - NewFMAOpc = X86::VFMADDPSr231r; - break; - case X86::VFMADDSDr213r: - NewFMAOpc = X86::VFMADDSDr231r; - break; - case X86::VFMADDSSr213r: - NewFMAOpc = X86::VFMADDSSr231r; - break; - case X86::VFMSUBPDr213r: - NewFMAOpc = X86::VFMSUBPDr231r; - break; - case X86::VFMSUBPSr213r: - NewFMAOpc = X86::VFMSUBPSr231r; - break; - case X86::VFMSUBSDr213r: - NewFMAOpc = X86::VFMSUBSDr231r; - break; - case X86::VFMSUBSSr213r: - NewFMAOpc = X86::VFMSUBSSr231r; - break; - case X86::VFNMADDPDr213r: - NewFMAOpc = X86::VFNMADDPDr231r; - break; - case X86::VFNMADDPSr213r: - NewFMAOpc = X86::VFNMADDPSr231r; - break; - case X86::VFNMADDSDr213r: - NewFMAOpc = X86::VFNMADDSDr231r; - break; - case X86::VFNMADDSSr213r: - NewFMAOpc = X86::VFNMADDSSr231r; - break; - case X86::VFNMSUBPDr213r: - NewFMAOpc = X86::VFNMSUBPDr231r; - break; - case X86::VFNMSUBPSr213r: - NewFMAOpc = X86::VFNMSUBPSr231r; - break; - case X86::VFNMSUBSDr213r: - NewFMAOpc = X86::VFNMSUBSDr231r; - break; - case X86::VFNMSUBSSr213r: - NewFMAOpc = X86::VFNMSUBSSr231r; - break; - case X86::VFMADDSUBPDr213r: - NewFMAOpc = X86::VFMADDSUBPDr231r; - break; - case X86::VFMADDSUBPSr213r: - NewFMAOpc = X86::VFMADDSUBPSr231r; - break; - case X86::VFMSUBADDPDr213r: - NewFMAOpc = X86::VFMSUBADDPDr231r; - break; - case X86::VFMSUBADDPSr213r: - NewFMAOpc = X86::VFMSUBADDPSr231r; - break; - - case X86::VFMADDPDr213rY: - NewFMAOpc = X86::VFMADDPDr231rY; - break; - case X86::VFMADDPSr213rY: - NewFMAOpc = X86::VFMADDPSr231rY; - break; - case X86::VFMSUBPDr213rY: - NewFMAOpc = X86::VFMSUBPDr231rY; - break; - case X86::VFMSUBPSr213rY: - NewFMAOpc = X86::VFMSUBPSr231rY; - break; - case X86::VFNMADDPDr213rY: - NewFMAOpc = X86::VFNMADDPDr231rY; - break; - case X86::VFNMADDPSr213rY: - NewFMAOpc = X86::VFNMADDPSr231rY; - break; - case X86::VFNMSUBPDr213rY: - NewFMAOpc = X86::VFNMSUBPDr231rY; - break; - case X86::VFNMSUBPSr213rY: - NewFMAOpc = X86::VFNMSUBPSr231rY; - break; - case X86::VFMADDSUBPDr213rY: - NewFMAOpc = X86::VFMADDSUBPDr231rY; - break; - case X86::VFMADDSUBPSr213rY: - NewFMAOpc = X86::VFMADDSUBPSr231rY; - break; - case X86::VFMSUBADDPDr213rY: - NewFMAOpc = X86::VFMSUBADDPDr231rY; - break; - case X86::VFMSUBADDPSr213rY: - 
NewFMAOpc = X86::VFMSUBADDPSr231rY; - break; - default: - llvm_unreachable("Unrecognized FMA variant."); - } - - const TargetInstrInfo &TII = *Subtarget.getInstrInfo(); - MachineInstrBuilder MIB = - BuildMI(MF, MI.getDebugLoc(), TII.get(NewFMAOpc)) - .addOperand(MI.getOperand(0)) - .addOperand(MI.getOperand(3)) - .addOperand(MI.getOperand(2)) - .addOperand(MI.getOperand(1)); - MBB->insert(MachineBasicBlock::iterator(MI), MIB); - MI.eraseFromParent(); - } - } - - return MBB; -} - MachineBasicBlock * X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const { + MachineFunction *MF = BB->getParent(); + const TargetInstrInfo *TII = Subtarget.getInstrInfo(); + DebugLoc DL = MI.getDebugLoc(); + switch (MI.getOpcode()) { default: llvm_unreachable("Unexpected instr type to insert"); case X86::TAILJMPd64: case X86::TAILJMPr64: case X86::TAILJMPm64: - case X86::TAILJMPd64_REX: case X86::TAILJMPr64_REX: case X86::TAILJMPm64_REX: llvm_unreachable("TAILJMP64 would not be touched here."); @@ -24423,8 +25911,6 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, case X86::RDFLAGS32: case X86::RDFLAGS64: { - DebugLoc DL = MI.getDebugLoc(); - const TargetInstrInfo *TII = Subtarget.getInstrInfo(); unsigned PushF = MI.getOpcode() == X86::RDFLAGS32 ? X86::PUSHF32 : X86::PUSHF64; unsigned Pop = MI.getOpcode() == X86::RDFLAGS32 ? X86::POP32r : X86::POP64r; @@ -24442,8 +25928,6 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, case X86::WRFLAGS32: case X86::WRFLAGS64: { - DebugLoc DL = MI.getDebugLoc(); - const TargetInstrInfo *TII = Subtarget.getInstrInfo(); unsigned Push = MI.getOpcode() == X86::WRFLAGS32 ? X86::PUSH32r : X86::PUSH64r; unsigned PopF = @@ -24468,19 +25952,15 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, case X86::FP80_TO_INT16_IN_MEM: case X86::FP80_TO_INT32_IN_MEM: case X86::FP80_TO_INT64_IN_MEM: { - MachineFunction *F = BB->getParent(); - const TargetInstrInfo *TII = Subtarget.getInstrInfo(); - DebugLoc DL = MI.getDebugLoc(); - // Change the floating point control register to use "round towards zero" // mode when truncating to an integer value. - int CWFrameIdx = F->getFrameInfo()->CreateStackObject(2, 2, false); + int CWFrameIdx = MF->getFrameInfo().CreateStackObject(2, 2, false); addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::FNSTCW16m)), CWFrameIdx); // Load the old value of the high byte of the control word... 
unsigned OldCW = - F->getRegInfo().createVirtualRegister(&X86::GR16RegClass); + MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass); addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW), CWFrameIdx); @@ -24588,39 +26068,57 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, case TargetOpcode::PATCHPOINT: return emitPatchPoint(MI, BB); - case X86::VFMADDPDr213r: - case X86::VFMADDPSr213r: - case X86::VFMADDSDr213r: - case X86::VFMADDSSr213r: - case X86::VFMSUBPDr213r: - case X86::VFMSUBPSr213r: - case X86::VFMSUBSDr213r: - case X86::VFMSUBSSr213r: - case X86::VFNMADDPDr213r: - case X86::VFNMADDPSr213r: - case X86::VFNMADDSDr213r: - case X86::VFNMADDSSr213r: - case X86::VFNMSUBPDr213r: - case X86::VFNMSUBPSr213r: - case X86::VFNMSUBSDr213r: - case X86::VFNMSUBSSr213r: - case X86::VFMADDSUBPDr213r: - case X86::VFMADDSUBPSr213r: - case X86::VFMSUBADDPDr213r: - case X86::VFMSUBADDPSr213r: - case X86::VFMADDPDr213rY: - case X86::VFMADDPSr213rY: - case X86::VFMSUBPDr213rY: - case X86::VFMSUBPSr213rY: - case X86::VFNMADDPDr213rY: - case X86::VFNMADDPSr213rY: - case X86::VFNMSUBPDr213rY: - case X86::VFNMSUBPSr213rY: - case X86::VFMADDSUBPDr213rY: - case X86::VFMADDSUBPSr213rY: - case X86::VFMSUBADDPDr213rY: - case X86::VFMSUBADDPSr213rY: - return emitFMA3Instr(MI, BB); + case X86::LCMPXCHG8B: { + const X86RegisterInfo *TRI = Subtarget.getRegisterInfo(); + // In addition to 4 E[ABCD] registers implied by encoding, CMPXCHG8B + // requires a memory operand. If it happens that current architecture is + // i686 and for current function we need a base pointer + // - which is ESI for i686 - register allocator would not be able to + // allocate registers for an address in form of X(%reg, %reg, Y) + // - there never would be enough unreserved registers during regalloc + // (without the need for base ptr the only option would be X(%edi, %esi, Y). + // We are giving a hand to register allocator by precomputing the address in + // a new vreg using LEA. + + // If it is not i686 or there is no base pointer - nothing to do here. + if (!Subtarget.is32Bit() || !TRI->hasBasePointer(*MF)) + return BB; + + // Even though this code does not necessarily needs the base pointer to + // be ESI, we check for that. The reason: if this assert fails, there are + // some changes happened in the compiler base pointer handling, which most + // probably have to be addressed somehow here. + assert(TRI->getBaseRegister() == X86::ESI && + "LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a " + "base pointer in mind"); + + MachineRegisterInfo &MRI = MF->getRegInfo(); + MVT SPTy = getPointerTy(MF->getDataLayout()); + const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy); + unsigned computedAddrVReg = MRI.createVirtualRegister(AddrRegClass); + + X86AddressMode AM = getAddressFromInstr(&MI, 0); + // Regalloc does not need any help when the memory operand of CMPXCHG8B + // does not use index register. + if (AM.IndexReg == X86::NoRegister) + return BB; + + // After X86TargetLowering::ReplaceNodeResults CMPXCHG8B is glued to its + // four operand definitions that are E[ABCD] registers. We skip them and + // then insert the LEA. 
+ MachineBasicBlock::iterator MBBI(MI); + while (MBBI->definesRegister(X86::EAX) || MBBI->definesRegister(X86::EBX) || + MBBI->definesRegister(X86::ECX) || MBBI->definesRegister(X86::EDX)) + --MBBI; + addFullAddress( + BuildMI(*BB, *MBBI, DL, TII->get(X86::LEA32r), computedAddrVReg), AM); + + setDirectAddressInInstr(&MI, 0, computedAddrVReg); + + return BB; + } + case X86::LCMPXCHG16B: + return BB; case X86::LCMPXCHG8B_SAVE_EBX: case X86::LCMPXCHG16B_SAVE_RBX: { unsigned BasePtr = @@ -24667,7 +26165,7 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op, // These nodes' second result is a boolean. if (Op.getResNo() == 0) break; - // Fallthrough + LLVM_FALLTHROUGH; case X86ISD::SETCC: KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1); break; @@ -24676,16 +26174,36 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op, KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - NumLoBits); break; } + case X86ISD::VZEXT: { + SDValue N0 = Op.getOperand(0); + unsigned NumElts = Op.getValueType().getVectorNumElements(); + unsigned InNumElts = N0.getValueType().getVectorNumElements(); + unsigned InBitWidth = N0.getValueType().getScalarSizeInBits(); + + KnownZero = KnownOne = APInt(InBitWidth, 0); + APInt DemandedElts = APInt::getLowBitsSet(InNumElts, NumElts); + DAG.computeKnownBits(N0, KnownZero, KnownOne, DemandedElts, Depth + 1); + KnownOne = KnownOne.zext(BitWidth); + KnownZero = KnownZero.zext(BitWidth); + KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - InBitWidth); + break; + } } } unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode( - SDValue Op, - const SelectionDAG &, - unsigned Depth) const { + SDValue Op, const SelectionDAG &DAG, unsigned Depth) const { // SETCC_CARRY sets the dest to ~0 for true or 0 for false. if (Op.getOpcode() == X86ISD::SETCC_CARRY) - return Op.getValueType().getScalarSizeInBits(); + return Op.getScalarValueSizeInBits(); + + if (Op.getOpcode() == X86ISD::VSEXT) { + EVT VT = Op.getValueType(); + EVT SrcVT = Op.getOperand(0).getValueType(); + unsigned Tmp = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1); + Tmp += VT.getScalarSizeInBits() - SrcVT.getScalarSizeInBits(); + return Tmp; + } // Fallback case. return 1; @@ -24706,171 +26224,113 @@ bool X86TargetLowering::isGAPlusOffset(SDNode *N, return TargetLowering::isGAPlusOffset(N, GA, Offset); } -/// Performs shuffle combines for 256-bit vectors. -/// FIXME: This could be expanded to support 512 bit vectors as well. -static SDValue combineShuffle256(SDNode *N, SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI, - const X86Subtarget &Subtarget) { - SDLoc dl(N); - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); - SDValue V1 = SVOp->getOperand(0); - SDValue V2 = SVOp->getOperand(1); - MVT VT = SVOp->getSimpleValueType(0); - unsigned NumElems = VT.getVectorNumElements(); - - if (V1.getOpcode() == ISD::CONCAT_VECTORS && - V2.getOpcode() == ISD::CONCAT_VECTORS) { - // - // 0,0,0,... - // | - // V UNDEF BUILD_VECTOR UNDEF - // \ / \ / - // CONCAT_VECTOR CONCAT_VECTOR - // \ / - // \ / - // RESULT: V + zero extended - // - if (V2.getOperand(0).getOpcode() != ISD::BUILD_VECTOR || - !V2.getOperand(1).isUndef() || !V1.getOperand(1).isUndef()) - return SDValue(); - - if (!ISD::isBuildVectorAllZeros(V2.getOperand(0).getNode())) - return SDValue(); - - // To match the shuffle mask, the first half of the mask should - // be exactly the first vector, and all the rest a splat with the - // first element of the second one. 
- for (unsigned i = 0; i != NumElems/2; ++i) - if (!isUndefOrEqual(SVOp->getMaskElt(i), i) || - !isUndefOrEqual(SVOp->getMaskElt(i+NumElems/2), NumElems)) - return SDValue(); - - // If V1 is coming from a vector load then just fold to a VZEXT_LOAD. - if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(V1.getOperand(0))) { - if (Ld->hasNUsesOfValue(1, 0)) { - SDVTList Tys = DAG.getVTList(MVT::v4i64, MVT::Other); - SDValue Ops[] = { Ld->getChain(), Ld->getBasePtr() }; - SDValue ResNode = - DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, - Ld->getMemoryVT(), - Ld->getPointerInfo(), - Ld->getAlignment(), - false/*isVolatile*/, true/*ReadMem*/, - false/*WriteMem*/); - - // Make sure the newly-created LOAD is in the same position as Ld in - // terms of dependency. We create a TokenFactor for Ld and ResNode, - // and update uses of Ld's output chain to use the TokenFactor. - if (Ld->hasAnyUseOfValue(1)) { - SDValue NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, - SDValue(Ld, 1), SDValue(ResNode.getNode(), 1)); - DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewChain); - DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(Ld, 1), - SDValue(ResNode.getNode(), 1)); - } - - return DAG.getBitcast(VT, ResNode); - } - } - - // Emit a zeroed vector and insert the desired subvector on its - // first half. - SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl); - SDValue InsV = insert128BitVector(Zeros, V1.getOperand(0), 0, DAG, dl); - return DCI.CombineTo(N, InsV); - } - - return SDValue(); -} - // Attempt to match a combined shuffle mask against supported unary shuffle // instructions. // TODO: Investigate sharing more of this with shuffle lowering. -static bool matchUnaryVectorShuffle(MVT SrcVT, ArrayRef<int> Mask, +static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask, + bool FloatDomain, const X86Subtarget &Subtarget, - unsigned &Shuffle, MVT &ShuffleVT) { - bool FloatDomain = SrcVT.isFloatingPoint() || - (!Subtarget.hasAVX2() && SrcVT.is256BitVector()); + unsigned &Shuffle, MVT &SrcVT, MVT &DstVT) { + unsigned NumMaskElts = Mask.size(); + unsigned MaskEltSize = MaskVT.getScalarSizeInBits(); - // Match a 128-bit integer vector against a VZEXT_MOVL (MOVQ) instruction. - if (!FloatDomain && SrcVT.is128BitVector() && - isTargetShuffleEquivalent(Mask, {0, SM_SentinelZero})) { + // Match against a VZEXT_MOVL instruction, SSE1 only supports 32-bits (MOVSS). + if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2())) && + isUndefOrEqual(Mask[0], 0) && + isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) { Shuffle = X86ISD::VZEXT_MOVL; - ShuffleVT = MVT::v2i64; + SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT; return true; } + // Match against a VZEXT instruction. + // TODO: Add 256/512-bit vector support. + if (!FloatDomain && MaskVT.is128BitVector() && Subtarget.hasSSE41()) { + unsigned MaxScale = 64 / MaskEltSize; + for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) { + bool Match = true; + unsigned NumDstElts = NumMaskElts / Scale; + for (unsigned i = 0; i != NumDstElts && Match; ++i) { + Match &= isUndefOrEqual(Mask[i * Scale], (int)i); + Match &= isUndefOrZeroInRange(Mask, (i * Scale) + 1, Scale - 1); + } + if (Match) { + SrcVT = MaskVT; + DstVT = MVT::getIntegerVT(Scale * MaskEltSize); + DstVT = MVT::getVectorVT(DstVT, NumDstElts); + Shuffle = X86ISD::VZEXT; + return true; + } + } + } + // Check if we have SSE3 which will let us use MOVDDUP etc. 
The // instructions are no slower than UNPCKLPD but has the option to // fold the input operand into even an unaligned memory load. - if (SrcVT.is128BitVector() && Subtarget.hasSSE3() && FloatDomain) { + if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && FloatDomain) { if (isTargetShuffleEquivalent(Mask, {0, 0})) { Shuffle = X86ISD::MOVDDUP; - ShuffleVT = MVT::v2f64; + SrcVT = DstVT = MVT::v2f64; return true; } if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) { Shuffle = X86ISD::MOVSLDUP; - ShuffleVT = MVT::v4f32; + SrcVT = DstVT = MVT::v4f32; return true; } if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3})) { Shuffle = X86ISD::MOVSHDUP; - ShuffleVT = MVT::v4f32; + SrcVT = DstVT = MVT::v4f32; return true; } } - if (SrcVT.is256BitVector() && FloatDomain) { + if (MaskVT.is256BitVector() && FloatDomain) { assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles"); if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) { Shuffle = X86ISD::MOVDDUP; - ShuffleVT = MVT::v4f64; + SrcVT = DstVT = MVT::v4f64; return true; } if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) { Shuffle = X86ISD::MOVSLDUP; - ShuffleVT = MVT::v8f32; + SrcVT = DstVT = MVT::v8f32; return true; } if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3, 5, 5, 7, 7})) { Shuffle = X86ISD::MOVSHDUP; - ShuffleVT = MVT::v8f32; + SrcVT = DstVT = MVT::v8f32; return true; } } - if (SrcVT.is512BitVector() && FloatDomain) { + if (MaskVT.is512BitVector() && FloatDomain) { assert(Subtarget.hasAVX512() && "AVX512 required for 512-bit vector shuffles"); if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) { Shuffle = X86ISD::MOVDDUP; - ShuffleVT = MVT::v8f64; + SrcVT = DstVT = MVT::v8f64; return true; } if (isTargetShuffleEquivalent( Mask, {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14})) { Shuffle = X86ISD::MOVSLDUP; - ShuffleVT = MVT::v16f32; + SrcVT = DstVT = MVT::v16f32; return true; } if (isTargetShuffleEquivalent( Mask, {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15})) { Shuffle = X86ISD::MOVSHDUP; - ShuffleVT = MVT::v16f32; + SrcVT = DstVT = MVT::v16f32; return true; } } // Attempt to match against broadcast-from-vector. if (Subtarget.hasAVX2()) { - unsigned NumElts = Mask.size(); - SmallVector<int, 64> BroadcastMask(NumElts, 0); + SmallVector<int, 64> BroadcastMask(NumMaskElts, 0); if (isTargetShuffleEquivalent(Mask, BroadcastMask)) { - unsigned EltSize = SrcVT.getSizeInBits() / NumElts; - ShuffleVT = FloatDomain ? MVT::getFloatingPointVT(EltSize) - : MVT::getIntegerVT(EltSize); - ShuffleVT = MVT::getVectorVT(ShuffleVT, NumElts); + SrcVT = DstVT = MaskVT; Shuffle = X86ISD::VBROADCAST; return true; } @@ -24882,19 +26342,44 @@ static bool matchUnaryVectorShuffle(MVT SrcVT, ArrayRef<int> Mask, // Attempt to match a combined shuffle mask against supported unary immediate // permute instructions. // TODO: Investigate sharing more of this with shuffle lowering. -static bool matchPermuteVectorShuffle(MVT SrcVT, ArrayRef<int> Mask, - const X86Subtarget &Subtarget, - unsigned &Shuffle, MVT &ShuffleVT, - unsigned &PermuteImm) { - // Ensure we don't contain any zero elements. 
- for (int M : Mask) { - if (M == SM_SentinelZero) - return false; - assert(SM_SentinelUndef <= M && M < (int)Mask.size() && - "Expected unary shuffle"); +static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask, + bool FloatDomain, + const X86Subtarget &Subtarget, + unsigned &Shuffle, MVT &ShuffleVT, + unsigned &PermuteImm) { + unsigned NumMaskElts = Mask.size(); + + bool ContainsZeros = false; + SmallBitVector Zeroable(NumMaskElts, false); + for (unsigned i = 0; i != NumMaskElts; ++i) { + int M = Mask[i]; + Zeroable[i] = isUndefOrZero(M); + ContainsZeros |= (M == SM_SentinelZero); + } + + // Attempt to match against byte/bit shifts. + // FIXME: Add 512-bit support. + if (!FloatDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) || + (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) { + int ShiftAmt = matchVectorShuffleAsShift(ShuffleVT, Shuffle, + MaskVT.getScalarSizeInBits(), Mask, + 0, Zeroable, Subtarget); + if (0 < ShiftAmt) { + PermuteImm = (unsigned)ShiftAmt; + return true; + } } - unsigned MaskScalarSizeInBits = SrcVT.getSizeInBits() / Mask.size(); + // Ensure we don't contain any zero elements. + if (ContainsZeros) + return false; + + assert(llvm::all_of(Mask, [&](int M) { + return SM_SentinelUndef <= M && M < (int)NumMaskElts; + }) && "Expected unary shuffle"); + + unsigned InputSizeInBits = MaskVT.getSizeInBits(); + unsigned MaskScalarSizeInBits = InputSizeInBits / Mask.size(); MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits); // Handle PSHUFLW/PSHUFHW repeated patterns. @@ -24908,7 +26393,7 @@ static bool matchPermuteVectorShuffle(MVT SrcVT, ArrayRef<int> Mask, if (isUndefOrInRange(LoMask, 0, 4) && isSequentialOrUndefInRange(HiMask, 0, 4, 4)) { Shuffle = X86ISD::PSHUFLW; - ShuffleVT = MVT::getVectorVT(MVT::i16, SrcVT.getSizeInBits() / 16); + ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16); PermuteImm = getV4X86ShuffleImm(LoMask); return true; } @@ -24922,7 +26407,7 @@ static bool matchPermuteVectorShuffle(MVT SrcVT, ArrayRef<int> Mask, OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4); Shuffle = X86ISD::PSHUFHW; - ShuffleVT = MVT::getVectorVT(MVT::i16, SrcVT.getSizeInBits() / 16); + ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16); PermuteImm = getV4X86ShuffleImm(OffsetHiMask); return true; } @@ -24938,24 +26423,23 @@ static bool matchPermuteVectorShuffle(MVT SrcVT, ArrayRef<int> Mask, // AVX introduced the VPERMILPD/VPERMILPS float permutes, before then we // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here). - bool FloatDomain = SrcVT.isFloatingPoint(); if (FloatDomain && !Subtarget.hasAVX()) return false; // Pre-AVX2 we must use float shuffles on 256-bit vectors. - if (SrcVT.is256BitVector() && !Subtarget.hasAVX2()) + if (MaskVT.is256BitVector() && !Subtarget.hasAVX2()) FloatDomain = true; // Check for lane crossing permutes. if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) { // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+). - if (Subtarget.hasAVX2() && SrcVT.is256BitVector() && Mask.size() == 4) { + if (Subtarget.hasAVX2() && MaskVT.is256BitVector() && Mask.size() == 4) { Shuffle = X86ISD::VPERMI; ShuffleVT = (FloatDomain ? 
MVT::v4f64 : MVT::v4i64); PermuteImm = getV4X86ShuffleImm(Mask); return true; } - if (Subtarget.hasAVX512() && SrcVT.is512BitVector() && Mask.size() == 8) { + if (Subtarget.hasAVX512() && MaskVT.is512BitVector() && Mask.size() == 8) { SmallVector<int, 4> RepeatedMask; if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) { Shuffle = X86ISD::VPERMI; @@ -24994,7 +26478,7 @@ static bool matchPermuteVectorShuffle(MVT SrcVT, ArrayRef<int> Mask, Shuffle = (FloatDomain ? X86ISD::VPERMILPI : X86ISD::PSHUFD); ShuffleVT = (FloatDomain ? MVT::f32 : MVT::i32); - ShuffleVT = MVT::getVectorVT(ShuffleVT, SrcVT.getSizeInBits() / 32); + ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32); PermuteImm = getV4X86ShuffleImm(WordMask); return true; } @@ -25002,47 +26486,259 @@ static bool matchPermuteVectorShuffle(MVT SrcVT, ArrayRef<int> Mask, // Attempt to match a combined unary shuffle mask against supported binary // shuffle instructions. // TODO: Investigate sharing more of this with shuffle lowering. -static bool matchBinaryVectorShuffle(MVT SrcVT, ArrayRef<int> Mask, - unsigned &Shuffle, MVT &ShuffleVT) { - bool FloatDomain = SrcVT.isFloatingPoint(); +static bool matchBinaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask, + bool FloatDomain, SDValue &V1, SDValue &V2, + const X86Subtarget &Subtarget, + unsigned &Shuffle, MVT &ShuffleVT, + bool IsUnary) { + unsigned EltSizeInBits = MaskVT.getScalarSizeInBits(); - if (SrcVT.is128BitVector()) { + if (MaskVT.is128BitVector()) { if (isTargetShuffleEquivalent(Mask, {0, 0}) && FloatDomain) { + V2 = V1; Shuffle = X86ISD::MOVLHPS; ShuffleVT = MVT::v4f32; return true; } if (isTargetShuffleEquivalent(Mask, {1, 1}) && FloatDomain) { + V2 = V1; Shuffle = X86ISD::MOVHLPS; ShuffleVT = MVT::v4f32; return true; } - if (isTargetShuffleEquivalent(Mask, {0, 0, 1, 1}) && FloatDomain) { - Shuffle = X86ISD::UNPCKL; - ShuffleVT = MVT::v4f32; + if (isTargetShuffleEquivalent(Mask, {0, 3}) && Subtarget.hasSSE2() && + (FloatDomain || !Subtarget.hasSSE41())) { + std::swap(V1, V2); + Shuffle = X86ISD::MOVSD; + ShuffleVT = MaskVT; return true; } - if (isTargetShuffleEquivalent(Mask, {2, 2, 3, 3}) && FloatDomain) { - Shuffle = X86ISD::UNPCKH; - ShuffleVT = MVT::v4f32; + if (isTargetShuffleEquivalent(Mask, {4, 1, 2, 3}) && + (FloatDomain || !Subtarget.hasSSE41())) { + Shuffle = X86ISD::MOVSS; + ShuffleVT = MaskVT; + return true; + } + } + + // Attempt to match against either a unary or binary UNPCKL/UNPCKH shuffle. + if ((MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) || + (MaskVT.is128BitVector() && Subtarget.hasSSE2()) || + (MaskVT.is256BitVector() && 32 <= EltSizeInBits && Subtarget.hasAVX()) || + (MaskVT.is256BitVector() && Subtarget.hasAVX2()) || + (MaskVT.is512BitVector() && Subtarget.hasAVX512())) { + MVT LegalVT = MaskVT; + if (LegalVT.is256BitVector() && !Subtarget.hasAVX2()) + LegalVT = (32 == EltSizeInBits ? 
MVT::v8f32 : MVT::v4f64); + + SmallVector<int, 64> Unpckl, Unpckh; + if (IsUnary) { + createUnpackShuffleMask(MaskVT, Unpckl, true, true); + if (isTargetShuffleEquivalent(Mask, Unpckl)) { + V2 = V1; + Shuffle = X86ISD::UNPCKL; + ShuffleVT = LegalVT; + return true; + } + + createUnpackShuffleMask(MaskVT, Unpckh, false, true); + if (isTargetShuffleEquivalent(Mask, Unpckh)) { + V2 = V1; + Shuffle = X86ISD::UNPCKH; + ShuffleVT = LegalVT; + return true; + } + } else { + createUnpackShuffleMask(MaskVT, Unpckl, true, false); + if (isTargetShuffleEquivalent(Mask, Unpckl)) { + Shuffle = X86ISD::UNPCKL; + ShuffleVT = LegalVT; + return true; + } + + createUnpackShuffleMask(MaskVT, Unpckh, false, false); + if (isTargetShuffleEquivalent(Mask, Unpckh)) { + Shuffle = X86ISD::UNPCKH; + ShuffleVT = LegalVT; + return true; + } + + ShuffleVectorSDNode::commuteMask(Unpckl); + if (isTargetShuffleEquivalent(Mask, Unpckl)) { + std::swap(V1, V2); + Shuffle = X86ISD::UNPCKL; + ShuffleVT = LegalVT; + return true; + } + + ShuffleVectorSDNode::commuteMask(Unpckh); + if (isTargetShuffleEquivalent(Mask, Unpckh)) { + std::swap(V1, V2); + Shuffle = X86ISD::UNPCKH; + ShuffleVT = LegalVT; + return true; + } + } + } + + return false; +} + +static bool matchBinaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask, + bool FloatDomain, + SDValue &V1, SDValue &V2, + SDLoc &DL, SelectionDAG &DAG, + const X86Subtarget &Subtarget, + unsigned &Shuffle, MVT &ShuffleVT, + unsigned &PermuteImm) { + unsigned NumMaskElts = Mask.size(); + + // Attempt to match against PALIGNR byte rotate. + if (!FloatDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) || + (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) { + int ByteRotation = matchVectorShuffleAsByteRotate(MaskVT, V1, V2, Mask); + if (0 < ByteRotation) { + Shuffle = X86ISD::PALIGNR; + ShuffleVT = MVT::getVectorVT(MVT::i8, MaskVT.getSizeInBits() / 8); + PermuteImm = ByteRotation; + return true; + } + } + + // Attempt to combine to X86ISD::BLENDI. + if (NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) || + (Subtarget.hasAVX() && MaskVT.is256BitVector()))) { + // Determine a type compatible with X86ISD::BLENDI. + // TODO - add 16i16 support (requires lane duplication). + MVT BlendVT = MaskVT; + if (Subtarget.hasAVX2()) { + if (BlendVT == MVT::v4i64) + BlendVT = MVT::v8i32; + else if (BlendVT == MVT::v2i64) + BlendVT = MVT::v4i32; + } else { + if (BlendVT == MVT::v2i64 || BlendVT == MVT::v4i32) + BlendVT = MVT::v8i16; + else if (BlendVT == MVT::v4i64) + BlendVT = MVT::v4f64; + else if (BlendVT == MVT::v8i32) + BlendVT = MVT::v8f32; + } + + unsigned BlendSize = BlendVT.getVectorNumElements(); + unsigned MaskRatio = BlendSize / NumMaskElts; + + // Can we blend with zero? + if (isSequentialOrUndefOrZeroInRange(Mask, /*Pos*/ 0, /*Size*/ NumMaskElts, + /*Low*/ 0) && + NumMaskElts <= BlendVT.getVectorNumElements()) { + PermuteImm = 0; + for (unsigned i = 0; i != BlendSize; ++i) + if (Mask[i / MaskRatio] < 0) + PermuteImm |= 1u << i; + + V2 = getZeroVector(BlendVT, Subtarget, DAG, DL); + Shuffle = X86ISD::BLENDI; + ShuffleVT = BlendVT; return true; } - if (isTargetShuffleEquivalent(Mask, {0, 0, 1, 1, 2, 2, 3, 3}) || - isTargetShuffleEquivalent( - Mask, {0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7})) { - Shuffle = X86ISD::UNPCKL; - ShuffleVT = Mask.size() == 8 ? MVT::v8i16 : MVT::v16i8; + + // Attempt to match as a binary blend. 
+ if (NumMaskElts <= BlendVT.getVectorNumElements()) { + bool MatchBlend = true; + for (int i = 0; i != (int)NumMaskElts; ++i) { + int M = Mask[i]; + if (M == SM_SentinelUndef) + continue; + else if (M == SM_SentinelZero) + MatchBlend = false; + else if ((M != i) && (M != (i + (int)NumMaskElts))) + MatchBlend = false; + } + + if (MatchBlend) { + PermuteImm = 0; + for (unsigned i = 0; i != BlendSize; ++i) + if ((int)NumMaskElts <= Mask[i / MaskRatio]) + PermuteImm |= 1u << i; + + Shuffle = X86ISD::BLENDI; + ShuffleVT = BlendVT; + return true; + } + } + } + + // Attempt to combine to INSERTPS. + if (Subtarget.hasSSE41() && MaskVT == MVT::v4f32) { + SmallBitVector Zeroable(4, false); + for (unsigned i = 0; i != NumMaskElts; ++i) + if (Mask[i] < 0) + Zeroable[i] = true; + + if (Zeroable.any() && + matchVectorShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) { + Shuffle = X86ISD::INSERTPS; + ShuffleVT = MVT::v4f32; return true; } - if (isTargetShuffleEquivalent(Mask, {4, 4, 5, 5, 6, 6, 7, 7}) || - isTargetShuffleEquivalent(Mask, {8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, - 13, 14, 14, 15, 15})) { - Shuffle = X86ISD::UNPCKH; - ShuffleVT = Mask.size() == 8 ? MVT::v8i16 : MVT::v16i8; + } + + // Attempt to combine to SHUFPD. + if ((MaskVT == MVT::v2f64 && Subtarget.hasSSE2()) || + (MaskVT == MVT::v4f64 && Subtarget.hasAVX()) || + (MaskVT == MVT::v8f64 && Subtarget.hasAVX512())) { + if (matchVectorShuffleWithSHUFPD(MaskVT, V1, V2, PermuteImm, Mask)) { + Shuffle = X86ISD::SHUFP; + ShuffleVT = MaskVT; return true; } } + // Attempt to combine to SHUFPS. + if ((MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) || + (MaskVT == MVT::v8f32 && Subtarget.hasAVX()) || + (MaskVT == MVT::v16f32 && Subtarget.hasAVX512())) { + SmallVector<int, 4> RepeatedMask; + if (isRepeatedTargetShuffleMask(128, MaskVT, Mask, RepeatedMask)) { + auto MatchHalf = [&](unsigned Offset, int &S0, int &S1) { + int M0 = RepeatedMask[Offset]; + int M1 = RepeatedMask[Offset + 1]; + + if (isUndefInRange(RepeatedMask, Offset, 2)) { + return DAG.getUNDEF(MaskVT); + } else if (isUndefOrZeroInRange(RepeatedMask, Offset, 2)) { + S0 = (SM_SentinelUndef == M0 ? -1 : 0); + S1 = (SM_SentinelUndef == M1 ? -1 : 1); + return getZeroVector(MaskVT, Subtarget, DAG, DL); + } else if (isUndefOrInRange(M0, 0, 4) && isUndefOrInRange(M1, 0, 4)) { + S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3); + S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3); + return V1; + } else if (isUndefOrInRange(M0, 4, 8) && isUndefOrInRange(M1, 4, 8)) { + S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3); + S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3); + return V2; + } + + return SDValue(); + }; + + int ShufMask[4] = {-1, -1, -1, -1}; + SDValue Lo = MatchHalf(0, ShufMask[0], ShufMask[1]); + SDValue Hi = MatchHalf(2, ShufMask[2], ShufMask[3]); + + if (Lo && Hi) { + V1 = Lo; + V2 = Hi; + Shuffle = X86ISD::SHUFP; + ShuffleVT = MaskVT; + PermuteImm = getV4X86ShuffleImm(ShufMask); + return true; + } + } + } + return false; } @@ -25055,33 +26751,44 @@ static bool matchBinaryVectorShuffle(MVT SrcVT, ArrayRef<int> Mask, /// into either a single instruction if there is a special purpose instruction /// for this operation, or into a PSHUFB instruction which is a fully general /// instruction but should only be used to replace chains over a certain depth. 
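The blend matching in the hunk above scales the combined shuffle mask from NumMaskElts entries up to the element count of the BLENDI-compatible type and sets one immediate bit per blend element that should come from the second source (the zero vector in the blend-with-zero case). A minimal standalone sketch of that immediate computation, covering only the blend-with-zero path; names are local to the sketch:

#include <cstdio>
#include <vector>

// Sentinel values as used by the x86 shuffle combining code.
constexpr int SM_SentinelUndef = -1;
constexpr int SM_SentinelZero = -2;

// Scale a shuffle mask with Mask.size() entries to a blend immediate for a
// BLENDI type with BlendSize elements; bit i == 1 selects element i of the
// second source (a zero vector when blending with zero).
unsigned getBlendWithZeroImm(const std::vector<int> &Mask, unsigned BlendSize) {
  unsigned NumMaskElts = Mask.size();
  unsigned MaskRatio = BlendSize / NumMaskElts;
  unsigned Imm = 0;
  for (unsigned i = 0; i != BlendSize; ++i)
    if (Mask[i / MaskRatio] < 0) // undef or zero -> take the zero vector
      Imm |= 1u << i;
  return Imm;
}

int main() {
  // A v4i32-style mask <0, zero, 2, zero> matched as a v8i16 blend-with-zero.
  std::vector<int> Mask = {0, SM_SentinelZero, 2, SM_SentinelZero};
  printf("blend imm = 0x%x\n", getBlendWithZeroImm(Mask, 8)); // prints 0xcc
}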
-static bool combineX86ShuffleChain(SDValue Input, SDValue Root, +static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, ArrayRef<int> BaseMask, int Depth, bool HasVariableMask, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!"); + assert((Inputs.size() == 1 || Inputs.size() == 2) && + "Unexpected number of shuffle inputs!"); - // Find the operand that enters the chain. Note that multiple uses are OK - // here, we're not going to remove the operand we find. - Input = peekThroughBitcasts(Input); + // Find the inputs that enter the chain. Note that multiple uses are OK + // here, we're not going to remove the operands we find. + bool UnaryShuffle = (Inputs.size() == 1); + SDValue V1 = peekThroughBitcasts(Inputs[0]); + SDValue V2 = (UnaryShuffle ? V1 : peekThroughBitcasts(Inputs[1])); - MVT VT = Input.getSimpleValueType(); + MVT VT1 = V1.getSimpleValueType(); + MVT VT2 = V2.getSimpleValueType(); MVT RootVT = Root.getSimpleValueType(); - SDLoc DL(Root); + assert(VT1.getSizeInBits() == RootVT.getSizeInBits() && + VT2.getSizeInBits() == RootVT.getSizeInBits() && + "Vector size mismatch"); + SDLoc DL(Root); SDValue Res; unsigned NumBaseMaskElts = BaseMask.size(); if (NumBaseMaskElts == 1) { assert(BaseMask[0] == 0 && "Invalid shuffle index found!"); - DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Input), + DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, V1), /*AddTo*/ true); return true; } unsigned RootSizeInBits = RootVT.getSizeInBits(); + unsigned NumRootElts = RootVT.getVectorNumElements(); unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts; + bool FloatDomain = VT1.isFloatingPoint() || VT2.isFloatingPoint() || + (RootVT.is256BitVector() && !Subtarget.hasAVX2()); // Don't combine if we are a AVX512/EVEX target and the mask element size // is different from the root element size - this would prevent writemasks @@ -25089,26 +26796,25 @@ static bool combineX86ShuffleChain(SDValue Input, SDValue Root, // TODO - this currently prevents all lane shuffles from occurring. // TODO - check for writemasks usage instead of always preventing combining. // TODO - attempt to narrow Mask back to writemask size. - if (RootVT.getScalarSizeInBits() != BaseMaskEltSizeInBits && - (RootSizeInBits == 512 || - (Subtarget.hasVLX() && RootSizeInBits >= 128))) { + bool IsEVEXShuffle = + RootSizeInBits == 512 || (Subtarget.hasVLX() && RootSizeInBits >= 128); + if (IsEVEXShuffle && (RootVT.getScalarSizeInBits() != BaseMaskEltSizeInBits)) return false; - } // TODO - handle 128/256-bit lane shuffles of 512-bit vectors. // Handle 128-bit lane shuffles of 256-bit vectors. - if (VT.is256BitVector() && NumBaseMaskElts == 2 && + // TODO - this should support binary shuffles. + if (UnaryShuffle && RootVT.is256BitVector() && NumBaseMaskElts == 2 && !isSequentialOrUndefOrZeroInRange(BaseMask, 0, 2, 0)) { if (Depth == 1 && Root.getOpcode() == X86ISD::VPERM2X128) return false; // Nothing to do! - MVT ShuffleVT = (VT.isFloatingPoint() || !Subtarget.hasAVX2() ? MVT::v4f64 - : MVT::v4i64); + MVT ShuffleVT = (FloatDomain ? MVT::v4f64 : MVT::v4i64); unsigned PermMask = 0; PermMask |= ((BaseMask[0] < 0 ? 0x8 : (BaseMask[0] & 1)) << 0); PermMask |= ((BaseMask[1] < 0 ? 
0x8 : (BaseMask[1] & 1)) << 4); - Res = DAG.getBitcast(ShuffleVT, Input); + Res = DAG.getBitcast(ShuffleVT, V1); DCI.AddToWorklist(Res.getNode()); Res = DAG.getNode(X86ISD::VPERM2X128, DL, ShuffleVT, Res, DAG.getUNDEF(ShuffleVT), @@ -25134,144 +26840,234 @@ static bool combineX86ShuffleChain(SDValue Input, SDValue Root, unsigned MaskEltSizeInBits = RootSizeInBits / NumMaskElts; // Determine the effective mask value type. - bool FloatDomain = - (VT.isFloatingPoint() || (VT.is256BitVector() && !Subtarget.hasAVX2())) && - (32 <= MaskEltSizeInBits); + FloatDomain &= (32 <= MaskEltSizeInBits); MVT MaskVT = FloatDomain ? MVT::getFloatingPointVT(MaskEltSizeInBits) : MVT::getIntegerVT(MaskEltSizeInBits); MaskVT = MVT::getVectorVT(MaskVT, NumMaskElts); + // Only allow legal mask types. + if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT)) + return false; + // Attempt to match the mask against known shuffle patterns. - MVT ShuffleVT; + MVT ShuffleSrcVT, ShuffleVT; unsigned Shuffle, PermuteImm; - if (matchUnaryVectorShuffle(VT, Mask, Subtarget, Shuffle, ShuffleVT)) { - if (Depth == 1 && Root.getOpcode() == Shuffle) - return false; // Nothing to do! - Res = DAG.getBitcast(ShuffleVT, Input); - DCI.AddToWorklist(Res.getNode()); - Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res); - DCI.AddToWorklist(Res.getNode()); - DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res), - /*AddTo*/ true); - return true; + if (UnaryShuffle) { + // If we are shuffling a X86ISD::VZEXT_LOAD then we can use the load + // directly if we don't shuffle the lower element and we shuffle the upper + // (zero) elements within themselves. + if (V1.getOpcode() == X86ISD::VZEXT_LOAD && + (V1.getScalarValueSizeInBits() % MaskEltSizeInBits) == 0) { + unsigned Scale = V1.getScalarValueSizeInBits() / MaskEltSizeInBits; + ArrayRef<int> HiMask(Mask.data() + Scale, NumMaskElts - Scale); + if (isSequentialOrUndefInRange(Mask, 0, Scale, 0) && + isUndefOrZeroOrInRange(HiMask, Scale, NumMaskElts)) { + DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, V1), + /*AddTo*/ true); + return true; + } + } + + if (matchUnaryVectorShuffle(MaskVT, Mask, FloatDomain, Subtarget, Shuffle, + ShuffleSrcVT, ShuffleVT)) { + if (Depth == 1 && Root.getOpcode() == Shuffle) + return false; // Nothing to do! + if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements())) + return false; // AVX512 Writemask clash. + Res = DAG.getBitcast(ShuffleSrcVT, V1); + DCI.AddToWorklist(Res.getNode()); + Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res); + DCI.AddToWorklist(Res.getNode()); + DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res), + /*AddTo*/ true); + return true; + } + + if (matchUnaryPermuteVectorShuffle(MaskVT, Mask, FloatDomain, Subtarget, + Shuffle, ShuffleVT, PermuteImm)) { + if (Depth == 1 && Root.getOpcode() == Shuffle) + return false; // Nothing to do! + if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements())) + return false; // AVX512 Writemask clash. 
+ Res = DAG.getBitcast(ShuffleVT, V1); + DCI.AddToWorklist(Res.getNode()); + Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res, + DAG.getConstant(PermuteImm, DL, MVT::i8)); + DCI.AddToWorklist(Res.getNode()); + DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res), + /*AddTo*/ true); + return true; + } } - if (matchPermuteVectorShuffle(VT, Mask, Subtarget, Shuffle, ShuffleVT, - PermuteImm)) { + if (matchBinaryVectorShuffle(MaskVT, Mask, FloatDomain, V1, V2, Subtarget, + Shuffle, ShuffleVT, UnaryShuffle)) { if (Depth == 1 && Root.getOpcode() == Shuffle) return false; // Nothing to do! - Res = DAG.getBitcast(ShuffleVT, Input); - DCI.AddToWorklist(Res.getNode()); - Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res, - DAG.getConstant(PermuteImm, DL, MVT::i8)); + if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements())) + return false; // AVX512 Writemask clash. + V1 = DAG.getBitcast(ShuffleVT, V1); + DCI.AddToWorklist(V1.getNode()); + V2 = DAG.getBitcast(ShuffleVT, V2); + DCI.AddToWorklist(V2.getNode()); + Res = DAG.getNode(Shuffle, DL, ShuffleVT, V1, V2); DCI.AddToWorklist(Res.getNode()); DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res), /*AddTo*/ true); return true; } - if (matchBinaryVectorShuffle(VT, Mask, Shuffle, ShuffleVT)) { + if (matchBinaryPermuteVectorShuffle(MaskVT, Mask, FloatDomain, V1, V2, DL, + DAG, Subtarget, Shuffle, ShuffleVT, + PermuteImm)) { if (Depth == 1 && Root.getOpcode() == Shuffle) return false; // Nothing to do! - Res = DAG.getBitcast(ShuffleVT, Input); - DCI.AddToWorklist(Res.getNode()); - Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res, Res); + if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements())) + return false; // AVX512 Writemask clash. + V1 = DAG.getBitcast(ShuffleVT, V1); + DCI.AddToWorklist(V1.getNode()); + V2 = DAG.getBitcast(ShuffleVT, V2); + DCI.AddToWorklist(V2.getNode()); + Res = DAG.getNode(Shuffle, DL, ShuffleVT, V1, V2, + DAG.getConstant(PermuteImm, DL, MVT::i8)); DCI.AddToWorklist(Res.getNode()); DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res), /*AddTo*/ true); return true; } - // Attempt to blend with zero. - if (NumMaskElts <= 8 && - ((Subtarget.hasSSE41() && VT.is128BitVector()) || - (Subtarget.hasAVX() && VT.is256BitVector()))) { - // Convert VT to a type compatible with X86ISD::BLENDI. - // TODO - add 16i16 support (requires lane duplication). - MVT ShuffleVT = MaskVT; - if (Subtarget.hasAVX2()) { - if (ShuffleVT == MVT::v4i64) - ShuffleVT = MVT::v8i32; - else if (ShuffleVT == MVT::v2i64) - ShuffleVT = MVT::v4i32; - } else { - if (ShuffleVT == MVT::v2i64 || ShuffleVT == MVT::v4i32) - ShuffleVT = MVT::v8i16; - else if (ShuffleVT == MVT::v4i64) - ShuffleVT = MVT::v4f64; - else if (ShuffleVT == MVT::v8i32) - ShuffleVT = MVT::v8f32; - } - - if (isSequentialOrUndefOrZeroInRange(Mask, /*Pos*/ 0, /*Size*/ NumMaskElts, - /*Low*/ 0) && - NumMaskElts <= ShuffleVT.getVectorNumElements()) { - unsigned BlendMask = 0; - unsigned ShuffleSize = ShuffleVT.getVectorNumElements(); - unsigned MaskRatio = ShuffleSize / NumMaskElts; - - if (Depth == 1 && Root.getOpcode() == X86ISD::BLENDI) - return false; - - for (unsigned i = 0; i != ShuffleSize; ++i) - if (Mask[i / MaskRatio] < 0) - BlendMask |= 1u << i; + // Don't try to re-form single instruction chains under any circumstances now + // that we've done encoding canonicalization for them. 
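The shuffle matchers used above repeatedly compare the combined mask against a fixed pattern while treating undef entries as wildcards (via the patch's isTargetShuffleEquivalent). A standalone sketch of that comparison, simplified to plain integer masks:

#include <cassert>
#include <vector>

constexpr int SM_SentinelUndef = -1;

// Return true if Mask matches Expected, treating undef entries in Mask as
// "don't care". This mirrors the spirit of isTargetShuffleEquivalent in the
// patch, which also has extra sentinel handling not reproduced here.
bool shuffleEquivalent(const std::vector<int> &Mask,
                       const std::vector<int> &Expected) {
  if (Mask.size() != Expected.size())
    return false;
  for (size_t i = 0; i != Mask.size(); ++i)
    if (Mask[i] != SM_SentinelUndef && Mask[i] != Expected[i])
      return false;
  return true;
}

int main() {
  // <1, undef, 3, 3> still matches the MOVSHDUP pattern <1, 1, 3, 3>.
  assert(shuffleEquivalent({1, SM_SentinelUndef, 3, 3}, {1, 1, 3, 3}));
  assert(!shuffleEquivalent({0, 1, 2, 3}, {1, 1, 3, 3}));
}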
+ if (Depth < 2) + return false; - SDValue Zero = getZeroVector(ShuffleVT, Subtarget, DAG, DL); - Res = DAG.getBitcast(ShuffleVT, Input); + bool MaskContainsZeros = + any_of(Mask, [](int M) { return M == SM_SentinelZero; }); + + if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) { + // If we have a single input lane-crossing shuffle then lower to VPERMV. + if (UnaryShuffle && (Depth >= 3 || HasVariableMask) && !MaskContainsZeros && + ((Subtarget.hasAVX2() && + (MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) || + (Subtarget.hasAVX512() && + (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 || + MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) || + (Subtarget.hasBWI() && MaskVT == MVT::v32i16) || + (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) || + (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) || + (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) { + MVT VPermMaskSVT = MVT::getIntegerVT(MaskEltSizeInBits); + MVT VPermMaskVT = MVT::getVectorVT(VPermMaskSVT, NumMaskElts); + SDValue VPermMask = getConstVector(Mask, VPermMaskVT, DAG, DL, true); + DCI.AddToWorklist(VPermMask.getNode()); + Res = DAG.getBitcast(MaskVT, V1); DCI.AddToWorklist(Res.getNode()); - Res = DAG.getNode(X86ISD::BLENDI, DL, ShuffleVT, Res, Zero, - DAG.getConstant(BlendMask, DL, MVT::i8)); + Res = DAG.getNode(X86ISD::VPERMV, DL, MaskVT, VPermMask, Res); DCI.AddToWorklist(Res.getNode()); DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res), /*AddTo*/ true); return true; } - } - // Attempt to combine to INSERTPS. - if (Subtarget.hasSSE41() && NumMaskElts == 4 && - (VT == MVT::v2f64 || VT == MVT::v4f32)) { - SmallBitVector Zeroable(4, false); - for (unsigned i = 0; i != NumMaskElts; ++i) - if (Mask[i] < 0) - Zeroable[i] = true; + // Lower a unary+zero lane-crossing shuffle as VPERMV3 with a zero + // vector as the second source. + if (UnaryShuffle && (Depth >= 3 || HasVariableMask) && + ((Subtarget.hasAVX512() && + (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 || + MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) || + (Subtarget.hasVLX() && + (MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 || + MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) || + (Subtarget.hasBWI() && MaskVT == MVT::v32i16) || + (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) || + (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) || + (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) { + // Adjust shuffle mask - replace SM_SentinelZero with second source index. + for (unsigned i = 0; i != NumMaskElts; ++i) + if (Mask[i] == SM_SentinelZero) + Mask[i] = NumMaskElts + i; + + MVT VPermMaskSVT = MVT::getIntegerVT(MaskEltSizeInBits); + MVT VPermMaskVT = MVT::getVectorVT(VPermMaskSVT, NumMaskElts); + SDValue VPermMask = getConstVector(Mask, VPermMaskVT, DAG, DL, true); + DCI.AddToWorklist(VPermMask.getNode()); + Res = DAG.getBitcast(MaskVT, V1); + DCI.AddToWorklist(Res.getNode()); + SDValue Zero = getZeroVector(MaskVT, Subtarget, DAG, DL); + DCI.AddToWorklist(Zero.getNode()); + Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, Res, VPermMask, Zero); + DCI.AddToWorklist(Res.getNode()); + DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res), + /*AddTo*/ true); + return true; + } - unsigned InsertPSMask; - SDValue V1 = Input, V2 = Input; - if (Zeroable.any() && matchVectorShuffleAsInsertPS(V1, V2, InsertPSMask, - Zeroable, Mask, DAG)) { - if (Depth == 1 && Root.getOpcode() == X86ISD::INSERTPS) - return false; // Nothing to do! 
- V1 = DAG.getBitcast(MVT::v4f32, V1); + // If we have a dual input lane-crossing shuffle then lower to VPERMV3. + if ((Depth >= 3 || HasVariableMask) && !MaskContainsZeros && + ((Subtarget.hasAVX512() && + (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 || + MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) || + (Subtarget.hasVLX() && + (MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 || + MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) || + (Subtarget.hasBWI() && MaskVT == MVT::v32i16) || + (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) || + (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) || + (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) { + MVT VPermMaskSVT = MVT::getIntegerVT(MaskEltSizeInBits); + MVT VPermMaskVT = MVT::getVectorVT(VPermMaskSVT, NumMaskElts); + SDValue VPermMask = getConstVector(Mask, VPermMaskVT, DAG, DL, true); + DCI.AddToWorklist(VPermMask.getNode()); + V1 = DAG.getBitcast(MaskVT, V1); DCI.AddToWorklist(V1.getNode()); - V2 = DAG.getBitcast(MVT::v4f32, V2); + V2 = DAG.getBitcast(MaskVT, V2); DCI.AddToWorklist(V2.getNode()); - Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2, - DAG.getConstant(InsertPSMask, DL, MVT::i8)); + Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, V1, VPermMask, V2); DCI.AddToWorklist(Res.getNode()); DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res), /*AddTo*/ true); return true; } - } - - // Don't try to re-form single instruction chains under any circumstances now - // that we've done encoding canonicalization for them. - if (Depth < 2) - return false; - - if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) return false; + } - bool MaskContainsZeros = - llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; }); + // See if we can combine a single input shuffle with zeros to a bit-mask, + // which is much simpler than any shuffle. + if (UnaryShuffle && MaskContainsZeros && (Depth >= 3 || HasVariableMask) && + isSequentialOrUndefOrZeroInRange(Mask, 0, NumMaskElts, 0) && + DAG.getTargetLoweringInfo().isTypeLegal(MaskVT)) { + APInt Zero = APInt::getNullValue(MaskEltSizeInBits); + APInt AllOnes = APInt::getAllOnesValue(MaskEltSizeInBits); + SmallBitVector UndefElts(NumMaskElts, false); + SmallVector<APInt, 64> EltBits(NumMaskElts, Zero); + for (unsigned i = 0; i != NumMaskElts; ++i) { + int M = Mask[i]; + if (M == SM_SentinelUndef) { + UndefElts[i] = true; + continue; + } + if (M == SM_SentinelZero) + continue; + EltBits[i] = AllOnes; + } + SDValue BitMask = getConstVector(EltBits, UndefElts, MaskVT, DAG, DL); + DCI.AddToWorklist(BitMask.getNode()); + Res = DAG.getBitcast(MaskVT, V1); + DCI.AddToWorklist(Res.getNode()); + unsigned AndOpcode = + FloatDomain ? unsigned(X86ISD::FAND) : unsigned(ISD::AND); + Res = DAG.getNode(AndOpcode, DL, MaskVT, Res, BitMask); + DCI.AddToWorklist(Res.getNode()); + DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res), + /*AddTo*/ true); + return true; + } // If we have a single input shuffle with different shuffle patterns in the // the 128-bit lanes use the variable mask to VPERMILPS. // TODO Combine other mask types at higher depths. 
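The bit-mask combine a few lines above rewrites an identity-with-zeros shuffle as an AND with a constant vector. A standalone sketch of how that constant is derived from the mask, using 32-bit elements for illustration (the patch builds APInts of the actual mask element width, and only applies this when the non-zero lanes form an identity):

#include <cstdint>
#include <cstdio>
#include <vector>

constexpr int SM_SentinelUndef = -1;
constexpr int SM_SentinelZero = -2;

// For a shuffle that keeps element i in place or zeroes it, build the
// per-element AND mask: all-ones where the input element survives, zero where
// the shuffle writes zero. Undef lanes may be anything; all-ones is used here.
std::vector<uint32_t> buildAndMask(const std::vector<int> &Mask) {
  std::vector<uint32_t> Bits(Mask.size(), 0);
  for (size_t i = 0; i != Mask.size(); ++i) {
    if (Mask[i] == SM_SentinelZero)
      Bits[i] = 0;           // lane becomes zero
    else
      Bits[i] = 0xFFFFFFFFu; // kept lane (Mask[i] == i) or undef
  }
  return Bits;
}

int main() {
  // <0, zero, 2, undef>: AND with <-1, 0, -1, -1> produces the same result.
  for (uint32_t B : buildAndMask({0, SM_SentinelZero, 2, SM_SentinelUndef}))
    printf("0x%08x ", (unsigned)B);
  printf("\n");
}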
- if (HasVariableMask && !MaskContainsZeros && + if (UnaryShuffle && HasVariableMask && !MaskContainsZeros && ((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) || (MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) { SmallVector<SDValue, 16> VPermIdx; @@ -25283,7 +27079,7 @@ static bool combineX86ShuffleChain(SDValue Input, SDValue Root, MVT VPermMaskVT = MVT::getVectorVT(MVT::i32, NumMaskElts); SDValue VPermMask = DAG.getBuildVector(VPermMaskVT, DL, VPermIdx); DCI.AddToWorklist(VPermMask.getNode()); - Res = DAG.getBitcast(MaskVT, Input); + Res = DAG.getBitcast(MaskVT, V1); DCI.AddToWorklist(Res.getNode()); Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask); DCI.AddToWorklist(Res.getNode()); @@ -25292,17 +27088,60 @@ static bool combineX86ShuffleChain(SDValue Input, SDValue Root, return true; } + // With XOP, binary shuffles of 128/256-bit floating point vectors can combine + // to VPERMIL2PD/VPERMIL2PS. + if ((Depth >= 3 || HasVariableMask) && Subtarget.hasXOP() && + (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v4f32 || + MaskVT == MVT::v8f32)) { + // VPERMIL2 Operation. + // Bits[3] - Match Bit. + // Bits[2:1] - (Per Lane) PD Shuffle Mask. + // Bits[2:0] - (Per Lane) PS Shuffle Mask. + unsigned NumLanes = MaskVT.getSizeInBits() / 128; + unsigned NumEltsPerLane = NumMaskElts / NumLanes; + SmallVector<int, 8> VPerm2Idx; + MVT MaskIdxSVT = MVT::getIntegerVT(MaskVT.getScalarSizeInBits()); + MVT MaskIdxVT = MVT::getVectorVT(MaskIdxSVT, NumMaskElts); + unsigned M2ZImm = 0; + for (int M : Mask) { + if (M == SM_SentinelUndef) { + VPerm2Idx.push_back(-1); + continue; + } + if (M == SM_SentinelZero) { + M2ZImm = 2; + VPerm2Idx.push_back(8); + continue; + } + int Index = (M % NumEltsPerLane) + ((M / NumMaskElts) * NumEltsPerLane); + Index = (MaskVT.getScalarSizeInBits() == 64 ? Index << 1 : Index); + VPerm2Idx.push_back(Index); + } + V1 = DAG.getBitcast(MaskVT, V1); + DCI.AddToWorklist(V1.getNode()); + V2 = DAG.getBitcast(MaskVT, V2); + DCI.AddToWorklist(V2.getNode()); + SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, MaskIdxVT, DAG, DL, true); + DCI.AddToWorklist(VPerm2MaskOp.getNode()); + Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp, + DAG.getConstant(M2ZImm, DL, MVT::i8)); + DCI.AddToWorklist(Res.getNode()); + DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res), + /*AddTo*/ true); + return true; + } + // If we have 3 or more shuffle instructions or a chain involving a variable // mask, we can replace them with a single PSHUFB instruction profitably. // Intel's manuals suggest only using PSHUFB if doing so replacing 5 // instructions, but in practice PSHUFB tends to be *very* fast so we're // more aggressive. 
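A standalone sketch of the per-element selector computation in the VPERMIL2PD/VPERMIL2PS hunk above: each mask entry becomes a lane-local index plus a source-select offset, zeroable lanes get selector 8 together with the M2Z immediate value 2, and 64-bit selectors are shifted into bits [2:1]. The struct and function names are local to the sketch:

#include <cstdio>
#include <vector>

constexpr int SM_SentinelUndef = -1;
constexpr int SM_SentinelZero = -2;

struct VPermil2Encoding {
  std::vector<int> Selectors; // one selector per element, -1 for undef
  unsigned M2Z = 0;           // immediate controlling zeroing behaviour
};

VPermil2Encoding encodeVPermil2(const std::vector<int> &Mask,
                                int NumEltsPerLane, bool Is64Bit) {
  VPermil2Encoding Enc;
  int NumMaskElts = (int)Mask.size();
  for (int M : Mask) {
    if (M == SM_SentinelUndef) {
      Enc.Selectors.push_back(-1);
      continue;
    }
    if (M == SM_SentinelZero) {
      Enc.M2Z = 2;               // zero the lane via the match bit
      Enc.Selectors.push_back(8);
      continue;
    }
    // Lane-local element index plus a source selector (0 = V1, 1 = V2).
    int Index = (M % NumEltsPerLane) + ((M / NumMaskElts) * NumEltsPerLane);
    if (Is64Bit)
      Index <<= 1;               // PD selectors live in bits [2:1]
    Enc.Selectors.push_back(Index);
  }
  return Enc;
}

int main() {
  // v4f32-style mask <0, 5, zero, 3>: element 5 comes from the second source.
  VPermil2Encoding E = encodeVPermil2({0, 5, SM_SentinelZero, 3}, 4, false);
  for (int S : E.Selectors)
    printf("%d ", S);
  printf("(M2Z=%u)\n", E.M2Z); // prints: 0 5 8 3 (M2Z=2)
}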
- if ((Depth >= 3 || HasVariableMask) && - ((VT.is128BitVector() && Subtarget.hasSSSE3()) || - (VT.is256BitVector() && Subtarget.hasAVX2()) || - (VT.is512BitVector() && Subtarget.hasBWI()))) { + if (UnaryShuffle && (Depth >= 3 || HasVariableMask) && + ((RootVT.is128BitVector() && Subtarget.hasSSSE3()) || + (RootVT.is256BitVector() && Subtarget.hasAVX2()) || + (RootVT.is512BitVector() && Subtarget.hasBWI()))) { SmallVector<SDValue, 16> PSHUFBMask; - int NumBytes = VT.getSizeInBits() / 8; + int NumBytes = RootVT.getSizeInBits() / 8; int Ratio = NumBytes / NumMaskElts; for (int i = 0; i < NumBytes; ++i) { int M = Mask[i / Ratio]; @@ -25319,7 +27158,7 @@ static bool combineX86ShuffleChain(SDValue Input, SDValue Root, PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8)); } MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes); - Res = DAG.getBitcast(ByteVT, Input); + Res = DAG.getBitcast(ByteVT, V1); DCI.AddToWorklist(Res.getNode()); SDValue PSHUFBMaskOp = DAG.getBuildVector(ByteVT, DL, PSHUFBMask); DCI.AddToWorklist(PSHUFBMaskOp.getNode()); @@ -25330,10 +27169,135 @@ static bool combineX86ShuffleChain(SDValue Input, SDValue Root, return true; } + // With XOP, if we have a 128-bit binary input shuffle we can always combine + // to VPPERM. We match the depth requirement of PSHUFB - VPPERM is never + // slower than PSHUFB on targets that support both. + if ((Depth >= 3 || HasVariableMask) && RootVT.is128BitVector() && + Subtarget.hasXOP()) { + // VPPERM Mask Operation + // Bits[4:0] - Byte Index (0 - 31) + // Bits[7:5] - Permute Operation (0 - Source byte, 4 - ZERO) + SmallVector<SDValue, 16> VPPERMMask; + int NumBytes = 16; + int Ratio = NumBytes / NumMaskElts; + for (int i = 0; i < NumBytes; ++i) { + int M = Mask[i / Ratio]; + if (M == SM_SentinelUndef) { + VPPERMMask.push_back(DAG.getUNDEF(MVT::i8)); + continue; + } + if (M == SM_SentinelZero) { + VPPERMMask.push_back(DAG.getConstant(128, DL, MVT::i8)); + continue; + } + M = Ratio * M + i % Ratio; + VPPERMMask.push_back(DAG.getConstant(M, DL, MVT::i8)); + } + MVT ByteVT = MVT::v16i8; + V1 = DAG.getBitcast(ByteVT, V1); + DCI.AddToWorklist(V1.getNode()); + V2 = DAG.getBitcast(ByteVT, V2); + DCI.AddToWorklist(V2.getNode()); + SDValue VPPERMMaskOp = DAG.getBuildVector(ByteVT, DL, VPPERMMask); + DCI.AddToWorklist(VPPERMMaskOp.getNode()); + Res = DAG.getNode(X86ISD::VPPERM, DL, ByteVT, V1, V2, VPPERMMaskOp); + DCI.AddToWorklist(Res.getNode()); + DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res), + /*AddTo*/ true); + return true; + } + // Failed to find any combines. return false; } +// Attempt to constant fold all of the constant source ops. +// Returns true if the entire shuffle is folded to a constant. +// TODO: Extend this to merge multiple constant Ops and update the mask. +static bool combineX86ShufflesConstants(const SmallVectorImpl<SDValue> &Ops, + ArrayRef<int> Mask, SDValue Root, + bool HasVariableMask, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget &Subtarget) { + MVT VT = Root.getSimpleValueType(); + + unsigned SizeInBits = VT.getSizeInBits(); + unsigned NumMaskElts = Mask.size(); + unsigned MaskSizeInBits = SizeInBits / NumMaskElts; + unsigned NumOps = Ops.size(); + + // Extract constant bits from each source op. 
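The PSHUFB and VPPERM hunks above expand the element-level mask into a byte-level control vector in the same way; here is a standalone sketch of that expansion, encoding "write zero" as 0x80 (a byte with its top bit set zeroes the lane for PSHUFB, and 0x80 selects the ZERO operation for VPPERM) and undef as -1 rather than as undef nodes:

#include <cstdio>
#include <vector>

constexpr int SM_SentinelUndef = -1;
constexpr int SM_SentinelZero = -2;

// Expand an element mask to per-byte selectors for a byte shuffle. NumBytes is
// the vector width in bytes; each mask element covers NumBytes / Mask.size()
// consecutive bytes.
std::vector<int> expandToByteMask(const std::vector<int> &Mask, int NumBytes) {
  int Ratio = NumBytes / (int)Mask.size();
  std::vector<int> Bytes;
  for (int i = 0; i < NumBytes; ++i) {
    int M = Mask[i / Ratio];
    if (M == SM_SentinelUndef)
      Bytes.push_back(-1);                    // don't care
    else if (M == SM_SentinelZero)
      Bytes.push_back(0x80);                  // write zero
    else
      Bytes.push_back(Ratio * M + i % Ratio); // byte index of the source
  }
  return Bytes;
}

int main() {
  // v4i32-style mask <2, zero, 0, undef> on a 16-byte vector:
  for (int B : expandToByteMask({2, SM_SentinelZero, 0, SM_SentinelUndef}, 16))
    printf("%d ", B);
  printf("\n"); // 8 9 10 11 128 128 128 128 0 1 2 3 -1 -1 -1 -1
}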
+ bool OneUseConstantOp = false; + SmallVector<SmallBitVector, 4> UndefEltsOps(NumOps); + SmallVector<SmallVector<APInt, 8>, 4> RawBitsOps(NumOps); + for (unsigned i = 0; i != NumOps; ++i) { + SDValue SrcOp = Ops[i]; + OneUseConstantOp |= SrcOp.hasOneUse(); + if (!getTargetConstantBitsFromNode(SrcOp, MaskSizeInBits, UndefEltsOps[i], + RawBitsOps[i])) + return false; + } + + // Only fold if at least one of the constants is only used once or + // the combined shuffle has included a variable mask shuffle, this + // is to avoid constant pool bloat. + if (!OneUseConstantOp && !HasVariableMask) + return false; + + // Shuffle the constant bits according to the mask. + SmallBitVector UndefElts(NumMaskElts, false); + SmallBitVector ZeroElts(NumMaskElts, false); + SmallBitVector ConstantElts(NumMaskElts, false); + SmallVector<APInt, 8> ConstantBitData(NumMaskElts, + APInt::getNullValue(MaskSizeInBits)); + for (unsigned i = 0; i != NumMaskElts; ++i) { + int M = Mask[i]; + if (M == SM_SentinelUndef) { + UndefElts[i] = true; + continue; + } else if (M == SM_SentinelZero) { + ZeroElts[i] = true; + continue; + } + assert(0 <= M && M < (int)(NumMaskElts * NumOps)); + + unsigned SrcOpIdx = (unsigned)M / NumMaskElts; + unsigned SrcMaskIdx = (unsigned)M % NumMaskElts; + + auto &SrcUndefElts = UndefEltsOps[SrcOpIdx]; + if (SrcUndefElts[SrcMaskIdx]) { + UndefElts[i] = true; + continue; + } + + auto &SrcEltBits = RawBitsOps[SrcOpIdx]; + APInt &Bits = SrcEltBits[SrcMaskIdx]; + if (!Bits) { + ZeroElts[i] = true; + continue; + } + + ConstantElts[i] = true; + ConstantBitData[i] = Bits; + } + assert((UndefElts | ZeroElts | ConstantElts).count() == NumMaskElts); + + // Create the constant data. + MVT MaskSVT; + if (VT.isFloatingPoint() && (MaskSizeInBits == 32 || MaskSizeInBits == 64)) + MaskSVT = MVT::getFloatingPointVT(MaskSizeInBits); + else + MaskSVT = MVT::getIntegerVT(MaskSizeInBits); + + MVT MaskVT = MVT::getVectorVT(MaskSVT, NumMaskElts); + + SDLoc DL(Root); + SDValue CstOp = getConstVector(ConstantBitData, UndefElts, MaskVT, DAG, DL); + DCI.AddToWorklist(CstOp.getNode()); + DCI.CombineTo(Root.getNode(), DAG.getBitcast(VT, CstOp)); + return true; +} + /// \brief Fully generic combining of x86 shuffle instructions. /// /// This should be the last combine run over the x86 shuffle instructions. Once @@ -25350,7 +27314,7 @@ static bool combineX86ShuffleChain(SDValue Input, SDValue Root, /// instructions, and replace them with the slightly more expensive SSSE3 /// PSHUFB instruction if available. We do this as the last combining step /// to ensure we avoid using PSHUFB if we can implement the shuffle with -/// a suitable short sequence of other instructions. The PHUFB will either +/// a suitable short sequence of other instructions. The PSHUFB will either /// use a register or have to read from memory and so is slightly (but only /// slightly) more expensive than the other shuffle instructions. /// @@ -25363,7 +27327,8 @@ static bool combineX86ShuffleChain(SDValue Input, SDValue Root, /// would simplify under the threshold for PSHUFB formation because of /// combine-ordering. To fix this, we should do the redundant instruction /// combining in this recursive walk. 
-static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root, +static bool combineX86ShufflesRecursively(ArrayRef<SDValue> SrcOps, + int SrcOpIndex, SDValue Root, ArrayRef<int> RootMask, int Depth, bool HasVariableMask, SelectionDAG &DAG, @@ -25375,8 +27340,8 @@ static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root, return false; // Directly rip through bitcasts to find the underlying operand. - while (Op.getOpcode() == ISD::BITCAST && Op.getOperand(0).hasOneUse()) - Op = Op.getOperand(0); + SDValue Op = SrcOps[SrcOpIndex]; + Op = peekThroughOneUseBitcasts(Op); MVT VT = Op.getSimpleValueType(); if (!VT.isVector()) @@ -25393,8 +27358,27 @@ static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root, if (!resolveTargetShuffleInputs(Op, Input0, Input1, OpMask)) return false; - assert(VT.getVectorNumElements() == OpMask.size() && - "Different mask size from vector size!"); + // Add the inputs to the Ops list, avoiding duplicates. + SmallVector<SDValue, 8> Ops(SrcOps.begin(), SrcOps.end()); + + int InputIdx0 = -1, InputIdx1 = -1; + for (int i = 0, e = Ops.size(); i < e; ++i) { + SDValue BC = peekThroughBitcasts(Ops[i]); + if (Input0 && BC == peekThroughBitcasts(Input0)) + InputIdx0 = i; + if (Input1 && BC == peekThroughBitcasts(Input1)) + InputIdx1 = i; + } + + if (Input0 && InputIdx0 < 0) { + InputIdx0 = SrcOpIndex; + Ops[SrcOpIndex] = Input0; + } + if (Input1 && InputIdx1 < 0) { + InputIdx1 = Ops.size(); + Ops.push_back(Input1); + } + assert(((RootMask.size() > OpMask.size() && RootMask.size() % OpMask.size() == 0) || (OpMask.size() > RootMask.size() && @@ -25424,6 +27408,17 @@ static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root, } int RootMaskedIdx = RootMask[RootIdx] * RootRatio + i % RootRatio; + + // Just insert the scaled root mask value if it references an input other + // than the SrcOp we're currently inserting. + if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) || + (((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) { + Mask.push_back(RootMaskedIdx); + continue; + } + + RootMaskedIdx %= MaskWidth; + int OpIdx = RootMaskedIdx / OpRatio; if (OpMask[OpIdx] < 0) { // The incoming lanes are zero or undef, it doesn't matter which ones we @@ -25432,17 +27427,27 @@ static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root, continue; } - // Ok, we have non-zero lanes, map them through. - Mask.push_back(OpMask[OpIdx] * OpRatio + - RootMaskedIdx % OpRatio); + // Ok, we have non-zero lanes, map them through to one of the Op's inputs. + int OpMaskedIdx = OpMask[OpIdx] * OpRatio + RootMaskedIdx % OpRatio; + OpMaskedIdx %= MaskWidth; + + if (OpMask[OpIdx] < (int)OpMask.size()) { + assert(0 <= InputIdx0 && "Unknown target shuffle input"); + OpMaskedIdx += InputIdx0 * MaskWidth; + } else { + assert(0 <= InputIdx1 && "Unknown target shuffle input"); + OpMaskedIdx += InputIdx1 * MaskWidth; + } + + Mask.push_back(OpMaskedIdx); } // Handle the all undef/zero cases early. - if (llvm::all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; })) { + if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; })) { DCI.CombineTo(Root.getNode(), DAG.getUNDEF(Root.getValueType())); return true; } - if (llvm::all_of(Mask, [](int Idx) { return Idx < 0; })) { + if (all_of(Mask, [](int Idx) { return Idx < 0; })) { // TODO - should we handle the mixed zero/undef case as well? Just returning // a zero mask will lose information on undef elements possibly reducing // future combine possibilities. 
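The recursive combine above merges the root mask with the mask of the shuffle it just looked through by scaling both to a common width. A standalone sketch of that composition for the single-input case; the patch additionally tracks which source operand every index refers to, which is omitted here:

#include <algorithm>
#include <cstdio>
#include <vector>

// Negative entries stand for the undef/zero sentinels used by the patch.
std::vector<int> composeMasks(const std::vector<int> &RootMask,
                              const std::vector<int> &OpMask) {
  size_t MaskWidth = std::max(RootMask.size(), OpMask.size());
  size_t RootRatio = MaskWidth / RootMask.size();
  size_t OpRatio = MaskWidth / OpMask.size();
  std::vector<int> Mask;
  for (size_t i = 0; i != MaskWidth; ++i) {
    int RootM = RootMask[i / RootRatio];
    if (RootM < 0) {                 // undef/zero in the root mask wins
      Mask.push_back(RootM);
      continue;
    }
    size_t RootMaskedIdx = (size_t)RootM * RootRatio + i % RootRatio;
    int OpM = OpMask[RootMaskedIdx / OpRatio];
    if (OpM < 0) {                   // undef/zero coming from the inner shuffle
      Mask.push_back(OpM);
      continue;
    }
    Mask.push_back(int((size_t)OpM * OpRatio + RootMaskedIdx % OpRatio));
  }
  return Mask;
}

int main() {
  // A v4i32-style root mask composed with a v2i64-style inner element swap.
  for (int M : composeMasks({0, 1, 3, 2}, {1, 0}))
    printf("%d ", M);
  printf("\n"); // prints: 2 3 1 0
}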
@@ -25451,30 +27456,40 @@ static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root, return true; } - int MaskSize = Mask.size(); - bool UseInput0 = std::any_of(Mask.begin(), Mask.end(), - [MaskSize](int Idx) { return 0 <= Idx && Idx < MaskSize; }); - bool UseInput1 = std::any_of(Mask.begin(), Mask.end(), - [MaskSize](int Idx) { return MaskSize <= Idx; }); - - // At the moment we can only combine unary shuffle mask cases. - if (UseInput0 && UseInput1) - return false; - else if (UseInput1) { - std::swap(Input0, Input1); - ShuffleVectorSDNode::commuteMask(Mask); + // Remove unused shuffle source ops. + SmallVector<SDValue, 8> UsedOps; + for (int i = 0, e = Ops.size(); i < e; ++i) { + int lo = UsedOps.size() * MaskWidth; + int hi = lo + MaskWidth; + if (any_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) { + UsedOps.push_back(Ops[i]); + continue; + } + for (int &M : Mask) + if (lo <= M) + M -= MaskWidth; } - - assert(Input0 && "Shuffle with no inputs detected"); + assert(!UsedOps.empty() && "Shuffle with no inputs detected"); + Ops = UsedOps; HasVariableMask |= isTargetShuffleVariableMask(Op.getOpcode()); - // See if we can recurse into Input0 (if it's a target shuffle). - if (Op->isOnlyUserOf(Input0.getNode()) && - combineX86ShufflesRecursively(Input0, Root, Mask, Depth + 1, - HasVariableMask, DAG, DCI, Subtarget)) + // See if we can recurse into each shuffle source op (if it's a target shuffle). + for (int i = 0, e = Ops.size(); i < e; ++i) + if (Ops[i].getNode()->hasOneUse() || Op->isOnlyUserOf(Ops[i].getNode())) + if (combineX86ShufflesRecursively(Ops, i, Root, Mask, Depth + 1, + HasVariableMask, DAG, DCI, Subtarget)) + return true; + + // Attempt to constant fold all of the constant source ops. + if (combineX86ShufflesConstants(Ops, Mask, Root, HasVariableMask, DAG, DCI, + Subtarget)) return true; + // We can only combine unary and binary shuffle mask cases. + if (Ops.size() > 2) + return false; + // Minor canonicalization of the accumulated shuffle mask to make it easier // to match below. All this does is detect masks with sequential pairs of // elements, and shrink them to the half-width mask. It does this in a loop @@ -25485,7 +27500,14 @@ static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root, Mask = std::move(WidenedMask); } - return combineX86ShuffleChain(Input0, Root, Mask, Depth, HasVariableMask, DAG, + // Canonicalization of binary shuffle masks to improve pattern matching by + // commuting the inputs. + if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) { + ShuffleVectorSDNode::commuteMask(Mask); + std::swap(Ops[0], Ops[1]); + } + + return combineX86ShuffleChain(Ops, Root, Mask, Depth, HasVariableMask, DAG, DCI, Subtarget); } @@ -25612,7 +27634,7 @@ combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask, Chain.push_back(V); - // Fallthrough! 
+ LLVM_FALLTHROUGH; case ISD::BITCAST: V = V.getOperand(0); continue; @@ -25742,7 +27764,8 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG, MVT VT = N.getSimpleValueType(); SmallVector<int, 4> Mask; - switch (N.getOpcode()) { + unsigned Opcode = N.getOpcode(); + switch (Opcode) { case X86ISD::PSHUFD: case X86ISD::PSHUFLW: case X86ISD::PSHUFHW: @@ -25750,6 +27773,17 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG, assert(Mask.size() == 4); break; case X86ISD::UNPCKL: { + auto Op0 = N.getOperand(0); + auto Op1 = N.getOperand(1); + unsigned Opcode0 = Op0.getOpcode(); + unsigned Opcode1 = Op1.getOpcode(); + + // Combine X86ISD::UNPCKL with 2 X86ISD::FHADD inputs into a single + // X86ISD::FHADD. This is generated by UINT_TO_FP v2f64 scalarization. + // TODO: Add other horizontal operations as required. + if (VT == MVT::v2f64 && Opcode0 == Opcode1 && Opcode0 == X86ISD::FHADD) + return DAG.getNode(Opcode0, DL, VT, Op0.getOperand(0), Op1.getOperand(0)); + // Combine X86ISD::UNPCKL and ISD::VECTOR_SHUFFLE into X86ISD::UNPCKH, in // which X86ISD::UNPCKL has a ISD::UNDEF operand, and ISD::VECTOR_SHUFFLE // moves upper half elements into the lower half part. For example: @@ -25767,9 +27801,7 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG, if (!VT.is128BitVector()) return SDValue(); - auto Op0 = N.getOperand(0); - auto Op1 = N.getOperand(1); - if (Op0.isUndef() && Op1.getNode()->getOpcode() == ISD::VECTOR_SHUFFLE) { + if (Op0.isUndef() && Opcode1 == ISD::VECTOR_SHUFFLE) { ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op1.getNode())->getMask(); unsigned NumElts = VT.getVectorNumElements(); @@ -25806,44 +27838,31 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG, return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V0, NewMask); } - // Attempt to merge blend(insertps(x,y),zero). - if (V0.getOpcode() == X86ISD::INSERTPS || - V1.getOpcode() == X86ISD::INSERTPS) { - assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32"); - - // Determine which elements are known to be zero. - SmallVector<int, 8> TargetMask; - SmallVector<SDValue, 2> BlendOps; - if (!setTargetShuffleZeroElements(N, TargetMask, BlendOps)) - return SDValue(); - - // Helper function to take inner insertps node and attempt to - // merge the blend with zero into its zero mask. - auto MergeInsertPSAndBlend = [&](SDValue V, int Offset) { - if (V.getOpcode() != X86ISD::INSERTPS) - return SDValue(); - SDValue Op0 = V.getOperand(0); - SDValue Op1 = V.getOperand(1); - SDValue Op2 = V.getOperand(2); - unsigned InsertPSMask = cast<ConstantSDNode>(Op2)->getZExtValue(); - - // Check each element of the blend node's target mask - must either - // be zeroable (and update the zero mask) or selects the element from - // the inner insertps node. 
- for (int i = 0; i != 4; ++i) - if (TargetMask[i] < 0) - InsertPSMask |= (1u << i); - else if (TargetMask[i] != (i + Offset)) - return SDValue(); - return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, Op0, Op1, - DAG.getConstant(InsertPSMask, DL, MVT::i8)); - }; - - if (SDValue V = MergeInsertPSAndBlend(V0, 0)) - return V; - if (SDValue V = MergeInsertPSAndBlend(V1, 4)) - return V; + return SDValue(); + } + case X86ISD::MOVSD: + case X86ISD::MOVSS: { + bool isFloat = VT.isFloatingPoint(); + SDValue V0 = peekThroughBitcasts(N->getOperand(0)); + SDValue V1 = peekThroughBitcasts(N->getOperand(1)); + bool isFloat0 = V0.getSimpleValueType().isFloatingPoint(); + bool isFloat1 = V1.getSimpleValueType().isFloatingPoint(); + bool isZero0 = ISD::isBuildVectorAllZeros(V0.getNode()); + bool isZero1 = ISD::isBuildVectorAllZeros(V1.getNode()); + assert(!(isZero0 && isZero1) && "Zeroable shuffle detected."); + + // We often lower to MOVSD/MOVSS from integer as well as native float + // types; remove unnecessary domain-crossing bitcasts if we can to make it + // easier to combine shuffles later on. We've already accounted for the + // domain switching cost when we decided to lower with it. + if ((isFloat != isFloat0 || isZero0) && (isFloat != isFloat1 || isZero1)) { + MVT NewVT = isFloat ? (X86ISD::MOVSD == Opcode ? MVT::v2i64 : MVT::v4i32) + : (X86ISD::MOVSD == Opcode ? MVT::v2f64 : MVT::v4f32); + V0 = DAG.getBitcast(NewVT, V0); + V1 = DAG.getBitcast(NewVT, V1); + return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, NewVT, V0, V1)); } + return SDValue(); } case X86ISD::INSERTPS: { @@ -25976,9 +27995,7 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG, V.getOpcode() == X86ISD::PSHUFHW) && V.getOpcode() != N.getOpcode() && V.hasOneUse()) { - SDValue D = V.getOperand(0); - while (D.getOpcode() == ISD::BITCAST && D.hasOneUse()) - D = D.getOperand(0); + SDValue D = peekThroughOneUseBitcasts(V.getOperand(0)); if (D.getOpcode() == X86ISD::PSHUFD && D.hasOneUse()) { SmallVector<int, 4> VMask = getPSHUFShuffleMask(V); SmallVector<int, 4> DMask = getPSHUFShuffleMask(D); @@ -26017,31 +28034,32 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG, return SDValue(); } -/// \brief Try to combine a shuffle into a target-specific add-sub node. +/// Returns true iff the shuffle node \p N can be replaced with ADDSUB +/// operation. If true is returned then the operands of ADDSUB operation +/// are written to the parameters \p Opnd0 and \p Opnd1. /// -/// We combine this directly on the abstract vector shuffle nodes so it is -/// easier to generically match. We also insert dummy vector shuffle nodes for -/// the operands which explicitly discard the lanes which are unused by this -/// operation to try to flow through the rest of the combiner the fact that -/// they're unused. -static SDValue combineShuffleToAddSub(SDNode *N, const X86Subtarget &Subtarget, - SelectionDAG &DAG) { - SDLoc DL(N); +/// We combine shuffle to ADDSUB directly on the abstract vector shuffle nodes +/// so it is easier to generically match. We also insert dummy vector shuffle +/// nodes for the operands which explicitly discard the lanes which are unused +/// by this operation to try to flow through the rest of the combiner +/// the fact that they're unused. 
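Note on the MOVSD/MOVSS case above: the domain-crossing bitcasts can be pushed through because those nodes only move whole 32/64-bit lanes. A standalone scalar sketch of that equivalence for the MOVSS-style lane layout (operand order, helper arrays and test values are illustrative, not part of the patch):

// Illustrative sketch: a MOVSS-style lane move only shuffles 32-bit lanes, so
// performing it in the integer domain on the operands' bit patterns and
// bitcasting afterwards produces the same bits as performing it in the FP
// domain -- which is why the bitcasts can be pushed through.
#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  float A[4] = {1.0f, 2.0f, 3.0f, 4.0f};
  float B[4] = {9.0f, 8.0f, 7.0f, 6.0f};

  // FP-domain MOVSS: low lane from B, upper lanes from A.
  float MovFP[4] = {B[0], A[1], A[2], A[3]};

  // Integer-domain version of the same lane move on the raw bit patterns.
  uint32_t AI[4], BI[4], MovInt[4];
  std::memcpy(AI, A, sizeof(AI));
  std::memcpy(BI, B, sizeof(BI));
  MovInt[0] = BI[0];
  MovInt[1] = AI[1];
  MovInt[2] = AI[2];
  MovInt[3] = AI[3];

  // Bitcasting the integer result back gives the FP result bit-for-bit.
  assert(std::memcmp(MovInt, MovFP, sizeof(MovFP)) == 0);
  return 0;
}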
+static bool isAddSub(SDNode *N, const X86Subtarget &Subtarget, + SDValue &Opnd0, SDValue &Opnd1) { + EVT VT = N->getValueType(0); if ((!Subtarget.hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) && - (!Subtarget.hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64))) - return SDValue(); + (!Subtarget.hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)) && + (!Subtarget.hasAVX512() || (VT != MVT::v16f32 && VT != MVT::v8f64))) + return false; // We only handle target-independent shuffles. // FIXME: It would be easy and harmless to use the target shuffle mask // extraction tool to support more. if (N->getOpcode() != ISD::VECTOR_SHUFFLE) - return SDValue(); + return false; - auto *SVN = cast<ShuffleVectorSDNode>(N); - SmallVector<int, 8> Mask; - for (int M : SVN->getMask()) - Mask.push_back(M); + ArrayRef<int> OrigMask = cast<ShuffleVectorSDNode>(N)->getMask(); + SmallVector<int, 16> Mask(OrigMask.begin(), OrigMask.end()); SDValue V1 = N->getOperand(0); SDValue V2 = N->getOperand(1); @@ -26052,27 +28070,102 @@ static SDValue combineShuffleToAddSub(SDNode *N, const X86Subtarget &Subtarget, ShuffleVectorSDNode::commuteMask(Mask); std::swap(V1, V2); } else if (V1.getOpcode() != ISD::FSUB || V2.getOpcode() != ISD::FADD) - return SDValue(); + return false; // If there are other uses of these operations we can't fold them. if (!V1->hasOneUse() || !V2->hasOneUse()) - return SDValue(); + return false; // Ensure that both operations have the same operands. Note that we can // commute the FADD operands. SDValue LHS = V1->getOperand(0), RHS = V1->getOperand(1); if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) && (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS)) - return SDValue(); + return false; // We're looking for blends between FADD and FSUB nodes. We insist on these // nodes being lined up in a specific expected pattern. if (!(isShuffleEquivalent(V1, V2, Mask, {0, 3}) || isShuffleEquivalent(V1, V2, Mask, {0, 5, 2, 7}) || - isShuffleEquivalent(V1, V2, Mask, {0, 9, 2, 11, 4, 13, 6, 15}))) + isShuffleEquivalent(V1, V2, Mask, {0, 9, 2, 11, 4, 13, 6, 15}) || + isShuffleEquivalent(V1, V2, Mask, {0, 17, 2, 19, 4, 21, 6, 23, + 8, 25, 10, 27, 12, 29, 14, 31}))) + return false; + + Opnd0 = LHS; + Opnd1 = RHS; + return true; +} + +/// \brief Try to combine a shuffle into a target-specific add-sub or +/// mul-add-sub node. +static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { + SDValue Opnd0, Opnd1; + if (!isAddSub(N, Subtarget, Opnd0, Opnd1)) return SDValue(); - return DAG.getNode(X86ISD::ADDSUB, DL, VT, LHS, RHS); + EVT VT = N->getValueType(0); + SDLoc DL(N); + + // Try to generate X86ISD::FMADDSUB node here. + SDValue Opnd2; + if (isFMAddSub(Subtarget, DAG, Opnd0, Opnd1, Opnd2)) + return DAG.getNode(X86ISD::FMADDSUB, DL, VT, Opnd0, Opnd1, Opnd2); + + // Do not generate X86ISD::ADDSUB node for 512-bit types even though + // the ADDSUB idiom has been successfully recognized. There are no known + // X86 targets with 512-bit ADDSUB instructions! + if (VT.is512BitVector()) + return SDValue(); + + return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1); +} + +// We are looking for a shuffle where both sources are concatenated with undef +// and have a width that is half of the output's width. AVX2 has VPERMD/Q, so +// if we can express this as a single-source shuffle, that's preferable. 
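A standalone scalar sketch of the blend that isAddSub() accepts for v4f32 (mask {0, 5, 2, 7}, with V1 the FSUB and V2 the FADD after the commute above; the test values are illustrative only):

// Even lanes take FSUB(x, y), odd lanes take FADD(x, y) -- exactly the
// semantics of (V)ADDSUBPS.
#include <array>
#include <cassert>

int main() {
  std::array<float, 4> X = {1.0f, 2.0f, 3.0f, 4.0f};
  std::array<float, 4> Y = {0.5f, 0.5f, 0.5f, 0.5f};

  std::array<float, 4> Sub, Add, Blend, AddSub;
  for (int i = 0; i < 4; ++i) {
    Sub[i] = X[i] - Y[i];
    Add[i] = X[i] + Y[i];
  }

  // shuffle<0,5,2,7>(Sub, Add): lanes 0/2 from Sub, lanes 1/3 from Add.
  Blend = {Sub[0], Add[1], Sub[2], Add[3]};

  // ADDSUB semantics: subtract in even lanes, add in odd lanes.
  for (int i = 0; i < 4; ++i)
    AddSub[i] = (i % 2 == 0) ? X[i] - Y[i] : X[i] + Y[i];

  assert(Blend == AddSub);
  return 0;
}

When the recognized operands are themselves a product, combineShuffleToAddSubOrFMAddSub() above further tries isFMAddSub() to emit FMADDSUB, and it deliberately refuses to form a 512-bit ADDSUB node since no such instructions exist.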
+static SDValue combineShuffleOfConcatUndef(SDNode *N, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + if (!Subtarget.hasAVX2() || !isa<ShuffleVectorSDNode>(N)) + return SDValue(); + + EVT VT = N->getValueType(0); + + // We only care about shuffles of 128/256-bit vectors of 32/64-bit values. + if (!VT.is128BitVector() && !VT.is256BitVector()) + return SDValue(); + + if (VT.getVectorElementType() != MVT::i32 && + VT.getVectorElementType() != MVT::i64 && + VT.getVectorElementType() != MVT::f32 && + VT.getVectorElementType() != MVT::f64) + return SDValue(); + + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + + // Check that both sources are concats with undef. + if (N0.getOpcode() != ISD::CONCAT_VECTORS || + N1.getOpcode() != ISD::CONCAT_VECTORS || N0.getNumOperands() != 2 || + N1.getNumOperands() != 2 || !N0.getOperand(1).isUndef() || + !N1.getOperand(1).isUndef()) + return SDValue(); + + // Construct the new shuffle mask. Elements from the first source retain their + // index, but elements from the second source no longer need to skip an undef. + SmallVector<int, 8> Mask; + int NumElts = VT.getVectorNumElements(); + + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); + for (int Elt : SVOp->getMask()) + Mask.push_back(Elt < NumElts ? Elt : (Elt - NumElts / 2)); + + SDLoc DL(N); + SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N0.getOperand(0), + N1.getOperand(0)); + return DAG.getVectorShuffle(VT, DL, Concat, DAG.getUNDEF(VT), Mask); } static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG, @@ -26089,14 +28182,9 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG, // If we have legalized the vector types, look for blends of FADD and FSUB // nodes that we can fuse into an ADDSUB node. if (TLI.isTypeLegal(VT)) - if (SDValue AddSub = combineShuffleToAddSub(N, Subtarget, DAG)) + if (SDValue AddSub = combineShuffleToAddSubOrFMAddSub(N, Subtarget, DAG)) return AddSub; - // Combine 256-bit vector shuffles. This is only profitable when in AVX mode - if (TLI.isTypeLegal(VT) && Subtarget.hasFp256() && VT.is256BitVector() && - N->getOpcode() == ISD::VECTOR_SHUFFLE) - return combineShuffle256(N, DAG, DCI, Subtarget); - // During Type Legalization, when promoting illegal vector types, // the backend might introduce new shuffle dag nodes and bitcasts. // @@ -26127,13 +28215,18 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG, bool CanFold = false; switch (Opcode) { default : break; - case ISD::ADD : - case ISD::FADD : - case ISD::SUB : - case ISD::FSUB : - case ISD::MUL : - case ISD::FMUL : - CanFold = true; + case ISD::ADD: + case ISD::SUB: + case ISD::MUL: + // isOperationLegal lies for integer ops on floating point types. + CanFold = VT.isInteger(); + break; + case ISD::FADD: + case ISD::FSUB: + case ISD::FMUL: + // isOperationLegal lies for floating point ops on integer types. 
+ CanFold = VT.isFloatingPoint(); + break; } unsigned SVTNumElts = SVT.getVectorNumElements(); @@ -26162,9 +28255,18 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG, if (SDValue LD = EltsFromConsecutiveLoads(VT, Elts, dl, DAG, true)) return LD; + // For AVX2, we sometimes want to combine + // (vector_shuffle <mask> (concat_vectors t1, undef) + // (concat_vectors t2, undef)) + // Into: + // (vector_shuffle <mask> (concat_vectors t1, t2), undef) + // Since the latter can be efficiently lowered with VPERMD/VPERMQ + if (SDValue ShufConcat = combineShuffleOfConcatUndef(N, DAG, Subtarget)) + return ShufConcat; + if (isTargetShuffle(N->getOpcode())) { - if (SDValue Shuffle = - combineTargetShuffle(SDValue(N, 0), DAG, DCI, Subtarget)) + SDValue Op(N, 0); + if (SDValue Shuffle = combineTargetShuffle(Op, DAG, DCI, Subtarget)) return Shuffle; // Try recursively combining arbitrary sequences of x86 shuffle @@ -26174,8 +28276,8 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG, // a particular chain. SmallVector<int, 1> NonceMask; // Just a placeholder. NonceMask.push_back(0); - if (combineX86ShufflesRecursively(SDValue(N, 0), SDValue(N, 0), NonceMask, - /*Depth*/ 1, /*HasPSHUFB*/ false, DAG, + if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, + /*Depth*/ 1, /*HasVarMask*/ false, DAG, DCI, Subtarget)) return SDValue(); // This routine will use CombineTo to replace N. } @@ -26305,11 +28407,10 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG, } // Convert a bitcasted integer logic operation that has one bitcasted - // floating-point operand and one constant operand into a floating-point - // logic operation. This may create a load of the constant, but that is - // cheaper than materializing the constant in an integer register and - // transferring it to an SSE register or transferring the SSE operand to - // integer register and back. + // floating-point operand into a floating-point logic operation. This may + // create a load of a constant, but that is cheaper than materializing the + // constant in an integer register and transferring it to an SSE register or + // transferring the SSE operand to integer register and back. 
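A scalar sketch of why the bitcast(logic(...)) rewrite above is sound: integer AND/OR/XOR only manipulates bit patterns, so the operation can be performed in whichever domain avoids the extra register moves. The helpers and values below are illustrative only, using memcpy as the bitcast:

#include <cassert>
#include <cstdint>
#include <cstring>

static uint32_t bitsOf(float F) {
  uint32_t U;
  std::memcpy(&U, &F, sizeof(U));
  return U;
}

static float floatOf(uint32_t U) {
  float F;
  std::memcpy(&F, &U, sizeof(F));
  return F;
}

int main() {
  float X = -12.5f;            // floating-point operand
  uint32_t Y = 0x7fffffffu;    // integer operand (here: a fabs-style mask)

  // bitcast(and(bitcast(X), Y)) -- the integer-domain form.
  uint32_t IntForm = bitsOf(X) & Y;

  // "FAND(X, bitcast(Y))" modelled as the same bitwise op applied to the
  // operands' bit patterns -- the FP-domain form.
  uint32_t FpForm = bitsOf(X) & bitsOf(floatOf(Y));

  assert(IntForm == FpForm);
  assert(floatOf(IntForm) == 12.5f); // the mask cleared the sign bit
  return 0;
}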
unsigned FPOpcode; switch (N0.getOpcode()) { case ISD::AND: FPOpcode = X86ISD::FAND; break; @@ -26317,25 +28418,238 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG, case ISD::XOR: FPOpcode = X86ISD::FXOR; break; default: return SDValue(); } - if (((Subtarget.hasSSE1() && VT == MVT::f32) || - (Subtarget.hasSSE2() && VT == MVT::f64)) && - isa<ConstantSDNode>(N0.getOperand(1)) && - N0.getOperand(0).getOpcode() == ISD::BITCAST && - N0.getOperand(0).getOperand(0).getValueType() == VT) { - SDValue N000 = N0.getOperand(0).getOperand(0); - SDValue FPConst = DAG.getBitcast(VT, N0.getOperand(1)); - return DAG.getNode(FPOpcode, SDLoc(N0), VT, N000, FPConst); + + if (!((Subtarget.hasSSE1() && VT == MVT::f32) || + (Subtarget.hasSSE2() && VT == MVT::f64))) + return SDValue(); + + SDValue LogicOp0 = N0.getOperand(0); + SDValue LogicOp1 = N0.getOperand(1); + SDLoc DL0(N0); + + // bitcast(logic(bitcast(X), Y)) --> logic'(X, bitcast(Y)) + if (N0.hasOneUse() && LogicOp0.getOpcode() == ISD::BITCAST && + LogicOp0.hasOneUse() && LogicOp0.getOperand(0).getValueType() == VT && + !isa<ConstantSDNode>(LogicOp0.getOperand(0))) { + SDValue CastedOp1 = DAG.getBitcast(VT, LogicOp1); + return DAG.getNode(FPOpcode, DL0, VT, LogicOp0.getOperand(0), CastedOp1); + } + // bitcast(logic(X, bitcast(Y))) --> logic'(bitcast(X), Y) + if (N0.hasOneUse() && LogicOp1.getOpcode() == ISD::BITCAST && + LogicOp1.hasOneUse() && LogicOp1.getOperand(0).getValueType() == VT && + !isa<ConstantSDNode>(LogicOp1.getOperand(0))) { + SDValue CastedOp0 = DAG.getBitcast(VT, LogicOp0); + return DAG.getNode(FPOpcode, DL0, VT, LogicOp1.getOperand(0), CastedOp0); } return SDValue(); } +// Match a binop + shuffle pyramid that represents a horizontal reduction over +// the elements of a vector. +// Returns the vector that is being reduced on, or SDValue() if a reduction +// was not matched. +static SDValue matchBinOpReduction(SDNode *Extract, ISD::NodeType BinOp) { + // The pattern must end in an extract from index 0. + if ((Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT) || + !isNullConstant(Extract->getOperand(1))) + return SDValue(); + + unsigned Stages = + Log2_32(Extract->getOperand(0).getValueType().getVectorNumElements()); + + SDValue Op = Extract->getOperand(0); + // At each stage, we're looking for something that looks like: + // %s = shufflevector <8 x i32> %op, <8 x i32> undef, + // <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, + // i32 undef, i32 undef, i32 undef, i32 undef> + // %a = binop <8 x i32> %op, %s + // Where the mask changes according to the stage. E.g. for a 3-stage pyramid, + // we expect something like: + // <4,5,6,7,u,u,u,u> + // <2,3,u,u,u,u,u,u> + // <1,u,u,u,u,u,u,u> + for (unsigned i = 0; i < Stages; ++i) { + if (Op.getOpcode() != BinOp) + return SDValue(); + + ShuffleVectorSDNode *Shuffle = + dyn_cast<ShuffleVectorSDNode>(Op.getOperand(0).getNode()); + if (Shuffle) { + Op = Op.getOperand(1); + } else { + Shuffle = dyn_cast<ShuffleVectorSDNode>(Op.getOperand(1).getNode()); + Op = Op.getOperand(0); + } + + // The first operand of the shuffle should be the same as the other operand + // of the add. + if (!Shuffle || (Shuffle->getOperand(0) != Op)) + return SDValue(); + + // Verify the shuffle has the expected (at this stage of the pyramid) mask. 
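matchBinOpReduction() above walks that shuffle+add pyramid backwards from the final extract. A standalone model of the forward direction for an 8-lane add reduction (values are illustrative; undef lanes are modelled as zero):

#include <cassert>
#include <vector>

int main() {
  std::vector<int> V = {1, 2, 3, 4, 5, 6, 7, 8};
  const unsigned Stages = 3; // log2(8)

  std::vector<int> Op = V;
  for (unsigned S = 0; S < Stages; ++S) {
    unsigned Half = Op.size() >> (S + 1); // 4, then 2, then 1
    std::vector<int> Shuf(Op.size(), 0);  // "undef" lanes modelled as 0
    for (unsigned i = 0; i < Half; ++i)
      Shuf[i] = Op[Half + i];             // mask element i is Half + i
    for (unsigned i = 0; i < Op.size(); ++i)
      Op[i] = Op[i] + Shuf[i];            // the per-stage binop
  }

  assert(Op[0] == 36); // extracting lane 0 yields the full reduction
  return 0;
}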
+ for (int Index = 0, MaskEnd = 1 << i; Index < MaskEnd; ++Index) + if (Shuffle->getMaskElt(Index) != MaskEnd + Index) + return SDValue(); + } + + return Op; +} + +// Given a select, detect the following pattern: +// 1: %2 = zext <N x i8> %0 to <N x i32> +// 2: %3 = zext <N x i8> %1 to <N x i32> +// 3: %4 = sub nsw <N x i32> %2, %3 +// 4: %5 = icmp sgt <N x i32> %4, [0 x N] or [-1 x N] +// 5: %6 = sub nsw <N x i32> zeroinitializer, %4 +// 6: %7 = select <N x i1> %5, <N x i32> %4, <N x i32> %6 +// This is useful as it is the input into a SAD pattern. +static bool detectZextAbsDiff(const SDValue &Select, SDValue &Op0, + SDValue &Op1) { + // Check the condition of the select instruction is greater-than. + SDValue SetCC = Select->getOperand(0); + if (SetCC.getOpcode() != ISD::SETCC) + return false; + ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get(); + if (CC != ISD::SETGT) + return false; + + SDValue SelectOp1 = Select->getOperand(1); + SDValue SelectOp2 = Select->getOperand(2); + + // The second operand of the select should be the negation of the first + // operand, which is implemented as 0 - SelectOp1. + if (!(SelectOp2.getOpcode() == ISD::SUB && + ISD::isBuildVectorAllZeros(SelectOp2.getOperand(0).getNode()) && + SelectOp2.getOperand(1) == SelectOp1)) + return false; + + // The first operand of SetCC is the first operand of the select, which is the + // difference between the two input vectors. + if (SetCC.getOperand(0) != SelectOp1) + return false; + + // The second operand of the comparison can be either -1 or 0. + if (!(ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()) || + ISD::isBuildVectorAllOnes(SetCC.getOperand(1).getNode()))) + return false; + + // The first operand of the select is the difference between the two input + // vectors. + if (SelectOp1.getOpcode() != ISD::SUB) + return false; + + Op0 = SelectOp1.getOperand(0); + Op1 = SelectOp1.getOperand(1); + + // Check if the operands of the sub are zero-extended from vectors of i8. + if (Op0.getOpcode() != ISD::ZERO_EXTEND || + Op0.getOperand(0).getValueType().getVectorElementType() != MVT::i8 || + Op1.getOpcode() != ISD::ZERO_EXTEND || + Op1.getOperand(0).getValueType().getVectorElementType() != MVT::i8) + return false; + + return true; +} + +// Given two zexts of <k x i8> to <k x i32>, create a PSADBW of the inputs +// to these zexts. +static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0, + const SDValue &Zext1, const SDLoc &DL) { + + // Find the appropriate width for the PSADBW. + EVT InVT = Zext0.getOperand(0).getValueType(); + unsigned RegSize = std::max(128u, InVT.getSizeInBits()); + + // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we + // fill in the missing vector elements with 0. + unsigned NumConcat = RegSize / InVT.getSizeInBits(); + SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, InVT)); + Ops[0] = Zext0.getOperand(0); + MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8); + SDValue SadOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops); + Ops[0] = Zext1.getOperand(0); + SDValue SadOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops); + + // Actually build the SAD + MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64); + return DAG.getNode(X86ISD::PSADBW, DL, SadVT, SadOp0, SadOp1); +} + +static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + // PSADBW is only supported on SSE2 and up. 
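detectZextAbsDiff() and createPSADBW() above feed combineBasicSADPattern(). A scalar sketch of the select pattern being matched and of what a single 64-bit PSADBW lane accumulates (the test bytes are illustrative only):

#include <cassert>
#include <cstdint>

int main() {
  uint8_t A[8] = {10, 200, 3, 50, 0, 255, 128, 7};
  uint8_t B[8] = {20, 100, 3, 60, 1, 0, 127, 9};

  uint64_t SadLane = 0;
  for (int i = 0; i < 8; ++i) {
    int32_t Sub = int32_t(A[i]) - int32_t(B[i]);   // sub of the two zexts
    int32_t Abs = (Sub > -1) ? Sub : 0 - Sub;      // the matched select form
    assert(Abs == (Sub < 0 ? -Sub : Sub));         // i.e. |a - b|
    SadLane += uint64_t(Abs);                      // PSADBW accumulates 8 of these
  }

  assert(SadLane == 10 + 100 + 0 + 10 + 1 + 255 + 1 + 2);
  return 0;
}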
+ if (!Subtarget.hasSSE2()) + return SDValue(); + + // Verify the type we're extracting from is appropriate + // TODO: There's nothing special about i32, any integer type above i16 should + // work just as well. + EVT VT = Extract->getOperand(0).getValueType(); + if (!VT.isSimple() || !(VT.getVectorElementType() == MVT::i32)) + return SDValue(); + + unsigned RegSize = 128; + if (Subtarget.hasBWI()) + RegSize = 512; + else if (Subtarget.hasAVX2()) + RegSize = 256; + + // We only handle v16i32 for SSE2 / v32i32 for AVX2 / v64i32 for AVX512. + // TODO: We should be able to handle larger vectors by splitting them before + // feeding them into several SADs, and then reducing over those. + if (VT.getSizeInBits() / 4 > RegSize) + return SDValue(); + + // Match shuffle + add pyramid. + SDValue Root = matchBinOpReduction(Extract, ISD::ADD); + + // If there was a match, we want Root to be a select that is the root of an + // abs-diff pattern. + if (!Root || (Root.getOpcode() != ISD::VSELECT)) + return SDValue(); + + // Check whether we have an abs-diff pattern feeding into the select. + SDValue Zext0, Zext1; + if (!detectZextAbsDiff(Root, Zext0, Zext1)) + return SDValue(); + + // Create the SAD instruction + SDLoc DL(Extract); + SDValue SAD = createPSADBW(DAG, Zext0, Zext1, DL); + + // If the original vector was wider than 8 elements, sum over the results + // in the SAD vector. + unsigned Stages = Log2_32(VT.getVectorNumElements()); + MVT SadVT = SAD.getSimpleValueType(); + if (Stages > 3) { + unsigned SadElems = SadVT.getVectorNumElements(); + + for(unsigned i = Stages - 3; i > 0; --i) { + SmallVector<int, 16> Mask(SadElems, -1); + for(unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j) + Mask[j] = MaskEnd + j; + + SDValue Shuffle = + DAG.getVectorShuffle(SadVT, DL, SAD, DAG.getUNDEF(SadVT), Mask); + SAD = DAG.getNode(ISD::ADD, DL, SadVT, SAD, Shuffle); + } + } + + // Return the lowest i32. + MVT ResVT = MVT::getVectorVT(MVT::i32, SadVT.getSizeInBits() / 32); + SAD = DAG.getNode(ISD::BITCAST, DL, ResVT, SAD); + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, SAD, + Extract->getOperand(1)); +} + /// Detect vector gather/scatter index generation and convert it from being a /// bunch of shuffles and extracts into a somewhat faster sequence. /// For i686, the best sequence is apparently storing the value and loading /// scalars back, while for x64 we should use 64-bit extracts and shifts. static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI) { + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget &Subtarget) { if (SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI)) return NewOp; @@ -26347,7 +28661,7 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG, InputVector.getValueType() == MVT::v2i32 && isa<ConstantSDNode>(N->getOperand(1)) && N->getConstantOperandVal(1) == 0) { - SDValue MMXSrc = InputVector.getNode()->getOperand(0); + SDValue MMXSrc = InputVector.getOperand(0); // The bitcast source is a direct mmx result. if (MMXSrc.getValueType() == MVT::x86mmx) @@ -26366,6 +28680,13 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG, uint64_t Res = (InputValue >> ExtractedElt) & 1; return DAG.getConstant(Res, dl, MVT::i1); } + + // Check whether this extract is the root of a sum of absolute differences + // pattern. 
This has to be done here because we really want it to happen + // pre-legalization, + if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget)) + return SAD; + // Only operate on vectors of 4 elements, where the alternative shuffling // gets to be more expensive. if (InputVector.getValueType() != MVT::v4i32) @@ -26467,6 +28788,310 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG, return SDValue(); } +/// If a vector select has an operand that is -1 or 0, try to simplify the +/// select to a bitwise logic operation. +static SDValue +combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget &Subtarget) { + SDValue Cond = N->getOperand(0); + SDValue LHS = N->getOperand(1); + SDValue RHS = N->getOperand(2); + EVT VT = LHS.getValueType(); + EVT CondVT = Cond.getValueType(); + SDLoc DL(N); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + + if (N->getOpcode() != ISD::VSELECT) + return SDValue(); + + assert(CondVT.isVector() && "Vector select expects a vector selector!"); + + bool FValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode()); + // Check if the first operand is all zeros and Cond type is vXi1. + // This situation only applies to avx512. + if (FValIsAllZeros && Subtarget.hasAVX512() && Cond.hasOneUse() && + CondVT.getVectorElementType() == MVT::i1) { + //Invert the cond to not(cond) : xor(op,allones)=not(op) + SDValue CondNew = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, + DAG.getConstant(APInt::getAllOnesValue(CondVT.getScalarSizeInBits()), + DL, CondVT)); + //Vselect cond, op1, op2 = Vselect not(cond), op2, op1 + return DAG.getNode(ISD::VSELECT, DL, VT, CondNew, RHS, LHS); + } + + // To use the condition operand as a bitwise mask, it must have elements that + // are the same size as the select elements. Ie, the condition operand must + // have already been promoted from the IR select condition type <N x i1>. + // Don't check if the types themselves are equal because that excludes + // vector floating-point selects. + if (CondVT.getScalarSizeInBits() != VT.getScalarSizeInBits()) + return SDValue(); + + bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode()); + FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode()); + + // Try to invert the condition if true value is not all 1s and false value is + // not all 0s. + if (!TValIsAllOnes && !FValIsAllZeros && + // Check if the selector will be produced by CMPP*/PCMP*. + Cond.getOpcode() == ISD::SETCC && + // Check if SETCC has already been promoted. + TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT) == + CondVT) { + bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode()); + bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode()); + + if (TValIsAllZeros || FValIsAllOnes) { + SDValue CC = Cond.getOperand(2); + ISD::CondCode NewCC = + ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(), + Cond.getOperand(0).getValueType().isInteger()); + Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1), + NewCC); + std::swap(LHS, RHS); + TValIsAllOnes = FValIsAllOnes; + FValIsAllZeros = TValIsAllZeros; + } + } + + // vselect Cond, 111..., 000... 
-> Cond + if (TValIsAllOnes && FValIsAllZeros) + return DAG.getBitcast(VT, Cond); + + if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(CondVT)) + return SDValue(); + + // vselect Cond, 111..., X -> or Cond, X + if (TValIsAllOnes) { + SDValue CastRHS = DAG.getBitcast(CondVT, RHS); + SDValue Or = DAG.getNode(ISD::OR, DL, CondVT, Cond, CastRHS); + return DAG.getBitcast(VT, Or); + } + + // vselect Cond, X, 000... -> and Cond, X + if (FValIsAllZeros) { + SDValue CastLHS = DAG.getBitcast(CondVT, LHS); + SDValue And = DAG.getNode(ISD::AND, DL, CondVT, Cond, CastLHS); + return DAG.getBitcast(VT, And); + } + + return SDValue(); +} + +static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) { + SDValue Cond = N->getOperand(0); + SDValue LHS = N->getOperand(1); + SDValue RHS = N->getOperand(2); + SDLoc DL(N); + + auto *TrueC = dyn_cast<ConstantSDNode>(LHS); + auto *FalseC = dyn_cast<ConstantSDNode>(RHS); + if (!TrueC || !FalseC) + return SDValue(); + + // Don't do this for crazy integer types. + if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType())) + return SDValue(); + + // If this is efficiently invertible, canonicalize the LHSC/RHSC values + // so that TrueC (the true value) is larger than FalseC. + bool NeedsCondInvert = false; + if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) && + // Efficiently invertible. + (Cond.getOpcode() == ISD::SETCC || // setcc -> invertible. + (Cond.getOpcode() == ISD::XOR && // xor(X, C) -> invertible. + isa<ConstantSDNode>(Cond.getOperand(1))))) { + NeedsCondInvert = true; + std::swap(TrueC, FalseC); + } + + // Optimize C ? 8 : 0 -> zext(C) << 3. Likewise for any pow2/0. + if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) { + if (NeedsCondInvert) // Invert the condition if needed. + Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, + DAG.getConstant(1, DL, Cond.getValueType())); + + // Zero extend the condition if needed. + Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond); + + unsigned ShAmt = TrueC->getAPIntValue().logBase2(); + return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond, + DAG.getConstant(ShAmt, DL, MVT::i8)); + } + + // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst. + if (FalseC->getAPIntValue() + 1 == TrueC->getAPIntValue()) { + if (NeedsCondInvert) // Invert the condition if needed. + Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, + DAG.getConstant(1, DL, Cond.getValueType())); + + // Zero extend the condition if needed. + Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), Cond); + return DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, + SDValue(FalseC, 0)); + } + + // Optimize cases that will turn into an LEA instruction. This requires + // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9). 
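combineSelectOfTwoConstants() above turns a select between two constants into arithmetic on the zero-extended condition. A scalar sketch of the three rewrites, with Cond already reduced to a 0/1 value (constants chosen purely for illustration):

#include <cassert>
#include <cstdint>

static uint64_t selRef(bool Cond, uint64_t T, uint64_t F) { return Cond ? T : F; }

int main() {
  for (int C = 0; C <= 1; ++C) {
    bool Cond = C;
    uint64_t Z = Cond ? 1 : 0; // zext(Cond)

    // Cond ? 8 : 0  ->  zext(Cond) << 3
    assert(selRef(Cond, 8, 0) == (Z << 3));

    // Cond ? Cst+1 : Cst  ->  zext(Cond) + Cst
    const uint64_t Cst = 41;
    assert(selRef(Cond, Cst + 1, Cst) == Z + Cst);

    // Cond ? T : F with a "fast multiplier" difference (here 5, so an
    // lea base(cond, cond*4) can materialize it):
    //   ->  zext(Cond) * (T - F) + F
    const uint64_t T = 25, F = 20;
    assert(selRef(Cond, T, F) == Z * (T - F) + F);
  }
  return 0;
}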
+ if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) { + uint64_t Diff = TrueC->getZExtValue() - FalseC->getZExtValue(); + if (N->getValueType(0) == MVT::i32) + Diff = (unsigned)Diff; + + bool isFastMultiplier = false; + if (Diff < 10) { + switch ((unsigned char)Diff) { + default: + break; + case 1: // result = add base, cond + case 2: // result = lea base( , cond*2) + case 3: // result = lea base(cond, cond*2) + case 4: // result = lea base( , cond*4) + case 5: // result = lea base(cond, cond*4) + case 8: // result = lea base( , cond*8) + case 9: // result = lea base(cond, cond*8) + isFastMultiplier = true; + break; + } + } + + if (isFastMultiplier) { + APInt Diff = TrueC->getAPIntValue() - FalseC->getAPIntValue(); + if (NeedsCondInvert) // Invert the condition if needed. + Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, + DAG.getConstant(1, DL, Cond.getValueType())); + + // Zero extend the condition if needed. + Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), Cond); + // Scale the condition by the difference. + if (Diff != 1) + Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond, + DAG.getConstant(Diff, DL, Cond.getValueType())); + + // Add the base if non-zero. + if (FalseC->getAPIntValue() != 0) + Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, + SDValue(FalseC, 0)); + return Cond; + } + } + + return SDValue(); +} + +// If this is a bitcasted op that can be represented as another type, push the +// the bitcast to the inputs. This allows more opportunities for pattern +// matching masked instructions. This is called when we know that the operation +// is used as one of the inputs of a vselect. +static bool combineBitcastForMaskedOp(SDValue OrigOp, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI) { + // Make sure we have a bitcast. + if (OrigOp.getOpcode() != ISD::BITCAST) + return false; + + SDValue Op = OrigOp.getOperand(0); + + // If the operation is used by anything other than the bitcast, we shouldn't + // do this combine as that would replicate the operation. + if (!Op.hasOneUse()) + return false; + + MVT VT = OrigOp.getSimpleValueType(); + MVT EltVT = VT.getVectorElementType(); + SDLoc DL(Op.getNode()); + + auto BitcastAndCombineShuffle = [&](unsigned Opcode, SDValue Op0, SDValue Op1, + SDValue Op2) { + Op0 = DAG.getBitcast(VT, Op0); + DCI.AddToWorklist(Op0.getNode()); + Op1 = DAG.getBitcast(VT, Op1); + DCI.AddToWorklist(Op1.getNode()); + DCI.CombineTo(OrigOp.getNode(), + DAG.getNode(Opcode, DL, VT, Op0, Op1, Op2)); + return true; + }; + + unsigned Opcode = Op.getOpcode(); + switch (Opcode) { + case X86ISD::PALIGNR: + // PALIGNR can be converted to VALIGND/Q for 128-bit vectors. + if (!VT.is128BitVector()) + return false; + Opcode = X86ISD::VALIGN; + LLVM_FALLTHROUGH; + case X86ISD::VALIGN: { + if (EltVT != MVT::i32 && EltVT != MVT::i64) + return false; + uint64_t Imm = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue(); + MVT OpEltVT = Op.getSimpleValueType().getVectorElementType(); + unsigned ShiftAmt = Imm * OpEltVT.getSizeInBits(); + unsigned EltSize = EltVT.getSizeInBits(); + // Make sure we can represent the same shift with the new VT. + if ((ShiftAmt % EltSize) != 0) + return false; + Imm = ShiftAmt / EltSize; + return BitcastAndCombineShuffle(Opcode, Op.getOperand(0), Op.getOperand(1), + DAG.getConstant(Imm, DL, MVT::i8)); + } + case X86ISD::SHUF128: { + if (EltVT.getSizeInBits() != 32 && EltVT.getSizeInBits() != 64) + return false; + // Only change element size, not type. 
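For the PALIGNR -> VALIGN case above, the immediate is rescaled from byte units to the new element size, and the rewrite is rejected when the byte shift is not a multiple of that size. A rough standalone model of why the rescaled shift selects the same bytes (operand ordering of the real instruction is glossed over; data and shift amount are illustrative):

#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  uint8_t Lo[16], Hi[16];
  for (int i = 0; i < 16; ++i) { Lo[i] = uint8_t(i); Hi[i] = uint8_t(100 + i); }

  const int ByteShift = 8;          // PALIGNR-style immediate, in bytes
  assert(ByteShift % 4 == 0);       // representable with 32-bit elements

  // Byte-granular align: take 16 bytes starting at offset ByteShift of Hi:Lo.
  uint8_t Concat[32], ByteRes[16];
  std::memcpy(Concat, Lo, 16);
  std::memcpy(Concat + 16, Hi, 16);
  std::memcpy(ByteRes, Concat + ByteShift, 16);

  // Element-granular align on the same data viewed as i32 lanes.
  uint32_t Concat32[8], EltRes[4];
  std::memcpy(Concat32, Concat, 32);
  std::memcpy(EltRes, Concat32 + ByteShift / 4, 16);

  assert(std::memcmp(ByteRes, EltRes, 16) == 0);
  return 0;
}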
+ if (VT.isInteger() != Op.getSimpleValueType().isInteger()) + return false; + return BitcastAndCombineShuffle(Opcode, Op.getOperand(0), Op.getOperand(1), + Op.getOperand(2)); + } + case ISD::INSERT_SUBVECTOR: { + unsigned EltSize = EltVT.getSizeInBits(); + if (EltSize != 32 && EltSize != 64) + return false; + MVT OpEltVT = Op.getSimpleValueType().getVectorElementType(); + // Only change element size, not type. + if (VT.isInteger() != OpEltVT.isInteger()) + return false; + uint64_t Imm = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue(); + Imm = (Imm * OpEltVT.getSizeInBits()) / EltSize; + SDValue Op0 = DAG.getBitcast(VT, Op.getOperand(0)); + DCI.AddToWorklist(Op0.getNode()); + // Op1 needs to be bitcasted to a smaller vector with the same element type. + SDValue Op1 = Op.getOperand(1); + MVT Op1VT = MVT::getVectorVT(EltVT, + Op1.getSimpleValueType().getSizeInBits() / EltSize); + Op1 = DAG.getBitcast(Op1VT, Op1); + DCI.AddToWorklist(Op1.getNode()); + DCI.CombineTo(OrigOp.getNode(), + DAG.getNode(Opcode, DL, VT, Op0, Op1, + DAG.getConstant(Imm, DL, MVT::i8))); + return true; + } + case ISD::EXTRACT_SUBVECTOR: { + unsigned EltSize = EltVT.getSizeInBits(); + if (EltSize != 32 && EltSize != 64) + return false; + MVT OpEltVT = Op.getSimpleValueType().getVectorElementType(); + // Only change element size, not type. + if (VT.isInteger() != OpEltVT.isInteger()) + return false; + uint64_t Imm = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); + Imm = (Imm * OpEltVT.getSizeInBits()) / EltSize; + // Op0 needs to be bitcasted to a larger vector with the same element type. + SDValue Op0 = Op.getOperand(0); + MVT Op0VT = MVT::getVectorVT(EltVT, + Op0.getSimpleValueType().getSizeInBits() / EltSize); + Op0 = DAG.getBitcast(Op0VT, Op0); + DCI.AddToWorklist(Op0.getNode()); + DCI.CombineTo(OrigOp.getNode(), + DAG.getNode(Opcode, DL, VT, Op0, + DAG.getConstant(Imm, DL, MVT::i8))); + return true; + } + } + + return false; +} + /// Do target-specific dag combines on SELECT and VSELECT nodes. static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, @@ -26477,6 +29102,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, SDValue LHS = N->getOperand(1); SDValue RHS = N->getOperand(2); EVT VT = LHS.getValueType(); + EVT CondVT = Cond.getValueType(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); // If we have SSE[12] support, try to form min/max nodes. SSE min/max @@ -26625,117 +29251,24 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS); } - EVT CondVT = Cond.getValueType(); - if (Subtarget.hasAVX512() && VT.isVector() && CondVT.isVector() && - CondVT.getVectorElementType() == MVT::i1) { - // v16i8 (select v16i1, v16i8, v16i8) does not have a proper - // lowering on KNL. In this case we convert it to - // v16i8 (select v16i8, v16i8, v16i8) and use AVX instruction. - // The same situation for all 128 and 256-bit vectors of i8 and i16. - // Since SKX these selects have a proper lowering. - EVT OpVT = LHS.getValueType(); - if ((OpVT.is128BitVector() || OpVT.is256BitVector()) && - (OpVT.getVectorElementType() == MVT::i8 || - OpVT.getVectorElementType() == MVT::i16) && - !(Subtarget.hasBWI() && Subtarget.hasVLX())) { - Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, OpVT, Cond); - DCI.AddToWorklist(Cond.getNode()); - return DAG.getNode(N->getOpcode(), DL, OpVT, Cond, LHS, RHS); - } + // v16i8 (select v16i1, v16i8, v16i8) does not have a proper + // lowering on KNL. 
In this case we convert it to + // v16i8 (select v16i8, v16i8, v16i8) and use AVX instruction. + // The same situation for all 128 and 256-bit vectors of i8 and i16. + // Since SKX these selects have a proper lowering. + if (Subtarget.hasAVX512() && CondVT.isVector() && + CondVT.getVectorElementType() == MVT::i1 && + (VT.is128BitVector() || VT.is256BitVector()) && + (VT.getVectorElementType() == MVT::i8 || + VT.getVectorElementType() == MVT::i16) && + !(Subtarget.hasBWI() && Subtarget.hasVLX())) { + Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond); + DCI.AddToWorklist(Cond.getNode()); + return DAG.getNode(N->getOpcode(), DL, VT, Cond, LHS, RHS); } - // If this is a select between two integer constants, try to do some - // optimizations. - if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(LHS)) { - if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(RHS)) - // Don't do this for crazy integer types. - if (DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType())) { - // If this is efficiently invertible, canonicalize the LHSC/RHSC values - // so that TrueC (the true value) is larger than FalseC. - bool NeedsCondInvert = false; - - if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) && - // Efficiently invertible. - (Cond.getOpcode() == ISD::SETCC || // setcc -> invertible. - (Cond.getOpcode() == ISD::XOR && // xor(X, C) -> invertible. - isa<ConstantSDNode>(Cond.getOperand(1))))) { - NeedsCondInvert = true; - std::swap(TrueC, FalseC); - } - - // Optimize C ? 8 : 0 -> zext(C) << 3. Likewise for any pow2/0. - if (FalseC->getAPIntValue() == 0 && - TrueC->getAPIntValue().isPowerOf2()) { - if (NeedsCondInvert) // Invert the condition if needed. - Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, - DAG.getConstant(1, DL, Cond.getValueType())); - - // Zero extend the condition if needed. - Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond); - - unsigned ShAmt = TrueC->getAPIntValue().logBase2(); - return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond, - DAG.getConstant(ShAmt, DL, MVT::i8)); - } - - // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst. - if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) { - if (NeedsCondInvert) // Invert the condition if needed. - Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, - DAG.getConstant(1, DL, Cond.getValueType())); - - // Zero extend the condition if needed. - Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, - FalseC->getValueType(0), Cond); - return DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, - SDValue(FalseC, 0)); - } - // Optimize cases that will turn into an LEA instruction. This requires - // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9). - if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) { - uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue(); - if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff; - - bool isFastMultiplier = false; - if (Diff < 10) { - switch ((unsigned char)Diff) { - default: break; - case 1: // result = add base, cond - case 2: // result = lea base( , cond*2) - case 3: // result = lea base(cond, cond*2) - case 4: // result = lea base( , cond*4) - case 5: // result = lea base(cond, cond*4) - case 8: // result = lea base( , cond*8) - case 9: // result = lea base(cond, cond*8) - isFastMultiplier = true; - break; - } - } - - if (isFastMultiplier) { - APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue(); - if (NeedsCondInvert) // Invert the condition if needed. 
- Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, - DAG.getConstant(1, DL, Cond.getValueType())); - - // Zero extend the condition if needed. - Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), - Cond); - // Scale the condition by the difference. - if (Diff != 1) - Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond, - DAG.getConstant(Diff, DL, - Cond.getValueType())); - - // Add the base if non-zero. - if (FalseC->getAPIntValue() != 0) - Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, - SDValue(FalseC, 0)); - return Cond; - } - } - } - } + if (SDValue V = combineSelectOfTwoConstants(N, DAG)) + return V; // Canonicalize max and min: // (x > y) ? x : y -> (x >= y) ? x : y @@ -26832,53 +29365,8 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, } } - // Simplify vector selection if condition value type matches vselect - // operand type - if (N->getOpcode() == ISD::VSELECT && CondVT == VT) { - assert(Cond.getValueType().isVector() && - "vector select expects a vector selector!"); - - bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode()); - bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode()); - - // Try invert the condition if true value is not all 1s and false value - // is not all 0s. - if (!TValIsAllOnes && !FValIsAllZeros && - // Check if the selector will be produced by CMPP*/PCMP* - Cond.getOpcode() == ISD::SETCC && - // Check if SETCC has already been promoted - TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT) == - CondVT) { - bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode()); - bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode()); - - if (TValIsAllZeros || FValIsAllOnes) { - SDValue CC = Cond.getOperand(2); - ISD::CondCode NewCC = - ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(), - Cond.getOperand(0).getValueType().isInteger()); - Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1), NewCC); - std::swap(LHS, RHS); - TValIsAllOnes = FValIsAllOnes; - FValIsAllZeros = TValIsAllZeros; - } - } - - if (TValIsAllOnes || FValIsAllZeros) { - SDValue Ret; - - if (TValIsAllOnes && FValIsAllZeros) - Ret = Cond; - else if (TValIsAllOnes) - Ret = - DAG.getNode(ISD::OR, DL, CondVT, Cond, DAG.getBitcast(CondVT, RHS)); - else if (FValIsAllZeros) - Ret = DAG.getNode(ISD::AND, DL, CondVT, Cond, - DAG.getBitcast(CondVT, LHS)); - - return DAG.getBitcast(VT, Ret); - } - } + if (SDValue V = combineVSelectWithAllOnesOrZeros(N, DAG, DCI, Subtarget)) + return V; // If this is a *dynamic* select (non-constant condition) and we can match // this node with one of the variable blend instructions, restructure the @@ -26887,7 +29375,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() && !DCI.isBeforeLegalize() && !ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) { - unsigned BitWidth = Cond.getValueType().getScalarSizeInBits(); + unsigned BitWidth = Cond.getScalarValueSizeInBits(); // Don't optimize vector selects that map to mask-registers. if (BitWidth == 1) @@ -26965,6 +29453,17 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, } } + // Look for vselects with LHS/RHS being bitcasted from an operation that + // can be executed on another type. Push the bitcast to the inputs of + // the operation. This exposes opportunities for using masking instructions. 
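The inline code removed above is now handled by combineVSelectWithAllOnesOrZeros() earlier in this hunk. A scalar sketch of why those rewrites are sound once the selector lanes are full-width 0 / -1 masks (lane values are illustrative):

#include <cassert>
#include <cstdint>

static uint32_t laneSelect(uint32_t Cond, uint32_t T, uint32_t F) {
  return (Cond & T) | (~Cond & F); // reference vselect on one lane
}

int main() {
  const uint32_t AllOnes = 0xffffffffu, Zero = 0;
  uint32_t X = 0x12345678u;

  for (uint32_t Cond : {Zero, AllOnes}) {
    // vselect Cond, -1, 0  ->  Cond
    assert(laneSelect(Cond, AllOnes, Zero) == Cond);
    // vselect Cond, -1, X  ->  or Cond, X
    assert(laneSelect(Cond, AllOnes, X) == (Cond | X));
    // vselect Cond, X, 0   ->  and Cond, X
    assert(laneSelect(Cond, X, Zero) == (Cond & X));
  }
  return 0;
}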
+ if (N->getOpcode() == ISD::VSELECT && !DCI.isBeforeLegalizeOps() && + CondVT.getVectorElementType() == MVT::i1) { + if (combineBitcastForMaskedOp(LHS, DAG, DCI)) + return SDValue(N, 0); + if (combineBitcastForMaskedOp(RHS, DAG, DCI)) + return SDValue(N, 0); + } + return SDValue(); } @@ -26981,6 +29480,12 @@ static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC, (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0)))) return SDValue(); + // Can't replace the cmp if it has more uses than the one we're looking at. + // FIXME: We would like to be able to handle this, but would need to make sure + // all uses were updated. + if (!Cmp.hasOneUse()) + return SDValue(); + // This only applies to variations of the common case: // (icmp slt x, 0) -> (icmp sle (add x, 1), 0) // (icmp sge x, 0) -> (icmp sgt (add x, 1), 0) @@ -27088,7 +29593,6 @@ static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) { // Skip (zext $x), (trunc $x), or (and $x, 1) node. while (SetCC.getOpcode() == ISD::ZERO_EXTEND || SetCC.getOpcode() == ISD::TRUNCATE || - SetCC.getOpcode() == ISD::AssertZext || SetCC.getOpcode() == ISD::AND) { if (SetCC.getOpcode() == ISD::AND) { int OpIdx = -1; @@ -27114,7 +29618,7 @@ static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) { break; assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B && "Invalid use of SETCC_CARRY!"); - // FALL THROUGH + LLVM_FALLTHROUGH; case X86ISD::SETCC: // Set the condition code or opposite one if necessary. CC = X86::CondCode(SetCC.getConstantOperandVal(0)); @@ -27187,7 +29691,7 @@ static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0, case ISD::AND: case X86ISD::AND: isAnd = true; - // fallthru + LLVM_FALLTHROUGH; case ISD::OR: case X86ISD::OR: SetCC0 = Cond->getOperand(0); @@ -27270,8 +29774,7 @@ static SDValue combineCMov(SDNode *N, SelectionDAG &DAG, // This is efficient for any integer data type (including i8/i16) and // shift amount. if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) { - Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, - DAG.getConstant(CC, DL, MVT::i8), Cond); + Cond = getSETCC(CC, Cond, DL, DAG); // Zero extend the condition if needed. Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond); @@ -27287,8 +29790,7 @@ static SDValue combineCMov(SDNode *N, SelectionDAG &DAG, // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst. This is efficient // for any integer data type, including i8/i16. if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) { - Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, - DAG.getConstant(CC, DL, MVT::i8), Cond); + Cond = getSETCC(CC, Cond, DL, DAG); // Zero extend the condition if needed. Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, @@ -27325,8 +29827,7 @@ static SDValue combineCMov(SDNode *N, SelectionDAG &DAG, if (isFastMultiplier) { APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue(); - Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, - DAG.getConstant(CC, DL, MVT::i8), Cond); + Cond = getSETCC(CC, Cond, DL ,DAG); // Zero extend the condition if needed. Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), Cond); @@ -27525,10 +30026,17 @@ static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) { /// generate pmullw+pmulhuw for it (MULU16 mode). static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { - // pmulld is supported since SSE41. It is better to use pmulld - // instead of pmullw+pmulhw. 
+ // Check for legality // pmullw/pmulhw are not supported by SSE. - if (Subtarget.hasSSE41() || !Subtarget.hasSSE2()) + if (!Subtarget.hasSSE2()) + return SDValue(); + + // Check for profitability + // pmulld is supported since SSE41. It is better to use pmulld + // instead of pmullw+pmulhw, except for subtargets where pmulld is slower than + // the expansion. + bool OptForMinSize = DAG.getMachineFunction().getFunction()->optForMinSize(); + if (Subtarget.hasSSE41() && (OptForMinSize || !Subtarget.isPMULLDSlow())) return SDValue(); ShrinkMode Mode; @@ -27591,7 +30099,12 @@ static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG, // <4 x i16> undef). // // Legalize the operands of mul. - SmallVector<SDValue, 16> Ops(RegSize / ReducedVT.getSizeInBits(), + // FIXME: We may be able to handle non-concatenated vectors by insertion. + unsigned ReducedSizeInBits = ReducedVT.getSizeInBits(); + if ((RegSize % ReducedSizeInBits) != 0) + return SDValue(); + + SmallVector<SDValue, 16> Ops(RegSize / ReducedSizeInBits, DAG.getUNDEF(ReducedVT)); Ops[0] = NewN0; NewN0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops); @@ -27851,7 +30364,7 @@ static SDValue performShiftToAllZeros(SDNode *N, SelectionDAG &DAG, if (auto *AmtSplat = AmtBV->getConstantSplatNode()) { const APInt &ShiftAmt = AmtSplat->getAPIntValue(); unsigned MaxAmount = - VT.getSimpleVT().getVectorElementType().getSizeInBits(); + VT.getSimpleVT().getScalarSizeInBits(); // SSE2/AVX2 logical shifts always return a vector of 0s // if the shift amount is bigger than or equal to @@ -27883,6 +30396,45 @@ static SDValue combineShift(SDNode* N, SelectionDAG &DAG, return SDValue(); } +static SDValue combineVectorShift(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget &Subtarget) { + assert((X86ISD::VSHLI == N->getOpcode() || X86ISD::VSRLI == N->getOpcode()) && + "Unexpected opcode"); + EVT VT = N->getValueType(0); + unsigned NumBitsPerElt = VT.getScalarSizeInBits(); + + // This fails for mask register (vXi1) shifts. + if ((NumBitsPerElt % 8) != 0) + return SDValue(); + + // Out of range logical bit shifts are guaranteed to be zero. + APInt ShiftVal = cast<ConstantSDNode>(N->getOperand(1))->getAPIntValue(); + if (ShiftVal.zextOrTrunc(8).uge(NumBitsPerElt)) + return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(N)); + + // Shift N0 by zero -> N0. + if (!ShiftVal) + return N->getOperand(0); + + // Shift zero -> zero. + if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode())) + return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(N)); + + // We can decode 'whole byte' logical bit shifts as shuffles. + if ((ShiftVal.getZExtValue() % 8) == 0) { + SDValue Op(N, 0); + SmallVector<int, 1> NonceMask; // Just a placeholder. + NonceMask.push_back(0); + if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, + /*Depth*/ 1, /*HasVarMask*/ false, DAG, + DCI, Subtarget)) + return SDValue(); // This routine will use CombineTo to replace N. + } + + return SDValue(); +} + /// Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs /// reference the same FP CMP, and rewrite for CMPEQSS and friends. Likewise for /// OR -> CMPNEQSS. @@ -27943,7 +30495,7 @@ static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG, // See X86ATTInstPrinter.cpp:printSSECC(). unsigned x86cc = (cc0 == X86::COND_E) ? 
0 : 4; if (Subtarget.hasAVX512()) { - SDValue FSetCC = DAG.getNode(X86ISD::FSETCC, DL, MVT::i1, CMP00, + SDValue FSetCC = DAG.getNode(X86ISD::FSETCCM, DL, MVT::i1, CMP00, CMP01, DAG.getConstant(x86cc, DL, MVT::i8)); if (N->getValueType(0) != MVT::i1) @@ -27995,9 +30547,7 @@ static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) { SDValue N1 = N->getOperand(1); SDLoc DL(N); - if (VT != MVT::v2i64 && VT != MVT::v4i64 && - VT != MVT::v8i64 && VT != MVT::v16i32 && - VT != MVT::v4i32 && VT != MVT::v8i32) // Legal with VLX + if (VT != MVT::v2i64 && VT != MVT::v4i64 && VT != MVT::v8i64) return SDValue(); // Canonicalize XOR to the left. @@ -28111,95 +30661,6 @@ static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG, } } -static SDValue combineVectorZext(SDNode *N, SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI, - const X86Subtarget &Subtarget) { - SDValue N0 = N->getOperand(0); - SDValue N1 = N->getOperand(1); - SDLoc DL(N); - - // A vector zext_in_reg may be represented as a shuffle, - // feeding into a bitcast (this represents anyext) feeding into - // an and with a mask. - // We'd like to try to combine that into a shuffle with zero - // plus a bitcast, removing the and. - if (N0.getOpcode() != ISD::BITCAST || - N0.getOperand(0).getOpcode() != ISD::VECTOR_SHUFFLE) - return SDValue(); - - // The other side of the AND should be a splat of 2^C, where C - // is the number of bits in the source type. - N1 = peekThroughBitcasts(N1); - if (N1.getOpcode() != ISD::BUILD_VECTOR) - return SDValue(); - BuildVectorSDNode *Vector = cast<BuildVectorSDNode>(N1); - - ShuffleVectorSDNode *Shuffle = cast<ShuffleVectorSDNode>(N0.getOperand(0)); - EVT SrcType = Shuffle->getValueType(0); - - // We expect a single-source shuffle - if (!Shuffle->getOperand(1)->isUndef()) - return SDValue(); - - unsigned SrcSize = SrcType.getScalarSizeInBits(); - unsigned NumElems = SrcType.getVectorNumElements(); - - APInt SplatValue, SplatUndef; - unsigned SplatBitSize; - bool HasAnyUndefs; - if (!Vector->isConstantSplat(SplatValue, SplatUndef, - SplatBitSize, HasAnyUndefs)) - return SDValue(); - - unsigned ResSize = N1.getValueType().getScalarSizeInBits(); - // Make sure the splat matches the mask we expect - if (SplatBitSize > ResSize || - (SplatValue + 1).exactLogBase2() != (int)SrcSize) - return SDValue(); - - // Make sure the input and output size make sense - if (SrcSize >= ResSize || ResSize % SrcSize) - return SDValue(); - - // We expect a shuffle of the form <0, u, u, u, 1, u, u, u...> - // The number of u's between each two values depends on the ratio between - // the source and dest type. - unsigned ZextRatio = ResSize / SrcSize; - bool IsZext = true; - for (unsigned i = 0; i != NumElems; ++i) { - if (i % ZextRatio) { - if (Shuffle->getMaskElt(i) > 0) { - // Expected undef - IsZext = false; - break; - } - } else { - if (Shuffle->getMaskElt(i) != (int)(i / ZextRatio)) { - // Expected element number - IsZext = false; - break; - } - } - } - - if (!IsZext) - return SDValue(); - - // Ok, perform the transformation - replace the shuffle with - // a shuffle of the form <0, k, k, k, 1, k, k, k> with zero - // (instead of undef) where the k elements come from the zero vector. 
- SmallVector<int, 8> Mask; - for (unsigned i = 0; i != NumElems; ++i) - if (i % ZextRatio) - Mask.push_back(NumElems); - else - Mask.push_back(i / ZextRatio); - - SDValue NewShuffle = DAG.getVectorShuffle(Shuffle->getValueType(0), DL, - Shuffle->getOperand(0), DAG.getConstant(0, DL, SrcType), Mask); - return DAG.getBitcast(N0.getValueType(), NewShuffle); -} - /// If both input operands of a logic op are being cast from floating point /// types, try to convert this into a floating point logic node to avoid /// unnecessary moves from SSE to integer registers. @@ -28255,7 +30716,7 @@ static SDValue combinePCMPAnd1(SDNode *N, SelectionDAG &DAG) { // masked compare nodes, so they should not make it here. EVT VT0 = Op0.getValueType(); EVT VT1 = Op1.getValueType(); - unsigned EltBitWidth = VT0.getScalarType().getSizeInBits(); + unsigned EltBitWidth = VT0.getScalarSizeInBits(); if (VT0 != VT1 || EltBitWidth == 8) return SDValue(); @@ -28277,9 +30738,6 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG, if (DCI.isBeforeLegalizeOps()) return SDValue(); - if (SDValue Zext = combineVectorZext(N, DAG, DCI, Subtarget)) - return Zext; - if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget)) return R; @@ -28297,6 +30755,17 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG, SDValue N1 = N->getOperand(1); SDLoc DL(N); + // Attempt to recursively combine a bitmask AND with shuffles. + if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) { + SDValue Op(N, 0); + SmallVector<int, 1> NonceMask; // Just a placeholder. + NonceMask.push_back(0); + if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, + /*Depth*/ 1, /*HasVarMask*/ false, DAG, + DCI, Subtarget)) + return SDValue(); // This routine will use CombineTo to replace N. + } + // Create BEXTR instructions // BEXTR is ((X >> imm) & (2**size-1)) if (VT != MVT::i32 && VT != MVT::i64) @@ -28372,7 +30841,7 @@ static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG, // Validate that the Mask operand is a vector sra node. // FIXME: what to do for bytes, since there is a psignb/pblendvb, but // there is no psrai.b - unsigned EltBits = MaskVT.getVectorElementType().getSizeInBits(); + unsigned EltBits = MaskVT.getScalarSizeInBits(); unsigned SraAmt = ~0; if (Mask.getOpcode() == ISD::SRA) { if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Mask.getOperand(1))) @@ -28450,6 +30919,114 @@ static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG, return DAG.getBitcast(VT, Mask); } +// Helper function for combineOrCmpEqZeroToCtlzSrl +// Transforms: +// seteq(cmp x, 0) +// into: +// srl(ctlz x), log2(bitsize(x)) +// Input pattern is checked by caller. +static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, EVT ExtTy, + SelectionDAG &DAG) { + SDValue Cmp = Op.getOperand(1); + EVT VT = Cmp.getOperand(0).getValueType(); + unsigned Log2b = Log2_32(VT.getSizeInBits()); + SDLoc dl(Op); + SDValue Clz = DAG.getNode(ISD::CTLZ, dl, VT, Cmp->getOperand(0)); + // The result of the shift is true or false, and on X86, the 32-bit + // encoding of shr and lzcnt is more desirable. 
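A standalone sketch of the identity lowerX86CmpEqZeroToCtlzSrl() relies on: with LZCNT's defined-at-zero semantics, lzcnt(x) equals the bit width only when x is zero, so (x == 0) as a 0/1 value is lzcnt(x) >> log2(bitwidth). The helper below is a portable model of LZCNT; the test values are illustrative:

#include <cassert>
#include <cstdint>

static unsigned lzcnt32(uint32_t X) {
  // Model of LZCNT: count leading zeros, returning 32 for an input of zero.
  unsigned N = 0;
  for (uint32_t Bit = 0x80000000u; Bit != 0 && !(X & Bit); Bit >>= 1)
    ++N;
  return N;
}

int main() {
  const uint32_t Tests[] = {0u, 1u, 2u, 0x80000000u, 0xffffffffu, 12345u};
  for (uint32_t X : Tests) {
    uint32_t SetCCEqZero = (X == 0) ? 1u : 0u;      // setcc(eq, cmp x, 0)
    uint32_t CtlzSrl = lzcnt32(X) >> 5;             // srl(ctlz x, log2(32))
    assert(SetCCEqZero == CtlzSrl);
  }
  return 0;
}

The or(setcc, setcc) chains handled by combineOrCmpEqZeroToCtlzSrl() work for the same reason: each lzcnt value never exceeds the bit width, so or-ing the counts before a single shift gives the same 0/1 answer as shifting each one and or-ing afterwards.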
+ SDValue Trunc = DAG.getZExtOrTrunc(Clz, dl, MVT::i32); + SDValue Scc = DAG.getNode(ISD::SRL, dl, MVT::i32, Trunc, + DAG.getConstant(Log2b, dl, VT)); + return DAG.getZExtOrTrunc(Scc, dl, ExtTy); +} + +// Try to transform: +// zext(or(setcc(eq, (cmp x, 0)), setcc(eq, (cmp y, 0)))) +// into: +// srl(or(ctlz(x), ctlz(y)), log2(bitsize(x)) +// Will also attempt to match more generic cases, eg: +// zext(or(or(setcc(eq, cmp 0), setcc(eq, cmp 0)), setcc(eq, cmp 0))) +// Only applies if the target supports the FastLZCNT feature. +static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget &Subtarget) { + if (DCI.isBeforeLegalize() || !Subtarget.getTargetLowering()->isCtlzFast()) + return SDValue(); + + auto isORCandidate = [](SDValue N) { + return (N->getOpcode() == ISD::OR && N->hasOneUse()); + }; + + // Check the zero extend is extending to 32-bit or more. The code generated by + // srl(ctlz) for 16-bit or less variants of the pattern would require extra + // instructions to clear the upper bits. + if (!N->hasOneUse() || !N->getSimpleValueType(0).bitsGE(MVT::i32) || + !isORCandidate(N->getOperand(0))) + return SDValue(); + + // Check the node matches: setcc(eq, cmp 0) + auto isSetCCCandidate = [](SDValue N) { + return N->getOpcode() == X86ISD::SETCC && N->hasOneUse() && + X86::CondCode(N->getConstantOperandVal(0)) == X86::COND_E && + N->getOperand(1).getOpcode() == X86ISD::CMP && + N->getOperand(1).getConstantOperandVal(1) == 0 && + N->getOperand(1).getValueType().bitsGE(MVT::i32); + }; + + SDNode *OR = N->getOperand(0).getNode(); + SDValue LHS = OR->getOperand(0); + SDValue RHS = OR->getOperand(1); + + // Save nodes matching or(or, setcc(eq, cmp 0)). + SmallVector<SDNode *, 2> ORNodes; + while (((isORCandidate(LHS) && isSetCCCandidate(RHS)) || + (isORCandidate(RHS) && isSetCCCandidate(LHS)))) { + ORNodes.push_back(OR); + OR = (LHS->getOpcode() == ISD::OR) ? LHS.getNode() : RHS.getNode(); + LHS = OR->getOperand(0); + RHS = OR->getOperand(1); + } + + // The last OR node should match or(setcc(eq, cmp 0), setcc(eq, cmp 0)). + if (!(isSetCCCandidate(LHS) && isSetCCCandidate(RHS)) || + !isORCandidate(SDValue(OR, 0))) + return SDValue(); + + // We have a or(setcc(eq, cmp 0), setcc(eq, cmp 0)) pattern, try to lower it + // to + // or(srl(ctlz),srl(ctlz)). + // The dag combiner can then fold it into: + // srl(or(ctlz, ctlz)). + EVT VT = OR->getValueType(0); + SDValue NewLHS = lowerX86CmpEqZeroToCtlzSrl(LHS, VT, DAG); + SDValue Ret, NewRHS; + if (NewLHS && (NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG))) + Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, NewLHS, NewRHS); + + if (!Ret) + return SDValue(); + + // Try to lower nodes matching the or(or, setcc(eq, cmp 0)) pattern. + while (ORNodes.size() > 0) { + OR = ORNodes.pop_back_val(); + LHS = OR->getOperand(0); + RHS = OR->getOperand(1); + // Swap rhs with lhs to match or(setcc(eq, cmp, 0), or). 
+ if (RHS->getOpcode() == ISD::OR) + std::swap(LHS, RHS); + EVT VT = OR->getValueType(0); + SDValue NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG); + if (!NewRHS) + return SDValue(); + Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, Ret, NewRHS); + } + + if (Ret) + Ret = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret); + + return Ret; +} + static SDValue combineOr(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { @@ -28505,18 +31082,23 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG, unsigned Opc = X86ISD::SHLD; SDValue Op0 = N0.getOperand(0); SDValue Op1 = N1.getOperand(0); - if (ShAmt0.getOpcode() == ISD::SUB) { + if (ShAmt0.getOpcode() == ISD::SUB || + ShAmt0.getOpcode() == ISD::XOR) { Opc = X86ISD::SHRD; std::swap(Op0, Op1); std::swap(ShAmt0, ShAmt1); } + // OR( SHL( X, C ), SRL( Y, 32 - C ) ) -> SHLD( X, Y, C ) + // OR( SRL( X, C ), SHL( Y, 32 - C ) ) -> SHRD( X, Y, C ) + // OR( SHL( X, C ), SRL( SRL( Y, 1 ), XOR( C, 31 ) ) ) -> SHLD( X, Y, C ) + // OR( SRL( X, C ), SHL( SHL( Y, 1 ), XOR( C, 31 ) ) ) -> SHRD( X, Y, C ) unsigned Bits = VT.getSizeInBits(); if (ShAmt1.getOpcode() == ISD::SUB) { SDValue Sum = ShAmt1.getOperand(0); if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) { SDValue ShAmt1Op1 = ShAmt1.getOperand(1); - if (ShAmt1Op1.getNode()->getOpcode() == ISD::TRUNCATE) + if (ShAmt1Op1.getOpcode() == ISD::TRUNCATE) ShAmt1Op1 = ShAmt1Op1.getOperand(0); if (SumC->getSExtValue() == Bits && ShAmt1Op1 == ShAmt0) return DAG.getNode(Opc, DL, VT, @@ -28526,18 +31108,39 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG, } } else if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) { ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0); - if (ShAmt0C && - ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue() == Bits) + if (ShAmt0C && (ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue()) == Bits) return DAG.getNode(Opc, DL, VT, N0.getOperand(0), N1.getOperand(0), DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0)); + } else if (ShAmt1.getOpcode() == ISD::XOR) { + SDValue Mask = ShAmt1.getOperand(1); + if (ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask)) { + unsigned InnerShift = (X86ISD::SHLD == Opc ? ISD::SRL : ISD::SHL); + SDValue ShAmt1Op0 = ShAmt1.getOperand(0); + if (ShAmt1Op0.getOpcode() == ISD::TRUNCATE) + ShAmt1Op0 = ShAmt1Op0.getOperand(0); + if (MaskC->getSExtValue() == (Bits - 1) && ShAmt1Op0 == ShAmt0) { + if (Op1.getOpcode() == InnerShift && + isa<ConstantSDNode>(Op1.getOperand(1)) && + Op1.getConstantOperandVal(1) == 1) { + return DAG.getNode(Opc, DL, VT, Op0, Op1.getOperand(0), + DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0)); + } + // Test for ADD( Y, Y ) as an equivalent to SHL( Y, 1 ). + if (InnerShift == ISD::SHL && Op1.getOpcode() == ISD::ADD && + Op1.getOperand(0) == Op1.getOperand(1)) { + return DAG.getNode(Opc, DL, VT, Op0, Op1.getOperand(0), + DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0)); + } + } + } } return SDValue(); } -// Generate NEG and CMOV for integer abs. +/// Generate NEG and CMOV for integer abs. static SDValue combineIntegerAbs(SDNode *N, SelectionDAG &DAG) { EVT VT = N->getValueType(0); @@ -28553,21 +31156,19 @@ static SDValue combineIntegerAbs(SDNode *N, SelectionDAG &DAG) { // Check pattern of XOR(ADD(X,Y), Y) where Y is SRA(X, size(X)-1) // and change it to SUB and CMOV. 
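// Illustrative sketch, not part of this patch: the source-level idiom that
// produces the XOR(ADD(X,Y), Y) / Y = SRA(X, 31) shape described above. The
// arithmetic shift broadcasts the sign bit, so (x + y) ^ y equals abs(x) for
// any x other than INT32_MIN; the combine re-expresses it as NEG + CMOV.
#include <cassert>
#include <cstdint>

static int32_t branchlessAbs(int32_t x) {
  int32_t y = x >> 31;   // 0 when x >= 0, -1 when x < 0 (sign splat)
  return (x + y) ^ y;    // x when y == 0, -x when y == -1
}

static void checkBranchlessAbs() {
  assert(branchlessAbs(5) == 5);
  assert(branchlessAbs(-5) == 5);
  assert(branchlessAbs(0) == 0);
}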
if (VT.isInteger() && N->getOpcode() == ISD::XOR && - N0.getOpcode() == ISD::ADD && - N0.getOperand(1) == N1 && - N1.getOpcode() == ISD::SRA && - N1.getOperand(0) == N0.getOperand(0)) - if (ConstantSDNode *Y1C = dyn_cast<ConstantSDNode>(N1.getOperand(1))) - if (Y1C->getAPIntValue() == VT.getSizeInBits()-1) { - // Generate SUB & CMOV. - SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32), - DAG.getConstant(0, DL, VT), N0.getOperand(0)); - - SDValue Ops[] = { N0.getOperand(0), Neg, - DAG.getConstant(X86::COND_GE, DL, MVT::i8), - SDValue(Neg.getNode(), 1) }; - return DAG.getNode(X86ISD::CMOV, DL, DAG.getVTList(VT, MVT::Glue), Ops); - } + N0.getOpcode() == ISD::ADD && N0.getOperand(1) == N1 && + N1.getOpcode() == ISD::SRA && N1.getOperand(0) == N0.getOperand(0)) { + auto *Y1C = dyn_cast<ConstantSDNode>(N1.getOperand(1)); + if (Y1C && Y1C->getAPIntValue() == VT.getSizeInBits() - 1) { + // Generate SUB & CMOV. + SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32), + DAG.getConstant(0, DL, VT), N0.getOperand(0)); + SDValue Ops[] = {N0.getOperand(0), Neg, + DAG.getConstant(X86::COND_GE, DL, MVT::i8), + SDValue(Neg.getNode(), 1)}; + return DAG.getNode(X86ISD::CMOV, DL, DAG.getVTList(VT, MVT::Glue), Ops); + } + } return SDValue(); } @@ -28671,28 +31272,6 @@ static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG, return DAG.getNode(X86ISD::PCMPGT, SDLoc(N), VT, Shift.getOperand(0), Ones); } -static SDValue combineXor(SDNode *N, SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI, - const X86Subtarget &Subtarget) { - if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget)) - return Cmp; - - if (DCI.isBeforeLegalizeOps()) - return SDValue(); - - if (SDValue RV = foldXorTruncShiftIntoCmp(N, DAG)) - return RV; - - if (Subtarget.hasCMov()) - if (SDValue RV = combineIntegerAbs(N, DAG)) - return RV; - - if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget)) - return FPLogic; - - return SDValue(); -} - /// This function detects the AVG pattern between vectors of unsigned i8/i16, /// which is c = (a + b + 1) / 2, and replace this operation with the efficient /// X86ISD::AVG instruction. @@ -28717,7 +31296,7 @@ static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG, if (!Subtarget.hasSSE2()) return SDValue(); - if (Subtarget.hasAVX512()) { + if (Subtarget.hasBWI()) { if (VT.getSizeInBits() > 512) return SDValue(); } else if (Subtarget.hasAVX2()) { @@ -28999,6 +31578,11 @@ static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { MaskedLoadSDNode *Mld = cast<MaskedLoadSDNode>(N); + + // TODO: Expanding load with constant mask may be optimized as well. + if (Mld->isExpandingLoad()) + return SDValue(); + if (Mld->getExtensionType() == ISD::NON_EXTLOAD) { if (SDValue ScalarLoad = reduceMaskedLoadToScalarLoad(Mld, DAG, DCI)) return ScalarLoad; @@ -29018,8 +31602,8 @@ static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG, SDLoc dl(Mld); assert(LdVT != VT && "Cannot extend to the same type"); - unsigned ToSz = VT.getVectorElementType().getSizeInBits(); - unsigned FromSz = LdVT.getVectorElementType().getSizeInBits(); + unsigned ToSz = VT.getScalarSizeInBits(); + unsigned FromSz = LdVT.getScalarSizeInBits(); // From/To sizes and ElemCount must be pow of two. 
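// Illustrative sketch, not part of this patch: reference semantics of the
// AVG pattern that detectAVGPattern (earlier in this hunk) looks for. PAVGB
// computes the rounded average (a + b + 1) >> 1 without dropping the carry,
// which is what the zero-extended IR form of c = (a + b + 1) / 2 expresses.
#include <cassert>
#include <cstdint>

static uint8_t roundedAvgU8(uint8_t a, uint8_t b) {
  // Widen so a + b + 1 cannot wrap, mirroring the zext'd operands in the IR.
  return static_cast<uint8_t>(
      (static_cast<uint16_t>(a) + static_cast<uint16_t>(b) + 1) >> 1);
}

static void checkRoundedAvg() {
  assert(roundedAvgU8(0, 0) == 0);
  assert(roundedAvgU8(1, 2) == 2);       // rounds upward
  assert(roundedAvgU8(255, 255) == 255); // no overflow in the wide type
}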
assert (isPowerOf2_32(NumElems * FromSz * ToSz) && "Unexpected size for extending masked load"); @@ -29114,6 +31698,10 @@ static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS, static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N); + + if (Mst->isCompressingStore()) + return SDValue(); + if (!Mst->isTruncatingStore()) return reduceMaskedStoreToScalarStore(Mst, DAG); @@ -29124,8 +31712,8 @@ static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG, SDLoc dl(Mst); assert(StVT != VT && "Cannot truncate to the same type"); - unsigned FromSz = VT.getVectorElementType().getSizeInBits(); - unsigned ToSz = StVT.getVectorElementType().getSizeInBits(); + unsigned FromSz = VT.getScalarSizeInBits(); + unsigned ToSz = StVT.getScalarSizeInBits(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); @@ -29253,8 +31841,8 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG, const TargetLowering &TLI = DAG.getTargetLoweringInfo(); unsigned NumElems = VT.getVectorNumElements(); assert(StVT != VT && "Cannot truncate to the same type"); - unsigned FromSz = VT.getVectorElementType().getSizeInBits(); - unsigned ToSz = StVT.getVectorElementType().getSizeInBits(); + unsigned FromSz = VT.getScalarSizeInBits(); + unsigned ToSz = StVT.getScalarSizeInBits(); // The truncating store is legal in some cases. For example // vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw @@ -29596,6 +32184,83 @@ static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG, return SDValue(); } +/// Attempt to pre-truncate inputs to arithmetic ops if it will simplify +/// the codegen. +/// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) ) +static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG, + const X86Subtarget &Subtarget, + SDLoc &DL) { + assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode"); + SDValue Src = N->getOperand(0); + unsigned Opcode = Src.getOpcode(); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + + EVT VT = N->getValueType(0); + EVT SrcVT = Src.getValueType(); + + auto IsRepeatedOpOrOneUseConstant = [](SDValue Op0, SDValue Op1) { + // TODO: Add extra cases where we can truncate both inputs for the + // cost of one (or none). + // e.g. TRUNC( BINOP( EXT( X ), EXT( Y ) ) ) --> BINOP( X, Y ) + if (Op0 == Op1) + return true; + + SDValue BC0 = peekThroughOneUseBitcasts(Op0); + SDValue BC1 = peekThroughOneUseBitcasts(Op1); + return ISD::isBuildVectorOfConstantSDNodes(BC0.getNode()) || + ISD::isBuildVectorOfConstantSDNodes(BC1.getNode()); + }; + + auto TruncateArithmetic = [&](SDValue N0, SDValue N1) { + SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, VT, N0); + SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, VT, N1); + return DAG.getNode(Opcode, DL, VT, Trunc0, Trunc1); + }; + + // Don't combine if the operation has other uses. + if (!N->isOnlyUserOf(Src.getNode())) + return SDValue(); + + // Only support vector truncation for now. + // TODO: i64 scalar math would benefit as well. + if (!VT.isVector()) + return SDValue(); + + // In most cases its only worth pre-truncating if we're only facing the cost + // of one truncation. + // i.e. if one of the inputs will constant fold or the input is repeated. 
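// Illustrative sketch, not part of this patch: why the narrowing done by
// combineTruncatedArithmetic is sound. Bitwise ops, add and mul produce their
// low bits from low bits only, so truncating before or after the operation
// gives the same value; the combine merely picks the cheaper order when one
// input is a constant or the two inputs are the same node.
#include <cassert>
#include <cstdint>

static void checkTruncCommutes(uint32_t x, uint32_t y) {
  uint32_t lx = x & 0xFFFFu, ly = y & 0xFFFFu; // pre-truncated inputs
  assert(static_cast<uint16_t>(x & y) == static_cast<uint16_t>(lx & ly));
  assert(static_cast<uint16_t>(x + y) == static_cast<uint16_t>(lx + ly));
  assert(static_cast<uint16_t>(x * y) == static_cast<uint16_t>(lx * ly));
}

static void runTruncChecks() {
  checkTruncCommutes(0xDEADBEEFu, 0x12345678u);
  checkTruncCommutes(0xFFFFFFFFu, 0x0000FFFFu);
}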
+ switch (Opcode) { + case ISD::AND: + case ISD::XOR: + case ISD::OR: { + SDValue Op0 = Src.getOperand(0); + SDValue Op1 = Src.getOperand(1); + if (TLI.isOperationLegalOrPromote(Opcode, VT) && + IsRepeatedOpOrOneUseConstant(Op0, Op1)) + return TruncateArithmetic(Op0, Op1); + break; + } + + case ISD::MUL: + // X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) - its + // better to truncate if we have the chance. + if (SrcVT.getScalarType() == MVT::i64 && TLI.isOperationLegal(Opcode, VT) && + !TLI.isOperationLegal(Opcode, SrcVT)) + return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1)); + LLVM_FALLTHROUGH; + case ISD::ADD: { + SDValue Op0 = Src.getOperand(0); + SDValue Op1 = Src.getOperand(1); + if (TLI.isOperationLegal(Opcode, VT) && + IsRepeatedOpOrOneUseConstant(Op0, Op1)) + return TruncateArithmetic(Op0, Op1); + break; + } + } + + return SDValue(); +} + /// Truncate a group of v4i32 into v16i8/v8i16 using X86ISD::PACKUS. static SDValue combineVectorTruncationWithPACKUS(SDNode *N, SelectionDAG &DAG, @@ -29653,7 +32318,8 @@ combineVectorTruncationWithPACKUS(SDNode *N, SelectionDAG &DAG, /// Truncate a group of v4i32 into v8i16 using X86ISD::PACKSS. static SDValue -combineVectorTruncationWithPACKSS(SDNode *N, SelectionDAG &DAG, +combineVectorTruncationWithPACKSS(SDNode *N, const X86Subtarget &Subtarget, + SelectionDAG &DAG, SmallVector<SDValue, 8> &Regs) { assert(Regs.size() > 0 && Regs[0].getValueType() == MVT::v4i32); EVT OutVT = N->getValueType(0); @@ -29662,8 +32328,10 @@ combineVectorTruncationWithPACKSS(SDNode *N, SelectionDAG &DAG, // Shift left by 16 bits, then arithmetic-shift right by 16 bits. SDValue ShAmt = DAG.getConstant(16, DL, MVT::i32); for (auto &Reg : Regs) { - Reg = getTargetVShiftNode(X86ISD::VSHLI, DL, MVT::v4i32, Reg, ShAmt, DAG); - Reg = getTargetVShiftNode(X86ISD::VSRAI, DL, MVT::v4i32, Reg, ShAmt, DAG); + Reg = getTargetVShiftNode(X86ISD::VSHLI, DL, MVT::v4i32, Reg, ShAmt, + Subtarget, DAG); + Reg = getTargetVShiftNode(X86ISD::VSRAI, DL, MVT::v4i32, Reg, ShAmt, + Subtarget, DAG); } for (unsigned i = 0, e = Regs.size() / 2; i < e; i++) @@ -29681,7 +32349,7 @@ combineVectorTruncationWithPACKSS(SDNode *N, SelectionDAG &DAG, /// X86ISD::PACKUS/X86ISD::PACKSS operations. We do it here because after type /// legalization the truncation will be translated into a BUILD_VECTOR with each /// element that is extracted from a vector and then truncated, and it is -/// diffcult to do this optimization based on them. +/// difficult to do this optimization based on them. static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { EVT OutVT = N->getValueType(0); @@ -29732,17 +32400,60 @@ static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG, if (Subtarget.hasSSE41() || OutSVT == MVT::i8) return combineVectorTruncationWithPACKUS(N, DAG, SubVec); else if (InSVT == MVT::i32) - return combineVectorTruncationWithPACKSS(N, DAG, SubVec); + return combineVectorTruncationWithPACKSS(N, Subtarget, DAG, SubVec); else return SDValue(); } +/// This function transforms vector truncation of 'all or none' bits values. +/// vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32 into X86ISD::PACKSS operations. +static SDValue combineVectorSignBitsTruncation(SDNode *N, SDLoc &DL, + SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + // Requires SSE2 but AVX512 has fast truncate. 
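// Illustrative sketch, not part of this patch: why PACKSS is a lossless
// truncation once ComputeNumSignBits (below) proves every lane is a splatted
// sign bit. Lanes are then either 0 or -1, and signed saturation maps 0 -> 0
// and -1 -> -1 at any narrower width.
#include <cassert>
#include <cstdint>

static int8_t packssLane(int16_t v) {
  // Scalar model of one PACKSSWB lane: clamp to [-128, 127].
  if (v > 127) return 127;
  if (v < -128) return -128;
  return static_cast<int8_t>(v);
}

static void checkSignSplatTruncation() {
  assert(packssLane(0) == 0);         // compare-false lane survives
  assert(packssLane(-1) == -1);       // compare-true lane survives
  assert(packssLane(0x7FFF) == 127);  // a general lane would saturate
}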
+ if (!Subtarget.hasSSE2() || Subtarget.hasAVX512()) + return SDValue(); + + if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple()) + return SDValue(); + + SDValue In = N->getOperand(0); + if (!In.getValueType().isSimple()) + return SDValue(); + + MVT VT = N->getValueType(0).getSimpleVT(); + MVT SVT = VT.getScalarType(); + + MVT InVT = In.getValueType().getSimpleVT(); + MVT InSVT = InVT.getScalarType(); + + // Use PACKSS if the input is a splatted sign bit. + // e.g. Comparison result, sext_in_reg, etc. + unsigned NumSignBits = DAG.ComputeNumSignBits(In); + if (NumSignBits != InSVT.getSizeInBits()) + return SDValue(); + + // Check we have a truncation suited for PACKSS. + if (!VT.is128BitVector() && !VT.is256BitVector()) + return SDValue(); + if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32) + return SDValue(); + if (InSVT != MVT::i16 && InSVT != MVT::i32 && InSVT != MVT::i64) + return SDValue(); + + return truncateVectorCompareWithPACKSS(VT, In, DL, DAG, Subtarget); +} + static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { EVT VT = N->getValueType(0); SDValue Src = N->getOperand(0); SDLoc DL(N); + // Attempt to pre-truncate inputs to arithmetic ops instead. + if (SDValue V = combineTruncatedArithmetic(N, DAG, Subtarget, DL)) + return V; + // Try to detect AVG pattern first. if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL)) return Avg; @@ -29755,15 +32466,75 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG, return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc); } + // Try to truncate extended sign bits with PACKSS. + if (SDValue V = combineVectorSignBitsTruncation(N, DL, DAG, Subtarget)) + return V; + return combineVectorTruncation(N, DAG, Subtarget); } +/// Returns the negated value if the node \p N flips sign of FP value. +/// +/// FP-negation node may have different forms: FNEG(x) or FXOR (x, 0x80000000). +/// AVX512F does not have FXOR, so FNEG is lowered as +/// (bitcast (xor (bitcast x), (bitcast ConstantFP(0x80000000)))). +/// In this case we go though all bitcasts. +static SDValue isFNEG(SDNode *N) { + if (N->getOpcode() == ISD::FNEG) + return N->getOperand(0); + + SDValue Op = peekThroughBitcasts(SDValue(N, 0)); + if (Op.getOpcode() != X86ISD::FXOR && Op.getOpcode() != ISD::XOR) + return SDValue(); + + SDValue Op1 = peekThroughBitcasts(Op.getOperand(1)); + if (!Op1.getValueType().isFloatingPoint()) + return SDValue(); + + SDValue Op0 = peekThroughBitcasts(Op.getOperand(0)); + + unsigned EltBits = Op1.getScalarValueSizeInBits(); + auto isSignBitValue = [&](const ConstantFP *C) { + return C->getValueAPF().bitcastToAPInt() == APInt::getSignBit(EltBits); + }; + + // There is more than one way to represent the same constant on + // the different X86 targets. The type of the node may also depend on size. + // - load scalar value and broadcast + // - BUILD_VECTOR node + // - load from a constant pool. + // We check all variants here. 
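// Illustrative sketch, not part of this patch: the bit-level fact isFNEG
// depends on (assuming IEEE-754 binary32, which holds on x86). Negating a
// float flips only the sign bit, so a (bitcast (xor (bitcast x), 0x80000000))
// chain really is FNEG(x), however the sign-bit constant is materialized.
#include <bit>
#include <cassert>
#include <cmath>
#include <cstdint>

static float negateViaXor(float x) {
  uint32_t bits = std::bit_cast<uint32_t>(x);
  return std::bit_cast<float>(bits ^ 0x80000000u); // flip the sign bit only
}

static void checkNegateViaXor() {
  assert(negateViaXor(1.5f) == -1.5f);
  assert(negateViaXor(-2.0f) == 2.0f);
  assert(std::signbit(negateViaXor(0.0f))); // +0.0 becomes -0.0
}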
+ if (Op1.getOpcode() == X86ISD::VBROADCAST) { + if (auto *C = getTargetConstantFromNode(Op1.getOperand(0))) + if (isSignBitValue(cast<ConstantFP>(C))) + return Op0; + + } else if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op1)) { + if (ConstantFPSDNode *CN = BV->getConstantFPSplatNode()) + if (isSignBitValue(CN->getConstantFPValue())) + return Op0; + + } else if (auto *C = getTargetConstantFromNode(Op1)) { + if (C->getType()->isVectorTy()) { + if (auto *SplatV = C->getSplatValue()) + if (isSignBitValue(cast<ConstantFP>(SplatV))) + return Op0; + } else if (auto *FPConst = dyn_cast<ConstantFP>(C)) + if (isSignBitValue(FPConst)) + return Op0; + } + return SDValue(); +} + /// Do target-specific dag combines on floating point negations. static SDValue combineFneg(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { - EVT VT = N->getValueType(0); + EVT OrigVT = N->getValueType(0); + SDValue Arg = isFNEG(N); + assert(Arg.getNode() && "N is expected to be an FNEG node"); + + EVT VT = Arg.getValueType(); EVT SVT = VT.getScalarType(); - SDValue Arg = N->getOperand(0); SDLoc DL(N); // Let legalize expand this if it isn't a legal type yet. @@ -29776,70 +32547,182 @@ static SDValue combineFneg(SDNode *N, SelectionDAG &DAG, if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) && Arg->getFlags()->hasNoSignedZeros() && Subtarget.hasAnyFMA()) { SDValue Zero = DAG.getConstantFP(0.0, DL, VT); - return DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0), - Arg.getOperand(1), Zero); + SDValue NewNode = DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0), + Arg.getOperand(1), Zero); + return DAG.getBitcast(OrigVT, NewNode); } - // If we're negating a FMA node, then we can adjust the + // If we're negating an FMA node, then we can adjust the // instruction to include the extra negation. + unsigned NewOpcode = 0; if (Arg.hasOneUse()) { switch (Arg.getOpcode()) { - case X86ISD::FMADD: - return DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0), - Arg.getOperand(1), Arg.getOperand(2)); - case X86ISD::FMSUB: - return DAG.getNode(X86ISD::FNMADD, DL, VT, Arg.getOperand(0), - Arg.getOperand(1), Arg.getOperand(2)); - case X86ISD::FNMADD: - return DAG.getNode(X86ISD::FMSUB, DL, VT, Arg.getOperand(0), - Arg.getOperand(1), Arg.getOperand(2)); - case X86ISD::FNMSUB: - return DAG.getNode(X86ISD::FMADD, DL, VT, Arg.getOperand(0), - Arg.getOperand(1), Arg.getOperand(2)); - } - } + case X86ISD::FMADD: NewOpcode = X86ISD::FNMSUB; break; + case X86ISD::FMSUB: NewOpcode = X86ISD::FNMADD; break; + case X86ISD::FNMADD: NewOpcode = X86ISD::FMSUB; break; + case X86ISD::FNMSUB: NewOpcode = X86ISD::FMADD; break; + case X86ISD::FMADD_RND: NewOpcode = X86ISD::FNMSUB_RND; break; + case X86ISD::FMSUB_RND: NewOpcode = X86ISD::FNMADD_RND; break; + case X86ISD::FNMADD_RND: NewOpcode = X86ISD::FMSUB_RND; break; + case X86ISD::FNMSUB_RND: NewOpcode = X86ISD::FMADD_RND; break; + // We can't handle scalar intrinsic node here because it would only + // invert one element and not the whole vector. But we could try to handle + // a negation of the lower element only. + } + } + if (NewOpcode) + return DAG.getBitcast(OrigVT, DAG.getNode(NewOpcode, DL, VT, + Arg.getNode()->ops())); + return SDValue(); } static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG, - const X86Subtarget &Subtarget) { - EVT VT = N->getValueType(0); - if (VT.is512BitVector() && !Subtarget.hasDQI()) { - // VXORPS, VORPS, VANDPS, VANDNPS are supported only under DQ extention. 
- // These logic operations may be executed in the integer domain. + const X86Subtarget &Subtarget) { + MVT VT = N->getSimpleValueType(0); + // If we have integer vector types available, use the integer opcodes. + if (VT.isVector() && Subtarget.hasSSE2()) { SDLoc dl(N); - MVT IntScalar = MVT::getIntegerVT(VT.getScalarSizeInBits()); - MVT IntVT = MVT::getVectorVT(IntScalar, VT.getVectorNumElements()); + + MVT IntVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64); SDValue Op0 = DAG.getBitcast(IntVT, N->getOperand(0)); SDValue Op1 = DAG.getBitcast(IntVT, N->getOperand(1)); - unsigned IntOpcode = 0; + unsigned IntOpcode; switch (N->getOpcode()) { - default: llvm_unreachable("Unexpected FP logic op"); - case X86ISD::FOR: IntOpcode = ISD::OR; break; - case X86ISD::FXOR: IntOpcode = ISD::XOR; break; - case X86ISD::FAND: IntOpcode = ISD::AND; break; - case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break; + default: llvm_unreachable("Unexpected FP logic op"); + case X86ISD::FOR: IntOpcode = ISD::OR; break; + case X86ISD::FXOR: IntOpcode = ISD::XOR; break; + case X86ISD::FAND: IntOpcode = ISD::AND; break; + case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break; } SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1); return DAG.getBitcast(VT, IntOp); } return SDValue(); } + +static SDValue combineXor(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget &Subtarget) { + if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget)) + return Cmp; + + if (DCI.isBeforeLegalizeOps()) + return SDValue(); + + if (SDValue RV = foldXorTruncShiftIntoCmp(N, DAG)) + return RV; + + if (Subtarget.hasCMov()) + if (SDValue RV = combineIntegerAbs(N, DAG)) + return RV; + + if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget)) + return FPLogic; + + if (isFNEG(N)) + return combineFneg(N, DAG, Subtarget); + return SDValue(); +} + + +static bool isNullFPScalarOrVectorConst(SDValue V) { + return isNullFPConstant(V) || ISD::isBuildVectorAllZeros(V.getNode()); +} + +/// If a value is a scalar FP zero or a vector FP zero (potentially including +/// undefined elements), return a zero constant that may be used to fold away +/// that value. In the case of a vector, the returned constant will not contain +/// undefined elements even if the input parameter does. This makes it suitable +/// to be used as a replacement operand with operations (eg, bitwise-and) where +/// an undef should not propagate. +static SDValue getNullFPConstForNullVal(SDValue V, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + if (!isNullFPScalarOrVectorConst(V)) + return SDValue(); + + if (V.getValueType().isVector()) + return getZeroVector(V.getSimpleValueType(), Subtarget, DAG, SDLoc(V)); + + return V; +} + +static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + EVT VT = N->getValueType(0); + SDLoc DL(N); + + // Vector types are handled in combineANDXORWithAllOnesIntoANDNP(). 
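// Illustrative sketch, not part of this patch: the bitwise identity behind
// the folds this helper performs. XOR with an all-ones bit pattern is bitwise
// NOT, so FAND(FXOR(X, all-ones), Y) computes the same lanes as FANDN(X, Y),
// i.e. the ~X & Y that ANDNPS/ANDNPD produce.
#include <cassert>
#include <cstdint>

static void checkFAndFNotIsFAndn(uint32_t x, uint32_t y) {
  uint32_t notX = x ^ 0xFFFFFFFFu; // FXOR with an all-ones constant lane
  assert((notX & y) == (~x & y));  // matches the FANDN result
}

static void runFAndnChecks() {
  checkFAndFNotIsFAndn(0x80000000u, 0x3F800000u); // sign mask vs. 1.0f bits
  checkFAndFNotIsFAndn(0u, 0x7FFFFFFFu);
}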
+ if (!((VT == MVT::f32 && Subtarget.hasSSE1()) || + (VT == MVT::f64 && Subtarget.hasSSE2()))) + return SDValue(); + + auto isAllOnesConstantFP = [](SDValue V) { + auto *C = dyn_cast<ConstantFPSDNode>(V); + return C && C->getConstantFPValue()->isAllOnesValue(); + }; + + // fand (fxor X, -1), Y --> fandn X, Y + if (N0.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N0.getOperand(1))) + return DAG.getNode(X86ISD::FANDN, DL, VT, N0.getOperand(0), N1); + + // fand X, (fxor Y, -1) --> fandn Y, X + if (N1.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N1.getOperand(1))) + return DAG.getNode(X86ISD::FANDN, DL, VT, N1.getOperand(0), N0); + + return SDValue(); +} + +/// Do target-specific dag combines on X86ISD::FAND nodes. +static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + // FAND(0.0, x) -> 0.0 + if (SDValue V = getNullFPConstForNullVal(N->getOperand(0), DAG, Subtarget)) + return V; + + // FAND(x, 0.0) -> 0.0 + if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget)) + return V; + + if (SDValue V = combineFAndFNotToFAndn(N, DAG, Subtarget)) + return V; + + return lowerX86FPLogicOp(N, DAG, Subtarget); +} + +/// Do target-specific dag combines on X86ISD::FANDN nodes. +static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + // FANDN(0.0, x) -> x + if (isNullFPScalarOrVectorConst(N->getOperand(0))) + return N->getOperand(1); + + // FANDN(x, 0.0) -> 0.0 + if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget)) + return V; + + return lowerX86FPLogicOp(N, DAG, Subtarget); +} + /// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes. static SDValue combineFOr(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR); // F[X]OR(0.0, x) -> x - if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) - if (C->getValueAPF().isPosZero()) - return N->getOperand(1); + if (isNullFPScalarOrVectorConst(N->getOperand(0))) + return N->getOperand(1); // F[X]OR(x, 0.0) -> x - if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1))) - if (C->getValueAPF().isPosZero()) - return N->getOperand(0); + if (isNullFPScalarOrVectorConst(N->getOperand(1))) + return N->getOperand(0); + + if (isFNEG(N)) + if (SDValue NewVal = combineFneg(N, DAG, Subtarget)) + return NewVal; return lowerX86FPLogicOp(N, DAG, Subtarget); } @@ -29921,38 +32804,6 @@ static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG, return DAG.getNode(SelectOpcode, DL, VT, IsOp0Nan, Op1, MinOrMax); } -/// Do target-specific dag combines on X86ISD::FAND nodes. 
-static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG, - const X86Subtarget &Subtarget) { - // FAND(0.0, x) -> 0.0 - if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) - if (C->getValueAPF().isPosZero()) - return N->getOperand(0); - - // FAND(x, 0.0) -> 0.0 - if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1))) - if (C->getValueAPF().isPosZero()) - return N->getOperand(1); - - return lowerX86FPLogicOp(N, DAG, Subtarget); -} - -/// Do target-specific dag combines on X86ISD::FANDN nodes -static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG, - const X86Subtarget &Subtarget) { - // FANDN(0.0, x) -> x - if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) - if (C->getValueAPF().isPosZero()) - return N->getOperand(1); - - // FANDN(x, 0.0) -> 0.0 - if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1))) - if (C->getValueAPF().isPosZero()) - return N->getOperand(1); - - return lowerX86FPLogicOp(N, DAG, Subtarget); -} - static SDValue combineBT(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI) { // BT ignores high bits in the bit index operand. @@ -29971,17 +32822,6 @@ static SDValue combineBT(SDNode *N, SelectionDAG &DAG, return SDValue(); } -static SDValue combineVZextMovl(SDNode *N, SelectionDAG &DAG) { - SDValue Op = peekThroughBitcasts(N->getOperand(0)); - EVT VT = N->getValueType(0), OpVT = Op.getValueType(); - if (Op.getOpcode() == X86ISD::VZEXT_LOAD && - VT.getVectorElementType().getSizeInBits() == - OpVT.getVectorElementType().getSizeInBits()) { - return DAG.getBitcast(VT, Op); - } - return SDValue(); -} - static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { EVT VT = N->getValueType(0); @@ -30018,19 +32858,32 @@ static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG, } /// sext(add_nsw(x, C)) --> add(sext(x), C_sext) -/// Promoting a sign extension ahead of an 'add nsw' exposes opportunities -/// to combine math ops, use an LEA, or use a complex addressing mode. This can -/// eliminate extend, add, and shift instructions. -static SDValue promoteSextBeforeAddNSW(SDNode *Sext, SelectionDAG &DAG, - const X86Subtarget &Subtarget) { +/// zext(add_nuw(x, C)) --> add(zext(x), C_zext) +/// Promoting a sign/zero extension ahead of a no overflow 'add' exposes +/// opportunities to combine math ops, use an LEA, or use a complex addressing +/// mode. This can eliminate extend, add, and shift instructions. +static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + if (Ext->getOpcode() != ISD::SIGN_EXTEND && + Ext->getOpcode() != ISD::ZERO_EXTEND) + return SDValue(); + // TODO: This should be valid for other integer types. - EVT VT = Sext->getValueType(0); + EVT VT = Ext->getValueType(0); if (VT != MVT::i64) return SDValue(); - // We need an 'add nsw' feeding into the 'sext'. 
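// Illustrative sketch, not part of this patch: why promoteExtBeforeAdd
// insists on 'nsw' (or 'nuw' for zext). When the narrow add cannot wrap,
// extending first and adding in 64 bits yields the same value as adding in
// 32 bits and sign-extending afterwards; a wrapping add (e.g. x = INT32_MAX,
// C = 1) would make the two orders disagree.
#include <cassert>
#include <cstdint>

static void checkSextAddNsw(int32_t x, int32_t c) {
  // sext(add nsw (x, c)): do the 32-bit add, then widen.
  int32_t narrowAdd = static_cast<int32_t>(static_cast<int64_t>(x) + c);
  int64_t narrowThenExt = narrowAdd;
  // add (sext x), c: widen first, then add (what the combine produces).
  int64_t extThenWide = static_cast<int64_t>(x) + c;
  assert(narrowThenExt == extThenWide); // holds under the no-wrap guarantee
}

static void runSextAddChecks() {
  checkSextAddNsw(100, 28);
  checkSextAddNsw(-5, 3);
}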
- SDValue Add = Sext->getOperand(0); - if (Add.getOpcode() != ISD::ADD || !Add->getFlags()->hasNoSignedWrap()) + SDValue Add = Ext->getOperand(0); + if (Add.getOpcode() != ISD::ADD) + return SDValue(); + + bool Sext = Ext->getOpcode() == ISD::SIGN_EXTEND; + bool NSW = Add->getFlags()->hasNoSignedWrap(); + bool NUW = Add->getFlags()->hasNoUnsignedWrap(); + + // We need an 'add nsw' feeding into the 'sext' or 'add nuw' feeding + // into the 'zext' + if ((Sext && !NSW) || (!Sext && !NUW)) return SDValue(); // Having a constant operand to the 'add' ensures that we are not increasing @@ -30046,7 +32899,7 @@ static SDValue promoteSextBeforeAddNSW(SDNode *Sext, SelectionDAG &DAG, // of single 'add' instructions, but the cost model for selecting an LEA // currently has a high threshold. bool HasLEAPotential = false; - for (auto *User : Sext->uses()) { + for (auto *User : Ext->uses()) { if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) { HasLEAPotential = true; break; @@ -30055,17 +32908,18 @@ static SDValue promoteSextBeforeAddNSW(SDNode *Sext, SelectionDAG &DAG, if (!HasLEAPotential) return SDValue(); - // Everything looks good, so pull the 'sext' ahead of the 'add'. - int64_t AddConstant = AddOp1->getSExtValue(); + // Everything looks good, so pull the '{s|z}ext' ahead of the 'add'. + int64_t AddConstant = Sext ? AddOp1->getSExtValue() : AddOp1->getZExtValue(); SDValue AddOp0 = Add.getOperand(0); - SDValue NewSext = DAG.getNode(ISD::SIGN_EXTEND, SDLoc(Sext), VT, AddOp0); + SDValue NewExt = DAG.getNode(Ext->getOpcode(), SDLoc(Ext), VT, AddOp0); SDValue NewConstant = DAG.getConstant(AddConstant, SDLoc(Add), VT); // The wider add is guaranteed to not wrap because both operands are // sign-extended. SDNodeFlags Flags; - Flags.setNoSignedWrap(true); - return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewSext, NewConstant, &Flags); + Flags.setNoSignedWrap(NSW); + Flags.setNoUnsignedWrap(NUW); + return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewExt, NewConstant, &Flags); } /// (i8,i32 {s/z}ext ({s/u}divrem (i8 x, i8 y)) -> @@ -30157,18 +33011,17 @@ static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG, // ISD::*_EXTEND_VECTOR_INREG which ensures lowering to X86ISD::V*EXT. // Also use this if we don't have SSE41 to allow the legalizer do its job. if (!Subtarget.hasSSE41() || VT.is128BitVector() || - (VT.is256BitVector() && Subtarget.hasInt256())) { + (VT.is256BitVector() && Subtarget.hasInt256()) || + (VT.is512BitVector() && Subtarget.hasAVX512())) { SDValue ExOp = ExtendVecSize(DL, N0, VT.getSizeInBits()); return Opcode == ISD::SIGN_EXTEND ? DAG.getSignExtendVectorInReg(ExOp, DL, VT) : DAG.getZeroExtendVectorInReg(ExOp, DL, VT); } - // On pre-AVX2 targets, split into 128-bit nodes of - // ISD::*_EXTEND_VECTOR_INREG. 
- if (!Subtarget.hasInt256() && !(VT.getSizeInBits() % 128)) { - unsigned NumVecs = VT.getSizeInBits() / 128; - unsigned NumSubElts = 128 / SVT.getSizeInBits(); + auto SplitAndExtendInReg = [&](unsigned SplitSize) { + unsigned NumVecs = VT.getSizeInBits() / SplitSize; + unsigned NumSubElts = SplitSize / SVT.getSizeInBits(); EVT SubVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumSubElts); EVT InSubVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubElts); @@ -30176,14 +33029,24 @@ static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG, for (unsigned i = 0, Offset = 0; i != NumVecs; ++i, Offset += NumSubElts) { SDValue SrcVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InSubVT, N0, DAG.getIntPtrConstant(Offset, DL)); - SrcVec = ExtendVecSize(DL, SrcVec, 128); + SrcVec = ExtendVecSize(DL, SrcVec, SplitSize); SrcVec = Opcode == ISD::SIGN_EXTEND ? DAG.getSignExtendVectorInReg(SrcVec, DL, SubVT) : DAG.getZeroExtendVectorInReg(SrcVec, DL, SubVT); Opnds.push_back(SrcVec); } return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Opnds); - } + }; + + // On pre-AVX2 targets, split into 128-bit nodes of + // ISD::*_EXTEND_VECTOR_INREG. + if (!Subtarget.hasInt256() && !(VT.getSizeInBits() % 128)) + return SplitAndExtendInReg(128); + + // On pre-AVX512 targets, split into 256-bit nodes of + // ISD::*_EXTEND_VECTOR_INREG. + if (!Subtarget.hasAVX512() && !(VT.getSizeInBits() % 256)) + return SplitAndExtendInReg(256); return SDValue(); } @@ -30216,7 +33079,7 @@ static SDValue combineSext(SDNode *N, SelectionDAG &DAG, if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget)) return R; - if (SDValue NewAdd = promoteSextBeforeAddNSW(N, DAG, Subtarget)) + if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget)) return NewAdd; return SDValue(); @@ -30239,26 +33102,58 @@ static SDValue combineFMA(SDNode *N, SelectionDAG &DAG, SDValue B = N->getOperand(1); SDValue C = N->getOperand(2); - bool NegA = (A.getOpcode() == ISD::FNEG); - bool NegB = (B.getOpcode() == ISD::FNEG); - bool NegC = (C.getOpcode() == ISD::FNEG); + auto invertIfNegative = [](SDValue &V) { + if (SDValue NegVal = isFNEG(V.getNode())) { + V = NegVal; + return true; + } + return false; + }; + + // Do not convert the passthru input of scalar intrinsics. + // FIXME: We could allow negations of the lower element only. + bool NegA = N->getOpcode() != X86ISD::FMADDS1_RND && invertIfNegative(A); + bool NegB = invertIfNegative(B); + bool NegC = N->getOpcode() != X86ISD::FMADDS3_RND && invertIfNegative(C); // Negative multiplication when NegA xor NegB bool NegMul = (NegA != NegB); - if (NegA) - A = A.getOperand(0); - if (NegB) - B = B.getOperand(0); - if (NegC) - C = C.getOperand(0); - unsigned Opcode; + unsigned NewOpcode; if (!NegMul) - Opcode = (!NegC) ? X86ISD::FMADD : X86ISD::FMSUB; + NewOpcode = (!NegC) ? X86ISD::FMADD : X86ISD::FMSUB; else - Opcode = (!NegC) ? X86ISD::FNMADD : X86ISD::FNMSUB; + NewOpcode = (!NegC) ? 
X86ISD::FNMADD : X86ISD::FNMSUB; + + + if (N->getOpcode() == X86ISD::FMADD_RND) { + switch (NewOpcode) { + case X86ISD::FMADD: NewOpcode = X86ISD::FMADD_RND; break; + case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUB_RND; break; + case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADD_RND; break; + case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUB_RND; break; + } + } else if (N->getOpcode() == X86ISD::FMADDS1_RND) { + switch (NewOpcode) { + case X86ISD::FMADD: NewOpcode = X86ISD::FMADDS1_RND; break; + case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUBS1_RND; break; + case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS1_RND; break; + case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS1_RND; break; + } + } else if (N->getOpcode() == X86ISD::FMADDS3_RND) { + switch (NewOpcode) { + case X86ISD::FMADD: NewOpcode = X86ISD::FMADDS3_RND; break; + case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUBS3_RND; break; + case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS3_RND; break; + case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS3_RND; break; + } + } else { + assert((N->getOpcode() == X86ISD::FMADD || N->getOpcode() == ISD::FMA) && + "Unexpected opcode!"); + return DAG.getNode(NewOpcode, dl, VT, A, B, C); + } - return DAG.getNode(Opcode, dl, VT, A, B, C); + return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3)); } static SDValue combineZext(SDNode *N, SelectionDAG &DAG, @@ -30308,6 +33203,12 @@ static SDValue combineZext(SDNode *N, SelectionDAG &DAG, if (SDValue DivRem8 = getDivRem8(N, DAG)) return DivRem8; + if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget)) + return NewAdd; + + if (SDValue R = combineOrCmpEqZeroToCtlzSrl(N, DAG, DCI, Subtarget)) + return R; + return SDValue(); } @@ -30443,10 +33344,8 @@ static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG, return MaterializeSETB(DL, EFLAGS, DAG, N->getSimpleValueType(0)); // Try to simplify the EFLAGS and condition code operands. - if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG)) { - SDValue Cond = DAG.getConstant(CC, DL, MVT::i8); - return DAG.getNode(X86ISD::SETCC, DL, N->getVTList(), Cond, Flags); - } + if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG)) + return getSETCC(CC, Flags, DL, DAG); return SDValue(); } @@ -30539,6 +33438,12 @@ static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG, return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P); } + // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't + // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform + // the optimization here. + if (DAG.SignBitIsZero(Op0)) + return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, Op0); + return SDValue(); } @@ -30555,9 +33460,12 @@ static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG, EVT InVT = Op0.getValueType(); EVT InSVT = InVT.getScalarType(); + // SINT_TO_FP(vXi1) -> SINT_TO_FP(SEXT(vXi1 to vXi32)) // SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32)) // SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32)) - if (InVT.isVector() && (InSVT == MVT::i8 || InSVT == MVT::i16)) { + if (InVT.isVector() && + (InSVT == MVT::i8 || InSVT == MVT::i16 || + (InSVT == MVT::i1 && !DAG.getTargetLoweringInfo().isTypeLegal(InVT)))) { SDLoc dl(N); EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, InVT.getVectorNumElements()); @@ -30565,6 +33473,23 @@ static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG, return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P); } + // Without AVX512DQ we only support i64 to float scalar conversion. 
For both + // vectors and scalars, see if we know that the upper bits are all the sign + // bit, in which case we can truncate the input to i32 and convert from that. + if (InVT.getScalarSizeInBits() > 32 && !Subtarget.hasDQI()) { + unsigned BitWidth = InVT.getScalarSizeInBits(); + unsigned NumSignBits = DAG.ComputeNumSignBits(Op0); + if (NumSignBits >= (BitWidth - 31)) { + EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), 32); + if (InVT.isVector()) + TruncVT = EVT::getVectorVT(*DAG.getContext(), TruncVT, + InVT.getVectorNumElements()); + SDLoc dl(N); + SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Op0); + return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Trunc); + } + } + // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have // a 32-bit target where SSE doesn't support i64->FP operations. if (!Subtarget.useSoftFloat() && Op0.getOpcode() == ISD::LOAD) { @@ -30654,13 +33579,15 @@ static SDValue OptimizeConditionalInDecrement(SDNode *N, SelectionDAG &DAG) { DAG.getConstant(0, DL, OtherVal.getValueType()), NewCmp); } -static SDValue detectSADPattern(SDNode *N, SelectionDAG &DAG, - const X86Subtarget &Subtarget) { +static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { SDLoc DL(N); EVT VT = N->getValueType(0); SDValue Op0 = N->getOperand(0); SDValue Op1 = N->getOperand(1); + // TODO: There's nothing special about i32, any integer type above i16 should + // work just as well. if (!VT.isVector() || !VT.isSimple() || !(VT.getVectorElementType() == MVT::i32)) return SDValue(); @@ -30672,24 +33599,13 @@ static SDValue detectSADPattern(SDNode *N, SelectionDAG &DAG, RegSize = 256; // We only handle v16i32 for SSE2 / v32i32 for AVX2 / v64i32 for AVX512. + // TODO: We should be able to handle larger vectors by splitting them before + // feeding them into several SADs, and then reducing over those. if (VT.getSizeInBits() / 4 > RegSize) return SDValue(); - // Detect the following pattern: - // - // 1: %2 = zext <N x i8> %0 to <N x i32> - // 2: %3 = zext <N x i8> %1 to <N x i32> - // 3: %4 = sub nsw <N x i32> %2, %3 - // 4: %5 = icmp sgt <N x i32> %4, [0 x N] or [-1 x N] - // 5: %6 = sub nsw <N x i32> zeroinitializer, %4 - // 6: %7 = select <N x i1> %5, <N x i32> %4, <N x i32> %6 - // 7: %8 = add nsw <N x i32> %7, %vec.phi - // - // The last instruction must be a reduction add. The instructions 3-6 forms an - // ABSDIFF pattern. - - // The two operands of reduction add are from PHI and a select-op as in line 7 - // above. + // We know N is a reduction add, which means one of its operands is a phi. + // To match SAD, we need the other operand to be a vector select. SDValue SelectOp, Phi; if (Op0.getOpcode() == ISD::VSELECT) { SelectOp = Op0; @@ -30700,77 +33616,22 @@ static SDValue detectSADPattern(SDNode *N, SelectionDAG &DAG, } else return SDValue(); - // Check the condition of the select instruction is greater-than. - SDValue SetCC = SelectOp->getOperand(0); - if (SetCC.getOpcode() != ISD::SETCC) - return SDValue(); - ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get(); - if (CC != ISD::SETGT) - return SDValue(); - - Op0 = SelectOp->getOperand(1); - Op1 = SelectOp->getOperand(2); - - // The second operand of SelectOp Op1 is the negation of the first operand - // Op0, which is implemented as 0 - Op0. 
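// Illustrative sketch, not part of this patch: the reference computation that
// combineLoopSADPattern ultimately maps onto PSADBW. A vectorized loop of the
// form below carries the reduction add fed by a vselect over an absolute
// difference of zero-extended bytes; PSADBW sums the absolute differences of
// each 8-byte group into one wide lane.
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstdlib>

static uint32_t sumAbsDiff(const uint8_t *a, const uint8_t *b, size_t n) {
  uint32_t sum = 0;
  for (size_t i = 0; i != n; ++i)
    sum += static_cast<uint32_t>(std::abs(int(a[i]) - int(b[i])));
  return sum; // the whole loop body becomes a few psadbw + adds
}

static void checkSumAbsDiff() {
  uint8_t a[8] = {0, 10, 20, 255, 4, 5, 6, 7};
  uint8_t b[8] = {3, 10, 25, 0, 4, 9, 6, 7};
  assert(sumAbsDiff(a, b, 8) == 267);
}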
- if (!(Op1.getOpcode() == ISD::SUB && - ISD::isBuildVectorAllZeros(Op1.getOperand(0).getNode()) && - Op1.getOperand(1) == Op0)) - return SDValue(); - - // The first operand of SetCC is the first operand of SelectOp, which is the - // difference between two input vectors. - if (SetCC.getOperand(0) != Op0) - return SDValue(); - - // The second operand of > comparison can be either -1 or 0. - if (!(ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()) || - ISD::isBuildVectorAllOnes(SetCC.getOperand(1).getNode()))) - return SDValue(); - - // The first operand of SelectOp is the difference between two input vectors. - if (Op0.getOpcode() != ISD::SUB) - return SDValue(); - - Op1 = Op0.getOperand(1); - Op0 = Op0.getOperand(0); - - // Check if the operands of the diff are zero-extended from vectors of i8. - if (Op0.getOpcode() != ISD::ZERO_EXTEND || - Op0.getOperand(0).getValueType().getVectorElementType() != MVT::i8 || - Op1.getOpcode() != ISD::ZERO_EXTEND || - Op1.getOperand(0).getValueType().getVectorElementType() != MVT::i8) + // Check whether we have an abs-diff pattern feeding into the select. + if(!detectZextAbsDiff(SelectOp, Op0, Op1)) return SDValue(); // SAD pattern detected. Now build a SAD instruction and an addition for - // reduction. Note that the number of elments of the result of SAD is less + // reduction. Note that the number of elements of the result of SAD is less // than the number of elements of its input. Therefore, we could only update // part of elements in the reduction vector. - - // Legalize the type of the inputs of PSADBW. - EVT InVT = Op0.getOperand(0).getValueType(); - if (InVT.getSizeInBits() <= 128) - RegSize = 128; - else if (InVT.getSizeInBits() <= 256) - RegSize = 256; - - unsigned NumConcat = RegSize / InVT.getSizeInBits(); - SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, InVT)); - Ops[0] = Op0.getOperand(0); - MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8); - Op0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops); - Ops[0] = Op1.getOperand(0); - Op1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops); + SDValue Sad = createPSADBW(DAG, Op0, Op1, DL); // The output of PSADBW is a vector of i64. - MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64); - SDValue Sad = DAG.getNode(X86ISD::PSADBW, DL, SadVT, Op0, Op1); - // We need to turn the vector of i64 into a vector of i32. // If the reduction vector is at least as wide as the psadbw result, just // bitcast. If it's narrower, truncate - the high i32 of each i64 is zero // anyway. - MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32); + MVT ResVT = MVT::getVectorVT(MVT::i32, Sad.getValueSizeInBits() / 32); if (VT.getSizeInBits() >= ResVT.getSizeInBits()) Sad = DAG.getNode(ISD::BITCAST, DL, ResVT, Sad); else @@ -30793,7 +33654,7 @@ static SDValue combineAdd(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { const SDNodeFlags *Flags = &cast<BinaryWithFlagsSDNode>(N)->Flags; if (Flags->hasVectorReduction()) { - if (SDValue Sad = detectSADPattern(N, DAG, Subtarget)) + if (SDValue Sad = combineLoopSADPattern(N, DAG, Subtarget)) return Sad; } EVT VT = N->getValueType(0); @@ -30832,20 +33693,21 @@ static SDValue combineSub(SDNode *N, SelectionDAG &DAG, } } - // Try to synthesize horizontal adds from adds of shuffles. + // Try to synthesize horizontal subs from subs of shuffles. 
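// Illustrative sketch, not part of this patch: the lane arithmetic behind the
// "subs of shuffles" being matched here. For 4 x i32 inputs A and B, PHSUBD
// yields { A0-A1, A2-A3, B0-B1, B2-B3 }, which is a plain subtract of the
// even-lane shuffle { A0, A2, B0, B2 } and the odd-lane shuffle
// { A1, A3, B1, B3 }.
#include <array>
#include <cassert>
#include <cstdint>

using V4 = std::array<int32_t, 4>;

static V4 hsub(const V4 &a, const V4 &b) {
  return {a[0] - a[1], a[2] - a[3], b[0] - b[1], b[2] - b[3]};
}

static void checkHsubIsSubOfShuffles(const V4 &a, const V4 &b) {
  V4 even = {a[0], a[2], b[0], b[2]}; // shuffle of even lanes
  V4 odd = {a[1], a[3], b[1], b[3]};  // shuffle of odd lanes
  V4 h = hsub(a, b);
  for (int i = 0; i != 4; ++i)
    assert(h[i] == even[i] - odd[i]);
}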
EVT VT = N->getValueType(0); if (((Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) || (Subtarget.hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) && - isHorizontalBinOp(Op0, Op1, true)) + isHorizontalBinOp(Op0, Op1, false)) return DAG.getNode(X86ISD::HSUB, SDLoc(N), VT, Op0, Op1); return OptimizeConditionalInDecrement(N, DAG); } -static SDValue combineVZext(SDNode *N, SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI, - const X86Subtarget &Subtarget) { +static SDValue combineVSZext(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget &Subtarget) { SDLoc DL(N); + unsigned Opcode = N->getOpcode(); MVT VT = N->getSimpleValueType(0); MVT SVT = VT.getVectorElementType(); SDValue Op = N->getOperand(0); @@ -30854,25 +33716,28 @@ static SDValue combineVZext(SDNode *N, SelectionDAG &DAG, unsigned InputBits = OpEltVT.getSizeInBits() * VT.getVectorNumElements(); // Perform any constant folding. + // FIXME: Reduce constant pool usage and don't fold when OptSize is enabled. if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) { - SmallVector<SDValue, 4> Vals; - for (int i = 0, e = VT.getVectorNumElements(); i != e; ++i) { + unsigned NumDstElts = VT.getVectorNumElements(); + SmallBitVector Undefs(NumDstElts, false); + SmallVector<APInt, 4> Vals(NumDstElts, APInt(SVT.getSizeInBits(), 0)); + for (unsigned i = 0; i != NumDstElts; ++i) { SDValue OpElt = Op.getOperand(i); if (OpElt.getOpcode() == ISD::UNDEF) { - Vals.push_back(DAG.getUNDEF(SVT)); + Undefs[i] = true; continue; } APInt Cst = cast<ConstantSDNode>(OpElt.getNode())->getAPIntValue(); - assert(Cst.getBitWidth() == OpEltVT.getSizeInBits()); - Cst = Cst.zextOrTrunc(SVT.getSizeInBits()); - Vals.push_back(DAG.getConstant(Cst, DL, SVT)); + Vals[i] = Opcode == X86ISD::VZEXT ? Cst.zextOrTrunc(SVT.getSizeInBits()) + : Cst.sextOrTrunc(SVT.getSizeInBits()); } - return DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Vals); + return getConstVector(Vals, Undefs, VT, DAG, DL); } // (vzext (bitcast (vzext (x)) -> (vzext x) + // TODO: (vsext (bitcast (vsext (x)) -> (vsext x) SDValue V = peekThroughBitcasts(Op); - if (V != Op && V.getOpcode() == X86ISD::VZEXT) { + if (Opcode == X86ISD::VZEXT && V != Op && V.getOpcode() == X86ISD::VZEXT) { MVT InnerVT = V.getSimpleValueType(); MVT InnerEltVT = InnerVT.getVectorElementType(); @@ -30897,7 +33762,9 @@ static SDValue combineVZext(SDNode *N, SelectionDAG &DAG, // Check if we can bypass extracting and re-inserting an element of an input // vector. 
Essentially: // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x) - if (V.getOpcode() == ISD::SCALAR_TO_VECTOR && + // TODO: Add X86ISD::VSEXT support + if (Opcode == X86ISD::VZEXT && + V.getOpcode() == ISD::SCALAR_TO_VECTOR && V.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT && V.getOperand(0).getSimpleValueType().getSizeInBits() == InputBits) { SDValue ExtractedV = V.getOperand(0); @@ -30976,7 +33843,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, SelectionDAG &DAG = DCI.DAG; switch (N->getOpcode()) { default: break; - case ISD::EXTRACT_VECTOR_ELT: return combineExtractVectorElt(N, DAG, DCI); + case ISD::EXTRACT_VECTOR_ELT: + return combineExtractVectorElt(N, DAG, DCI, Subtarget); case ISD::VSELECT: case ISD::SELECT: case X86ISD::SHRUNKBLEND: return combineSelect(N, DAG, DCI, Subtarget); @@ -31002,16 +33870,15 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case ISD::FSUB: return combineFaddFsub(N, DAG, Subtarget); case ISD::FNEG: return combineFneg(N, DAG, Subtarget); case ISD::TRUNCATE: return combineTruncate(N, DAG, Subtarget); + case X86ISD::FAND: return combineFAnd(N, DAG, Subtarget); + case X86ISD::FANDN: return combineFAndn(N, DAG, Subtarget); case X86ISD::FXOR: case X86ISD::FOR: return combineFOr(N, DAG, Subtarget); case X86ISD::FMIN: case X86ISD::FMAX: return combineFMinFMax(N, DAG); case ISD::FMINNUM: case ISD::FMAXNUM: return combineFMinNumFMaxNum(N, DAG, Subtarget); - case X86ISD::FAND: return combineFAnd(N, DAG, Subtarget); - case X86ISD::FANDN: return combineFAndn(N, DAG, Subtarget); case X86ISD::BT: return combineBT(N, DAG, DCI); - case X86ISD::VZEXT_MOVL: return combineVZextMovl(N, DAG); case ISD::ANY_EXTEND: case ISD::ZERO_EXTEND: return combineZext(N, DAG, DCI, Subtarget); case ISD::SIGN_EXTEND: return combineSext(N, DAG, DCI, Subtarget); @@ -31019,7 +33886,10 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case ISD::SETCC: return combineSetCC(N, DAG, Subtarget); case X86ISD::SETCC: return combineX86SetCC(N, DAG, DCI, Subtarget); case X86ISD::BRCOND: return combineBrCond(N, DAG, DCI, Subtarget); - case X86ISD::VZEXT: return combineVZext(N, DAG, DCI, Subtarget); + case X86ISD::VSHLI: + case X86ISD::VSRLI: return combineVectorShift(N, DAG, DCI, Subtarget); + case X86ISD::VSEXT: + case X86ISD::VZEXT: return combineVSZext(N, DAG, DCI, Subtarget); case X86ISD::SHUFP: // Handle all target specific shuffles case X86ISD::INSERTPS: case X86ISD::PALIGNR: @@ -31043,11 +33913,17 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case X86ISD::VPERMI: case X86ISD::VPERMV: case X86ISD::VPERMV3: + case X86ISD::VPERMIV3: case X86ISD::VPERMIL2: case X86ISD::VPERMILPI: case X86ISD::VPERMILPV: case X86ISD::VPERM2X128: + case X86ISD::VZEXT_MOVL: case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI,Subtarget); + case X86ISD::FMADD: + case X86ISD::FMADD_RND: + case X86ISD::FMADDS1_RND: + case X86ISD::FMADDS3_RND: case ISD::FMA: return combineFMA(N, DAG, Subtarget); case ISD::MGATHER: case ISD::MSCATTER: return combineGatherScatter(N, DAG); @@ -31133,7 +34009,7 @@ bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const { case ISD::OR: case ISD::XOR: Commute = true; - // fallthrough + LLVM_FALLTHROUGH; case ISD::SUB: { SDValue N0 = Op.getOperand(0); SDValue N1 = Op.getOperand(1); @@ -31280,9 +34156,11 @@ X86TargetLowering::getConstraintType(StringRef Constraint) const { case 'u': case 'y': case 'x': + case 'v': case 'Y': case 'l': return C_RegisterClass; + case 'k': // AVX512 masking registers. 
case 'a': case 'b': case 'c': @@ -31306,6 +34184,19 @@ X86TargetLowering::getConstraintType(StringRef Constraint) const { break; } } + else if (Constraint.size() == 2) { + switch (Constraint[0]) { + default: + break; + case 'Y': + switch (Constraint[1]) { + default: + break; + case 'k': + return C_Register; + } + } + } return TargetLowering::getConstraintType(Constraint); } @@ -31349,12 +34240,28 @@ TargetLowering::ConstraintWeight if (type->isX86_MMXTy() && Subtarget.hasMMX()) weight = CW_SpecificReg; break; - case 'x': case 'Y': + // Other "Y<x>" (e.g. "Yk") constraints should be implemented below. + if (constraint[1] == 'k') { + // Support for 'Yk' (similarly to the 'k' variant below). + weight = CW_SpecificReg; + break; + } + // Else fall through (handle "Y" constraint). + LLVM_FALLTHROUGH; + case 'v': + if ((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512()) + weight = CW_Register; + LLVM_FALLTHROUGH; + case 'x': if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) || ((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasFp256())) weight = CW_Register; break; + case 'k': + // Enable conditional vector operations using %k<#> registers. + weight = CW_SpecificReg; + break; case 'I': if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) { if (C->getZExtValue() <= 31) @@ -31601,60 +34508,21 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, /// Check if \p RC is a general purpose register class. /// I.e., GR* or one of their variant. static bool isGRClass(const TargetRegisterClass &RC) { - switch (RC.getID()) { - case X86::GR8RegClassID: - case X86::GR8_ABCD_LRegClassID: - case X86::GR8_ABCD_HRegClassID: - case X86::GR8_NOREXRegClassID: - case X86::GR16RegClassID: - case X86::GR16_ABCDRegClassID: - case X86::GR16_NOREXRegClassID: - case X86::GR32RegClassID: - case X86::GR32_ABCDRegClassID: - case X86::GR32_TCRegClassID: - case X86::GR32_NOREXRegClassID: - case X86::GR32_NOAXRegClassID: - case X86::GR32_NOSPRegClassID: - case X86::GR32_NOREX_NOSPRegClassID: - case X86::GR32_ADRegClassID: - case X86::GR64RegClassID: - case X86::GR64_ABCDRegClassID: - case X86::GR64_TCRegClassID: - case X86::GR64_TCW64RegClassID: - case X86::GR64_NOREXRegClassID: - case X86::GR64_NOSPRegClassID: - case X86::GR64_NOREX_NOSPRegClassID: - case X86::LOW32_ADDR_ACCESSRegClassID: - case X86::LOW32_ADDR_ACCESS_RBPRegClassID: - return true; - default: - return false; - } + return RC.hasSuperClassEq(&X86::GR8RegClass) || + RC.hasSuperClassEq(&X86::GR16RegClass) || + RC.hasSuperClassEq(&X86::GR32RegClass) || + RC.hasSuperClassEq(&X86::GR64RegClass) || + RC.hasSuperClassEq(&X86::LOW32_ADDR_ACCESS_RBPRegClass); } /// Check if \p RC is a vector register class. /// I.e., FR* / VR* or one of their variant. 
static bool isFRClass(const TargetRegisterClass &RC) { - switch (RC.getID()) { - case X86::FR32RegClassID: - case X86::FR32XRegClassID: - case X86::FR64RegClassID: - case X86::FR64XRegClassID: - case X86::FR128RegClassID: - case X86::VR64RegClassID: - case X86::VR128RegClassID: - case X86::VR128LRegClassID: - case X86::VR128HRegClassID: - case X86::VR128XRegClassID: - case X86::VR256RegClassID: - case X86::VR256LRegClassID: - case X86::VR256HRegClassID: - case X86::VR256XRegClassID: - case X86::VR512RegClassID: - return true; - default: - return false; - } + return RC.hasSuperClassEq(&X86::FR32XRegClass) || + RC.hasSuperClassEq(&X86::FR64XRegClass) || + RC.hasSuperClassEq(&X86::VR128XRegClass) || + RC.hasSuperClassEq(&X86::VR256XRegClass) || + RC.hasSuperClassEq(&X86::VR512RegClass); } std::pair<unsigned, const TargetRegisterClass *> @@ -31670,6 +34538,24 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, // TODO: Slight differences here in allocation order and leaving // RIP in the class. Do they matter any more here than they do // in the normal allocation? + case 'k': + if (Subtarget.hasAVX512()) { + // Only supported in AVX512 or later. + switch (VT.SimpleTy) { + default: break; + case MVT::i32: + return std::make_pair(0U, &X86::VK32RegClass); + case MVT::i16: + return std::make_pair(0U, &X86::VK16RegClass); + case MVT::i8: + return std::make_pair(0U, &X86::VK8RegClass); + case MVT::i1: + return std::make_pair(0U, &X86::VK1RegClass); + case MVT::i64: + return std::make_pair(0U, &X86::VK64RegClass); + } + } + break; case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode. if (Subtarget.is64Bit()) { if (VT == MVT::i32 || VT == MVT::f32) @@ -31723,18 +34609,24 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, return std::make_pair(0U, &X86::VR64RegClass); case 'Y': // SSE_REGS if SSE2 allowed if (!Subtarget.hasSSE2()) break; - // FALL THROUGH. + LLVM_FALLTHROUGH; + case 'v': case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed if (!Subtarget.hasSSE1()) break; + bool VConstraint = (Constraint[0] == 'v'); switch (VT.SimpleTy) { default: break; // Scalar SSE types. case MVT::f32: case MVT::i32: + if (VConstraint && Subtarget.hasAVX512() && Subtarget.hasVLX()) + return std::make_pair(0U, &X86::FR32XRegClass); return std::make_pair(0U, &X86::FR32RegClass); case MVT::f64: case MVT::i64: + if (VConstraint && Subtarget.hasVLX()) + return std::make_pair(0U, &X86::FR64XRegClass); return std::make_pair(0U, &X86::FR64RegClass); // TODO: Handle f128 and i128 in FR128RegClass after it is tested well. // Vector types. @@ -31744,6 +34636,8 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, case MVT::v2i64: case MVT::v4f32: case MVT::v2f64: + if (VConstraint && Subtarget.hasVLX()) + return std::make_pair(0U, &X86::VR128XRegClass); return std::make_pair(0U, &X86::VR128RegClass); // AVX types. 
case MVT::v32i8: @@ -31752,6 +34646,8 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, case MVT::v4i64: case MVT::v8f32: case MVT::v4f64: + if (VConstraint && Subtarget.hasVLX()) + return std::make_pair(0U, &X86::VR256XRegClass); return std::make_pair(0U, &X86::VR256RegClass); case MVT::v8f64: case MVT::v16f32: @@ -31761,6 +34657,29 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, } break; } + } else if (Constraint.size() == 2 && Constraint[0] == 'Y') { + switch (Constraint[1]) { + default: + break; + case 'k': + // This register class doesn't allocate k0 for masked vector operation. + if (Subtarget.hasAVX512()) { // Only supported in AVX512. + switch (VT.SimpleTy) { + default: break; + case MVT::i32: + return std::make_pair(0U, &X86::VK32WMRegClass); + case MVT::i16: + return std::make_pair(0U, &X86::VK16WMRegClass); + case MVT::i8: + return std::make_pair(0U, &X86::VK8WMRegClass); + case MVT::i1: + return std::make_pair(0U, &X86::VK1WMRegClass); + case MVT::i64: + return std::make_pair(0U, &X86::VK64WMRegClass); + } + } + break; + } } // Use the default implementation in TargetLowering to convert the register @@ -31954,3 +34873,7 @@ void X86TargetLowering::insertCopiesSplitCSR( .addReg(NewVR); } } + +bool X86TargetLowering::supportSwiftError() const { + return Subtarget.is64Bit(); +} diff --git a/contrib/llvm/lib/Target/X86/X86ISelLowering.h b/contrib/llvm/lib/Target/X86/X86ISelLowering.h index d826f1e..37f9353 100644 --- a/contrib/llvm/lib/Target/X86/X86ISelLowering.h +++ b/contrib/llvm/lib/Target/X86/X86ISelLowering.h @@ -95,7 +95,7 @@ namespace llvm { SETCC, /// X86 Select - SELECT, + SELECT, SELECTS, // Same as SETCC except it's materialized with a sbb and the value is all // one's or all zero's. @@ -106,6 +106,10 @@ namespace llvm { /// 0s or 1s. Generally DTRT for C/C++ with NaNs. FSETCC, + /// X86 FP SETCC, similar to above, but with output as an i1 mask and + /// with optional rounding mode. + FSETCCM, FSETCCM_RND, + /// X86 conditional moves. Operand 0 and operand 1 are the two values /// to select from. Operand 2 is the condition code, and operand 3 is the /// flag operand produced by a CMP or TEST instruction. It also writes a @@ -135,8 +139,9 @@ namespace llvm { /// at function entry, used for PIC code. GlobalBaseReg, - /// A wrapper node for TargetConstantPool, - /// TargetExternalSymbol, and TargetGlobalAddress. + /// A wrapper node for TargetConstantPool, TargetJumpTable, + /// TargetExternalSymbol, TargetGlobalAddress, TargetGlobalTLSAddress, + /// MCSymbol and TargetBlockAddress. Wrapper, /// Special wrapper used under X86-64 PIC mode for RIP @@ -205,12 +210,12 @@ namespace llvm { FDIV_RND, FMAX_RND, FMIN_RND, - FSQRT_RND, + FSQRT_RND, FSQRTS_RND, // FP vector get exponent. - FGETEXP_RND, + FGETEXP_RND, FGETEXPS_RND, // Extract Normalized Mantissas. - VGETMANT, + VGETMANT, VGETMANTS, // FP Scale. SCALEF, SCALEFS, @@ -251,7 +256,7 @@ namespace llvm { /// in order to obtain suitable precision. FRSQRT, FRCP, FRSQRTS, FRCPS, - + // Thread Local Storage. TLSADDR, @@ -293,13 +298,10 @@ namespace llvm { VTRUNCUS, VTRUNCS, // Vector FP extend. - VFPEXT, + VFPEXT, VFPEXT_RND, VFPEXTS_RND, // Vector FP round. - VFPROUND, - - // Vector signed/unsigned integer to double. - CVTDQ2PD, CVTUDQ2PD, + VFPROUND, VFPROUND_RND, VFPROUNDS_RND, // Convert a vector to mask, set bits base on MSB. CVT2MASK, @@ -426,9 +428,9 @@ namespace llvm { // Range Restriction Calculation For Packed Pairs of Float32/64 values. 
VRANGE, // Reduce - Perform Reduction Transformation on scalar\packed FP. - VREDUCE, + VREDUCE, VREDUCES, // RndScale - Round FP Values To Include A Given Number Of Fraction Bits. - VRNDSCALE, + VRNDSCALE, VRNDSCALES, // Tests Types Of a FP Values for packed types. VFPCLASS, // Tests Types Of a FP Values for scalar types. @@ -486,19 +488,33 @@ namespace llvm { FMADDSUB_RND, FMSUBADD_RND, + // Scalar intrinsic FMA with rounding mode. + // Two versions, passthru bits on op1 or op3. + FMADDS1_RND, FMADDS3_RND, + FNMADDS1_RND, FNMADDS3_RND, + FMSUBS1_RND, FMSUBS3_RND, + FNMSUBS1_RND, FNMSUBS3_RND, + // Compress and expand. COMPRESS, EXPAND, - // Convert Unsigned/Integer to Scalar Floating-Point Value - // with rounding mode. - SINT_TO_FP_RND, - UINT_TO_FP_RND, + // Convert Unsigned/Integer to Floating-Point Value with rounding mode. + SINT_TO_FP_RND, UINT_TO_FP_RND, + SCALAR_SINT_TO_FP_RND, SCALAR_UINT_TO_FP_RND, // Vector float/double to signed/unsigned integer. - FP_TO_SINT_RND, FP_TO_UINT_RND, + CVTP2SI, CVTP2UI, CVTP2SI_RND, CVTP2UI_RND, // Scalar float/double to signed/unsigned integer. - SCALAR_FP_TO_SINT_RND, SCALAR_FP_TO_UINT_RND, + CVTS2SI_RND, CVTS2UI_RND, + + // Vector float/double to signed/unsigned integer with truncation. + CVTTP2SI, CVTTP2UI, CVTTP2SI_RND, CVTTP2UI_RND, + // Scalar float/double to signed/unsigned integer with truncation. + CVTTS2SI_RND, CVTTS2UI_RND, + + // Vector signed/unsigned integer to float/double. + CVTSI2P, CVTUI2P, // Save xmm argument registers to the stack, according to %al. An operator // is needed so that this can be expanded with control flow. @@ -537,7 +553,10 @@ namespace llvm { XTEST, // ERI instructions. - RSQRT28, RCP28, EXP2, + RSQRT28, RSQRT28S, RCP28, RCP28S, EXP2, + + // Conversions between float and half-float. + CVTPS2PH, CVTPH2PS, // Compare and swap. LCMPXCHG_DAG = ISD::FIRST_TARGET_MEMORY_OPCODE, @@ -587,7 +606,12 @@ namespace llvm { /// This instruction grabs the address of the next argument /// from a va_list. (reads and modifies the va_list in memory) - VAARG_64 + VAARG_64, + + // Vector truncating store with unsigned/signed saturation + VTRUNCSTOREUS, VTRUNCSTORES, + // Vector truncating masked store with unsigned/signed saturation + VMTRUNCSTOREUS, VMTRUNCSTORES // WARNING: Do not add anything in the end unless you want the node to // have memop! In fact, starting from FIRST_TARGET_MEMORY_OPCODE all @@ -760,10 +784,28 @@ namespace llvm { bool isCheapToSpeculateCtlz() const override; + bool isCtlzFast() const override; + bool hasBitPreservingFPLogic(EVT VT) const override { return VT == MVT::f32 || VT == MVT::f64 || VT.isVector(); } + bool isMultiStoresCheaperThanBitsMerge(EVT LTy, EVT HTy) const override { + // If the pair to store is a mixture of float and int values, we will + // save two bitwise instructions and one float-to-int instruction and + // increase one store instruction. There is potentially a more + // significant benefit because it avoids the float->int domain switch + // for input value. So It is more likely a win. + if ((LTy.isFloatingPoint() && HTy.isInteger()) || + (LTy.isInteger() && HTy.isFloatingPoint())) + return true; + // If the pair only contains int values, we will save two bitwise + // instructions and increase one store instruction (costing one more + // store buffer). Since the benefit is more blurred so we leave + // such pair out until we get testcase to prove it is a win. + return false; + } + bool hasAndNotCompare(SDValue Y) const override; /// Return the value type to use for ISD::SETCC. 
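The isMultiStoresCheaperThanBitsMerge override above spells out its reasoning: when one half of an adjacent store pair is floating point and the other is integer, merging them into a single wide store would first force the FP value through a domain crossing, so two separate stores are kept. A rough sketch of the kind of source pattern being weighed (the struct and function are illustrative, not taken from the patch):

    // Two adjacent 32-bit fields written back-to-back. Store merging could
    // fuse them into one 64-bit store via bitcast/shift/or, but that pulls
    // 'f' out of the FP domain; the hook above prefers the two stores in
    // this mixed float/int case and stays neutral for int/int pairs.
    struct Packed { float f; int i; };

    void writePacked(Packed *p, float f, int i) {
      p->f = f;
      p->i = i;
    }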
@@ -995,10 +1037,16 @@ namespace llvm { bool isIntDivCheap(EVT VT, AttributeSet Attr) const override; - bool supportSwiftError() const override { - return true; - } + bool supportSwiftError() const override; + unsigned getMaxSupportedInterleaveFactor() const override { return 4; } + + /// \brief Lower interleaved load(s) into target specific + /// instructions/intrinsics. + bool lowerInterleavedLoad(LoadInst *LI, + ArrayRef<ShuffleVectorInst *> Shuffles, + ArrayRef<unsigned> Indices, + unsigned Factor) const override; protected: std::pair<const TargetRegisterClass *, uint8_t> findRepresentativeClass(const TargetRegisterInfo *TRI, @@ -1032,7 +1080,7 @@ namespace llvm { SDValue LowerMemArgument(SDValue Chain, CallingConv::ID CallConv, const SmallVectorImpl<ISD::InputArg> &ArgInfo, const SDLoc &dl, SelectionDAG &DAG, - const CCValAssign &VA, MachineFrameInfo *MFI, + const CCValAssign &VA, MachineFrameInfo &MFI, unsigned i) const; SDValue LowerMemOpCallTo(SDValue Chain, SDValue StackPtr, SDValue Arg, const SDLoc &dl, SelectionDAG &DAG, @@ -1073,8 +1121,9 @@ namespace llvm { SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const; SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; + + unsigned getGlobalWrapperKind(const GlobalValue *GV = nullptr) const; SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const; SDValue LowerGlobalAddress(const GlobalValue *GV, const SDLoc &dl, @@ -1082,14 +1131,15 @@ namespace llvm { SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const; SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const; SDValue LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const; SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const; SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG) const; SDValue LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG) const; SDValue lowerUINT_TO_FP_vec(SDValue Op, SelectionDAG &DAG) const; SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFP_TO_INT(SDValue Op, const X86Subtarget &Subtarget, + SelectionDAG &DAG) const; SDValue LowerToBT(SDValue And, ISD::CondCode CC, const SDLoc &dl, SelectionDAG &DAG) const; SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const; @@ -1101,6 +1151,7 @@ namespace llvm { SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG) const; SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerADDROFRETURNADDR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFRAME_TO_ARGS_OFFSET(SDValue Op, SelectionDAG &DAG) const; SDValue LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const; @@ -1219,14 +1270,17 @@ namespace llvm { /// Convert a comparison if required by the subtarget. SDValue ConvertCmpIfNecessary(SDValue Cmp, SelectionDAG &DAG) const; + /// Check if replacement of SQRT with RSQRT should be disabled. + bool isFsqrtCheap(SDValue Operand, SelectionDAG &DAG) const override; + /// Use rsqrt* to speed up sqrt calculations. 
- SDValue getRsqrtEstimate(SDValue Operand, DAGCombinerInfo &DCI, - unsigned &RefinementSteps, - bool &UseOneConstNR) const override; + SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, + int &RefinementSteps, bool &UseOneConstNR, + bool Reciprocal) const override; /// Use rcp* to speed up fdiv calculations. - SDValue getRecipEstimate(SDValue Operand, DAGCombinerInfo &DCI, - unsigned &RefinementSteps) const override; + SDValue getRecipEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, + int &RefinementSteps) const override; /// Reassociate floating point divisions into multiply by reciprocal. unsigned combineRepeatedFPDivisors() const override; @@ -1236,6 +1290,93 @@ namespace llvm { FastISel *createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo); } // end namespace X86 + + // Base class for all X86 non-masked store operations. + class X86StoreSDNode : public MemSDNode { + public: + X86StoreSDNode(unsigned Opcode, unsigned Order, const DebugLoc &dl, + SDVTList VTs, EVT MemVT, + MachineMemOperand *MMO) + :MemSDNode(Opcode, Order, dl, VTs, MemVT, MMO) {} + const SDValue &getValue() const { return getOperand(1); } + const SDValue &getBasePtr() const { return getOperand(2); } + + static bool classof(const SDNode *N) { + return N->getOpcode() == X86ISD::VTRUNCSTORES || + N->getOpcode() == X86ISD::VTRUNCSTOREUS; + } + }; + + // Base class for all X86 masked store operations. + // The class has the same order of operands as MaskedStoreSDNode for + // convenience. + class X86MaskedStoreSDNode : public MemSDNode { + public: + X86MaskedStoreSDNode(unsigned Opcode, unsigned Order, + const DebugLoc &dl, SDVTList VTs, EVT MemVT, + MachineMemOperand *MMO) + : MemSDNode(Opcode, Order, dl, VTs, MemVT, MMO) {} + + const SDValue &getBasePtr() const { return getOperand(1); } + const SDValue &getMask() const { return getOperand(2); } + const SDValue &getValue() const { return getOperand(3); } + + static bool classof(const SDNode *N) { + return N->getOpcode() == X86ISD::VMTRUNCSTORES || + N->getOpcode() == X86ISD::VMTRUNCSTOREUS; + } + }; + + // X86 Truncating Store with Signed saturation. + class TruncSStoreSDNode : public X86StoreSDNode { + public: + TruncSStoreSDNode(unsigned Order, const DebugLoc &dl, + SDVTList VTs, EVT MemVT, MachineMemOperand *MMO) + : X86StoreSDNode(X86ISD::VTRUNCSTORES, Order, dl, VTs, MemVT, MMO) {} + + static bool classof(const SDNode *N) { + return N->getOpcode() == X86ISD::VTRUNCSTORES; + } + }; + + // X86 Truncating Store with Unsigned saturation. + class TruncUSStoreSDNode : public X86StoreSDNode { + public: + TruncUSStoreSDNode(unsigned Order, const DebugLoc &dl, + SDVTList VTs, EVT MemVT, MachineMemOperand *MMO) + : X86StoreSDNode(X86ISD::VTRUNCSTOREUS, Order, dl, VTs, MemVT, MMO) {} + + static bool classof(const SDNode *N) { + return N->getOpcode() == X86ISD::VTRUNCSTOREUS; + } + }; + + // X86 Truncating Masked Store with Signed saturation. + class MaskedTruncSStoreSDNode : public X86MaskedStoreSDNode { + public: + MaskedTruncSStoreSDNode(unsigned Order, + const DebugLoc &dl, SDVTList VTs, EVT MemVT, + MachineMemOperand *MMO) + : X86MaskedStoreSDNode(X86ISD::VMTRUNCSTORES, Order, dl, VTs, MemVT, MMO) {} + + static bool classof(const SDNode *N) { + return N->getOpcode() == X86ISD::VMTRUNCSTORES; + } + }; + + // X86 Truncating Masked Store with Unsigned saturation. 
+ class MaskedTruncUSStoreSDNode : public X86MaskedStoreSDNode { + public: + MaskedTruncUSStoreSDNode(unsigned Order, + const DebugLoc &dl, SDVTList VTs, EVT MemVT, + MachineMemOperand *MMO) + : X86MaskedStoreSDNode(X86ISD::VMTRUNCSTOREUS, Order, dl, VTs, MemVT, MMO) {} + + static bool classof(const SDNode *N) { + return N->getOpcode() == X86ISD::VMTRUNCSTOREUS; + } + }; + } // end namespace llvm #endif // LLVM_LIB_TARGET_X86_X86ISELLOWERING_H diff --git a/contrib/llvm/lib/Target/X86/X86InstrAVX512.td b/contrib/llvm/lib/Target/X86/X86InstrAVX512.td index 803a7e3..230d170 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/contrib/llvm/lib/Target/X86/X86InstrAVX512.td @@ -77,15 +77,15 @@ class X86VectorVTInfo<int numelts, ValueType eltvt, RegisterClass rc, !if (!eq (TypeVariantName, "i"), !if (!eq (Size, 128), "v2i64", !if (!eq (Size, 256), "v4i64", - VTName)), VTName)); + !if (!eq (Size, 512), "v8i64", + VTName))), VTName)); PatFrag AlignedLdFrag = !cast<PatFrag>("alignedload" # - !if (!eq (TypeVariantName, "i"), - !if (!eq (Size, 128), "v2i64", - !if (!eq (Size, 256), "v4i64", - !if (!eq (Size, 512), - !if (!eq (EltSize, 64), "v8i64", "v16i32"), - VTName))), VTName)); + !if (!eq (TypeVariantName, "i"), + !if (!eq (Size, 128), "v2i64", + !if (!eq (Size, 256), "v4i64", + !if (!eq (Size, 512), "v8i64", + VTName))), VTName)); PatFrag ScalarLdFrag = !cast<PatFrag>("load" # EltVT); @@ -122,6 +122,10 @@ class X86VectorVTInfo<int numelts, ValueType eltvt, RegisterClass rc, RegisterClass FRC = !if (!eq (EltTypeName, "f32"), FR32X, FR64X); + // A vector tye of the same width with element type i64. This is used to + // create patterns for logic ops. + ValueType i64VT = !cast<ValueType>("v" # !srl(Size, 6) # "i64"); + // A vector type of the same width with element type i32. This is used to // create the canonical constant zero node ImmAllZerosV. ValueType i32VT = !cast<ValueType>("v" # !srl(Size, 5) # "i32"); @@ -194,7 +198,8 @@ multiclass AVX512_maskable_custom<bits<8> O, Format F, list<dag> ZeroMaskingPattern, string MaskingConstraint = "", InstrItinClass itin = NoItinerary, - bit IsCommutable = 0> { + bit IsCommutable = 0, + bit IsKCommutable = 0> { let isCommutable = IsCommutable in def NAME: AVX512<O, F, Outs, Ins, OpcodeStr#"\t{"#AttSrcAsm#", $dst|"# @@ -202,7 +207,7 @@ multiclass AVX512_maskable_custom<bits<8> O, Format F, Pattern, itin>; // Prefer over VMOV*rrk Pat<> - let AddedComplexity = 20 in + let AddedComplexity = 20, isCommutable = IsKCommutable in def NAME#k: AVX512<O, F, Outs, MaskingIns, OpcodeStr#"\t{"#AttSrcAsm#", $dst {${mask}}|"# "$dst {${mask}}, "#IntelSrcAsm#"}", @@ -210,8 +215,11 @@ multiclass AVX512_maskable_custom<bits<8> O, Format F, EVEX_K { // In case of the 3src subclass this is overridden with a let. string Constraints = MaskingConstraint; - } - let AddedComplexity = 30 in // Prefer over VMOV*rrkz Pat<> + } + + // Zero mask does not add any restrictions to commute operands transformation. + // So, it is Ok to use IsCommutable instead of IsKCommutable. 
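The store node classes introduced in this header rely on LLVM's usual classof-based RTTI, where isa<> and dyn_cast<> dispatch on the node's opcode rather than on C++ dynamic_cast. A stripped-down, self-contained sketch of that idiom, using hypothetical Node/TruncSStoreNode types rather than the classes above:

    #include <cassert>

    struct Node {
      enum Kind { TruncStoreS, TruncStoreUS, Other };
      Kind kind;
      explicit Node(Kind k) : kind(k) {}
    };

    struct TruncSStoreNode : Node {
      TruncSStoreNode() : Node(TruncStoreS) {}
      // isa<>/dyn_cast<>-style helpers call this instead of dynamic_cast.
      static bool classof(const Node *n) { return n->kind == TruncStoreS; }
    };

    template <typename T> bool isa(const Node *n) { return T::classof(n); }

    int main() {
      TruncSStoreNode s;
      assert(isa<TruncSStoreNode>(&s));
      return 0;
    }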
+ let AddedComplexity = 30, isCommutable = IsCommutable in // Prefer over VMOV*rrkz Pat<> def NAME#kz: AVX512<O, F, Outs, ZeroMaskingIns, OpcodeStr#"\t{"#AttSrcAsm#", $dst {${mask}} {z}|"# "$dst {${mask}} {z}, "#IntelSrcAsm#"}", @@ -231,14 +239,16 @@ multiclass AVX512_maskable_common<bits<8> O, Format F, X86VectorVTInfo _, SDNode Select = vselect, string MaskingConstraint = "", InstrItinClass itin = NoItinerary, - bit IsCommutable = 0> : + bit IsCommutable = 0, + bit IsKCommutable = 0> : AVX512_maskable_custom<O, F, Outs, Ins, MaskingIns, ZeroMaskingIns, OpcodeStr, AttSrcAsm, IntelSrcAsm, [(set _.RC:$dst, RHS)], [(set _.RC:$dst, MaskingRHS)], [(set _.RC:$dst, (Select _.KRCWM:$mask, RHS, _.ImmAllZerosV))], - MaskingConstraint, NoItinerary, IsCommutable>; + MaskingConstraint, NoItinerary, IsCommutable, + IsKCommutable>; // This multiclass generates the unconditional/non-masking, the masking and // the zero-masking variant of the vector instruction. In the masking case, the @@ -248,13 +258,14 @@ multiclass AVX512_maskable<bits<8> O, Format F, X86VectorVTInfo _, string AttSrcAsm, string IntelSrcAsm, dag RHS, InstrItinClass itin = NoItinerary, - bit IsCommutable = 0, SDNode Select = vselect> : + bit IsCommutable = 0, bit IsKCommutable = 0, + SDNode Select = vselect> : AVX512_maskable_common<O, F, _, Outs, Ins, !con((ins _.RC:$src0, _.KRCWM:$mask), Ins), !con((ins _.KRCWM:$mask), Ins), OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS, (Select _.KRCWM:$mask, RHS, _.RC:$src0), Select, - "$src0 = $dst", itin, IsCommutable>; + "$src0 = $dst", itin, IsCommutable, IsKCommutable>; // This multiclass generates the unconditional/non-masking, the masking and // the zero-masking variant of the scalar instruction. @@ -278,41 +289,29 @@ multiclass AVX512_maskable_scalar<bits<8> O, Format F, X86VectorVTInfo _, multiclass AVX512_maskable_3src<bits<8> O, Format F, X86VectorVTInfo _, dag Outs, dag NonTiedIns, string OpcodeStr, string AttSrcAsm, string IntelSrcAsm, - dag RHS> : + dag RHS, bit IsCommutable = 0, + bit IsKCommutable = 0> : AVX512_maskable_common<O, F, _, Outs, !con((ins _.RC:$src1), NonTiedIns), !con((ins _.RC:$src1, _.KRCWM:$mask), NonTiedIns), !con((ins _.RC:$src1, _.KRCWM:$mask), NonTiedIns), OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS, - (vselect _.KRCWM:$mask, RHS, _.RC:$src1)>; - -// Similar to AVX512_maskable_3rc but in this case the input VT for the tied -// operand differs from the output VT. This requires a bitconvert on -// the preserved vector going into the vselect. 
-multiclass AVX512_maskable_3src_cast<bits<8> O, Format F, X86VectorVTInfo OutVT, - X86VectorVTInfo InVT, - dag Outs, dag NonTiedIns, string OpcodeStr, - string AttSrcAsm, string IntelSrcAsm, - dag RHS> : - AVX512_maskable_common<O, F, OutVT, Outs, - !con((ins InVT.RC:$src1), NonTiedIns), - !con((ins InVT.RC:$src1, InVT.KRCWM:$mask), NonTiedIns), - !con((ins InVT.RC:$src1, InVT.KRCWM:$mask), NonTiedIns), - OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS, - (vselect InVT.KRCWM:$mask, RHS, - (bitconvert InVT.RC:$src1))>; + (vselect _.KRCWM:$mask, RHS, _.RC:$src1), + vselect, "", NoItinerary, IsCommutable, IsKCommutable>; multiclass AVX512_maskable_3src_scalar<bits<8> O, Format F, X86VectorVTInfo _, dag Outs, dag NonTiedIns, string OpcodeStr, string AttSrcAsm, string IntelSrcAsm, - dag RHS> : + dag RHS, bit IsCommutable = 0, + bit IsKCommutable = 0> : AVX512_maskable_common<O, F, _, Outs, !con((ins _.RC:$src1), NonTiedIns), !con((ins _.RC:$src1, _.KRCWM:$mask), NonTiedIns), !con((ins _.RC:$src1, _.KRCWM:$mask), NonTiedIns), OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS, (X86selects _.KRCWM:$mask, RHS, _.RC:$src1), - X86selects>; + X86selects, "", NoItinerary, IsCommutable, + IsKCommutable>; multiclass AVX512_maskable_in_asm<bits<8> O, Format F, X86VectorVTInfo _, dag Outs, dag Ins, @@ -334,7 +333,9 @@ multiclass AVX512_maskable_custom_cmp<bits<8> O, Format F, string OpcodeStr, string AttSrcAsm, string IntelSrcAsm, list<dag> Pattern, - list<dag> MaskingPattern> { + list<dag> MaskingPattern, + bit IsCommutable = 0> { + let isCommutable = IsCommutable in def NAME: AVX512<O, F, Outs, Ins, OpcodeStr#"\t{"#AttSrcAsm#", $dst|"# "$dst, "#IntelSrcAsm#"}", @@ -351,20 +352,21 @@ multiclass AVX512_maskable_common_cmp<bits<8> O, Format F, X86VectorVTInfo _, dag Ins, dag MaskingIns, string OpcodeStr, string AttSrcAsm, string IntelSrcAsm, - dag RHS, dag MaskingRHS> : + dag RHS, dag MaskingRHS, + bit IsCommutable = 0> : AVX512_maskable_custom_cmp<O, F, Outs, Ins, MaskingIns, OpcodeStr, AttSrcAsm, IntelSrcAsm, [(set _.KRC:$dst, RHS)], - [(set _.KRC:$dst, MaskingRHS)]>; + [(set _.KRC:$dst, MaskingRHS)], IsCommutable>; multiclass AVX512_maskable_cmp<bits<8> O, Format F, X86VectorVTInfo _, dag Outs, dag Ins, string OpcodeStr, string AttSrcAsm, string IntelSrcAsm, - dag RHS> : + dag RHS, bit IsCommutable = 0> : AVX512_maskable_common_cmp<O, F, _, Outs, Ins, !con((ins _.KRCWM:$mask), Ins), OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS, - (and _.KRCWM:$mask, RHS)>; + (and _.KRCWM:$mask, RHS), IsCommutable>; multiclass AVX512_maskable_cmp_alt<bits<8> O, Format F, X86VectorVTInfo _, dag Outs, dag Ins, string OpcodeStr, @@ -373,6 +375,27 @@ multiclass AVX512_maskable_cmp_alt<bits<8> O, Format F, X86VectorVTInfo _, Ins, !con((ins _.KRCWM:$mask),Ins), OpcodeStr, AttSrcAsm, IntelSrcAsm, [],[]>; +// This multiclass generates the unconditional/non-masking, the masking and +// the zero-masking variant of the vector instruction. In the masking case, the +// perserved vector elements come from a new dummy input operand tied to $dst. 
+multiclass AVX512_maskable_logic<bits<8> O, Format F, X86VectorVTInfo _, + dag Outs, dag Ins, string OpcodeStr, + string AttSrcAsm, string IntelSrcAsm, + dag RHS, dag MaskedRHS, + InstrItinClass itin = NoItinerary, + bit IsCommutable = 0, SDNode Select = vselect> : + AVX512_maskable_custom<O, F, Outs, Ins, + !con((ins _.RC:$src0, _.KRCWM:$mask), Ins), + !con((ins _.KRCWM:$mask), Ins), + OpcodeStr, AttSrcAsm, IntelSrcAsm, + [(set _.RC:$dst, RHS)], + [(set _.RC:$dst, + (Select _.KRCWM:$mask, MaskedRHS, _.RC:$src0))], + [(set _.RC:$dst, + (Select _.KRCWM:$mask, MaskedRHS, + _.ImmAllZerosV))], + "$src0 = $dst", itin, IsCommutable>; + // Bitcasts between 512-bit vector types. Return the original type since // no instruction is needed for the conversion. def : Pat<(v8f64 (bitconvert (v8i64 VR512:$src))), (v8f64 VR512:$src)>; @@ -420,6 +443,22 @@ def AVX512_512_SETALLONES : I<0, Pseudo, (outs VR512:$dst), (ins), "", [(set VR512:$dst, (v16i32 immAllOnesV))]>; } +// Alias instructions that allow VPTERNLOG to be used with a mask to create +// a mix of all ones and all zeros elements. This is done this way to force +// the same register to be used as input for all three sources. +let isPseudo = 1, Predicates = [HasAVX512] in { +def AVX512_512_SEXT_MASK_32 : I<0, Pseudo, (outs VR512:$dst), + (ins VK16WM:$mask), "", + [(set VR512:$dst, (vselect (v16i1 VK16WM:$mask), + (v16i32 immAllOnesV), + (v16i32 immAllZerosV)))]>; +def AVX512_512_SEXT_MASK_64 : I<0, Pseudo, (outs VR512:$dst), + (ins VK8WM:$mask), "", + [(set VR512:$dst, (vselect (v8i1 VK8WM:$mask), + (bc_v8i64 (v16i32 immAllOnesV)), + (bc_v8i64 (v16i32 immAllZerosV))))]>; +} + let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, isPseudo = 1, Predicates = [HasVLX], SchedRW = [WriteZero] in { def AVX512_128_SET0 : I<0, Pseudo, (outs VR128X:$dst), (ins), "", @@ -428,6 +467,16 @@ def AVX512_256_SET0 : I<0, Pseudo, (outs VR256X:$dst), (ins), "", [(set VR256X:$dst, (v8i32 immAllZerosV))]>; } +// Alias instructions that map fld0 to xorps for sse or vxorps for avx. +// This is expanded by ExpandPostRAPseudos. 
+let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, + isPseudo = 1, SchedRW = [WriteZero], Predicates = [HasVLX, HasDQI] in { + def AVX512_FsFLD0SS : I<0, Pseudo, (outs FR32X:$dst), (ins), "", + [(set FR32X:$dst, fp32imm0)]>; + def AVX512_FsFLD0SD : I<0, Pseudo, (outs FR64X:$dst), (ins), "", + [(set FR64X:$dst, fpimm0)]>; +} + //===----------------------------------------------------------------------===// // AVX-512 - VECTOR INSERT // @@ -548,25 +597,28 @@ defm : vinsert_for_size_lowering<"VINSERTI64x4Z", v32i8x_info, v64i8_info, vinsert256_insert, INSERT_get_vinsert256_imm, [HasAVX512]>; // vinsertps - insert f32 to XMM -def VINSERTPSzrr : AVX512AIi8<0x21, MRMSrcReg, (outs VR128X:$dst), +let ExeDomain = SSEPackedSingle in { +def VINSERTPSZrr : AVX512AIi8<0x21, MRMSrcReg, (outs VR128X:$dst), (ins VR128X:$src1, VR128X:$src2, u8imm:$src3), "vinsertps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", [(set VR128X:$dst, (X86insertps VR128X:$src1, VR128X:$src2, imm:$src3))]>, EVEX_4V; -def VINSERTPSzrm: AVX512AIi8<0x21, MRMSrcMem, (outs VR128X:$dst), +def VINSERTPSZrm: AVX512AIi8<0x21, MRMSrcMem, (outs VR128X:$dst), (ins VR128X:$src1, f32mem:$src2, u8imm:$src3), "vinsertps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", [(set VR128X:$dst, (X86insertps VR128X:$src1, (v4f32 (scalar_to_vector (loadf32 addr:$src2))), imm:$src3))]>, EVEX_4V, EVEX_CD8<32, CD8VT1>; +} //===----------------------------------------------------------------------===// // AVX-512 VECTOR EXTRACT //--- multiclass vextract_for_size<int Opcode, - X86VectorVTInfo From, X86VectorVTInfo To, - PatFrag vextract_extract> { + X86VectorVTInfo From, X86VectorVTInfo To, + PatFrag vextract_extract, + SDNodeXForm EXTRACT_get_vextract_imm> { let hasSideEffects = 0, ExeDomain = To.ExeDomain in { // use AVX512_maskable_in_asm (AVX512_maskable can't be used due to @@ -597,32 +649,23 @@ multiclass vextract_for_size<int Opcode, []>, EVEX_K, EVEX; } - // Intrinsic call with masking. - def : Pat<(!cast<Intrinsic>("int_x86_avx512_mask_vextract" # To.EltTypeName # - "x" # To.NumElts # "_" # From.Size) - From.RC:$src1, (iPTR imm:$idx), To.RC:$src0, To.MRC:$mask), + def : Pat<(To.VT (vselect To.KRCWM:$mask, + (vextract_extract:$ext (From.VT From.RC:$src1), + (iPTR imm)), + To.RC:$src0)), (!cast<Instruction>(NAME # To.EltSize # "x" # To.NumElts # From.ZSuffix # "rrk") - To.RC:$src0, - (COPY_TO_REGCLASS To.MRC:$mask, To.KRCWM), - From.RC:$src1, imm:$idx)>; - - // Intrinsic call with zero-masking. - def : Pat<(!cast<Intrinsic>("int_x86_avx512_mask_vextract" # To.EltTypeName # - "x" # To.NumElts # "_" # From.Size) - From.RC:$src1, (iPTR imm:$idx), To.ImmAllZerosV, To.MRC:$mask), - (!cast<Instruction>(NAME # To.EltSize # "x" # To.NumElts # - From.ZSuffix # "rrkz") - (COPY_TO_REGCLASS To.MRC:$mask, To.KRCWM), - From.RC:$src1, imm:$idx)>; + To.RC:$src0, To.KRCWM:$mask, From.RC:$src1, + (EXTRACT_get_vextract_imm To.RC:$ext))>; - // Intrinsic call without masking. 
- def : Pat<(!cast<Intrinsic>("int_x86_avx512_mask_vextract" # To.EltTypeName # - "x" # To.NumElts # "_" # From.Size) - From.RC:$src1, (iPTR imm:$idx), To.ImmAllZerosV, (i8 -1)), + def : Pat<(To.VT (vselect To.KRCWM:$mask, + (vextract_extract:$ext (From.VT From.RC:$src1), + (iPTR imm)), + To.ImmAllZerosV)), (!cast<Instruction>(NAME # To.EltSize # "x" # To.NumElts # - From.ZSuffix # "rr") - From.RC:$src1, imm:$idx)>; + From.ZSuffix # "rrkz") + To.KRCWM:$mask, From.RC:$src1, + (EXTRACT_get_vextract_imm To.RC:$ext))>; } // Codegen pattern for the alternative types @@ -642,39 +685,45 @@ multiclass vextract_for_size_lowering<string InstrStr, X86VectorVTInfo From, } multiclass vextract_for_type<ValueType EltVT32, int Opcode128, - ValueType EltVT64, int Opcode256> { + ValueType EltVT64, int Opcode256> { defm NAME # "32x4Z" : vextract_for_size<Opcode128, X86VectorVTInfo<16, EltVT32, VR512>, X86VectorVTInfo< 4, EltVT32, VR128X>, - vextract128_extract>, + vextract128_extract, + EXTRACT_get_vextract128_imm>, EVEX_V512, EVEX_CD8<32, CD8VT4>; defm NAME # "64x4Z" : vextract_for_size<Opcode256, X86VectorVTInfo< 8, EltVT64, VR512>, X86VectorVTInfo< 4, EltVT64, VR256X>, - vextract256_extract>, + vextract256_extract, + EXTRACT_get_vextract256_imm>, VEX_W, EVEX_V512, EVEX_CD8<64, CD8VT4>; let Predicates = [HasVLX] in defm NAME # "32x4Z256" : vextract_for_size<Opcode128, X86VectorVTInfo< 8, EltVT32, VR256X>, X86VectorVTInfo< 4, EltVT32, VR128X>, - vextract128_extract>, + vextract128_extract, + EXTRACT_get_vextract128_imm>, EVEX_V256, EVEX_CD8<32, CD8VT4>; let Predicates = [HasVLX, HasDQI] in defm NAME # "64x2Z256" : vextract_for_size<Opcode128, X86VectorVTInfo< 4, EltVT64, VR256X>, X86VectorVTInfo< 2, EltVT64, VR128X>, - vextract128_extract>, + vextract128_extract, + EXTRACT_get_vextract128_imm>, VEX_W, EVEX_V256, EVEX_CD8<64, CD8VT2>; let Predicates = [HasDQI] in { defm NAME # "64x2Z" : vextract_for_size<Opcode128, X86VectorVTInfo< 8, EltVT64, VR512>, X86VectorVTInfo< 2, EltVT64, VR128X>, - vextract128_extract>, + vextract128_extract, + EXTRACT_get_vextract128_imm>, VEX_W, EVEX_V512, EVEX_CD8<64, CD8VT2>; defm NAME # "32x8Z" : vextract_for_size<Opcode256, X86VectorVTInfo<16, EltVT32, VR512>, X86VectorVTInfo< 8, EltVT32, VR256X>, - vextract256_extract>, + vextract256_extract, + EXTRACT_get_vextract256_imm>, EVEX_V512, EVEX_CD8<32, CD8VT8>; } } @@ -986,6 +1035,25 @@ multiclass avx512_subvec_broadcast_rm<bits<8> opc, string OpcodeStr, AVX5128IBase, EVEX; } +let Predicates = [HasVLX, HasBWI] in { + // loadi16 is tricky to fold, because !isTypeDesirableForOp, justifiably. + // This means we'll encounter truncated i32 loads; match that here. 
+ def : Pat<(v8i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))), + (VPBROADCASTWZ128m addr:$src)>; + def : Pat<(v16i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))), + (VPBROADCASTWZ256m addr:$src)>; + def : Pat<(v8i16 (X86VBroadcast + (i16 (trunc (i32 (zextloadi16 addr:$src)))))), + (VPBROADCASTWZ128m addr:$src)>; + def : Pat<(v16i16 (X86VBroadcast + (i16 (trunc (i32 (zextloadi16 addr:$src)))))), + (VPBROADCASTWZ256m addr:$src)>; +} + +//===----------------------------------------------------------------------===// +// AVX-512 BROADCAST SUBVECTORS +// + defm VBROADCASTI32X4 : avx512_subvec_broadcast_rm<0x5a, "vbroadcasti32x4", v16i32_info, v4i32x_info>, EVEX_V512, EVEX_CD8<32, CD8VT4>; @@ -999,6 +1067,79 @@ defm VBROADCASTF64X4 : avx512_subvec_broadcast_rm<0x1b, "vbroadcastf64x4", v8f64_info, v4f64x_info>, VEX_W, EVEX_V512, EVEX_CD8<64, CD8VT4>; +let Predicates = [HasAVX512] in { +def : Pat<(v32i16 (X86SubVBroadcast (bc_v16i16 (loadv4i64 addr:$src)))), + (VBROADCASTI64X4rm addr:$src)>; +def : Pat<(v64i8 (X86SubVBroadcast (bc_v32i8 (loadv4i64 addr:$src)))), + (VBROADCASTI64X4rm addr:$src)>; + +// Provide fallback in case the load node that is used in the patterns above +// is used by additional users, which prevents the pattern selection. +def : Pat<(v16f32 (X86SubVBroadcast (v8f32 VR256X:$src))), + (VINSERTF64x4Zrr (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm), + (v8f32 VR256X:$src), 1)>; +def : Pat<(v8f64 (X86SubVBroadcast (v4f64 VR256X:$src))), + (VINSERTF64x4Zrr (INSERT_SUBREG (v8f64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm), + (v4f64 VR256X:$src), 1)>; +def : Pat<(v8i64 (X86SubVBroadcast (v4i64 VR256X:$src))), + (VINSERTI64x4Zrr (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm), + (v4i64 VR256X:$src), 1)>; +def : Pat<(v16i32 (X86SubVBroadcast (v8i32 VR256X:$src))), + (VINSERTI64x4Zrr (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm), + (v8i32 VR256X:$src), 1)>; +def : Pat<(v32i16 (X86SubVBroadcast (v16i16 VR256X:$src))), + (VINSERTI64x4Zrr (INSERT_SUBREG (v32i16 (IMPLICIT_DEF)), VR256X:$src, sub_ymm), + (v16i16 VR256X:$src), 1)>; +def : Pat<(v64i8 (X86SubVBroadcast (v32i8 VR256X:$src))), + (VINSERTI64x4Zrr (INSERT_SUBREG (v64i8 (IMPLICIT_DEF)), VR256X:$src, sub_ymm), + (v32i8 VR256X:$src), 1)>; + +def : Pat<(v32i16 (X86SubVBroadcast (bc_v8i16 (loadv2i64 addr:$src)))), + (VBROADCASTI32X4rm addr:$src)>; +def : Pat<(v64i8 (X86SubVBroadcast (bc_v16i8 (loadv2i64 addr:$src)))), + (VBROADCASTI32X4rm addr:$src)>; + +// Provide fallback in case the load node that is used in the patterns above +// is used by additional users, which prevents the pattern selection. 
+def : Pat<(v8f64 (X86SubVBroadcast (v2f64 VR128X:$src))), + (VINSERTF64x4Zrr + (VINSERTF32x4Zrr (INSERT_SUBREG (v8f64 (IMPLICIT_DEF)), + VR128X:$src, sub_xmm), + VR128X:$src, 1), + (EXTRACT_SUBREG + (v8f64 (VINSERTF32x4Zrr (INSERT_SUBREG (v8f64 (IMPLICIT_DEF)), + VR128X:$src, sub_xmm), + VR128X:$src, 1)), sub_ymm), 1)>; +def : Pat<(v8i64 (X86SubVBroadcast (v2i64 VR128X:$src))), + (VINSERTI64x4Zrr + (VINSERTI32x4Zrr (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), + VR128X:$src, sub_xmm), + VR128X:$src, 1), + (EXTRACT_SUBREG + (v8i64 (VINSERTI32x4Zrr (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), + VR128X:$src, sub_xmm), + VR128X:$src, 1)), sub_ymm), 1)>; + +def : Pat<(v32i16 (X86SubVBroadcast (v8i16 VR128X:$src))), + (VINSERTI64x4Zrr + (VINSERTI32x4Zrr (INSERT_SUBREG (v32i16 (IMPLICIT_DEF)), + VR128X:$src, sub_xmm), + VR128X:$src, 1), + (EXTRACT_SUBREG + (v32i16 (VINSERTI32x4Zrr (INSERT_SUBREG (v32i16 (IMPLICIT_DEF)), + VR128X:$src, sub_xmm), + VR128X:$src, 1)), sub_ymm), 1)>; +def : Pat<(v64i8 (X86SubVBroadcast (v16i8 VR128X:$src))), + (VINSERTI64x4Zrr + (VINSERTI32x4Zrr (INSERT_SUBREG (v64i8 (IMPLICIT_DEF)), + VR128X:$src, sub_xmm), + VR128X:$src, 1), + (EXTRACT_SUBREG + (v64i8 (VINSERTI32x4Zrr (INSERT_SUBREG (v64i8 (IMPLICIT_DEF)), + VR128X:$src, sub_xmm), + VR128X:$src, 1)), sub_ymm), 1)>; +} + let Predicates = [HasVLX] in { defm VBROADCASTI32X4Z256 : avx512_subvec_broadcast_rm<0x5a, "vbroadcasti32x4", v8i32x_info, v4i32x_info>, @@ -1006,7 +1147,28 @@ defm VBROADCASTI32X4Z256 : avx512_subvec_broadcast_rm<0x5a, "vbroadcasti32x4", defm VBROADCASTF32X4Z256 : avx512_subvec_broadcast_rm<0x1a, "vbroadcastf32x4", v8f32x_info, v4f32x_info>, EVEX_V256, EVEX_CD8<32, CD8VT4>; + +def : Pat<(v16i16 (X86SubVBroadcast (bc_v8i16 (loadv2i64 addr:$src)))), + (VBROADCASTI32X4Z256rm addr:$src)>; +def : Pat<(v32i8 (X86SubVBroadcast (bc_v16i8 (loadv2i64 addr:$src)))), + (VBROADCASTI32X4Z256rm addr:$src)>; + +// Provide fallback in case the load node that is used in the patterns above +// is used by additional users, which prevents the pattern selection. +def : Pat<(v8f32 (X86SubVBroadcast (v4f32 VR128X:$src))), + (VINSERTF32x4Z256rr (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), + (v4f32 VR128X:$src), 1)>; +def : Pat<(v8i32 (X86SubVBroadcast (v4i32 VR128X:$src))), + (VINSERTI32x4Z256rr (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), + (v4i32 VR128X:$src), 1)>; +def : Pat<(v16i16 (X86SubVBroadcast (v8i16 VR128X:$src))), + (VINSERTI32x4Z256rr (INSERT_SUBREG (v16i16 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), + (v8i16 VR128X:$src), 1)>; +def : Pat<(v32i8 (X86SubVBroadcast (v16i8 VR128X:$src))), + (VINSERTI32x4Z256rr (INSERT_SUBREG (v32i8 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), + (v16i8 VR128X:$src), 1)>; } + let Predicates = [HasVLX, HasDQI] in { defm VBROADCASTI64X2Z128 : avx512_subvec_broadcast_rm<0x5a, "vbroadcasti64x2", v4i64x_info, v2i64x_info>, VEX_W, @@ -1014,7 +1176,73 @@ defm VBROADCASTI64X2Z128 : avx512_subvec_broadcast_rm<0x5a, "vbroadcasti64x2", defm VBROADCASTF64X2Z128 : avx512_subvec_broadcast_rm<0x1a, "vbroadcastf64x2", v4f64x_info, v2f64x_info>, VEX_W, EVEX_V256, EVEX_CD8<64, CD8VT2>; + +// Provide fallback in case the load node that is used in the patterns above +// is used by additional users, which prevents the pattern selection. 
+def : Pat<(v4f64 (X86SubVBroadcast (v2f64 VR128X:$src))), + (VINSERTF64x2Z256rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), + (v2f64 VR128X:$src), 1)>; +def : Pat<(v4i64 (X86SubVBroadcast (v2i64 VR128X:$src))), + (VINSERTI64x2Z256rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), + (v2i64 VR128X:$src), 1)>; } + +let Predicates = [HasVLX, NoDQI] in { +def : Pat<(v4f64 (X86SubVBroadcast (loadv2f64 addr:$src))), + (VBROADCASTF32X4Z256rm addr:$src)>; +def : Pat<(v4i64 (X86SubVBroadcast (loadv2i64 addr:$src))), + (VBROADCASTI32X4Z256rm addr:$src)>; + +// Provide fallback in case the load node that is used in the patterns above +// is used by additional users, which prevents the pattern selection. +def : Pat<(v4f64 (X86SubVBroadcast (v2f64 VR128X:$src))), + (VINSERTF32x4Z256rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), + (v2f64 VR128X:$src), 1)>; +def : Pat<(v4i64 (X86SubVBroadcast (v2i64 VR128X:$src))), + (VINSERTI32x4Z256rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), + (v2i64 VR128X:$src), 1)>; +} + +let Predicates = [HasAVX512, NoDQI] in { +def : Pat<(v8f64 (X86SubVBroadcast (loadv2f64 addr:$src))), + (VBROADCASTF32X4rm addr:$src)>; +def : Pat<(v8i64 (X86SubVBroadcast (loadv2i64 addr:$src))), + (VBROADCASTI32X4rm addr:$src)>; + +def : Pat<(v16f32 (X86SubVBroadcast (v4f32 VR128X:$src))), + (VINSERTF64x4Zrr + (VINSERTF32x4Zrr (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), + VR128X:$src, sub_xmm), + VR128X:$src, 1), + (EXTRACT_SUBREG + (v16f32 (VINSERTF32x4Zrr (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), + VR128X:$src, sub_xmm), + VR128X:$src, 1)), sub_ymm), 1)>; +def : Pat<(v16i32 (X86SubVBroadcast (v4i32 VR128X:$src))), + (VINSERTI64x4Zrr + (VINSERTI32x4Zrr (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), + VR128X:$src, sub_xmm), + VR128X:$src, 1), + (EXTRACT_SUBREG + (v16i32 (VINSERTI32x4Zrr (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), + VR128X:$src, sub_xmm), + VR128X:$src, 1)), sub_ymm), 1)>; + +def : Pat<(v16f32 (X86SubVBroadcast (loadv8f32 addr:$src))), + (VBROADCASTF64X4rm addr:$src)>; +def : Pat<(v16i32 (X86SubVBroadcast (bc_v8i32 (loadv4i64 addr:$src)))), + (VBROADCASTI64X4rm addr:$src)>; + +// Provide fallback in case the load node that is used in the patterns above +// is used by additional users, which prevents the pattern selection. +def : Pat<(v16f32 (X86SubVBroadcast (v8f32 VR256X:$src))), + (VINSERTF64x4Zrr (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm), + (v8f32 VR256X:$src), 1)>; +def : Pat<(v16i32 (X86SubVBroadcast (v8i32 VR256X:$src))), + (VINSERTI64x4Zrr (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm), + (v8i32 VR256X:$src), 1)>; +} + let Predicates = [HasDQI] in { defm VBROADCASTI64X2 : avx512_subvec_broadcast_rm<0x5a, "vbroadcasti64x2", v8i64_info, v2i64x_info>, VEX_W, @@ -1028,6 +1256,34 @@ defm VBROADCASTF64X2 : avx512_subvec_broadcast_rm<0x1a, "vbroadcastf64x2", defm VBROADCASTF32X8 : avx512_subvec_broadcast_rm<0x1b, "vbroadcastf32x8", v16f32_info, v8f32x_info>, EVEX_V512, EVEX_CD8<32, CD8VT8>; + +// Provide fallback in case the load node that is used in the patterns above +// is used by additional users, which prevents the pattern selection. 
+def : Pat<(v16f32 (X86SubVBroadcast (v8f32 VR256X:$src))), + (VINSERTF32x8Zrr (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm), + (v8f32 VR256X:$src), 1)>; +def : Pat<(v16i32 (X86SubVBroadcast (v8i32 VR256X:$src))), + (VINSERTI32x8Zrr (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm), + (v8i32 VR256X:$src), 1)>; + +def : Pat<(v16f32 (X86SubVBroadcast (v4f32 VR128X:$src))), + (VINSERTF32x8Zrr + (VINSERTF32x4Zrr (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), + VR128X:$src, sub_xmm), + VR128X:$src, 1), + (EXTRACT_SUBREG + (v16f32 (VINSERTF32x4Zrr (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), + VR128X:$src, sub_xmm), + VR128X:$src, 1)), sub_ymm), 1)>; +def : Pat<(v16i32 (X86SubVBroadcast (v4i32 VR128X:$src))), + (VINSERTI32x8Zrr + (VINSERTI32x4Zrr (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), + VR128X:$src, sub_xmm), + VR128X:$src, 1), + (EXTRACT_SUBREG + (v16i32 (VINSERTI32x4Zrr (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), + VR128X:$src, sub_xmm), + VR128X:$src, 1)), sub_ymm), 1)>; } multiclass avx512_common_broadcast_32x2<bits<8> opc, string OpcodeStr, @@ -1049,10 +1305,10 @@ multiclass avx512_common_broadcast_i32x2<bits<8> opc, string OpcodeStr, EVEX_V128; } -defm VPBROADCASTI32X2 : avx512_common_broadcast_i32x2<0x59, "vbroadcasti32x2", - avx512vl_i32_info, avx512vl_i64_info>; -defm VPBROADCASTF32X2 : avx512_common_broadcast_32x2<0x19, "vbroadcastf32x2", - avx512vl_f32_info, avx512vl_f64_info>; +defm VBROADCASTI32X2 : avx512_common_broadcast_i32x2<0x59, "vbroadcasti32x2", + avx512vl_i32_info, avx512vl_i64_info>; +defm VBROADCASTF32X2 : avx512_common_broadcast_32x2<0x19, "vbroadcastf32x2", + avx512vl_f32_info, avx512vl_f64_info>; def : Pat<(v16f32 (X86VBroadcast (v16f32 VR512:$src))), (VBROADCASTSSZr (EXTRACT_SUBREG (v16f32 VR512:$src), sub_xmm))>; @@ -1091,112 +1347,105 @@ defm VPBROADCASTMB2Q : avx512_mask_broadcast<0x2A, "vpbroadcastmb2q", //===----------------------------------------------------------------------===// // -- VPERMI2 - 3 source operands form -- -multiclass avx512_perm_i<bits<8> opc, string OpcodeStr, - X86VectorVTInfo _, X86VectorVTInfo IdxVT> { -let Constraints = "$src1 = $dst" in { - defm rr: AVX512_maskable_3src_cast<opc, MRMSrcReg, _, IdxVT, (outs _.RC:$dst), +multiclass avx512_perm_i<bits<8> opc, string OpcodeStr, X86VectorVTInfo _> { +let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in { + // The index operand in the pattern should really be an integer type. However, + // if we do that and it happens to come from a bitcast, then it becomes + // difficult to find the bitcast needed to convert the index to the + // destination type for the passthru since it will be folded with the bitcast + // of the index operand. 
+ defm rr: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src2, _.RC:$src3), OpcodeStr, "$src3, $src2", "$src2, $src3", - (_.VT (X86VPermi2X IdxVT.RC:$src1, _.RC:$src2, _.RC:$src3))>, EVEX_4V, + (_.VT (X86VPermi2X _.RC:$src1, _.RC:$src2, _.RC:$src3)), 1>, EVEX_4V, AVX5128IBase; - defm rm: AVX512_maskable_3src_cast<opc, MRMSrcMem, _, IdxVT, (outs _.RC:$dst), + defm rm: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src2, _.MemOp:$src3), OpcodeStr, "$src3, $src2", "$src2, $src3", - (_.VT (X86VPermi2X IdxVT.RC:$src1, _.RC:$src2, - (_.VT (bitconvert (_.LdFrag addr:$src3)))))>, + (_.VT (X86VPermi2X _.RC:$src1, _.RC:$src2, + (_.VT (bitconvert (_.LdFrag addr:$src3))))), 1>, EVEX_4V, AVX5128IBase; } } multiclass avx512_perm_i_mb<bits<8> opc, string OpcodeStr, - X86VectorVTInfo _, X86VectorVTInfo IdxVT> { - let Constraints = "$src1 = $dst" in - defm rmb: AVX512_maskable_3src_cast<opc, MRMSrcMem, _, IdxVT, (outs _.RC:$dst), + X86VectorVTInfo _> { + let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in + defm rmb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src2, _.ScalarMemOp:$src3), OpcodeStr, !strconcat("${src3}", _.BroadcastStr,", $src2"), !strconcat("$src2, ${src3}", _.BroadcastStr ), - (_.VT (X86VPermi2X IdxVT.RC:$src1, - _.RC:$src2,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3)))))>, - AVX5128IBase, EVEX_4V, EVEX_B; + (_.VT (X86VPermi2X _.RC:$src1, + _.RC:$src2,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3))))), + 1>, AVX5128IBase, EVEX_4V, EVEX_B; } multiclass avx512_perm_i_sizes<bits<8> opc, string OpcodeStr, - AVX512VLVectorVTInfo VTInfo, - AVX512VLVectorVTInfo ShuffleMask> { - defm NAME: avx512_perm_i<opc, OpcodeStr, VTInfo.info512, - ShuffleMask.info512>, - avx512_perm_i_mb<opc, OpcodeStr, VTInfo.info512, - ShuffleMask.info512>, EVEX_V512; + AVX512VLVectorVTInfo VTInfo> { + defm NAME: avx512_perm_i<opc, OpcodeStr, VTInfo.info512>, + avx512_perm_i_mb<opc, OpcodeStr, VTInfo.info512>, EVEX_V512; let Predicates = [HasVLX] in { - defm NAME#128: avx512_perm_i<opc, OpcodeStr, VTInfo.info128, - ShuffleMask.info128>, - avx512_perm_i_mb<opc, OpcodeStr, VTInfo.info128, - ShuffleMask.info128>, EVEX_V128; - defm NAME#256: avx512_perm_i<opc, OpcodeStr, VTInfo.info256, - ShuffleMask.info256>, - avx512_perm_i_mb<opc, OpcodeStr, VTInfo.info256, - ShuffleMask.info256>, EVEX_V256; + defm NAME#128: avx512_perm_i<opc, OpcodeStr, VTInfo.info128>, + avx512_perm_i_mb<opc, OpcodeStr, VTInfo.info128>, EVEX_V128; + defm NAME#256: avx512_perm_i<opc, OpcodeStr, VTInfo.info256>, + avx512_perm_i_mb<opc, OpcodeStr, VTInfo.info256>, EVEX_V256; } } multiclass avx512_perm_i_sizes_bw<bits<8> opc, string OpcodeStr, AVX512VLVectorVTInfo VTInfo, - AVX512VLVectorVTInfo Idx, Predicate Prd> { let Predicates = [Prd] in - defm NAME: avx512_perm_i<opc, OpcodeStr, VTInfo.info512, - Idx.info512>, EVEX_V512; + defm NAME: avx512_perm_i<opc, OpcodeStr, VTInfo.info512>, EVEX_V512; let Predicates = [Prd, HasVLX] in { - defm NAME#128: avx512_perm_i<opc, OpcodeStr, VTInfo.info128, - Idx.info128>, EVEX_V128; - defm NAME#256: avx512_perm_i<opc, OpcodeStr, VTInfo.info256, - Idx.info256>, EVEX_V256; + defm NAME#128: avx512_perm_i<opc, OpcodeStr, VTInfo.info128>, EVEX_V128; + defm NAME#256: avx512_perm_i<opc, OpcodeStr, VTInfo.info256>, EVEX_V256; } } defm VPERMI2D : avx512_perm_i_sizes<0x76, "vpermi2d", - avx512vl_i32_info, avx512vl_i32_info>, EVEX_CD8<32, CD8VF>; + avx512vl_i32_info>, EVEX_CD8<32, CD8VF>; defm VPERMI2Q : avx512_perm_i_sizes<0x76, "vpermi2q", - 
avx512vl_i64_info, avx512vl_i64_info>, VEX_W, EVEX_CD8<64, CD8VF>; + avx512vl_i64_info>, VEX_W, EVEX_CD8<64, CD8VF>; defm VPERMI2W : avx512_perm_i_sizes_bw<0x75, "vpermi2w", - avx512vl_i16_info, avx512vl_i16_info, HasBWI>, + avx512vl_i16_info, HasBWI>, VEX_W, EVEX_CD8<16, CD8VF>; defm VPERMI2B : avx512_perm_i_sizes_bw<0x75, "vpermi2b", - avx512vl_i8_info, avx512vl_i8_info, HasVBMI>, + avx512vl_i8_info, HasVBMI>, EVEX_CD8<8, CD8VF>; defm VPERMI2PS : avx512_perm_i_sizes<0x77, "vpermi2ps", - avx512vl_f32_info, avx512vl_i32_info>, EVEX_CD8<32, CD8VF>; + avx512vl_f32_info>, EVEX_CD8<32, CD8VF>; defm VPERMI2PD : avx512_perm_i_sizes<0x77, "vpermi2pd", - avx512vl_f64_info, avx512vl_i64_info>, VEX_W, EVEX_CD8<64, CD8VF>; + avx512vl_f64_info>, VEX_W, EVEX_CD8<64, CD8VF>; // VPERMT2 multiclass avx512_perm_t<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, X86VectorVTInfo IdxVT> { -let Constraints = "$src1 = $dst" in { +let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in { defm rr: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins IdxVT.RC:$src2, _.RC:$src3), OpcodeStr, "$src3, $src2", "$src2, $src3", - (_.VT (X86VPermt2 _.RC:$src1, IdxVT.RC:$src2, _.RC:$src3))>, EVEX_4V, - AVX5128IBase; + (_.VT (X86VPermt2 _.RC:$src1, IdxVT.RC:$src2, _.RC:$src3)), 1>, + EVEX_4V, AVX5128IBase; defm rm: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins IdxVT.RC:$src2, _.MemOp:$src3), OpcodeStr, "$src3, $src2", "$src2, $src3", (_.VT (X86VPermt2 _.RC:$src1, IdxVT.RC:$src2, - (bitconvert (_.LdFrag addr:$src3))))>, + (bitconvert (_.LdFrag addr:$src3)))), 1>, EVEX_4V, AVX5128IBase; } } multiclass avx512_perm_t_mb<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, X86VectorVTInfo IdxVT> { - let Constraints = "$src1 = $dst" in + let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in defm rmb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins IdxVT.RC:$src2, _.ScalarMemOp:$src3), OpcodeStr, !strconcat("${src3}", _.BroadcastStr,", $src2"), !strconcat("$src2, ${src3}", _.BroadcastStr ), (_.VT (X86VPermt2 _.RC:$src1, - IdxVT.RC:$src2,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3)))))>, - AVX5128IBase, EVEX_4V, EVEX_B; + IdxVT.RC:$src2,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3))))), + 1>, AVX5128IBase, EVEX_4V, EVEX_B; } multiclass avx512_perm_t_sizes<bits<8> opc, string OpcodeStr, @@ -1252,8 +1501,7 @@ defm VPERMT2PD : avx512_perm_t_sizes<0x7F, "vpermt2pd", // AVX-512 - BLEND using mask // multiclass avx512_blendmask<bits<8> opc, string OpcodeStr, X86VectorVTInfo _> { - let ExeDomain = _.ExeDomain in { - let hasSideEffects = 0 in + let ExeDomain = _.ExeDomain, hasSideEffects = 0 in { def rr : AVX5128I<opc, MRMSrcReg, (outs _.RC:$dst), (ins _.RC:$src1, _.RC:$src2), !strconcat(OpcodeStr, @@ -1263,16 +1511,13 @@ multiclass avx512_blendmask<bits<8> opc, string OpcodeStr, X86VectorVTInfo _> { (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, ${dst} {${mask}}|${dst} {${mask}}, $src1, $src2}"), - [(set _.RC:$dst, (vselect _.KRCWM:$mask, - (_.VT _.RC:$src2), - (_.VT _.RC:$src1)))]>, EVEX_4V, EVEX_K; - let hasSideEffects = 0 in + []>, EVEX_4V, EVEX_K; def rrkz : AVX5128I<opc, MRMSrcReg, (outs _.RC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src1, $src2}"), []>, EVEX_4V, EVEX_KZ; - let mayLoad = 1, hasSideEffects = 0 in + let mayLoad = 1 in { def rm : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst), (ins _.RC:$src1, _.MemOp:$src2), !strconcat(OpcodeStr, @@ 
-1282,38 +1527,32 @@ multiclass avx512_blendmask<bits<8> opc, string OpcodeStr, X86VectorVTInfo _> { (ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, ${dst} {${mask}}|${dst} {${mask}}, $src1, $src2}"), - [(set _.RC:$dst, (vselect _.KRCWM:$mask, - (_.VT (bitconvert (_.LdFrag addr:$src2))), - (_.VT _.RC:$src1)))]>, - EVEX_4V, EVEX_K, EVEX_CD8<_.EltSize, CD8VF>; - let mayLoad = 1, hasSideEffects = 0 in + []>, EVEX_4V, EVEX_K, EVEX_CD8<_.EltSize, CD8VF>; def rmkz : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src1, $src2}"), []>, EVEX_4V, EVEX_KZ, EVEX_CD8<_.EltSize, CD8VF>; } + } } multiclass avx512_blendmask_rmb<bits<8> opc, string OpcodeStr, X86VectorVTInfo _> { + let mayLoad = 1, hasSideEffects = 0 in { def rmbk : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.ScalarMemOp:$src2), !strconcat(OpcodeStr, "\t{${src2}", _.BroadcastStr, ", $src1, $dst {${mask}}|", "$dst {${mask}}, $src1, ${src2}", _.BroadcastStr, "}"), - [(set _.RC:$dst,(vselect _.KRCWM:$mask, - (X86VBroadcast (_.ScalarLdFrag addr:$src2)), - (_.VT _.RC:$src1)))]>, - EVEX_4V, EVEX_K, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>; + []>, EVEX_4V, EVEX_K, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>; - let mayLoad = 1, hasSideEffects = 0 in def rmb : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst), (ins _.RC:$src1, _.ScalarMemOp:$src2), !strconcat(OpcodeStr, "\t{${src2}", _.BroadcastStr, ", $src1, $dst|", "$dst, $src1, ${src2}", _.BroadcastStr, "}"), []>, EVEX_4V, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>; - + } } multiclass blendmask_dq <bits<8> opc, string OpcodeStr, @@ -1349,21 +1588,6 @@ defm VPBLENDMB : blendmask_bw <0x66, "vpblendmb", avx512vl_i8_info>; defm VPBLENDMW : blendmask_bw <0x66, "vpblendmw", avx512vl_i16_info>, VEX_W; -let Predicates = [HasAVX512, NoVLX] in { -def : Pat<(v8f32 (vselect (v8i1 VK8WM:$mask), (v8f32 VR256X:$src1), - (v8f32 VR256X:$src2))), - (EXTRACT_SUBREG - (v16f32 (VBLENDMPSZrrk (COPY_TO_REGCLASS VK8WM:$mask, VK16WM), - (v16f32 (SUBREG_TO_REG (i32 0), VR256X:$src2, sub_ymm)), - (v16f32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)))), sub_ymm)>; - -def : Pat<(v8i32 (vselect (v8i1 VK8WM:$mask), (v8i32 VR256X:$src1), - (v8i32 VR256X:$src2))), - (EXTRACT_SUBREG - (v16i32 (VPBLENDMDZrrk (COPY_TO_REGCLASS VK8WM:$mask, VK16WM), - (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src2, sub_ymm)), - (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)))), sub_ymm)>; -} //===----------------------------------------------------------------------===// // Compare Instructions //===----------------------------------------------------------------------===// @@ -1421,6 +1645,7 @@ multiclass avx512_cmp_scalar<X86VectorVTInfo _, SDNode OpNode, SDNode OpNodeRnd> }// let isAsmParserOnly = 1, hasSideEffects = 0 let isCodeGenOnly = 1 in { + let isCommutable = 1 in def rr : AVX512Ii8<0xC2, MRMSrcReg, (outs _.KRC:$dst), (ins _.FRC:$src1, _.FRC:$src2, AVXCC:$cc), !strconcat("vcmp${cc}", _.Suffix, @@ -1449,7 +1674,8 @@ let Predicates = [HasAVX512] in { } multiclass avx512_icmp_packed<bits<8> opc, string OpcodeStr, SDNode OpNode, - X86VectorVTInfo _> { + X86VectorVTInfo _, bit IsCommutable> { + let isCommutable = IsCommutable in def rr : AVX512BI<opc, MRMSrcReg, (outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), @@ -1480,8 +1706,8 @@ multiclass avx512_icmp_packed<bits<8> opc, string OpcodeStr, SDNode 
OpNode, } multiclass avx512_icmp_packed_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode, - X86VectorVTInfo _> : - avx512_icmp_packed<opc, OpcodeStr, OpNode, _> { + X86VectorVTInfo _, bit IsCommutable> : + avx512_icmp_packed<opc, OpcodeStr, OpNode, _, IsCommutable> { def rmb : AVX512BI<opc, MRMSrcMem, (outs _.KRC:$dst), (ins _.RC:$src1, _.ScalarMemOp:$src2), !strconcat(OpcodeStr, "\t{${src2}", _.BroadcastStr, ", $src1, $dst", @@ -1503,48 +1729,49 @@ multiclass avx512_icmp_packed_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode, } multiclass avx512_icmp_packed_vl<bits<8> opc, string OpcodeStr, SDNode OpNode, - AVX512VLVectorVTInfo VTInfo, Predicate prd> { + AVX512VLVectorVTInfo VTInfo, Predicate prd, + bit IsCommutable = 0> { let Predicates = [prd] in - defm Z : avx512_icmp_packed<opc, OpcodeStr, OpNode, VTInfo.info512>, - EVEX_V512; + defm Z : avx512_icmp_packed<opc, OpcodeStr, OpNode, VTInfo.info512, + IsCommutable>, EVEX_V512; let Predicates = [prd, HasVLX] in { - defm Z256 : avx512_icmp_packed<opc, OpcodeStr, OpNode, VTInfo.info256>, - EVEX_V256; - defm Z128 : avx512_icmp_packed<opc, OpcodeStr, OpNode, VTInfo.info128>, - EVEX_V128; + defm Z256 : avx512_icmp_packed<opc, OpcodeStr, OpNode, VTInfo.info256, + IsCommutable>, EVEX_V256; + defm Z128 : avx512_icmp_packed<opc, OpcodeStr, OpNode, VTInfo.info128, + IsCommutable>, EVEX_V128; } } multiclass avx512_icmp_packed_rmb_vl<bits<8> opc, string OpcodeStr, SDNode OpNode, AVX512VLVectorVTInfo VTInfo, - Predicate prd> { + Predicate prd, bit IsCommutable = 0> { let Predicates = [prd] in - defm Z : avx512_icmp_packed_rmb<opc, OpcodeStr, OpNode, VTInfo.info512>, - EVEX_V512; + defm Z : avx512_icmp_packed_rmb<opc, OpcodeStr, OpNode, VTInfo.info512, + IsCommutable>, EVEX_V512; let Predicates = [prd, HasVLX] in { - defm Z256 : avx512_icmp_packed_rmb<opc, OpcodeStr, OpNode, VTInfo.info256>, - EVEX_V256; - defm Z128 : avx512_icmp_packed_rmb<opc, OpcodeStr, OpNode, VTInfo.info128>, - EVEX_V128; + defm Z256 : avx512_icmp_packed_rmb<opc, OpcodeStr, OpNode, VTInfo.info256, + IsCommutable>, EVEX_V256; + defm Z128 : avx512_icmp_packed_rmb<opc, OpcodeStr, OpNode, VTInfo.info128, + IsCommutable>, EVEX_V128; } } defm VPCMPEQB : avx512_icmp_packed_vl<0x74, "vpcmpeqb", X86pcmpeqm, - avx512vl_i8_info, HasBWI>, + avx512vl_i8_info, HasBWI, 1>, EVEX_CD8<8, CD8VF>; defm VPCMPEQW : avx512_icmp_packed_vl<0x75, "vpcmpeqw", X86pcmpeqm, - avx512vl_i16_info, HasBWI>, + avx512vl_i16_info, HasBWI, 1>, EVEX_CD8<16, CD8VF>; defm VPCMPEQD : avx512_icmp_packed_rmb_vl<0x76, "vpcmpeqd", X86pcmpeqm, - avx512vl_i32_info, HasAVX512>, + avx512vl_i32_info, HasAVX512, 1>, EVEX_CD8<32, CD8VF>; defm VPCMPEQQ : avx512_icmp_packed_rmb_vl<0x29, "vpcmpeqq", X86pcmpeqm, - avx512vl_i64_info, HasAVX512>, + avx512vl_i64_info, HasAVX512, 1>, T8PD, VEX_W, EVEX_CD8<64, CD8VF>; defm VPCMPGTB : avx512_icmp_packed_vl<0x64, "vpcmpgtb", X86pcmpgtm, @@ -1563,18 +1790,21 @@ defm VPCMPGTQ : avx512_icmp_packed_rmb_vl<0x37, "vpcmpgtq", X86pcmpgtm, avx512vl_i64_info, HasAVX512>, T8PD, VEX_W, EVEX_CD8<64, CD8VF>; +let Predicates = [HasAVX512, NoVLX] in { def : Pat<(v8i1 (X86pcmpgtm (v8i32 VR256X:$src1), (v8i32 VR256X:$src2))), (COPY_TO_REGCLASS (VPCMPGTDZrr - (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)), - (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src2, sub_ymm))), VK8)>; + (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), + (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm))), VK8)>; def : Pat<(v8i1 (X86pcmpeqm (v8i32 VR256X:$src1), (v8i32 VR256X:$src2))), (COPY_TO_REGCLASS 
(VPCMPEQDZrr - (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)), - (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src2, sub_ymm))), VK8)>; + (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), + (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm))), VK8)>; +} multiclass avx512_icmp_cc<bits<8> opc, string Suffix, SDNode OpNode, X86VectorVTInfo _> { + let isCommutable = 1 in def rri : AVX512AIi8<opc, MRMSrcReg, (outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2, AVX512ICC:$cc), !strconcat("vpcmp${cc}", Suffix, @@ -1740,7 +1970,7 @@ multiclass avx512_vcmp_common<X86VectorVTInfo _> { "$src2, $src1", "$src1, $src2", (X86cmpm (_.VT _.RC:$src1), (_.VT _.RC:$src2), - imm:$cc)>; + imm:$cc), 1>; defm rmi : AVX512_maskable_cmp<0xC2, MRMSrcMem, _, (outs _.KRC:$dst),(ins _.RC:$src1, _.MemOp:$src2, AVXCC:$cc), @@ -1824,18 +2054,18 @@ defm VCMPPS : avx512_vcmp<avx512vl_f32_info>, def : Pat<(v8i1 (X86cmpm (v8f32 VR256X:$src1), (v8f32 VR256X:$src2), imm:$cc)), (COPY_TO_REGCLASS (VCMPPSZrri - (v16f32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)), - (v16f32 (SUBREG_TO_REG (i32 0), VR256X:$src2, sub_ymm)), + (v16f32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), + (v16f32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm)), imm:$cc), VK8)>; def : Pat<(v8i1 (X86cmpm (v8i32 VR256X:$src1), (v8i32 VR256X:$src2), imm:$cc)), (COPY_TO_REGCLASS (VPCMPDZrri - (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)), - (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src2, sub_ymm)), + (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), + (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm)), imm:$cc), VK8)>; def : Pat<(v8i1 (X86cmpmu (v8i32 VR256X:$src1), (v8i32 VR256X:$src2), imm:$cc)), (COPY_TO_REGCLASS (VPCMPUDZrri - (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)), - (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src2, sub_ymm)), + (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), + (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm)), imm:$cc), VK8)>; // ---------------------------------------------------------------- @@ -2011,34 +2241,38 @@ let Predicates = [HasBWI] in { } // GR from/to mask register -let Predicates = [HasDQI] in { - def : Pat<(v8i1 (bitconvert (i8 GR8:$src))), - (KMOVBkr (SUBREG_TO_REG (i32 0), GR8:$src, sub_8bit))>; - def : Pat<(i8 (bitconvert (v8i1 VK8:$src))), - (EXTRACT_SUBREG (KMOVBrk VK8:$src), sub_8bit)>; - def : Pat<(i32 (zext (i8 (bitconvert (v8i1 VK8:$src))))), - (KMOVBrk VK8:$src)>; - def : Pat<(i32 (anyext (i8 (bitconvert (v8i1 VK8:$src))))), - (KMOVBrk VK8:$src)>; -} -let Predicates = [HasAVX512] in { - def : Pat<(v16i1 (bitconvert (i16 GR16:$src))), - (KMOVWkr (SUBREG_TO_REG (i32 0), GR16:$src, sub_16bit))>; - def : Pat<(i16 (bitconvert (v16i1 VK16:$src))), - (EXTRACT_SUBREG (KMOVWrk VK16:$src), sub_16bit)>; - def : Pat<(i32 (zext (i16 (bitconvert (v16i1 VK16:$src))))), - (KMOVWrk VK16:$src)>; - def : Pat<(i32 (anyext (i16 (bitconvert (v16i1 VK16:$src))))), - (KMOVWrk VK16:$src)>; -} -let Predicates = [HasBWI] in { - def : Pat<(v32i1 (bitconvert (i32 GR32:$src))), (KMOVDkr GR32:$src)>; - def : Pat<(i32 (bitconvert (v32i1 VK32:$src))), (KMOVDrk VK32:$src)>; -} -let Predicates = [HasBWI] in { - def : Pat<(v64i1 (bitconvert (i64 GR64:$src))), (KMOVQkr GR64:$src)>; - def : Pat<(i64 (bitconvert (v64i1 VK64:$src))), (KMOVQrk VK64:$src)>; -} +def : Pat<(v16i1 (bitconvert (i16 GR16:$src))), + (COPY_TO_REGCLASS GR16:$src, VK16)>; +def : Pat<(i16 (bitconvert (v16i1 VK16:$src))), + (COPY_TO_REGCLASS VK16:$src, GR16)>; + +def 
: Pat<(v8i1 (bitconvert (i8 GR8:$src))), + (COPY_TO_REGCLASS GR8:$src, VK8)>; +def : Pat<(i8 (bitconvert (v8i1 VK8:$src))), + (COPY_TO_REGCLASS VK8:$src, GR8)>; + +def : Pat<(i32 (zext (i16 (bitconvert (v16i1 VK16:$src))))), + (KMOVWrk VK16:$src)>; +def : Pat<(i32 (anyext (i16 (bitconvert (v16i1 VK16:$src))))), + (i32 (INSERT_SUBREG (IMPLICIT_DEF), + (i16 (COPY_TO_REGCLASS VK16:$src, GR16)), sub_16bit))>; + +def : Pat<(i32 (zext (i8 (bitconvert (v8i1 VK8:$src))))), + (MOVZX32rr8 (COPY_TO_REGCLASS VK8:$src, GR8))>, Requires<[NoDQI]>; +def : Pat<(i32 (zext (i8 (bitconvert (v8i1 VK8:$src))))), + (KMOVBrk VK8:$src)>, Requires<[HasDQI]>; +def : Pat<(i32 (anyext (i8 (bitconvert (v8i1 VK8:$src))))), + (i32 (INSERT_SUBREG (IMPLICIT_DEF), + (i8 (COPY_TO_REGCLASS VK8:$src, GR8)), sub_8bit))>; + +def : Pat<(v32i1 (bitconvert (i32 GR32:$src))), + (COPY_TO_REGCLASS GR32:$src, VK32)>; +def : Pat<(i32 (bitconvert (v32i1 VK32:$src))), + (COPY_TO_REGCLASS VK32:$src, GR32)>; +def : Pat<(v64i1 (bitconvert (i64 GR64:$src))), + (COPY_TO_REGCLASS GR64:$src, VK64)>; +def : Pat<(i64 (bitconvert (v64i1 VK64:$src))), + (COPY_TO_REGCLASS VK64:$src, GR64)>; // Load/store kreg let Predicates = [HasDQI] in { @@ -2104,65 +2338,58 @@ let Predicates = [HasBWI] in { (KMOVQkm addr:$src)>; } -def assertzext_i1 : PatFrag<(ops node:$src), (assertzext node:$src), [{ - return cast<VTSDNode>(N->getOperand(1))->getVT() == MVT::i1; -}]>; - let Predicates = [HasAVX512] in { def : Pat<(i1 (trunc (i64 GR64:$src))), - (COPY_TO_REGCLASS (i16 (EXTRACT_SUBREG (AND64ri8 $src, (i64 1)), - sub_16bit)), VK1)>; - - def : Pat<(i1 (trunc (i64 (assertzext_i1 GR64:$src)))), - (COPY_TO_REGCLASS (i16 (EXTRACT_SUBREG $src, sub_16bit)), VK1)>; + (COPY_TO_REGCLASS (KMOVWkr (AND32ri8 (EXTRACT_SUBREG $src, sub_32bit), + (i32 1))), VK1)>; def : Pat<(i1 (trunc (i32 GR32:$src))), - (COPY_TO_REGCLASS (i16 (EXTRACT_SUBREG (AND32ri8 $src, (i32 1)), - sub_16bit)), VK1)>; + (COPY_TO_REGCLASS (KMOVWkr (AND32ri8 $src, (i32 1))), VK1)>; def : Pat<(i1 (trunc (i32 (assertzext_i1 GR32:$src)))), - (COPY_TO_REGCLASS (i16 (EXTRACT_SUBREG $src, sub_16bit)), VK1)>; + (COPY_TO_REGCLASS GR32:$src, VK1)>; def : Pat<(i1 (trunc (i8 GR8:$src))), - (COPY_TO_REGCLASS (i16 (SUBREG_TO_REG (i64 0), (AND8ri $src, (i8 1)), - sub_8bit)), VK1)>; - - def : Pat<(i1 (trunc (i8 (assertzext_i1 GR8:$src)))), - (COPY_TO_REGCLASS (i16 (SUBREG_TO_REG (i64 0), $src, sub_8bit)), VK1)>; + (COPY_TO_REGCLASS + (KMOVWkr (AND32ri8 (INSERT_SUBREG (i32 (IMPLICIT_DEF)), + GR8:$src, sub_8bit), (i32 1))), + VK1)>; def : Pat<(i1 (trunc (i16 GR16:$src))), - (COPY_TO_REGCLASS (AND16ri GR16:$src, (i16 1)), VK1)>; - - def : Pat<(i1 (trunc (i16 (assertzext_i1 GR16:$src)))), - (COPY_TO_REGCLASS $src, VK1)>; + (COPY_TO_REGCLASS + (KMOVWkr (AND32ri8 (INSERT_SUBREG (i32 (IMPLICIT_DEF)), + GR16:$src, sub_16bit), (i32 1))), + VK1)>; def : Pat<(i32 (zext VK1:$src)), - (i32 (SUBREG_TO_REG (i64 0), (i16 (COPY_TO_REGCLASS $src, GR16)), - sub_16bit))>; + (AND32ri8 (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)), (i32 1))>; def : Pat<(i32 (anyext VK1:$src)), - (i32 (SUBREG_TO_REG (i64 0), (i16 (COPY_TO_REGCLASS $src, GR16)), - sub_16bit))>; + (COPY_TO_REGCLASS VK1:$src, GR32)>; def : Pat<(i8 (zext VK1:$src)), - (i8 (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS VK1:$src, GR16)), sub_8bit))>; + (EXTRACT_SUBREG + (AND32ri8 (KMOVWrk + (COPY_TO_REGCLASS VK1:$src, VK16)), (i32 1)), sub_8bit)>; def : Pat<(i8 (anyext VK1:$src)), - (i8 (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS $src, GR16)), sub_8bit))>; + (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS 
VK1:$src, GR32)), sub_8bit)>; def : Pat<(i64 (zext VK1:$src)), - (i64 (SUBREG_TO_REG (i64 0), (i16 (COPY_TO_REGCLASS $src, GR16)), - sub_16bit))>; + (AND64ri8 (SUBREG_TO_REG (i64 0), + (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)), sub_32bit), (i64 1))>; def : Pat<(i64 (anyext VK1:$src)), - (i64 (SUBREG_TO_REG (i64 0), (i16 (COPY_TO_REGCLASS $src, GR16)), - sub_16bit))>; + (INSERT_SUBREG (i64 (IMPLICIT_DEF)), + (i32 (COPY_TO_REGCLASS VK1:$src, GR32)), sub_32bit)>; def : Pat<(i16 (zext VK1:$src)), - (COPY_TO_REGCLASS $src, GR16)>; + (EXTRACT_SUBREG + (AND32ri8 (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)), (i32 1)), + sub_16bit)>; def : Pat<(i16 (anyext VK1:$src)), - (i16 (COPY_TO_REGCLASS $src, GR16))>; + (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS VK1:$src, GR32)), sub_16bit)>; } def : Pat<(v16i1 (scalar_to_vector VK1:$src)), (COPY_TO_REGCLASS VK1:$src, VK16)>; @@ -2181,34 +2408,12 @@ def : Pat<(store (i1 -1), addr:$dst), (MOV8mi addr:$dst, (i8 1))>; def : Pat<(store (i1 1), addr:$dst), (MOV8mi addr:$dst, (i8 1))>; def : Pat<(store (i1 0), addr:$dst), (MOV8mi addr:$dst, (i8 0))>; -// With AVX-512 only, 8-bit mask is promoted to 16-bit mask. -let Predicates = [HasAVX512, NoDQI] in { - // GR from/to 8-bit mask without native support - def : Pat<(v8i1 (bitconvert (i8 GR8:$src))), - (COPY_TO_REGCLASS - (KMOVWkr (SUBREG_TO_REG (i32 0), GR8:$src, sub_8bit)), VK8)>; - def : Pat<(i8 (bitconvert (v8i1 VK8:$src))), - (EXTRACT_SUBREG - (KMOVWrk (COPY_TO_REGCLASS VK8:$src, VK16)), - sub_8bit)>; - def : Pat<(i32 (zext (i8 (bitconvert (v8i1 VK8:$src))))), - (KMOVWrk (COPY_TO_REGCLASS VK8:$src, VK16))>; - def : Pat<(i32 (anyext (i8 (bitconvert (v8i1 VK8:$src))))), - (KMOVWrk (COPY_TO_REGCLASS VK8:$src, VK16))>; -} - -let Predicates = [HasAVX512] in { - def : Pat<(i1 (X86Vextract VK16:$src, (iPTR 0))), - (COPY_TO_REGCLASS VK16:$src, VK1)>; - def : Pat<(i1 (X86Vextract VK8:$src, (iPTR 0))), - (COPY_TO_REGCLASS VK8:$src, VK1)>; -} -let Predicates = [HasBWI] in { - def : Pat<(i1 (X86Vextract VK32:$src, (iPTR 0))), - (COPY_TO_REGCLASS VK32:$src, VK1)>; - def : Pat<(i1 (X86Vextract VK64:$src, (iPTR 0))), - (COPY_TO_REGCLASS VK64:$src, VK1)>; -} +def : Pat<(i1 (X86Vextract VK64:$src, (iPTR 0))), (COPY_TO_REGCLASS VK64:$src, VK1)>; +def : Pat<(i1 (X86Vextract VK32:$src, (iPTR 0))), (COPY_TO_REGCLASS VK32:$src, VK1)>; +def : Pat<(i1 (X86Vextract VK16:$src, (iPTR 0))), (COPY_TO_REGCLASS VK16:$src, VK1)>; +def : Pat<(i1 (X86Vextract VK8:$src, (iPTR 0))), (COPY_TO_REGCLASS VK8:$src, VK1)>; +def : Pat<(i1 (X86Vextract VK4:$src, (iPTR 0))), (COPY_TO_REGCLASS VK4:$src, VK1)>; +def : Pat<(i1 (X86Vextract VK2:$src, (iPTR 0))), (COPY_TO_REGCLASS VK2:$src, VK1)>; // Mask unary operation // - KNOT @@ -2233,7 +2438,7 @@ multiclass avx512_mask_unop_all<bits<8> opc, string OpcodeStr, HasBWI>, VEX, PS, VEX_W; } -defm KNOT : avx512_mask_unop_all<0x44, "knot", not>; +defm KNOT : avx512_mask_unop_all<0x44, "knot", vnot>; multiclass avx512_mask_unop_int<string IntName, string InstName> { let Predicates = [HasAVX512] in @@ -2244,27 +2449,15 @@ multiclass avx512_mask_unop_int<string IntName, string InstName> { } defm : avx512_mask_unop_int<"knot", "KNOT">; -let Predicates = [HasDQI] in -def : Pat<(xor VK8:$src1, (v8i1 immAllOnesV)), (KNOTBrr VK8:$src1)>; -let Predicates = [HasAVX512] in -def : Pat<(xor VK16:$src1, (v16i1 immAllOnesV)), (KNOTWrr VK16:$src1)>; -let Predicates = [HasBWI] in -def : Pat<(xor VK32:$src1, (v32i1 immAllOnesV)), (KNOTDrr VK32:$src1)>; -let Predicates = [HasBWI] in -def : Pat<(xor VK64:$src1, (v64i1 immAllOnesV)), 
(KNOTQrr VK64:$src1)>; - // KNL does not support KMOVB, 8-bit mask is promoted to 16-bit -let Predicates = [HasAVX512, NoDQI] in { -def : Pat<(xor VK8:$src1, (v8i1 immAllOnesV)), - (COPY_TO_REGCLASS (KNOTWrr (COPY_TO_REGCLASS VK8:$src1, VK16)), VK8)>; -def : Pat<(not VK8:$src), - (COPY_TO_REGCLASS - (KNOTWrr (COPY_TO_REGCLASS VK8:$src, VK16)), VK8)>; -} -def : Pat<(xor VK4:$src1, (v4i1 immAllOnesV)), - (COPY_TO_REGCLASS (KNOTWrr (COPY_TO_REGCLASS VK4:$src1, VK16)), VK4)>; -def : Pat<(xor VK2:$src1, (v2i1 immAllOnesV)), - (COPY_TO_REGCLASS (KNOTWrr (COPY_TO_REGCLASS VK2:$src1, VK16)), VK2)>; +let Predicates = [HasAVX512, NoDQI] in +def : Pat<(vnot VK8:$src), + (COPY_TO_REGCLASS (KNOTWrr (COPY_TO_REGCLASS VK8:$src, VK16)), VK8)>; + +def : Pat<(vnot VK4:$src), + (COPY_TO_REGCLASS (KNOTWrr (COPY_TO_REGCLASS VK4:$src, VK16)), VK4)>; +def : Pat<(vnot VK2:$src), + (COPY_TO_REGCLASS (KNOTWrr (COPY_TO_REGCLASS VK2:$src, VK16)), VK2)>; // Mask binary operation // - KAND, KANDN, KOR, KXNOR, KXOR @@ -2293,13 +2486,16 @@ multiclass avx512_mask_binop_all<bits<8> opc, string OpcodeStr, def andn : PatFrag<(ops node:$i0, node:$i1), (and (not node:$i0), node:$i1)>; def xnor : PatFrag<(ops node:$i0, node:$i1), (not (xor node:$i0, node:$i1))>; +// These nodes use 'vnot' instead of 'not' to support vectors. +def vandn : PatFrag<(ops node:$i0, node:$i1), (and (vnot node:$i0), node:$i1)>; +def vxnor : PatFrag<(ops node:$i0, node:$i1), (vnot (xor node:$i0, node:$i1))>; -defm KAND : avx512_mask_binop_all<0x41, "kand", and, 1>; -defm KOR : avx512_mask_binop_all<0x45, "kor", or, 1>; -defm KXNOR : avx512_mask_binop_all<0x46, "kxnor", xnor, 1>; -defm KXOR : avx512_mask_binop_all<0x47, "kxor", xor, 1>; -defm KANDN : avx512_mask_binop_all<0x42, "kandn", andn, 0>; -defm KADD : avx512_mask_binop_all<0x4A, "kadd", add, 1, HasDQI>; +defm KAND : avx512_mask_binop_all<0x41, "kand", and, 1>; +defm KOR : avx512_mask_binop_all<0x45, "kor", or, 1>; +defm KXNOR : avx512_mask_binop_all<0x46, "kxnor", vxnor, 1>; +defm KXOR : avx512_mask_binop_all<0x47, "kxor", xor, 1>; +defm KANDN : avx512_mask_binop_all<0x42, "kandn", vandn, 0>; +defm KADD : avx512_mask_binop_all<0x4A, "kadd", add, 1, HasDQI>; multiclass avx512_mask_binop_int<string IntName, string InstName> { let Predicates = [HasAVX512] in @@ -2316,11 +2512,12 @@ defm : avx512_mask_binop_int<"kor", "KOR">; defm : avx512_mask_binop_int<"kxnor", "KXNOR">; defm : avx512_mask_binop_int<"kxor", "KXOR">; -multiclass avx512_binop_pat<SDPatternOperator OpNode, Instruction Inst> { +multiclass avx512_binop_pat<SDPatternOperator VOpNode, SDPatternOperator OpNode, + Instruction Inst> { // With AVX512F, 8-bit mask is promoted to 16-bit mask, // for the DQI set, this type is legal and KxxxB instruction is used let Predicates = [NoDQI] in - def : Pat<(OpNode VK8:$src1, VK8:$src2), + def : Pat<(VOpNode VK8:$src1, VK8:$src2), (COPY_TO_REGCLASS (Inst (COPY_TO_REGCLASS VK8:$src1, VK16), (COPY_TO_REGCLASS VK8:$src2, VK16)), VK8)>; @@ -2330,47 +2527,21 @@ multiclass avx512_binop_pat<SDPatternOperator OpNode, Instruction Inst> { (COPY_TO_REGCLASS (Inst (COPY_TO_REGCLASS VK1:$src1, VK16), (COPY_TO_REGCLASS VK1:$src2, VK16)), VK1)>; - def : Pat<(OpNode VK2:$src1, VK2:$src2), + def : Pat<(VOpNode VK2:$src1, VK2:$src2), (COPY_TO_REGCLASS (Inst (COPY_TO_REGCLASS VK2:$src1, VK16), (COPY_TO_REGCLASS VK2:$src2, VK16)), VK1)>; - def : Pat<(OpNode VK4:$src1, VK4:$src2), + def : Pat<(VOpNode VK4:$src1, VK4:$src2), (COPY_TO_REGCLASS (Inst (COPY_TO_REGCLASS VK4:$src1, VK16), (COPY_TO_REGCLASS VK4:$src2, VK16)), 
VK1)>; } -defm : avx512_binop_pat<and, KANDWrr>; -defm : avx512_binop_pat<andn, KANDNWrr>; -defm : avx512_binop_pat<or, KORWrr>; -defm : avx512_binop_pat<xnor, KXNORWrr>; -defm : avx512_binop_pat<xor, KXORWrr>; - -def : Pat<(xor (xor VK16:$src1, VK16:$src2), (v16i1 immAllOnesV)), - (KXNORWrr VK16:$src1, VK16:$src2)>; -def : Pat<(xor (xor VK8:$src1, VK8:$src2), (v8i1 immAllOnesV)), - (KXNORBrr VK8:$src1, VK8:$src2)>, Requires<[HasDQI]>; -def : Pat<(xor (xor VK32:$src1, VK32:$src2), (v32i1 immAllOnesV)), - (KXNORDrr VK32:$src1, VK32:$src2)>, Requires<[HasBWI]>; -def : Pat<(xor (xor VK64:$src1, VK64:$src2), (v64i1 immAllOnesV)), - (KXNORQrr VK64:$src1, VK64:$src2)>, Requires<[HasBWI]>; - -let Predicates = [NoDQI] in -def : Pat<(xor (xor VK8:$src1, VK8:$src2), (v8i1 immAllOnesV)), - (COPY_TO_REGCLASS (KXNORWrr (COPY_TO_REGCLASS VK8:$src1, VK16), - (COPY_TO_REGCLASS VK8:$src2, VK16)), VK8)>; - -def : Pat<(xor (xor VK4:$src1, VK4:$src2), (v4i1 immAllOnesV)), - (COPY_TO_REGCLASS (KXNORWrr (COPY_TO_REGCLASS VK4:$src1, VK16), - (COPY_TO_REGCLASS VK4:$src2, VK16)), VK4)>; - -def : Pat<(xor (xor VK2:$src1, VK2:$src2), (v2i1 immAllOnesV)), - (COPY_TO_REGCLASS (KXNORWrr (COPY_TO_REGCLASS VK2:$src1, VK16), - (COPY_TO_REGCLASS VK2:$src2, VK16)), VK2)>; - -def : Pat<(xor (xor VK1:$src1, VK1:$src2), (i1 1)), - (COPY_TO_REGCLASS (KXNORWrr (COPY_TO_REGCLASS VK1:$src1, VK16), - (COPY_TO_REGCLASS VK1:$src2, VK16)), VK1)>; +defm : avx512_binop_pat<and, and, KANDWrr>; +defm : avx512_binop_pat<vandn, andn, KANDNWrr>; +defm : avx512_binop_pat<or, or, KORWrr>; +defm : avx512_binop_pat<vxnor, xnor, KXNORWrr>; +defm : avx512_binop_pat<xor, xor, KXORWrr>; // Mask unpacking multiclass avx512_mask_unpck<string Suffix,RegisterClass KRC, ValueType VT, @@ -2466,6 +2637,8 @@ defm KSET1 : avx512_mask_setop_w<immAllOnesV>; // With AVX-512 only, 8-bit mask is promoted to 16-bit mask. 
let Predicates = [HasAVX512] in { def : Pat<(v8i1 immAllZerosV), (COPY_TO_REGCLASS (KSET0W), VK8)>; + def : Pat<(v4i1 immAllZerosV), (COPY_TO_REGCLASS (KSET0W), VK4)>; + def : Pat<(v2i1 immAllZerosV), (COPY_TO_REGCLASS (KSET0W), VK2)>; def : Pat<(v8i1 immAllOnesV), (COPY_TO_REGCLASS (KSET1W), VK8)>; def : Pat<(v4i1 immAllOnesV), (COPY_TO_REGCLASS (KSET1W), VK4)>; def : Pat<(v2i1 immAllOnesV), (COPY_TO_REGCLASS (KSET1W), VK2)>; @@ -2519,15 +2692,24 @@ def : Pat<(v16i1 (extract_subvector (v32i1 VK32:$src), (iPTR 16))), def : Pat<(v32i1 (extract_subvector (v64i1 VK64:$src), (iPTR 32))), (v32i1 (COPY_TO_REGCLASS (KSHIFTRQri VK64:$src, (i8 32)), VK32))>; -def : Pat<(v8i1 (X86vshli VK8:$src, (i8 imm:$imm))), - (v8i1 (COPY_TO_REGCLASS - (KSHIFTLWri (COPY_TO_REGCLASS VK8:$src, VK16), - (I8Imm $imm)), VK8))>, Requires<[HasAVX512, NoDQI]>; -def : Pat<(v4i1 (X86vshli VK4:$src, (i8 imm:$imm))), - (v4i1 (COPY_TO_REGCLASS - (KSHIFTLWri (COPY_TO_REGCLASS VK4:$src, VK16), - (I8Imm $imm)), VK4))>, Requires<[HasAVX512]>; +// Patterns for kmask shift +multiclass mask_shift_lowering<RegisterClass RC, ValueType VT> { + def : Pat<(VT (X86vshli RC:$src, (i8 imm:$imm))), + (VT (COPY_TO_REGCLASS + (KSHIFTLWri (COPY_TO_REGCLASS RC:$src, VK16), + (I8Imm $imm)), + RC))>; + def : Pat<(VT (X86vsrli RC:$src, (i8 imm:$imm))), + (VT (COPY_TO_REGCLASS + (KSHIFTRWri (COPY_TO_REGCLASS RC:$src, VK16), + (I8Imm $imm)), + RC))>; +} + +defm : mask_shift_lowering<VK8, v8i1>, Requires<[HasAVX512, NoDQI]>; +defm : mask_shift_lowering<VK4, v4i1>, Requires<[HasAVX512]>; +defm : mask_shift_lowering<VK2, v2i1>, Requires<[HasAVX512]>; //===----------------------------------------------------------------------===// // AVX-512 - Aligned and unaligned load and store // @@ -2535,7 +2717,6 @@ def : Pat<(v4i1 (X86vshli VK4:$src, (i8 imm:$imm))), multiclass avx512_load<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, PatFrag ld_frag, PatFrag mload, - bit IsReMaterializable = 1, SDPatternOperator SelectOprr = vselect> { let hasSideEffects = 0 in { def rr : AVX512PI<opc, MRMSrcReg, (outs _.RC:$dst), (ins _.RC:$src), @@ -2545,12 +2726,12 @@ multiclass avx512_load<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, (ins _.KRCWM:$mask, _.RC:$src), !strconcat(OpcodeStr, "\t{$src, ${dst} {${mask}} {z}|", "${dst} {${mask}} {z}, $src}"), - [(set _.RC:$dst, (_.VT (vselect _.KRCWM:$mask, + [(set _.RC:$dst, (_.VT (SelectOprr _.KRCWM:$mask, (_.VT _.RC:$src), _.ImmAllZerosV)))], _.ExeDomain>, EVEX, EVEX_KZ; - let canFoldAsLoad = 1, isReMaterializable = IsReMaterializable, + let canFoldAsLoad = 1, isReMaterializable = 1, SchedRW = [WriteLoad] in def rm : AVX512PI<opc, MRMSrcMem, (outs _.RC:$dst), (ins _.MemOp:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), @@ -2598,37 +2779,32 @@ multiclass avx512_load<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, multiclass avx512_alignedload_vl<bits<8> opc, string OpcodeStr, AVX512VLVectorVTInfo _, - Predicate prd, - bit IsReMaterializable = 1> { + Predicate prd> { let Predicates = [prd] in defm Z : avx512_load<opc, OpcodeStr, _.info512, _.info512.AlignedLdFrag, - masked_load_aligned512, IsReMaterializable>, EVEX_V512; + masked_load_aligned512>, EVEX_V512; let Predicates = [prd, HasVLX] in { defm Z256 : avx512_load<opc, OpcodeStr, _.info256, _.info256.AlignedLdFrag, - masked_load_aligned256, IsReMaterializable>, EVEX_V256; + masked_load_aligned256>, EVEX_V256; defm Z128 : avx512_load<opc, OpcodeStr, _.info128, _.info128.AlignedLdFrag, - masked_load_aligned128, IsReMaterializable>, EVEX_V128; + 
masked_load_aligned128>, EVEX_V128; } } multiclass avx512_load_vl<bits<8> opc, string OpcodeStr, AVX512VLVectorVTInfo _, Predicate prd, - bit IsReMaterializable = 1, SDPatternOperator SelectOprr = vselect> { let Predicates = [prd] in defm Z : avx512_load<opc, OpcodeStr, _.info512, _.info512.LdFrag, - masked_load_unaligned, IsReMaterializable, - SelectOprr>, EVEX_V512; + masked_load_unaligned, SelectOprr>, EVEX_V512; let Predicates = [prd, HasVLX] in { defm Z256 : avx512_load<opc, OpcodeStr, _.info256, _.info256.LdFrag, - masked_load_unaligned, IsReMaterializable, - SelectOprr>, EVEX_V256; + masked_load_unaligned, SelectOprr>, EVEX_V256; defm Z128 : avx512_load<opc, OpcodeStr, _.info128, _.info128.LdFrag, - masked_load_unaligned, IsReMaterializable, - SelectOprr>, EVEX_V128; + masked_load_unaligned, SelectOprr>, EVEX_V128; } } @@ -2704,11 +2880,11 @@ defm VMOVAPD : avx512_alignedload_vl<0x28, "vmovapd", avx512vl_f64_info, HasAVX512>, PD, VEX_W, EVEX_CD8<64, CD8VF>; defm VMOVUPS : avx512_load_vl<0x10, "vmovups", avx512vl_f32_info, HasAVX512, - 1, null_frag>, + null_frag>, avx512_store_vl<0x11, "vmovups", avx512vl_f32_info, HasAVX512>, PS, EVEX_CD8<32, CD8VF>; -defm VMOVUPD : avx512_load_vl<0x10, "vmovupd", avx512vl_f64_info, HasAVX512, 0, +defm VMOVUPD : avx512_load_vl<0x10, "vmovupd", avx512vl_f64_info, HasAVX512, null_frag>, avx512_store_vl<0x11, "vmovupd", avx512vl_f64_info, HasAVX512>, PD, VEX_W, EVEX_CD8<64, CD8VF>; @@ -2732,15 +2908,41 @@ defm VMOVDQU16 : avx512_load_vl<0x6F, "vmovdqu16", avx512vl_i16_info, HasBWI>, HasBWI>, XD, VEX_W, EVEX_CD8<16, CD8VF>; defm VMOVDQU32 : avx512_load_vl<0x6F, "vmovdqu32", avx512vl_i32_info, HasAVX512, - 1, null_frag>, + null_frag>, avx512_store_vl<0x7F, "vmovdqu32", avx512vl_i32_info, HasAVX512>, XS, EVEX_CD8<32, CD8VF>; defm VMOVDQU64 : avx512_load_vl<0x6F, "vmovdqu64", avx512vl_i64_info, HasAVX512, - 1, null_frag>, + null_frag>, avx512_store_vl<0x7F, "vmovdqu64", avx512vl_i64_info, HasAVX512>, XS, VEX_W, EVEX_CD8<64, CD8VF>; +// Special instructions to help with spilling when we don't have VLX. We need +// to load or store from a ZMM register instead. These are converted in +// expandPostRAPseudos. +let isReMaterializable = 1, canFoldAsLoad = 1, + isPseudo = 1, SchedRW = [WriteLoad], mayLoad = 1, hasSideEffects = 0 in { +def VMOVAPSZ128rm_NOVLX : I<0, Pseudo, (outs VR128X:$dst), (ins f128mem:$src), + "", []>; +def VMOVAPSZ256rm_NOVLX : I<0, Pseudo, (outs VR256X:$dst), (ins f256mem:$src), + "", []>; +def VMOVUPSZ128rm_NOVLX : I<0, Pseudo, (outs VR128X:$dst), (ins f128mem:$src), + "", []>; +def VMOVUPSZ256rm_NOVLX : I<0, Pseudo, (outs VR256X:$dst), (ins f256mem:$src), + "", []>; +} + +let isPseudo = 1, mayStore = 1, hasSideEffects = 0 in { +def VMOVAPSZ128mr_NOVLX : I<0, Pseudo, (outs), (ins f128mem:$dst, VR128X:$src), + "", []>; +def VMOVAPSZ256mr_NOVLX : I<0, Pseudo, (outs), (ins f256mem:$dst, VR256X:$src), + "", []>; +def VMOVUPSZ128mr_NOVLX : I<0, Pseudo, (outs), (ins f128mem:$dst, VR128X:$src), + "", []>; +def VMOVUPSZ256mr_NOVLX : I<0, Pseudo, (outs), (ins f256mem:$dst, VR256X:$src), + "", []>; +} + def : Pat<(v8i64 (vselect VK8WM:$mask, (bc_v8i64 (v16i32 immAllZerosV)), (v8i64 VR512:$src))), (VMOVDQA64Zrrkz (COPY_TO_REGCLASS (KNOTWrr (COPY_TO_REGCLASS VK8:$mask, VK16)), @@ -2761,6 +2963,52 @@ def : Pat<(v16i32 (vselect (xor VK16:$mask, (v16i1 immAllOnesV)), (v16i32 VR512:$src))), (VMOVDQA32Zrrkz VK16WM:$mask, VR512:$src)>; +// Patterns for handling v8i1 selects of 256-bit vectors when VLX isn't +// available. 
Use a 512-bit operation and extract. +let Predicates = [HasAVX512, NoVLX] in { +def : Pat<(v8f32 (vselect (v8i1 VK8WM:$mask), (v8f32 VR256X:$src1), + (v8f32 VR256X:$src0))), + (EXTRACT_SUBREG + (v16f32 + (VMOVAPSZrrk + (v16f32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src0, sub_ymm)), + (COPY_TO_REGCLASS VK8WM:$mask, VK16WM), + (v16f32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)))), + sub_ymm)>; + +def : Pat<(v8i32 (vselect (v8i1 VK8WM:$mask), (v8i32 VR256X:$src1), + (v8i32 VR256X:$src0))), + (EXTRACT_SUBREG + (v16i32 + (VMOVDQA32Zrrk + (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src0, sub_ymm)), + (COPY_TO_REGCLASS VK8WM:$mask, VK16WM), + (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)))), + sub_ymm)>; +} + +let Predicates = [HasVLX, NoBWI] in { + // 128-bit load/store without BWI. + def : Pat<(alignedstore (v8i16 VR128X:$src), addr:$dst), + (VMOVDQA32Z128mr addr:$dst, VR128X:$src)>; + def : Pat<(alignedstore (v16i8 VR128X:$src), addr:$dst), + (VMOVDQA32Z128mr addr:$dst, VR128X:$src)>; + def : Pat<(store (v8i16 VR128X:$src), addr:$dst), + (VMOVDQU32Z128mr addr:$dst, VR128X:$src)>; + def : Pat<(store (v16i8 VR128X:$src), addr:$dst), + (VMOVDQU32Z128mr addr:$dst, VR128X:$src)>; + + // 256-bit load/store without BWI. + def : Pat<(alignedstore256 (v16i16 VR256X:$src), addr:$dst), + (VMOVDQA32Z256mr addr:$dst, VR256X:$src)>; + def : Pat<(alignedstore256 (v32i8 VR256X:$src), addr:$dst), + (VMOVDQA32Z256mr addr:$dst, VR256X:$src)>; + def : Pat<(store (v16i16 VR256X:$src), addr:$dst), + (VMOVDQU32Z256mr addr:$dst, VR256X:$src)>; + def : Pat<(store (v32i8 VR256X:$src), addr:$dst), + (VMOVDQU32Z256mr addr:$dst, VR256X:$src)>; +} + let Predicates = [HasVLX] in { // Special patterns for storing subvector extracts of lower 128-bits of 256. // Its cheaper to just use VMOVAPS/VMOVUPS instead of VEXTRACTF128mr @@ -2844,23 +3092,23 @@ let Predicates = [HasVLX] in { // Special patterns for storing subvector extracts of lower 256-bits of 512. 
// Its cheaper to just use VMOVAPS/VMOVUPS instead of VEXTRACTF128mr - def : Pat<(alignedstore (v4f64 (extract_subvector - (v8f64 VR512:$src), (iPTR 0))), addr:$dst), + def : Pat<(alignedstore256 (v4f64 (extract_subvector + (v8f64 VR512:$src), (iPTR 0))), addr:$dst), (VMOVAPDZ256mr addr:$dst, (v4f64 (EXTRACT_SUBREG VR512:$src,sub_ymm)))>; def : Pat<(alignedstore (v8f32 (extract_subvector (v16f32 VR512:$src), (iPTR 0))), addr:$dst), (VMOVAPSZ256mr addr:$dst, (v8f32 (EXTRACT_SUBREG VR512:$src,sub_ymm)))>; - def : Pat<(alignedstore (v4i64 (extract_subvector - (v8i64 VR512:$src), (iPTR 0))), addr:$dst), + def : Pat<(alignedstore256 (v4i64 (extract_subvector + (v8i64 VR512:$src), (iPTR 0))), addr:$dst), (VMOVDQA64Z256mr addr:$dst, (v4i64 (EXTRACT_SUBREG VR512:$src,sub_ymm)))>; - def : Pat<(alignedstore (v8i32 (extract_subvector - (v16i32 VR512:$src), (iPTR 0))), addr:$dst), + def : Pat<(alignedstore256 (v8i32 (extract_subvector + (v16i32 VR512:$src), (iPTR 0))), addr:$dst), (VMOVDQA32Z256mr addr:$dst, (v8i32 (EXTRACT_SUBREG VR512:$src,sub_ymm)))>; - def : Pat<(alignedstore (v16i16 (extract_subvector - (v32i16 VR512:$src), (iPTR 0))), addr:$dst), + def : Pat<(alignedstore256 (v16i16 (extract_subvector + (v32i16 VR512:$src), (iPTR 0))), addr:$dst), (VMOVDQA32Z256mr addr:$dst, (v16i16 (EXTRACT_SUBREG VR512:$src,sub_ymm)))>; - def : Pat<(alignedstore (v32i8 (extract_subvector - (v64i8 VR512:$src), (iPTR 0))), addr:$dst), + def : Pat<(alignedstore256 (v32i8 (extract_subvector + (v64i8 VR512:$src), (iPTR 0))), addr:$dst), (VMOVDQA32Z256mr addr:$dst, (v32i8 (EXTRACT_SUBREG VR512:$src,sub_ymm)))>; def : Pat<(store (v4f64 (extract_subvector @@ -2886,6 +3134,7 @@ let Predicates = [HasVLX] in { // Move Int Doubleword to Packed Double Int // +let ExeDomain = SSEPackedInt in { def VMOVDI2PDIZrr : AVX512BI<0x6E, MRMSrcReg, (outs VR128X:$dst), (ins GR32:$src), "vmovd\t{$src, $dst|$dst, $src}", [(set VR128X:$dst, @@ -2921,10 +3170,11 @@ def VMOVSDto64Zmr : AVX512BI<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64X:$ IIC_SSE_MOVDQ>, EVEX, VEX_W, Sched<[WriteStore]>, EVEX_CD8<64, CD8VT1>; } +} // ExeDomain = SSEPackedInt // Move Int Doubleword to Single Scalar // -let isCodeGenOnly = 1 in { +let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in { def VMOVDI2SSZrr : AVX512BI<0x6E, MRMSrcReg, (outs FR32X:$dst), (ins GR32:$src), "vmovd\t{$src, $dst|$dst, $src}", [(set FR32X:$dst, (bitconvert GR32:$src))], @@ -2934,10 +3184,11 @@ def VMOVDI2SSZrm : AVX512BI<0x6E, MRMSrcMem, (outs FR32X:$dst), (ins i32mem:$sr "vmovd\t{$src, $dst|$dst, $src}", [(set FR32X:$dst, (bitconvert (loadi32 addr:$src)))], IIC_SSE_MOVDQ>, EVEX, EVEX_CD8<32, CD8VT1>; -} +} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1 // Move doubleword from xmm register to r/m32 // +let ExeDomain = SSEPackedInt in { def VMOVPDI2DIZrr : AVX512BI<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128X:$src), "vmovd\t{$src, $dst|$dst, $src}", [(set GR32:$dst, (extractelt (v4i32 VR128X:$src), @@ -2949,9 +3200,11 @@ def VMOVPDI2DIZmr : AVX512BI<0x7E, MRMDestMem, (outs), [(store (i32 (extractelt (v4i32 VR128X:$src), (iPTR 0))), addr:$dst)], IIC_SSE_MOVDQ>, EVEX, EVEX_CD8<32, CD8VT1>; +} // ExeDomain = SSEPackedInt // Move quadword from xmm1 register to r/m64 // +let ExeDomain = SSEPackedInt in { def VMOVPQIto64Zrr : I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128X:$src), "vmovq\t{$src, $dst|$dst, $src}", [(set GR64:$dst, (extractelt (v2i64 VR128X:$src), @@ -2978,10 +3231,11 @@ def VMOVPQI2QIZrr : AVX512BI<0xD6, MRMDestReg, (outs VR128X:$dst), (ins VR128X:$src), 
"vmovq.s\t{$src, $dst|$dst, $src}",[]>, EVEX, VEX_W; +} // ExeDomain = SSEPackedInt // Move Scalar Single to Double Int // -let isCodeGenOnly = 1 in { +let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in { def VMOVSS2DIZrr : AVX512BI<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32X:$src), "vmovd\t{$src, $dst|$dst, $src}", @@ -2992,54 +3246,71 @@ def VMOVSS2DIZmr : AVX512BI<0x7E, MRMDestMem, (outs), "vmovd\t{$src, $dst|$dst, $src}", [(store (i32 (bitconvert FR32X:$src)), addr:$dst)], IIC_SSE_MOVDQ>, EVEX, EVEX_CD8<32, CD8VT1>; -} +} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1 // Move Quadword Int to Packed Quadword Int // +let ExeDomain = SSEPackedInt in { def VMOVQI2PQIZrm : AVX512XSI<0x7E, MRMSrcMem, (outs VR128X:$dst), (ins i64mem:$src), "vmovq\t{$src, $dst|$dst, $src}", [(set VR128X:$dst, (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>, EVEX, VEX_W, EVEX_CD8<8, CD8VT8>; +} // ExeDomain = SSEPackedInt //===----------------------------------------------------------------------===// // AVX-512 MOVSS, MOVSD //===----------------------------------------------------------------------===// -multiclass avx512_move_scalar <string asm, SDNode OpNode, +multiclass avx512_move_scalar<string asm, SDNode OpNode, X86VectorVTInfo _> { - defm rr_Int : AVX512_maskable_scalar<0x10, MRMSrcReg, _, (outs _.RC:$dst), - (ins _.RC:$src1, _.RC:$src2), - asm, "$src2, $src1","$src1, $src2", - (_.VT (OpNode (_.VT _.RC:$src1), - (_.VT _.RC:$src2))), - IIC_SSE_MOV_S_RR>, EVEX_4V; - let Constraints = "$src1 = $dst" in - defm rm_Int : AVX512_maskable_3src_scalar<0x10, MRMSrcMem, _, - (outs _.RC:$dst), - (ins _.ScalarMemOp:$src), - asm,"$src","$src", - (_.VT (OpNode (_.VT _.RC:$src1), - (_.VT (scalar_to_vector - (_.ScalarLdFrag addr:$src)))))>, EVEX; - let isCodeGenOnly = 1 in { - def rr : AVX512PI<0x10, MRMSrcReg, (outs _.RC:$dst), - (ins _.RC:$src1, _.FRC:$src2), - !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set _.RC:$dst, (_.VT (OpNode _.RC:$src1, - (scalar_to_vector _.FRC:$src2))))], - _.ExeDomain,IIC_SSE_MOV_S_RR>, EVEX_4V; - def rm : AVX512PI<0x10, MRMSrcMem, (outs _.FRC:$dst), (ins _.ScalarMemOp:$src), - !strconcat(asm, "\t{$src, $dst|$dst, $src}"), - [(set _.FRC:$dst, (_.ScalarLdFrag addr:$src))], - _.ExeDomain, IIC_SSE_MOV_S_RM>, EVEX; + def rr : AVX512PI<0x10, MRMSrcReg, (outs _.RC:$dst), + (ins _.RC:$src1, _.FRC:$src2), + !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set _.RC:$dst, (_.VT (OpNode _.RC:$src1, + (scalar_to_vector _.FRC:$src2))))], + _.ExeDomain,IIC_SSE_MOV_S_RR>, EVEX_4V; + def rrkz : AVX512PI<0x10, MRMSrcReg, (outs _.RC:$dst), + (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2), + !strconcat(asm, "\t{$src2, $src1, $dst {${mask}} {z}|", + "$dst {${mask}} {z}, $src1, $src2}"), + [(set _.RC:$dst, (_.VT (X86selects _.KRCWM:$mask, + (_.VT (OpNode _.RC:$src1, _.RC:$src2)), + _.ImmAllZerosV)))], + _.ExeDomain,IIC_SSE_MOV_S_RR>, EVEX_4V, EVEX_KZ; + let Constraints = "$src0 = $dst" in + def rrk : AVX512PI<0x10, MRMSrcReg, (outs _.RC:$dst), + (ins _.RC:$src0, _.KRCWM:$mask, _.RC:$src1, _.RC:$src2), + !strconcat(asm, "\t{$src2, $src1, $dst {${mask}}|", + "$dst {${mask}}, $src1, $src2}"), + [(set _.RC:$dst, (_.VT (X86selects _.KRCWM:$mask, + (_.VT (OpNode _.RC:$src1, _.RC:$src2)), + (_.VT _.RC:$src0))))], + _.ExeDomain,IIC_SSE_MOV_S_RR>, EVEX_4V, EVEX_K; + let canFoldAsLoad = 1, isReMaterializable = 1 in + def rm : AVX512PI<0x10, MRMSrcMem, (outs _.FRC:$dst), (ins _.ScalarMemOp:$src), + !strconcat(asm, "\t{$src, $dst|$dst, $src}"), + [(set _.FRC:$dst, (_.ScalarLdFrag 
addr:$src))], + _.ExeDomain, IIC_SSE_MOV_S_RM>, EVEX; + let mayLoad = 1, hasSideEffects = 0 in { + let Constraints = "$src0 = $dst" in + def rmk : AVX512PI<0x10, MRMSrcMem, (outs _.RC:$dst), + (ins _.RC:$src0, _.KRCWM:$mask, _.ScalarMemOp:$src), + !strconcat(asm, "\t{$src, $dst {${mask}}|", + "$dst {${mask}}, $src}"), + [], _.ExeDomain, IIC_SSE_MOV_S_RM>, EVEX, EVEX_K; + def rmkz : AVX512PI<0x10, MRMSrcMem, (outs _.RC:$dst), + (ins _.KRCWM:$mask, _.ScalarMemOp:$src), + !strconcat(asm, "\t{$src, $dst {${mask}} {z}|", + "$dst {${mask}} {z}, $src}"), + [], _.ExeDomain, IIC_SSE_MOV_S_RM>, EVEX, EVEX_KZ; } def mr: AVX512PI<0x11, MRMDestMem, (outs), (ins _.ScalarMemOp:$dst, _.FRC:$src), !strconcat(asm, "\t{$src, $dst|$dst, $src}"), [(store _.FRC:$src, addr:$dst)], _.ExeDomain, IIC_SSE_MOV_S_MR>, EVEX; - let mayStore = 1 in + let mayStore = 1, hasSideEffects = 0 in def mrk: AVX512PI<0x11, MRMDestMem, (outs), (ins _.ScalarMemOp:$dst, VK1WM:$mask, _.FRC:$src), !strconcat(asm, "\t{$src, $dst {${mask}}|$dst {${mask}}, $src}"), @@ -3052,12 +3323,99 @@ defm VMOVSSZ : avx512_move_scalar<"vmovss", X86Movss, f32x_info>, defm VMOVSDZ : avx512_move_scalar<"vmovsd", X86Movsd, f64x_info>, VEX_LIG, XD, VEX_W, EVEX_CD8<64, CD8VT1>; + +multiclass avx512_move_scalar_lowering<string InstrStr, SDNode OpNode, + PatLeaf ZeroFP, X86VectorVTInfo _> { + +def : Pat<(_.VT (OpNode _.RC:$src0, + (_.VT (scalar_to_vector + (_.EltVT (X86selects (i1 (trunc GR32:$mask)), + (_.EltVT _.FRC:$src1), + (_.EltVT _.FRC:$src2))))))), + (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr#rrk) + (COPY_TO_REGCLASS _.FRC:$src2, _.RC), + (COPY_TO_REGCLASS GR32:$mask, VK1WM), + (_.VT _.RC:$src0), + (COPY_TO_REGCLASS _.FRC:$src1, _.RC)), + _.RC)>; + +def : Pat<(_.VT (OpNode _.RC:$src0, + (_.VT (scalar_to_vector + (_.EltVT (X86selects (i1 (trunc GR32:$mask)), + (_.EltVT _.FRC:$src1), + (_.EltVT ZeroFP))))))), + (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr#rrkz) + (COPY_TO_REGCLASS GR32:$mask, VK1WM), + (_.VT _.RC:$src0), + (COPY_TO_REGCLASS _.FRC:$src1, _.RC)), + _.RC)>; + +} + +multiclass avx512_store_scalar_lowering<string InstrStr, AVX512VLVectorVTInfo _, + dag Mask, RegisterClass MaskRC> { + +def : Pat<(masked_store addr:$dst, Mask, + (_.info512.VT (insert_subvector undef, + (_.info256.VT (insert_subvector undef, + (_.info128.VT _.info128.RC:$src), + (i64 0))), + (i64 0)))), + (!cast<Instruction>(InstrStr#mrk) addr:$dst, + (i1 (COPY_TO_REGCLASS MaskRC:$mask, VK1WM)), + (COPY_TO_REGCLASS _.info128.RC:$src, _.info128.FRC))>; + +} + +multiclass avx512_load_scalar_lowering<string InstrStr, AVX512VLVectorVTInfo _, + dag Mask, RegisterClass MaskRC> { + +def : Pat<(_.info128.VT (extract_subvector + (_.info512.VT (masked_load addr:$srcAddr, Mask, + (_.info512.VT (bitconvert + (v16i32 immAllZerosV))))), + (i64 0))), + (!cast<Instruction>(InstrStr#rmkz) + (i1 (COPY_TO_REGCLASS MaskRC:$mask, VK1WM)), + addr:$srcAddr)>; + +def : Pat<(_.info128.VT (extract_subvector + (_.info512.VT (masked_load addr:$srcAddr, Mask, + (_.info512.VT (insert_subvector undef, + (_.info256.VT (insert_subvector undef, + (_.info128.VT (X86vzmovl _.info128.RC:$src)), + (i64 0))), + (i64 0))))), + (i64 0))), + (!cast<Instruction>(InstrStr#rmk) _.info128.RC:$src, + (i1 (COPY_TO_REGCLASS MaskRC:$mask, VK1WM)), + addr:$srcAddr)>; + +} + +defm : avx512_move_scalar_lowering<"VMOVSSZ", X86Movss, fp32imm0, v4f32x_info>; +defm : avx512_move_scalar_lowering<"VMOVSDZ", X86Movsd, fp64imm0, v2f64x_info>; + +defm : avx512_store_scalar_lowering<"VMOVSSZ", avx512vl_f32_info, + (v16i1 (bitconvert 
(i16 (trunc (and GR32:$mask, (i32 1)))))), GR32>; +defm : avx512_store_scalar_lowering<"VMOVSSZ", avx512vl_f32_info, + (v16i1 (bitconvert (i16 (and GR16:$mask, (i16 1))))), GR16>; +defm : avx512_store_scalar_lowering<"VMOVSDZ", avx512vl_f64_info, + (v8i1 (bitconvert (i8 (and GR8:$mask, (i8 1))))), GR8>; + +defm : avx512_load_scalar_lowering<"VMOVSSZ", avx512vl_f32_info, + (v16i1 (bitconvert (i16 (trunc (and GR32:$mask, (i32 1)))))), GR32>; +defm : avx512_load_scalar_lowering<"VMOVSSZ", avx512vl_f32_info, + (v16i1 (bitconvert (i16 (and GR16:$mask, (i16 1))))), GR16>; +defm : avx512_load_scalar_lowering<"VMOVSDZ", avx512vl_f64_info, + (v8i1 (bitconvert (i8 (and GR8:$mask, (i8 1))))), GR8>; + def : Pat<(f32 (X86selects VK1WM:$mask, (f32 FR32X:$src1), (f32 FR32X:$src2))), - (COPY_TO_REGCLASS (VMOVSSZrr_Intk (COPY_TO_REGCLASS FR32X:$src2, VR128X), + (COPY_TO_REGCLASS (VMOVSSZrrk (COPY_TO_REGCLASS FR32X:$src2, VR128X), VK1WM:$mask, (v4f32 (IMPLICIT_DEF)),(COPY_TO_REGCLASS FR32X:$src1, VR128X)), FR32X)>; def : Pat<(f64 (X86selects VK1WM:$mask, (f64 FR64X:$src1), (f64 FR64X:$src2))), - (COPY_TO_REGCLASS (VMOVSDZrr_Intk (COPY_TO_REGCLASS FR64X:$src2, VR128X), + (COPY_TO_REGCLASS (VMOVSDZrrk (COPY_TO_REGCLASS FR64X:$src2, VR128X), VK1WM:$mask, (v2f64 (IMPLICIT_DEF)), (COPY_TO_REGCLASS FR64X:$src1, VR128X)), FR64X)>; def : Pat<(int_x86_avx512_mask_store_ss addr:$dst, VR128X:$src, GR8:$mask), @@ -3088,6 +3446,7 @@ let Predicates = [HasAVX512] in { (VMOVSSZrr (v4i32 (V_SET0)), (COPY_TO_REGCLASS VR128X:$src, FR32X))>; def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64X:$src)))), (VMOVSDZrr (v2f64 (V_SET0)), FR64X:$src)>; + } // Move low f32 and clear high bits. def : Pat<(v8f32 (X86vzmovl (v8f32 VR256X:$src))), @@ -3097,8 +3456,15 @@ let Predicates = [HasAVX512] in { def : Pat<(v8i32 (X86vzmovl (v8i32 VR256X:$src))), (SUBREG_TO_REG (i32 0), (VMOVSSZrr (v4i32 (V_SET0)), - (EXTRACT_SUBREG (v8i32 VR256X:$src), sub_xmm)), sub_xmm)>; - } + (EXTRACT_SUBREG (v8i32 VR256X:$src), sub_xmm)), sub_xmm)>; + def : Pat<(v16f32 (X86vzmovl (v16f32 VR512:$src))), + (SUBREG_TO_REG (i32 0), + (VMOVSSZrr (v4f32 (V_SET0)), + (EXTRACT_SUBREG (v16f32 VR512:$src), sub_xmm)), sub_xmm)>; + def : Pat<(v16i32 (X86vzmovl (v16i32 VR512:$src))), + (SUBREG_TO_REG (i32 0), + (VMOVSSZrr (v4i32 (V_SET0)), + (EXTRACT_SUBREG (v16i32 VR512:$src), sub_xmm)), sub_xmm)>; let AddedComplexity = 20 in { // MOVSSrm zeros the high parts of the register; represent this @@ -3109,6 +3475,8 @@ let Predicates = [HasAVX512] in { (COPY_TO_REGCLASS (VMOVSSZrm addr:$src), VR128X)>; def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))), (COPY_TO_REGCLASS (VMOVSSZrm addr:$src), VR128X)>; + def : Pat<(v4f32 (X86vzload addr:$src)), + (COPY_TO_REGCLASS (VMOVSSZrm addr:$src), VR128X)>; // MOVSDrm zeros the high parts of the register; represent this // with SUBREG_TO_REG. 
The AVX versions also write: DST[255:128] <- 0 @@ -3131,6 +3499,8 @@ let Predicates = [HasAVX512] in { def : Pat<(v8f32 (X86vzmovl (insert_subvector undef, (v4f32 (scalar_to_vector (loadf32 addr:$src))), (iPTR 0)))), (SUBREG_TO_REG (i32 0), (VMOVSSZrm addr:$src), sub_xmm)>; + def : Pat<(v8f32 (X86vzload addr:$src)), + (SUBREG_TO_REG (i32 0), (VMOVSSZrm addr:$src), sub_xmm)>; def : Pat<(v4f64 (X86vzmovl (insert_subvector undef, (v2f64 (scalar_to_vector (loadf64 addr:$src))), (iPTR 0)))), (SUBREG_TO_REG (i32 0), (VMOVSDZrm addr:$src), sub_xmm)>; @@ -3145,6 +3515,8 @@ let Predicates = [HasAVX512] in { def : Pat<(v16f32 (X86vzmovl (insert_subvector undef, (v4f32 (scalar_to_vector (loadf32 addr:$src))), (iPTR 0)))), (SUBREG_TO_REG (i32 0), (VMOVSSZrm addr:$src), sub_xmm)>; + def : Pat<(v16f32 (X86vzload addr:$src)), + (SUBREG_TO_REG (i32 0), (VMOVSSZrm addr:$src), sub_xmm)>; def : Pat<(v8f64 (X86vzmovl (insert_subvector undef, (v2f64 (scalar_to_vector (loadf64 addr:$src))), (iPTR 0)))), (SUBREG_TO_REG (i32 0), (VMOVSDZrm addr:$src), sub_xmm)>; @@ -3168,10 +3540,17 @@ let Predicates = [HasAVX512] in { (SUBREG_TO_REG (i32 0), (VMOVSDZrr (v2f64 (V_SET0)), (EXTRACT_SUBREG (v4f64 VR256X:$src), sub_xmm)), sub_xmm)>; + def : Pat<(v8f64 (X86vzmovl (v8f64 VR512:$src))), + (SUBREG_TO_REG (i32 0), + (VMOVSDZrr (v2f64 (V_SET0)), + (EXTRACT_SUBREG (v8f64 VR512:$src), sub_xmm)), sub_xmm)>; def : Pat<(v4i64 (X86vzmovl (v4i64 VR256X:$src))), (SUBREG_TO_REG (i32 0), (VMOVSDZrr (v2i64 (V_SET0)), (EXTRACT_SUBREG (v4i64 VR256X:$src), sub_xmm)), sub_xmm)>; + def : Pat<(v8i64 (X86vzmovl (v8i64 VR512:$src))), + (SUBREG_TO_REG (i32 0), (VMOVSDZrr (v2i64 (V_SET0)), + (EXTRACT_SUBREG (v8i64 VR512:$src), sub_xmm)), sub_xmm)>; // Extract and store. def : Pat<(store (f32 (extractelt (v4f32 VR128X:$src), (iPTR 0))), @@ -3238,15 +3617,6 @@ def VMOVZPQILo2PQIZrr : AVX512XSI<0x7E, MRMSrcReg, (outs VR128X:$dst), (v2i64 VR128X:$src))))], IIC_SSE_MOVQ_RR>, EVEX, VEX_W; -let AddedComplexity = 20 , isCodeGenOnly = 1 in -def VMOVZPQILo2PQIZrm : AVX512XSI<0x7E, MRMSrcMem, (outs VR128X:$dst), - (ins i128mem:$src), - "vmovq\t{$src, $dst|$dst, $src}", - [(set VR128X:$dst, (v2i64 (X86vzmovl - (loadv2i64 addr:$src))))], - IIC_SSE_MOVDQ>, EVEX, VEX_W, - EVEX_CD8<8, CD8VT8>; - let Predicates = [HasAVX512] in { let AddedComplexity = 15 in { def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))), @@ -3258,34 +3628,46 @@ let Predicates = [HasAVX512] in { def : Pat<(v4i64 (X86vzmovl (insert_subvector undef, (v2i64 (scalar_to_vector GR64:$src)),(iPTR 0)))), (SUBREG_TO_REG (i64 0), (VMOV64toPQIZrr GR64:$src), sub_xmm)>; + + def : Pat<(v8i64 (X86vzmovl (insert_subvector undef, + (v2i64 (scalar_to_vector GR64:$src)),(iPTR 0)))), + (SUBREG_TO_REG (i64 0), (VMOV64toPQIZrr GR64:$src), sub_xmm)>; } // AVX 128-bit movd/movq instruction write zeros in the high 128-bit part. 
let AddedComplexity = 20 in { def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))), (VMOVDI2PDIZrm addr:$src)>; - def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv4f32 addr:$src)))), (VMOVDI2PDIZrm addr:$src)>; def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))), (VMOVDI2PDIZrm addr:$src)>; + def : Pat<(v4i32 (X86vzload addr:$src)), + (VMOVDI2PDIZrm addr:$src)>; + def : Pat<(v8i32 (X86vzload addr:$src)), + (SUBREG_TO_REG (i32 0), (VMOVDI2PDIZrm addr:$src), sub_xmm)>; def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))), - (VMOVZPQILo2PQIZrm addr:$src)>; + (VMOVQI2PQIZrm addr:$src)>; def : Pat<(v2f64 (X86vzmovl (v2f64 VR128X:$src))), - (VMOVZPQILo2PQIZrr VR128X:$src)>; + (VMOVZPQILo2PQIZrr VR128X:$src)>; def : Pat<(v2i64 (X86vzload addr:$src)), - (VMOVZPQILo2PQIZrm addr:$src)>; + (VMOVQI2PQIZrm addr:$src)>; def : Pat<(v4i64 (X86vzload addr:$src)), - (SUBREG_TO_REG (i64 0), (VMOVZPQILo2PQIZrm addr:$src), sub_xmm)>; + (SUBREG_TO_REG (i64 0), (VMOVQI2PQIZrm addr:$src), sub_xmm)>; } // Use regular 128-bit instructions to match 256-bit scalar_to_vec+zext. def : Pat<(v8i32 (X86vzmovl (insert_subvector undef, (v4i32 (scalar_to_vector GR32:$src)),(iPTR 0)))), (SUBREG_TO_REG (i32 0), (VMOVDI2PDIZrr GR32:$src), sub_xmm)>; + def : Pat<(v16i32 (X86vzmovl (insert_subvector undef, + (v4i32 (scalar_to_vector GR32:$src)),(iPTR 0)))), + (SUBREG_TO_REG (i32 0), (VMOVDI2PDIZrr GR32:$src), sub_xmm)>; // Use regular 128-bit instructions to match 512-bit scalar_to_vec+zext. + def : Pat<(v16i32 (X86vzload addr:$src)), + (SUBREG_TO_REG (i32 0), (VMOVDI2PDIZrm addr:$src), sub_xmm)>; def : Pat<(v8i64 (X86vzload addr:$src)), - (SUBREG_TO_REG (i64 0), (VMOVZPQILo2PQIZrm addr:$src), sub_xmm)>; + (SUBREG_TO_REG (i64 0), (VMOVQI2PQIZrm addr:$src), sub_xmm)>; } def : Pat<(v16i32 (X86Vinsert (v16i32 immAllZerosV), GR32:$src2, (iPTR 0))), @@ -3366,11 +3748,11 @@ let Predicates = [HasAVX512], AddedComplexity = 400 in { (VMOVNTDQAZrm addr:$src)>; def : Pat<(v8i64 (alignednontemporalload addr:$src)), (VMOVNTDQAZrm addr:$src)>; - def : Pat<(v16i32 (alignednontemporalload addr:$src)), + def : Pat<(v16i32 (bitconvert (v8i64 (alignednontemporalload addr:$src)))), (VMOVNTDQAZrm addr:$src)>; - def : Pat<(v32i16 (alignednontemporalload addr:$src)), + def : Pat<(v32i16 (bitconvert (v8i64 (alignednontemporalload addr:$src)))), (VMOVNTDQAZrm addr:$src)>; - def : Pat<(v64i8 (alignednontemporalload addr:$src)), + def : Pat<(v64i8 (bitconvert (v8i64 (alignednontemporalload addr:$src)))), (VMOVNTDQAZrm addr:$src)>; } @@ -3388,11 +3770,11 @@ let Predicates = [HasVLX], AddedComplexity = 400 in { (VMOVNTDQAZ256rm addr:$src)>; def : Pat<(v4i64 (alignednontemporalload addr:$src)), (VMOVNTDQAZ256rm addr:$src)>; - def : Pat<(v8i32 (alignednontemporalload addr:$src)), + def : Pat<(v8i32 (bitconvert (v2i64 (alignednontemporalload addr:$src)))), (VMOVNTDQAZ256rm addr:$src)>; - def : Pat<(v16i16 (alignednontemporalload addr:$src)), + def : Pat<(v16i16 (bitconvert (v2i64 (alignednontemporalload addr:$src)))), (VMOVNTDQAZ256rm addr:$src)>; - def : Pat<(v32i8 (alignednontemporalload addr:$src)), + def : Pat<(v32i8 (bitconvert (v2i64 (alignednontemporalload addr:$src)))), (VMOVNTDQAZ256rm addr:$src)>; def : Pat<(alignednontemporalstore (v4i32 VR128X:$src), addr:$dst), @@ -3408,11 +3790,11 @@ let Predicates = [HasVLX], AddedComplexity = 400 in { (VMOVNTDQAZ128rm addr:$src)>; def : Pat<(v2i64 (alignednontemporalload addr:$src)), (VMOVNTDQAZ128rm addr:$src)>; - def : Pat<(v4i32 (alignednontemporalload addr:$src)), + def : 
Pat<(v4i32 (bitconvert (v2i64 (alignednontemporalload addr:$src)))), (VMOVNTDQAZ128rm addr:$src)>; - def : Pat<(v8i16 (alignednontemporalload addr:$src)), + def : Pat<(v8i16 (bitconvert (v2i64 (alignednontemporalload addr:$src)))), (VMOVNTDQAZ128rm addr:$src)>; - def : Pat<(v16i8 (alignednontemporalload addr:$src)), + def : Pat<(v16i8 (bitconvert (v2i64 (alignednontemporalload addr:$src)))), (VMOVNTDQAZ128rm addr:$src)>; } @@ -3563,10 +3945,10 @@ multiclass avx512_binop_rm2<bits<8> opc, string OpcodeStr, OpndItins itins, AVX512BIBase, EVEX_4V; defm rmb : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst), - (ins _Src.RC:$src1, _Dst.ScalarMemOp:$src2), + (ins _Src.RC:$src1, _Brdct.ScalarMemOp:$src2), OpcodeStr, "${src2}"##_Brdct.BroadcastStr##", $src1", - "$src1, ${src2}"##_Dst.BroadcastStr, + "$src1, ${src2}"##_Brdct.BroadcastStr, (_Dst.VT (OpNode (_Src.VT _Src.RC:$src1), (bitconvert (_Brdct.VT (X86VBroadcast (_Brdct.ScalarLdFrag addr:$src2)))))), @@ -3646,13 +4028,14 @@ multiclass avx512_packs_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode, multiclass avx512_packs_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,X86VectorVTInfo _Src, - X86VectorVTInfo _Dst> { + X86VectorVTInfo _Dst, bit IsCommutable = 0> { defm rr : AVX512_maskable<opc, MRMSrcReg, _Dst, (outs _Dst.RC:$dst), (ins _Src.RC:$src1, _Src.RC:$src2), OpcodeStr, "$src2, $src1","$src1, $src2", (_Dst.VT (OpNode (_Src.VT _Src.RC:$src1), - (_Src.VT _Src.RC:$src2)))>, + (_Src.VT _Src.RC:$src2))), + NoItinerary, IsCommutable>, EVEX_CD8<_Src.EltSize, CD8VF>, EVEX_4V; defm rm : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst), (ins _Src.RC:$src1, _Src.MemOp:$src2), OpcodeStr, @@ -3695,15 +4078,15 @@ multiclass avx512_packs_all_i16_i8<bits<8> opc, string OpcodeStr, multiclass avx512_vpmadd<bits<8> opc, string OpcodeStr, SDNode OpNode, AVX512VLVectorVTInfo _Src, - AVX512VLVectorVTInfo _Dst> { + AVX512VLVectorVTInfo _Dst, bit IsCommutable = 0> { let Predicates = [HasBWI] in defm NAME#Z : avx512_packs_rm<opc, OpcodeStr, OpNode, _Src.info512, - _Dst.info512>, EVEX_V512; + _Dst.info512, IsCommutable>, EVEX_V512; let Predicates = [HasBWI, HasVLX] in { defm NAME#Z256 : avx512_packs_rm<opc, OpcodeStr, OpNode, _Src.info256, - _Dst.info256>, EVEX_V256; + _Dst.info256, IsCommutable>, EVEX_V256; defm NAME#Z128 : avx512_packs_rm<opc, OpcodeStr, OpNode, _Src.info128, - _Dst.info128>, EVEX_V128; + _Dst.info128, IsCommutable>, EVEX_V128; } } @@ -3715,7 +4098,7 @@ defm VPACKUSWB : avx512_packs_all_i16_i8 <0x67, "vpackuswb", X86Packus>, AVX512B defm VPMADDUBSW : avx512_vpmadd<0x04, "vpmaddubsw", X86vpmaddubsw, avx512vl_i8_info, avx512vl_i16_info>, AVX512BIBase, T8PD; defm VPMADDWD : avx512_vpmadd<0xF5, "vpmaddwd", X86vpmaddwd, - avx512vl_i16_info, avx512vl_i32_info>, AVX512BIBase; + avx512vl_i16_info, avx512vl_i32_info, 1>, AVX512BIBase; defm VPMAXSB : avx512_binop_rm_vl_b<0x3C, "vpmaxsb", smax, SSE_INTALU_ITINS_P, HasBWI, 1>, T8PD; @@ -3744,17 +4127,119 @@ defm VPMINUW : avx512_binop_rm_vl_w<0x3A, "vpminuw", umin, SSE_INTALU_ITINS_P, HasBWI, 1>, T8PD; defm VPMINU : avx512_binop_rm_vl_dq<0x3B, 0x3B, "vpminu", umin, SSE_INTALU_ITINS_P, HasAVX512, 1>, T8PD; + +// PMULLQ: Use 512bit version to implement 128/256 bit in case NoVLX. 
+let Predicates = [HasDQI, NoVLX] in { + def : Pat<(v4i64 (mul (v4i64 VR256X:$src1), (v4i64 VR256X:$src2))), + (EXTRACT_SUBREG + (VPMULLQZrr + (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src1, sub_ymm), + (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src2, sub_ymm)), + sub_ymm)>; + + def : Pat<(v2i64 (mul (v2i64 VR128X:$src1), (v2i64 VR128X:$src2))), + (EXTRACT_SUBREG + (VPMULLQZrr + (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src1, sub_xmm), + (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src2, sub_xmm)), + sub_xmm)>; +} + //===----------------------------------------------------------------------===// // AVX-512 Logical Instructions //===----------------------------------------------------------------------===// -defm VPAND : avx512_binop_rm_vl_dq<0xDB, 0xDB, "vpand", and, +multiclass avx512_logic_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _, OpndItins itins, + bit IsCommutable = 0> { + defm rr : AVX512_maskable_logic<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (_.i64VT (OpNode (bitconvert (_.VT _.RC:$src1)), + (bitconvert (_.VT _.RC:$src2)))), + (_.VT (bitconvert (_.i64VT (OpNode _.RC:$src1, + _.RC:$src2)))), + itins.rr, IsCommutable>, + AVX512BIBase, EVEX_4V; + + defm rm : AVX512_maskable_logic<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (_.i64VT (OpNode (bitconvert (_.VT _.RC:$src1)), + (bitconvert (_.LdFrag addr:$src2)))), + (_.VT (bitconvert (_.i64VT (OpNode _.RC:$src1, + (bitconvert (_.LdFrag addr:$src2)))))), + itins.rm>, + AVX512BIBase, EVEX_4V; +} + +multiclass avx512_logic_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _, OpndItins itins, + bit IsCommutable = 0> : + avx512_logic_rm<opc, OpcodeStr, OpNode, _, itins, IsCommutable> { + defm rmb : AVX512_maskable_logic<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr, + "${src2}"##_.BroadcastStr##", $src1", + "$src1, ${src2}"##_.BroadcastStr, + (_.i64VT (OpNode _.RC:$src1, + (bitconvert + (_.VT (X86VBroadcast + (_.ScalarLdFrag addr:$src2)))))), + (_.VT (bitconvert (_.i64VT (OpNode _.RC:$src1, + (bitconvert + (_.VT (X86VBroadcast + (_.ScalarLdFrag addr:$src2)))))))), + itins.rm>, + AVX512BIBase, EVEX_4V, EVEX_B; +} + +multiclass avx512_logic_rmb_vl<bits<8> opc, string OpcodeStr, SDNode OpNode, + AVX512VLVectorVTInfo VTInfo, OpndItins itins, + Predicate prd, bit IsCommutable = 0> { + let Predicates = [prd] in + defm Z : avx512_logic_rmb<opc, OpcodeStr, OpNode, VTInfo.info512, itins, + IsCommutable>, EVEX_V512; + + let Predicates = [prd, HasVLX] in { + defm Z256 : avx512_logic_rmb<opc, OpcodeStr, OpNode, VTInfo.info256, itins, + IsCommutable>, EVEX_V256; + defm Z128 : avx512_logic_rmb<opc, OpcodeStr, OpNode, VTInfo.info128, itins, + IsCommutable>, EVEX_V128; + } +} + +multiclass avx512_logic_rm_vl_d<bits<8> opc, string OpcodeStr, SDNode OpNode, + OpndItins itins, Predicate prd, + bit IsCommutable = 0> { + defm NAME : avx512_logic_rmb_vl<opc, OpcodeStr, OpNode, avx512vl_i32_info, + itins, prd, IsCommutable>, EVEX_CD8<32, CD8VF>; +} + +multiclass avx512_logic_rm_vl_q<bits<8> opc, string OpcodeStr, SDNode OpNode, + OpndItins itins, Predicate prd, + bit IsCommutable = 0> { + defm NAME : avx512_logic_rmb_vl<opc, OpcodeStr, OpNode, avx512vl_i64_info, + itins, prd, IsCommutable>, + VEX_W, EVEX_CD8<64, CD8VF>; +} + +multiclass avx512_logic_rm_vl_dq<bits<8> opc_d, bits<8> opc_q, string OpcodeStr, + SDNode 
OpNode, OpndItins itins, Predicate prd, + bit IsCommutable = 0> { + defm Q : avx512_logic_rm_vl_q<opc_q, OpcodeStr#"q", OpNode, itins, prd, + IsCommutable>; + + defm D : avx512_logic_rm_vl_d<opc_d, OpcodeStr#"d", OpNode, itins, prd, + IsCommutable>; +} + +defm VPAND : avx512_logic_rm_vl_dq<0xDB, 0xDB, "vpand", and, SSE_INTALU_ITINS_P, HasAVX512, 1>; -defm VPOR : avx512_binop_rm_vl_dq<0xEB, 0xEB, "vpor", or, +defm VPOR : avx512_logic_rm_vl_dq<0xEB, 0xEB, "vpor", or, SSE_INTALU_ITINS_P, HasAVX512, 1>; -defm VPXOR : avx512_binop_rm_vl_dq<0xEF, 0xEF, "vpxor", xor, +defm VPXOR : avx512_logic_rm_vl_dq<0xEF, 0xEF, "vpxor", xor, SSE_INTALU_ITINS_P, HasAVX512, 1>; -defm VPANDN : avx512_binop_rm_vl_dq<0xDF, 0xDF, "vpandn", X86andnp, +defm VPANDN : avx512_logic_rm_vl_dq<0xDF, 0xDF, "vpandn", X86andnp, SSE_INTALU_ITINS_P, HasAVX512, 0>; //===----------------------------------------------------------------------===// @@ -3763,13 +4248,13 @@ defm VPANDN : avx512_binop_rm_vl_dq<0xDF, 0xDF, "vpandn", X86andnp, multiclass avx512_fp_scalar<bits<8> opc, string OpcodeStr,X86VectorVTInfo _, SDNode OpNode, SDNode VecNode, OpndItins itins, bit IsCommutable> { - + let ExeDomain = _.ExeDomain in { defm rr_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src1, _.RC:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", (VecNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), (i32 FROUND_CURRENT)), - itins.rr, IsCommutable>; + itins.rr>; defm rm_Int : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr, @@ -3777,25 +4262,27 @@ multiclass avx512_fp_scalar<bits<8> opc, string OpcodeStr,X86VectorVTInfo _, (VecNode (_.VT _.RC:$src1), (_.VT (scalar_to_vector (_.ScalarLdFrag addr:$src2))), (i32 FROUND_CURRENT)), - itins.rm, IsCommutable>; - let isCodeGenOnly = 1, isCommutable = IsCommutable, - Predicates = [HasAVX512] in { + itins.rm>; + let isCodeGenOnly = 1, Predicates = [HasAVX512] in { def rr : I< opc, MRMSrcReg, (outs _.FRC:$dst), (ins _.FRC:$src1, _.FRC:$src2), OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set _.FRC:$dst, (OpNode _.FRC:$src1, _.FRC:$src2))], - itins.rr>; + itins.rr> { + let isCommutable = IsCommutable; + } def rm : I< opc, MRMSrcMem, (outs _.FRC:$dst), (ins _.FRC:$src1, _.ScalarMemOp:$src2), OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set _.FRC:$dst, (OpNode _.FRC:$src1, (_.ScalarLdFrag addr:$src2)))], itins.rm>; } + } } multiclass avx512_fp_scalar_round<bits<8> opc, string OpcodeStr,X86VectorVTInfo _, SDNode VecNode, OpndItins itins, bit IsCommutable = 0> { - + let ExeDomain = _.ExeDomain in defm rrb : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src1, _.RC:$src2, AVX512RC:$rc), OpcodeStr, "$rc, $src2, $src1", "$src1, $src2, $rc", @@ -3805,7 +4292,7 @@ multiclass avx512_fp_scalar_round<bits<8> opc, string OpcodeStr,X86VectorVTInfo } multiclass avx512_fp_scalar_sae<bits<8> opc, string OpcodeStr,X86VectorVTInfo _, SDNode VecNode, OpndItins itins, bit IsCommutable> { - + let ExeDomain = _.ExeDomain in defm rrb : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src1, _.RC:$src2), OpcodeStr, "{sae}, $src2, $src1", "$src1, $src2, {sae}", @@ -3843,9 +4330,9 @@ multiclass avx512_binop_s_sae<bits<8> opc, string OpcodeStr, SDNode OpNode, XD, VEX_W, EVEX_4V, VEX_LIG, EVEX_CD8<64, CD8VT1>; } defm VADD : avx512_binop_s_round<0x58, "vadd", fadd, X86faddRnd, SSE_ALU_ITINS_S, 1>; -defm VMUL : avx512_binop_s_round<0x59, "vmul", fmul, X86fmulRnd, SSE_ALU_ITINS_S, 1>; +defm 
VMUL : avx512_binop_s_round<0x59, "vmul", fmul, X86fmulRnd, SSE_MUL_ITINS_S, 1>; defm VSUB : avx512_binop_s_round<0x5C, "vsub", fsub, X86fsubRnd, SSE_ALU_ITINS_S, 0>; -defm VDIV : avx512_binop_s_round<0x5E, "vdiv", fdiv, X86fdivRnd, SSE_ALU_ITINS_S, 0>; +defm VDIV : avx512_binop_s_round<0x5E, "vdiv", fdiv, X86fdivRnd, SSE_DIV_ITINS_S, 0>; defm VMIN : avx512_binop_s_sae <0x5D, "vmin", X86fmin, X86fminRnd, SSE_ALU_ITINS_S, 0>; defm VMAX : avx512_binop_s_sae <0x5F, "vmax", X86fmax, X86fmaxRnd, SSE_ALU_ITINS_S, 0>; @@ -3853,12 +4340,14 @@ defm VMAX : avx512_binop_s_sae <0x5F, "vmax", X86fmax, X86fmaxRnd, SSE_ALU_ITIN // X86fminc and X86fmaxc instead of X86fmin and X86fmax multiclass avx512_comutable_binop_s<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, SDNode OpNode, OpndItins itins> { - let isCodeGenOnly = 1, isCommutable =1, Predicates = [HasAVX512] in { + let isCodeGenOnly = 1, Predicates = [HasAVX512] in { def rr : I< opc, MRMSrcReg, (outs _.FRC:$dst), (ins _.FRC:$src1, _.FRC:$src2), OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set _.FRC:$dst, (OpNode _.FRC:$src1, _.FRC:$src2))], - itins.rr>; + itins.rr> { + let isCommutable = 1; + } def rm : I< opc, MRMSrcMem, (outs _.FRC:$dst), (ins _.FRC:$src1, _.ScalarMemOp:$src2), OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", @@ -3882,27 +4371,35 @@ defm VMAXCSDZ : avx512_comutable_binop_s<0x5F, "vmaxsd", f64x_info, X86fmaxc, SSE_ALU_ITINS_S.d>, XD, VEX_W, EVEX_4V, VEX_LIG, EVEX_CD8<64, CD8VT1>; -multiclass avx512_fp_packed<bits<8> opc, string OpcodeStr, SDNode OpNode, - X86VectorVTInfo _, bit IsCommutable> { +multiclass avx512_fp_packed<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode, + X86VectorVTInfo _, OpndItins itins, + bit IsCommutable> { + let ExeDomain = _.ExeDomain, hasSideEffects = 0 in { defm rr: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src1, _.RC:$src2), OpcodeStr##_.Suffix, "$src2, $src1", "$src1, $src2", - (_.VT (OpNode _.RC:$src1, _.RC:$src2))>, EVEX_4V; - defm rm: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), - (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr##_.Suffix, - "$src2, $src1", "$src1, $src2", - (OpNode _.RC:$src1, (_.LdFrag addr:$src2))>, EVEX_4V; - defm rmb: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), - (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr##_.Suffix, - "${src2}"##_.BroadcastStr##", $src1", - "$src1, ${src2}"##_.BroadcastStr, - (OpNode _.RC:$src1, (_.VT (X86VBroadcast - (_.ScalarLdFrag addr:$src2))))>, - EVEX_4V, EVEX_B; + (_.VT (OpNode _.RC:$src1, _.RC:$src2)), itins.rr, + IsCommutable>, EVEX_4V; + let mayLoad = 1 in { + defm rm: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr##_.Suffix, + "$src2, $src1", "$src1, $src2", + (OpNode _.RC:$src1, (_.LdFrag addr:$src2)), itins.rm>, + EVEX_4V; + defm rmb: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr##_.Suffix, + "${src2}"##_.BroadcastStr##", $src1", + "$src1, ${src2}"##_.BroadcastStr, + (OpNode _.RC:$src1, (_.VT (X86VBroadcast + (_.ScalarLdFrag addr:$src2)))), + itins.rm>, EVEX_4V, EVEX_B; + } + } } -multiclass avx512_fp_round_packed<bits<8> opc, string OpcodeStr, SDNode OpNodeRnd, - X86VectorVTInfo _> { +multiclass avx512_fp_round_packed<bits<8> opc, string OpcodeStr, SDPatternOperator OpNodeRnd, + X86VectorVTInfo _> { + let ExeDomain = _.ExeDomain in defm rb: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src1, _.RC:$src2, AVX512RC:$rc), OpcodeStr##_.Suffix, "$rc, $src2, 
$src1", "$src1, $src2, $rc", @@ -3911,8 +4408,9 @@ multiclass avx512_fp_round_packed<bits<8> opc, string OpcodeStr, SDNode OpNodeRn } -multiclass avx512_fp_sae_packed<bits<8> opc, string OpcodeStr, SDNode OpNodeRnd, - X86VectorVTInfo _> { +multiclass avx512_fp_sae_packed<bits<8> opc, string OpcodeStr, SDPatternOperator OpNodeRnd, + X86VectorVTInfo _> { + let ExeDomain = _.ExeDomain in defm rb: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src1, _.RC:$src2), OpcodeStr##_.Suffix, "{sae}, $src2, $src1", "$src1, $src2, {sae}", @@ -3920,30 +4418,31 @@ multiclass avx512_fp_sae_packed<bits<8> opc, string OpcodeStr, SDNode OpNodeRnd, EVEX_4V, EVEX_B; } -multiclass avx512_fp_binop_p<bits<8> opc, string OpcodeStr, SDNode OpNode, - Predicate prd, bit IsCommutable = 0> { +multiclass avx512_fp_binop_p<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode, + Predicate prd, SizeItins itins, + bit IsCommutable = 0> { let Predicates = [prd] in { defm PSZ : avx512_fp_packed<opc, OpcodeStr, OpNode, v16f32_info, - IsCommutable>, EVEX_V512, PS, + itins.s, IsCommutable>, EVEX_V512, PS, EVEX_CD8<32, CD8VF>; defm PDZ : avx512_fp_packed<opc, OpcodeStr, OpNode, v8f64_info, - IsCommutable>, EVEX_V512, PD, VEX_W, + itins.d, IsCommutable>, EVEX_V512, PD, VEX_W, EVEX_CD8<64, CD8VF>; } // Define only if AVX512VL feature is present. let Predicates = [prd, HasVLX] in { defm PSZ128 : avx512_fp_packed<opc, OpcodeStr, OpNode, v4f32x_info, - IsCommutable>, EVEX_V128, PS, + itins.s, IsCommutable>, EVEX_V128, PS, EVEX_CD8<32, CD8VF>; defm PSZ256 : avx512_fp_packed<opc, OpcodeStr, OpNode, v8f32x_info, - IsCommutable>, EVEX_V256, PS, + itins.s, IsCommutable>, EVEX_V256, PS, EVEX_CD8<32, CD8VF>; defm PDZ128 : avx512_fp_packed<opc, OpcodeStr, OpNode, v2f64x_info, - IsCommutable>, EVEX_V128, PD, VEX_W, + itins.d, IsCommutable>, EVEX_V128, PD, VEX_W, EVEX_CD8<64, CD8VF>; defm PDZ256 : avx512_fp_packed<opc, OpcodeStr, OpNode, v4f64x_info, - IsCommutable>, EVEX_V256, PD, VEX_W, + itins.d, IsCommutable>, EVEX_V256, PD, VEX_W, EVEX_CD8<64, CD8VF>; } } @@ -3962,26 +4461,140 @@ multiclass avx512_fp_binop_p_sae<bits<8> opc, string OpcodeStr, SDNode OpNodeRnd EVEX_V512, PD, VEX_W,EVEX_CD8<64, CD8VF>; } -defm VADD : avx512_fp_binop_p<0x58, "vadd", fadd, HasAVX512, 1>, +defm VADD : avx512_fp_binop_p<0x58, "vadd", fadd, HasAVX512, + SSE_ALU_ITINS_P, 1>, avx512_fp_binop_p_round<0x58, "vadd", X86faddRnd>; -defm VMUL : avx512_fp_binop_p<0x59, "vmul", fmul, HasAVX512, 1>, +defm VMUL : avx512_fp_binop_p<0x59, "vmul", fmul, HasAVX512, + SSE_MUL_ITINS_P, 1>, avx512_fp_binop_p_round<0x59, "vmul", X86fmulRnd>; -defm VSUB : avx512_fp_binop_p<0x5C, "vsub", fsub, HasAVX512>, +defm VSUB : avx512_fp_binop_p<0x5C, "vsub", fsub, HasAVX512, SSE_ALU_ITINS_P>, avx512_fp_binop_p_round<0x5C, "vsub", X86fsubRnd>; -defm VDIV : avx512_fp_binop_p<0x5E, "vdiv", fdiv, HasAVX512>, +defm VDIV : avx512_fp_binop_p<0x5E, "vdiv", fdiv, HasAVX512, SSE_DIV_ITINS_P>, avx512_fp_binop_p_round<0x5E, "vdiv", X86fdivRnd>; -defm VMIN : avx512_fp_binop_p<0x5D, "vmin", X86fmin, HasAVX512, 0>, +defm VMIN : avx512_fp_binop_p<0x5D, "vmin", X86fmin, HasAVX512, + SSE_ALU_ITINS_P, 0>, avx512_fp_binop_p_sae<0x5D, "vmin", X86fminRnd>; -defm VMAX : avx512_fp_binop_p<0x5F, "vmax", X86fmax, HasAVX512, 0>, +defm VMAX : avx512_fp_binop_p<0x5F, "vmax", X86fmax, HasAVX512, + SSE_ALU_ITINS_P, 0>, avx512_fp_binop_p_sae<0x5F, "vmax", X86fmaxRnd>; let isCodeGenOnly = 1 in { - defm VMINC : avx512_fp_binop_p<0x5D, "vmin", X86fminc, HasAVX512, 1>; - defm VMAXC : avx512_fp_binop_p<0x5F, 
"vmax", X86fmaxc, HasAVX512, 1>; + defm VMINC : avx512_fp_binop_p<0x5D, "vmin", X86fminc, HasAVX512, + SSE_ALU_ITINS_P, 1>; + defm VMAXC : avx512_fp_binop_p<0x5F, "vmax", X86fmaxc, HasAVX512, + SSE_ALU_ITINS_P, 1>; +} +defm VAND : avx512_fp_binop_p<0x54, "vand", null_frag, HasDQI, + SSE_ALU_ITINS_P, 1>; +defm VANDN : avx512_fp_binop_p<0x55, "vandn", null_frag, HasDQI, + SSE_ALU_ITINS_P, 0>; +defm VOR : avx512_fp_binop_p<0x56, "vor", null_frag, HasDQI, + SSE_ALU_ITINS_P, 1>; +defm VXOR : avx512_fp_binop_p<0x57, "vxor", null_frag, HasDQI, + SSE_ALU_ITINS_P, 1>; + +// Patterns catch floating point selects with bitcasted integer logic ops. +multiclass avx512_fp_logical_lowering<string InstrStr, SDNode OpNode, + X86VectorVTInfo _, Predicate prd> { +let Predicates = [prd] in { + // Masked register-register logical operations. + def : Pat<(_.VT (vselect _.KRCWM:$mask, + (bitconvert (_.i64VT (OpNode _.RC:$src1, _.RC:$src2))), + _.RC:$src0)), + (!cast<Instruction>(InstrStr#rrk) _.RC:$src0, _.KRCWM:$mask, + _.RC:$src1, _.RC:$src2)>; + def : Pat<(_.VT (vselect _.KRCWM:$mask, + (bitconvert (_.i64VT (OpNode _.RC:$src1, _.RC:$src2))), + _.ImmAllZerosV)), + (!cast<Instruction>(InstrStr#rrkz) _.KRCWM:$mask, _.RC:$src1, + _.RC:$src2)>; + // Masked register-memory logical operations. + def : Pat<(_.VT (vselect _.KRCWM:$mask, + (bitconvert (_.i64VT (OpNode _.RC:$src1, + (load addr:$src2)))), + _.RC:$src0)), + (!cast<Instruction>(InstrStr#rmk) _.RC:$src0, _.KRCWM:$mask, + _.RC:$src1, addr:$src2)>; + def : Pat<(_.VT (vselect _.KRCWM:$mask, + (bitconvert (_.i64VT (OpNode _.RC:$src1, (load addr:$src2)))), + _.ImmAllZerosV)), + (!cast<Instruction>(InstrStr#rmkz) _.KRCWM:$mask, _.RC:$src1, + addr:$src2)>; + // Register-broadcast logical operations. + def : Pat<(_.i64VT (OpNode _.RC:$src1, + (bitconvert (_.VT (X86VBroadcast + (_.ScalarLdFrag addr:$src2)))))), + (!cast<Instruction>(InstrStr#rmb) _.RC:$src1, addr:$src2)>; + def : Pat<(_.VT (vselect _.KRCWM:$mask, + (bitconvert + (_.i64VT (OpNode _.RC:$src1, + (bitconvert (_.VT + (X86VBroadcast + (_.ScalarLdFrag addr:$src2))))))), + _.RC:$src0)), + (!cast<Instruction>(InstrStr#rmbk) _.RC:$src0, _.KRCWM:$mask, + _.RC:$src1, addr:$src2)>; + def : Pat<(_.VT (vselect _.KRCWM:$mask, + (bitconvert + (_.i64VT (OpNode _.RC:$src1, + (bitconvert (_.VT + (X86VBroadcast + (_.ScalarLdFrag addr:$src2))))))), + _.ImmAllZerosV)), + (!cast<Instruction>(InstrStr#rmbkz) _.KRCWM:$mask, + _.RC:$src1, addr:$src2)>; +} +} + +multiclass avx512_fp_logical_lowering_sizes<string InstrStr, SDNode OpNode> { + defm : avx512_fp_logical_lowering<InstrStr#DZ128, OpNode, v4f32x_info, HasVLX>; + defm : avx512_fp_logical_lowering<InstrStr#QZ128, OpNode, v2f64x_info, HasVLX>; + defm : avx512_fp_logical_lowering<InstrStr#DZ256, OpNode, v8f32x_info, HasVLX>; + defm : avx512_fp_logical_lowering<InstrStr#QZ256, OpNode, v4f64x_info, HasVLX>; + defm : avx512_fp_logical_lowering<InstrStr#DZ, OpNode, v16f32_info, HasAVX512>; + defm : avx512_fp_logical_lowering<InstrStr#QZ, OpNode, v8f64_info, HasAVX512>; +} + +defm : avx512_fp_logical_lowering_sizes<"VPAND", and>; +defm : avx512_fp_logical_lowering_sizes<"VPOR", or>; +defm : avx512_fp_logical_lowering_sizes<"VPXOR", xor>; +defm : avx512_fp_logical_lowering_sizes<"VPANDN", X86andnp>; + +let Predicates = [HasVLX,HasDQI] in { + // Use packed logical operations for scalar ops. 
+ def : Pat<(f64 (X86fand FR64X:$src1, FR64X:$src2)), + (COPY_TO_REGCLASS (VANDPDZ128rr + (COPY_TO_REGCLASS FR64X:$src1, VR128X), + (COPY_TO_REGCLASS FR64X:$src2, VR128X)), FR64X)>; + def : Pat<(f64 (X86for FR64X:$src1, FR64X:$src2)), + (COPY_TO_REGCLASS (VORPDZ128rr + (COPY_TO_REGCLASS FR64X:$src1, VR128X), + (COPY_TO_REGCLASS FR64X:$src2, VR128X)), FR64X)>; + def : Pat<(f64 (X86fxor FR64X:$src1, FR64X:$src2)), + (COPY_TO_REGCLASS (VXORPDZ128rr + (COPY_TO_REGCLASS FR64X:$src1, VR128X), + (COPY_TO_REGCLASS FR64X:$src2, VR128X)), FR64X)>; + def : Pat<(f64 (X86fandn FR64X:$src1, FR64X:$src2)), + (COPY_TO_REGCLASS (VANDNPDZ128rr + (COPY_TO_REGCLASS FR64X:$src1, VR128X), + (COPY_TO_REGCLASS FR64X:$src2, VR128X)), FR64X)>; + + def : Pat<(f32 (X86fand FR32X:$src1, FR32X:$src2)), + (COPY_TO_REGCLASS (VANDPSZ128rr + (COPY_TO_REGCLASS FR32X:$src1, VR128X), + (COPY_TO_REGCLASS FR32X:$src2, VR128X)), FR32X)>; + def : Pat<(f32 (X86for FR32X:$src1, FR32X:$src2)), + (COPY_TO_REGCLASS (VORPSZ128rr + (COPY_TO_REGCLASS FR32X:$src1, VR128X), + (COPY_TO_REGCLASS FR32X:$src2, VR128X)), FR32X)>; + def : Pat<(f32 (X86fxor FR32X:$src1, FR32X:$src2)), + (COPY_TO_REGCLASS (VXORPSZ128rr + (COPY_TO_REGCLASS FR32X:$src1, VR128X), + (COPY_TO_REGCLASS FR32X:$src2, VR128X)), FR32X)>; + def : Pat<(f32 (X86fandn FR32X:$src1, FR32X:$src2)), + (COPY_TO_REGCLASS (VANDNPSZ128rr + (COPY_TO_REGCLASS FR32X:$src1, VR128X), + (COPY_TO_REGCLASS FR32X:$src2, VR128X)), FR32X)>; } -defm VAND : avx512_fp_binop_p<0x54, "vand", X86fand, HasDQI, 1>; -defm VANDN : avx512_fp_binop_p<0x55, "vandn", X86fandn, HasDQI, 0>; -defm VOR : avx512_fp_binop_p<0x56, "vor", X86for, HasDQI, 1>; -defm VXOR : avx512_fp_binop_p<0x57, "vxor", X86fxor, HasDQI, 1>; multiclass avx512_fp_scalef_p<bits<8> opc, string OpcodeStr, SDNode OpNode, X86VectorVTInfo _> { @@ -4157,6 +4770,7 @@ defm VPTESTNM : avx512_vptest_all_forms<0x26, 0x27, "vptestnm", X86testnm>, T8X //===----------------------------------------------------------------------===// multiclass avx512_shift_rmi<bits<8> opc, Format ImmFormR, Format ImmFormM, string OpcodeStr, SDNode OpNode, X86VectorVTInfo _> { + let ExeDomain = _.ExeDomain in { defm ri : AVX512_maskable<opc, ImmFormR, _, (outs _.RC:$dst), (ins _.RC:$src1, u8imm:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", @@ -4168,10 +4782,12 @@ multiclass avx512_shift_rmi<bits<8> opc, Format ImmFormR, Format ImmFormM, (_.VT (OpNode (_.VT (bitconvert (_.LdFrag addr:$src1))), (i8 imm:$src2))), SSE_INTSHIFT_ITINS_P.rm>; + } } multiclass avx512_shift_rmbi<bits<8> opc, Format ImmFormM, string OpcodeStr, SDNode OpNode, X86VectorVTInfo _> { + let ExeDomain = _.ExeDomain in defm mbi : AVX512_maskable<opc, ImmFormM, _, (outs _.RC:$dst), (ins _.ScalarMemOp:$src1, u8imm:$src2), OpcodeStr, "$src2, ${src1}"##_.BroadcastStr, "${src1}"##_.BroadcastStr##", $src2", @@ -4182,6 +4798,7 @@ multiclass avx512_shift_rmbi<bits<8> opc, Format ImmFormM, multiclass avx512_shift_rrm<bits<8> opc, string OpcodeStr, SDNode OpNode, ValueType SrcVT, PatFrag bc_frag, X86VectorVTInfo _> { // src2 is always 128-bit + let ExeDomain = _.ExeDomain in { defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src1, VR128X:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", @@ -4193,6 +4810,7 @@ multiclass avx512_shift_rrm<bits<8> opc, string OpcodeStr, SDNode OpNode, (_.VT (OpNode _.RC:$src1, (bc_frag (loadv2i64 addr:$src2)))), SSE_INTSHIFT_ITINS_P.rm>, AVX512BIBase, EVEX_4V; + } } multiclass avx512_shift_sizes<bits<8> opc, string OpcodeStr, SDNode OpNode, @@ -4286,6 
+4904,7 @@ defm VPSRL : avx512_shift_types<0xD2, 0xD3, 0xD1, "vpsrl", X86vsrl>; //===-------------------------------------------------------------------===// multiclass avx512_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode, X86VectorVTInfo _> { + let ExeDomain = _.ExeDomain in { defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src1, _.RC:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", @@ -4298,10 +4917,12 @@ multiclass avx512_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode, (_.VT (bitconvert (_.LdFrag addr:$src2))))), SSE_INTSHIFT_ITINS_P.rm>, AVX5128IBase, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>; + } } multiclass avx512_var_shift_mb<bits<8> opc, string OpcodeStr, SDNode OpNode, X86VectorVTInfo _> { + let ExeDomain = _.ExeDomain in defm rmb : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr, "${src2}"##_.BroadcastStr##", $src1", @@ -4375,9 +4996,6 @@ defm VPSLLV : avx512_var_shift_types<0x47, "vpsllv", shl>, defm VPSRAV : avx512_var_shift_types<0x46, "vpsrav", sra>, avx512_var_shift_w<0x11, "vpsravw", sra>, avx512_var_shift_w_lowering<avx512vl_i16_info, sra>; -let isCodeGenOnly = 1 in - defm VPSRAV_Int : avx512_var_shift_types<0x46, "vpsrav", X86vsrav>, - avx512_var_shift_w<0x11, "vpsravw", X86vsrav>; defm VPSRLV : avx512_var_shift_types<0x45, "vpsrlv", srl>, avx512_var_shift_w<0x10, "vpsrlvw", srl>, @@ -4385,6 +5003,76 @@ defm VPSRLV : avx512_var_shift_types<0x45, "vpsrlv", srl>, defm VPRORV : avx512_var_shift_types<0x14, "vprorv", rotr>; defm VPROLV : avx512_var_shift_types<0x15, "vprolv", rotl>; +// Special handling for VPSRAV intrinsics. +multiclass avx512_var_shift_int_lowering<string InstrStr, X86VectorVTInfo _, + list<Predicate> p> { + let Predicates = p in { + def : Pat<(_.VT (X86vsrav _.RC:$src1, _.RC:$src2)), + (!cast<Instruction>(InstrStr#_.ZSuffix#rr) _.RC:$src1, + _.RC:$src2)>; + def : Pat<(_.VT (X86vsrav _.RC:$src1, (bitconvert (_.LdFrag addr:$src2)))), + (!cast<Instruction>(InstrStr#_.ZSuffix##rm) + _.RC:$src1, addr:$src2)>; + let AddedComplexity = 20 in { + def : Pat<(_.VT (vselect _.KRCWM:$mask, + (X86vsrav _.RC:$src1, _.RC:$src2), _.RC:$src0)), + (!cast<Instruction>(InstrStr#_.ZSuffix#rrk) _.RC:$src0, + _.KRC:$mask, _.RC:$src1, _.RC:$src2)>; + def : Pat<(_.VT (vselect _.KRCWM:$mask, + (X86vsrav _.RC:$src1, (bitconvert (_.LdFrag addr:$src2))), + _.RC:$src0)), + (!cast<Instruction>(InstrStr#_.ZSuffix##rmk) _.RC:$src0, + _.KRC:$mask, _.RC:$src1, addr:$src2)>; + } + let AddedComplexity = 30 in { + def : Pat<(_.VT (vselect _.KRCWM:$mask, + (X86vsrav _.RC:$src1, _.RC:$src2), _.ImmAllZerosV)), + (!cast<Instruction>(InstrStr#_.ZSuffix#rrkz) _.KRC:$mask, + _.RC:$src1, _.RC:$src2)>; + def : Pat<(_.VT (vselect _.KRCWM:$mask, + (X86vsrav _.RC:$src1, (bitconvert (_.LdFrag addr:$src2))), + _.ImmAllZerosV)), + (!cast<Instruction>(InstrStr#_.ZSuffix##rmkz) _.KRC:$mask, + _.RC:$src1, addr:$src2)>; + } + } +} + +multiclass avx512_var_shift_int_lowering_mb<string InstrStr, X86VectorVTInfo _, + list<Predicate> p> : + avx512_var_shift_int_lowering<InstrStr, _, p> { + let Predicates = p in { + def : Pat<(_.VT (X86vsrav _.RC:$src1, + (X86VBroadcast (_.ScalarLdFrag addr:$src2)))), + (!cast<Instruction>(InstrStr#_.ZSuffix##rmb) + _.RC:$src1, addr:$src2)>; + let AddedComplexity = 20 in + def : Pat<(_.VT (vselect _.KRCWM:$mask, + (X86vsrav _.RC:$src1, + (X86VBroadcast (_.ScalarLdFrag addr:$src2))), + _.RC:$src0)), + (!cast<Instruction>(InstrStr#_.ZSuffix##rmbk) _.RC:$src0, + _.KRC:$mask, _.RC:$src1,
addr:$src2)>; + let AddedComplexity = 30 in + def : Pat<(_.VT (vselect _.KRCWM:$mask, + (X86vsrav _.RC:$src1, + (X86VBroadcast (_.ScalarLdFrag addr:$src2))), + _.ImmAllZerosV)), + (!cast<Instruction>(InstrStr#_.ZSuffix##rmbkz) _.KRC:$mask, + _.RC:$src1, addr:$src2)>; + } +} + +defm : avx512_var_shift_int_lowering<"VPSRAVW", v8i16x_info, [HasVLX, HasBWI]>; +defm : avx512_var_shift_int_lowering<"VPSRAVW", v16i16x_info, [HasVLX, HasBWI]>; +defm : avx512_var_shift_int_lowering<"VPSRAVW", v32i16_info, [HasBWI]>; +defm : avx512_var_shift_int_lowering_mb<"VPSRAVD", v4i32x_info, [HasVLX]>; +defm : avx512_var_shift_int_lowering_mb<"VPSRAVD", v8i32x_info, [HasVLX]>; +defm : avx512_var_shift_int_lowering_mb<"VPSRAVD", v16i32_info, [HasAVX512]>; +defm : avx512_var_shift_int_lowering_mb<"VPSRAVQ", v2i64x_info, [HasVLX]>; +defm : avx512_var_shift_int_lowering_mb<"VPSRAVQ", v4i64x_info, [HasVLX]>; +defm : avx512_var_shift_int_lowering_mb<"VPSRAVQ", v8i64_info, [HasAVX512]>; + //===-------------------------------------------------------------------===// // 1-src variable permutation VPERMW/D/Q //===-------------------------------------------------------------------===// @@ -4501,8 +5189,10 @@ multiclass avx512_permil<string OpcodeStr, bits<8> OpcImm, bits<8> OpcVar, EVEX, AVX512AIi8Base, EVEX_CD8<_.info128.EltSize, CD8VF>; } +let ExeDomain = SSEPackedSingle in defm VPERMILPS : avx512_permil<"vpermilps", 0x04, 0x0C, avx512vl_f32_info, avx512vl_i32_info>; +let ExeDomain = SSEPackedDouble in defm VPERMILPD : avx512_permil<"vpermilpd", 0x05, 0x0D, avx512vl_f64_info, avx512vl_i64_info>, VEX_W; //===----------------------------------------------------------------------===// @@ -4666,61 +5356,71 @@ let Predicates = [HasAVX512] in { // FMA - Fused Multiply Operations // -let Constraints = "$src1 = $dst" in { multiclass avx512_fma3p_213_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, - X86VectorVTInfo _> { + X86VectorVTInfo _, string Suff> { + let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in { defm r: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src2, _.RC:$src3), OpcodeStr, "$src3, $src2", "$src2, $src3", - (_.VT (OpNode _.RC:$src1, _.RC:$src2, _.RC:$src3))>, + (_.VT (OpNode _.RC:$src2, _.RC:$src1, _.RC:$src3)), 1, 1>, AVX512FMA3Base; defm m: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src2, _.MemOp:$src3), OpcodeStr, "$src3, $src2", "$src2, $src3", - (_.VT (OpNode _.RC:$src1, _.RC:$src2, (_.LdFrag addr:$src3)))>, + (_.VT (OpNode _.RC:$src2, _.RC:$src1, (_.LdFrag addr:$src3))), 1, 0>, AVX512FMA3Base; defm mb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src2, _.ScalarMemOp:$src3), OpcodeStr, !strconcat("${src3}", _.BroadcastStr,", $src2"), !strconcat("$src2, ${src3}", _.BroadcastStr ), - (OpNode _.RC:$src1, - _.RC:$src2,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3))))>, + (OpNode _.RC:$src2, + _.RC:$src1,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3)))), 1, 0>, AVX512FMA3Base, EVEX_B; + } + + // Additional pattern for folding broadcast nodes in other orders. 
+ def : Pat<(_.VT (vselect _.KRCWM:$mask, + (OpNode _.RC:$src1, _.RC:$src2, + (X86VBroadcast (_.ScalarLdFrag addr:$src3))), + _.RC:$src1)), + (!cast<Instruction>(NAME#Suff#_.ZSuffix#mbk) _.RC:$src1, + _.KRCWM:$mask, _.RC:$src2, addr:$src3)>; } multiclass avx512_fma3_213_round<bits<8> opc, string OpcodeStr, SDNode OpNode, - X86VectorVTInfo _> { + X86VectorVTInfo _, string Suff> { + let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in defm rb: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src2, _.RC:$src3, AVX512RC:$rc), OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc", - (_.VT ( OpNode _.RC:$src1, _.RC:$src2, _.RC:$src3, (i32 imm:$rc)))>, + (_.VT ( OpNode _.RC:$src2, _.RC:$src1, _.RC:$src3, (i32 imm:$rc))), 1, 1>, AVX512FMA3Base, EVEX_B, EVEX_RC; } -} // Constraints = "$src1 = $dst" multiclass avx512_fma3p_213_common<bits<8> opc, string OpcodeStr, SDNode OpNode, - SDNode OpNodeRnd, AVX512VLVectorVTInfo _> { + SDNode OpNodeRnd, AVX512VLVectorVTInfo _, + string Suff> { let Predicates = [HasAVX512] in { - defm Z : avx512_fma3p_213_rm<opc, OpcodeStr, OpNode, _.info512>, - avx512_fma3_213_round<opc, OpcodeStr, OpNodeRnd, _.info512>, - EVEX_V512, EVEX_CD8<_.info512.EltSize, CD8VF>; + defm Z : avx512_fma3p_213_rm<opc, OpcodeStr, OpNode, _.info512, Suff>, + avx512_fma3_213_round<opc, OpcodeStr, OpNodeRnd, _.info512, + Suff>, EVEX_V512, EVEX_CD8<_.info512.EltSize, CD8VF>; } let Predicates = [HasVLX, HasAVX512] in { - defm Z256 : avx512_fma3p_213_rm<opc, OpcodeStr, OpNode, _.info256>, + defm Z256 : avx512_fma3p_213_rm<opc, OpcodeStr, OpNode, _.info256, Suff>, EVEX_V256, EVEX_CD8<_.info256.EltSize, CD8VF>; - defm Z128 : avx512_fma3p_213_rm<opc, OpcodeStr, OpNode, _.info128>, + defm Z128 : avx512_fma3p_213_rm<opc, OpcodeStr, OpNode, _.info128, Suff>, EVEX_V128, EVEX_CD8<_.info128.EltSize, CD8VF>; } } multiclass avx512_fma3p_213_f<bits<8> opc, string OpcodeStr, SDNode OpNode, - SDNode OpNodeRnd > { + SDNode OpNodeRnd > { defm PS : avx512_fma3p_213_common<opc, OpcodeStr#"ps", OpNode, OpNodeRnd, - avx512vl_f32_info>; + avx512vl_f32_info, "PS">; defm PD : avx512_fma3p_213_common<opc, OpcodeStr#"pd", OpNode, OpNodeRnd, - avx512vl_f64_info>, VEX_W; + avx512vl_f64_info, "PD">, VEX_W; } defm VFMADD213 : avx512_fma3p_213_f<0xA8, "vfmadd213", X86Fmadd, X86FmaddRnd>; @@ -4731,19 +5431,19 @@ defm VFNMADD213 : avx512_fma3p_213_f<0xAC, "vfnmadd213", X86Fnmadd, X86FnmaddR defm VFNMSUB213 : avx512_fma3p_213_f<0xAE, "vfnmsub213", X86Fnmsub, X86FnmsubRnd>; -let Constraints = "$src1 = $dst" in { multiclass avx512_fma3p_231_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, - X86VectorVTInfo _> { + X86VectorVTInfo _, string Suff> { + let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in { defm r: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src2, _.RC:$src3), OpcodeStr, "$src3, $src2", "$src2, $src3", - (_.VT (OpNode _.RC:$src2, _.RC:$src3, _.RC:$src1))>, + (_.VT (OpNode _.RC:$src2, _.RC:$src3, _.RC:$src1)), 1, 1>, AVX512FMA3Base; defm m: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src2, _.MemOp:$src3), OpcodeStr, "$src3, $src2", "$src2, $src3", - (_.VT (OpNode _.RC:$src2, (_.LdFrag addr:$src3), _.RC:$src1))>, + (_.VT (OpNode _.RC:$src2, (_.LdFrag addr:$src3), _.RC:$src1)), 1, 0>, AVX512FMA3Base; defm mb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), @@ -4752,40 +5452,60 @@ multiclass avx512_fma3p_231_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, "$src2, ${src3}"##_.BroadcastStr, (_.VT (OpNode _.RC:$src2, (_.VT 
(X86VBroadcast(_.ScalarLdFrag addr:$src3))), - _.RC:$src1))>, AVX512FMA3Base, EVEX_B; + _.RC:$src1)), 1, 0>, AVX512FMA3Base, EVEX_B; + } + + // Additional patterns for folding broadcast nodes in other orders. + def : Pat<(_.VT (OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src3)), + _.RC:$src2, _.RC:$src1)), + (!cast<Instruction>(NAME#Suff#_.ZSuffix#mb) _.RC:$src1, + _.RC:$src2, addr:$src3)>; + def : Pat<(_.VT (vselect _.KRCWM:$mask, + (OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src3)), + _.RC:$src2, _.RC:$src1), + _.RC:$src1)), + (!cast<Instruction>(NAME#Suff#_.ZSuffix#mbk) _.RC:$src1, + _.KRCWM:$mask, _.RC:$src2, addr:$src3)>; + def : Pat<(_.VT (vselect _.KRCWM:$mask, + (OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src3)), + _.RC:$src2, _.RC:$src1), + _.ImmAllZerosV)), + (!cast<Instruction>(NAME#Suff#_.ZSuffix#mbkz) _.RC:$src1, + _.KRCWM:$mask, _.RC:$src2, addr:$src3)>; } multiclass avx512_fma3_231_round<bits<8> opc, string OpcodeStr, SDNode OpNode, - X86VectorVTInfo _> { + X86VectorVTInfo _, string Suff> { + let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in defm rb: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src2, _.RC:$src3, AVX512RC:$rc), OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc", - (_.VT ( OpNode _.RC:$src2, _.RC:$src3, _.RC:$src1, (i32 imm:$rc)))>, + (_.VT ( OpNode _.RC:$src2, _.RC:$src3, _.RC:$src1, (i32 imm:$rc))), 1, 1>, AVX512FMA3Base, EVEX_B, EVEX_RC; } -} // Constraints = "$src1 = $dst" multiclass avx512_fma3p_231_common<bits<8> opc, string OpcodeStr, SDNode OpNode, - SDNode OpNodeRnd, AVX512VLVectorVTInfo _> { + SDNode OpNodeRnd, AVX512VLVectorVTInfo _, + string Suff> { let Predicates = [HasAVX512] in { - defm Z : avx512_fma3p_231_rm<opc, OpcodeStr, OpNode, _.info512>, - avx512_fma3_231_round<opc, OpcodeStr, OpNodeRnd, _.info512>, - EVEX_V512, EVEX_CD8<_.info512.EltSize, CD8VF>; + defm Z : avx512_fma3p_231_rm<opc, OpcodeStr, OpNode, _.info512, Suff>, + avx512_fma3_231_round<opc, OpcodeStr, OpNodeRnd, _.info512, + Suff>, EVEX_V512, EVEX_CD8<_.info512.EltSize, CD8VF>; } let Predicates = [HasVLX, HasAVX512] in { - defm Z256 : avx512_fma3p_231_rm<opc, OpcodeStr, OpNode, _.info256>, + defm Z256 : avx512_fma3p_231_rm<opc, OpcodeStr, OpNode, _.info256, Suff>, EVEX_V256, EVEX_CD8<_.info256.EltSize, CD8VF>; - defm Z128 : avx512_fma3p_231_rm<opc, OpcodeStr, OpNode, _.info128>, + defm Z128 : avx512_fma3p_231_rm<opc, OpcodeStr, OpNode, _.info128, Suff>, EVEX_V128, EVEX_CD8<_.info128.EltSize, CD8VF>; } } multiclass avx512_fma3p_231_f<bits<8> opc, string OpcodeStr, SDNode OpNode, - SDNode OpNodeRnd > { + SDNode OpNodeRnd > { defm PS : avx512_fma3p_231_common<opc, OpcodeStr#"ps", OpNode, OpNodeRnd, - avx512vl_f32_info>; + avx512vl_f32_info, "PS">; defm PD : avx512_fma3p_231_common<opc, OpcodeStr#"pd", OpNode, OpNodeRnd, - avx512vl_f64_info>, VEX_W; + avx512vl_f64_info, "PD">, VEX_W; } defm VFMADD231 : avx512_fma3p_231_f<0xB8, "vfmadd231", X86Fmadd, X86FmaddRnd>; @@ -4795,61 +5515,71 @@ defm VFMSUBADD231 : avx512_fma3p_231_f<0xB7, "vfmsubadd231", X86Fmsubadd, X86Fms defm VFNMADD231 : avx512_fma3p_231_f<0xBC, "vfnmadd231", X86Fnmadd, X86FnmaddRnd>; defm VFNMSUB231 : avx512_fma3p_231_f<0xBE, "vfnmsub231", X86Fnmsub, X86FnmsubRnd>; -let Constraints = "$src1 = $dst" in { multiclass avx512_fma3p_132_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, - X86VectorVTInfo _> { + X86VectorVTInfo _, string Suff> { + let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in { defm r: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst), - (ins _.RC:$src3, 
_.RC:$src2), - OpcodeStr, "$src2, $src3", "$src3, $src2", - (_.VT (OpNode _.RC:$src1, _.RC:$src2, _.RC:$src3))>, + (ins _.RC:$src2, _.RC:$src3), + OpcodeStr, "$src3, $src2", "$src2, $src3", + (_.VT (OpNode _.RC:$src1, _.RC:$src3, _.RC:$src2)), 1, 1>, AVX512FMA3Base; defm m: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), - (ins _.RC:$src3, _.MemOp:$src2), - OpcodeStr, "$src2, $src3", "$src3, $src2", - (_.VT (OpNode _.RC:$src1, (_.LdFrag addr:$src2), _.RC:$src3))>, + (ins _.RC:$src2, _.MemOp:$src3), + OpcodeStr, "$src3, $src2", "$src2, $src3", + (_.VT (OpNode _.RC:$src1, (_.LdFrag addr:$src3), _.RC:$src2)), 1, 0>, AVX512FMA3Base; defm mb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), - (ins _.RC:$src3, _.ScalarMemOp:$src2), - OpcodeStr, "${src2}"##_.BroadcastStr##", $src3", - "$src3, ${src2}"##_.BroadcastStr, + (ins _.RC:$src2, _.ScalarMemOp:$src3), + OpcodeStr, "${src3}"##_.BroadcastStr##", $src2", + "$src2, ${src3}"##_.BroadcastStr, (_.VT (OpNode _.RC:$src1, - (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src2))), - _.RC:$src3))>, AVX512FMA3Base, EVEX_B; + (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src3))), + _.RC:$src2)), 1, 0>, AVX512FMA3Base, EVEX_B; + } + + // Additional patterns for folding broadcast nodes in other orders. + def : Pat<(_.VT (vselect _.KRCWM:$mask, + (OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src3)), + _.RC:$src1, _.RC:$src2), + _.RC:$src1)), + (!cast<Instruction>(NAME#Suff#_.ZSuffix#mbk) _.RC:$src1, + _.KRCWM:$mask, _.RC:$src2, addr:$src3)>; } multiclass avx512_fma3_132_round<bits<8> opc, string OpcodeStr, SDNode OpNode, - X86VectorVTInfo _> { + X86VectorVTInfo _, string Suff> { + let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in defm rb: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst), - (ins _.RC:$src3, _.RC:$src2, AVX512RC:$rc), - OpcodeStr, "$rc, $src2, $src3", "$src3, $src2, $rc", - (_.VT ( OpNode _.RC:$src1, _.RC:$src2, _.RC:$src3, (i32 imm:$rc)))>, + (ins _.RC:$src2, _.RC:$src3, AVX512RC:$rc), + OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc", + (_.VT ( OpNode _.RC:$src1, _.RC:$src3, _.RC:$src2, (i32 imm:$rc))), 1, 1>, AVX512FMA3Base, EVEX_B, EVEX_RC; } -} // Constraints = "$src1 = $dst" multiclass avx512_fma3p_132_common<bits<8> opc, string OpcodeStr, SDNode OpNode, - SDNode OpNodeRnd, AVX512VLVectorVTInfo _> { + SDNode OpNodeRnd, AVX512VLVectorVTInfo _, + string Suff> { let Predicates = [HasAVX512] in { - defm Z : avx512_fma3p_132_rm<opc, OpcodeStr, OpNode, _.info512>, - avx512_fma3_132_round<opc, OpcodeStr, OpNodeRnd, _.info512>, - EVEX_V512, EVEX_CD8<_.info512.EltSize, CD8VF>; + defm Z : avx512_fma3p_132_rm<opc, OpcodeStr, OpNode, _.info512, Suff>, + avx512_fma3_132_round<opc, OpcodeStr, OpNodeRnd, _.info512, + Suff>, EVEX_V512, EVEX_CD8<_.info512.EltSize, CD8VF>; } let Predicates = [HasVLX, HasAVX512] in { - defm Z256 : avx512_fma3p_132_rm<opc, OpcodeStr, OpNode, _.info256>, + defm Z256 : avx512_fma3p_132_rm<opc, OpcodeStr, OpNode, _.info256, Suff>, EVEX_V256, EVEX_CD8<_.info256.EltSize, CD8VF>; - defm Z128 : avx512_fma3p_132_rm<opc, OpcodeStr, OpNode, _.info128>, + defm Z128 : avx512_fma3p_132_rm<opc, OpcodeStr, OpNode, _.info128, Suff>, EVEX_V128, EVEX_CD8<_.info128.EltSize, CD8VF>; } } multiclass avx512_fma3p_132_f<bits<8> opc, string OpcodeStr, SDNode OpNode, - SDNode OpNodeRnd > { + SDNode OpNodeRnd > { defm PS : avx512_fma3p_132_common<opc, OpcodeStr#"ps", OpNode, OpNodeRnd, - avx512vl_f32_info>; + avx512vl_f32_info, "PS">; defm PD : avx512_fma3p_132_common<opc, OpcodeStr#"pd", OpNode, OpNodeRnd, - 
avx512vl_f64_info>, VEX_W; + avx512vl_f64_info, "PD">, VEX_W; } defm VFMADD132 : avx512_fma3p_132_f<0x98, "vfmadd132", X86Fmadd, X86FmaddRnd>; @@ -4866,18 +5596,18 @@ multiclass avx512_fma3s_common<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, dag RHS_r, dag RHS_m > { defm r_Int: AVX512_maskable_3src_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src2, _.RC:$src3), OpcodeStr, - "$src3, $src2", "$src2, $src3", RHS_VEC_r>, AVX512FMA3Base; + "$src3, $src2", "$src2, $src3", RHS_VEC_r, 1, 1>, AVX512FMA3Base; defm m_Int: AVX512_maskable_3src_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src2, _.ScalarMemOp:$src3), OpcodeStr, - "$src3, $src2", "$src2, $src3", RHS_VEC_m>, AVX512FMA3Base; + "$src3, $src2", "$src2, $src3", RHS_VEC_m, 1, 1>, AVX512FMA3Base; defm rb_Int: AVX512_maskable_3src_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src2, _.RC:$src3, AVX512RC:$rc), - OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc", RHS_VEC_rb>, + OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc", RHS_VEC_rb, 1, 1>, AVX512FMA3Base, EVEX_B, EVEX_RC; - let isCodeGenOnly = 1 in { + let isCodeGenOnly = 1, isCommutable = 1 in { def r : AVX512FMA3<opc, MRMSrcReg, (outs _.FRC:$dst), (ins _.FRC:$src1, _.FRC:$src2, _.FRC:$src3), !strconcat(OpcodeStr, @@ -4893,38 +5623,40 @@ multiclass avx512_fma3s_common<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, }// Constraints = "$src1 = $dst" multiclass avx512_fma3s_all<bits<8> opc213, bits<8> opc231, bits<8> opc132, - string OpcodeStr, SDNode OpNode, SDNode OpNodeRnd, X86VectorVTInfo _ , - string SUFF> { - - defm NAME#213#SUFF: avx512_fma3s_common<opc213, OpcodeStr#"213"#_.Suffix , _ , - (_.VT (OpNodeRnd _.RC:$src2, _.RC:$src1, _.RC:$src3, (i32 FROUND_CURRENT))), - (_.VT (OpNodeRnd _.RC:$src2, _.RC:$src1, + string OpcodeStr, SDNode OpNode, SDNode OpNodeRnds1, + SDNode OpNodeRnds3, X86VectorVTInfo _ , string SUFF> { + + defm NAME#213#SUFF#Z: avx512_fma3s_common<opc213, OpcodeStr#"213"#_.Suffix , _ , + // Operands for intrinsic are in 123 order to preserve passthru + // semantics.
+ (_.VT (OpNodeRnds1 _.RC:$src1, _.RC:$src2, _.RC:$src3, (i32 FROUND_CURRENT))), + (_.VT (OpNodeRnds1 _.RC:$src1, _.RC:$src2, (_.VT (scalar_to_vector(_.ScalarLdFrag addr:$src3))), (i32 FROUND_CURRENT))), - (_.VT ( OpNodeRnd _.RC:$src2, _.RC:$src1, _.RC:$src3, + (_.VT (OpNodeRnds1 _.RC:$src1, _.RC:$src2, _.RC:$src3, (i32 imm:$rc))), (set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src2, _.FRC:$src1, _.FRC:$src3))), (set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src2, _.FRC:$src1, (_.ScalarLdFrag addr:$src3))))>; - defm NAME#231#SUFF: avx512_fma3s_common<opc231, OpcodeStr#"231"#_.Suffix , _ , - (_.VT (OpNodeRnd _.RC:$src2, _.RC:$src3, _.RC:$src1, (i32 FROUND_CURRENT))), - (_.VT (OpNodeRnd _.RC:$src2, + defm NAME#231#SUFF#Z: avx512_fma3s_common<opc231, OpcodeStr#"231"#_.Suffix , _ , + (_.VT (OpNodeRnds3 _.RC:$src2, _.RC:$src3, _.RC:$src1, (i32 FROUND_CURRENT))), + (_.VT (OpNodeRnds3 _.RC:$src2, (_.VT (scalar_to_vector(_.ScalarLdFrag addr:$src3))), _.RC:$src1, (i32 FROUND_CURRENT))), - (_.VT ( OpNodeRnd _.RC:$src2, _.RC:$src3, _.RC:$src1, + (_.VT ( OpNodeRnds3 _.RC:$src2, _.RC:$src3, _.RC:$src1, (i32 imm:$rc))), (set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src2, _.FRC:$src3, _.FRC:$src1))), (set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src2, (_.ScalarLdFrag addr:$src3), _.FRC:$src1)))>; - defm NAME#132#SUFF: avx512_fma3s_common<opc132, OpcodeStr#"132"#_.Suffix , _ , - (_.VT (OpNodeRnd _.RC:$src1, _.RC:$src3, _.RC:$src2, (i32 FROUND_CURRENT))), - (_.VT (OpNodeRnd _.RC:$src1, + defm NAME#132#SUFF#Z: avx512_fma3s_common<opc132, OpcodeStr#"132"#_.Suffix , _ , + (_.VT (OpNodeRnds1 _.RC:$src1, _.RC:$src3, _.RC:$src2, (i32 FROUND_CURRENT))), + (_.VT (OpNodeRnds1 _.RC:$src1, (_.VT (scalar_to_vector(_.ScalarLdFrag addr:$src3))), _.RC:$src2, (i32 FROUND_CURRENT))), - (_.VT ( OpNodeRnd _.RC:$src1, _.RC:$src3, _.RC:$src2, + (_.VT (OpNodeRnds1 _.RC:$src1, _.RC:$src3, _.RC:$src2, (i32 imm:$rc))), (set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src1, _.FRC:$src3, _.FRC:$src2))), @@ -4933,21 +5665,26 @@ multiclass avx512_fma3s_all<bits<8> opc213, bits<8> opc231, bits<8> opc132, } multiclass avx512_fma3s<bits<8> opc213, bits<8> opc231, bits<8> opc132, - string OpcodeStr, SDNode OpNode, SDNode OpNodeRnd>{ + string OpcodeStr, SDNode OpNode, SDNode OpNodeRnds1, + SDNode OpNodeRnds3> { let Predicates = [HasAVX512] in { defm NAME : avx512_fma3s_all<opc213, opc231, opc132, OpcodeStr, OpNode, - OpNodeRnd, f32x_info, "SS">, - EVEX_CD8<32, CD8VT1>, VEX_LIG; + OpNodeRnds1, OpNodeRnds3, f32x_info, "SS">, + EVEX_CD8<32, CD8VT1>, VEX_LIG; defm NAME : avx512_fma3s_all<opc213, opc231, opc132, OpcodeStr, OpNode, - OpNodeRnd, f64x_info, "SD">, - EVEX_CD8<64, CD8VT1>, VEX_LIG, VEX_W; + OpNodeRnds1, OpNodeRnds3, f64x_info, "SD">, + EVEX_CD8<64, CD8VT1>, VEX_LIG, VEX_W; } } -defm VFMADD : avx512_fma3s<0xA9, 0xB9, 0x99, "vfmadd", X86Fmadd, X86FmaddRnd>; -defm VFMSUB : avx512_fma3s<0xAB, 0xBB, 0x9B, "vfmsub", X86Fmsub, X86FmsubRnd>; -defm VFNMADD : avx512_fma3s<0xAD, 0xBD, 0x9D, "vfnmadd", X86Fnmadd, X86FnmaddRnd>; -defm VFNMSUB : avx512_fma3s<0xAF, 0xBF, 0x9F, "vfnmsub", X86Fnmsub, X86FnmsubRnd>; +defm VFMADD : avx512_fma3s<0xA9, 0xB9, 0x99, "vfmadd", X86Fmadd, X86FmaddRnds1, + X86FmaddRnds3>; +defm VFMSUB : avx512_fma3s<0xAB, 0xBB, 0x9B, "vfmsub", X86Fmsub, X86FmsubRnds1, + X86FmsubRnds3>; +defm VFNMADD : avx512_fma3s<0xAD, 0xBD, 0x9D, "vfnmadd", X86Fnmadd, + X86FnmaddRnds1, X86FnmaddRnds3>; +defm VFNMSUB : avx512_fma3s<0xAF, 0xBF, 0x9F, "vfnmsub", X86Fnmsub, + X86FnmsubRnds1, X86FnmsubRnds3>; 
//===----------------------------------------------------------------------===// // AVX-512 Packed Multiply of Unsigned 52-bit Integers and Add the Low 52-bit IFMA @@ -5067,6 +5804,11 @@ defm VCVTSI642SDZ: avx512_vcvtsi_common<0x2A, X86SintToFpRnd, GR64, v2f64x_info, i64mem, loadi64, "cvtsi2sd{q}">, XD, VEX_W, EVEX_CD8<64, CD8VT1>; +def : InstAlias<"vcvtsi2ss\t{$src, $src1, $dst|$dst, $src1, $src}", + (VCVTSI2SSZrm FR64X:$dst, FR64X:$src1, i32mem:$src), 0>; +def : InstAlias<"vcvtsi2sd\t{$src, $src1, $dst|$dst, $src1, $src}", + (VCVTSI2SDZrm FR64X:$dst, FR64X:$src1, i32mem:$src), 0>; + def : Pat<(f32 (sint_to_fp (loadi32 addr:$src))), (VCVTSI2SSZrm (f32 (IMPLICIT_DEF)), addr:$src)>; def : Pat<(f32 (sint_to_fp (loadi64 addr:$src))), @@ -5098,6 +5840,11 @@ defm VCVTUSI642SDZ : avx512_vcvtsi_common<0x7B, X86UintToFpRnd, GR64, v2f64x_info, i64mem, loadi64, "cvtusi2sd{q}">, XD, VEX_W, EVEX_CD8<64, CD8VT1>; +def : InstAlias<"vcvtusi2ss\t{$src, $src1, $dst|$dst, $src1, $src}", + (VCVTUSI2SSZrm FR64X:$dst, FR64X:$src1, i32mem:$src), 0>; +def : InstAlias<"vcvtusi2sd\t{$src, $src1, $dst|$dst, $src1, $src}", + (VCVTUSI2SDZrm FR64X:$dst, FR64X:$src1, i32mem:$src), 0>; + def : Pat<(f32 (uint_to_fp (loadi32 addr:$src))), (VCVTUSI2SSZrm (f32 (IMPLICIT_DEF)), addr:$src)>; def : Pat<(f32 (uint_to_fp (loadi64 addr:$src))), @@ -5170,106 +5917,158 @@ defm VCVTSD2USI64Z: avx512_cvt_s_int_round<0x79, f64x_info, i64x_info, // Therefore, the SSE intrinsics are mapped to the AVX512 instructions. let Predicates = [HasAVX512] in { def : Pat<(i32 (int_x86_sse_cvtss2si (v4f32 VR128X:$src))), - (VCVTSS2SIZrr (COPY_TO_REGCLASS VR128X:$src, FR32X))>; + (VCVTSS2SIZrr VR128X:$src)>; + def : Pat<(i32 (int_x86_sse_cvtss2si (sse_load_f32 addr:$src))), + (VCVTSS2SIZrm addr:$src)>; def : Pat<(i64 (int_x86_sse_cvtss2si64 (v4f32 VR128X:$src))), - (VCVTSS2SI64Zrr (COPY_TO_REGCLASS VR128X:$src, FR32X))>; + (VCVTSS2SI64Zrr VR128X:$src)>; + def : Pat<(i64 (int_x86_sse_cvtss2si64 (sse_load_f32 addr:$src))), + (VCVTSS2SI64Zrm addr:$src)>; def : Pat<(i32 (int_x86_sse2_cvtsd2si (v2f64 VR128X:$src))), - (VCVTSD2SIZrr (COPY_TO_REGCLASS VR128X:$src, FR64X))>; + (VCVTSD2SIZrr VR128X:$src)>; + def : Pat<(i32 (int_x86_sse2_cvtsd2si (sse_load_f64 addr:$src))), + (VCVTSD2SIZrm addr:$src)>; def : Pat<(i64 (int_x86_sse2_cvtsd2si64 (v2f64 VR128X:$src))), - (VCVTSD2SI64Zrr (COPY_TO_REGCLASS VR128X:$src, FR64X))>; + (VCVTSD2SI64Zrr VR128X:$src)>; + def : Pat<(i64 (int_x86_sse2_cvtsd2si64 (sse_load_f64 addr:$src))), + (VCVTSD2SI64Zrm addr:$src)>; } // HasAVX512 -let isCodeGenOnly = 1 , Predicates = [HasAVX512] in { - defm Int_VCVTSI2SSZ : sse12_cvt_sint_3addr<0x2A, GR32, VR128X, - int_x86_sse_cvtsi2ss, i32mem, loadi32, "cvtsi2ss{l}", - SSE_CVT_Scalar, 0>, XS, EVEX_4V; - defm Int_VCVTSI2SS64Z : sse12_cvt_sint_3addr<0x2A, GR64, VR128X, - int_x86_sse_cvtsi642ss, i64mem, loadi64, "cvtsi2ss{q}", - SSE_CVT_Scalar, 0>, XS, EVEX_4V, VEX_W; - defm Int_VCVTSI2SDZ : sse12_cvt_sint_3addr<0x2A, GR32, VR128X, - int_x86_sse2_cvtsi2sd, i32mem, loadi32, "cvtsi2sd{l}", - SSE_CVT_Scalar, 0>, XD, EVEX_4V; - defm Int_VCVTSI2SD64Z : sse12_cvt_sint_3addr<0x2A, GR64, VR128X, - int_x86_sse2_cvtsi642sd, i64mem, loadi64, "cvtsi2sd{q}", - SSE_CVT_Scalar, 0>, XD, EVEX_4V, VEX_W; - - defm Int_VCVTUSI2SDZ : sse12_cvt_sint_3addr<0x7B, GR32, VR128X, - int_x86_avx512_cvtusi2sd, i32mem, loadi32, "cvtusi2sd{l}", - SSE_CVT_Scalar, 0>, XD, EVEX_4V; -} // isCodeGenOnly = 1, Predicates = [HasAVX512] +let Predicates = [HasAVX512] in { + def : Pat<(int_x86_sse_cvtsi2ss VR128X:$src1, 
GR32:$src2), + (VCVTSI2SSZrr_Int VR128X:$src1, GR32:$src2)>; + def : Pat<(int_x86_sse_cvtsi2ss VR128X:$src1, (loadi32 addr:$src2)), + (VCVTSI2SSZrm_Int VR128X:$src1, addr:$src2)>; + def : Pat<(int_x86_sse_cvtsi642ss VR128X:$src1, GR64:$src2), + (VCVTSI642SSZrr_Int VR128X:$src1, GR64:$src2)>; + def : Pat<(int_x86_sse_cvtsi642ss VR128X:$src1, (loadi64 addr:$src2)), + (VCVTSI642SSZrm_Int VR128X:$src1, addr:$src2)>; + def : Pat<(int_x86_sse2_cvtsi2sd VR128X:$src1, GR32:$src2), + (VCVTSI2SDZrr_Int VR128X:$src1, GR32:$src2)>; + def : Pat<(int_x86_sse2_cvtsi2sd VR128X:$src1, (loadi32 addr:$src2)), + (VCVTSI2SDZrm_Int VR128X:$src1, addr:$src2)>; + def : Pat<(int_x86_sse2_cvtsi642sd VR128X:$src1, GR64:$src2), + (VCVTSI642SDZrr_Int VR128X:$src1, GR64:$src2)>; + def : Pat<(int_x86_sse2_cvtsi642sd VR128X:$src1, (loadi64 addr:$src2)), + (VCVTSI642SDZrm_Int VR128X:$src1, addr:$src2)>; + def : Pat<(int_x86_avx512_cvtusi2sd VR128X:$src1, GR32:$src2), + (VCVTUSI2SDZrr_Int VR128X:$src1, GR32:$src2)>; + def : Pat<(int_x86_avx512_cvtusi2sd VR128X:$src1, (loadi32 addr:$src2)), + (VCVTUSI2SDZrm_Int VR128X:$src1, addr:$src2)>; +} // Predicates = [HasAVX512] + +// Patterns used for matching vcvtsi2s{s,d} intrinsic sequences from clang +// which produce unnecessary vmovs{s,d} instructions +let Predicates = [HasAVX512] in { +def : Pat<(v4f32 (X86Movss + (v4f32 VR128X:$dst), + (v4f32 (scalar_to_vector (f32 (sint_to_fp GR64:$src)))))), + (VCVTSI642SSZrr_Int VR128X:$dst, GR64:$src)>; + +def : Pat<(v4f32 (X86Movss + (v4f32 VR128X:$dst), + (v4f32 (scalar_to_vector (f32 (sint_to_fp GR32:$src)))))), + (VCVTSI2SSZrr_Int VR128X:$dst, GR32:$src)>; + +def : Pat<(v2f64 (X86Movsd + (v2f64 VR128X:$dst), + (v2f64 (scalar_to_vector (f64 (sint_to_fp GR64:$src)))))), + (VCVTSI642SDZrr_Int VR128X:$dst, GR64:$src)>; + +def : Pat<(v2f64 (X86Movsd + (v2f64 VR128X:$dst), + (v2f64 (scalar_to_vector (f64 (sint_to_fp GR32:$src)))))), + (VCVTSI2SDZrr_Int VR128X:$dst, GR32:$src)>; +} // Predicates = [HasAVX512] // Convert float/double to signed/unsigned int 32/64 with truncation multiclass avx512_cvt_s_all<bits<8> opc, string asm, X86VectorVTInfo _SrcRC, X86VectorVTInfo _DstRC, SDNode OpNode, - SDNode OpNodeRnd>{ + SDNode OpNodeRnd, string aliasStr>{ let Predicates = [HasAVX512] in { - def rr : SI<opc, MRMSrcReg, (outs _DstRC.RC:$dst), (ins _SrcRC.FRC:$src), + def rr : AVX512<opc, MRMSrcReg, (outs _DstRC.RC:$dst), (ins _SrcRC.FRC:$src), !strconcat(asm,"\t{$src, $dst|$dst, $src}"), [(set _DstRC.RC:$dst, (OpNode _SrcRC.FRC:$src))]>, EVEX; - def rb : SI<opc, MRMSrcReg, (outs _DstRC.RC:$dst), (ins _SrcRC.FRC:$src), + let hasSideEffects = 0 in + def rb : AVX512<opc, MRMSrcReg, (outs _DstRC.RC:$dst), (ins _SrcRC.FRC:$src), !strconcat(asm,"\t{{sae}, $src, $dst|$dst, $src, {sae}}"), []>, EVEX, EVEX_B; - def rm : SI<opc, MRMSrcMem, (outs _DstRC.RC:$dst), (ins _SrcRC.ScalarMemOp:$src), + def rm : AVX512<opc, MRMSrcMem, (outs _DstRC.RC:$dst), (ins _SrcRC.ScalarMemOp:$src), !strconcat(asm,"\t{$src, $dst|$dst, $src}"), [(set _DstRC.RC:$dst, (OpNode (_SrcRC.ScalarLdFrag addr:$src)))]>, EVEX; + def : InstAlias<asm # aliasStr # "\t{$src, $dst|$dst, $src}", + (!cast<Instruction>(NAME # "rr") _DstRC.RC:$dst, _SrcRC.FRC:$src), 0>; + def : InstAlias<asm # aliasStr # "\t\t{{sae}, $src, $dst|$dst, $src, {sae}}", + (!cast<Instruction>(NAME # "rb") _DstRC.RC:$dst, _SrcRC.FRC:$src), 0>; + def : InstAlias<asm # aliasStr # "\t{$src, $dst|$dst, $src}", + (!cast<Instruction>(NAME # "rm") _DstRC.RC:$dst, + _SrcRC.ScalarMemOp:$src), 0>; + let isCodeGenOnly = 1 in { - def 
rr_Int : SI<opc, MRMSrcReg, (outs _DstRC.RC:$dst), (ins _SrcRC.RC:$src), - !strconcat(asm,"\t{$src, $dst|$dst, $src}"), - [(set _DstRC.RC:$dst, (OpNodeRnd (_SrcRC.VT _SrcRC.RC:$src), - (i32 FROUND_CURRENT)))]>, EVEX, VEX_LIG; - def rb_Int : SI<opc, MRMSrcReg, (outs _DstRC.RC:$dst), (ins _SrcRC.RC:$src), - !strconcat(asm,"\t{{sae}, $src, $dst|$dst, $src, {sae}}"), - [(set _DstRC.RC:$dst, (OpNodeRnd (_SrcRC.VT _SrcRC.RC:$src), - (i32 FROUND_NO_EXC)))]>, - EVEX,VEX_LIG , EVEX_B; - let mayLoad = 1, hasSideEffects = 0 in - def rm_Int : SI<opc, MRMSrcMem, (outs _DstRC.RC:$dst), - (ins _SrcRC.MemOp:$src), - !strconcat(asm,"\t{$src, $dst|$dst, $src}"), - []>, EVEX, VEX_LIG; + def rr_Int : AVX512<opc, MRMSrcReg, (outs _DstRC.RC:$dst), (ins _SrcRC.RC:$src), + !strconcat(asm,"\t{$src, $dst|$dst, $src}"), + [(set _DstRC.RC:$dst, (OpNodeRnd (_SrcRC.VT _SrcRC.RC:$src), + (i32 FROUND_CURRENT)))]>, EVEX, VEX_LIG; + def rb_Int : AVX512<opc, MRMSrcReg, (outs _DstRC.RC:$dst), (ins _SrcRC.RC:$src), + !strconcat(asm,"\t{{sae}, $src, $dst|$dst, $src, {sae}}"), + [(set _DstRC.RC:$dst, (OpNodeRnd (_SrcRC.VT _SrcRC.RC:$src), + (i32 FROUND_NO_EXC)))]>, + EVEX,VEX_LIG , EVEX_B; + let mayLoad = 1, hasSideEffects = 0 in + def rm_Int : AVX512<opc, MRMSrcMem, (outs _DstRC.RC:$dst), + (ins _SrcRC.MemOp:$src), + !strconcat(asm,"\t{$src, $dst|$dst, $src}"), + []>, EVEX, VEX_LIG; } // isCodeGenOnly = 1 } //HasAVX512 } -defm VCVTTSS2SIZ: avx512_cvt_s_all<0x2C, "cvttss2si", f32x_info, i32x_info, - fp_to_sint,X86cvtts2IntRnd>, +defm VCVTTSS2SIZ: avx512_cvt_s_all<0x2C, "vcvttss2si", f32x_info, i32x_info, + fp_to_sint, X86cvtts2IntRnd, "{l}">, XS, EVEX_CD8<32, CD8VT1>; -defm VCVTTSS2SI64Z: avx512_cvt_s_all<0x2C, "cvttss2si", f32x_info, i64x_info, - fp_to_sint,X86cvtts2IntRnd>, +defm VCVTTSS2SI64Z: avx512_cvt_s_all<0x2C, "vcvttss2si", f32x_info, i64x_info, + fp_to_sint, X86cvtts2IntRnd, "{q}">, VEX_W, XS, EVEX_CD8<32, CD8VT1>; -defm VCVTTSD2SIZ: avx512_cvt_s_all<0x2C, "cvttsd2si", f64x_info, i32x_info, - fp_to_sint,X86cvtts2IntRnd>, +defm VCVTTSD2SIZ: avx512_cvt_s_all<0x2C, "vcvttsd2si", f64x_info, i32x_info, + fp_to_sint, X86cvtts2IntRnd, "{l}">, XD, EVEX_CD8<64, CD8VT1>; -defm VCVTTSD2SI64Z: avx512_cvt_s_all<0x2C, "cvttsd2si", f64x_info, i64x_info, - fp_to_sint,X86cvtts2IntRnd>, +defm VCVTTSD2SI64Z: avx512_cvt_s_all<0x2C, "vcvttsd2si", f64x_info, i64x_info, + fp_to_sint, X86cvtts2IntRnd, "{q}">, VEX_W, XD, EVEX_CD8<64, CD8VT1>; -defm VCVTTSS2USIZ: avx512_cvt_s_all<0x78, "cvttss2usi", f32x_info, i32x_info, - fp_to_uint,X86cvtts2UIntRnd>, +defm VCVTTSS2USIZ: avx512_cvt_s_all<0x78, "vcvttss2usi", f32x_info, i32x_info, + fp_to_uint, X86cvtts2UIntRnd, "{l}">, XS, EVEX_CD8<32, CD8VT1>; -defm VCVTTSS2USI64Z: avx512_cvt_s_all<0x78, "cvttss2usi", f32x_info, i64x_info, - fp_to_uint,X86cvtts2UIntRnd>, +defm VCVTTSS2USI64Z: avx512_cvt_s_all<0x78, "vcvttss2usi", f32x_info, i64x_info, + fp_to_uint, X86cvtts2UIntRnd, "{q}">, XS,VEX_W, EVEX_CD8<32, CD8VT1>; -defm VCVTTSD2USIZ: avx512_cvt_s_all<0x78, "cvttsd2usi", f64x_info, i32x_info, - fp_to_uint,X86cvtts2UIntRnd>, +defm VCVTTSD2USIZ: avx512_cvt_s_all<0x78, "vcvttsd2usi", f64x_info, i32x_info, + fp_to_uint, X86cvtts2UIntRnd, "{l}">, XD, EVEX_CD8<64, CD8VT1>; -defm VCVTTSD2USI64Z: avx512_cvt_s_all<0x78, "cvttsd2usi", f64x_info, i64x_info, - fp_to_uint,X86cvtts2UIntRnd>, +defm VCVTTSD2USI64Z: avx512_cvt_s_all<0x78, "vcvttsd2usi", f64x_info, i64x_info, + fp_to_uint, X86cvtts2UIntRnd, "{q}">, XD, VEX_W, EVEX_CD8<64, CD8VT1>; let Predicates = [HasAVX512] in { def : Pat<(i32 (int_x86_sse_cvttss2si 
(v4f32 VR128X:$src))), - (VCVTTSS2SIZrr_Int (COPY_TO_REGCLASS VR128X:$src, FR32X))>; + (VCVTTSS2SIZrr_Int VR128X:$src)>; + def : Pat<(i32 (int_x86_sse_cvttss2si (sse_load_f32 addr:$src))), + (VCVTTSS2SIZrm_Int addr:$src)>; def : Pat<(i64 (int_x86_sse_cvttss2si64 (v4f32 VR128X:$src))), - (VCVTTSS2SI64Zrr_Int (COPY_TO_REGCLASS VR128X:$src, FR32X))>; + (VCVTTSS2SI64Zrr_Int VR128X:$src)>; + def : Pat<(i64 (int_x86_sse_cvttss2si64 (sse_load_f32 addr:$src))), + (VCVTTSS2SI64Zrm_Int addr:$src)>; def : Pat<(i32 (int_x86_sse2_cvttsd2si (v2f64 VR128X:$src))), - (VCVTTSD2SIZrr_Int (COPY_TO_REGCLASS VR128X:$src, FR64X))>; + (VCVTTSD2SIZrr_Int VR128X:$src)>; + def : Pat<(i32 (int_x86_sse2_cvttsd2si (sse_load_f64 addr:$src))), + (VCVTTSD2SIZrm_Int addr:$src)>; def : Pat<(i64 (int_x86_sse2_cvttsd2si64 (v2f64 VR128X:$src))), - (VCVTTSD2SI64Zrr_Int (COPY_TO_REGCLASS VR128X:$src, FR64X))>; - + (VCVTTSD2SI64Zrr_Int VR128X:$src)>; + def : Pat<(i64 (int_x86_sse2_cvttsd2si64 (sse_load_f64 addr:$src))), + (VCVTTSD2SI64Zrm_Int addr:$src)>; } // HasAVX512 //===----------------------------------------------------------------------===// // AVX-512 Convert form float to double and back @@ -5280,14 +6079,16 @@ multiclass avx512_cvt_fp_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _ (ins _.RC:$src1, _Src.RC:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", (_.VT (OpNode (_.VT _.RC:$src1), - (_Src.VT _Src.RC:$src2)))>, + (_Src.VT _Src.RC:$src2), + (i32 FROUND_CURRENT)))>, EVEX_4V, VEX_LIG, Sched<[WriteCvtF2F]>; defm rm : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _Src.RC:$src1, _Src.ScalarMemOp:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", (_.VT (OpNode (_.VT _.RC:$src1), (_Src.VT (scalar_to_vector - (_Src.ScalarLdFrag addr:$src2)))))>, + (_Src.ScalarLdFrag addr:$src2))), + (i32 FROUND_CURRENT)))>, EVEX_4V, VEX_LIG, Sched<[WriteCvtF2FLd, ReadAfterLd]>; } @@ -5314,36 +6115,35 @@ multiclass avx512_cvt_fp_rc_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInf EVEX_4V, VEX_LIG, Sched<[WriteCvtF2FLd, ReadAfterLd]>, EVEX_B, EVEX_RC; } -multiclass avx512_cvt_fp_scalar_sd2ss<bits<8> opc, string OpcodeStr, SDNode OpNode, +multiclass avx512_cvt_fp_scalar_sd2ss<bits<8> opc, string OpcodeStr, SDNode OpNodeRnd, X86VectorVTInfo _src, X86VectorVTInfo _dst> { let Predicates = [HasAVX512] in { - defm Z : avx512_cvt_fp_scalar<opc, OpcodeStr, _dst, _src, OpNode>, + defm Z : avx512_cvt_fp_scalar<opc, OpcodeStr, _dst, _src, OpNodeRnd>, avx512_cvt_fp_rc_scalar<opc, OpcodeStr, _dst, _src, - OpNodeRnd>, VEX_W, EVEX_CD8<64, CD8VT1>, - EVEX_V512, XD; + OpNodeRnd>, VEX_W, EVEX_CD8<64, CD8VT1>, XD; } } -multiclass avx512_cvt_fp_scalar_ss2sd<bits<8> opc, string OpcodeStr, SDNode OpNode, +multiclass avx512_cvt_fp_scalar_ss2sd<bits<8> opc, string OpcodeStr, SDNode OpNodeRnd, X86VectorVTInfo _src, X86VectorVTInfo _dst> { let Predicates = [HasAVX512] in { - defm Z : avx512_cvt_fp_scalar<opc, OpcodeStr, _dst, _src, OpNode>, + defm Z : avx512_cvt_fp_scalar<opc, OpcodeStr, _dst, _src, OpNodeRnd>, avx512_cvt_fp_sae_scalar<opc, OpcodeStr, _dst, _src, OpNodeRnd>, - EVEX_CD8<32, CD8VT1>, XS, EVEX_V512; + EVEX_CD8<32, CD8VT1>, XS; } } -defm VCVTSD2SS : avx512_cvt_fp_scalar_sd2ss<0x5A, "vcvtsd2ss", X86fround, +defm VCVTSD2SS : avx512_cvt_fp_scalar_sd2ss<0x5A, "vcvtsd2ss", X86froundRnd, f64x_info, f32x_info>; -defm VCVTSS2SD : avx512_cvt_fp_scalar_ss2sd<0x5A, "vcvtss2sd", X86fpext, +defm VCVTSS2SD : avx512_cvt_fp_scalar_ss2sd<0x5A, "vcvtss2sd", X86fpextRnd,f32x_info, f64x_info >; -def : Pat<(f64 (fextend FR32X:$src)), +def : 
Pat<(f64 (fpextend FR32X:$src)), (COPY_TO_REGCLASS (VCVTSS2SDZrr (COPY_TO_REGCLASS FR32X:$src, VR128X), (COPY_TO_REGCLASS FR32X:$src, VR128X)), VR128X)>, Requires<[HasAVX512]>; -def : Pat<(f64 (fextend (loadf32 addr:$src))), +def : Pat<(f64 (fpextend (loadf32 addr:$src))), (COPY_TO_REGCLASS (VCVTSS2SDZrm (v4f32 (IMPLICIT_DEF)), addr:$src), VR128X)>, Requires<[HasAVX512]>; @@ -5356,10 +6156,25 @@ def : Pat<(f64 (extloadf32 addr:$src)), (COPY_TO_REGCLASS (VMOVSSZrm addr:$src), VR128X)), VR128X)>, Requires<[HasAVX512, OptForSpeed]>; -def : Pat<(f32 (fround FR64X:$src)), +def : Pat<(f32 (fpround FR64X:$src)), (COPY_TO_REGCLASS (VCVTSD2SSZrr (COPY_TO_REGCLASS FR64X:$src, VR128X), (COPY_TO_REGCLASS FR64X:$src, VR128X)), VR128X)>, Requires<[HasAVX512]>; + +def : Pat<(v4f32 (X86Movss + (v4f32 VR128X:$dst), + (v4f32 (scalar_to_vector + (f32 (fpround (f64 (extractelt VR128X:$src, (iPTR 0))))))))), + (VCVTSD2SSZrr VR128X:$dst, VR128X:$src)>, + Requires<[HasAVX512]>; + +def : Pat<(v2f64 (X86Movsd + (v2f64 VR128X:$dst), + (v2f64 (scalar_to_vector + (f64 (fpextend (f32 (extractelt VR128X:$src, (iPTR 0))))))))), + (VCVTSS2SDZrr VR128X:$dst, VR128X:$src)>, + Requires<[HasAVX512]>; + //===----------------------------------------------------------------------===// // AVX-512 Vector convert from signed/unsigned integer to float/double // and from float/double to signed/unsigned integer @@ -5368,14 +6183,14 @@ def : Pat<(f32 (fround FR64X:$src)), multiclass avx512_vcvt_fp<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, X86VectorVTInfo _Src, SDNode OpNode, string Broadcast = _.BroadcastStr, - string Alias = ""> { + string Alias = "", X86MemOperand MemOp = _Src.MemOp> { defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _Src.RC:$src), OpcodeStr, "$src", "$src", (_.VT (OpNode (_Src.VT _Src.RC:$src)))>, EVEX; defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), - (ins _Src.MemOp:$src), OpcodeStr#Alias, "$src", "$src", + (ins MemOp:$src), OpcodeStr#Alias, "$src", "$src", (_.VT (OpNode (_Src.VT (bitconvert (_Src.LdFrag addr:$src)))))>, EVEX; @@ -5410,14 +6225,14 @@ multiclass avx512_vcvt_fp_rc<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, // Extend Float to Double multiclass avx512_cvtps2pd<bits<8> opc, string OpcodeStr> { let Predicates = [HasAVX512] in { - defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f64_info, v8f32x_info, fextend>, + defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f64_info, v8f32x_info, fpextend>, avx512_vcvt_fp_sae<opc, OpcodeStr, v8f64_info, v8f32x_info, X86vfpextRnd>, EVEX_V512; } let Predicates = [HasVLX] in { defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2f64x_info, v4f32x_info, - X86vfpext, "{1to2}">, EVEX_V128; - defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f64x_info, v4f32x_info, fextend>, + X86vfpext, "{1to2}", "", f64mem>, EVEX_V128; + defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f64x_info, v4f32x_info, fpextend>, EVEX_V256; } } @@ -5425,15 +6240,24 @@ multiclass avx512_cvtps2pd<bits<8> opc, string OpcodeStr> { // Truncate Double to Float multiclass avx512_cvtpd2ps<bits<8> opc, string OpcodeStr> { let Predicates = [HasAVX512] in { - defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f32x_info, v8f64_info, fround>, + defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f32x_info, v8f64_info, fpround>, avx512_vcvt_fp_rc<opc, OpcodeStr, v8f32x_info, v8f64_info, X86vfproundRnd>, EVEX_V512; } let Predicates = [HasVLX] in { defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v2f64x_info, X86vfpround, "{1to2}", "{x}">, EVEX_V128; - defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, 
v4f64x_info, fround, + defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v4f64x_info, fpround, "{1to4}", "{y}">, EVEX_V256; + + def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}", + (!cast<Instruction>(NAME # "Z128rr") VR128X:$dst, VR128X:$src), 0>; + def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}", + (!cast<Instruction>(NAME # "Z128rm") VR128X:$dst, f128mem:$src), 0>; + def : InstAlias<OpcodeStr##"y\t{$src, $dst|$dst, $src}", + (!cast<Instruction>(NAME # "Z256rr") VR128X:$dst, VR256X:$src), 0>; + def : InstAlias<OpcodeStr##"y\t{$src, $dst|$dst, $src}", + (!cast<Instruction>(NAME # "Z256rm") VR128X:$dst, f256mem:$src), 0>; } } @@ -5446,6 +6270,12 @@ def : Pat<(v8f64 (extloadv8f32 addr:$src)), (VCVTPS2PDZrm addr:$src)>; let Predicates = [HasVLX] in { + let AddedComplexity = 15 in + def : Pat<(X86vzmovl (v2f64 (bitconvert + (v4f32 (X86vfpround (v2f64 VR128X:$src)))))), + (VCVTPD2PSZ128rr VR128X:$src)>; + def : Pat<(v2f64 (extloadv2f32 addr:$src)), + (VCVTPS2PDZ128rm addr:$src)>; def : Pat<(v4f64 (extloadv4f32 addr:$src)), (VCVTPS2PDZ256rm addr:$src)>; } @@ -5460,7 +6290,7 @@ multiclass avx512_cvtdq2pd<bits<8> opc, string OpcodeStr, SDNode OpNode, let Predicates = [HasVLX] in { defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2f64x_info, v4i32x_info, - OpNode128, "{1to2}">, EVEX_V128; + OpNode128, "{1to2}", "", i64mem>, EVEX_V128; defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f64x_info, v4i32x_info, OpNode>, EVEX_V256; } @@ -5515,8 +6345,8 @@ multiclass avx512_cvtps2dq<bits<8> opc, string OpcodeStr, } // Convert Double to Signed/Unsigned Doubleword with truncation -multiclass avx512_cvttpd2dq<bits<8> opc, string OpcodeStr, - SDNode OpNode, SDNode OpNodeRnd> { +multiclass avx512_cvttpd2dq<bits<8> opc, string OpcodeStr, SDNode OpNode, + SDNode OpNode128, SDNode OpNodeRnd> { let Predicates = [HasAVX512] in { defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i32x_info, v8f64_info, OpNode>, avx512_vcvt_fp_sae<opc, OpcodeStr, v8i32x_info, v8f64_info, @@ -5524,13 +6354,22 @@ multiclass avx512_cvttpd2dq<bits<8> opc, string OpcodeStr, } let Predicates = [HasVLX] in { // we need "x"/"y" suffixes in order to distinguish between 128 and 256 - // memory forms of these instructions in Asm Parcer. They have the same + // memory forms of these instructions in Asm Parser. They have the same // dest type - 'v4i32x_info'. We also specify the broadcast string explicitly // due to the same reason. 
- defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v2f64x_info, OpNode, - "{1to2}", "{x}">, EVEX_V128; + defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v2f64x_info, + OpNode128, "{1to2}", "{x}">, EVEX_V128; defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v4f64x_info, OpNode, "{1to4}", "{y}">, EVEX_V256; + + def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}", + (!cast<Instruction>(NAME # "Z128rr") VR128X:$dst, VR128X:$src), 0>; + def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}", + (!cast<Instruction>(NAME # "Z128rm") VR128X:$dst, i128mem:$src), 0>; + def : InstAlias<OpcodeStr##"y\t{$src, $dst|$dst, $src}", + (!cast<Instruction>(NAME # "Z256rr") VR128X:$dst, VR256X:$src), 0>; + def : InstAlias<OpcodeStr##"y\t{$src, $dst|$dst, $src}", + (!cast<Instruction>(NAME # "Z256rm") VR128X:$dst, i256mem:$src), 0>; } } @@ -5551,6 +6390,15 @@ multiclass avx512_cvtpd2dq<bits<8> opc, string OpcodeStr, "{1to2}", "{x}">, EVEX_V128; defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v4f64x_info, OpNode, "{1to4}", "{y}">, EVEX_V256; + + def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}", + (!cast<Instruction>(NAME # "Z128rr") VR128X:$dst, VR128X:$src), 0>; + def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}", + (!cast<Instruction>(NAME # "Z128rm") VR128X:$dst, f128mem:$src), 0>; + def : InstAlias<OpcodeStr##"y\t{$src, $dst|$dst, $src}", + (!cast<Instruction>(NAME # "Z256rr") VR128X:$dst, VR256X:$src), 0>; + def : InstAlias<OpcodeStr##"y\t{$src, $dst|$dst, $src}", + (!cast<Instruction>(NAME # "Z256rm") VR128X:$dst, f256mem:$src), 0>; } } @@ -5614,15 +6462,15 @@ multiclass avx512_cvtps2qq<bits<8> opc, string OpcodeStr, // Explicitly specified broadcast string, since we take only 2 elements // from v4f32x_info source defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2i64x_info, v4f32x_info, OpNode, - "{1to2}">, EVEX_V128; + "{1to2}", "", f64mem>, EVEX_V128; defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i64x_info, v4f32x_info, OpNode>, EVEX_V256; } } // Convert Float to Signed/Unsigned Quardword with truncation -multiclass avx512_cvttps2qq<bits<8> opc, string OpcodeStr, - SDNode OpNode, SDNode OpNodeRnd> { +multiclass avx512_cvttps2qq<bits<8> opc, string OpcodeStr, SDNode OpNode, + SDNode OpNode128, SDNode OpNodeRnd> { let Predicates = [HasDQI] in { defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i64_info, v8f32x_info, OpNode>, avx512_vcvt_fp_sae<opc, OpcodeStr, v8i64_info, v8f32x_info, @@ -5631,16 +6479,16 @@ multiclass avx512_cvttps2qq<bits<8> opc, string OpcodeStr, let Predicates = [HasDQI, HasVLX] in { // Explicitly specified broadcast string, since we take only 2 elements // from v4f32x_info source - defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2i64x_info, v4f32x_info, OpNode, - "{1to2}">, EVEX_V128; + defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2i64x_info, v4f32x_info, OpNode128, + "{1to2}", "", f64mem>, EVEX_V128; defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i64x_info, v4f32x_info, OpNode>, EVEX_V256; } } // Convert Signed/Unsigned Quardword to Float -multiclass avx512_cvtqq2ps<bits<8> opc, string OpcodeStr, - SDNode OpNode, SDNode OpNodeRnd> { +multiclass avx512_cvtqq2ps<bits<8> opc, string OpcodeStr, SDNode OpNode, + SDNode OpNode128, SDNode OpNodeRnd> { let Predicates = [HasDQI] in { defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f32x_info, v8i64_info, OpNode>, avx512_vcvt_fp_rc<opc, OpcodeStr, v8f32x_info, v8i64_info, @@ -5651,37 +6499,46 @@ multiclass avx512_cvtqq2ps<bits<8> opc, string OpcodeStr, // memory forms of these instructions in Asm Parcer. 
They have the same // dest type - 'v4i32x_info'. We also specify the broadcast string explicitly // due to the same reason. - defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v2i64x_info, OpNode, + defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v2i64x_info, OpNode128, "{1to2}", "{x}">, EVEX_V128; defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v4i64x_info, OpNode, "{1to4}", "{y}">, EVEX_V256; + + def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}", + (!cast<Instruction>(NAME # "Z128rr") VR128X:$dst, VR128X:$src), 0>; + def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}", + (!cast<Instruction>(NAME # "Z128rm") VR128X:$dst, i128mem:$src), 0>; + def : InstAlias<OpcodeStr##"y\t{$src, $dst|$dst, $src}", + (!cast<Instruction>(NAME # "Z256rr") VR128X:$dst, VR256X:$src), 0>; + def : InstAlias<OpcodeStr##"y\t{$src, $dst|$dst, $src}", + (!cast<Instruction>(NAME # "Z256rm") VR128X:$dst, i256mem:$src), 0>; } } -defm VCVTDQ2PD : avx512_cvtdq2pd<0xE6, "vcvtdq2pd", sint_to_fp, X86cvtdq2pd>, XS, - EVEX_CD8<32, CD8VH>; +defm VCVTDQ2PD : avx512_cvtdq2pd<0xE6, "vcvtdq2pd", sint_to_fp, X86VSintToFP>, + XS, EVEX_CD8<32, CD8VH>; defm VCVTDQ2PS : avx512_cvtdq2ps<0x5B, "vcvtdq2ps", sint_to_fp, X86VSintToFpRnd>, PS, EVEX_CD8<32, CD8VF>; defm VCVTTPS2DQ : avx512_cvttps2dq<0x5B, "vcvttps2dq", fp_to_sint, - X86VFpToSintRnd>, + X86cvttp2siRnd>, XS, EVEX_CD8<32, CD8VF>; -defm VCVTTPD2DQ : avx512_cvttpd2dq<0xE6, "vcvttpd2dq", fp_to_sint, - X86VFpToSintRnd>, +defm VCVTTPD2DQ : avx512_cvttpd2dq<0xE6, "vcvttpd2dq", fp_to_sint, X86cvttp2si, + X86cvttp2siRnd>, PD, VEX_W, EVEX_CD8<64, CD8VF>; defm VCVTTPS2UDQ : avx512_cvttps2dq<0x78, "vcvttps2udq", fp_to_uint, - X86VFpToUintRnd>, PS, + X86cvttp2uiRnd>, PS, EVEX_CD8<32, CD8VF>; defm VCVTTPD2UDQ : avx512_cvttpd2dq<0x78, "vcvttpd2udq", fp_to_uint, - X86VFpToUintRnd>, PS, VEX_W, + X86cvttp2ui, X86cvttp2uiRnd>, PS, VEX_W, EVEX_CD8<64, CD8VF>; -defm VCVTUDQ2PD : avx512_cvtdq2pd<0x7A, "vcvtudq2pd", uint_to_fp, X86cvtudq2pd>, +defm VCVTUDQ2PD : avx512_cvtdq2pd<0x7A, "vcvtudq2pd", uint_to_fp, X86VUintToFP>, XS, EVEX_CD8<32, CD8VH>; defm VCVTUDQ2PS : avx512_cvtdq2ps<0x7A, "vcvtudq2ps", uint_to_fp, @@ -5717,18 +6574,18 @@ defm VCVTPS2UQQ : avx512_cvtps2qq<0x79, "vcvtps2uqq", X86cvtp2UInt, X86cvtp2UIntRnd>, PD, EVEX_CD8<32, CD8VH>; defm VCVTTPD2QQ : avx512_cvttpd2qq<0x7A, "vcvttpd2qq", fp_to_sint, - X86VFpToSintRnd>, VEX_W, + X86cvttp2siRnd>, VEX_W, PD, EVEX_CD8<64, CD8VF>; -defm VCVTTPS2QQ : avx512_cvttps2qq<0x7A, "vcvttps2qq", fp_to_sint, - X86VFpToSintRnd>, PD, EVEX_CD8<32, CD8VH>; +defm VCVTTPS2QQ : avx512_cvttps2qq<0x7A, "vcvttps2qq", fp_to_sint, X86cvttp2si, + X86cvttp2siRnd>, PD, EVEX_CD8<32, CD8VH>; defm VCVTTPD2UQQ : avx512_cvttpd2qq<0x78, "vcvttpd2uqq", fp_to_uint, - X86VFpToUintRnd>, VEX_W, + X86cvttp2uiRnd>, VEX_W, PD, EVEX_CD8<64, CD8VF>; -defm VCVTTPS2UQQ : avx512_cvttps2qq<0x78, "vcvttps2uqq", fp_to_uint, - X86VFpToUintRnd>, PD, EVEX_CD8<32, CD8VH>; +defm VCVTTPS2UQQ : avx512_cvttps2qq<0x78, "vcvttps2uqq", fp_to_uint, X86cvttp2ui, + X86cvttp2uiRnd>, PD, EVEX_CD8<32, CD8VH>; defm VCVTQQ2PD : avx512_cvtqq2pd<0xE6, "vcvtqq2pd", sint_to_fp, X86VSintToFpRnd>, VEX_W, XS, EVEX_CD8<64, CD8VF>; @@ -5736,45 +6593,151 @@ defm VCVTQQ2PD : avx512_cvtqq2pd<0xE6, "vcvtqq2pd", sint_to_fp, defm VCVTUQQ2PD : avx512_cvtqq2pd<0x7A, "vcvtuqq2pd", uint_to_fp, X86VUintToFpRnd>, VEX_W, XS, EVEX_CD8<64, CD8VF>; -defm VCVTQQ2PS : avx512_cvtqq2ps<0x5B, "vcvtqq2ps", sint_to_fp, +defm VCVTQQ2PS : avx512_cvtqq2ps<0x5B, "vcvtqq2ps", sint_to_fp, X86VSintToFP, 
X86VSintToFpRnd>, VEX_W, PS, EVEX_CD8<64, CD8VF>; -defm VCVTUQQ2PS : avx512_cvtqq2ps<0x7A, "vcvtuqq2ps", uint_to_fp, +defm VCVTUQQ2PS : avx512_cvtqq2ps<0x7A, "vcvtuqq2ps", uint_to_fp, X86VUintToFP, X86VUintToFpRnd>, VEX_W, XD, EVEX_CD8<64, CD8VF>; let Predicates = [HasAVX512, NoVLX] in { def : Pat<(v8i32 (fp_to_uint (v8f32 VR256X:$src1))), (EXTRACT_SUBREG (v16i32 (VCVTTPS2UDQZrr - (v16f32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)))), sub_ymm)>; + (v16f32 (INSERT_SUBREG (IMPLICIT_DEF), + VR256X:$src1, sub_ymm)))), sub_ymm)>; def : Pat<(v4i32 (fp_to_uint (v4f32 VR128X:$src1))), (EXTRACT_SUBREG (v16i32 (VCVTTPS2UDQZrr - (v16f32 (SUBREG_TO_REG (i32 0), VR128X:$src1, sub_xmm)))), sub_xmm)>; + (v16f32 (INSERT_SUBREG (IMPLICIT_DEF), + VR128X:$src1, sub_xmm)))), sub_xmm)>; def : Pat<(v4i32 (fp_to_uint (v4f64 VR256X:$src1))), (EXTRACT_SUBREG (v8i32 (VCVTTPD2UDQZrr - (v8f64 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)))), sub_xmm)>; + (v8f64 (INSERT_SUBREG (IMPLICIT_DEF), + VR256X:$src1, sub_ymm)))), sub_xmm)>; + +def : Pat<(v4i32 (X86cvttp2ui (v2f64 VR128X:$src))), + (EXTRACT_SUBREG (v8i32 (VCVTTPD2UDQZrr + (v8f64 (INSERT_SUBREG (IMPLICIT_DEF), + VR128X:$src, sub_xmm)))), sub_xmm)>; def : Pat<(v8f32 (uint_to_fp (v8i32 VR256X:$src1))), (EXTRACT_SUBREG (v16f32 (VCVTUDQ2PSZrr - (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)))), sub_ymm)>; + (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), + VR256X:$src1, sub_ymm)))), sub_ymm)>; def : Pat<(v4f32 (uint_to_fp (v4i32 VR128X:$src1))), (EXTRACT_SUBREG (v16f32 (VCVTUDQ2PSZrr - (v16i32 (SUBREG_TO_REG (i32 0), VR128X:$src1, sub_xmm)))), sub_xmm)>; + (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), + VR128X:$src1, sub_xmm)))), sub_xmm)>; def : Pat<(v4f64 (uint_to_fp (v4i32 VR128X:$src1))), (EXTRACT_SUBREG (v8f64 (VCVTUDQ2PDZrr - (v8i32 (SUBREG_TO_REG (i32 0), VR128X:$src1, sub_xmm)))), sub_ymm)>; + (v8i32 (INSERT_SUBREG (IMPLICIT_DEF), + VR128X:$src1, sub_xmm)))), sub_ymm)>; + +def : Pat<(v2f64 (X86VUintToFP (v4i32 VR128X:$src1))), + (EXTRACT_SUBREG (v8f64 (VCVTUDQ2PDZrr + (v8i32 (INSERT_SUBREG (IMPLICIT_DEF), + VR128X:$src1, sub_xmm)))), sub_xmm)>; +} + +let Predicates = [HasAVX512, HasVLX] in { + let AddedComplexity = 15 in { + def : Pat<(X86vzmovl (v2i64 (bitconvert + (v4i32 (X86cvtp2Int (v2f64 VR128X:$src)))))), + (VCVTPD2DQZ128rr VR128X:$src)>; + def : Pat<(v4i32 (bitconvert (X86vzmovl (v2i64 (bitconvert + (v4i32 (X86cvtp2UInt (v2f64 VR128X:$src)))))))), + (VCVTPD2UDQZ128rr VR128X:$src)>; + def : Pat<(X86vzmovl (v2i64 (bitconvert + (v4i32 (X86cvttp2si (v2f64 VR128X:$src)))))), + (VCVTTPD2DQZ128rr VR128X:$src)>; + def : Pat<(v4i32 (bitconvert (X86vzmovl (v2i64 (bitconvert + (v4i32 (X86cvttp2ui (v2f64 VR128X:$src)))))))), + (VCVTTPD2UDQZ128rr VR128X:$src)>; + } } let Predicates = [HasAVX512] in { - def : Pat<(v8f32 (fround (loadv8f64 addr:$src))), + def : Pat<(v8f32 (fpround (loadv8f64 addr:$src))), (VCVTPD2PSZrm addr:$src)>; def : Pat<(v8f64 (extloadv8f32 addr:$src)), (VCVTPS2PDZrm addr:$src)>; } +let Predicates = [HasDQI, HasVLX] in { + let AddedComplexity = 15 in { + def : Pat<(X86vzmovl (v2f64 (bitconvert + (v4f32 (X86VSintToFP (v2i64 VR128X:$src)))))), + (VCVTQQ2PSZ128rr VR128X:$src)>; + def : Pat<(X86vzmovl (v2f64 (bitconvert + (v4f32 (X86VUintToFP (v2i64 VR128X:$src)))))), + (VCVTUQQ2PSZ128rr VR128X:$src)>; + } +} + +let Predicates = [HasDQI, NoVLX] in { +def : Pat<(v2i64 (fp_to_sint (v2f64 VR128X:$src1))), + (EXTRACT_SUBREG (v8i64 (VCVTTPD2QQZrr + (v8f64 (INSERT_SUBREG (IMPLICIT_DEF), + VR128X:$src1, sub_xmm)))), sub_xmm)>; + +def : Pat<(v4i64 (fp_to_sint 
(v4f32 VR128X:$src1))), + (EXTRACT_SUBREG (v8i64 (VCVTTPS2QQZrr + (v8f32 (INSERT_SUBREG (IMPLICIT_DEF), + VR128X:$src1, sub_xmm)))), sub_ymm)>; + +def : Pat<(v4i64 (fp_to_sint (v4f64 VR256X:$src1))), + (EXTRACT_SUBREG (v8i64 (VCVTTPD2QQZrr + (v8f64 (INSERT_SUBREG (IMPLICIT_DEF), + VR256X:$src1, sub_ymm)))), sub_ymm)>; + +def : Pat<(v2i64 (fp_to_uint (v2f64 VR128X:$src1))), + (EXTRACT_SUBREG (v8i64 (VCVTTPD2UQQZrr + (v8f64 (INSERT_SUBREG (IMPLICIT_DEF), + VR128X:$src1, sub_xmm)))), sub_xmm)>; + +def : Pat<(v4i64 (fp_to_uint (v4f32 VR128X:$src1))), + (EXTRACT_SUBREG (v8i64 (VCVTTPS2UQQZrr + (v8f32 (INSERT_SUBREG (IMPLICIT_DEF), + VR128X:$src1, sub_xmm)))), sub_ymm)>; + +def : Pat<(v4i64 (fp_to_uint (v4f64 VR256X:$src1))), + (EXTRACT_SUBREG (v8i64 (VCVTTPD2UQQZrr + (v8f64 (INSERT_SUBREG (IMPLICIT_DEF), + VR256X:$src1, sub_ymm)))), sub_ymm)>; + +def : Pat<(v4f32 (sint_to_fp (v4i64 VR256X:$src1))), + (EXTRACT_SUBREG (v8f32 (VCVTQQ2PSZrr + (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), + VR256X:$src1, sub_ymm)))), sub_xmm)>; + +def : Pat<(v2f64 (sint_to_fp (v2i64 VR128X:$src1))), + (EXTRACT_SUBREG (v8f64 (VCVTQQ2PDZrr + (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), + VR128X:$src1, sub_xmm)))), sub_xmm)>; + +def : Pat<(v4f64 (sint_to_fp (v4i64 VR256X:$src1))), + (EXTRACT_SUBREG (v8f64 (VCVTQQ2PDZrr + (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), + VR256X:$src1, sub_ymm)))), sub_ymm)>; + +def : Pat<(v4f32 (uint_to_fp (v4i64 VR256X:$src1))), + (EXTRACT_SUBREG (v8f32 (VCVTUQQ2PSZrr + (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), + VR256X:$src1, sub_ymm)))), sub_xmm)>; + +def : Pat<(v2f64 (uint_to_fp (v2i64 VR128X:$src1))), + (EXTRACT_SUBREG (v8f64 (VCVTUQQ2PDZrr + (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), + VR128X:$src1, sub_xmm)))), sub_xmm)>; + +def : Pat<(v4f64 (uint_to_fp (v4i64 VR256X:$src1))), + (EXTRACT_SUBREG (v8f64 (VCVTUQQ2PDZrr + (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), + VR256X:$src1, sub_ymm)))), sub_ymm)>; +} + //===----------------------------------------------------------------------===// // Half precision conversion instructions //===----------------------------------------------------------------------===// @@ -5816,14 +6779,13 @@ multiclass avx512_cvtps2ph<X86VectorVTInfo _dest, X86VectorVTInfo _src, (ins _src.RC:$src1, i32u8imm:$src2), "vcvtps2ph", "$src2, $src1", "$src1, $src2", (X86cvtps2ph (_src.VT _src.RC:$src1), - (i32 imm:$src2), - (i32 FROUND_CURRENT)), - NoItinerary, 0, X86select>, AVX512AIi8Base; + (i32 imm:$src2)), + NoItinerary, 0, 0, X86select>, AVX512AIi8Base; def mr : AVX512AIi8<0x1D, MRMDestMem, (outs), (ins x86memop:$dst, _src.RC:$src1, i32u8imm:$src2), "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(store (_dest.VT (X86cvtps2ph (_src.VT _src.RC:$src1), - (i32 imm:$src2), (i32 FROUND_CURRENT) )), + (i32 imm:$src2))), addr:$dst)]>; let hasSideEffects = 0, mayStore = 1 in def mrk : AVX512AIi8<0x1D, MRMDestMem, (outs), @@ -5832,13 +6794,12 @@ multiclass avx512_cvtps2ph<X86VectorVTInfo _dest, X86VectorVTInfo _src, []>, EVEX_K; } multiclass avx512_cvtps2ph_sae<X86VectorVTInfo _dest, X86VectorVTInfo _src> { - defm rb : AVX512_maskable<0x1D, MRMDestReg, _dest ,(outs _dest.RC:$dst), + let hasSideEffects = 0 in + defm rb : AVX512_maskable_in_asm<0x1D, MRMDestReg, _dest, + (outs _dest.RC:$dst), (ins _src.RC:$src1, i32u8imm:$src2), "vcvtps2ph", "$src2, {sae}, $src1", "$src1, {sae}, $src2", - (X86cvtps2ph (_src.VT _src.RC:$src1), - (i32 imm:$src2), - (i32 FROUND_NO_EXC)), - NoItinerary, 0, X86select>, EVEX_B, AVX512AIi8Base; + []>, EVEX_B, AVX512AIi8Base; } let Predicates = [HasAVX512] in { defm VCVTPS2PHZ 
: avx512_cvtps2ph<v16i16x_info, v16f32_info, f256mem>, @@ -5852,25 +6813,72 @@ let Predicates = [HasAVX512] in { } } +// Patterns for matching conversions from float to half-float and vice versa. +let Predicates = [HasVLX] in { + // Use MXCSR.RC for rounding instead of explicitly specifying the default + // rounding mode (Nearest-Even, encoded as 0). Both are equivalent in the + // configurations we support (the default). However, falling back to MXCSR is + // more consistent with other instructions, which are always controlled by it. + // It's encoded as 0b100. + def : Pat<(fp_to_f16 FR32X:$src), + (i16 (EXTRACT_SUBREG (VMOVPDI2DIZrr (VCVTPS2PHZ128rr + (COPY_TO_REGCLASS FR32X:$src, VR128X), 4)), sub_16bit))>; + + def : Pat<(f16_to_fp GR16:$src), + (f32 (COPY_TO_REGCLASS (VCVTPH2PSZ128rr + (COPY_TO_REGCLASS (MOVSX32rr16 GR16:$src), VR128X)), FR32X)) >; + + def : Pat<(f16_to_fp (i16 (fp_to_f16 FR32X:$src))), + (f32 (COPY_TO_REGCLASS (VCVTPH2PSZ128rr + (VCVTPS2PHZ128rr (COPY_TO_REGCLASS FR32X:$src, VR128X), 4)), FR32X)) >; +} + +// Patterns for matching float to half-float conversion when AVX512 is supported +// but F16C isn't. In that case we have to use 512-bit vectors. +let Predicates = [HasAVX512, NoVLX, NoF16C] in { + def : Pat<(fp_to_f16 FR32X:$src), + (i16 (EXTRACT_SUBREG + (VMOVPDI2DIZrr + (v8i16 (EXTRACT_SUBREG + (VCVTPS2PHZrr + (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), + (v4f32 (COPY_TO_REGCLASS FR32X:$src, VR128X)), + sub_xmm), 4), sub_xmm))), sub_16bit))>; + + def : Pat<(f16_to_fp GR16:$src), + (f32 (COPY_TO_REGCLASS + (v4f32 (EXTRACT_SUBREG + (VCVTPH2PSZrr + (INSERT_SUBREG (v16i16 (IMPLICIT_DEF)), + (v8i16 (COPY_TO_REGCLASS (MOVSX32rr16 GR16:$src), VR128X)), + sub_xmm)), sub_xmm)), FR32X))>; + + def : Pat<(f16_to_fp (i16 (fp_to_f16 FR32X:$src))), + (f32 (COPY_TO_REGCLASS + (v4f32 (EXTRACT_SUBREG + (VCVTPH2PSZrr + (VCVTPS2PHZrr (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), + (v4f32 (COPY_TO_REGCLASS FR32X:$src, VR128X)), + sub_xmm), 4)), sub_xmm)), FR32X))>; +} + // Unordered/Ordered scalar fp compare with Sea and set EFLAGS -multiclass avx512_ord_cmp_sae<bits<8> opc, X86VectorVTInfo _, SDNode OpNode, +multiclass avx512_ord_cmp_sae<bits<8> opc, X86VectorVTInfo _, string OpcodeStr> { def rb: AVX512<opc, MRMSrcReg, (outs), (ins _.RC:$src1, _.RC:$src2), !strconcat(OpcodeStr, "\t{{sae}, $src2, $src1|$src1, $src2, {sae}}"), - [(set EFLAGS, (OpNode (_.VT _.RC:$src1), _.RC:$src2, - (i32 FROUND_NO_EXC)))], - IIC_SSE_COMIS_RR>, EVEX, EVEX_B, VEX_LIG, EVEX_V128, + [], IIC_SSE_COMIS_RR>, EVEX, EVEX_B, VEX_LIG, EVEX_V128, Sched<[WriteFAdd]>; } let Defs = [EFLAGS], Predicates = [HasAVX512] in { - defm VUCOMISSZ : avx512_ord_cmp_sae<0x2E, v4f32x_info, X86ucomiSae, "vucomiss">, + defm VUCOMISSZ : avx512_ord_cmp_sae<0x2E, v4f32x_info, "vucomiss">, AVX512PSIi8Base, EVEX_CD8<32, CD8VT1>; - defm VUCOMISDZ : avx512_ord_cmp_sae<0x2E, v2f64x_info, X86ucomiSae, "vucomisd">, + defm VUCOMISDZ : avx512_ord_cmp_sae<0x2E, v2f64x_info, "vucomisd">, AVX512PDIi8Base, VEX_W, EVEX_CD8<64, CD8VT1>; - defm VCOMISSZ : avx512_ord_cmp_sae<0x2F, v4f32x_info, X86comiSae, "vcomiss">, + defm VCOMISSZ : avx512_ord_cmp_sae<0x2F, v4f32x_info, "vcomiss">, AVX512PSIi8Base, EVEX_CD8<32, CD8VT1>; - defm VCOMISDZ : avx512_ord_cmp_sae<0x2F, v2f64x_info, X86comiSae, "vcomisd">, + defm VCOMISDZ : avx512_ord_cmp_sae<0x2F, v2f64x_info, "vcomisd">, AVX512PDIi8Base, VEX_W, EVEX_CD8<64, CD8VT1>; } @@ -5890,18 +6898,18 @@ let Defs = [EFLAGS], Predicates = [HasAVX512] in { VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>; } let isCodeGenOnly = 1 in { - defm 
Int_VUCOMISSZ : sse12_ord_cmp<0x2E, VR128X, X86ucomi, v4f32, f128mem, - load, "ucomiss">, PS, EVEX, VEX_LIG, + defm Int_VUCOMISSZ : sse12_ord_cmp_int<0x2E, VR128X, X86ucomi, v4f32, ssmem, + sse_load_f32, "ucomiss">, PS, EVEX, VEX_LIG, EVEX_CD8<32, CD8VT1>; - defm Int_VUCOMISDZ : sse12_ord_cmp<0x2E, VR128X, X86ucomi, v2f64, f128mem, - load, "ucomisd">, PD, EVEX, + defm Int_VUCOMISDZ : sse12_ord_cmp_int<0x2E, VR128X, X86ucomi, v2f64, sdmem, + sse_load_f64, "ucomisd">, PD, EVEX, VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>; - defm Int_VCOMISSZ : sse12_ord_cmp<0x2F, VR128X, X86comi, v4f32, f128mem, - load, "comiss">, PS, EVEX, VEX_LIG, + defm Int_VCOMISSZ : sse12_ord_cmp_int<0x2F, VR128X, X86comi, v4f32, ssmem, + sse_load_f32, "comiss">, PS, EVEX, VEX_LIG, EVEX_CD8<32, CD8VT1>; - defm Int_VCOMISDZ : sse12_ord_cmp<0x2F, VR128X, X86comi, v2f64, f128mem, - load, "comisd">, PD, EVEX, + defm Int_VCOMISDZ : sse12_ord_cmp_int<0x2F, VR128X, X86comi, v2f64, sdmem, + sse_load_f64, "comisd">, PD, EVEX, VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>; } } @@ -6275,7 +7283,7 @@ defm VRNDSCALESD : avx512_rndscale_scalar<0x0B, "vrndscalesd", f64x_info>, VEX_W multiclass avx512_trunc_common<bits<8> opc, string OpcodeStr, SDNode OpNode, X86VectorVTInfo SrcInfo, X86VectorVTInfo DestInfo, X86MemOperand x86memop> { - + let ExeDomain = DestInfo.ExeDomain in defm rr : AVX512_maskable<opc, MRMDestReg, DestInfo, (outs DestInfo.RC:$dst), (ins SrcInfo.RC:$src1), OpcodeStr ,"$src1", "$src1", (DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src1)))>, @@ -6301,7 +7309,8 @@ multiclass avx512_trunc_common<bits<8> opc, string OpcodeStr, SDNode OpNode, DestInfo.KRCWM:$mask , SrcInfo.RC:$src1)>; - let mayStore = 1, mayLoad = 1, hasSideEffects = 0 in { + let mayStore = 1, mayLoad = 1, hasSideEffects = 0, + ExeDomain = DestInfo.ExeDomain in { def mr : AVX512XS8I<opc, MRMDestMem, (outs), (ins x86memop:$dst, SrcInfo.RC:$src), OpcodeStr # "\t{$src, $dst|$dst, $src}", @@ -6328,23 +7337,6 @@ multiclass avx512_trunc_mr_lowering<X86VectorVTInfo SrcInfo, addr:$dst, SrcInfo.KRCWM:$mask, SrcInfo.RC:$src)>; } -multiclass avx512_trunc_sat_mr_lowering<X86VectorVTInfo SrcInfo, - X86VectorVTInfo DestInfo, string sat > { - - def: Pat<(!cast<Intrinsic>("int_x86_avx512_mask_pmov"#sat#"_"#SrcInfo.Suffix# - DestInfo.Suffix#"_mem_"#SrcInfo.Size) - addr:$ptr, (SrcInfo.VT SrcInfo.RC:$src), SrcInfo.MRC:$mask), - (!cast<Instruction>(NAME#SrcInfo.ZSuffix##mrk) addr:$ptr, - (COPY_TO_REGCLASS SrcInfo.MRC:$mask, SrcInfo.KRCWM), - (SrcInfo.VT SrcInfo.RC:$src))>; - - def: Pat<(!cast<Intrinsic>("int_x86_avx512_mask_pmov"#sat#"_"#SrcInfo.Suffix# - DestInfo.Suffix#"_mem_"#SrcInfo.Size) - addr:$ptr, (SrcInfo.VT SrcInfo.RC:$src), -1), - (!cast<Instruction>(NAME#SrcInfo.ZSuffix##mr) addr:$ptr, - (SrcInfo.VT SrcInfo.RC:$src))>; -} - multiclass avx512_trunc<bits<8> opc, string OpcodeStr, SDNode OpNode, AVX512VLVectorVTInfo VTSrcInfo, X86VectorVTInfo DestInfoZ128, X86VectorVTInfo DestInfoZ256, X86VectorVTInfo DestInfoZ, @@ -6370,140 +7362,111 @@ multiclass avx512_trunc<bits<8> opc, string OpcodeStr, SDNode OpNode, truncFrag, mtruncFrag>, EVEX_V512; } -multiclass avx512_trunc_sat<bits<8> opc, string OpcodeStr, SDNode OpNode, - AVX512VLVectorVTInfo VTSrcInfo, X86VectorVTInfo DestInfoZ128, - X86VectorVTInfo DestInfoZ256, X86VectorVTInfo DestInfoZ, - X86MemOperand x86memopZ128, X86MemOperand x86memopZ256, - X86MemOperand x86memopZ, string sat, Predicate prd = HasAVX512>{ - - let Predicates = [HasVLX, prd] in { - defm Z128: avx512_trunc_common<opc, OpcodeStr, OpNode, VTSrcInfo.info128, - 
DestInfoZ128, x86memopZ128>, - avx512_trunc_sat_mr_lowering<VTSrcInfo.info128, DestInfoZ128, - sat>, EVEX_V128; - - defm Z256: avx512_trunc_common<opc, OpcodeStr, OpNode, VTSrcInfo.info256, - DestInfoZ256, x86memopZ256>, - avx512_trunc_sat_mr_lowering<VTSrcInfo.info256, DestInfoZ256, - sat>, EVEX_V256; - } - let Predicates = [prd] in - defm Z: avx512_trunc_common<opc, OpcodeStr, OpNode, VTSrcInfo.info512, - DestInfoZ, x86memopZ>, - avx512_trunc_sat_mr_lowering<VTSrcInfo.info512, DestInfoZ, - sat>, EVEX_V512; -} - -multiclass avx512_trunc_qb<bits<8> opc, string OpcodeStr, SDNode OpNode> { +multiclass avx512_trunc_qb<bits<8> opc, string OpcodeStr, SDNode OpNode, + PatFrag StoreNode, PatFrag MaskedStoreNode> { defm NAME: avx512_trunc<opc, OpcodeStr, OpNode, avx512vl_i64_info, v16i8x_info, v16i8x_info, v16i8x_info, i16mem, i32mem, i64mem, - truncstorevi8, masked_truncstorevi8>, EVEX_CD8<8, CD8VO>; -} -multiclass avx512_trunc_sat_qb<bits<8> opc, string sat, SDNode OpNode> { - defm NAME: avx512_trunc_sat<opc, "vpmov"##sat##"qb", OpNode, avx512vl_i64_info, - v16i8x_info, v16i8x_info, v16i8x_info, i16mem, i32mem, i64mem, - sat>, EVEX_CD8<8, CD8VO>; + StoreNode, MaskedStoreNode>, EVEX_CD8<8, CD8VO>; } -multiclass avx512_trunc_qw<bits<8> opc, string OpcodeStr, SDNode OpNode> { +multiclass avx512_trunc_qw<bits<8> opc, string OpcodeStr, SDNode OpNode, + PatFrag StoreNode, PatFrag MaskedStoreNode> { defm NAME: avx512_trunc<opc, OpcodeStr, OpNode, avx512vl_i64_info, v8i16x_info, v8i16x_info, v8i16x_info, i32mem, i64mem, i128mem, - truncstorevi16, masked_truncstorevi16>, EVEX_CD8<16, CD8VQ>; -} -multiclass avx512_trunc_sat_qw<bits<8> opc, string sat, SDNode OpNode> { - defm NAME: avx512_trunc_sat<opc, "vpmov"##sat##"qw", OpNode, avx512vl_i64_info, - v8i16x_info, v8i16x_info, v8i16x_info, i32mem, i64mem, i128mem, - sat>, EVEX_CD8<16, CD8VQ>; + StoreNode, MaskedStoreNode>, EVEX_CD8<16, CD8VQ>; } -multiclass avx512_trunc_qd<bits<8> opc, string OpcodeStr, SDNode OpNode> { +multiclass avx512_trunc_qd<bits<8> opc, string OpcodeStr, SDNode OpNode, + PatFrag StoreNode, PatFrag MaskedStoreNode> { defm NAME: avx512_trunc<opc, OpcodeStr, OpNode, avx512vl_i64_info, v4i32x_info, v4i32x_info, v8i32x_info, i64mem, i128mem, i256mem, - truncstorevi32, masked_truncstorevi32>, EVEX_CD8<32, CD8VH>; -} -multiclass avx512_trunc_sat_qd<bits<8> opc, string sat, SDNode OpNode> { - defm NAME: avx512_trunc_sat<opc, "vpmov"##sat##"qd", OpNode, avx512vl_i64_info, - v4i32x_info, v4i32x_info, v8i32x_info, i64mem, i128mem, i256mem, - sat>, EVEX_CD8<32, CD8VH>; + StoreNode, MaskedStoreNode>, EVEX_CD8<32, CD8VH>; } -multiclass avx512_trunc_db<bits<8> opc, string OpcodeStr, SDNode OpNode> { +multiclass avx512_trunc_db<bits<8> opc, string OpcodeStr, SDNode OpNode, + PatFrag StoreNode, PatFrag MaskedStoreNode> { defm NAME: avx512_trunc<opc, OpcodeStr, OpNode, avx512vl_i32_info, v16i8x_info, v16i8x_info, v16i8x_info, i32mem, i64mem, i128mem, - truncstorevi8, masked_truncstorevi8>, EVEX_CD8<8, CD8VQ>; -} -multiclass avx512_trunc_sat_db<bits<8> opc, string sat, SDNode OpNode> { - defm NAME: avx512_trunc_sat<opc, "vpmov"##sat##"db", OpNode, avx512vl_i32_info, - v16i8x_info, v16i8x_info, v16i8x_info, i32mem, i64mem, i128mem, - sat>, EVEX_CD8<8, CD8VQ>; + StoreNode, MaskedStoreNode>, EVEX_CD8<8, CD8VQ>; } -multiclass avx512_trunc_dw<bits<8> opc, string OpcodeStr, SDNode OpNode> { +multiclass avx512_trunc_dw<bits<8> opc, string OpcodeStr, SDNode OpNode, + PatFrag StoreNode, PatFrag MaskedStoreNode> { defm NAME: avx512_trunc<opc, OpcodeStr, OpNode, 
avx512vl_i32_info, v8i16x_info, v8i16x_info, v16i16x_info, i64mem, i128mem, i256mem, - truncstorevi16, masked_truncstorevi16>, EVEX_CD8<16, CD8VH>; -} -multiclass avx512_trunc_sat_dw<bits<8> opc, string sat, SDNode OpNode> { - defm NAME: avx512_trunc_sat<opc, "vpmov"##sat##"dw", OpNode, avx512vl_i32_info, - v8i16x_info, v8i16x_info, v16i16x_info, i64mem, i128mem, i256mem, - sat>, EVEX_CD8<16, CD8VH>; + StoreNode, MaskedStoreNode>, EVEX_CD8<16, CD8VH>; } -multiclass avx512_trunc_wb<bits<8> opc, string OpcodeStr, SDNode OpNode> { +multiclass avx512_trunc_wb<bits<8> opc, string OpcodeStr, SDNode OpNode, + PatFrag StoreNode, PatFrag MaskedStoreNode> { defm NAME: avx512_trunc<opc, OpcodeStr, OpNode, avx512vl_i16_info, v16i8x_info, v16i8x_info, v32i8x_info, i64mem, i128mem, i256mem, - truncstorevi8, masked_truncstorevi8,HasBWI>, EVEX_CD8<16, CD8VH>; -} -multiclass avx512_trunc_sat_wb<bits<8> opc, string sat, SDNode OpNode> { - defm NAME: avx512_trunc_sat<opc, "vpmov"##sat##"wb", OpNode, avx512vl_i16_info, - v16i8x_info, v16i8x_info, v32i8x_info, i64mem, i128mem, i256mem, - sat, HasBWI>, EVEX_CD8<16, CD8VH>; -} - -defm VPMOVQB : avx512_trunc_qb<0x32, "vpmovqb", X86vtrunc>; -defm VPMOVSQB : avx512_trunc_sat_qb<0x22, "s", X86vtruncs>; -defm VPMOVUSQB : avx512_trunc_sat_qb<0x12, "us", X86vtruncus>; - -defm VPMOVQW : avx512_trunc_qw<0x34, "vpmovqw", X86vtrunc>; -defm VPMOVSQW : avx512_trunc_sat_qw<0x24, "s", X86vtruncs>; -defm VPMOVUSQW : avx512_trunc_sat_qw<0x14, "us", X86vtruncus>; - -defm VPMOVQD : avx512_trunc_qd<0x35, "vpmovqd", X86vtrunc>; -defm VPMOVSQD : avx512_trunc_sat_qd<0x25, "s", X86vtruncs>; -defm VPMOVUSQD : avx512_trunc_sat_qd<0x15, "us", X86vtruncus>; - -defm VPMOVDB : avx512_trunc_db<0x31, "vpmovdb", X86vtrunc>; -defm VPMOVSDB : avx512_trunc_sat_db<0x21, "s", X86vtruncs>; -defm VPMOVUSDB : avx512_trunc_sat_db<0x11, "us", X86vtruncus>; - -defm VPMOVDW : avx512_trunc_dw<0x33, "vpmovdw", X86vtrunc>; -defm VPMOVSDW : avx512_trunc_sat_dw<0x23, "s", X86vtruncs>; -defm VPMOVUSDW : avx512_trunc_sat_dw<0x13, "us", X86vtruncus>; - -defm VPMOVWB : avx512_trunc_wb<0x30, "vpmovwb", X86vtrunc>; -defm VPMOVSWB : avx512_trunc_sat_wb<0x20, "s", X86vtruncs>; -defm VPMOVUSWB : avx512_trunc_sat_wb<0x10, "us", X86vtruncus>; + StoreNode, MaskedStoreNode, HasBWI>, EVEX_CD8<16, CD8VH>; +} + +defm VPMOVQB : avx512_trunc_qb<0x32, "vpmovqb", X86vtrunc, + truncstorevi8, masked_truncstorevi8>; +defm VPMOVSQB : avx512_trunc_qb<0x22, "vpmovsqb", X86vtruncs, + truncstore_s_vi8, masked_truncstore_s_vi8>; +defm VPMOVUSQB : avx512_trunc_qb<0x12, "vpmovusqb", X86vtruncus, + truncstore_us_vi8, masked_truncstore_us_vi8>; + +defm VPMOVQW : avx512_trunc_qw<0x34, "vpmovqw", X86vtrunc, + truncstorevi16, masked_truncstorevi16>; +defm VPMOVSQW : avx512_trunc_qw<0x24, "vpmovsqw", X86vtruncs, + truncstore_s_vi16, masked_truncstore_s_vi16>; +defm VPMOVUSQW : avx512_trunc_qw<0x14, "vpmovusqw", X86vtruncus, + truncstore_us_vi16, masked_truncstore_us_vi16>; + +defm VPMOVQD : avx512_trunc_qd<0x35, "vpmovqd", X86vtrunc, + truncstorevi32, masked_truncstorevi32>; +defm VPMOVSQD : avx512_trunc_qd<0x25, "vpmovsqd", X86vtruncs, + truncstore_s_vi32, masked_truncstore_s_vi32>; +defm VPMOVUSQD : avx512_trunc_qd<0x15, "vpmovusqd", X86vtruncus, + truncstore_us_vi32, masked_truncstore_us_vi32>; + +defm VPMOVDB : avx512_trunc_db<0x31, "vpmovdb", X86vtrunc, + truncstorevi8, masked_truncstorevi8>; +defm VPMOVSDB : avx512_trunc_db<0x21, "vpmovsdb", X86vtruncs, + truncstore_s_vi8, masked_truncstore_s_vi8>; +defm VPMOVUSDB : avx512_trunc_db<0x11, 
"vpmovusdb", X86vtruncus, + truncstore_us_vi8, masked_truncstore_us_vi8>; + +defm VPMOVDW : avx512_trunc_dw<0x33, "vpmovdw", X86vtrunc, + truncstorevi16, masked_truncstorevi16>; +defm VPMOVSDW : avx512_trunc_dw<0x23, "vpmovsdw", X86vtruncs, + truncstore_s_vi16, masked_truncstore_s_vi16>; +defm VPMOVUSDW : avx512_trunc_dw<0x13, "vpmovusdw", X86vtruncus, + truncstore_us_vi16, masked_truncstore_us_vi16>; + +defm VPMOVWB : avx512_trunc_wb<0x30, "vpmovwb", X86vtrunc, + truncstorevi8, masked_truncstorevi8>; +defm VPMOVSWB : avx512_trunc_wb<0x20, "vpmovswb", X86vtruncs, + truncstore_s_vi8, masked_truncstore_s_vi8>; +defm VPMOVUSWB : avx512_trunc_wb<0x10, "vpmovuswb", X86vtruncus, + truncstore_us_vi8, masked_truncstore_us_vi8>; let Predicates = [HasAVX512, NoVLX] in { def: Pat<(v8i16 (X86vtrunc (v8i32 VR256X:$src))), (v8i16 (EXTRACT_SUBREG - (v16i16 (VPMOVDWZrr (v16i32 (SUBREG_TO_REG (i32 0), + (v16i16 (VPMOVDWZrr (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src, sub_ymm)))), sub_xmm))>; def: Pat<(v4i32 (X86vtrunc (v4i64 VR256X:$src))), (v4i32 (EXTRACT_SUBREG - (v8i32 (VPMOVQDZrr (v8i64 (SUBREG_TO_REG (i32 0), + (v8i32 (VPMOVQDZrr (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src, sub_ymm)))), sub_xmm))>; } let Predicates = [HasBWI, NoVLX] in { def: Pat<(v16i8 (X86vtrunc (v16i16 VR256X:$src))), - (v16i8 (EXTRACT_SUBREG (VPMOVWBZrr (v32i16 (SUBREG_TO_REG (i32 0), + (v16i8 (EXTRACT_SUBREG (VPMOVWBZrr (v32i16 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src, sub_ymm))), sub_xmm))>; } multiclass avx512_extend_common<bits<8> opc, string OpcodeStr, X86VectorVTInfo DestInfo, X86VectorVTInfo SrcInfo, X86MemOperand x86memop, PatFrag LdFrag, SDPatternOperator OpNode>{ + let ExeDomain = DestInfo.ExeDomain in { defm rr : AVX512_maskable<opc, MRMSrcReg, DestInfo, (outs DestInfo.RC:$dst), (ins SrcInfo.RC:$src), OpcodeStr ,"$src", "$src", (DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src)))>, @@ -6513,6 +7476,7 @@ multiclass avx512_extend_common<bits<8> opc, string OpcodeStr, (ins x86memop:$src), OpcodeStr ,"$src", "$src", (DestInfo.VT (LdFrag addr:$src))>, EVEX; + } } multiclass avx512_extend_BW<bits<8> opc, string OpcodeStr, @@ -6685,6 +7649,150 @@ let Predicates = [HasAVX512] in { defm : avx512_ext_lowering<"DQZ", v8i64_info, v8i32x_info, extloadvi32>; } +multiclass AVX512_pmovx_patterns<string OpcPrefix, string ExtTy, + SDNode ExtOp, PatFrag ExtLoad16> { + // 128-bit patterns + let Predicates = [HasVLX, HasBWI] in { + def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), + (!cast<I>(OpcPrefix#BWZ128rm) addr:$src)>; + def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))), + (!cast<I>(OpcPrefix#BWZ128rm) addr:$src)>; + def : Pat<(v8i16 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#BWZ128rm) addr:$src)>; + def : Pat<(v8i16 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#BWZ128rm) addr:$src)>; + def : Pat<(v8i16 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))), + (!cast<I>(OpcPrefix#BWZ128rm) addr:$src)>; + } + let Predicates = [HasVLX] in { + def : Pat<(v4i32 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))), + (!cast<I>(OpcPrefix#BDZ128rm) addr:$src)>; + def : Pat<(v4i32 (ExtOp (v16i8 (vzmovl_v4i32 addr:$src)))), + (!cast<I>(OpcPrefix#BDZ128rm) addr:$src)>; + def : Pat<(v4i32 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#BDZ128rm) addr:$src)>; + def : Pat<(v4i32 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))), + (!cast<I>(OpcPrefix#BDZ128rm) addr:$src)>; + + def : Pat<(v2i64 (ExtOp (bc_v16i8 
(v4i32 (scalar_to_vector (ExtLoad16 addr:$src)))))), + (!cast<I>(OpcPrefix#BQZ128rm) addr:$src)>; + def : Pat<(v2i64 (ExtOp (v16i8 (vzmovl_v4i32 addr:$src)))), + (!cast<I>(OpcPrefix#BQZ128rm) addr:$src)>; + def : Pat<(v2i64 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#BQZ128rm) addr:$src)>; + def : Pat<(v2i64 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))), + (!cast<I>(OpcPrefix#BQZ128rm) addr:$src)>; + + def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), + (!cast<I>(OpcPrefix#WDZ128rm) addr:$src)>; + def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))), + (!cast<I>(OpcPrefix#WDZ128rm) addr:$src)>; + def : Pat<(v4i32 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#WDZ128rm) addr:$src)>; + def : Pat<(v4i32 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#WDZ128rm) addr:$src)>; + def : Pat<(v4i32 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))), + (!cast<I>(OpcPrefix#WDZ128rm) addr:$src)>; + + def : Pat<(v2i64 (ExtOp (bc_v8i16 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))), + (!cast<I>(OpcPrefix#WQZ128rm) addr:$src)>; + def : Pat<(v2i64 (ExtOp (v8i16 (vzmovl_v4i32 addr:$src)))), + (!cast<I>(OpcPrefix#WQZ128rm) addr:$src)>; + def : Pat<(v2i64 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#WQZ128rm) addr:$src)>; + def : Pat<(v2i64 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))), + (!cast<I>(OpcPrefix#WQZ128rm) addr:$src)>; + + def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), + (!cast<I>(OpcPrefix#DQZ128rm) addr:$src)>; + def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))), + (!cast<I>(OpcPrefix#DQZ128rm) addr:$src)>; + def : Pat<(v2i64 (ExtOp (v4i32 (vzmovl_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#DQZ128rm) addr:$src)>; + def : Pat<(v2i64 (ExtOp (v4i32 (vzload_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#DQZ128rm) addr:$src)>; + def : Pat<(v2i64 (ExtOp (bc_v4i32 (loadv2i64 addr:$src)))), + (!cast<I>(OpcPrefix#DQZ128rm) addr:$src)>; + } + // 256-bit patterns + let Predicates = [HasVLX, HasBWI] in { + def : Pat<(v16i16 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))), + (!cast<I>(OpcPrefix#BWZ256rm) addr:$src)>; + def : Pat<(v16i16 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#BWZ256rm) addr:$src)>; + def : Pat<(v16i16 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#BWZ256rm) addr:$src)>; + } + let Predicates = [HasVLX] in { + def : Pat<(v8i32 (ExtOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), + (!cast<I>(OpcPrefix#BDZ256rm) addr:$src)>; + def : Pat<(v8i32 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#BDZ256rm) addr:$src)>; + def : Pat<(v8i32 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#BDZ256rm) addr:$src)>; + def : Pat<(v8i32 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))), + (!cast<I>(OpcPrefix#BDZ256rm) addr:$src)>; + + def : Pat<(v4i64 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))), + (!cast<I>(OpcPrefix#BQZ256rm) addr:$src)>; + def : Pat<(v4i64 (ExtOp (v16i8 (vzmovl_v4i32 addr:$src)))), + (!cast<I>(OpcPrefix#BQZ256rm) addr:$src)>; + def : Pat<(v4i64 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#BQZ256rm) addr:$src)>; + def : Pat<(v4i64 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))), + (!cast<I>(OpcPrefix#BQZ256rm) addr:$src)>; + + def : Pat<(v8i32 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))), + (!cast<I>(OpcPrefix#WDZ256rm) addr:$src)>; + def : Pat<(v8i32 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))), 
+ (!cast<I>(OpcPrefix#WDZ256rm) addr:$src)>; + def : Pat<(v8i32 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#WDZ256rm) addr:$src)>; + + def : Pat<(v4i64 (ExtOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), + (!cast<I>(OpcPrefix#WQZ256rm) addr:$src)>; + def : Pat<(v4i64 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#WQZ256rm) addr:$src)>; + def : Pat<(v4i64 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#WQZ256rm) addr:$src)>; + def : Pat<(v4i64 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))), + (!cast<I>(OpcPrefix#WQZ256rm) addr:$src)>; + + def : Pat<(v4i64 (ExtOp (bc_v4i32 (loadv2i64 addr:$src)))), + (!cast<I>(OpcPrefix#DQZ256rm) addr:$src)>; + def : Pat<(v4i64 (ExtOp (v4i32 (vzmovl_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#DQZ256rm) addr:$src)>; + def : Pat<(v4i64 (ExtOp (v4i32 (vzload_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#DQZ256rm) addr:$src)>; + } + // 512-bit patterns + let Predicates = [HasBWI] in { + def : Pat<(v32i16 (ExtOp (bc_v32i8 (loadv4i64 addr:$src)))), + (!cast<I>(OpcPrefix#BWZrm) addr:$src)>; + } + let Predicates = [HasAVX512] in { + def : Pat<(v16i32 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))), + (!cast<I>(OpcPrefix#BDZrm) addr:$src)>; + + def : Pat<(v8i64 (ExtOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), + (!cast<I>(OpcPrefix#BQZrm) addr:$src)>; + def : Pat<(v8i64 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))), + (!cast<I>(OpcPrefix#BQZrm) addr:$src)>; + + def : Pat<(v16i32 (ExtOp (bc_v16i16 (loadv4i64 addr:$src)))), + (!cast<I>(OpcPrefix#WDZrm) addr:$src)>; + + def : Pat<(v8i64 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))), + (!cast<I>(OpcPrefix#WQZrm) addr:$src)>; + + def : Pat<(v8i64 (ExtOp (bc_v8i32 (loadv4i64 addr:$src)))), + (!cast<I>(OpcPrefix#DQZrm) addr:$src)>; + } +} + +defm : AVX512_pmovx_patterns<"VPMOVSX", "s", X86vsext, extloadi32i16>; +defm : AVX512_pmovx_patterns<"VPMOVZX", "z", X86vzext, loadi16_anyext>; + //===----------------------------------------------------------------------===// // GATHER - SCATTER Operations @@ -6859,8 +7967,14 @@ defm VSCATTERPF1QPD: avx512_gather_scatter_prefetch<0xC7, MRM6m, "vscatterpf1qpd VK8WM, vz512mem>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>; // Helper fragments to match sext vXi1 to vXiY. 
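As an aside (not part of the patch), the sign-extended-mask fragments defined next, together with the VPABS patterns further below, match the classic branchless absolute-value identity: with an arithmetic sign mask m = x >> (bits-1), (x + m) ^ m yields |x|. A minimal scalar C++ sketch of that identity follows; the function name is made up, and an arithmetic right shift of signed values is assumed (true on the targets this backend cares about).

#include <cstdint>
#include <cstdio>

// abs(x) via sign mask: m is all-ones when x is negative, all-zeros otherwise,
// so (x + m) ^ m negates negative inputs and leaves non-negative ones alone.
// The addition is done in uint32_t to sidestep signed-overflow UB for INT32_MIN.
static int32_t abs_via_sign_mask(int32_t x) {
  uint32_t m = static_cast<uint32_t>(x >> 31);  // arithmetic shift assumed
  return static_cast<int32_t>((static_cast<uint32_t>(x) + m) ^ m);
}

int main() {
  std::printf("%d %d %d\n", abs_via_sign_mask(-7), abs_via_sign_mask(0),
              abs_via_sign_mask(42));  // prints: 7 0 42
}

This xor(add(x, mask), mask) shape is exactly what the VPABS* Pat definitions below fold into a single vpabs instruction.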
-def v16i1sextv16i32 : PatLeaf<(v16i32 (X86vsrai VR512:$src, (i8 31)))>; -def v8i1sextv8i64 : PatLeaf<(v8i64 (X86vsrai VR512:$src, (i8 63)))>; +def v64i1sextv64i8 : PatLeaf<(v64i8 + (X86vsext + (v64i1 (X86pcmpgtm + (bc_v64i8 (v16i32 immAllZerosV)), + VR512:$src))))>; +def v32i1sextv32i16 : PatLeaf<(v32i16 (X86vsrai VR512:$src, (i8 15)))>; +def v16i1sextv16i32 : PatLeaf<(v16i32 (X86vsrai VR512:$src, (i8 31)))>; +def v8i1sextv8i64 : PatLeaf<(v8i64 (X86vsrai VR512:$src, (i8 63)))>; multiclass cvt_by_vec_width<bits<8> opc, X86VectorVTInfo Vec, string OpcodeStr > { def rr : AVX512XS8I<opc, MRMSrcReg, (outs Vec.RC:$dst), (ins Vec.KRC:$src), @@ -6941,7 +8055,7 @@ defm VPMOVQ2M : avx512_convert_vector_to_mask<0x39, "vpmovq2m", // AVX-512 - COMPRESS and EXPAND // -multiclass compress_by_vec_width<bits<8> opc, X86VectorVTInfo _, +multiclass compress_by_vec_width_common<bits<8> opc, X86VectorVTInfo _, string OpcodeStr> { defm rr : AVX512_maskable<opc, MRMDestReg, _, (outs _.RC:$dst), (ins _.RC:$src1), OpcodeStr, "$src1", "$src1", @@ -6956,19 +8070,28 @@ multiclass compress_by_vec_width<bits<8> opc, X86VectorVTInfo _, def mrk : AVX5128I<opc, MRMDestMem, (outs), (ins _.MemOp:$dst, _.KRCWM:$mask, _.RC:$src), OpcodeStr # "\t{$src, $dst {${mask}}|$dst {${mask}}, $src}", - [(store (_.VT (vselect _.KRCWM:$mask, - (_.VT (X86compress _.RC:$src)), _.ImmAllZerosV)), - addr:$dst)]>, + []>, EVEX_K, EVEX_CD8<_.EltSize, CD8VT1>; } +multiclass compress_by_vec_width_lowering<X86VectorVTInfo _ > { + + def : Pat<(X86mCompressingStore addr:$dst, _.KRCWM:$mask, + (_.VT _.RC:$src)), + (!cast<Instruction>(NAME#_.ZSuffix##mrk) + addr:$dst, _.KRCWM:$mask, _.RC:$src)>; +} + multiclass compress_by_elt_width<bits<8> opc, string OpcodeStr, AVX512VLVectorVTInfo VTInfo> { - defm Z : compress_by_vec_width<opc, VTInfo.info512, OpcodeStr>, EVEX_V512; + defm Z : compress_by_vec_width_common<opc, VTInfo.info512, OpcodeStr>, + compress_by_vec_width_lowering<VTInfo.info512>, EVEX_V512; let Predicates = [HasVLX] in { - defm Z256 : compress_by_vec_width<opc, VTInfo.info256, OpcodeStr>, EVEX_V256; - defm Z128 : compress_by_vec_width<opc, VTInfo.info128, OpcodeStr>, EVEX_V128; + defm Z256 : compress_by_vec_width_common<opc, VTInfo.info256, OpcodeStr>, + compress_by_vec_width_lowering<VTInfo.info256>, EVEX_V256; + defm Z128 : compress_by_vec_width_common<opc, VTInfo.info128, OpcodeStr>, + compress_by_vec_width_lowering<VTInfo.info128>, EVEX_V128; } } @@ -6995,13 +8118,28 @@ multiclass expand_by_vec_width<bits<8> opc, X86VectorVTInfo _, AVX5128IBase, EVEX_CD8<_.EltSize, CD8VT1>; } +multiclass expand_by_vec_width_lowering<X86VectorVTInfo _ > { + + def : Pat<(_.VT (X86mExpandingLoad addr:$src, _.KRCWM:$mask, undef)), + (!cast<Instruction>(NAME#_.ZSuffix##rmkz) + _.KRCWM:$mask, addr:$src)>; + + def : Pat<(_.VT (X86mExpandingLoad addr:$src, _.KRCWM:$mask, + (_.VT _.RC:$src0))), + (!cast<Instruction>(NAME#_.ZSuffix##rmk) + _.RC:$src0, _.KRCWM:$mask, addr:$src)>; +} + multiclass expand_by_elt_width<bits<8> opc, string OpcodeStr, AVX512VLVectorVTInfo VTInfo> { - defm Z : expand_by_vec_width<opc, VTInfo.info512, OpcodeStr>, EVEX_V512; + defm Z : expand_by_vec_width<opc, VTInfo.info512, OpcodeStr>, + expand_by_vec_width_lowering<VTInfo.info512>, EVEX_V512; let Predicates = [HasVLX] in { - defm Z256 : expand_by_vec_width<opc, VTInfo.info256, OpcodeStr>, EVEX_V256; - defm Z128 : expand_by_vec_width<opc, VTInfo.info128, OpcodeStr>, EVEX_V128; + defm Z256 : expand_by_vec_width<opc, VTInfo.info256, OpcodeStr>, + expand_by_vec_width_lowering<VTInfo.info256>, 
EVEX_V256; + defm Z128 : expand_by_vec_width<opc, VTInfo.info128, OpcodeStr>, + expand_by_vec_width_lowering<VTInfo.info128>, EVEX_V128; } } @@ -7019,7 +8157,8 @@ defm VEXPANDPD : expand_by_elt_width <0x88, "vexpandpd", avx512vl_f64_info>, // op(broadcast(eltVt),imm) //all instruction created with FROUND_CURRENT multiclass avx512_unary_fp_packed_imm<bits<8> opc, string OpcodeStr, SDNode OpNode, - X86VectorVTInfo _>{ + X86VectorVTInfo _>{ + let ExeDomain = _.ExeDomain in { defm rri : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src1, i32u8imm:$src2), OpcodeStr##_.Suffix, "$src2, $src1", "$src1, $src2", @@ -7039,11 +8178,13 @@ multiclass avx512_unary_fp_packed_imm<bits<8> opc, string OpcodeStr, SDNode OpNo (OpNode (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src1))), (i32 imm:$src2), (i32 FROUND_CURRENT))>, EVEX_B; + } } //handle instruction reg_vec1 = op(reg_vec2,reg_vec3,imm),{sae} multiclass avx512_unary_fp_sae_packed_imm<bits<8> opc, string OpcodeStr, SDNode OpNode, X86VectorVTInfo _>{ + let ExeDomain = _.ExeDomain in defm rrib : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src1, i32u8imm:$src2), OpcodeStr##_.Suffix, "$src2, {sae}, $src1", @@ -7073,7 +8214,8 @@ multiclass avx512_common_unary_fp_sae_packed_imm<string OpcodeStr, // op(reg_vec2,broadcast(eltVt),imm) //all instruction created with FROUND_CURRENT multiclass avx512_fp_packed_imm<bits<8> opc, string OpcodeStr, SDNode OpNode, - X86VectorVTInfo _>{ + X86VectorVTInfo _>{ + let ExeDomain = _.ExeDomain in { defm rri : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3), OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3", @@ -7096,13 +8238,14 @@ multiclass avx512_fp_packed_imm<bits<8> opc, string OpcodeStr, SDNode OpNode, (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src2))), (i32 imm:$src3), (i32 FROUND_CURRENT))>, EVEX_B; + } } //handle instruction reg_vec1 = op(reg_vec2,reg_vec3,imm) // op(reg_vec2,mem_vec,imm) multiclass avx512_3Op_rm_imm8<bits<8> opc, string OpcodeStr, SDNode OpNode, X86VectorVTInfo DestInfo, X86VectorVTInfo SrcInfo>{ - + let ExeDomain = DestInfo.ExeDomain in { defm rri : AVX512_maskable<opc, MRMSrcReg, DestInfo, (outs DestInfo.RC:$dst), (ins SrcInfo.RC:$src1, SrcInfo.RC:$src2, u8imm:$src3), OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3", @@ -7116,6 +8259,7 @@ multiclass avx512_3Op_rm_imm8<bits<8> opc, string OpcodeStr, SDNode OpNode, (SrcInfo.VT (bitconvert (SrcInfo.LdFrag addr:$src2))), (i8 imm:$src3)))>; + } } //handle instruction reg_vec1 = op(reg_vec2,reg_vec3,imm) @@ -7125,6 +8269,7 @@ multiclass avx512_3Op_imm8<bits<8> opc, string OpcodeStr, SDNode OpNode, X86VectorVTInfo _>: avx512_3Op_rm_imm8<opc, OpcodeStr, OpNode, _, _>{ + let ExeDomain = _.ExeDomain in defm rmbi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src1, _.ScalarMemOp:$src2, u8imm:$src3), OpcodeStr, "$src3, ${src2}"##_.BroadcastStr##", $src1", @@ -7138,8 +8283,8 @@ multiclass avx512_3Op_imm8<bits<8> opc, string OpcodeStr, SDNode OpNode, // op(reg_vec2,mem_scalar,imm) //all instruction created with FROUND_CURRENT multiclass avx512_fp_scalar_imm<bits<8> opc, string OpcodeStr, SDNode OpNode, - X86VectorVTInfo _> { - + X86VectorVTInfo _> { + let ExeDomain = _.ExeDomain in { defm rri : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3), OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3", @@ -7148,25 +8293,20 @@ multiclass avx512_fp_scalar_imm<bits<8> opc, string OpcodeStr, SDNode 
OpNode, (i32 imm:$src3), (i32 FROUND_CURRENT))>; defm rmi : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst), - (ins _.RC:$src1, _.MemOp:$src2, i32u8imm:$src3), + (ins _.RC:$src1, _.ScalarMemOp:$src2, i32u8imm:$src3), OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3", (OpNode (_.VT _.RC:$src1), (_.VT (scalar_to_vector (_.ScalarLdFrag addr:$src2))), (i32 imm:$src3), (i32 FROUND_CURRENT))>; - - let isAsmParserOnly = 1, mayLoad = 1, hasSideEffects = 0 in { - defm rmi_alt :AVX512_maskable_in_asm<opc, MRMSrcMem, _, (outs _.FRC:$dst), - (ins _.FRC:$src1, _.ScalarMemOp:$src2, u8imm:$src3), - OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3", - []>; } } //handle instruction reg_vec1 = op(reg_vec2,reg_vec3,imm),{sae} multiclass avx512_fp_sae_packed_imm<bits<8> opc, string OpcodeStr, SDNode OpNode, X86VectorVTInfo _>{ + let ExeDomain = _.ExeDomain in defm rrib : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3), OpcodeStr, "$src3, {sae}, $src2, $src1", @@ -7439,14 +8579,64 @@ multiclass avx512_unary_rm_vl_all<bits<8> opc_b, bits<8> opc_w, defm VPABS : avx512_unary_rm_vl_all<0x1C, 0x1D, 0x1E, 0x1F, "vpabs", X86Abs>; +def avx512_v16i1sextv16i8 : PatLeaf<(v16i8 (X86pcmpgt (bc_v16i8 (v4i32 immAllZerosV)), + VR128X:$src))>; +def avx512_v8i1sextv8i16 : PatLeaf<(v8i16 (X86vsrai VR128X:$src, (i8 15)))>; +def avx512_v4i1sextv4i32 : PatLeaf<(v4i32 (X86vsrai VR128X:$src, (i8 31)))>; +def avx512_v32i1sextv32i8 : PatLeaf<(v32i8 (X86pcmpgt (bc_v32i8 (v8i32 immAllZerosV)), + VR256X:$src))>; +def avx512_v16i1sextv16i16: PatLeaf<(v16i16 (X86vsrai VR256X:$src, (i8 15)))>; +def avx512_v8i1sextv8i32 : PatLeaf<(v8i32 (X86vsrai VR256X:$src, (i8 31)))>; + +let Predicates = [HasBWI, HasVLX] in { + def : Pat<(xor + (bc_v2i64 (avx512_v16i1sextv16i8)), + (bc_v2i64 (add (v16i8 VR128X:$src), (avx512_v16i1sextv16i8)))), + (VPABSBZ128rr VR128X:$src)>; + def : Pat<(xor + (bc_v2i64 (avx512_v8i1sextv8i16)), + (bc_v2i64 (add (v8i16 VR128X:$src), (avx512_v8i1sextv8i16)))), + (VPABSWZ128rr VR128X:$src)>; + def : Pat<(xor + (bc_v4i64 (avx512_v32i1sextv32i8)), + (bc_v4i64 (add (v32i8 VR256X:$src), (avx512_v32i1sextv32i8)))), + (VPABSBZ256rr VR256X:$src)>; + def : Pat<(xor + (bc_v4i64 (avx512_v16i1sextv16i16)), + (bc_v4i64 (add (v16i16 VR256X:$src), (avx512_v16i1sextv16i16)))), + (VPABSWZ256rr VR256X:$src)>; +} +let Predicates = [HasAVX512, HasVLX] in { + def : Pat<(xor + (bc_v2i64 (avx512_v4i1sextv4i32)), + (bc_v2i64 (add (v4i32 VR128X:$src), (avx512_v4i1sextv4i32)))), + (VPABSDZ128rr VR128X:$src)>; + def : Pat<(xor + (bc_v4i64 (avx512_v8i1sextv8i32)), + (bc_v4i64 (add (v8i32 VR256X:$src), (avx512_v8i1sextv8i32)))), + (VPABSDZ256rr VR256X:$src)>; +} + +let Predicates = [HasAVX512] in { def : Pat<(xor - (bc_v16i32 (v16i1sextv16i32)), - (bc_v16i32 (add (v16i32 VR512:$src), (v16i1sextv16i32)))), + (bc_v8i64 (v16i1sextv16i32)), + (bc_v8i64 (add (v16i32 VR512:$src), (v16i1sextv16i32)))), (VPABSDZrr VR512:$src)>; def : Pat<(xor (bc_v8i64 (v8i1sextv8i64)), (bc_v8i64 (add (v8i64 VR512:$src), (v8i1sextv8i64)))), (VPABSQZrr VR512:$src)>; +} +let Predicates = [HasBWI] in { +def : Pat<(xor + (bc_v8i64 (v64i1sextv64i8)), + (bc_v8i64 (add (v64i8 VR512:$src), (v64i1sextv64i8)))), + (VPABSBZrr VR512:$src)>; +def : Pat<(xor + (bc_v8i64 (v32i1sextv32i16)), + (bc_v8i64 (add (v32i16 VR512:$src), (v32i1sextv32i16)))), + (VPABSWZrr VR512:$src)>; +} multiclass avx512_ctlz<bits<8> opc, string OpcodeStr, Predicate prd>{ @@ -7503,16 +8693,44 @@ multiclass avx512_movddup<bits<8> opc, string 
OpcodeStr, SDNode OpNode>{ defm VMOVDDUP : avx512_movddup<0x12, "vmovddup", X86Movddup>; +let Predicates = [HasVLX] in { def : Pat<(X86Movddup (loadv2f64 addr:$src)), - (VMOVDDUPZ128rm addr:$src)>, Requires<[HasAVX512, HasVLX]>; + (VMOVDDUPZ128rm addr:$src)>; def : Pat<(v2f64 (X86VBroadcast (loadf64 addr:$src))), - (VMOVDDUPZ128rm addr:$src)>, Requires<[HasAVX512, HasVLX]>; + (VMOVDDUPZ128rm addr:$src)>; +def : Pat<(v2f64 (X86VBroadcast f64:$src)), + (VMOVDDUPZ128rr (COPY_TO_REGCLASS FR64X:$src, VR128X))>; + +def : Pat<(vselect (v2i1 VK2WM:$mask), (X86Movddup (loadv2f64 addr:$src)), + (v2f64 VR128X:$src0)), + (VMOVDDUPZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>; +def : Pat<(vselect (v2i1 VK2WM:$mask), (X86Movddup (loadv2f64 addr:$src)), + (bitconvert (v4i32 immAllZerosV))), + (VMOVDDUPZ128rmkz VK2WM:$mask, addr:$src)>; + +def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast f64:$src)), + (v2f64 VR128X:$src0)), + (VMOVDDUPZ128rrk VR128X:$src0, VK2WM:$mask, + (COPY_TO_REGCLASS FR64X:$src, VR128X))>; +def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast f64:$src)), + (bitconvert (v4i32 immAllZerosV))), + (VMOVDDUPZ128rrkz VK2WM:$mask, (COPY_TO_REGCLASS FR64X:$src, VR128X))>; + +def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast (loadf64 addr:$src))), + (v2f64 VR128X:$src0)), + (VMOVDDUPZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>; +def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast (loadf64 addr:$src))), + (bitconvert (v4i32 immAllZerosV))), + (VMOVDDUPZ128rmkz VK2WM:$mask, addr:$src)>; +} //===----------------------------------------------------------------------===// // AVX-512 - Unpack Instructions //===----------------------------------------------------------------------===// -defm VUNPCKH : avx512_fp_binop_p<0x15, "vunpckh", X86Unpckh, HasAVX512>; -defm VUNPCKL : avx512_fp_binop_p<0x14, "vunpckl", X86Unpckl, HasAVX512>; +defm VUNPCKH : avx512_fp_binop_p<0x15, "vunpckh", X86Unpckh, HasAVX512, + SSE_ALU_ITINS_S>; +defm VUNPCKL : avx512_fp_binop_p<0x14, "vunpckl", X86Unpckl, HasAVX512, + SSE_ALU_ITINS_S>; defm VPUNPCKLBW : avx512_binop_rm_vl_b<0x60, "vpunpcklbw", X86Unpckl, SSE_INTALU_ITINS_P, HasBWI>; @@ -7730,22 +8948,22 @@ defm VPSADBW : avx512_psadbw_packed_all<0xf6, X86psadbw, "vpsadbw", HasBWI>, EVEX_4V; multiclass avx512_ternlog<bits<8> opc, string OpcodeStr, SDNode OpNode, - X86VectorVTInfo _>{ - let Constraints = "$src1 = $dst" in { + X86VectorVTInfo _>{ + let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in { defm rri : AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src2, _.RC:$src3, u8imm:$src4), OpcodeStr, "$src4, $src3, $src2", "$src2, $src3, $src4", (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), (_.VT _.RC:$src3), - (i8 imm:$src4))>, AVX512AIi8Base, EVEX_4V; + (i8 imm:$src4)), 1, 1>, AVX512AIi8Base, EVEX_4V; defm rmi : AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src2, _.MemOp:$src3, u8imm:$src4), OpcodeStr, "$src4, $src3, $src2", "$src2, $src3, $src4", (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), (_.VT (bitconvert (_.LdFrag addr:$src3))), - (i8 imm:$src4))>, + (i8 imm:$src4)), 1, 0>, AVX512AIi8Base, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>; defm rmbi : AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src2, _.ScalarMemOp:$src3, u8imm:$src4), @@ -7754,7 +8972,7 @@ multiclass avx512_ternlog<bits<8> opc, string OpcodeStr, SDNode OpNode, (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src3))), - (i8 imm:$src4))>, EVEX_B, + (i8 
imm:$src4)), 1, 0>, EVEX_B, AVX512AIi8Base, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>; }// Constraints = "$src1 = $dst" } @@ -7776,8 +8994,8 @@ defm VPTERNLOGQ : avx512_common_ternlog<"vpternlogq", avx512vl_i64_info>, VEX_W; //===----------------------------------------------------------------------===// multiclass avx512_fixupimm_packed<bits<8> opc, string OpcodeStr, SDNode OpNode, - X86VectorVTInfo _>{ - let Constraints = "$src1 = $dst" in { + X86VectorVTInfo _>{ + let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in { defm rri : AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src2, _.RC:$src3, i32u8imm:$src4), OpcodeStr##_.Suffix, "$src4, $src3, $src2", "$src2, $src3, $src4", @@ -7807,8 +9025,8 @@ multiclass avx512_fixupimm_packed<bits<8> opc, string OpcodeStr, SDNode OpNode, } multiclass avx512_fixupimm_packed_sae<bits<8> opc, string OpcodeStr, - SDNode OpNode, X86VectorVTInfo _>{ -let Constraints = "$src1 = $dst" in { + SDNode OpNode, X86VectorVTInfo _>{ +let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in { defm rrib : AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src2, _.RC:$src3, i32u8imm:$src4), OpcodeStr##_.Suffix, "$src4, {sae}, $src3, $src2", @@ -7823,7 +9041,8 @@ let Constraints = "$src1 = $dst" in { multiclass avx512_fixupimm_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode, X86VectorVTInfo _, X86VectorVTInfo _src3VT> { - let Constraints = "$src1 = $dst" , Predicates = [HasAVX512] in { + let Constraints = "$src1 = $dst" , Predicates = [HasAVX512], + ExeDomain = _.ExeDomain in { defm rri : AVX512_maskable_3src_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src2, _.RC:$src3, i32u8imm:$src4), OpcodeStr##_.Suffix, "$src4, $src3, $src2", "$src2, $src3, $src4", @@ -7877,3 +9096,135 @@ defm VFIXUPIMMPS : avx512_fixupimm_packed_all<avx512vl_f32_info>, EVEX_CD8<32, CD8VF>; defm VFIXUPIMMPD : avx512_fixupimm_packed_all<avx512vl_f64_info>, EVEX_CD8<64, CD8VF>, VEX_W; + + + +// Patterns used to select SSE scalar fp arithmetic instructions from +// either: +// +// (1) a scalar fp operation followed by a blend +// +// The effect is that the backend no longer emits unnecessary vector +// insert instructions immediately after SSE scalar fp instructions +// like addss or mulss. +// +// For example, given the following code: +// __m128 foo(__m128 A, __m128 B) { +// A[0] += B[0]; +// return A; +// } +// +// Previously we generated: +// addss %xmm0, %xmm1 +// movss %xmm1, %xmm0 +// +// We now generate: +// addss %xmm1, %xmm0 +// +// (2) a vector packed single/double fp operation followed by a vector insert +// +// The effect is that the backend converts the packed fp instruction +// followed by a vector insert into a single SSE scalar fp instruction. +// +// For example, given the following code: +// __m128 foo(__m128 A, __m128 B) { +// __m128 C = A + B; +// return (__m128) {c[0], a[1], a[2], a[3]}; +// } +// +// Previously we generated: +// addps %xmm0, %xmm1 +// movss %xmm1, %xmm0 +// +// We now generate: +// addss %xmm1, %xmm0 + +// TODO: Some canonicalization in lowering would simplify the number of +// patterns we have to try to match. 
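To make the comment above concrete (this snippet is not part of the patch, and the function name is made up), here is the same lane-0 idiom written with SSE intrinsics; the scalar add plus the movss-style re-insert is the shape that the AVX512_scalar_math_f32/f64_patterns defined just below are meant to collapse into a single (v)addss:

#include <immintrin.h>
#include <cstdio>

// Add only element 0 of B into A, keeping A's upper lanes, mirroring the
// "A[0] += B[0]" example in the comment above.
static __m128 add_lane0(__m128 A, __m128 B) {
  __m128 sum = _mm_set_ss(_mm_cvtss_f32(A) + _mm_cvtss_f32(B)); // scalar add in lane 0
  return _mm_move_ss(A, sum);                                   // movss-style insert back into A
}

int main() {
  __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
  __m128 B = _mm_setr_ps(10.0f, 20.0f, 30.0f, 40.0f);
  float out[4];
  _mm_storeu_ps(out, add_lane0(A, B));
  std::printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]); // 11 2 3 4
}

Whether a single addss/vaddss is actually emitted depends on the optimization level and target features, but with these patterns the blend/movss re-insert no longer forces a separate move.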
+multiclass AVX512_scalar_math_f32_patterns<SDNode Op, string OpcPrefix> { + let Predicates = [HasAVX512] in { + // extracted scalar math op with insert via movss + def : Pat<(v4f32 (X86Movss (v4f32 VR128X:$dst), (v4f32 (scalar_to_vector + (Op (f32 (extractelt (v4f32 VR128X:$dst), (iPTR 0))), + FR32X:$src))))), + (!cast<I>("V"#OpcPrefix#SSZrr_Int) v4f32:$dst, + (COPY_TO_REGCLASS FR32X:$src, VR128X))>; + + // extracted scalar math op with insert via blend + def : Pat<(v4f32 (X86Blendi (v4f32 VR128X:$dst), (v4f32 (scalar_to_vector + (Op (f32 (extractelt (v4f32 VR128X:$dst), (iPTR 0))), + FR32X:$src))), (i8 1))), + (!cast<I>("V"#OpcPrefix#SSZrr_Int) v4f32:$dst, + (COPY_TO_REGCLASS FR32X:$src, VR128X))>; + + // vector math op with insert via movss + def : Pat<(v4f32 (X86Movss (v4f32 VR128X:$dst), + (Op (v4f32 VR128X:$dst), (v4f32 VR128X:$src)))), + (!cast<I>("V"#OpcPrefix#SSZrr_Int) v4f32:$dst, v4f32:$src)>; + + // vector math op with insert via blend + def : Pat<(v4f32 (X86Blendi (v4f32 VR128X:$dst), + (Op (v4f32 VR128X:$dst), (v4f32 VR128X:$src)), (i8 1))), + (!cast<I>("V"#OpcPrefix#SSZrr_Int) v4f32:$dst, v4f32:$src)>; + + // extracted masked scalar math op with insert via movss + def : Pat<(X86Movss (v4f32 VR128X:$src1), + (scalar_to_vector + (X86selects VK1WM:$mask, + (Op (f32 (extractelt (v4f32 VR128X:$src1), (iPTR 0))), + FR32X:$src2), + FR32X:$src0))), + (!cast<I>("V"#OpcPrefix#SSZrr_Intk) (COPY_TO_REGCLASS FR32X:$src0, VR128X), + VK1WM:$mask, v4f32:$src1, + (COPY_TO_REGCLASS FR32X:$src2, VR128X))>; + } +} + +defm : AVX512_scalar_math_f32_patterns<fadd, "ADD">; +defm : AVX512_scalar_math_f32_patterns<fsub, "SUB">; +defm : AVX512_scalar_math_f32_patterns<fmul, "MUL">; +defm : AVX512_scalar_math_f32_patterns<fdiv, "DIV">; + +multiclass AVX512_scalar_math_f64_patterns<SDNode Op, string OpcPrefix> { + let Predicates = [HasAVX512] in { + // extracted scalar math op with insert via movsd + def : Pat<(v2f64 (X86Movsd (v2f64 VR128X:$dst), (v2f64 (scalar_to_vector + (Op (f64 (extractelt (v2f64 VR128X:$dst), (iPTR 0))), + FR64X:$src))))), + (!cast<I>("V"#OpcPrefix#SDZrr_Int) v2f64:$dst, + (COPY_TO_REGCLASS FR64X:$src, VR128X))>; + + // extracted scalar math op with insert via blend + def : Pat<(v2f64 (X86Blendi (v2f64 VR128X:$dst), (v2f64 (scalar_to_vector + (Op (f64 (extractelt (v2f64 VR128X:$dst), (iPTR 0))), + FR64X:$src))), (i8 1))), + (!cast<I>("V"#OpcPrefix#SDZrr_Int) v2f64:$dst, + (COPY_TO_REGCLASS FR64X:$src, VR128X))>; + + // vector math op with insert via movsd + def : Pat<(v2f64 (X86Movsd (v2f64 VR128X:$dst), + (Op (v2f64 VR128X:$dst), (v2f64 VR128X:$src)))), + (!cast<I>("V"#OpcPrefix#SDZrr_Int) v2f64:$dst, v2f64:$src)>; + + // vector math op with insert via blend + def : Pat<(v2f64 (X86Blendi (v2f64 VR128X:$dst), + (Op (v2f64 VR128X:$dst), (v2f64 VR128X:$src)), (i8 1))), + (!cast<I>("V"#OpcPrefix#SDZrr_Int) v2f64:$dst, v2f64:$src)>; + + // extracted masked scalar math op with insert via movss + def : Pat<(X86Movsd (v2f64 VR128X:$src1), + (scalar_to_vector + (X86selects VK1WM:$mask, + (Op (f64 (extractelt (v2f64 VR128X:$src1), (iPTR 0))), + FR64X:$src2), + FR64X:$src0))), + (!cast<I>("V"#OpcPrefix#SDZrr_Intk) (COPY_TO_REGCLASS FR64X:$src0, VR128X), + VK1WM:$mask, v2f64:$src1, + (COPY_TO_REGCLASS FR64X:$src2, VR128X))>; + } +} + +defm : AVX512_scalar_math_f64_patterns<fadd, "ADD">; +defm : AVX512_scalar_math_f64_patterns<fsub, "SUB">; +defm : AVX512_scalar_math_f64_patterns<fmul, "MUL">; +defm : AVX512_scalar_math_f64_patterns<fdiv, "DIV">; diff --git 
a/contrib/llvm/lib/Target/X86/X86InstrArithmetic.td b/contrib/llvm/lib/Target/X86/X86InstrArithmetic.td index 1a2e786..bfd21c0 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrArithmetic.td +++ b/contrib/llvm/lib/Target/X86/X86InstrArithmetic.td @@ -625,7 +625,7 @@ def Xi32 : X86TypeInfo<i32, "l", GR32, loadi32, i32mem, Imm32, i32imm, imm32_su, i32i8imm, i32immSExt8_su, 1, OpSize32, 0>; def Xi64 : X86TypeInfo<i64, "q", GR64, loadi64, i64mem, - Imm32S, i64i32imm, i64immSExt32, i64i8imm, i64immSExt8, + Imm32S, i64i32imm, i64immSExt32_su, i64i8imm, i64immSExt8_su, 1, OpSizeFixed, 1>; /// ITy - This instruction base class takes the type info for the instruction. diff --git a/contrib/llvm/lib/Target/X86/X86InstrBuilder.h b/contrib/llvm/lib/Target/X86/X86InstrBuilder.h index bcea6fa..ba970bc 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrBuilder.h +++ b/contrib/llvm/lib/Target/X86/X86InstrBuilder.h @@ -24,9 +24,15 @@ #ifndef LLVM_LIB_TARGET_X86_X86INSTRBUILDER_H #define LLVM_LIB_TARGET_X86_X86INSTRBUILDER_H +#include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineMemOperand.h" +#include "llvm/CodeGen/MachineOperand.h" +#include "llvm/MC/MCInstrDesc.h" +#include <cassert> namespace llvm { @@ -57,12 +63,11 @@ struct X86AddressMode { Base.Reg = 0; } - void getFullAddress(SmallVectorImpl<MachineOperand> &MO) { assert(Scale == 1 || Scale == 2 || Scale == 4 || Scale == 8); if (BaseType == X86AddressMode::RegBase) - MO.push_back(MachineOperand::CreateReg(Base.Reg, false, false, + MO.push_back(MachineOperand::CreateReg(Base.Reg, false, false, false, false, false, false, 0, false)); else { assert(BaseType == X86AddressMode::FrameIndexBase); @@ -70,44 +75,45 @@ struct X86AddressMode { } MO.push_back(MachineOperand::CreateImm(Scale)); - MO.push_back(MachineOperand::CreateReg(IndexReg, false, false, - false, false, false, 0, false)); + MO.push_back(MachineOperand::CreateReg(IndexReg, false, false, false, false, + false, false, 0, false)); if (GV) MO.push_back(MachineOperand::CreateGA(GV, Disp, GVOpFlags)); else MO.push_back(MachineOperand::CreateImm(Disp)); - MO.push_back(MachineOperand::CreateReg(0, false, false, - false, false, false, 0, false)); + MO.push_back(MachineOperand::CreateReg(0, false, false, false, false, false, + false, 0, false)); } }; /// Compute the addressing mode from an machine instruction starting with the /// given operand. 
-static inline X86AddressMode getAddressFromInstr(MachineInstr *MI, +static inline X86AddressMode getAddressFromInstr(const MachineInstr *MI, unsigned Operand) { X86AddressMode AM; - MachineOperand &Op = MI->getOperand(Operand); - if (Op.isReg()) { + const MachineOperand &Op0 = MI->getOperand(Operand); + if (Op0.isReg()) { AM.BaseType = X86AddressMode::RegBase; - AM.Base.Reg = Op.getReg(); + AM.Base.Reg = Op0.getReg(); } else { AM.BaseType = X86AddressMode::FrameIndexBase; - AM.Base.FrameIndex = Op.getIndex(); - } - Op = MI->getOperand(Operand + 1); - if (Op.isImm()) - AM.Scale = Op.getImm(); - Op = MI->getOperand(Operand + 2); - if (Op.isImm()) - AM.IndexReg = Op.getImm(); - Op = MI->getOperand(Operand + 3); - if (Op.isGlobal()) { - AM.GV = Op.getGlobal(); - } else { - AM.Disp = Op.getImm(); + AM.Base.FrameIndex = Op0.getIndex(); } + + const MachineOperand &Op1 = MI->getOperand(Operand + 1); + AM.Scale = Op1.getImm(); + + const MachineOperand &Op2 = MI->getOperand(Operand + 2); + AM.IndexReg = Op2.getReg(); + + const MachineOperand &Op3 = MI->getOperand(Operand + 3); + if (Op3.isGlobal()) + AM.GV = Op3.getGlobal(); + else + AM.Disp = Op3.getImm(); + return AM; } @@ -122,12 +128,28 @@ addDirectMem(const MachineInstrBuilder &MIB, unsigned Reg) { return MIB.addReg(Reg).addImm(1).addReg(0).addImm(0).addReg(0); } +/// Replace the address used in the instruction with the direct memory +/// reference. +static inline void setDirectAddressInInstr(MachineInstr *MI, unsigned Operand, + unsigned Reg) { + // Direct memory address is in a form of: Reg, 1 (Scale), NoReg, 0, NoReg. + MI->getOperand(Operand).setReg(Reg); + MI->getOperand(Operand + 1).setImm(1); + MI->getOperand(Operand + 2).setReg(0); + MI->getOperand(Operand + 3).setImm(0); + MI->getOperand(Operand + 4).setReg(0); +} static inline const MachineInstrBuilder & addOffset(const MachineInstrBuilder &MIB, int Offset) { return MIB.addImm(1).addReg(0).addImm(Offset).addReg(0); } +static inline const MachineInstrBuilder & +addOffset(const MachineInstrBuilder &MIB, const MachineOperand& Offset) { + return MIB.addImm(1).addReg(0).addOperand(Offset).addReg(0); +} + /// addRegOffset - This function is used to add a memory reference of the form /// [Reg + Offset], i.e., one with no scale or index, but with a /// displacement. An example is: DWORD PTR [EAX + 4]. 
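As a usage sketch only (not part of the patch; the wrapper function and its parameters are hypothetical, and the includes assume an in-tree X86 backend build), the [Reg + Offset] form described above is typically built from an X86 codegen pass like this:

#include "X86InstrBuilder.h"
#include "X86InstrInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"

using namespace llvm;

// Emit "mov DstReg, dword ptr [BaseReg + 4]" before the iterator I, i.e. the
// DWORD PTR [EAX + 4] shape mentioned in the addRegOffset comment above.
// addRegOffset appends the base register and then the remaining
// scale/index/displacement/segment sub-operands.
static void emitLoadRegPlus4(MachineBasicBlock &MBB,
                             MachineBasicBlock::iterator I, const DebugLoc &DL,
                             const TargetInstrInfo &TII, unsigned DstReg,
                             unsigned BaseReg) {
  addRegOffset(BuildMI(MBB, I, DL, TII.get(X86::MOV32rm), DstReg),
               BaseReg, /*isKill=*/false, /*Offset=*/4);
}

The builder chain produces the same five-sub-operand layout (base, scale, index, displacement, segment) that getAddressFromInstr and setDirectAddressInInstr above manipulate.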
@@ -177,7 +199,7 @@ static inline const MachineInstrBuilder & addFrameReference(const MachineInstrBuilder &MIB, int FI, int Offset = 0) { MachineInstr *MI = MIB; MachineFunction &MF = *MI->getParent()->getParent(); - MachineFrameInfo &MFI = *MF.getFrameInfo(); + MachineFrameInfo &MFI = MF.getFrameInfo(); const MCInstrDesc &MCID = MI->getDesc(); auto Flags = MachineMemOperand::MONone; if (MCID.mayLoad()) @@ -206,6 +228,6 @@ addConstantPoolReference(const MachineInstrBuilder &MIB, unsigned CPI, .addConstantPoolIndex(CPI, 0, OpFlags).addReg(0); } -} // End llvm namespace +} // end namespace llvm -#endif +#endif // LLVM_LIB_TARGET_X86_X86INSTRBUILDER_H diff --git a/contrib/llvm/lib/Target/X86/X86InstrCompiler.td b/contrib/llvm/lib/Target/X86/X86InstrCompiler.td index 925f4ef..3c27eb8 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrCompiler.td +++ b/contrib/llvm/lib/Target/X86/X86InstrCompiler.td @@ -723,7 +723,7 @@ defm LOCK_DEC : LOCK_ArithUnOp<0xFE, 0xFF, MRM1m, -1, "dec">; multiclass LCMPXCHG_UnOp<bits<8> Opc, Format Form, string mnemonic, SDPatternOperator frag, X86MemOperand x86memop, InstrItinClass itin> { -let isCodeGenOnly = 1 in { +let isCodeGenOnly = 1, usesCustomInserter = 1 in { def NAME : I<Opc, Form, (outs), (ins x86memop:$ptr), !strconcat(mnemonic, "\t$ptr"), [(frag addr:$ptr)], itin>, TB, LOCK; @@ -1025,53 +1025,6 @@ def : Pat<(store (i32 -1), addr:$dst), (OR32mi8 addr:$dst, -1)>; def : Pat<(store (i64 -1), addr:$dst), (OR64mi8 addr:$dst, -1)>; } -// ConstantPool GlobalAddress, ExternalSymbol, and JumpTable -def : Pat<(i32 (X86Wrapper tconstpool :$dst)), (MOV32ri tconstpool :$dst)>; -def : Pat<(i32 (X86Wrapper tjumptable :$dst)), (MOV32ri tjumptable :$dst)>; -def : Pat<(i32 (X86Wrapper tglobaltlsaddr:$dst)),(MOV32ri tglobaltlsaddr:$dst)>; -def : Pat<(i32 (X86Wrapper tglobaladdr :$dst)), (MOV32ri tglobaladdr :$dst)>; -def : Pat<(i32 (X86Wrapper texternalsym:$dst)), (MOV32ri texternalsym:$dst)>; -def : Pat<(i32 (X86Wrapper mcsym:$dst)), (MOV32ri mcsym:$dst)>; -def : Pat<(i32 (X86Wrapper tblockaddress:$dst)), (MOV32ri tblockaddress:$dst)>; - -def : Pat<(add GR32:$src1, (X86Wrapper tconstpool:$src2)), - (ADD32ri GR32:$src1, tconstpool:$src2)>; -def : Pat<(add GR32:$src1, (X86Wrapper tjumptable:$src2)), - (ADD32ri GR32:$src1, tjumptable:$src2)>; -def : Pat<(add GR32:$src1, (X86Wrapper tglobaladdr :$src2)), - (ADD32ri GR32:$src1, tglobaladdr:$src2)>; -def : Pat<(add GR32:$src1, (X86Wrapper texternalsym:$src2)), - (ADD32ri GR32:$src1, texternalsym:$src2)>; -def : Pat<(add GR32:$src1, (X86Wrapper mcsym:$src2)), - (ADD32ri GR32:$src1, mcsym:$src2)>; -def : Pat<(add GR32:$src1, (X86Wrapper tblockaddress:$src2)), - (ADD32ri GR32:$src1, tblockaddress:$src2)>; - -def : Pat<(store (i32 (X86Wrapper tglobaladdr:$src)), addr:$dst), - (MOV32mi addr:$dst, tglobaladdr:$src)>; -def : Pat<(store (i32 (X86Wrapper texternalsym:$src)), addr:$dst), - (MOV32mi addr:$dst, texternalsym:$src)>; -def : Pat<(store (i32 (X86Wrapper mcsym:$src)), addr:$dst), - (MOV32mi addr:$dst, mcsym:$src)>; -def : Pat<(store (i32 (X86Wrapper tblockaddress:$src)), addr:$dst), - (MOV32mi addr:$dst, tblockaddress:$src)>; - -// ConstantPool GlobalAddress, ExternalSymbol, and JumpTable when not in small -// code model mode, should use 'movabs'. FIXME: This is really a hack, the -// 'movabs' predicate should handle this sort of thing. 
-def : Pat<(i64 (X86Wrapper tconstpool :$dst)), - (MOV64ri tconstpool :$dst)>, Requires<[FarData]>; -def : Pat<(i64 (X86Wrapper tjumptable :$dst)), - (MOV64ri tjumptable :$dst)>, Requires<[FarData]>; -def : Pat<(i64 (X86Wrapper tglobaladdr :$dst)), - (MOV64ri tglobaladdr :$dst)>, Requires<[FarData]>; -def : Pat<(i64 (X86Wrapper texternalsym:$dst)), - (MOV64ri texternalsym:$dst)>, Requires<[FarData]>; -def : Pat<(i64 (X86Wrapper mcsym:$dst)), - (MOV64ri mcsym:$dst)>, Requires<[FarData]>; -def : Pat<(i64 (X86Wrapper tblockaddress:$dst)), - (MOV64ri tblockaddress:$dst)>, Requires<[FarData]>; - // In kernel code model, we can get the address of a label // into a register with 'movq'. FIXME: This is a hack, the 'imm' predicate of // the MOV64ri32 should accept these. @@ -1289,15 +1242,13 @@ def : Pat<(i64 (anyext GR32:$src)), // Any instruction that defines a 32-bit result leaves the high half of the // register. Truncate can be lowered to EXTRACT_SUBREG. CopyFromReg may -// be copying from a truncate. And x86's cmov doesn't do anything if the -// condition is false. But any other 32-bit operation will zero-extend +// be copying from a truncate. Any other 32-bit operation will zero-extend // up to 64 bits. def def32 : PatLeaf<(i32 GR32:$src), [{ return N->getOpcode() != ISD::TRUNCATE && N->getOpcode() != TargetOpcode::EXTRACT_SUBREG && N->getOpcode() != ISD::CopyFromReg && - N->getOpcode() != ISD::AssertSext && - N->getOpcode() != X86ISD::CMOV; + N->getOpcode() != ISD::AssertSext; }]>; // In the case of a 32-bit def that is known to implicitly zero-extend, @@ -1711,6 +1662,22 @@ defm : MaskedShiftAmountPats<sra, "SAR">; defm : MaskedShiftAmountPats<rotl, "ROL">; defm : MaskedShiftAmountPats<rotr, "ROR">; +// Double shift amount is implicitly masked. +multiclass MaskedDoubleShiftAmountPats<SDNode frag, string name> { + // (shift x (and y, 31)) ==> (shift x, y) + def : Pat<(frag GR16:$src1, GR16:$src2, (and CL, immShift32)), + (!cast<Instruction>(name # "16rrCL") GR16:$src1, GR16:$src2)>; + def : Pat<(frag GR32:$src1, GR32:$src2, (and CL, immShift32)), + (!cast<Instruction>(name # "32rrCL") GR32:$src1, GR32:$src2)>; + + // (shift x (and y, 63)) ==> (shift x, y) + def : Pat<(frag GR64:$src1, GR64:$src2, (and CL, immShift64)), + (!cast<Instruction>(name # "64rrCL") GR64:$src1, GR64:$src2)>; +} + +defm : MaskedDoubleShiftAmountPats<X86shld, "SHLD">; +defm : MaskedDoubleShiftAmountPats<X86shrd, "SHRD">; + // (anyext (setcc_carry)) -> (setcc_carry) def : Pat<(i16 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))), (SETB_C16r)>; @@ -1719,9 +1686,6 @@ def : Pat<(i32 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))), def : Pat<(i32 (anyext (i16 (X86setcc_c X86_COND_B, EFLAGS)))), (SETB_C32r)>; - - - //===----------------------------------------------------------------------===// // EFLAGS-defining Patterns //===----------------------------------------------------------------------===// diff --git a/contrib/llvm/lib/Target/X86/X86InstrControl.td b/contrib/llvm/lib/Target/X86/X86InstrControl.td index bb5f911..2f260c4 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrControl.td +++ b/contrib/llvm/lib/Target/X86/X86InstrControl.td @@ -239,7 +239,6 @@ let isCall = 1 in // Tail call stuff. 
- let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, isCodeGenOnly = 1, SchedRW = [WriteJumpLd] in let Uses = [ESP] in { @@ -257,6 +256,7 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, (ins i32imm_pcrel:$dst), "jmp\t$dst", [], IIC_JMP_REL>; + def TAILJMPr : I<0xFF, MRM4r, (outs), (ins ptr_rc_tailcall:$dst), "", [], IIC_JMP_REG>; // FIXME: Remove encoding when JIT is dead. let mayLoad = 1 in @@ -296,17 +296,18 @@ let isCall = 1, Uses = [RSP], SchedRW = [WriteJump] in { let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, isCodeGenOnly = 1, Uses = [RSP], usesCustomInserter = 1, SchedRW = [WriteJump] in { - def TCRETURNdi64 : PseudoI<(outs), - (ins i64i32imm_pcrel:$dst, i32imm:$offset), - []>; - def TCRETURNri64 : PseudoI<(outs), - (ins ptr_rc_tailcall:$dst, i32imm:$offset), []>; + def TCRETURNdi64 : PseudoI<(outs), + (ins i64i32imm_pcrel:$dst, i32imm:$offset), + []>; + def TCRETURNri64 : PseudoI<(outs), + (ins ptr_rc_tailcall:$dst, i32imm:$offset), []>; let mayLoad = 1 in - def TCRETURNmi64 : PseudoI<(outs), - (ins i64mem_TC:$dst, i32imm:$offset), []>; + def TCRETURNmi64 : PseudoI<(outs), + (ins i64mem_TC:$dst, i32imm:$offset), []>; def TAILJMPd64 : Ii32PCRel<0xE9, RawFrm, (outs), (ins i64i32imm_pcrel:$dst), "jmp\t$dst", [], IIC_JMP_REL>; + def TAILJMPr64 : I<0xFF, MRM4r, (outs), (ins ptr_rc_tailcall:$dst), "jmp{q}\t{*}$dst", [], IIC_JMP_MEM>; @@ -314,11 +315,8 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, def TAILJMPm64 : I<0xFF, MRM4m, (outs), (ins i64mem_TC:$dst), "jmp{q}\t{*}$dst", [], IIC_JMP_MEM>; - // Win64 wants jumps leaving the function to have a REX_W prefix. + // Win64 wants indirect jumps leaving the function to have a REX_W prefix. let hasREX_WPrefix = 1 in { - def TAILJMPd64_REX : Ii32PCRel<0xE9, RawFrm, (outs), - (ins i64i32imm_pcrel:$dst), - "rex64 jmp\t$dst", [], IIC_JMP_REL>; def TAILJMPr64_REX : I<0xFF, MRM4r, (outs), (ins ptr_rc_tailcall:$dst), "rex64 jmp{q}\t{*}$dst", [], IIC_JMP_MEM>; diff --git a/contrib/llvm/lib/Target/X86/X86InstrFMA.td b/contrib/llvm/lib/Target/X86/X86InstrFMA.td index fd800cf..4b19f80 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrFMA.td +++ b/contrib/llvm/lib/Target/X86/X86InstrFMA.td @@ -39,7 +39,6 @@ multiclass fma3p_rm<bits<8> opc, string OpcodeStr, PatFrag MemFrag128, PatFrag MemFrag256, ValueType OpVT128, ValueType OpVT256, SDPatternOperator Op = null_frag> { - let usesCustomInserter = 1 in def r : FMA3<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, VR128:$src3), !strconcat(OpcodeStr, @@ -55,8 +54,7 @@ multiclass fma3p_rm<bits<8> opc, string OpcodeStr, [(set VR128:$dst, (OpVT128 (Op VR128:$src2, VR128:$src1, (MemFrag128 addr:$src3))))]>; - let usesCustomInserter = 1 in - def rY : FMA3<opc, MRMSrcReg, (outs VR256:$dst), + def Yr : FMA3<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src1, VR256:$src2, VR256:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), @@ -64,7 +62,7 @@ multiclass fma3p_rm<bits<8> opc, string OpcodeStr, VR256:$src3)))]>, VEX_L; let mayLoad = 1 in - def mY : FMA3<opc, MRMSrcMem, (outs VR256:$dst), + def Ym : FMA3<opc, MRMSrcMem, (outs VR256:$dst), (ins VR256:$src1, VR256:$src2, f256mem:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), @@ -74,60 +72,61 @@ multiclass fma3p_rm<bits<8> opc, string OpcodeStr, } multiclass fma3p_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231, - string OpcodeStr, string PackTy, + string OpcodeStr, string PackTy, string Suff, PatFrag MemFrag128, PatFrag 
MemFrag256, SDNode Op, ValueType OpTy128, ValueType OpTy256> { - defm r213 : fma3p_rm<opc213, - !strconcat(OpcodeStr, "213", PackTy), - MemFrag128, MemFrag256, OpTy128, OpTy256, Op>; - defm r132 : fma3p_rm<opc132, - !strconcat(OpcodeStr, "132", PackTy), - MemFrag128, MemFrag256, OpTy128, OpTy256>; - defm r231 : fma3p_rm<opc231, - !strconcat(OpcodeStr, "231", PackTy), - MemFrag128, MemFrag256, OpTy128, OpTy256>; + defm NAME#213#Suff : fma3p_rm<opc213, + !strconcat(OpcodeStr, "213", PackTy), + MemFrag128, MemFrag256, OpTy128, OpTy256, Op>; + defm NAME#132#Suff : fma3p_rm<opc132, + !strconcat(OpcodeStr, "132", PackTy), + MemFrag128, MemFrag256, OpTy128, OpTy256>; + defm NAME#231#Suff : fma3p_rm<opc231, + !strconcat(OpcodeStr, "231", PackTy), + MemFrag128, MemFrag256, OpTy128, OpTy256>; } // Fused Multiply-Add let ExeDomain = SSEPackedSingle in { - defm VFMADDPS : fma3p_forms<0x98, 0xA8, 0xB8, "vfmadd", "ps", loadv4f32, - loadv8f32, X86Fmadd, v4f32, v8f32>; - defm VFMSUBPS : fma3p_forms<0x9A, 0xAA, 0xBA, "vfmsub", "ps", loadv4f32, - loadv8f32, X86Fmsub, v4f32, v8f32>; - defm VFMADDSUBPS : fma3p_forms<0x96, 0xA6, 0xB6, "vfmaddsub", "ps", - loadv4f32, loadv8f32, X86Fmaddsub, - v4f32, v8f32>; - defm VFMSUBADDPS : fma3p_forms<0x97, 0xA7, 0xB7, "vfmsubadd", "ps", - loadv4f32, loadv8f32, X86Fmsubadd, - v4f32, v8f32>; + defm VFMADD : fma3p_forms<0x98, 0xA8, 0xB8, "vfmadd", "ps", "PS", + loadv4f32, loadv8f32, X86Fmadd, v4f32, v8f32>; + defm VFMSUB : fma3p_forms<0x9A, 0xAA, 0xBA, "vfmsub", "ps", "PS", + loadv4f32, loadv8f32, X86Fmsub, v4f32, v8f32>; + defm VFMADDSUB : fma3p_forms<0x96, 0xA6, 0xB6, "vfmaddsub", "ps", "PS", + loadv4f32, loadv8f32, X86Fmaddsub, + v4f32, v8f32>; + defm VFMSUBADD : fma3p_forms<0x97, 0xA7, 0xB7, "vfmsubadd", "ps", "PS", + loadv4f32, loadv8f32, X86Fmsubadd, + v4f32, v8f32>; } let ExeDomain = SSEPackedDouble in { - defm VFMADDPD : fma3p_forms<0x98, 0xA8, 0xB8, "vfmadd", "pd", loadv2f64, - loadv4f64, X86Fmadd, v2f64, v4f64>, VEX_W; - defm VFMSUBPD : fma3p_forms<0x9A, 0xAA, 0xBA, "vfmsub", "pd", loadv2f64, - loadv4f64, X86Fmsub, v2f64, v4f64>, VEX_W; - defm VFMADDSUBPD : fma3p_forms<0x96, 0xA6, 0xB6, "vfmaddsub", "pd", - loadv2f64, loadv4f64, X86Fmaddsub, - v2f64, v4f64>, VEX_W; - defm VFMSUBADDPD : fma3p_forms<0x97, 0xA7, 0xB7, "vfmsubadd", "pd", - loadv2f64, loadv4f64, X86Fmsubadd, - v2f64, v4f64>, VEX_W; + defm VFMADD : fma3p_forms<0x98, 0xA8, 0xB8, "vfmadd", "pd", "PD", + loadv2f64, loadv4f64, X86Fmadd, v2f64, + v4f64>, VEX_W; + defm VFMSUB : fma3p_forms<0x9A, 0xAA, 0xBA, "vfmsub", "pd", "PD", + loadv2f64, loadv4f64, X86Fmsub, v2f64, + v4f64>, VEX_W; + defm VFMADDSUB : fma3p_forms<0x96, 0xA6, 0xB6, "vfmaddsub", "pd", "PD", + loadv2f64, loadv4f64, X86Fmaddsub, + v2f64, v4f64>, VEX_W; + defm VFMSUBADD : fma3p_forms<0x97, 0xA7, 0xB7, "vfmsubadd", "pd", "PD", + loadv2f64, loadv4f64, X86Fmsubadd, + v2f64, v4f64>, VEX_W; } // Fused Negative Multiply-Add let ExeDomain = SSEPackedSingle in { - defm VFNMADDPS : fma3p_forms<0x9C, 0xAC, 0xBC, "vfnmadd", "ps", loadv4f32, - loadv8f32, X86Fnmadd, v4f32, v8f32>; - defm VFNMSUBPS : fma3p_forms<0x9E, 0xAE, 0xBE, "vfnmsub", "ps", loadv4f32, - loadv8f32, X86Fnmsub, v4f32, v8f32>; + defm VFNMADD : fma3p_forms<0x9C, 0xAC, 0xBC, "vfnmadd", "ps", "PS", loadv4f32, + loadv8f32, X86Fnmadd, v4f32, v8f32>; + defm VFNMSUB : fma3p_forms<0x9E, 0xAE, 0xBE, "vfnmsub", "ps", "PS", loadv4f32, + loadv8f32, X86Fnmsub, v4f32, v8f32>; } let ExeDomain = SSEPackedDouble in { - defm VFNMADDPD : fma3p_forms<0x9C, 0xAC, 0xBC, "vfnmadd", "pd", loadv2f64, - loadv4f64, 
X86Fnmadd, v2f64, v4f64>, VEX_W; - defm VFNMSUBPD : fma3p_forms<0x9E, 0xAE, 0xBE, "vfnmsub", "pd", - loadv2f64, loadv4f64, X86Fnmsub, v2f64, - v4f64>, VEX_W; + defm VFNMADD : fma3p_forms<0x9C, 0xAC, 0xBC, "vfnmadd", "pd", "PD", loadv2f64, + loadv4f64, X86Fnmadd, v2f64, v4f64>, VEX_W; + defm VFNMSUB : fma3p_forms<0x9E, 0xAE, 0xBE, "vfnmsub", "pd", "PD", loadv2f64, + loadv4f64, X86Fnmsub, v2f64, v4f64>, VEX_W; } // All source register operands of FMA opcodes defined in fma3s_rm multiclass @@ -143,7 +142,6 @@ let Constraints = "$src1 = $dst", isCommutable = 1, hasSideEffects = 0 in multiclass fma3s_rm<bits<8> opc, string OpcodeStr, X86MemOperand x86memop, RegisterClass RC, SDPatternOperator OpNode = null_frag> { - let usesCustomInserter = 1 in def r : FMA3<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2, RC:$src3), !strconcat(OpcodeStr, @@ -191,13 +189,15 @@ multiclass fma3s_rm_int<bits<8> opc, string OpcodeStr, } multiclass fma3s_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231, - string OpStr, string PackTy, + string OpStr, string PackTy, string Suff, SDNode OpNode, RegisterClass RC, X86MemOperand x86memop> { - defm r132 : fma3s_rm<opc132, !strconcat(OpStr, "132", PackTy), x86memop, RC>; - defm r213 : fma3s_rm<opc213, !strconcat(OpStr, "213", PackTy), x86memop, RC, - OpNode>; - defm r231 : fma3s_rm<opc231, !strconcat(OpStr, "231", PackTy), x86memop, RC>; + defm NAME#132#Suff : fma3s_rm<opc132, !strconcat(OpStr, "132", PackTy), + x86memop, RC>; + defm NAME#213#Suff : fma3s_rm<opc213, !strconcat(OpStr, "213", PackTy), + x86memop, RC, OpNode>; + defm NAME#231#Suff : fma3s_rm<opc231, !strconcat(OpStr, "231", PackTy), + x86memop, RC>; } // The FMA 213 form is created for lowering of scalar FMA intrinscis @@ -210,42 +210,45 @@ multiclass fma3s_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231, // form of FMA*_Int instructions is done using an optimistic assumption that // such analysis will be implemented eventually. 
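As a rough reference for the 132/213/231 suffixes used by these multiclasses: the digits name which operands feed the multiply and which one is added, with operand 1 being the one tied to the destination register. A plain C++ sketch of the three orderings, not LLVM code:

#include <cassert>

// Reference semantics of the three FMA3 operand orderings.  Operand 1 is the
// operand tied to the destination.
static double fma132(double Op1, double Op2, double Op3) { return Op1 * Op3 + Op2; }
static double fma213(double Op1, double Op2, double Op3) { return Op2 * Op1 + Op3; }
static double fma231(double Op1, double Op2, double Op3) { return Op2 * Op3 + Op1; }

int main() {
  assert(fma132(2.0, 3.0, 4.0) == 2.0 * 4.0 + 3.0);
  assert(fma213(2.0, 3.0, 4.0) == 3.0 * 2.0 + 4.0);
  assert(fma231(2.0, 3.0, 4.0) == 3.0 * 4.0 + 2.0);
  return 0;
}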
multiclass fma3s_int_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231, - string OpStr, string PackTy, + string OpStr, string PackTy, string Suff, RegisterClass RC, Operand memop> { - defm r132 : fma3s_rm_int<opc132, !strconcat(OpStr, "132", PackTy), - memop, RC>; - defm r213 : fma3s_rm_int<opc213, !strconcat(OpStr, "213", PackTy), - memop, RC>; - defm r231 : fma3s_rm_int<opc231, !strconcat(OpStr, "231", PackTy), - memop, RC>; + defm NAME#132#Suff : fma3s_rm_int<opc132, !strconcat(OpStr, "132", PackTy), + memop, RC>; + defm NAME#213#Suff : fma3s_rm_int<opc213, !strconcat(OpStr, "213", PackTy), + memop, RC>; + defm NAME#231#Suff : fma3s_rm_int<opc231, !strconcat(OpStr, "231", PackTy), + memop, RC>; } multiclass fma3s<bits<8> opc132, bits<8> opc213, bits<8> opc231, string OpStr, Intrinsic IntF32, Intrinsic IntF64, SDNode OpNode> { let ExeDomain = SSEPackedSingle in - defm SS : fma3s_forms<opc132, opc213, opc231, OpStr, "ss", OpNode, - FR32, f32mem>, - fma3s_int_forms<opc132, opc213, opc231, OpStr, "ss", VR128, ssmem>; + defm NAME : fma3s_forms<opc132, opc213, opc231, OpStr, "ss", "SS", OpNode, + FR32, f32mem>, + fma3s_int_forms<opc132, opc213, opc231, OpStr, "ss", "SS", + VR128, ssmem>; let ExeDomain = SSEPackedDouble in - defm SD : fma3s_forms<opc132, opc213, opc231, OpStr, "sd", OpNode, + defm NAME : fma3s_forms<opc132, opc213, opc231, OpStr, "sd", "SD", OpNode, FR64, f64mem>, - fma3s_int_forms<opc132, opc213, opc231, OpStr, "sd", VR128, sdmem>, - VEX_W; + fma3s_int_forms<opc132, opc213, opc231, OpStr, "sd", "SD", + VR128, sdmem>, VEX_W; // These patterns use the 123 ordering, instead of 213, even though // they match the intrinsic to the 213 version of the instruction. // This is because src1 is tied to dest, and the scalar intrinsics // require the pass-through values to come from the first source // operand, not the second. 
- def : Pat<(IntF32 VR128:$src1, VR128:$src2, VR128:$src3), - (COPY_TO_REGCLASS(!cast<Instruction>(NAME#"SSr213r_Int") - $src1, $src2, $src3), VR128)>; + let Predicates = [HasFMA] in { + def : Pat<(IntF32 VR128:$src1, VR128:$src2, VR128:$src3), + (COPY_TO_REGCLASS(!cast<Instruction>(NAME#"213SSr_Int") + $src1, $src2, $src3), VR128)>; - def : Pat<(IntF64 VR128:$src1, VR128:$src2, VR128:$src3), - (COPY_TO_REGCLASS(!cast<Instruction>(NAME#"SDr213r_Int") - $src1, $src2, $src3), VR128)>; + def : Pat<(IntF64 VR128:$src1, VR128:$src2, VR128:$src3), + (COPY_TO_REGCLASS(!cast<Instruction>(NAME#"213SDr_Int") + $src1, $src2, $src3), VR128)>; + } } defm VFMADD : fma3s<0x99, 0xA9, 0xB9, "vfmadd", int_x86_fma_vfmadd_ss, @@ -268,18 +271,18 @@ multiclass fma4s<bits<8> opc, string OpcodeStr, RegisterClass RC, X86MemOperand x86memop, ValueType OpVT, SDNode OpNode, PatFrag mem_frag> { let isCommutable = 1 in - def rr : FMA4<opc, MRMSrcReg, (outs RC:$dst), + def rr : FMA4<opc, MRMSrcRegOp4, (outs RC:$dst), (ins RC:$src1, RC:$src2, RC:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set RC:$dst, - (OpVT (OpNode RC:$src1, RC:$src2, RC:$src3)))]>, VEX_W, VEX_LIG, MemOp4; - def rm : FMA4<opc, MRMSrcMem, (outs RC:$dst), + (OpVT (OpNode RC:$src1, RC:$src2, RC:$src3)))]>, VEX_W, VEX_LIG; + def rm : FMA4<opc, MRMSrcMemOp4, (outs RC:$dst), (ins RC:$src1, RC:$src2, x86memop:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set RC:$dst, (OpNode RC:$src1, RC:$src2, - (mem_frag addr:$src3)))]>, VEX_W, VEX_LIG, MemOp4; + (mem_frag addr:$src3)))]>, VEX_W, VEX_LIG; def mr : FMA4<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2, RC:$src3), !strconcat(OpcodeStr, @@ -298,19 +301,18 @@ let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in multiclass fma4s_int<bits<8> opc, string OpcodeStr, Operand memop, ComplexPattern mem_cpat, Intrinsic Int> { let isCodeGenOnly = 1 in { - let isCommutable = 1 in - def rr_Int : FMA4<opc, MRMSrcReg, (outs VR128:$dst), + def rr_Int : FMA4<opc, MRMSrcRegOp4, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, VR128:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set VR128:$dst, - (Int VR128:$src1, VR128:$src2, VR128:$src3))]>, VEX_W, VEX_LIG, MemOp4; - def rm_Int : FMA4<opc, MRMSrcMem, (outs VR128:$dst), + (Int VR128:$src1, VR128:$src2, VR128:$src3))]>, VEX_W, VEX_LIG; + def rm_Int : FMA4<opc, MRMSrcMemOp4, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, memop:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set VR128:$dst, (Int VR128:$src1, VR128:$src2, - mem_cpat:$src3))]>, VEX_W, VEX_LIG, MemOp4; + mem_cpat:$src3))]>, VEX_W, VEX_LIG; def mr_Int : FMA4<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, memop:$src2, VR128:$src3), !strconcat(OpcodeStr, @@ -324,19 +326,19 @@ multiclass fma4p<bits<8> opc, string OpcodeStr, SDNode OpNode, ValueType OpVT128, ValueType OpVT256, PatFrag ld_frag128, PatFrag ld_frag256> { let isCommutable = 1 in - def rr : FMA4<opc, MRMSrcReg, (outs VR128:$dst), + def rr : FMA4<opc, MRMSrcRegOp4, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, VR128:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set VR128:$dst, (OpVT128 (OpNode VR128:$src1, VR128:$src2, VR128:$src3)))]>, - VEX_W, MemOp4; - def rm : FMA4<opc, MRMSrcMem, (outs VR128:$dst), + VEX_W; + def rm : FMA4<opc, MRMSrcMemOp4, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, 
f128mem:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set VR128:$dst, (OpNode VR128:$src1, VR128:$src2, - (ld_frag128 addr:$src3)))]>, VEX_W, MemOp4; + (ld_frag128 addr:$src3)))]>, VEX_W; def mr : FMA4<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, f128mem:$src2, VR128:$src3), !strconcat(OpcodeStr, @@ -344,20 +346,20 @@ multiclass fma4p<bits<8> opc, string OpcodeStr, SDNode OpNode, [(set VR128:$dst, (OpNode VR128:$src1, (ld_frag128 addr:$src2), VR128:$src3))]>; let isCommutable = 1 in - def rrY : FMA4<opc, MRMSrcReg, (outs VR256:$dst), + def Yrr : FMA4<opc, MRMSrcRegOp4, (outs VR256:$dst), (ins VR256:$src1, VR256:$src2, VR256:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set VR256:$dst, (OpVT256 (OpNode VR256:$src1, VR256:$src2, VR256:$src3)))]>, - VEX_W, MemOp4, VEX_L; - def rmY : FMA4<opc, MRMSrcMem, (outs VR256:$dst), + VEX_W, VEX_L; + def Yrm : FMA4<opc, MRMSrcMemOp4, (outs VR256:$dst), (ins VR256:$src1, VR256:$src2, f256mem:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set VR256:$dst, (OpNode VR256:$src1, VR256:$src2, - (ld_frag256 addr:$src3)))]>, VEX_W, MemOp4, VEX_L; - def mrY : FMA4<opc, MRMSrcMem, (outs VR256:$dst), + (ld_frag256 addr:$src3)))]>, VEX_W, VEX_L; + def Ymr : FMA4<opc, MRMSrcMem, (outs VR256:$dst), (ins VR256:$src1, f256mem:$src2, VR256:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), @@ -369,7 +371,7 @@ let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in { (ins VR128:$src1, VR128:$src2, VR128:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), []>; - def rrY_REV : FMA4<opc, MRMSrcReg, (outs VR256:$dst), + def Yrr_REV : FMA4<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src1, VR256:$src2, VR256:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), []>, diff --git a/contrib/llvm/lib/Target/X86/X86InstrFMA3Info.cpp b/contrib/llvm/lib/Target/X86/X86InstrFMA3Info.cpp new file mode 100644 index 0000000..db83497 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86InstrFMA3Info.cpp @@ -0,0 +1,285 @@ +//===-- X86InstrFMA3Info.cpp - X86 FMA3 Instruction Information -----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the implementation of the classes providing information +// about existing X86 FMA3 opcodes, classifying and grouping them. +// +//===----------------------------------------------------------------------===// + +#include "X86InstrFMA3Info.h" +#include "X86InstrInfo.h" +#include "llvm/Support/ManagedStatic.h" +#include "llvm/Support/Threading.h" +using namespace llvm; + +/// This flag is used in the method llvm::call_once() used below to make the +/// initialization of the map 'OpcodeToGroup' thread safe. +LLVM_DEFINE_ONCE_FLAG(InitGroupsOnceFlag); + +static ManagedStatic<X86InstrFMA3Info> X86InstrFMA3InfoObj; +X86InstrFMA3Info *X86InstrFMA3Info::getX86InstrFMA3Info() { + return &*X86InstrFMA3InfoObj; +} + +void X86InstrFMA3Info::initRMGroup(const uint16_t *RegOpcodes, + const uint16_t *MemOpcodes, unsigned Attr) { + // Create a new instance of this class that would hold a group of FMA opcodes. 
+ X86InstrFMA3Group *G = new X86InstrFMA3Group(RegOpcodes, MemOpcodes, Attr); + + // Add the references from indvidual opcodes to the group holding them. + assert((!OpcodeToGroup[RegOpcodes[0]] && !OpcodeToGroup[RegOpcodes[1]] && + !OpcodeToGroup[RegOpcodes[2]] && !OpcodeToGroup[MemOpcodes[0]] && + !OpcodeToGroup[MemOpcodes[1]] && !OpcodeToGroup[MemOpcodes[2]]) && + "Duplication or rewrite of elements in OpcodeToGroup."); + OpcodeToGroup[RegOpcodes[0]] = G; + OpcodeToGroup[RegOpcodes[1]] = G; + OpcodeToGroup[RegOpcodes[2]] = G; + OpcodeToGroup[MemOpcodes[0]] = G; + OpcodeToGroup[MemOpcodes[1]] = G; + OpcodeToGroup[MemOpcodes[2]] = G; +} + +void X86InstrFMA3Info::initRGroup(const uint16_t *RegOpcodes, unsigned Attr) { + // Create a new instance of this class that would hold a group of FMA opcodes. + X86InstrFMA3Group *G = new X86InstrFMA3Group(RegOpcodes, nullptr, Attr); + + // Add the references from indvidual opcodes to the group holding them. + assert((!OpcodeToGroup[RegOpcodes[0]] && !OpcodeToGroup[RegOpcodes[1]] && + !OpcodeToGroup[RegOpcodes[2]]) && + "Duplication or rewrite of elements in OpcodeToGroup."); + OpcodeToGroup[RegOpcodes[0]] = G; + OpcodeToGroup[RegOpcodes[1]] = G; + OpcodeToGroup[RegOpcodes[2]] = G; +} + +void X86InstrFMA3Info::initMGroup(const uint16_t *MemOpcodes, unsigned Attr) { + // Create a new instance of this class that would hold a group of FMA opcodes. + X86InstrFMA3Group *G = new X86InstrFMA3Group(nullptr, MemOpcodes, Attr); + + // Add the references from indvidual opcodes to the group holding them. + assert((!OpcodeToGroup[MemOpcodes[0]] && !OpcodeToGroup[MemOpcodes[1]] && + !OpcodeToGroup[MemOpcodes[2]]) && + "Duplication or rewrite of elements in OpcodeToGroup."); + OpcodeToGroup[MemOpcodes[0]] = G; + OpcodeToGroup[MemOpcodes[1]] = G; + OpcodeToGroup[MemOpcodes[2]] = G; +} + +#define FMA3RM(R132, R213, R231, M132, M213, M231) \ + static const uint16_t Reg##R132[3] = {X86::R132, X86::R213, X86::R231}; \ + static const uint16_t Mem##R132[3] = {X86::M132, X86::M213, X86::M231}; \ + initRMGroup(Reg##R132, Mem##R132); + +#define FMA3RMA(R132, R213, R231, M132, M213, M231, Attrs) \ + static const uint16_t Reg##R132[3] = {X86::R132, X86::R213, X86::R231}; \ + static const uint16_t Mem##R132[3] = {X86::M132, X86::M213, X86::M231}; \ + initRMGroup(Reg##R132, Mem##R132, (Attrs)); + +#define FMA3R(R132, R213, R231) \ + static const uint16_t Reg##R132[3] = {X86::R132, X86::R213, X86::R231}; \ + initRGroup(Reg##R132); + +#define FMA3RA(R132, R213, R231, Attrs) \ + static const uint16_t Reg##R132[3] = {X86::R132, X86::R213, X86::R231}; \ + initRGroup(Reg##R132, (Attrs)); + +#define FMA3M(M132, M213, M231) \ + static const uint16_t Mem##M132[3] = {X86::M132, X86::M213, X86::M231}; \ + initMGroup(Mem##M132); + +#define FMA3MA(M132, M213, M231, Attrs) \ + static const uint16_t Mem##M132[3] = {X86::M132, X86::M213, X86::M231}; \ + initMGroup(Mem##M132, (Attrs)); + +#define FMA3_AVX2_VECTOR_GROUP(Name) \ + FMA3RM(Name##132PSr, Name##213PSr, Name##231PSr, \ + Name##132PSm, Name##213PSm, Name##231PSm); \ + FMA3RM(Name##132PDr, Name##213PDr, Name##231PDr, \ + Name##132PDm, Name##213PDm, Name##231PDm); \ + FMA3RM(Name##132PSYr, Name##213PSYr, Name##231PSYr, \ + Name##132PSYm, Name##213PSYm, Name##231PSYm); \ + FMA3RM(Name##132PDYr, Name##213PDYr, Name##231PDYr, \ + Name##132PDYm, Name##213PDYm, Name##231PDYm); + +#define FMA3_AVX2_SCALAR_GROUP(Name) \ + FMA3RM(Name##132SSr, Name##213SSr, Name##231SSr, \ + Name##132SSm, Name##213SSm, Name##231SSm); \ + FMA3RM(Name##132SDr, 
Name##213SDr, Name##231SDr, \ + Name##132SDm, Name##213SDm, Name##231SDm); \ + FMA3RMA(Name##132SSr_Int, Name##213SSr_Int, Name##231SSr_Int, \ + Name##132SSm_Int, Name##213SSm_Int, Name##231SSm_Int, \ + X86InstrFMA3Group::X86FMA3Intrinsic); \ + FMA3RMA(Name##132SDr_Int, Name##213SDr_Int, Name##231SDr_Int, \ + Name##132SDm_Int, Name##213SDm_Int, Name##231SDm_Int, \ + X86InstrFMA3Group::X86FMA3Intrinsic); + +#define FMA3_AVX2_FULL_GROUP(Name) \ + FMA3_AVX2_VECTOR_GROUP(Name); \ + FMA3_AVX2_SCALAR_GROUP(Name); + +#define FMA3_AVX512_VECTOR_GROUP(Name) \ + FMA3RM(Name##132PSZ128r, Name##213PSZ128r, Name##231PSZ128r, \ + Name##132PSZ128m, Name##213PSZ128m, Name##231PSZ128m); \ + FMA3RM(Name##132PDZ128r, Name##213PDZ128r, Name##231PDZ128r, \ + Name##132PDZ128m, Name##213PDZ128m, Name##231PDZ128m); \ + FMA3RM(Name##132PSZ256r, Name##213PSZ256r, Name##231PSZ256r, \ + Name##132PSZ256m, Name##213PSZ256m, Name##231PSZ256m); \ + FMA3RM(Name##132PDZ256r, Name##213PDZ256r, Name##231PDZ256r, \ + Name##132PDZ256m, Name##213PDZ256m, Name##231PDZ256m); \ + FMA3RM(Name##132PSZr, Name##213PSZr, Name##231PSZr, \ + Name##132PSZm, Name##213PSZm, Name##231PSZm); \ + FMA3RM(Name##132PDZr, Name##213PDZr, Name##231PDZr, \ + Name##132PDZm, Name##213PDZm, Name##231PDZm); \ + FMA3RMA(Name##132PSZ128rk, Name##213PSZ128rk, Name##231PSZ128rk, \ + Name##132PSZ128mk, Name##213PSZ128mk, Name##231PSZ128mk, \ + X86InstrFMA3Group::X86FMA3KMergeMasked); \ + FMA3RMA(Name##132PDZ128rk, Name##213PDZ128rk, Name##231PDZ128rk, \ + Name##132PDZ128mk, Name##213PDZ128mk, Name##231PDZ128mk, \ + X86InstrFMA3Group::X86FMA3KMergeMasked); \ + FMA3RMA(Name##132PSZ256rk, Name##213PSZ256rk, Name##231PSZ256rk, \ + Name##132PSZ256mk, Name##213PSZ256mk, Name##231PSZ256mk, \ + X86InstrFMA3Group::X86FMA3KMergeMasked); \ + FMA3RMA(Name##132PDZ256rk, Name##213PDZ256rk, Name##231PDZ256rk, \ + Name##132PDZ256mk, Name##213PDZ256mk, Name##231PDZ256mk, \ + X86InstrFMA3Group::X86FMA3KMergeMasked); \ + FMA3RMA(Name##132PSZrk, Name##213PSZrk, Name##231PSZrk, \ + Name##132PSZmk, Name##213PSZmk, Name##231PSZmk, \ + X86InstrFMA3Group::X86FMA3KMergeMasked); \ + FMA3RMA(Name##132PDZrk, Name##213PDZrk, Name##231PDZrk, \ + Name##132PDZmk, Name##213PDZmk, Name##231PDZmk, \ + X86InstrFMA3Group::X86FMA3KMergeMasked); \ + FMA3RMA(Name##132PSZ128rkz, Name##213PSZ128rkz, Name##231PSZ128rkz, \ + Name##132PSZ128mkz, Name##213PSZ128mkz, Name##231PSZ128mkz, \ + X86InstrFMA3Group::X86FMA3KZeroMasked); \ + FMA3RMA(Name##132PDZ128rkz, Name##213PDZ128rkz, Name##231PDZ128rkz, \ + Name##132PDZ128mkz, Name##213PDZ128mkz, Name##231PDZ128mkz, \ + X86InstrFMA3Group::X86FMA3KZeroMasked); \ + FMA3RMA(Name##132PSZ256rkz, Name##213PSZ256rkz, Name##231PSZ256rkz, \ + Name##132PSZ256mkz, Name##213PSZ256mkz, Name##231PSZ256mkz, \ + X86InstrFMA3Group::X86FMA3KZeroMasked); \ + FMA3RMA(Name##132PDZ256rkz, Name##213PDZ256rkz, Name##231PDZ256rkz, \ + Name##132PDZ256mkz, Name##213PDZ256mkz, Name##231PDZ256mkz, \ + X86InstrFMA3Group::X86FMA3KZeroMasked); \ + FMA3RMA(Name##132PSZrkz, Name##213PSZrkz, Name##231PSZrkz, \ + Name##132PSZmkz, Name##213PSZmkz, Name##231PSZmkz, \ + X86InstrFMA3Group::X86FMA3KZeroMasked); \ + FMA3RMA(Name##132PDZrkz, Name##213PDZrkz, Name##231PDZrkz, \ + Name##132PDZmkz, Name##213PDZmkz, Name##231PDZmkz, \ + X86InstrFMA3Group::X86FMA3KZeroMasked); \ + FMA3R(Name##132PSZrb, Name##213PSZrb, Name##231PSZrb); \ + FMA3R(Name##132PDZrb, Name##213PDZrb, Name##231PDZrb); \ + FMA3RA(Name##132PSZrbk, Name##213PSZrbk, Name##231PSZrbk, \ + X86InstrFMA3Group::X86FMA3KMergeMasked); \ + 
FMA3RA(Name##132PDZrbk, Name##213PDZrbk, Name##231PDZrbk, \ + X86InstrFMA3Group::X86FMA3KMergeMasked); \ + FMA3RA(Name##132PSZrbkz, Name##213PSZrbkz, Name##231PSZrbkz, \ + X86InstrFMA3Group::X86FMA3KZeroMasked); \ + FMA3RA(Name##132PDZrbkz, Name##213PDZrbkz, Name##231PDZrbkz, \ + X86InstrFMA3Group::X86FMA3KZeroMasked); \ + FMA3M(Name##132PSZ128mb, Name##213PSZ128mb, Name##231PSZ128mb); \ + FMA3M(Name##132PDZ128mb, Name##213PDZ128mb, Name##231PDZ128mb); \ + FMA3M(Name##132PSZ256mb, Name##213PSZ256mb, Name##231PSZ256mb); \ + FMA3M(Name##132PDZ256mb, Name##213PDZ256mb, Name##231PDZ256mb); \ + FMA3M(Name##132PSZmb, Name##213PSZmb, Name##231PSZmb); \ + FMA3M(Name##132PDZmb, Name##213PDZmb, Name##231PDZmb); \ + FMA3MA(Name##132PSZ128mbk, Name##213PSZ128mbk, Name##231PSZ128mbk, \ + X86InstrFMA3Group::X86FMA3KMergeMasked); \ + FMA3MA(Name##132PDZ128mbk, Name##213PDZ128mbk, Name##231PDZ128mbk, \ + X86InstrFMA3Group::X86FMA3KMergeMasked); \ + FMA3MA(Name##132PSZ256mbk, Name##213PSZ256mbk, Name##231PSZ256mbk, \ + X86InstrFMA3Group::X86FMA3KMergeMasked); \ + FMA3MA(Name##132PDZ256mbk, Name##213PDZ256mbk, Name##231PDZ256mbk, \ + X86InstrFMA3Group::X86FMA3KMergeMasked); \ + FMA3MA(Name##132PSZmbk, Name##213PSZmbk, Name##231PSZmbk, \ + X86InstrFMA3Group::X86FMA3KMergeMasked); \ + FMA3MA(Name##132PDZmbk, Name##213PDZmbk, Name##231PDZmbk, \ + X86InstrFMA3Group::X86FMA3KMergeMasked); \ + FMA3MA(Name##132PSZ128mbkz, Name##213PSZ128mbkz, Name##231PSZ128mbkz, \ + X86InstrFMA3Group::X86FMA3KZeroMasked); \ + FMA3MA(Name##132PDZ128mbkz, Name##213PDZ128mbkz, Name##231PDZ128mbkz, \ + X86InstrFMA3Group::X86FMA3KZeroMasked); \ + FMA3MA(Name##132PSZ256mbkz, Name##213PSZ256mbkz, Name##231PSZ256mbkz, \ + X86InstrFMA3Group::X86FMA3KZeroMasked); \ + FMA3MA(Name##132PDZ256mbkz, Name##213PDZ256mbkz, Name##231PDZ256mbkz, \ + X86InstrFMA3Group::X86FMA3KZeroMasked); \ + FMA3MA(Name##132PSZmbkz, Name##213PSZmbkz, Name##231PSZmbkz, \ + X86InstrFMA3Group::X86FMA3KZeroMasked); \ + FMA3MA(Name##132PDZmbkz, Name##213PDZmbkz, Name##231PDZmbkz, \ + X86InstrFMA3Group::X86FMA3KZeroMasked); + +#define FMA3_AVX512_SCALAR_GROUP(Name) \ + FMA3RM(Name##132SSZr, Name##213SSZr, Name##231SSZr, \ + Name##132SSZm, Name##213SSZm, Name##231SSZm); \ + FMA3RM(Name##132SDZr, Name##213SDZr, Name##231SDZr, \ + Name##132SDZm, Name##213SDZm, Name##231SDZm); \ + FMA3RMA(Name##132SSZr_Int, Name##213SSZr_Int, Name##231SSZr_Int, \ + Name##132SSZm_Int, Name##213SSZm_Int, Name##231SSZm_Int, \ + X86InstrFMA3Group::X86FMA3Intrinsic); \ + FMA3RMA(Name##132SDZr_Int, Name##213SDZr_Int, Name##231SDZr_Int, \ + Name##132SDZm_Int, Name##213SDZm_Int, Name##231SDZm_Int, \ + X86InstrFMA3Group::X86FMA3Intrinsic); \ + FMA3RMA(Name##132SSZr_Intk, Name##213SSZr_Intk, Name##231SSZr_Intk, \ + Name##132SSZm_Intk, Name##213SSZm_Intk, Name##231SSZm_Intk, \ + X86InstrFMA3Group::X86FMA3Intrinsic | \ + X86InstrFMA3Group::X86FMA3KMergeMasked); \ + FMA3RMA(Name##132SDZr_Intk, Name##213SDZr_Intk, Name##231SDZr_Intk, \ + Name##132SDZm_Intk, Name##213SDZm_Intk, Name##231SDZm_Intk, \ + X86InstrFMA3Group::X86FMA3Intrinsic | \ + X86InstrFMA3Group::X86FMA3KMergeMasked); \ + FMA3RMA(Name##132SSZr_Intkz, Name##213SSZr_Intkz, Name##231SSZr_Intkz, \ + Name##132SSZm_Intkz, Name##213SSZm_Intkz, Name##231SSZm_Intkz, \ + X86InstrFMA3Group::X86FMA3Intrinsic | \ + X86InstrFMA3Group::X86FMA3KZeroMasked); \ + FMA3RMA(Name##132SDZr_Intkz, Name##213SDZr_Intkz, Name##231SDZr_Intkz, \ + Name##132SDZm_Intkz, Name##213SDZm_Intkz, Name##231SDZm_Intkz, \ + X86InstrFMA3Group::X86FMA3Intrinsic | \ + 
X86InstrFMA3Group::X86FMA3KZeroMasked); \ + FMA3RA(Name##132SSZrb_Int, Name##213SSZrb_Int, Name##231SSZrb_Int, \ + X86InstrFMA3Group::X86FMA3Intrinsic); \ + FMA3RA(Name##132SDZrb_Int, Name##213SDZrb_Int, Name##231SDZrb_Int, \ + X86InstrFMA3Group::X86FMA3Intrinsic); \ + FMA3RA(Name##132SSZrb_Intk, Name##213SSZrb_Intk, Name##231SSZrb_Intk, \ + X86InstrFMA3Group::X86FMA3Intrinsic | \ + X86InstrFMA3Group::X86FMA3KMergeMasked); \ + FMA3RA(Name##132SDZrb_Intk, Name##213SDZrb_Intk, Name##231SDZrb_Intk, \ + X86InstrFMA3Group::X86FMA3Intrinsic | \ + X86InstrFMA3Group::X86FMA3KMergeMasked); \ + FMA3RA(Name##132SSZrb_Intkz, Name##213SSZrb_Intkz, Name##231SSZrb_Intkz, \ + X86InstrFMA3Group::X86FMA3Intrinsic | \ + X86InstrFMA3Group::X86FMA3KZeroMasked); \ + FMA3RA(Name##132SDZrb_Intkz, Name##213SDZrb_Intkz, Name##231SDZrb_Intkz, \ + X86InstrFMA3Group::X86FMA3Intrinsic | \ + X86InstrFMA3Group::X86FMA3KZeroMasked); + +#define FMA3_AVX512_FULL_GROUP(Name) \ + FMA3_AVX512_VECTOR_GROUP(Name); \ + FMA3_AVX512_SCALAR_GROUP(Name); + +void X86InstrFMA3Info::initGroupsOnceImpl() { + FMA3_AVX2_FULL_GROUP(VFMADD); + FMA3_AVX2_FULL_GROUP(VFMSUB); + FMA3_AVX2_FULL_GROUP(VFNMADD); + FMA3_AVX2_FULL_GROUP(VFNMSUB); + + FMA3_AVX2_VECTOR_GROUP(VFMADDSUB); + FMA3_AVX2_VECTOR_GROUP(VFMSUBADD); + + FMA3_AVX512_FULL_GROUP(VFMADD); + FMA3_AVX512_FULL_GROUP(VFMSUB); + FMA3_AVX512_FULL_GROUP(VFNMADD); + FMA3_AVX512_FULL_GROUP(VFNMSUB); + + FMA3_AVX512_VECTOR_GROUP(VFMADDSUB); + FMA3_AVX512_VECTOR_GROUP(VFMSUBADD); +} + +void X86InstrFMA3Info::initGroupsOnce() { + llvm::call_once(InitGroupsOnceFlag, + []() { getX86InstrFMA3Info()->initGroupsOnceImpl(); }); +} diff --git a/contrib/llvm/lib/Target/X86/X86InstrFMA3Info.h b/contrib/llvm/lib/Target/X86/X86InstrFMA3Info.h new file mode 100644 index 0000000..025cee3 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86InstrFMA3Info.h @@ -0,0 +1,315 @@ +//===-- X86InstrFMA3Info.h - X86 FMA3 Instruction Information -------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the implementation of the classes providing information +// about existing X86 FMA3 opcodes, classifying and grouping them. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_X86_UTILS_X86INSTRFMA3INFO_H +#define LLVM_LIB_TARGET_X86_UTILS_X86INSTRFMA3INFO_H + +#include "X86.h" +#include "llvm/ADT/DenseMap.h" +#include <cassert> +#include <set> + +namespace llvm { +/// This class is used to group {132, 213, 231} forms of FMA opcodes together. +/// Each of the groups has either 3 register opcodes, 3 memory opcodes, +/// or 6 register and memory opcodes. Also, each group has an attrubutes field +/// describing it. +class X86InstrFMA3Group { +private: + /// Reference to an array holding 3 forms of register FMA opcodes. + /// It may be set to nullptr if the group of FMA opcodes does not have + /// any register form opcodes. + const uint16_t *RegOpcodes; + + /// Reference to an array holding 3 forms of memory FMA opcodes. + /// It may be set to nullptr if the group of FMA opcodes does not have + /// any register form opcodes. + const uint16_t *MemOpcodes; + + /// This bitfield specifies the attributes associated with the created + /// FMA groups of opcodes. 
+ unsigned Attributes; + + static const unsigned Form132 = 0; + static const unsigned Form213 = 1; + static const unsigned Form231 = 2; + +public: + /// This bit must be set in the 'Attributes' field of FMA group if such + /// group of FMA opcodes consists of FMA intrinsic opcodes. + static const unsigned X86FMA3Intrinsic = 0x1; + + /// This bit must be set in the 'Attributes' field of FMA group if such + /// group of FMA opcodes consists of AVX512 opcodes accepting a k-mask and + /// passing the elements from the 1st operand to the result of the operation + /// when the correpondings bits in the k-mask are unset. + static const unsigned X86FMA3KMergeMasked = 0x2; + + /// This bit must be set in the 'Attributes' field of FMA group if such + /// group of FMA opcodes consists of AVX512 opcodes accepting a k-zeromask. + static const unsigned X86FMA3KZeroMasked = 0x4; + + /// Constructor. Creates a new group of FMA opcodes with three register form + /// FMA opcodes \p RegOpcodes and three memory form FMA opcodes \p MemOpcodes. + /// The parameters \p RegOpcodes and \p MemOpcodes may be set to nullptr, + /// which means that the created group of FMA opcodes does not have the + /// corresponding (register or memory) opcodes. + /// The parameter \p Attr specifies the attributes describing the created + /// group. + X86InstrFMA3Group(const uint16_t *RegOpcodes, const uint16_t *MemOpcodes, + unsigned Attr) + : RegOpcodes(RegOpcodes), MemOpcodes(MemOpcodes), Attributes(Attr) { + assert((RegOpcodes || MemOpcodes) && + "Cannot create a group not having any opcodes."); + } + + /// Returns a memory form opcode that is the equivalent of the given register + /// form opcode \p RegOpcode. 0 is returned if the group does not have + /// either register of memory opcodes. + unsigned getMemOpcode(unsigned RegOpcode) const { + if (!RegOpcodes || !MemOpcodes) + return 0; + for (unsigned Form = 0; Form < 3; Form++) + if (RegOpcodes[Form] == RegOpcode) + return MemOpcodes[Form]; + return 0; + } + + /// Returns the 132 form of FMA register opcode. + unsigned getReg132Opcode() const { + assert(RegOpcodes && "The group does not have register opcodes."); + return RegOpcodes[Form132]; + } + + /// Returns the 213 form of FMA register opcode. + unsigned getReg213Opcode() const { + assert(RegOpcodes && "The group does not have register opcodes."); + return RegOpcodes[Form213]; + } + + /// Returns the 231 form of FMA register opcode. + unsigned getReg231Opcode() const { + assert(RegOpcodes && "The group does not have register opcodes."); + return RegOpcodes[Form231]; + } + + /// Returns the 132 form of FMA memory opcode. + unsigned getMem132Opcode() const { + assert(MemOpcodes && "The group does not have memory opcodes."); + return MemOpcodes[Form132]; + } + + /// Returns the 213 form of FMA memory opcode. + unsigned getMem213Opcode() const { + assert(MemOpcodes && "The group does not have memory opcodes."); + return MemOpcodes[Form213]; + } + + /// Returns the 231 form of FMA memory opcode. + unsigned getMem231Opcode() const { + assert(MemOpcodes && "The group does not have memory opcodes."); + return MemOpcodes[Form231]; + } + + /// Returns true iff the group of FMA opcodes holds intrinsic opcodes. + bool isIntrinsic() const { return (Attributes & X86FMA3Intrinsic) != 0; } + + /// Returns true iff the group of FMA opcodes holds k-merge-masked opcodes. + bool isKMergeMasked() const { + return (Attributes & X86FMA3KMergeMasked) != 0; + } + + /// Returns true iff the group of FMA opcodes holds k-zero-masked opcodes. 
+ bool isKZeroMasked() const { return (Attributes & X86FMA3KZeroMasked) != 0; } + + /// Returns true iff the group of FMA opcodes holds any of k-masked opcodes. + bool isKMasked() const { + return (Attributes & (X86FMA3KMergeMasked | X86FMA3KZeroMasked)) != 0; + } + + /// Returns true iff the given \p Opcode is a register opcode from the + /// groups of FMA opcodes. + bool isRegOpcodeFromGroup(unsigned Opcode) const { + if (!RegOpcodes) + return false; + for (unsigned Form = 0; Form < 3; Form++) + if (Opcode == RegOpcodes[Form]) + return true; + return false; + } + + /// Returns true iff the given \p Opcode is a memory opcode from the + /// groups of FMA opcodes. + bool isMemOpcodeFromGroup(unsigned Opcode) const { + if (!MemOpcodes) + return false; + for (unsigned Form = 0; Form < 3; Form++) + if (Opcode == MemOpcodes[Form]) + return true; + return false; + } +}; + +/// This class provides information about all existing FMA3 opcodes +/// +class X86InstrFMA3Info { +private: + /// A map that is used to find the group of FMA opcodes using any FMA opcode + /// from the group. + DenseMap<unsigned, const X86InstrFMA3Group *> OpcodeToGroup; + + /// Creates groups of FMA opcodes and initializes Opcode-to-Group map. + /// This method can be called many times, but the actual initialization is + /// called only once. + static void initGroupsOnce(); + + /// Creates groups of FMA opcodes and initializes Opcode-to-Group map. + /// This method must be called ONLY from initGroupsOnce(). Otherwise, such + /// call is not thread safe. + void initGroupsOnceImpl(); + + /// Creates one group of FMA opcodes having the register opcodes + /// \p RegOpcodes and memory opcodes \p MemOpcodes. The parameter \p Attr + /// specifies the attributes describing the created group. + void initRMGroup(const uint16_t *RegOpcodes, + const uint16_t *MemOpcodes, unsigned Attr = 0); + + /// Creates one group of FMA opcodes having only the register opcodes + /// \p RegOpcodes. The parameter \p Attr specifies the attributes describing + /// the created group. + void initRGroup(const uint16_t *RegOpcodes, unsigned Attr = 0); + + /// Creates one group of FMA opcodes having only the memory opcodes + /// \p MemOpcodes. The parameter \p Attr specifies the attributes describing + /// the created group. + void initMGroup(const uint16_t *MemOpcodes, unsigned Attr = 0); + +public: + /// Returns the reference to an object of this class. It is assumed that + /// only one object may exist. + static X86InstrFMA3Info *getX86InstrFMA3Info(); + + /// Constructor. Just creates an object of the class. + X86InstrFMA3Info() {} + + /// Destructor. Deallocates the memory used for FMA3 Groups. + ~X86InstrFMA3Info() { + std::set<const X86InstrFMA3Group *> DeletedGroups; + auto E = OpcodeToGroup.end(); + for (auto I = OpcodeToGroup.begin(); I != E; I++) { + const X86InstrFMA3Group *G = I->second; + if (DeletedGroups.find(G) == DeletedGroups.end()) { + DeletedGroups.insert(G); + delete G; + } + } + } + + /// Returns a reference to a group of FMA3 opcodes to where the given + /// \p Opcode is included. If the given \p Opcode is not recognized as FMA3 + /// and not included into any FMA3 group, then nullptr is returned. + static const X86InstrFMA3Group *getFMA3Group(unsigned Opcode) { + // Ensure that the groups of opcodes are initialized. + initGroupsOnce(); + + // Find the group including the given opcode. 
+ const X86InstrFMA3Info *FMA3Info = getX86InstrFMA3Info(); + auto I = FMA3Info->OpcodeToGroup.find(Opcode); + if (I == FMA3Info->OpcodeToGroup.end()) + return nullptr; + + return I->second; + } + + /// Returns true iff the given \p Opcode is recognized as FMA3 by this class. + static bool isFMA3(unsigned Opcode) { + return getFMA3Group(Opcode) != nullptr; + } + + /// Iterator that is used to walk on FMA register opcodes having memory + /// form equivalents. + class rm_iterator { + private: + /// Iterator associated with the OpcodeToGroup map. It must always be + /// initialized with an entry from OpcodeToGroup for which I->first + /// points to a register FMA opcode and I->second points to a group of + /// FMA opcodes having memory form equivalent of I->first. + DenseMap<unsigned, const X86InstrFMA3Group *>::const_iterator I; + + public: + /// Constructor. Creates rm_iterator. The parameter \p I must be an + /// iterator to OpcodeToGroup map entry having I->first pointing to + /// register form FMA opcode and I->second pointing to a group of FMA + /// opcodes holding memory form equivalent for I->fist. + rm_iterator(DenseMap<unsigned, const X86InstrFMA3Group *>::const_iterator I) + : I(I) {} + + /// Returns the register form FMA opcode. + unsigned getRegOpcode() const { return I->first; }; + + /// Returns the memory form equivalent opcode for FMA register opcode + /// referenced by I->first. + unsigned getMemOpcode() const { + unsigned Opcode = I->first; + const X86InstrFMA3Group *Group = I->second; + return Group->getMemOpcode(Opcode); + } + + /// Returns a reference to a group of FMA opcodes. + const X86InstrFMA3Group *getGroup() const { return I->second; } + + bool operator==(const rm_iterator &OtherIt) const { return I == OtherIt.I; } + bool operator!=(const rm_iterator &OtherIt) const { return I != OtherIt.I; } + + /// Increment. Advances the 'I' iterator to the next OpcodeToGroup entry + /// having I->first pointing to register form FMA and I->second pointing + /// to a group of FMA opcodes holding memory form equivalen for I->first. + rm_iterator &operator++() { + auto E = getX86InstrFMA3Info()->OpcodeToGroup.end(); + for (++I; I != E; ++I) { + unsigned RegOpcode = I->first; + const X86InstrFMA3Group *Group = I->second; + if (Group->getMemOpcode(RegOpcode) != 0) + break; + } + return *this; + } + }; + + /// Returns rm_iterator pointing to the first entry of OpcodeToGroup map + /// with a register FMA opcode having memory form opcode equivalent. + static rm_iterator rm_begin() { + initGroupsOnce(); + const X86InstrFMA3Info *FMA3Info = getX86InstrFMA3Info(); + auto I = FMA3Info->OpcodeToGroup.begin(); + auto E = FMA3Info->OpcodeToGroup.end(); + while (I != E) { + unsigned Opcode = I->first; + const X86InstrFMA3Group *G = I->second; + if (G->getMemOpcode(Opcode) != 0) + break; + I++; + } + return rm_iterator(I); + } + + /// Returns the last rm_iterator. + static rm_iterator rm_end() { + initGroupsOnce(); + return rm_iterator(getX86InstrFMA3Info()->OpcodeToGroup.end()); + } +}; +} // namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/X86/X86InstrFPStack.td b/contrib/llvm/lib/Target/X86/X86InstrFPStack.td index 078dab4..10f3839 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrFPStack.td +++ b/contrib/llvm/lib/Target/X86/X86InstrFPStack.td @@ -711,19 +711,19 @@ def : Pat<(X86fildflag addr:$src, i64), (ILD_Fp64m64 addr:$src)>; // FP extensions map onto simple pseudo-value conversions if they are to/from // the FP stack. 
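Before moving on to the X86InstrFPStack.td changes, a sketch of how the X86InstrFMA3Info interface introduced above might be consumed, for example to collect register-to-memory folding candidates. Only the getFMA3Group()/getMemOpcode() and rm_iterator calls come from the new header; buildFMA3FoldTable() and getFMA3MemOpcode() are hypothetical clients:

#include "X86InstrFMA3Info.h"
#include "llvm/ADT/SmallVector.h"
#include <utility>
using namespace llvm;

// Collect every register-form FMA3 opcode together with its memory-form
// equivalent, e.g. as input for a load-folding table.
static void buildFMA3FoldTable(
    SmallVectorImpl<std::pair<unsigned, unsigned>> &Table) {
  for (auto I = X86InstrFMA3Info::rm_begin(), E = X86InstrFMA3Info::rm_end();
       I != E; ++I)
    Table.push_back({I.getRegOpcode(), I.getMemOpcode()});
}

// Query a single opcode: returns its memory-form twin, or 0 if Opcode is not
// a register-form FMA3 opcode with a memory equivalent.
static unsigned getFMA3MemOpcode(unsigned Opcode) {
  if (const X86InstrFMA3Group *G = X86InstrFMA3Info::getFMA3Group(Opcode))
    return G->getMemOpcode(Opcode);
  return 0;
}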
-def : Pat<(f64 (fextend RFP32:$src)), (COPY_TO_REGCLASS RFP32:$src, RFP64)>, +def : Pat<(f64 (fpextend RFP32:$src)), (COPY_TO_REGCLASS RFP32:$src, RFP64)>, Requires<[FPStackf32]>; -def : Pat<(f80 (fextend RFP32:$src)), (COPY_TO_REGCLASS RFP32:$src, RFP80)>, +def : Pat<(f80 (fpextend RFP32:$src)), (COPY_TO_REGCLASS RFP32:$src, RFP80)>, Requires<[FPStackf32]>; -def : Pat<(f80 (fextend RFP64:$src)), (COPY_TO_REGCLASS RFP64:$src, RFP80)>, +def : Pat<(f80 (fpextend RFP64:$src)), (COPY_TO_REGCLASS RFP64:$src, RFP80)>, Requires<[FPStackf64]>; // FP truncations map onto simple pseudo-value conversions if they are to/from // the FP stack. We have validated that only value-preserving truncations make // it through isel. -def : Pat<(f32 (fround RFP64:$src)), (COPY_TO_REGCLASS RFP64:$src, RFP32)>, +def : Pat<(f32 (fpround RFP64:$src)), (COPY_TO_REGCLASS RFP64:$src, RFP32)>, Requires<[FPStackf32]>; -def : Pat<(f32 (fround RFP80:$src)), (COPY_TO_REGCLASS RFP80:$src, RFP32)>, +def : Pat<(f32 (fpround RFP80:$src)), (COPY_TO_REGCLASS RFP80:$src, RFP32)>, Requires<[FPStackf32]>; -def : Pat<(f64 (fround RFP80:$src)), (COPY_TO_REGCLASS RFP80:$src, RFP64)>, +def : Pat<(f64 (fpround RFP80:$src)), (COPY_TO_REGCLASS RFP80:$src, RFP64)>, Requires<[FPStackf64]>; diff --git a/contrib/llvm/lib/Target/X86/X86InstrFormats.td b/contrib/llvm/lib/Target/X86/X86InstrFormats.td index 5183adc..610756a 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrFormats.td +++ b/contrib/llvm/lib/Target/X86/X86InstrFormats.td @@ -18,43 +18,53 @@ class Format<bits<7> val> { bits<7> Value = val; } -def Pseudo : Format<0>; def RawFrm : Format<1>; -def AddRegFrm : Format<2>; def MRMDestReg : Format<3>; -def MRMDestMem : Format<4>; def MRMSrcReg : Format<5>; -def MRMSrcMem : Format<6>; def RawFrmMemOffs : Format<7>; -def RawFrmSrc : Format<8>; def RawFrmDst : Format<9>; -def RawFrmDstSrc: Format<10>; -def RawFrmImm8 : Format<11>; -def RawFrmImm16 : Format<12>; -def MRMXr : Format<14>; def MRMXm : Format<15>; -def MRM0r : Format<16>; def MRM1r : Format<17>; def MRM2r : Format<18>; -def MRM3r : Format<19>; def MRM4r : Format<20>; def MRM5r : Format<21>; -def MRM6r : Format<22>; def MRM7r : Format<23>; -def MRM0m : Format<24>; def MRM1m : Format<25>; def MRM2m : Format<26>; -def MRM3m : Format<27>; def MRM4m : Format<28>; def MRM5m : Format<29>; -def MRM6m : Format<30>; def MRM7m : Format<31>; -def MRM_C0 : Format<32>; def MRM_C1 : Format<33>; def MRM_C2 : Format<34>; -def MRM_C3 : Format<35>; def MRM_C4 : Format<36>; def MRM_C5 : Format<37>; -def MRM_C6 : Format<38>; def MRM_C7 : Format<39>; def MRM_C8 : Format<40>; -def MRM_C9 : Format<41>; def MRM_CA : Format<42>; def MRM_CB : Format<43>; -def MRM_CC : Format<44>; def MRM_CD : Format<45>; def MRM_CE : Format<46>; -def MRM_CF : Format<47>; def MRM_D0 : Format<48>; def MRM_D1 : Format<49>; -def MRM_D2 : Format<50>; def MRM_D3 : Format<51>; def MRM_D4 : Format<52>; -def MRM_D5 : Format<53>; def MRM_D6 : Format<54>; def MRM_D7 : Format<55>; -def MRM_D8 : Format<56>; def MRM_D9 : Format<57>; def MRM_DA : Format<58>; -def MRM_DB : Format<59>; def MRM_DC : Format<60>; def MRM_DD : Format<61>; -def MRM_DE : Format<62>; def MRM_DF : Format<63>; def MRM_E0 : Format<64>; -def MRM_E1 : Format<65>; def MRM_E2 : Format<66>; def MRM_E3 : Format<67>; -def MRM_E4 : Format<68>; def MRM_E5 : Format<69>; def MRM_E6 : Format<70>; -def MRM_E7 : Format<71>; def MRM_E8 : Format<72>; def MRM_E9 : Format<73>; -def MRM_EA : Format<74>; def MRM_EB : Format<75>; def MRM_EC : Format<76>; -def MRM_ED : Format<77>; def 
MRM_EE : Format<78>; def MRM_EF : Format<79>; -def MRM_F0 : Format<80>; def MRM_F1 : Format<81>; def MRM_F2 : Format<82>; -def MRM_F3 : Format<83>; def MRM_F4 : Format<84>; def MRM_F5 : Format<85>; -def MRM_F6 : Format<86>; def MRM_F7 : Format<87>; def MRM_F8 : Format<88>; -def MRM_F9 : Format<89>; def MRM_FA : Format<90>; def MRM_FB : Format<91>; -def MRM_FC : Format<92>; def MRM_FD : Format<93>; def MRM_FE : Format<94>; -def MRM_FF : Format<95>; +def Pseudo : Format<0>; +def RawFrm : Format<1>; +def AddRegFrm : Format<2>; +def RawFrmMemOffs : Format<3>; +def RawFrmSrc : Format<4>; +def RawFrmDst : Format<5>; +def RawFrmDstSrc : Format<6>; +def RawFrmImm8 : Format<7>; +def RawFrmImm16 : Format<8>; +def MRMDestMem : Format<32>; +def MRMSrcMem : Format<33>; +def MRMSrcMem4VOp3 : Format<34>; +def MRMSrcMemOp4 : Format<35>; +def MRMXm : Format<39>; +def MRM0m : Format<40>; def MRM1m : Format<41>; def MRM2m : Format<42>; +def MRM3m : Format<43>; def MRM4m : Format<44>; def MRM5m : Format<45>; +def MRM6m : Format<46>; def MRM7m : Format<47>; +def MRMDestReg : Format<48>; +def MRMSrcReg : Format<49>; +def MRMSrcReg4VOp3 : Format<50>; +def MRMSrcRegOp4 : Format<51>; +def MRMXr : Format<55>; +def MRM0r : Format<56>; def MRM1r : Format<57>; def MRM2r : Format<58>; +def MRM3r : Format<59>; def MRM4r : Format<60>; def MRM5r : Format<61>; +def MRM6r : Format<62>; def MRM7r : Format<63>; +def MRM_C0 : Format<64>; def MRM_C1 : Format<65>; def MRM_C2 : Format<66>; +def MRM_C3 : Format<67>; def MRM_C4 : Format<68>; def MRM_C5 : Format<69>; +def MRM_C6 : Format<70>; def MRM_C7 : Format<71>; def MRM_C8 : Format<72>; +def MRM_C9 : Format<73>; def MRM_CA : Format<74>; def MRM_CB : Format<75>; +def MRM_CC : Format<76>; def MRM_CD : Format<77>; def MRM_CE : Format<78>; +def MRM_CF : Format<79>; def MRM_D0 : Format<80>; def MRM_D1 : Format<81>; +def MRM_D2 : Format<82>; def MRM_D3 : Format<83>; def MRM_D4 : Format<84>; +def MRM_D5 : Format<85>; def MRM_D6 : Format<86>; def MRM_D7 : Format<87>; +def MRM_D8 : Format<88>; def MRM_D9 : Format<89>; def MRM_DA : Format<90>; +def MRM_DB : Format<91>; def MRM_DC : Format<92>; def MRM_DD : Format<93>; +def MRM_DE : Format<94>; def MRM_DF : Format<95>; def MRM_E0 : Format<96>; +def MRM_E1 : Format<97>; def MRM_E2 : Format<98>; def MRM_E3 : Format<99>; +def MRM_E4 : Format<100>; def MRM_E5 : Format<101>; def MRM_E6 : Format<102>; +def MRM_E7 : Format<103>; def MRM_E8 : Format<104>; def MRM_E9 : Format<105>; +def MRM_EA : Format<106>; def MRM_EB : Format<107>; def MRM_EC : Format<108>; +def MRM_ED : Format<109>; def MRM_EE : Format<110>; def MRM_EF : Format<111>; +def MRM_F0 : Format<112>; def MRM_F1 : Format<113>; def MRM_F2 : Format<114>; +def MRM_F3 : Format<115>; def MRM_F4 : Format<116>; def MRM_F5 : Format<117>; +def MRM_F6 : Format<118>; def MRM_F7 : Format<119>; def MRM_F8 : Format<120>; +def MRM_F9 : Format<121>; def MRM_FA : Format<122>; def MRM_FB : Format<123>; +def MRM_FC : Format<124>; def MRM_FD : Format<125>; def MRM_FE : Format<126>; +def MRM_FF : Format<127>; // ImmType - This specifies the immediate type used by an instruction. This is // part of the ad-hoc solution used to emit machine instruction encodings by our @@ -65,12 +75,13 @@ class ImmType<bits<4> val> { def NoImm : ImmType<0>; def Imm8 : ImmType<1>; def Imm8PCRel : ImmType<2>; -def Imm16 : ImmType<3>; -def Imm16PCRel : ImmType<4>; -def Imm32 : ImmType<5>; -def Imm32PCRel : ImmType<6>; -def Imm32S : ImmType<7>; -def Imm64 : ImmType<8>; +def Imm8Reg : ImmType<3>; // Register encoded in [7:4]. 
+def Imm16 : ImmType<4>; +def Imm16PCRel : ImmType<5>; +def Imm32 : ImmType<6>; +def Imm32PCRel : ImmType<7>; +def Imm32S : ImmType<8>; +def Imm64 : ImmType<9>; // FPFormat - This specifies what form this FP instruction has. This is used by // the Floating-Point stackifier pass. @@ -190,8 +201,6 @@ class TAXD : TA { Prefix OpPrefix = XD; } class VEX { Encoding OpEnc = EncVEX; } class VEX_W { bit hasVEX_WPrefix = 1; } class VEX_4V : VEX { bit hasVEX_4V = 1; } -class VEX_4VOp3 : VEX { bit hasVEX_4VOp3 = 1; } -class VEX_I8IMM { bit hasVEX_i8ImmReg = 1; } class VEX_L { bit hasVEX_L = 1; } class VEX_LIG { bit ignoresVEX_L = 1; } class EVEX : VEX { Encoding OpEnc = EncEVEX; } @@ -212,10 +221,8 @@ class EVEX_CD8<int esize, CD8VForm form> { } class Has3DNow0F0FOpcode { bit has3DNow0F0FOpcode = 1; } -class MemOp4 { bit hasMemOp4Prefix = 1; } class XOP { Encoding OpEnc = EncXOP; } class XOP_4V : XOP { bit hasVEX_4V = 1; } -class XOP_4VOp3 : XOP { bit hasVEX_4VOp3 = 1; } class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins, string AsmStr, @@ -265,10 +272,6 @@ class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins, bits<2> OpEncBits = OpEnc.Value; bit hasVEX_WPrefix = 0; // Does this inst set the VEX_W field? bit hasVEX_4V = 0; // Does this inst require the VEX.VVVV field? - bit hasVEX_4VOp3 = 0; // Does this inst require the VEX.VVVV field to - // encode the third operand? - bit hasVEX_i8ImmReg = 0; // Does this inst require the last source register - // to be encoded in a immediate field? bit hasVEX_L = 0; // Does this inst use large (256-bit) registers? bit ignoresVEX_L = 0; // Does this instruction ignore the L-bit bit hasEVEX_K = 0; // Does this inst require masking? @@ -280,7 +283,6 @@ class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins, // assigning to bits<7>. int CD8_EltSize = 0; // Compressed disp8 form - element-size in bytes. bit has3DNow0F0FOpcode =0;// Wacky 3dNow! encoding? - bit hasMemOp4Prefix = 0; // Same bit as VEX_W, but used for swapping operands bit hasEVEX_RC = 0; // Explicitly specified rounding control in FP instruction. bits<2> EVEX_LL; @@ -317,19 +319,15 @@ class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins, let TSFlags{38-31} = Opcode; let TSFlags{39} = hasVEX_WPrefix; let TSFlags{40} = hasVEX_4V; - let TSFlags{41} = hasVEX_4VOp3; - let TSFlags{42} = hasVEX_i8ImmReg; - let TSFlags{43} = hasVEX_L; - let TSFlags{44} = ignoresVEX_L; - let TSFlags{45} = hasEVEX_K; - let TSFlags{46} = hasEVEX_Z; - let TSFlags{47} = hasEVEX_L2; - let TSFlags{48} = hasEVEX_B; + let TSFlags{41} = hasVEX_L; + let TSFlags{42} = hasEVEX_K; + let TSFlags{43} = hasEVEX_Z; + let TSFlags{44} = hasEVEX_L2; + let TSFlags{45} = hasEVEX_B; // If we run out of TSFlags bits, it's possible to encode this in 3 bits. 
- let TSFlags{55-49} = CD8_Scale; - let TSFlags{56} = has3DNow0F0FOpcode; - let TSFlags{57} = hasMemOp4Prefix; - let TSFlags{58} = hasEVEX_RC; + let TSFlags{52-46} = CD8_Scale; + let TSFlags{53} = has3DNow0F0FOpcode; + let TSFlags{54} = hasEVEX_RC; } class PseudoI<dag oops, dag iops, list<dag> pattern> @@ -351,6 +349,13 @@ class Ii8 <bits<8> o, Format f, dag outs, dag ins, string asm, let Pattern = pattern; let CodeSize = 3; } +class Ii8Reg<bits<8> o, Format f, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary, + Domain d = GenericDomain> + : X86Inst<o, f, Imm8Reg, outs, ins, asm, itin, d> { + let Pattern = pattern; + let CodeSize = 3; +} class Ii8PCRel<bits<8> o, Format f, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin = NoItinerary> : X86Inst<o, f, Imm8PCRel, outs, ins, asm, itin> { @@ -785,7 +790,6 @@ class AVX512AIi8<bits<8> o, Format F, dag outs, dag ins, string asm, : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TAPD, Requires<[HasAVX512]>; class AVX512AIi8Base : TAPD { - Domain ExeDomain = SSEPackedInt; ImmType ImmT = Imm8; } class AVX512Ii8<bits<8> o, Format F, dag outs, dag ins, string asm, @@ -850,8 +854,8 @@ class FMA3<bits<8> o, Format F, dag outs, dag ins, string asm, // FMA4 Instruction Templates class FMA4<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag>pattern, InstrItinClass itin = NoItinerary> - : Ii8<o, F, outs, ins, asm, pattern, itin>, TAPD, - VEX_4V, VEX_I8IMM, FMASC, Requires<[HasFMA4]>; + : Ii8Reg<o, F, outs, ins, asm, pattern, itin>, TAPD, + VEX_4V, FMASC, Requires<[HasFMA4]>; // XOP 2, 3 and 4 Operand Instruction Template class IXOP<bits<8> o, Format F, dag outs, dag ins, string asm, @@ -859,17 +863,22 @@ class IXOP<bits<8> o, Format F, dag outs, dag ins, string asm, : I<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>, XOP9, Requires<[HasXOP]>; -// XOP 2, 3 and 4 Operand Instruction Templates with imm byte +// XOP 2 and 3 Operand Instruction Templates with imm byte class IXOPi8<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin = NoItinerary> : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>, XOP8, Requires<[HasXOP]>; +// XOP 4 Operand Instruction Templates with imm byte +class IXOPi8Reg<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : Ii8Reg<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>, + XOP8, Requires<[HasXOP]>; // XOP 5 operand instruction (VEX encoding!) class IXOP5<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag>pattern, InstrItinClass itin = NoItinerary> - : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TAPD, - VEX_4V, VEX_I8IMM, Requires<[HasXOP]>; + : Ii8Reg<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TAPD, + VEX_4V, Requires<[HasXOP]>; // X86-64 Instruction templates... 
// diff --git a/contrib/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/contrib/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td index ea54f04..c5689d7 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/contrib/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -29,7 +29,6 @@ def MMX_X86movw2d : SDNode<"X86ISD::MMX_MOVW2D", SDTypeProfile<1, 1, def load_mmx : PatFrag<(ops node:$ptr), (x86mmx (load node:$ptr))>; def load_mvmmx : PatFrag<(ops node:$ptr), (x86mmx (MMX_X86movw2d (load node:$ptr)))>; -def bc_mmx : PatFrag<(ops node:$in), (x86mmx (bitconvert node:$in))>; //===----------------------------------------------------------------------===// // SSE specific DAG Nodes. @@ -56,8 +55,7 @@ def X86for : SDNode<"X86ISD::FOR", SDTFPBinOp, [SDNPCommutative, SDNPAssociative]>; def X86fxor : SDNode<"X86ISD::FXOR", SDTFPBinOp, [SDNPCommutative, SDNPAssociative]>; -def X86fandn : SDNode<"X86ISD::FANDN", SDTFPBinOp, - [SDNPCommutative, SDNPAssociative]>; +def X86fandn : SDNode<"X86ISD::FANDN", SDTFPBinOp>; def X86frsqrt : SDNode<"X86ISD::FRSQRT", SDTFPUnaryOp>; def X86frcp : SDNode<"X86ISD::FRCP", SDTFPUnaryOp>; def X86frsqrt14s: SDNode<"X86ISD::FRSQRTS", SDTFPBinOp>; @@ -67,16 +65,8 @@ def X86fhsub : SDNode<"X86ISD::FHSUB", SDTFPBinOp>; def X86hadd : SDNode<"X86ISD::HADD", SDTIntBinOp>; def X86hsub : SDNode<"X86ISD::HSUB", SDTIntBinOp>; def X86comi : SDNode<"X86ISD::COMI", SDTX86CmpTest>; -def X86comiSae : SDNode<"X86ISD::COMI", SDTX86CmpTestSae>; def X86ucomi : SDNode<"X86ISD::UCOMI", SDTX86CmpTest>; -def X86ucomiSae: SDNode<"X86ISD::UCOMI", SDTX86CmpTestSae>; def X86cmps : SDNode<"X86ISD::FSETCC", SDTX86Cmps>; -def X86cvtdq2pd: SDNode<"X86ISD::CVTDQ2PD", - SDTypeProfile<1, 1, [SDTCisVT<0, v2f64>, - SDTCisVT<1, v4i32>]>>; -def X86cvtudq2pd: SDNode<"X86ISD::CVTUDQ2PD", - SDTypeProfile<1, 1, [SDTCisVT<0, v2f64>, - SDTCisVT<1, v4i32>]>>; def X86pshufb : SDNode<"X86ISD::PSHUFB", SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i8>, SDTCisSameAs<0,1>, SDTCisSameAs<0,2>]>>; @@ -84,7 +74,7 @@ def X86psadbw : SDNode<"X86ISD::PSADBW", SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i64>, SDTCVecEltisVT<1, i8>, SDTCisSameSizeAs<0,1>, - SDTCisSameAs<1,2>]>>; + SDTCisSameAs<1,2>]>, [SDNPCommutative]>; def X86dbpsadbw : SDNode<"X86ISD::DBPSADBW", SDTypeProfile<1, 3, [SDTCVecEltisVT<0, i16>, SDTCVecEltisVT<1, i8>, @@ -144,25 +134,14 @@ def X86vfpround: SDNode<"X86ISD::VFPROUND", SDTCVecEltisVT<1, f64>, SDTCisSameSizeAs<0, 1>]>>; -def X86fround: SDNode<"X86ISD::VFPROUND", - SDTypeProfile<1, 2, [SDTCVecEltisVT<0, f32>, - SDTCisSameAs<0, 1>, - SDTCVecEltisVT<2, f64>, - SDTCisSameSizeAs<0, 2>]>>; -def X86froundRnd: SDNode<"X86ISD::VFPROUND", +def X86froundRnd: SDNode<"X86ISD::VFPROUNDS_RND", SDTypeProfile<1, 3, [SDTCVecEltisVT<0, f32>, SDTCisSameAs<0, 1>, SDTCVecEltisVT<2, f64>, SDTCisSameSizeAs<0, 2>, SDTCisVT<3, i32>]>>; -def X86fpext : SDNode<"X86ISD::VFPEXT", - SDTypeProfile<1, 2, [SDTCVecEltisVT<0, f64>, - SDTCisSameAs<0, 1>, - SDTCVecEltisVT<2, f32>, - SDTCisSameSizeAs<0, 2>]>>; - -def X86fpextRnd : SDNode<"X86ISD::VFPEXT", +def X86fpextRnd : SDNode<"X86ISD::VFPEXTS_RND", SDTypeProfile<1, 3, [SDTCVecEltisVT<0, f64>, SDTCisSameAs<0, 1>, SDTCVecEltisVT<2, f32>, @@ -176,7 +155,8 @@ def X86pcmpeq : SDNode<"X86ISD::PCMPEQ", SDTIntBinOp, [SDNPCommutative]>; def X86pcmpgt : SDNode<"X86ISD::PCMPGT", SDTIntBinOp>; def X86IntCmpMask : SDTypeProfile<1, 2, - [SDTCisVec<0>, SDTCisSameAs<1, 2>, SDTCisInt<1>]>; + [SDTCisVec<0>, SDTCVecEltisVT<0, i1>, SDTCisSameAs<1, 2>, SDTCisInt<1>, + SDTCisSameNumEltsAs<0, 1>]>; def X86pcmpeqm : 
SDNode<"X86ISD::PCMPEQM", X86IntCmpMask, [SDNPCommutative]>; def X86pcmpgtm : SDNode<"X86ISD::PCMPGTM", X86IntCmpMask>; @@ -188,19 +168,19 @@ def X86CmpMaskCCRound : SDTypeProfile<1, 4, [SDTCisVec<0>,SDTCVecEltisVT<0, i1>, SDTCisVec<1>, SDTCisSameAs<2, 1>, SDTCisSameNumEltsAs<0, 1>, SDTCisVT<3, i8>, - SDTCisInt<4>]>; + SDTCisVT<4, i32>]>; def X86CmpMaskCCScalar : SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<1, 2>, SDTCisVT<3, i8>]>; def X86CmpMaskCCScalarRound : SDTypeProfile<1, 4, [SDTCisInt<0>, SDTCisSameAs<1, 2>, SDTCisVT<3, i8>, - SDTCisInt<4>]>; + SDTCisVT<4, i32>]>; def X86cmpm : SDNode<"X86ISD::CMPM", X86CmpMaskCC>; def X86cmpmRnd : SDNode<"X86ISD::CMPM_RND", X86CmpMaskCCRound>; def X86cmpmu : SDNode<"X86ISD::CMPMU", X86CmpMaskCC>; -def X86cmpms : SDNode<"X86ISD::FSETCC", X86CmpMaskCCScalar>; -def X86cmpmsRnd : SDNode<"X86ISD::FSETCC", X86CmpMaskCCScalarRound>; +def X86cmpms : SDNode<"X86ISD::FSETCCM", X86CmpMaskCCScalar>; +def X86cmpmsRnd : SDNode<"X86ISD::FSETCCM_RND", X86CmpMaskCCScalarRound>; def X86vshl : SDNode<"X86ISD::VSHL", SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, @@ -212,7 +192,9 @@ def X86vsra : SDNode<"X86ISD::VSRA", SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, SDTCisVec<2>]>>; -def X86vsrav : SDNode<"X86ISD::VSRAV" , SDTIntShiftOp>; +def X86vsrav : SDNode<"X86ISD::VSRAV" , + SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisSameAs<0,2>]>>; def X86vshli : SDNode<"X86ISD::VSHLI", SDTIntShiftOp>; def X86vsrli : SDNode<"X86ISD::VSRLI", SDTIntShiftOp>; @@ -261,12 +243,12 @@ def SDTX86Testm : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, SDTCisSameAs<2, 1>, SDTCVecEltisVT<0, i1>, SDTCisSameNumEltsAs<0, 1>]>; -def X86addus : SDNode<"X86ISD::ADDUS", SDTIntBinOp>; +def X86addus : SDNode<"X86ISD::ADDUS", SDTIntBinOp, [SDNPCommutative]>; def X86subus : SDNode<"X86ISD::SUBUS", SDTIntBinOp>; -def X86adds : SDNode<"X86ISD::ADDS", SDTIntBinOp>; +def X86adds : SDNode<"X86ISD::ADDS", SDTIntBinOp, [SDNPCommutative]>; def X86subs : SDNode<"X86ISD::SUBS", SDTIntBinOp>; -def X86mulhrs : SDNode<"X86ISD::MULHRS" , SDTIntBinOp>; -def X86avg : SDNode<"X86ISD::AVG" , SDTIntBinOp>; +def X86mulhrs : SDNode<"X86ISD::MULHRS", SDTIntBinOp, [SDNPCommutative]>; +def X86avg : SDNode<"X86ISD::AVG" , SDTIntBinOp, [SDNPCommutative]>; def X86ptest : SDNode<"X86ISD::PTEST", SDTX86CmpPTest>; def X86testp : SDNode<"X86ISD::TESTP", SDTX86CmpPTest>; def X86kortest : SDNode<"X86ISD::KORTEST", SDTX86CmpPTest>; @@ -283,7 +265,7 @@ def X86select : SDNode<"X86ISD::SELECT", SDTCisSameAs<2, 3>, SDTCisSameNumEltsAs<0, 1>]>>; -def X86selects : SDNode<"X86ISD::SELECT", +def X86selects : SDNode<"X86ISD::SELECTS", SDTypeProfile<1, 3, [SDTCisVT<1, i1>, SDTCisSameAs<0, 2>, SDTCisSameAs<2, 3>]>>; @@ -292,12 +274,14 @@ def X86pmuludq : SDNode<"X86ISD::PMULUDQ", SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i64>, SDTCVecEltisVT<1, i32>, SDTCisSameSizeAs<0,1>, - SDTCisSameAs<1,2>]>>; + SDTCisSameAs<1,2>]>, + [SDNPCommutative]>; def X86pmuldq : SDNode<"X86ISD::PMULDQ", SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i64>, SDTCVecEltisVT<1, i32>, SDTCisSameSizeAs<0,1>, - SDTCisSameAs<1,2>]>>; + SDTCisSameAs<1,2>]>, + [SDNPCommutative]>; def X86extrqi : SDNode<"X86ISD::EXTRQI", SDTypeProfile<1, 3, [SDTCisVT<0, v2i64>, SDTCisSameAs<0,1>, @@ -393,7 +377,7 @@ def X86Unpckl : SDNode<"X86ISD::UNPCKL", SDTShuff2Op>; def X86Unpckh : SDNode<"X86ISD::UNPCKH", SDTShuff2Op>; def X86vpmaddubsw : SDNode<"X86ISD::VPMADDUBSW" , SDTPack>; -def X86vpmaddwd : SDNode<"X86ISD::VPMADDWD" , SDTPack>; +def X86vpmaddwd : 
SDNode<"X86ISD::VPMADDWD" , SDTPack, [SDNPCommutative]>; def X86VPermilpv : SDNode<"X86ISD::VPERMILPV", SDTShuff2OpM>; def X86VPermilpi : SDNode<"X86ISD::VPERMILPI", SDTShuff2OpI>; @@ -410,10 +394,12 @@ def X86VPermt2 : SDNode<"X86ISD::VPERMV3", SDTCisSameSizeAs<0,2>, SDTCisSameAs<0,3>]>, []>; +// Even though the index operand should be integer, we need to make it match the +// destination type so that we can pattern match the masked version where the +// index is also the passthru operand. def X86VPermi2X : SDNode<"X86ISD::VPERMIV3", - SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisInt<1>, - SDTCisVec<1>, SDTCisSameNumEltsAs<0, 1>, - SDTCisSameSizeAs<0,1>, + SDTypeProfile<1, 3, [SDTCisVec<0>, + SDTCisSameAs<0,1>, SDTCisSameAs<0,2>, SDTCisSameAs<0,3>]>, []>; @@ -462,9 +448,9 @@ def X86scalef : SDNode<"X86ISD::SCALEF", SDTFPBinOpRound>; def X86scalefs : SDNode<"X86ISD::SCALEFS", SDTFPBinOpRound>; def X86fminRnd : SDNode<"X86ISD::FMIN_RND", SDTFPBinOpRound>; def X86fsqrtRnd : SDNode<"X86ISD::FSQRT_RND", SDTFPUnaryOpRound>; -def X86fsqrtRnds : SDNode<"X86ISD::FSQRT_RND", SDTFPBinOpRound>; +def X86fsqrtRnds : SDNode<"X86ISD::FSQRTS_RND", SDTFPBinOpRound>; def X86fgetexpRnd : SDNode<"X86ISD::FGETEXP_RND", SDTFPUnaryOpRound>; -def X86fgetexpRnds : SDNode<"X86ISD::FGETEXP_RND", SDTFPBinOpRound>; +def X86fgetexpRnds : SDNode<"X86ISD::FGETEXPS_RND", SDTFPBinOpRound>; def X86Fmadd : SDNode<"X86ISD::FMADD", SDTFma>; def X86Fnmadd : SDNode<"X86ISD::FNMADD", SDTFma>; @@ -480,6 +466,18 @@ def X86FnmsubRnd : SDNode<"X86ISD::FNMSUB_RND", SDTFmaRound>; def X86FmaddsubRnd : SDNode<"X86ISD::FMADDSUB_RND", SDTFmaRound>; def X86FmsubaddRnd : SDNode<"X86ISD::FMSUBADD_RND", SDTFmaRound>; +// Scalar FMA intrinsics with passthru bits in operand 1. +def X86FmaddRnds1 : SDNode<"X86ISD::FMADDS1_RND", SDTFmaRound>; +def X86FnmaddRnds1 : SDNode<"X86ISD::FNMADDS1_RND", SDTFmaRound>; +def X86FmsubRnds1 : SDNode<"X86ISD::FMSUBS1_RND", SDTFmaRound>; +def X86FnmsubRnds1 : SDNode<"X86ISD::FNMSUBS1_RND", SDTFmaRound>; + +// Scalar FMA intrinsics with passthru bits in operand 3. 
+def X86FmaddRnds3 : SDNode<"X86ISD::FMADDS3_RND", SDTFmaRound>; +def X86FnmaddRnds3 : SDNode<"X86ISD::FNMADDS3_RND", SDTFmaRound>; +def X86FmsubRnds3 : SDNode<"X86ISD::FMSUBS3_RND", SDTFmaRound>; +def X86FnmsubRnds3 : SDNode<"X86ISD::FNMSUBS3_RND", SDTFmaRound>; + def x86vpmadd52l : SDNode<"X86ISD::VPMADD52L", SDTFma>; def x86vpmadd52h : SDNode<"X86ISD::VPMADD52H", SDTFma>; @@ -487,11 +485,11 @@ def X86rsqrt28 : SDNode<"X86ISD::RSQRT28", SDTFPUnaryOpRound>; def X86rcp28 : SDNode<"X86ISD::RCP28", SDTFPUnaryOpRound>; def X86exp2 : SDNode<"X86ISD::EXP2", SDTFPUnaryOpRound>; -def X86rsqrt28s : SDNode<"X86ISD::RSQRT28", SDTFPBinOpRound>; -def X86rcp28s : SDNode<"X86ISD::RCP28", SDTFPBinOpRound>; -def X86RndScales : SDNode<"X86ISD::VRNDSCALE", SDTFPBinOpImmRound>; -def X86Reduces : SDNode<"X86ISD::VREDUCE", SDTFPBinOpImmRound>; -def X86GetMants : SDNode<"X86ISD::VGETMANT", SDTFPBinOpImmRound>; +def X86rsqrt28s : SDNode<"X86ISD::RSQRT28S", SDTFPBinOpRound>; +def X86rcp28s : SDNode<"X86ISD::RCP28S", SDTFPBinOpRound>; +def X86RndScales : SDNode<"X86ISD::VRNDSCALES", SDTFPBinOpImmRound>; +def X86Reduces : SDNode<"X86ISD::VREDUCES", SDTFPBinOpImmRound>; +def X86GetMants : SDNode<"X86ISD::VGETMANTS", SDTFPBinOpImmRound>; def SDT_PCMPISTRI : SDTypeProfile<2, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>, SDTCisVT<2, v16i8>, SDTCisVT<3, v16i8>, @@ -515,59 +513,69 @@ def SDTintToFPRound: SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisFP<0>, def SDTFloatToInt: SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>, SDTCisInt<0>, SDTCisFP<1>]>; - def SDTFloatToIntRnd: SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, SDTCisInt<0>, SDTCisFP<1>, SDTCisVT<2, i32>]>; def SDTSFloatToIntRnd: SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisFP<1>, SDTCisVec<1>, SDTCisVT<2, i32>]>; + +def SDTVintToFP: SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>, + SDTCisFP<0>, SDTCisInt<1>]>; def SDTVintToFPRound: SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, SDTCisFP<0>, SDTCisInt<1>, SDTCisVT<2, i32>]>; // Scalar -def X86SintToFpRnd : SDNode<"X86ISD::SINT_TO_FP_RND", SDTintToFPRound>; -def X86UintToFpRnd : SDNode<"X86ISD::UINT_TO_FP_RND", SDTintToFPRound>; +def X86SintToFpRnd : SDNode<"X86ISD::SCALAR_SINT_TO_FP_RND", SDTintToFPRound>; +def X86UintToFpRnd : SDNode<"X86ISD::SCALAR_UINT_TO_FP_RND", SDTintToFPRound>; -def X86cvtts2IntRnd : SDNode<"X86ISD::FP_TO_SINT_RND", SDTSFloatToIntRnd>; -def X86cvtts2UIntRnd : SDNode<"X86ISD::FP_TO_UINT_RND", SDTSFloatToIntRnd>; +def X86cvtts2IntRnd : SDNode<"X86ISD::CVTTS2SI_RND", SDTSFloatToIntRnd>; +def X86cvtts2UIntRnd : SDNode<"X86ISD::CVTTS2UI_RND", SDTSFloatToIntRnd>; -def X86cvts2si : SDNode<"X86ISD::SCALAR_FP_TO_SINT_RND", SDTSFloatToIntRnd>; -def X86cvts2usi : SDNode<"X86ISD::SCALAR_FP_TO_UINT_RND", SDTSFloatToIntRnd>; +def X86cvts2si : SDNode<"X86ISD::CVTS2SI_RND", SDTSFloatToIntRnd>; +def X86cvts2usi : SDNode<"X86ISD::CVTS2UI_RND", SDTSFloatToIntRnd>; // Vector with rounding mode // cvtt fp-to-int staff -def X86VFpToSintRnd : SDNode<"ISD::FP_TO_SINT", SDTFloatToIntRnd>; -def X86VFpToUintRnd : SDNode<"ISD::FP_TO_UINT", SDTFloatToIntRnd>; +def X86cvttp2siRnd : SDNode<"X86ISD::CVTTP2SI_RND", SDTFloatToIntRnd>; +def X86cvttp2uiRnd : SDNode<"X86ISD::CVTTP2UI_RND", SDTFloatToIntRnd>; -def X86VSintToFpRnd : SDNode<"ISD::SINT_TO_FP", SDTVintToFPRound>; -def X86VUintToFpRnd : SDNode<"ISD::UINT_TO_FP", SDTVintToFPRound>; +def X86VSintToFpRnd : SDNode<"X86ISD::SINT_TO_FP_RND", SDTVintToFPRound>; +def X86VUintToFpRnd : SDNode<"X86ISD::UINT_TO_FP_RND", SDTVintToFPRound>; // cvt fp-to-int staff -def X86cvtp2IntRnd : 
SDNode<"X86ISD::FP_TO_SINT_RND", SDTFloatToIntRnd>; -def X86cvtp2UIntRnd : SDNode<"X86ISD::FP_TO_UINT_RND", SDTFloatToIntRnd>; +def X86cvtp2IntRnd : SDNode<"X86ISD::CVTP2SI_RND", SDTFloatToIntRnd>; +def X86cvtp2UIntRnd : SDNode<"X86ISD::CVTP2UI_RND", SDTFloatToIntRnd>; // Vector without rounding mode -def X86cvtp2Int : SDNode<"X86ISD::FP_TO_SINT_RND", SDTFloatToInt>; -def X86cvtp2UInt : SDNode<"X86ISD::FP_TO_UINT_RND", SDTFloatToInt>; -def X86cvtph2ps : SDNode<"ISD::FP16_TO_FP", +// cvtt fp-to-int staff +def X86cvttp2si : SDNode<"X86ISD::CVTTP2SI", SDTFloatToInt>; +def X86cvttp2ui : SDNode<"X86ISD::CVTTP2UI", SDTFloatToInt>; + +def X86VSintToFP : SDNode<"X86ISD::CVTSI2P", SDTVintToFP>; +def X86VUintToFP : SDNode<"X86ISD::CVTUI2P", SDTVintToFP>; + +// cvt int-to-fp staff +def X86cvtp2Int : SDNode<"X86ISD::CVTP2SI", SDTFloatToInt>; +def X86cvtp2UInt : SDNode<"X86ISD::CVTP2UI", SDTFloatToInt>; + +def X86cvtph2ps : SDNode<"X86ISD::CVTPH2PS", SDTypeProfile<1, 2, [SDTCVecEltisVT<0, f32>, SDTCVecEltisVT<1, i16>, SDTCisVT<2, i32>]> >; -def X86cvtps2ph : SDNode<"ISD::FP_TO_FP16", - SDTypeProfile<1, 3, [SDTCVecEltisVT<0, i16>, +def X86cvtps2ph : SDNode<"X86ISD::CVTPS2PH", + SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i16>, SDTCVecEltisVT<1, f32>, - SDTCisVT<2, i32>, - SDTCisVT<3, i32>]> >; -def X86vfpextRnd : SDNode<"X86ISD::VFPEXT", + SDTCisVT<2, i32>]> >; +def X86vfpextRnd : SDNode<"X86ISD::VFPEXT_RND", SDTypeProfile<1, 2, [SDTCVecEltisVT<0, f64>, SDTCVecEltisVT<1, f32>, SDTCisOpSmallerThanOp<1, 0>, SDTCisVT<2, i32>]>>; -def X86vfproundRnd: SDNode<"X86ISD::VFPROUND", +def X86vfproundRnd: SDNode<"X86ISD::VFPROUND_RND", SDTypeProfile<1, 2, [SDTCVecEltisVT<0, f32>, SDTCVecEltisVT<1, f64>, SDTCisOpSmallerThanOp<0, 1>, @@ -621,9 +629,6 @@ def loadv4i64 : PatFrag<(ops node:$ptr), (v4i64 (load node:$ptr))>; // 512-bit load pattern fragments def loadv16f32 : PatFrag<(ops node:$ptr), (v16f32 (load node:$ptr))>; def loadv8f64 : PatFrag<(ops node:$ptr), (v8f64 (load node:$ptr))>; -def loadv64i8 : PatFrag<(ops node:$ptr), (v64i8 (load node:$ptr))>; -def loadv32i16 : PatFrag<(ops node:$ptr), (v32i16 (load node:$ptr))>; -def loadv16i32 : PatFrag<(ops node:$ptr), (v16i32 (load node:$ptr))>; def loadv8i64 : PatFrag<(ops node:$ptr), (v8i64 (load node:$ptr))>; // 128-/256-/512-bit extload pattern fragments @@ -631,15 +636,6 @@ def extloadv2f32 : PatFrag<(ops node:$ptr), (v2f64 (extloadvf32 node:$ptr))>; def extloadv4f32 : PatFrag<(ops node:$ptr), (v4f64 (extloadvf32 node:$ptr))>; def extloadv8f32 : PatFrag<(ops node:$ptr), (v8f64 (extloadvf32 node:$ptr))>; -// These are needed to match a scalar load that is used in a vector-only -// math instruction such as the FP logical ops: andps, andnps, orps, xorps. -// The memory operand is required to be a 128-bit load, so it must be converted -// from a vector to a scalar. -def loadf32_128 : PatFrag<(ops node:$ptr), - (f32 (extractelt (loadv4f32 node:$ptr), (iPTR 0)))>; -def loadf64_128 : PatFrag<(ops node:$ptr), - (f64 (extractelt (loadv2f64 node:$ptr), (iPTR 0)))>; - // Like 'store', but always requires 128-bit vector alignment. 
def alignedstore : PatFrag<(ops node:$val, node:$ptr), (store node:$val, node:$ptr), [{ @@ -673,11 +669,6 @@ def alignedload512 : PatFrag<(ops node:$ptr), (load node:$ptr), [{ return cast<LoadSDNode>(N)->getAlignment() >= 64; }]>; -def alignedloadfsf32 : PatFrag<(ops node:$ptr), - (f32 (alignedload node:$ptr))>; -def alignedloadfsf64 : PatFrag<(ops node:$ptr), - (f64 (alignedload node:$ptr))>; - // 128-bit aligned load pattern fragments // NOTE: all 128-bit integer vector loads are promoted to v2i64 def alignedloadv4f32 : PatFrag<(ops node:$ptr), @@ -699,8 +690,6 @@ def alignedloadv4i64 : PatFrag<(ops node:$ptr), // 512-bit aligned load pattern fragments def alignedloadv16f32 : PatFrag<(ops node:$ptr), (v16f32 (alignedload512 node:$ptr))>; -def alignedloadv16i32 : PatFrag<(ops node:$ptr), - (v16i32 (alignedload512 node:$ptr))>; def alignedloadv8f64 : PatFrag<(ops node:$ptr), (v8f64 (alignedload512 node:$ptr))>; def alignedloadv8i64 : PatFrag<(ops node:$ptr), @@ -717,9 +706,6 @@ def memop : PatFrag<(ops node:$ptr), (load node:$ptr), [{ || cast<LoadSDNode>(N)->getAlignment() >= 16; }]>; -def memopfsf32 : PatFrag<(ops node:$ptr), (f32 (memop node:$ptr))>; -def memopfsf64 : PatFrag<(ops node:$ptr), (f64 (memop node:$ptr))>; - // 128-bit memop pattern fragments // NOTE: all 128-bit integer vector loads are promoted to v2i64 def memopv4f32 : PatFrag<(ops node:$ptr), (v4f32 (memop node:$ptr))>; @@ -853,6 +839,7 @@ def bc_v4i64 : PatFrag<(ops node:$in), (v4i64 (bitconvert node:$in))>; def bc_v8f32 : PatFrag<(ops node:$in), (v8f32 (bitconvert node:$in))>; // 512-bit bitconvert pattern fragments +def bc_v64i8 : PatFrag<(ops node:$in), (v64i8 (bitconvert node:$in))>; def bc_v16i32 : PatFrag<(ops node:$in), (v16i32 (bitconvert node:$in))>; def bc_v8i64 : PatFrag<(ops node:$in), (v8i64 (bitconvert node:$in))>; def bc_v8f64 : PatFrag<(ops node:$in), (v8f64 (bitconvert node:$in))>; @@ -873,6 +860,10 @@ def fp32imm0 : PatLeaf<(f32 fpimm), [{ return N->isExactlyValue(+0.0); }]>; +def fp64imm0 : PatLeaf<(f64 fpimm), [{ + return N->isExactlyValue(+0.0); +}]>; + def I8Imm : SDNodeXForm<imm, [{ // Transformation function: get the low 8 bits. 
return getI8Imm((uint8_t)N->getZExtValue(), SDLoc(N)); @@ -940,30 +931,36 @@ def vinsert256_insert : PatFrag<(ops node:$bigvec, node:$smallvec, return X86::isVINSERT256Index(N); }], INSERT_get_vinsert256_imm>; -def masked_load_aligned128 : PatFrag<(ops node:$src1, node:$src2, node:$src3), +def X86mload : PatFrag<(ops node:$src1, node:$src2, node:$src3), (masked_load node:$src1, node:$src2, node:$src3), [{ - if (auto *Load = dyn_cast<MaskedLoadSDNode>(N)) - return Load->getAlignment() >= 16; - return false; + return !cast<MaskedLoadSDNode>(N)->isExpandingLoad() && + cast<MaskedLoadSDNode>(N)->getExtensionType() == ISD::NON_EXTLOAD; +}]>; + +def masked_load_aligned128 : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (X86mload node:$src1, node:$src2, node:$src3), [{ + return cast<MaskedLoadSDNode>(N)->getAlignment() >= 16; }]>; def masked_load_aligned256 : PatFrag<(ops node:$src1, node:$src2, node:$src3), - (masked_load node:$src1, node:$src2, node:$src3), [{ - if (auto *Load = dyn_cast<MaskedLoadSDNode>(N)) - return Load->getAlignment() >= 32; - return false; + (X86mload node:$src1, node:$src2, node:$src3), [{ + return cast<MaskedLoadSDNode>(N)->getAlignment() >= 32; }]>; def masked_load_aligned512 : PatFrag<(ops node:$src1, node:$src2, node:$src3), - (masked_load node:$src1, node:$src2, node:$src3), [{ - if (auto *Load = dyn_cast<MaskedLoadSDNode>(N)) - return Load->getAlignment() >= 64; - return false; + (X86mload node:$src1, node:$src2, node:$src3), [{ + return cast<MaskedLoadSDNode>(N)->getAlignment() >= 64; }]>; def masked_load_unaligned : PatFrag<(ops node:$src1, node:$src2, node:$src3), (masked_load node:$src1, node:$src2, node:$src3), [{ - return isa<MaskedLoadSDNode>(N); + return !cast<MaskedLoadSDNode>(N)->isExpandingLoad() && + cast<MaskedLoadSDNode>(N)->getExtensionType() == ISD::NON_EXTLOAD; +}]>; + +def X86mExpandingLoad : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (masked_load node:$src1, node:$src2, node:$src3), [{ + return cast<MaskedLoadSDNode>(N)->isExpandingLoad(); }]>; // Masked store fragments. @@ -971,33 +968,34 @@ def masked_load_unaligned : PatFrag<(ops node:$src1, node:$src2, node:$src3), // do not support vector types (llvm-tblgen will fail). 
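
(Aside, not part of the patch: a scalar C++ model of the distinction the new masked-load PatFrags draw. A plain masked load reads lane i from mem[i] only where the mask is set and otherwise keeps the passthru lane; an expanding load instead reads consecutive elements and distributes them into the enabled lanes, which is why it must be matched separately. Names below are illustrative.)

#include <array>
#include <cstddef>
#include <cstdio>

template <std::size_t N>
std::array<int, N> masked_load(const int *mem, const std::array<bool, N> &mask,
                               std::array<int, N> passthru) {
  for (std::size_t i = 0; i < N; ++i)
    if (mask[i]) passthru[i] = mem[i];        // lane i reads mem[i]
  return passthru;
}

template <std::size_t N>
std::array<int, N> expanding_load(const int *mem, const std::array<bool, N> &mask,
                                  std::array<int, N> passthru) {
  std::size_t next = 0;                       // consecutive source index
  for (std::size_t i = 0; i < N; ++i)
    if (mask[i]) passthru[i] = mem[next++];   // enabled lanes consume mem in order
  return passthru;
}

int main() {
  const int mem[4] = {10, 20, 30, 40};
  const std::array<bool, 4> mask{true, false, true, false};
  const std::array<int, 4> pass{0, 0, 0, 0};
  auto a = masked_load<4>(mem, mask, pass);     // 10 0 30 0
  auto b = expanding_load<4>(mem, mask, pass);  // 10 0 20 0
  std::printf("%d %d %d %d | %d %d %d %d\n",
              a[0], a[1], a[2], a[3], b[0], b[1], b[2], b[3]);
  return 0;
}
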
def X86mstore : PatFrag<(ops node:$src1, node:$src2, node:$src3), (masked_store node:$src1, node:$src2, node:$src3), [{ - return !cast<MaskedStoreSDNode>(N)->isTruncatingStore(); + return (!cast<MaskedStoreSDNode>(N)->isTruncatingStore()) && + (!cast<MaskedStoreSDNode>(N)->isCompressingStore()); }]>; def masked_store_aligned128 : PatFrag<(ops node:$src1, node:$src2, node:$src3), (X86mstore node:$src1, node:$src2, node:$src3), [{ - if (auto *Store = dyn_cast<MaskedStoreSDNode>(N)) - return Store->getAlignment() >= 16; - return false; + return cast<MaskedStoreSDNode>(N)->getAlignment() >= 16; }]>; def masked_store_aligned256 : PatFrag<(ops node:$src1, node:$src2, node:$src3), (X86mstore node:$src1, node:$src2, node:$src3), [{ - if (auto *Store = dyn_cast<MaskedStoreSDNode>(N)) - return Store->getAlignment() >= 32; - return false; + return cast<MaskedStoreSDNode>(N)->getAlignment() >= 32; }]>; def masked_store_aligned512 : PatFrag<(ops node:$src1, node:$src2, node:$src3), (X86mstore node:$src1, node:$src2, node:$src3), [{ - if (auto *Store = dyn_cast<MaskedStoreSDNode>(N)) - return Store->getAlignment() >= 64; - return false; + return cast<MaskedStoreSDNode>(N)->getAlignment() >= 64; }]>; def masked_store_unaligned : PatFrag<(ops node:$src1, node:$src2, node:$src3), - (X86mstore node:$src1, node:$src2, node:$src3), [{ - return isa<MaskedStoreSDNode>(N); + (masked_store node:$src1, node:$src2, node:$src3), [{ + return (!cast<MaskedStoreSDNode>(N)->isTruncatingStore()) && + (!cast<MaskedStoreSDNode>(N)->isCompressingStore()); +}]>; + +def X86mCompressingStore : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (masked_store node:$src1, node:$src2, node:$src3), [{ + return cast<MaskedStoreSDNode>(N)->isCompressingStore(); }]>; // masked truncstore fragments @@ -1022,3 +1020,80 @@ def masked_truncstorevi32 : (X86mtruncstore node:$src1, node:$src2, node:$src3), [{ return cast<MaskedStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i32; }]>; + +def X86TruncSStore : SDNode<"X86ISD::VTRUNCSTORES", SDTStore, + [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; + +def X86TruncUSStore : SDNode<"X86ISD::VTRUNCSTOREUS", SDTStore, + [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; + +def X86MTruncSStore : SDNode<"X86ISD::VMTRUNCSTORES", SDTMaskedStore, + [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; + +def X86MTruncUSStore : SDNode<"X86ISD::VMTRUNCSTOREUS", SDTMaskedStore, + [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; + +def truncstore_s_vi8 : PatFrag<(ops node:$val, node:$ptr), + (X86TruncSStore node:$val, node:$ptr), [{ + return cast<TruncSStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i8; +}]>; + +def truncstore_us_vi8 : PatFrag<(ops node:$val, node:$ptr), + (X86TruncUSStore node:$val, node:$ptr), [{ + return cast<TruncUSStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i8; +}]>; + +def truncstore_s_vi16 : PatFrag<(ops node:$val, node:$ptr), + (X86TruncSStore node:$val, node:$ptr), [{ + return cast<TruncSStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i16; +}]>; + +def truncstore_us_vi16 : PatFrag<(ops node:$val, node:$ptr), + (X86TruncUSStore node:$val, node:$ptr), [{ + return cast<TruncUSStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i16; +}]>; + +def truncstore_s_vi32 : PatFrag<(ops node:$val, node:$ptr), + (X86TruncSStore node:$val, node:$ptr), [{ + return cast<TruncSStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i32; +}]>; + +def truncstore_us_vi32 : PatFrag<(ops node:$val, node:$ptr), + (X86TruncUSStore node:$val, node:$ptr), [{ + return 
cast<TruncUSStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i32; +}]>; + +def masked_truncstore_s_vi8 : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (X86MTruncSStore node:$src1, node:$src2, node:$src3), [{ + return cast<MaskedTruncSStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i8; +}]>; + +def masked_truncstore_us_vi8 : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (X86MTruncUSStore node:$src1, node:$src2, node:$src3), [{ + return cast<MaskedTruncUSStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i8; +}]>; + +def masked_truncstore_s_vi16 : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (X86MTruncSStore node:$src1, node:$src2, node:$src3), [{ + return cast<MaskedTruncSStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i16; +}]>; + +def masked_truncstore_us_vi16 : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (X86MTruncUSStore node:$src1, node:$src2, node:$src3), [{ + return cast<MaskedTruncUSStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i16; +}]>; + +def masked_truncstore_s_vi32 : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (X86MTruncSStore node:$src1, node:$src2, node:$src3), [{ + return cast<MaskedTruncSStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i32; +}]>; + +def masked_truncstore_us_vi32 : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (X86MTruncUSStore node:$src1, node:$src2, node:$src3), [{ + return cast<MaskedTruncUSStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i32; +}]>; + +def assertzext_i1 : + PatFrag<(ops node:$src), (assertzext node:$src), [{ + return cast<VTSDNode>(N->getOperand(1))->getVT() == MVT::i1; +}]>; diff --git a/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp b/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp index 5f0aab9..627b612 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -68,7 +68,7 @@ static cl::opt<unsigned> UndefRegClearance("undef-reg-clearance", cl::desc("How many idle instructions we would like before " "certain undef register reads"), - cl::init(64), cl::Hidden); + cl::init(128), cl::Hidden); enum { // Select which memory operand is being unfolded. 
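
(Aside, not part of the patch: the long opcode tables that follow pair a register-form instruction with its memory-form twin plus flags such as a required load alignment, store-only folding, or TB_NO_REVERSE, which marks a one-way fold — the memory form cannot be unfolded back into the register form, typically because the memory operand is narrower than the register. A minimal C++ sketch of the lookup idea; the struct, function, and opcode numbers below are illustrative, not the real X86InstrInfo API.)

#include <cstdint>
#include <cstdio>
#include <unordered_map>

struct MemoryFoldEntry {
  unsigned MemOpcode;   // memory-form twin of the register-form key
  std::uint16_t Flags;  // e.g. alignment requirement, folded-store, no-reverse
};

using FoldTable = std::unordered_map<unsigned, MemoryFoldEntry>;

// Returns the entry if the register-form opcode has a memory-folded variant.
const MemoryFoldEntry *lookupFold(const FoldTable &Table, unsigned RegOpcode) {
  auto It = Table.find(RegOpcode);
  return It == Table.end() ? nullptr : &It->second;
}

int main() {
  FoldTable Table;
  Table[100] = {101, 0x1};  // hypothetical "rr -> rm" pairing with a flag
  if (const MemoryFoldEntry *E = lookupFold(Table, 100))
    std::printf("folds to opcode %u (flags 0x%x)\n", E->MemOpcode,
                static_cast<unsigned>(E->Flags));
  return 0;
}
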
@@ -228,12 +228,16 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::SBB64ri32, X86::SBB64mi32, 0 }, { X86::SBB64ri8, X86::SBB64mi8, 0 }, { X86::SBB64rr, X86::SBB64mr, 0 }, + { X86::SHL16r1, X86::SHL16m1, 0 }, { X86::SHL16rCL, X86::SHL16mCL, 0 }, { X86::SHL16ri, X86::SHL16mi, 0 }, + { X86::SHL32r1, X86::SHL32m1, 0 }, { X86::SHL32rCL, X86::SHL32mCL, 0 }, { X86::SHL32ri, X86::SHL32mi, 0 }, + { X86::SHL64r1, X86::SHL64m1, 0 }, { X86::SHL64rCL, X86::SHL64mCL, 0 }, { X86::SHL64ri, X86::SHL64mi, 0 }, + { X86::SHL8r1, X86::SHL8m1, 0 }, { X86::SHL8rCL, X86::SHL8mCL, 0 }, { X86::SHL8ri, X86::SHL8mi, 0 }, { X86::SHLD16rrCL, X86::SHLD16mrCL, 0 }, @@ -335,6 +339,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::MOVAPDrr, X86::MOVAPDmr, TB_FOLDED_STORE | TB_ALIGN_16 }, { X86::MOVAPSrr, X86::MOVAPSmr, TB_FOLDED_STORE | TB_ALIGN_16 }, { X86::MOVDQArr, X86::MOVDQAmr, TB_FOLDED_STORE | TB_ALIGN_16 }, + { X86::MOVDQUrr, X86::MOVDQUmr, TB_FOLDED_STORE }, { X86::MOVPDI2DIrr, X86::MOVPDI2DImr, TB_FOLDED_STORE }, { X86::MOVPQIto64rr,X86::MOVPQI2QImr, TB_FOLDED_STORE }, { X86::MOVSDto64rr, X86::MOVSDto64mr, TB_FOLDED_STORE }, @@ -380,6 +385,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VMOVAPDrr, X86::VMOVAPDmr, TB_FOLDED_STORE | TB_ALIGN_16 }, { X86::VMOVAPSrr, X86::VMOVAPSmr, TB_FOLDED_STORE | TB_ALIGN_16 }, { X86::VMOVDQArr, X86::VMOVDQAmr, TB_FOLDED_STORE | TB_ALIGN_16 }, + { X86::VMOVDQUrr, X86::VMOVDQUmr, TB_FOLDED_STORE }, { X86::VMOVPDI2DIrr,X86::VMOVPDI2DImr, TB_FOLDED_STORE }, { X86::VMOVPQIto64rr, X86::VMOVPQI2QImr,TB_FOLDED_STORE }, { X86::VMOVSDto64rr,X86::VMOVSDto64mr, TB_FOLDED_STORE }, @@ -394,10 +400,20 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VMOVAPDYrr, X86::VMOVAPDYmr, TB_FOLDED_STORE | TB_ALIGN_32 }, { X86::VMOVAPSYrr, X86::VMOVAPSYmr, TB_FOLDED_STORE | TB_ALIGN_32 }, { X86::VMOVDQAYrr, X86::VMOVDQAYmr, TB_FOLDED_STORE | TB_ALIGN_32 }, + { X86::VMOVDQUYrr, X86::VMOVDQUYmr, TB_FOLDED_STORE }, { X86::VMOVUPDYrr, X86::VMOVUPDYmr, TB_FOLDED_STORE }, { X86::VMOVUPSYrr, X86::VMOVUPSYmr, TB_FOLDED_STORE }, // AVX-512 foldable instructions + { X86::VEXTRACTF32x4Zrr,X86::VEXTRACTF32x4Zmr, TB_FOLDED_STORE }, + { X86::VEXTRACTF32x8Zrr,X86::VEXTRACTF32x8Zmr, TB_FOLDED_STORE }, + { X86::VEXTRACTF64x2Zrr,X86::VEXTRACTF64x2Zmr, TB_FOLDED_STORE }, + { X86::VEXTRACTF64x4Zrr,X86::VEXTRACTF64x4Zmr, TB_FOLDED_STORE }, + { X86::VEXTRACTI32x4Zrr,X86::VEXTRACTI32x4Zmr, TB_FOLDED_STORE }, + { X86::VEXTRACTI32x8Zrr,X86::VEXTRACTI32x8Zmr, TB_FOLDED_STORE }, + { X86::VEXTRACTI64x2Zrr,X86::VEXTRACTI64x2Zmr, TB_FOLDED_STORE }, + { X86::VEXTRACTI64x4Zrr,X86::VEXTRACTI64x4Zmr, TB_FOLDED_STORE }, + { X86::VEXTRACTPSZrr, X86::VEXTRACTPSZmr, TB_FOLDED_STORE }, { X86::VMOVPDI2DIZrr, X86::VMOVPDI2DIZmr, TB_FOLDED_STORE }, { X86::VMOVAPDZrr, X86::VMOVAPDZmr, TB_FOLDED_STORE | TB_ALIGN_64 }, { X86::VMOVAPSZrr, X86::VMOVAPSZmr, TB_FOLDED_STORE | TB_ALIGN_64 }, @@ -409,8 +425,27 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VMOVDQU16Zrr, X86::VMOVDQU16Zmr, TB_FOLDED_STORE }, { X86::VMOVDQU32Zrr, X86::VMOVDQU32Zmr, TB_FOLDED_STORE }, { X86::VMOVDQU64Zrr, X86::VMOVDQU64Zmr, TB_FOLDED_STORE }, + { X86::VPMOVDBZrr, X86::VPMOVDBZmr, TB_FOLDED_STORE }, + { X86::VPMOVDWZrr, X86::VPMOVDWZmr, TB_FOLDED_STORE }, + { X86::VPMOVQDZrr, X86::VPMOVQDZmr, TB_FOLDED_STORE }, + { X86::VPMOVQWZrr, X86::VPMOVQWZmr, TB_FOLDED_STORE }, + { X86::VPMOVWBZrr, X86::VPMOVWBZmr, TB_FOLDED_STORE }, + { X86::VPMOVSDBZrr, X86::VPMOVSDBZmr, TB_FOLDED_STORE }, + { X86::VPMOVSDWZrr, X86::VPMOVSDWZmr, 
TB_FOLDED_STORE }, + { X86::VPMOVSQDZrr, X86::VPMOVSQDZmr, TB_FOLDED_STORE }, + { X86::VPMOVSQWZrr, X86::VPMOVSQWZmr, TB_FOLDED_STORE }, + { X86::VPMOVSWBZrr, X86::VPMOVSWBZmr, TB_FOLDED_STORE }, + { X86::VPMOVUSDBZrr, X86::VPMOVUSDBZmr, TB_FOLDED_STORE }, + { X86::VPMOVUSDWZrr, X86::VPMOVUSDWZmr, TB_FOLDED_STORE }, + { X86::VPMOVUSQDZrr, X86::VPMOVUSQDZmr, TB_FOLDED_STORE }, + { X86::VPMOVUSQWZrr, X86::VPMOVUSQWZmr, TB_FOLDED_STORE }, + { X86::VPMOVUSWBZrr, X86::VPMOVUSWBZmr, TB_FOLDED_STORE }, // AVX-512 foldable instructions (256-bit versions) + { X86::VEXTRACTF32x4Z256rr,X86::VEXTRACTF32x4Z256mr, TB_FOLDED_STORE }, + { X86::VEXTRACTF64x2Z256rr,X86::VEXTRACTF64x2Z256mr, TB_FOLDED_STORE }, + { X86::VEXTRACTI32x4Z256rr,X86::VEXTRACTI32x4Z256mr, TB_FOLDED_STORE }, + { X86::VEXTRACTI64x2Z256rr,X86::VEXTRACTI64x2Z256mr, TB_FOLDED_STORE }, { X86::VMOVAPDZ256rr, X86::VMOVAPDZ256mr, TB_FOLDED_STORE | TB_ALIGN_32 }, { X86::VMOVAPSZ256rr, X86::VMOVAPSZ256mr, TB_FOLDED_STORE | TB_ALIGN_32 }, { X86::VMOVDQA32Z256rr, X86::VMOVDQA32Z256mr, TB_FOLDED_STORE | TB_ALIGN_32 }, @@ -421,6 +456,15 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VMOVDQU16Z256rr, X86::VMOVDQU16Z256mr, TB_FOLDED_STORE }, { X86::VMOVDQU32Z256rr, X86::VMOVDQU32Z256mr, TB_FOLDED_STORE }, { X86::VMOVDQU64Z256rr, X86::VMOVDQU64Z256mr, TB_FOLDED_STORE }, + { X86::VPMOVDWZ256rr, X86::VPMOVDWZ256mr, TB_FOLDED_STORE }, + { X86::VPMOVQDZ256rr, X86::VPMOVQDZ256mr, TB_FOLDED_STORE }, + { X86::VPMOVWBZ256rr, X86::VPMOVWBZ256mr, TB_FOLDED_STORE }, + { X86::VPMOVSDWZ256rr, X86::VPMOVSDWZ256mr, TB_FOLDED_STORE }, + { X86::VPMOVSQDZ256rr, X86::VPMOVSQDZ256mr, TB_FOLDED_STORE }, + { X86::VPMOVSWBZ256rr, X86::VPMOVSWBZ256mr, TB_FOLDED_STORE }, + { X86::VPMOVUSDWZ256rr, X86::VPMOVUSDWZ256mr, TB_FOLDED_STORE }, + { X86::VPMOVUSQDZ256rr, X86::VPMOVUSQDZ256mr, TB_FOLDED_STORE }, + { X86::VPMOVUSWBZ256rr, X86::VPMOVUSWBZ256mr, TB_FOLDED_STORE }, // AVX-512 foldable instructions (128-bit versions) { X86::VMOVAPDZ128rr, X86::VMOVAPDZ128mr, TB_FOLDED_STORE | TB_ALIGN_16 }, @@ -471,26 +515,26 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::IMUL32rri8, X86::IMUL32rmi8, 0 }, { X86::IMUL64rri32, X86::IMUL64rmi32, 0 }, { X86::IMUL64rri8, X86::IMUL64rmi8, 0 }, - { X86::Int_COMISDrr, X86::Int_COMISDrm, 0 }, - { X86::Int_COMISSrr, X86::Int_COMISSrm, 0 }, - { X86::CVTSD2SI64rr, X86::CVTSD2SI64rm, 0 }, - { X86::CVTSD2SIrr, X86::CVTSD2SIrm, 0 }, - { X86::CVTSS2SI64rr, X86::CVTSS2SI64rm, 0 }, - { X86::CVTSS2SIrr, X86::CVTSS2SIrm, 0 }, - { X86::CVTDQ2PDrr, X86::CVTDQ2PDrm, TB_ALIGN_16 }, + { X86::Int_COMISDrr, X86::Int_COMISDrm, TB_NO_REVERSE }, + { X86::Int_COMISSrr, X86::Int_COMISSrm, TB_NO_REVERSE }, + { X86::CVTSD2SI64rr, X86::CVTSD2SI64rm, TB_NO_REVERSE }, + { X86::CVTSD2SIrr, X86::CVTSD2SIrm, TB_NO_REVERSE }, + { X86::CVTSS2SI64rr, X86::CVTSS2SI64rm, TB_NO_REVERSE }, + { X86::CVTSS2SIrr, X86::CVTSS2SIrm, TB_NO_REVERSE }, + { X86::CVTDQ2PDrr, X86::CVTDQ2PDrm, TB_NO_REVERSE }, { X86::CVTDQ2PSrr, X86::CVTDQ2PSrm, TB_ALIGN_16 }, { X86::CVTPD2DQrr, X86::CVTPD2DQrm, TB_ALIGN_16 }, { X86::CVTPD2PSrr, X86::CVTPD2PSrm, TB_ALIGN_16 }, { X86::CVTPS2DQrr, X86::CVTPS2DQrm, TB_ALIGN_16 }, - { X86::CVTPS2PDrr, X86::CVTPS2PDrm, TB_ALIGN_16 }, + { X86::CVTPS2PDrr, X86::CVTPS2PDrm, TB_NO_REVERSE }, { X86::CVTTPD2DQrr, X86::CVTTPD2DQrm, TB_ALIGN_16 }, { X86::CVTTPS2DQrr, X86::CVTTPS2DQrm, TB_ALIGN_16 }, - { X86::Int_CVTTSD2SI64rr,X86::Int_CVTTSD2SI64rm, 0 }, - { X86::Int_CVTTSD2SIrr, X86::Int_CVTTSD2SIrm, 0 }, - { X86::Int_CVTTSS2SI64rr,X86::Int_CVTTSS2SI64rm, 0 
}, - { X86::Int_CVTTSS2SIrr, X86::Int_CVTTSS2SIrm, 0 }, - { X86::Int_UCOMISDrr, X86::Int_UCOMISDrm, 0 }, - { X86::Int_UCOMISSrr, X86::Int_UCOMISSrm, 0 }, + { X86::Int_CVTTSD2SI64rr,X86::Int_CVTTSD2SI64rm, TB_NO_REVERSE }, + { X86::Int_CVTTSD2SIrr, X86::Int_CVTTSD2SIrm, TB_NO_REVERSE }, + { X86::Int_CVTTSS2SI64rr,X86::Int_CVTTSS2SI64rm, TB_NO_REVERSE }, + { X86::Int_CVTTSS2SIrr, X86::Int_CVTTSS2SIrm, TB_NO_REVERSE }, + { X86::Int_UCOMISDrr, X86::Int_UCOMISDrm, TB_NO_REVERSE }, + { X86::Int_UCOMISSrr, X86::Int_UCOMISSrm, TB_NO_REVERSE }, { X86::MOV16rr, X86::MOV16rm, 0 }, { X86::MOV32rr, X86::MOV32rm, 0 }, { X86::MOV64rr, X86::MOV64rm, 0 }, @@ -499,10 +543,11 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::MOV8rr, X86::MOV8rm, 0 }, { X86::MOVAPDrr, X86::MOVAPDrm, TB_ALIGN_16 }, { X86::MOVAPSrr, X86::MOVAPSrm, TB_ALIGN_16 }, - { X86::MOVDDUPrr, X86::MOVDDUPrm, 0 }, + { X86::MOVDDUPrr, X86::MOVDDUPrm, TB_NO_REVERSE }, { X86::MOVDI2PDIrr, X86::MOVDI2PDIrm, 0 }, { X86::MOVDI2SSrr, X86::MOVDI2SSrm, 0 }, { X86::MOVDQArr, X86::MOVDQArm, TB_ALIGN_16 }, + { X86::MOVDQUrr, X86::MOVDQUrm, 0 }, { X86::MOVSHDUPrr, X86::MOVSHDUPrm, TB_ALIGN_16 }, { X86::MOVSLDUPrr, X86::MOVSLDUPrm, TB_ALIGN_16 }, { X86::MOVSX16rr8, X86::MOVSX16rm8, 0 }, @@ -511,51 +556,53 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::MOVSX64rr16, X86::MOVSX64rm16, 0 }, { X86::MOVSX64rr32, X86::MOVSX64rm32, 0 }, { X86::MOVSX64rr8, X86::MOVSX64rm8, 0 }, - { X86::MOVUPDrr, X86::MOVUPDrm, TB_ALIGN_16 }, + { X86::MOVUPDrr, X86::MOVUPDrm, 0 }, { X86::MOVUPSrr, X86::MOVUPSrm, 0 }, - { X86::MOVZPQILo2PQIrr, X86::MOVZPQILo2PQIrm, TB_ALIGN_16 }, + { X86::MOVZPQILo2PQIrr, X86::MOVQI2PQIrm, TB_NO_REVERSE }, { X86::MOVZX16rr8, X86::MOVZX16rm8, 0 }, { X86::MOVZX32rr16, X86::MOVZX32rm16, 0 }, { X86::MOVZX32_NOREXrr8, X86::MOVZX32_NOREXrm8, 0 }, { X86::MOVZX32rr8, X86::MOVZX32rm8, 0 }, - { X86::PABSBrr128, X86::PABSBrm128, TB_ALIGN_16 }, - { X86::PABSDrr128, X86::PABSDrm128, TB_ALIGN_16 }, - { X86::PABSWrr128, X86::PABSWrm128, TB_ALIGN_16 }, + { X86::PABSBrr, X86::PABSBrm, TB_ALIGN_16 }, + { X86::PABSDrr, X86::PABSDrm, TB_ALIGN_16 }, + { X86::PABSWrr, X86::PABSWrm, TB_ALIGN_16 }, { X86::PCMPESTRIrr, X86::PCMPESTRIrm, TB_ALIGN_16 }, { X86::PCMPESTRM128rr, X86::PCMPESTRM128rm, TB_ALIGN_16 }, { X86::PCMPISTRIrr, X86::PCMPISTRIrm, TB_ALIGN_16 }, { X86::PCMPISTRM128rr, X86::PCMPISTRM128rm, TB_ALIGN_16 }, { X86::PHMINPOSUWrr128, X86::PHMINPOSUWrm128, TB_ALIGN_16 }, - { X86::PMOVSXBDrr, X86::PMOVSXBDrm, TB_ALIGN_16 }, - { X86::PMOVSXBQrr, X86::PMOVSXBQrm, TB_ALIGN_16 }, - { X86::PMOVSXBWrr, X86::PMOVSXBWrm, TB_ALIGN_16 }, - { X86::PMOVSXDQrr, X86::PMOVSXDQrm, TB_ALIGN_16 }, - { X86::PMOVSXWDrr, X86::PMOVSXWDrm, TB_ALIGN_16 }, - { X86::PMOVSXWQrr, X86::PMOVSXWQrm, TB_ALIGN_16 }, - { X86::PMOVZXBDrr, X86::PMOVZXBDrm, TB_ALIGN_16 }, - { X86::PMOVZXBQrr, X86::PMOVZXBQrm, TB_ALIGN_16 }, - { X86::PMOVZXBWrr, X86::PMOVZXBWrm, TB_ALIGN_16 }, - { X86::PMOVZXDQrr, X86::PMOVZXDQrm, TB_ALIGN_16 }, - { X86::PMOVZXWDrr, X86::PMOVZXWDrm, TB_ALIGN_16 }, - { X86::PMOVZXWQrr, X86::PMOVZXWQrm, TB_ALIGN_16 }, + { X86::PMOVSXBDrr, X86::PMOVSXBDrm, TB_NO_REVERSE }, + { X86::PMOVSXBQrr, X86::PMOVSXBQrm, TB_NO_REVERSE }, + { X86::PMOVSXBWrr, X86::PMOVSXBWrm, TB_NO_REVERSE }, + { X86::PMOVSXDQrr, X86::PMOVSXDQrm, TB_NO_REVERSE }, + { X86::PMOVSXWDrr, X86::PMOVSXWDrm, TB_NO_REVERSE }, + { X86::PMOVSXWQrr, X86::PMOVSXWQrm, TB_NO_REVERSE }, + { X86::PMOVZXBDrr, X86::PMOVZXBDrm, TB_NO_REVERSE }, + { X86::PMOVZXBQrr, X86::PMOVZXBQrm, TB_NO_REVERSE }, + { X86::PMOVZXBWrr, 
X86::PMOVZXBWrm, TB_NO_REVERSE }, + { X86::PMOVZXDQrr, X86::PMOVZXDQrm, TB_NO_REVERSE }, + { X86::PMOVZXWDrr, X86::PMOVZXWDrm, TB_NO_REVERSE }, + { X86::PMOVZXWQrr, X86::PMOVZXWQrm, TB_NO_REVERSE }, { X86::PSHUFDri, X86::PSHUFDmi, TB_ALIGN_16 }, { X86::PSHUFHWri, X86::PSHUFHWmi, TB_ALIGN_16 }, { X86::PSHUFLWri, X86::PSHUFLWmi, TB_ALIGN_16 }, { X86::PTESTrr, X86::PTESTrm, TB_ALIGN_16 }, { X86::RCPPSr, X86::RCPPSm, TB_ALIGN_16 }, { X86::RCPSSr, X86::RCPSSm, 0 }, - { X86::RCPSSr_Int, X86::RCPSSm_Int, 0 }, + { X86::RCPSSr_Int, X86::RCPSSm_Int, TB_NO_REVERSE }, { X86::ROUNDPDr, X86::ROUNDPDm, TB_ALIGN_16 }, { X86::ROUNDPSr, X86::ROUNDPSm, TB_ALIGN_16 }, + { X86::ROUNDSDr, X86::ROUNDSDm, 0 }, + { X86::ROUNDSSr, X86::ROUNDSSm, 0 }, { X86::RSQRTPSr, X86::RSQRTPSm, TB_ALIGN_16 }, { X86::RSQRTSSr, X86::RSQRTSSm, 0 }, - { X86::RSQRTSSr_Int, X86::RSQRTSSm_Int, 0 }, + { X86::RSQRTSSr_Int, X86::RSQRTSSm_Int, TB_NO_REVERSE }, { X86::SQRTPDr, X86::SQRTPDm, TB_ALIGN_16 }, { X86::SQRTPSr, X86::SQRTPSm, TB_ALIGN_16 }, { X86::SQRTSDr, X86::SQRTSDm, 0 }, - { X86::SQRTSDr_Int, X86::SQRTSDm_Int, 0 }, + { X86::SQRTSDr_Int, X86::SQRTSDm_Int, TB_NO_REVERSE }, { X86::SQRTSSr, X86::SQRTSSm, 0 }, - { X86::SQRTSSr_Int, X86::SQRTSSm_Int, 0 }, + { X86::SQRTSSr_Int, X86::SQRTSSm_Int, TB_NO_REVERSE }, { X86::TEST16rr, X86::TEST16rm, 0 }, { X86::TEST32rr, X86::TEST32rm, 0 }, { X86::TEST64rr, X86::TEST64rm, 0 }, @@ -586,46 +633,47 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::PSWAPDrr, X86::PSWAPDrm, 0 }, // AVX 128-bit versions of foldable instructions - { X86::Int_VCOMISDrr, X86::Int_VCOMISDrm, 0 }, - { X86::Int_VCOMISSrr, X86::Int_VCOMISSrm, 0 }, - { X86::Int_VUCOMISDrr, X86::Int_VUCOMISDrm, 0 }, - { X86::Int_VUCOMISSrr, X86::Int_VUCOMISSrm, 0 }, + { X86::Int_VCOMISDrr, X86::Int_VCOMISDrm, TB_NO_REVERSE }, + { X86::Int_VCOMISSrr, X86::Int_VCOMISSrm, TB_NO_REVERSE }, + { X86::Int_VUCOMISDrr, X86::Int_VUCOMISDrm, TB_NO_REVERSE }, + { X86::Int_VUCOMISSrr, X86::Int_VUCOMISSrm, TB_NO_REVERSE }, { X86::VCVTTSD2SI64rr, X86::VCVTTSD2SI64rm, 0 }, - { X86::Int_VCVTTSD2SI64rr,X86::Int_VCVTTSD2SI64rm,0 }, + { X86::Int_VCVTTSD2SI64rr,X86::Int_VCVTTSD2SI64rm,TB_NO_REVERSE }, { X86::VCVTTSD2SIrr, X86::VCVTTSD2SIrm, 0 }, - { X86::Int_VCVTTSD2SIrr,X86::Int_VCVTTSD2SIrm, 0 }, + { X86::Int_VCVTTSD2SIrr,X86::Int_VCVTTSD2SIrm, TB_NO_REVERSE }, { X86::VCVTTSS2SI64rr, X86::VCVTTSS2SI64rm, 0 }, - { X86::Int_VCVTTSS2SI64rr,X86::Int_VCVTTSS2SI64rm,0 }, + { X86::Int_VCVTTSS2SI64rr,X86::Int_VCVTTSS2SI64rm,TB_NO_REVERSE }, { X86::VCVTTSS2SIrr, X86::VCVTTSS2SIrm, 0 }, - { X86::Int_VCVTTSS2SIrr,X86::Int_VCVTTSS2SIrm, 0 }, - { X86::VCVTSD2SI64rr, X86::VCVTSD2SI64rm, 0 }, - { X86::VCVTSD2SIrr, X86::VCVTSD2SIrm, 0 }, - { X86::VCVTSS2SI64rr, X86::VCVTSS2SI64rm, 0 }, - { X86::VCVTSS2SIrr, X86::VCVTSS2SIrm, 0 }, - { X86::VCVTDQ2PDrr, X86::VCVTDQ2PDrm, 0 }, + { X86::Int_VCVTTSS2SIrr,X86::Int_VCVTTSS2SIrm, TB_NO_REVERSE }, + { X86::VCVTSD2SI64rr, X86::VCVTSD2SI64rm, TB_NO_REVERSE }, + { X86::VCVTSD2SIrr, X86::VCVTSD2SIrm, TB_NO_REVERSE }, + { X86::VCVTSS2SI64rr, X86::VCVTSS2SI64rm, TB_NO_REVERSE }, + { X86::VCVTSS2SIrr, X86::VCVTSS2SIrm, TB_NO_REVERSE }, + { X86::VCVTDQ2PDrr, X86::VCVTDQ2PDrm, TB_NO_REVERSE }, { X86::VCVTDQ2PSrr, X86::VCVTDQ2PSrm, 0 }, - { X86::VCVTPD2DQrr, X86::VCVTPD2DQXrm, 0 }, - { X86::VCVTPD2PSrr, X86::VCVTPD2PSXrm, 0 }, + { X86::VCVTPD2DQrr, X86::VCVTPD2DQrm, 0 }, + { X86::VCVTPD2PSrr, X86::VCVTPD2PSrm, 0 }, { X86::VCVTPS2DQrr, X86::VCVTPS2DQrm, 0 }, - { X86::VCVTPS2PDrr, X86::VCVTPS2PDrm, 0 }, - { X86::VCVTTPD2DQrr, 
X86::VCVTTPD2DQXrm, 0 }, + { X86::VCVTPS2PDrr, X86::VCVTPS2PDrm, TB_NO_REVERSE }, + { X86::VCVTTPD2DQrr, X86::VCVTTPD2DQrm, 0 }, { X86::VCVTTPS2DQrr, X86::VCVTTPS2DQrm, 0 }, { X86::VMOV64toPQIrr, X86::VMOVQI2PQIrm, 0 }, { X86::VMOV64toSDrr, X86::VMOV64toSDrm, 0 }, { X86::VMOVAPDrr, X86::VMOVAPDrm, TB_ALIGN_16 }, { X86::VMOVAPSrr, X86::VMOVAPSrm, TB_ALIGN_16 }, - { X86::VMOVDDUPrr, X86::VMOVDDUPrm, 0 }, + { X86::VMOVDDUPrr, X86::VMOVDDUPrm, TB_NO_REVERSE }, { X86::VMOVDI2PDIrr, X86::VMOVDI2PDIrm, 0 }, { X86::VMOVDI2SSrr, X86::VMOVDI2SSrm, 0 }, { X86::VMOVDQArr, X86::VMOVDQArm, TB_ALIGN_16 }, + { X86::VMOVDQUrr, X86::VMOVDQUrm, 0 }, { X86::VMOVSLDUPrr, X86::VMOVSLDUPrm, 0 }, { X86::VMOVSHDUPrr, X86::VMOVSHDUPrm, 0 }, { X86::VMOVUPDrr, X86::VMOVUPDrm, 0 }, { X86::VMOVUPSrr, X86::VMOVUPSrm, 0 }, - { X86::VMOVZPQILo2PQIrr,X86::VMOVZPQILo2PQIrm, TB_ALIGN_16 }, - { X86::VPABSBrr128, X86::VPABSBrm128, 0 }, - { X86::VPABSDrr128, X86::VPABSDrm128, 0 }, - { X86::VPABSWrr128, X86::VPABSWrm128, 0 }, + { X86::VMOVZPQILo2PQIrr,X86::VMOVQI2PQIrm, TB_NO_REVERSE }, + { X86::VPABSBrr, X86::VPABSBrm, 0 }, + { X86::VPABSDrr, X86::VPABSDrm, 0 }, + { X86::VPABSWrr, X86::VPABSWrm, 0 }, { X86::VPCMPESTRIrr, X86::VPCMPESTRIrm, 0 }, { X86::VPCMPESTRM128rr, X86::VPCMPESTRM128rm, 0 }, { X86::VPCMPISTRIrr, X86::VPCMPISTRIrm, 0 }, @@ -633,18 +681,18 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VPHMINPOSUWrr128, X86::VPHMINPOSUWrm128, 0 }, { X86::VPERMILPDri, X86::VPERMILPDmi, 0 }, { X86::VPERMILPSri, X86::VPERMILPSmi, 0 }, - { X86::VPMOVSXBDrr, X86::VPMOVSXBDrm, 0 }, - { X86::VPMOVSXBQrr, X86::VPMOVSXBQrm, 0 }, - { X86::VPMOVSXBWrr, X86::VPMOVSXBWrm, 0 }, - { X86::VPMOVSXDQrr, X86::VPMOVSXDQrm, 0 }, - { X86::VPMOVSXWDrr, X86::VPMOVSXWDrm, 0 }, - { X86::VPMOVSXWQrr, X86::VPMOVSXWQrm, 0 }, - { X86::VPMOVZXBDrr, X86::VPMOVZXBDrm, 0 }, - { X86::VPMOVZXBQrr, X86::VPMOVZXBQrm, 0 }, - { X86::VPMOVZXBWrr, X86::VPMOVZXBWrm, 0 }, - { X86::VPMOVZXDQrr, X86::VPMOVZXDQrm, 0 }, - { X86::VPMOVZXWDrr, X86::VPMOVZXWDrm, 0 }, - { X86::VPMOVZXWQrr, X86::VPMOVZXWQrm, 0 }, + { X86::VPMOVSXBDrr, X86::VPMOVSXBDrm, TB_NO_REVERSE }, + { X86::VPMOVSXBQrr, X86::VPMOVSXBQrm, TB_NO_REVERSE }, + { X86::VPMOVSXBWrr, X86::VPMOVSXBWrm, TB_NO_REVERSE }, + { X86::VPMOVSXDQrr, X86::VPMOVSXDQrm, TB_NO_REVERSE }, + { X86::VPMOVSXWDrr, X86::VPMOVSXWDrm, TB_NO_REVERSE }, + { X86::VPMOVSXWQrr, X86::VPMOVSXWQrm, TB_NO_REVERSE }, + { X86::VPMOVZXBDrr, X86::VPMOVZXBDrm, TB_NO_REVERSE }, + { X86::VPMOVZXBQrr, X86::VPMOVZXBQrm, TB_NO_REVERSE }, + { X86::VPMOVZXBWrr, X86::VPMOVZXBWrm, TB_NO_REVERSE }, + { X86::VPMOVZXDQrr, X86::VPMOVZXDQrm, TB_NO_REVERSE }, + { X86::VPMOVZXWDrr, X86::VPMOVZXWDrm, TB_NO_REVERSE }, + { X86::VPMOVZXWQrr, X86::VPMOVZXWQrm, TB_NO_REVERSE }, { X86::VPSHUFDri, X86::VPSHUFDmi, 0 }, { X86::VPSHUFHWri, X86::VPSHUFHWmi, 0 }, { X86::VPSHUFLWri, X86::VPSHUFLWmi, 0 }, @@ -661,18 +709,19 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VUCOMISSrr, X86::VUCOMISSrm, 0 }, // AVX 256-bit foldable instructions - { X86::VCVTDQ2PDYrr, X86::VCVTDQ2PDYrm, 0 }, + { X86::VCVTDQ2PDYrr, X86::VCVTDQ2PDYrm, TB_NO_REVERSE }, { X86::VCVTDQ2PSYrr, X86::VCVTDQ2PSYrm, 0 }, { X86::VCVTPD2DQYrr, X86::VCVTPD2DQYrm, 0 }, { X86::VCVTPD2PSYrr, X86::VCVTPD2PSYrm, 0 }, { X86::VCVTPS2DQYrr, X86::VCVTPS2DQYrm, 0 }, - { X86::VCVTPS2PDYrr, X86::VCVTPS2PDYrm, 0 }, + { X86::VCVTPS2PDYrr, X86::VCVTPS2PDYrm, TB_NO_REVERSE }, { X86::VCVTTPD2DQYrr, X86::VCVTTPD2DQYrm, 0 }, { X86::VCVTTPS2DQYrr, X86::VCVTTPS2DQYrm, 0 }, { X86::VMOVAPDYrr, X86::VMOVAPDYrm, TB_ALIGN_32 
}, { X86::VMOVAPSYrr, X86::VMOVAPSYrm, TB_ALIGN_32 }, { X86::VMOVDDUPYrr, X86::VMOVDDUPYrm, 0 }, { X86::VMOVDQAYrr, X86::VMOVDQAYrm, TB_ALIGN_32 }, + { X86::VMOVDQUYrr, X86::VMOVDQUYrm, 0 }, { X86::VMOVSLDUPYrr, X86::VMOVSLDUPYrm, 0 }, { X86::VMOVSHDUPYrr, X86::VMOVSHDUPYrm, 0 }, { X86::VMOVUPDYrr, X86::VMOVUPDYrm, 0 }, @@ -699,31 +748,31 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VBROADCASTSSrr, X86::VBROADCASTSSrm, TB_NO_REVERSE }, { X86::VBROADCASTSSYrr, X86::VBROADCASTSSYrm, TB_NO_REVERSE }, { X86::VBROADCASTSDYrr, X86::VBROADCASTSDYrm, TB_NO_REVERSE }, - { X86::VPABSBrr256, X86::VPABSBrm256, 0 }, - { X86::VPABSDrr256, X86::VPABSDrm256, 0 }, - { X86::VPABSWrr256, X86::VPABSWrm256, 0 }, - { X86::VPBROADCASTBrr, X86::VPBROADCASTBrm, 0 }, - { X86::VPBROADCASTBYrr, X86::VPBROADCASTBYrm, 0 }, - { X86::VPBROADCASTDrr, X86::VPBROADCASTDrm, 0 }, - { X86::VPBROADCASTDYrr, X86::VPBROADCASTDYrm, 0 }, - { X86::VPBROADCASTQrr, X86::VPBROADCASTQrm, 0 }, - { X86::VPBROADCASTQYrr, X86::VPBROADCASTQYrm, 0 }, - { X86::VPBROADCASTWrr, X86::VPBROADCASTWrm, 0 }, - { X86::VPBROADCASTWYrr, X86::VPBROADCASTWYrm, 0 }, + { X86::VPABSBYrr, X86::VPABSBYrm, 0 }, + { X86::VPABSDYrr, X86::VPABSDYrm, 0 }, + { X86::VPABSWYrr, X86::VPABSWYrm, 0 }, + { X86::VPBROADCASTBrr, X86::VPBROADCASTBrm, TB_NO_REVERSE }, + { X86::VPBROADCASTBYrr, X86::VPBROADCASTBYrm, TB_NO_REVERSE }, + { X86::VPBROADCASTDrr, X86::VPBROADCASTDrm, TB_NO_REVERSE }, + { X86::VPBROADCASTDYrr, X86::VPBROADCASTDYrm, TB_NO_REVERSE }, + { X86::VPBROADCASTQrr, X86::VPBROADCASTQrm, TB_NO_REVERSE }, + { X86::VPBROADCASTQYrr, X86::VPBROADCASTQYrm, TB_NO_REVERSE }, + { X86::VPBROADCASTWrr, X86::VPBROADCASTWrm, TB_NO_REVERSE }, + { X86::VPBROADCASTWYrr, X86::VPBROADCASTWYrm, TB_NO_REVERSE }, { X86::VPERMPDYri, X86::VPERMPDYmi, 0 }, { X86::VPERMQYri, X86::VPERMQYmi, 0 }, - { X86::VPMOVSXBDYrr, X86::VPMOVSXBDYrm, 0 }, - { X86::VPMOVSXBQYrr, X86::VPMOVSXBQYrm, 0 }, + { X86::VPMOVSXBDYrr, X86::VPMOVSXBDYrm, TB_NO_REVERSE }, + { X86::VPMOVSXBQYrr, X86::VPMOVSXBQYrm, TB_NO_REVERSE }, { X86::VPMOVSXBWYrr, X86::VPMOVSXBWYrm, 0 }, { X86::VPMOVSXDQYrr, X86::VPMOVSXDQYrm, 0 }, { X86::VPMOVSXWDYrr, X86::VPMOVSXWDYrm, 0 }, - { X86::VPMOVSXWQYrr, X86::VPMOVSXWQYrm, 0 }, - { X86::VPMOVZXBDYrr, X86::VPMOVZXBDYrm, 0 }, - { X86::VPMOVZXBQYrr, X86::VPMOVZXBQYrm, 0 }, + { X86::VPMOVSXWQYrr, X86::VPMOVSXWQYrm, TB_NO_REVERSE }, + { X86::VPMOVZXBDYrr, X86::VPMOVZXBDYrm, TB_NO_REVERSE }, + { X86::VPMOVZXBQYrr, X86::VPMOVZXBQYrm, TB_NO_REVERSE }, { X86::VPMOVZXBWYrr, X86::VPMOVZXBWYrm, 0 }, { X86::VPMOVZXDQYrr, X86::VPMOVZXDQYrm, 0 }, { X86::VPMOVZXWDYrr, X86::VPMOVZXWDYrm, 0 }, - { X86::VPMOVZXWQYrr, X86::VPMOVZXWQYrm, 0 }, + { X86::VPMOVZXWQYrr, X86::VPMOVZXWQYrm, TB_NO_REVERSE }, { X86::VPSHUFDYri, X86::VPSHUFDYmi, 0 }, { X86::VPSHUFHWYri, X86::VPSHUFHWYmi, 0 }, { X86::VPSHUFLWYri, X86::VPSHUFLWYmi, 0 }, @@ -817,7 +866,12 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::TZMSK64rr, X86::TZMSK64rm, 0 }, // AVX-512 foldable instructions + { X86::VBROADCASTSSZr, X86::VBROADCASTSSZm, TB_NO_REVERSE }, + { X86::VBROADCASTSSZr_s, X86::VBROADCASTSSZm, TB_NO_REVERSE }, + { X86::VBROADCASTSDZr, X86::VBROADCASTSDZm, TB_NO_REVERSE }, + { X86::VBROADCASTSDZr_s, X86::VBROADCASTSDZm, TB_NO_REVERSE }, { X86::VMOV64toPQIZrr, X86::VMOVQI2PQIZrm, 0 }, + { X86::VMOVZPQILo2PQIZrr,X86::VMOVQI2PQIZrm, TB_NO_REVERSE }, { X86::VMOVDI2SSZrr, X86::VMOVDI2SSZrm, 0 }, { X86::VMOVAPDZrr, X86::VMOVAPDZrm, TB_ALIGN_64 }, { X86::VMOVAPSZrr, X86::VMOVAPSZrm, TB_ALIGN_64 }, @@ -831,12 +885,31 @@ 
X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VMOVUPSZrr, X86::VMOVUPSZrm, 0 }, { X86::VPABSDZrr, X86::VPABSDZrm, 0 }, { X86::VPABSQZrr, X86::VPABSQZrm, 0 }, - { X86::VBROADCASTSSZr, X86::VBROADCASTSSZm, TB_NO_REVERSE }, - { X86::VBROADCASTSSZr_s, X86::VBROADCASTSSZm, TB_NO_REVERSE }, - { X86::VBROADCASTSDZr, X86::VBROADCASTSDZm, TB_NO_REVERSE }, - { X86::VBROADCASTSDZr_s, X86::VBROADCASTSDZm, TB_NO_REVERSE }, + { X86::VPERMILPDZri, X86::VPERMILPDZmi, 0 }, + { X86::VPERMILPSZri, X86::VPERMILPSZmi, 0 }, + { X86::VPERMPDZri, X86::VPERMPDZmi, 0 }, + { X86::VPERMQZri, X86::VPERMQZmi, 0 }, + { X86::VPMOVSXBDZrr, X86::VPMOVSXBDZrm, 0 }, + { X86::VPMOVSXBQZrr, X86::VPMOVSXBQZrm, TB_NO_REVERSE }, + { X86::VPMOVSXBWZrr, X86::VPMOVSXBWZrm, 0 }, + { X86::VPMOVSXDQZrr, X86::VPMOVSXDQZrm, 0 }, + { X86::VPMOVSXWDZrr, X86::VPMOVSXWDZrm, 0 }, + { X86::VPMOVSXWQZrr, X86::VPMOVSXWQZrm, 0 }, + { X86::VPMOVZXBDZrr, X86::VPMOVZXBDZrm, 0 }, + { X86::VPMOVZXBQZrr, X86::VPMOVZXBQZrm, TB_NO_REVERSE }, + { X86::VPMOVZXBWZrr, X86::VPMOVZXBWZrm, 0 }, + { X86::VPMOVZXDQZrr, X86::VPMOVZXDQZrm, 0 }, + { X86::VPMOVZXWDZrr, X86::VPMOVZXWDZrm, 0 }, + { X86::VPMOVZXWQZrr, X86::VPMOVZXWQZrm, 0 }, + { X86::VPSHUFDZri, X86::VPSHUFDZmi, 0 }, + { X86::VPSHUFHWZri, X86::VPSHUFHWZmi, 0 }, + { X86::VPSHUFLWZri, X86::VPSHUFLWZmi, 0 }, // AVX-512 foldable instructions (256-bit versions) + { X86::VBROADCASTSSZ256r, X86::VBROADCASTSSZ256m, TB_NO_REVERSE }, + { X86::VBROADCASTSSZ256r_s, X86::VBROADCASTSSZ256m, TB_NO_REVERSE }, + { X86::VBROADCASTSDZ256r, X86::VBROADCASTSDZ256m, TB_NO_REVERSE }, + { X86::VBROADCASTSDZ256r_s, X86::VBROADCASTSDZ256m, TB_NO_REVERSE }, { X86::VMOVAPDZ256rr, X86::VMOVAPDZ256rm, TB_ALIGN_32 }, { X86::VMOVAPSZ256rr, X86::VMOVAPSZ256rm, TB_ALIGN_32 }, { X86::VMOVDQA32Z256rr, X86::VMOVDQA32Z256rm, TB_ALIGN_32 }, @@ -847,12 +920,29 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VMOVDQU64Z256rr, X86::VMOVDQU64Z256rm, 0 }, { X86::VMOVUPDZ256rr, X86::VMOVUPDZ256rm, 0 }, { X86::VMOVUPSZ256rr, X86::VMOVUPSZ256rm, 0 }, - { X86::VBROADCASTSSZ256r, X86::VBROADCASTSSZ256m, TB_NO_REVERSE }, - { X86::VBROADCASTSSZ256r_s, X86::VBROADCASTSSZ256m, TB_NO_REVERSE }, - { X86::VBROADCASTSDZ256r, X86::VBROADCASTSDZ256m, TB_NO_REVERSE }, - { X86::VBROADCASTSDZ256r_s, X86::VBROADCASTSDZ256m, TB_NO_REVERSE }, + { X86::VPERMILPDZ256ri, X86::VPERMILPDZ256mi, 0 }, + { X86::VPERMILPSZ256ri, X86::VPERMILPSZ256mi, 0 }, + { X86::VPERMPDZ256ri, X86::VPERMPDZ256mi, 0 }, + { X86::VPERMQZ256ri, X86::VPERMQZ256mi, 0 }, + { X86::VPMOVSXBDZ256rr, X86::VPMOVSXBDZ256rm, TB_NO_REVERSE }, + { X86::VPMOVSXBQZ256rr, X86::VPMOVSXBQZ256rm, TB_NO_REVERSE }, + { X86::VPMOVSXBWZ256rr, X86::VPMOVSXBWZ256rm, 0 }, + { X86::VPMOVSXDQZ256rr, X86::VPMOVSXDQZ256rm, 0 }, + { X86::VPMOVSXWDZ256rr, X86::VPMOVSXWDZ256rm, 0 }, + { X86::VPMOVSXWQZ256rr, X86::VPMOVSXWQZ256rm, TB_NO_REVERSE }, + { X86::VPMOVZXBDZ256rr, X86::VPMOVZXBDZ256rm, TB_NO_REVERSE }, + { X86::VPMOVZXBQZ256rr, X86::VPMOVZXBQZ256rm, TB_NO_REVERSE }, + { X86::VPMOVZXBWZ256rr, X86::VPMOVZXBWZ256rm, 0 }, + { X86::VPMOVZXDQZ256rr, X86::VPMOVZXDQZ256rm, 0 }, + { X86::VPMOVZXWDZ256rr, X86::VPMOVZXWDZ256rm, 0 }, + { X86::VPMOVZXWQZ256rr, X86::VPMOVZXWQZ256rm, TB_NO_REVERSE }, + { X86::VPSHUFDZ256ri, X86::VPSHUFDZ256mi, 0 }, + { X86::VPSHUFHWZ256ri, X86::VPSHUFHWZ256mi, 0 }, + { X86::VPSHUFLWZ256ri, X86::VPSHUFLWZ256mi, 0 }, // AVX-512 foldable instructions (128-bit versions) + { X86::VBROADCASTSSZ128r, X86::VBROADCASTSSZ128m, TB_NO_REVERSE }, + { X86::VBROADCASTSSZ128r_s, 
X86::VBROADCASTSSZ128m, TB_NO_REVERSE }, { X86::VMOVAPDZ128rr, X86::VMOVAPDZ128rm, TB_ALIGN_16 }, { X86::VMOVAPSZ128rr, X86::VMOVAPSZ128rm, TB_ALIGN_16 }, { X86::VMOVDQA32Z128rr, X86::VMOVDQA32Z128rm, TB_ALIGN_16 }, @@ -863,8 +953,24 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VMOVDQU64Z128rr, X86::VMOVDQU64Z128rm, 0 }, { X86::VMOVUPDZ128rr, X86::VMOVUPDZ128rm, 0 }, { X86::VMOVUPSZ128rr, X86::VMOVUPSZ128rm, 0 }, - { X86::VBROADCASTSSZ128r, X86::VBROADCASTSSZ128m, TB_NO_REVERSE }, - { X86::VBROADCASTSSZ128r_s, X86::VBROADCASTSSZ128m, TB_NO_REVERSE }, + { X86::VPERMILPDZ128ri, X86::VPERMILPDZ128mi, 0 }, + { X86::VPERMILPSZ128ri, X86::VPERMILPSZ128mi, 0 }, + { X86::VPMOVSXBDZ128rr, X86::VPMOVSXBDZ128rm, TB_NO_REVERSE }, + { X86::VPMOVSXBQZ128rr, X86::VPMOVSXBQZ128rm, TB_NO_REVERSE }, + { X86::VPMOVSXBWZ128rr, X86::VPMOVSXBWZ128rm, TB_NO_REVERSE }, + { X86::VPMOVSXDQZ128rr, X86::VPMOVSXDQZ128rm, TB_NO_REVERSE }, + { X86::VPMOVSXWDZ128rr, X86::VPMOVSXWDZ128rm, TB_NO_REVERSE }, + { X86::VPMOVSXWQZ128rr, X86::VPMOVSXWQZ128rm, TB_NO_REVERSE }, + { X86::VPMOVZXBDZ128rr, X86::VPMOVZXBDZ128rm, TB_NO_REVERSE }, + { X86::VPMOVZXBQZ128rr, X86::VPMOVZXBQZ128rm, TB_NO_REVERSE }, + { X86::VPMOVZXBWZ128rr, X86::VPMOVZXBWZ128rm, TB_NO_REVERSE }, + { X86::VPMOVZXDQZ128rr, X86::VPMOVZXDQZ128rm, TB_NO_REVERSE }, + { X86::VPMOVZXWDZ128rr, X86::VPMOVZXWDZ128rm, TB_NO_REVERSE }, + { X86::VPMOVZXWQZ128rr, X86::VPMOVZXWQZ128rm, TB_NO_REVERSE }, + { X86::VPSHUFDZ128ri, X86::VPSHUFDZ128mi, 0 }, + { X86::VPSHUFHWZ128ri, X86::VPSHUFHWZ128mi, 0 }, + { X86::VPSHUFLWZ128ri, X86::VPSHUFLWZ128mi, 0 }, + // F16C foldable instructions { X86::VCVTPH2PSrr, X86::VCVTPH2PSrm, 0 }, { X86::VCVTPH2PSYrr, X86::VCVTPH2PSYrm, 0 }, @@ -896,9 +1002,9 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::ADDPDrr, X86::ADDPDrm, TB_ALIGN_16 }, { X86::ADDPSrr, X86::ADDPSrm, TB_ALIGN_16 }, { X86::ADDSDrr, X86::ADDSDrm, 0 }, - { X86::ADDSDrr_Int, X86::ADDSDrm_Int, 0 }, + { X86::ADDSDrr_Int, X86::ADDSDrm_Int, TB_NO_REVERSE }, { X86::ADDSSrr, X86::ADDSSrm, 0 }, - { X86::ADDSSrr_Int, X86::ADDSSrm_Int, 0 }, + { X86::ADDSSrr_Int, X86::ADDSSrm_Int, TB_NO_REVERSE }, { X86::ADDSUBPDrr, X86::ADDSUBPDrm, TB_ALIGN_16 }, { X86::ADDSUBPSrr, X86::ADDSUBPSrm, TB_ALIGN_16 }, { X86::AND16rr, X86::AND16rm, 0 }, @@ -970,24 +1076,11 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::DIVPDrr, X86::DIVPDrm, TB_ALIGN_16 }, { X86::DIVPSrr, X86::DIVPSrm, TB_ALIGN_16 }, { X86::DIVSDrr, X86::DIVSDrm, 0 }, - { X86::DIVSDrr_Int, X86::DIVSDrm_Int, 0 }, + { X86::DIVSDrr_Int, X86::DIVSDrm_Int, TB_NO_REVERSE }, { X86::DIVSSrr, X86::DIVSSrm, 0 }, - { X86::DIVSSrr_Int, X86::DIVSSrm_Int, 0 }, + { X86::DIVSSrr_Int, X86::DIVSSrm_Int, TB_NO_REVERSE }, { X86::DPPDrri, X86::DPPDrmi, TB_ALIGN_16 }, { X86::DPPSrri, X86::DPPSrmi, TB_ALIGN_16 }, - - // Do not fold Fs* scalar logical op loads because there are no scalar - // load variants for these instructions. When folded, the load is required - // to be 128-bits, so the load size would not match. 
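
(Aside, not part of the patch: the comment removed just above explains why those scalar-logical entries could not be folded — the folded form would have to issue a 128-bit load even though only a scalar is semantically needed, so the access size would not match. A minimal C++ illustration of that size-mismatch hazard, with illustrative values:)

#include <cstdio>
#include <cstdlib>
#include <cstring>

int main() {
  // Only 4 bytes are valid at p; a folded 16-byte load starting here could
  // read past the allocation, so the fold would change the access size.
  float *p = static_cast<float *>(std::malloc(sizeof(float)));
  if (!p) return 1;
  *p = 1.5f;

  float scalar;
  std::memcpy(&scalar, p, sizeof(float));   // 4-byte access: matches the data
  std::printf("%f\n", scalar);

  std::free(p);
  return 0;
}
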
- - { X86::FvANDNPDrr, X86::FvANDNPDrm, TB_ALIGN_16 }, - { X86::FvANDNPSrr, X86::FvANDNPSrm, TB_ALIGN_16 }, - { X86::FvANDPDrr, X86::FvANDPDrm, TB_ALIGN_16 }, - { X86::FvANDPSrr, X86::FvANDPSrm, TB_ALIGN_16 }, - { X86::FvORPDrr, X86::FvORPDrm, TB_ALIGN_16 }, - { X86::FvORPSrr, X86::FvORPSrm, TB_ALIGN_16 }, - { X86::FvXORPDrr, X86::FvXORPDrm, TB_ALIGN_16 }, - { X86::FvXORPSrr, X86::FvXORPSrm, TB_ALIGN_16 }, { X86::HADDPDrr, X86::HADDPDrm, TB_ALIGN_16 }, { X86::HADDPSrr, X86::HADDPSrm, TB_ALIGN_16 }, { X86::HSUBPDrr, X86::HSUBPDrm, TB_ALIGN_16 }, @@ -995,34 +1088,42 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::IMUL16rr, X86::IMUL16rm, 0 }, { X86::IMUL32rr, X86::IMUL32rm, 0 }, { X86::IMUL64rr, X86::IMUL64rm, 0 }, - { X86::Int_CMPSDrr, X86::Int_CMPSDrm, 0 }, - { X86::Int_CMPSSrr, X86::Int_CMPSSrm, 0 }, - { X86::Int_CVTSD2SSrr, X86::Int_CVTSD2SSrm, 0 }, + { X86::Int_CMPSDrr, X86::Int_CMPSDrm, TB_NO_REVERSE }, + { X86::Int_CMPSSrr, X86::Int_CMPSSrm, TB_NO_REVERSE }, + { X86::Int_CVTSD2SSrr, X86::Int_CVTSD2SSrm, TB_NO_REVERSE }, { X86::Int_CVTSI2SD64rr,X86::Int_CVTSI2SD64rm, 0 }, { X86::Int_CVTSI2SDrr, X86::Int_CVTSI2SDrm, 0 }, { X86::Int_CVTSI2SS64rr,X86::Int_CVTSI2SS64rm, 0 }, { X86::Int_CVTSI2SSrr, X86::Int_CVTSI2SSrm, 0 }, - { X86::Int_CVTSS2SDrr, X86::Int_CVTSS2SDrm, 0 }, + { X86::Int_CVTSS2SDrr, X86::Int_CVTSS2SDrm, TB_NO_REVERSE }, { X86::MAXPDrr, X86::MAXPDrm, TB_ALIGN_16 }, + { X86::MAXCPDrr, X86::MAXCPDrm, TB_ALIGN_16 }, { X86::MAXPSrr, X86::MAXPSrm, TB_ALIGN_16 }, + { X86::MAXCPSrr, X86::MAXCPSrm, TB_ALIGN_16 }, { X86::MAXSDrr, X86::MAXSDrm, 0 }, - { X86::MAXSDrr_Int, X86::MAXSDrm_Int, 0 }, + { X86::MAXCSDrr, X86::MAXCSDrm, 0 }, + { X86::MAXSDrr_Int, X86::MAXSDrm_Int, TB_NO_REVERSE }, { X86::MAXSSrr, X86::MAXSSrm, 0 }, - { X86::MAXSSrr_Int, X86::MAXSSrm_Int, 0 }, + { X86::MAXCSSrr, X86::MAXCSSrm, 0 }, + { X86::MAXSSrr_Int, X86::MAXSSrm_Int, TB_NO_REVERSE }, { X86::MINPDrr, X86::MINPDrm, TB_ALIGN_16 }, + { X86::MINCPDrr, X86::MINCPDrm, TB_ALIGN_16 }, { X86::MINPSrr, X86::MINPSrm, TB_ALIGN_16 }, + { X86::MINCPSrr, X86::MINCPSrm, TB_ALIGN_16 }, { X86::MINSDrr, X86::MINSDrm, 0 }, - { X86::MINSDrr_Int, X86::MINSDrm_Int, 0 }, + { X86::MINCSDrr, X86::MINCSDrm, 0 }, + { X86::MINSDrr_Int, X86::MINSDrm_Int, TB_NO_REVERSE }, { X86::MINSSrr, X86::MINSSrm, 0 }, - { X86::MINSSrr_Int, X86::MINSSrm_Int, 0 }, + { X86::MINCSSrr, X86::MINCSSrm, 0 }, + { X86::MINSSrr_Int, X86::MINSSrm_Int, TB_NO_REVERSE }, { X86::MOVLHPSrr, X86::MOVHPSrm, TB_NO_REVERSE }, { X86::MPSADBWrri, X86::MPSADBWrmi, TB_ALIGN_16 }, { X86::MULPDrr, X86::MULPDrm, TB_ALIGN_16 }, { X86::MULPSrr, X86::MULPSrm, TB_ALIGN_16 }, { X86::MULSDrr, X86::MULSDrm, 0 }, - { X86::MULSDrr_Int, X86::MULSDrm_Int, 0 }, + { X86::MULSDrr_Int, X86::MULSDrm_Int, TB_NO_REVERSE }, { X86::MULSSrr, X86::MULSSrm, 0 }, - { X86::MULSSrr_Int, X86::MULSSrm_Int, 0 }, + { X86::MULSSrr_Int, X86::MULSSrm_Int, TB_NO_REVERSE }, { X86::OR16rr, X86::OR16rm, 0 }, { X86::OR32rr, X86::OR32rm, 0 }, { X86::OR64rr, X86::OR64rm, 0 }, @@ -1067,7 +1168,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::PINSRDrr, X86::PINSRDrm, 0 }, { X86::PINSRQrr, X86::PINSRQrm, 0 }, { X86::PINSRWrri, X86::PINSRWrmi, 0 }, - { X86::PMADDUBSWrr128, X86::PMADDUBSWrm128, TB_ALIGN_16 }, + { X86::PMADDUBSWrr, X86::PMADDUBSWrm, TB_ALIGN_16 }, { X86::PMADDWDrr, X86::PMADDWDrm, TB_ALIGN_16 }, { X86::PMAXSWrr, X86::PMAXSWrm, TB_ALIGN_16 }, { X86::PMAXUBrr, X86::PMAXUBrm, TB_ALIGN_16 }, @@ -1082,7 +1183,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::PMAXUDrr, X86::PMAXUDrm, 
TB_ALIGN_16 }, { X86::PMAXUWrr, X86::PMAXUWrm, TB_ALIGN_16 }, { X86::PMULDQrr, X86::PMULDQrm, TB_ALIGN_16 }, - { X86::PMULHRSWrr128, X86::PMULHRSWrm128, TB_ALIGN_16 }, + { X86::PMULHRSWrr, X86::PMULHRSWrm, TB_ALIGN_16 }, { X86::PMULHUWrr, X86::PMULHUWrm, TB_ALIGN_16 }, { X86::PMULHWrr, X86::PMULHWrm, TB_ALIGN_16 }, { X86::PMULLDrr, X86::PMULLDrm, TB_ALIGN_16 }, @@ -1119,8 +1220,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::PUNPCKLQDQrr, X86::PUNPCKLQDQrm, TB_ALIGN_16 }, { X86::PUNPCKLWDrr, X86::PUNPCKLWDrm, TB_ALIGN_16 }, { X86::PXORrr, X86::PXORrm, TB_ALIGN_16 }, - { X86::ROUNDSDr, X86::ROUNDSDm, 0 }, - { X86::ROUNDSSr, X86::ROUNDSSm, 0 }, + { X86::ROUNDSDr_Int, X86::ROUNDSDm_Int, TB_NO_REVERSE }, + { X86::ROUNDSSr_Int, X86::ROUNDSSm_Int, TB_NO_REVERSE }, { X86::SBB32rr, X86::SBB32rm, 0 }, { X86::SBB64rr, X86::SBB64rm, 0 }, { X86::SHUFPDrri, X86::SHUFPDrmi, TB_ALIGN_16 }, @@ -1132,9 +1233,9 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::SUBPDrr, X86::SUBPDrm, TB_ALIGN_16 }, { X86::SUBPSrr, X86::SUBPSrm, TB_ALIGN_16 }, { X86::SUBSDrr, X86::SUBSDrm, 0 }, - { X86::SUBSDrr_Int, X86::SUBSDrm_Int, 0 }, + { X86::SUBSDrr_Int, X86::SUBSDrm_Int, TB_NO_REVERSE }, { X86::SUBSSrr, X86::SUBSSrm, 0 }, - { X86::SUBSSrr_Int, X86::SUBSSrm_Int, 0 }, + { X86::SUBSSrr_Int, X86::SUBSSrm_Int, TB_NO_REVERSE }, // FIXME: TEST*rr -> swapped operand of TEST*mr. { X86::UNPCKHPDrr, X86::UNPCKHPDrm, TB_ALIGN_16 }, { X86::UNPCKHPSrr, X86::UNPCKHPSrm, TB_ALIGN_16 }, @@ -1240,7 +1341,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) // AVX 128-bit versions of foldable instructions { X86::VCVTSD2SSrr, X86::VCVTSD2SSrm, 0 }, - { X86::Int_VCVTSD2SSrr, X86::Int_VCVTSD2SSrm, 0 }, + { X86::Int_VCVTSD2SSrr, X86::Int_VCVTSD2SSrm, TB_NO_REVERSE }, { X86::VCVTSI2SD64rr, X86::VCVTSI2SD64rm, 0 }, { X86::Int_VCVTSI2SD64rr, X86::Int_VCVTSI2SD64rm, 0 }, { X86::VCVTSI2SDrr, X86::VCVTSI2SDrm, 0 }, @@ -1250,21 +1351,13 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VCVTSI2SSrr, X86::VCVTSI2SSrm, 0 }, { X86::Int_VCVTSI2SSrr, X86::Int_VCVTSI2SSrm, 0 }, { X86::VCVTSS2SDrr, X86::VCVTSS2SDrm, 0 }, - { X86::Int_VCVTSS2SDrr, X86::Int_VCVTSS2SDrm, 0 }, - { X86::VRCPSSr, X86::VRCPSSm, 0 }, - { X86::VRCPSSr_Int, X86::VRCPSSm_Int, 0 }, - { X86::VRSQRTSSr, X86::VRSQRTSSm, 0 }, - { X86::VRSQRTSSr_Int, X86::VRSQRTSSm_Int, 0 }, - { X86::VSQRTSDr, X86::VSQRTSDm, 0 }, - { X86::VSQRTSDr_Int, X86::VSQRTSDm_Int, 0 }, - { X86::VSQRTSSr, X86::VSQRTSSm, 0 }, - { X86::VSQRTSSr_Int, X86::VSQRTSSm_Int, 0 }, + { X86::Int_VCVTSS2SDrr, X86::Int_VCVTSS2SDrm, TB_NO_REVERSE }, { X86::VADDPDrr, X86::VADDPDrm, 0 }, { X86::VADDPSrr, X86::VADDPSrm, 0 }, { X86::VADDSDrr, X86::VADDSDrm, 0 }, - { X86::VADDSDrr_Int, X86::VADDSDrm_Int, 0 }, + { X86::VADDSDrr_Int, X86::VADDSDrm_Int, TB_NO_REVERSE }, { X86::VADDSSrr, X86::VADDSSrm, 0 }, - { X86::VADDSSrr_Int, X86::VADDSSrm_Int, 0 }, + { X86::VADDSSrr_Int, X86::VADDSSrm_Int, TB_NO_REVERSE }, { X86::VADDSUBPDrr, X86::VADDSUBPDrm, 0 }, { X86::VADDSUBPSrr, X86::VADDSUBPSrm, 0 }, { X86::VANDNPDrr, X86::VANDNPDrm, 0 }, @@ -1282,48 +1375,45 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VDIVPDrr, X86::VDIVPDrm, 0 }, { X86::VDIVPSrr, X86::VDIVPSrm, 0 }, { X86::VDIVSDrr, X86::VDIVSDrm, 0 }, - { X86::VDIVSDrr_Int, X86::VDIVSDrm_Int, 0 }, + { X86::VDIVSDrr_Int, X86::VDIVSDrm_Int, TB_NO_REVERSE }, { X86::VDIVSSrr, X86::VDIVSSrm, 0 }, - { X86::VDIVSSrr_Int, X86::VDIVSSrm_Int, 0 }, + { X86::VDIVSSrr_Int, X86::VDIVSSrm_Int, TB_NO_REVERSE }, { X86::VDPPDrri, X86::VDPPDrmi, 0 }, { X86::VDPPSrri, 
X86::VDPPSrmi, 0 }, - // Do not fold VFs* loads because there are no scalar load variants for - // these instructions. When folded, the load is required to be 128-bits, so - // the load size would not match. - { X86::VFvANDNPDrr, X86::VFvANDNPDrm, 0 }, - { X86::VFvANDNPSrr, X86::VFvANDNPSrm, 0 }, - { X86::VFvANDPDrr, X86::VFvANDPDrm, 0 }, - { X86::VFvANDPSrr, X86::VFvANDPSrm, 0 }, - { X86::VFvORPDrr, X86::VFvORPDrm, 0 }, - { X86::VFvORPSrr, X86::VFvORPSrm, 0 }, - { X86::VFvXORPDrr, X86::VFvXORPDrm, 0 }, - { X86::VFvXORPSrr, X86::VFvXORPSrm, 0 }, { X86::VHADDPDrr, X86::VHADDPDrm, 0 }, { X86::VHADDPSrr, X86::VHADDPSrm, 0 }, { X86::VHSUBPDrr, X86::VHSUBPDrm, 0 }, { X86::VHSUBPSrr, X86::VHSUBPSrm, 0 }, - { X86::Int_VCMPSDrr, X86::Int_VCMPSDrm, 0 }, - { X86::Int_VCMPSSrr, X86::Int_VCMPSSrm, 0 }, + { X86::Int_VCMPSDrr, X86::Int_VCMPSDrm, TB_NO_REVERSE }, + { X86::Int_VCMPSSrr, X86::Int_VCMPSSrm, TB_NO_REVERSE }, + { X86::VMAXCPDrr, X86::VMAXCPDrm, 0 }, + { X86::VMAXCPSrr, X86::VMAXCPSrm, 0 }, + { X86::VMAXCSDrr, X86::VMAXCSDrm, 0 }, + { X86::VMAXCSSrr, X86::VMAXCSSrm, 0 }, { X86::VMAXPDrr, X86::VMAXPDrm, 0 }, { X86::VMAXPSrr, X86::VMAXPSrm, 0 }, { X86::VMAXSDrr, X86::VMAXSDrm, 0 }, - { X86::VMAXSDrr_Int, X86::VMAXSDrm_Int, 0 }, + { X86::VMAXSDrr_Int, X86::VMAXSDrm_Int, TB_NO_REVERSE }, { X86::VMAXSSrr, X86::VMAXSSrm, 0 }, - { X86::VMAXSSrr_Int, X86::VMAXSSrm_Int, 0 }, + { X86::VMAXSSrr_Int, X86::VMAXSSrm_Int, TB_NO_REVERSE }, + { X86::VMINCPDrr, X86::VMINCPDrm, 0 }, + { X86::VMINCPSrr, X86::VMINCPSrm, 0 }, + { X86::VMINCSDrr, X86::VMINCSDrm, 0 }, + { X86::VMINCSSrr, X86::VMINCSSrm, 0 }, { X86::VMINPDrr, X86::VMINPDrm, 0 }, { X86::VMINPSrr, X86::VMINPSrm, 0 }, { X86::VMINSDrr, X86::VMINSDrm, 0 }, - { X86::VMINSDrr_Int, X86::VMINSDrm_Int, 0 }, + { X86::VMINSDrr_Int, X86::VMINSDrm_Int, TB_NO_REVERSE }, { X86::VMINSSrr, X86::VMINSSrm, 0 }, - { X86::VMINSSrr_Int, X86::VMINSSrm_Int, 0 }, + { X86::VMINSSrr_Int, X86::VMINSSrm_Int, TB_NO_REVERSE }, { X86::VMOVLHPSrr, X86::VMOVHPSrm, TB_NO_REVERSE }, { X86::VMPSADBWrri, X86::VMPSADBWrmi, 0 }, { X86::VMULPDrr, X86::VMULPDrm, 0 }, { X86::VMULPSrr, X86::VMULPSrm, 0 }, { X86::VMULSDrr, X86::VMULSDrm, 0 }, - { X86::VMULSDrr_Int, X86::VMULSDrm_Int, 0 }, + { X86::VMULSDrr_Int, X86::VMULSDrm_Int, TB_NO_REVERSE }, { X86::VMULSSrr, X86::VMULSSrm, 0 }, - { X86::VMULSSrr_Int, X86::VMULSSrm_Int, 0 }, + { X86::VMULSSrr_Int, X86::VMULSSrm_Int, TB_NO_REVERSE }, { X86::VORPDrr, X86::VORPDrm, 0 }, { X86::VORPSrr, X86::VORPSrm, 0 }, { X86::VPACKSSDWrr, X86::VPACKSSDWrm, 0 }, @@ -1366,7 +1456,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VPINSRDrr, X86::VPINSRDrm, 0 }, { X86::VPINSRQrr, X86::VPINSRQrm, 0 }, { X86::VPINSRWrri, X86::VPINSRWrmi, 0 }, - { X86::VPMADDUBSWrr128, X86::VPMADDUBSWrm128, 0 }, + { X86::VPMADDUBSWrr, X86::VPMADDUBSWrm, 0 }, { X86::VPMADDWDrr, X86::VPMADDWDrm, 0 }, { X86::VPMAXSWrr, X86::VPMAXSWrm, 0 }, { X86::VPMAXUBrr, X86::VPMAXUBrm, 0 }, @@ -1381,7 +1471,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VPMAXUDrr, X86::VPMAXUDrm, 0 }, { X86::VPMAXUWrr, X86::VPMAXUWrm, 0 }, { X86::VPMULDQrr, X86::VPMULDQrm, 0 }, - { X86::VPMULHRSWrr128, X86::VPMULHRSWrm128, 0 }, + { X86::VPMULHRSWrr, X86::VPMULHRSWrm, 0 }, { X86::VPMULHUWrr, X86::VPMULHUWrm, 0 }, { X86::VPMULHWrr, X86::VPMULHWrm, 0 }, { X86::VPMULLDrr, X86::VPMULLDrm, 0 }, @@ -1418,16 +1508,26 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VPUNPCKLQDQrr, X86::VPUNPCKLQDQrm, 0 }, { X86::VPUNPCKLWDrr, X86::VPUNPCKLWDrm, 0 }, { X86::VPXORrr, X86::VPXORrm, 0 }, + { 
X86::VRCPSSr, X86::VRCPSSm, 0 }, + { X86::VRCPSSr_Int, X86::VRCPSSm_Int, TB_NO_REVERSE }, + { X86::VRSQRTSSr, X86::VRSQRTSSm, 0 }, + { X86::VRSQRTSSr_Int, X86::VRSQRTSSm_Int, TB_NO_REVERSE }, { X86::VROUNDSDr, X86::VROUNDSDm, 0 }, + { X86::VROUNDSDr_Int, X86::VROUNDSDm_Int, TB_NO_REVERSE }, { X86::VROUNDSSr, X86::VROUNDSSm, 0 }, + { X86::VROUNDSSr_Int, X86::VROUNDSSm_Int, TB_NO_REVERSE }, { X86::VSHUFPDrri, X86::VSHUFPDrmi, 0 }, { X86::VSHUFPSrri, X86::VSHUFPSrmi, 0 }, + { X86::VSQRTSDr, X86::VSQRTSDm, 0 }, + { X86::VSQRTSDr_Int, X86::VSQRTSDm_Int, TB_NO_REVERSE }, + { X86::VSQRTSSr, X86::VSQRTSSm, 0 }, + { X86::VSQRTSSr_Int, X86::VSQRTSSm_Int, TB_NO_REVERSE }, { X86::VSUBPDrr, X86::VSUBPDrm, 0 }, { X86::VSUBPSrr, X86::VSUBPSrm, 0 }, { X86::VSUBSDrr, X86::VSUBSDrm, 0 }, - { X86::VSUBSDrr_Int, X86::VSUBSDrm_Int, 0 }, + { X86::VSUBSDrr_Int, X86::VSUBSDrm_Int, TB_NO_REVERSE }, { X86::VSUBSSrr, X86::VSUBSSrm, 0 }, - { X86::VSUBSSrr_Int, X86::VSUBSSrm_Int, 0 }, + { X86::VSUBSSrr_Int, X86::VSUBSSrm_Int, TB_NO_REVERSE }, { X86::VUNPCKHPDrr, X86::VUNPCKHPDrm, 0 }, { X86::VUNPCKHPSrr, X86::VUNPCKHPSrm, 0 }, { X86::VUNPCKLPDrr, X86::VUNPCKLPDrm, 0 }, @@ -1458,8 +1558,12 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VHSUBPDYrr, X86::VHSUBPDYrm, 0 }, { X86::VHSUBPSYrr, X86::VHSUBPSYrm, 0 }, { X86::VINSERTF128rr, X86::VINSERTF128rm, 0 }, + { X86::VMAXCPDYrr, X86::VMAXCPDYrm, 0 }, + { X86::VMAXCPSYrr, X86::VMAXCPSYrm, 0 }, { X86::VMAXPDYrr, X86::VMAXPDYrm, 0 }, { X86::VMAXPSYrr, X86::VMAXPSYrm, 0 }, + { X86::VMINCPDYrr, X86::VMINCPDYrm, 0 }, + { X86::VMINCPSYrr, X86::VMINCPSYrm, 0 }, { X86::VMINPDYrr, X86::VMINPDYrm, 0 }, { X86::VMINPSYrr, X86::VMINPSYrm, 0 }, { X86::VMULPDYrr, X86::VMULPDYrm, 0 }, @@ -1520,7 +1624,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VPHSUBDYrr, X86::VPHSUBDYrm, 0 }, { X86::VPHSUBSWrr256, X86::VPHSUBSWrm256, 0 }, { X86::VPHSUBWYrr, X86::VPHSUBWYrm, 0 }, - { X86::VPMADDUBSWrr256, X86::VPMADDUBSWrm256, 0 }, + { X86::VPMADDUBSWYrr, X86::VPMADDUBSWYrm, 0 }, { X86::VPMADDWDYrr, X86::VPMADDWDYrm, 0 }, { X86::VPMAXSWYrr, X86::VPMAXSWYrm, 0 }, { X86::VPMAXUBYrr, X86::VPMAXUBYrm, 0 }, @@ -1536,7 +1640,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VPMAXUWYrr, X86::VPMAXUWYrm, 0 }, { X86::VMPSADBWYrri, X86::VMPSADBWYrmi, 0 }, { X86::VPMULDQYrr, X86::VPMULDQYrm, 0 }, - { X86::VPMULHRSWrr256, X86::VPMULHRSWrm256, 0 }, + { X86::VPMULHRSWYrr, X86::VPMULHRSWYrm, 0 }, { X86::VPMULHUWYrr, X86::VPMULHUWYrm, 0 }, { X86::VPMULHWYrr, X86::VPMULHWYrm, 0 }, { X86::VPMULLDYrr, X86::VPMULLDYrm, 0 }, @@ -1559,8 +1663,6 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VPSRAWYrr, X86::VPSRAWYrm, 0 }, { X86::VPSRAVDrr, X86::VPSRAVDrm, 0 }, { X86::VPSRAVDYrr, X86::VPSRAVDYrm, 0 }, - { X86::VPSRAVD_Intrr, X86::VPSRAVD_Intrm, 0 }, - { X86::VPSRAVD_IntYrr, X86::VPSRAVD_IntYrm, 0 }, { X86::VPSRLDYrr, X86::VPSRLDYrm, 0 }, { X86::VPSRLQYrr, X86::VPSRLQYrm, 0 }, { X86::VPSRLWYrr, X86::VPSRLWYrm, 0 }, @@ -1588,37 +1690,45 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) // FMA4 foldable patterns { X86::VFMADDSS4rr, X86::VFMADDSS4mr, TB_ALIGN_NONE }, + { X86::VFMADDSS4rr_Int, X86::VFMADDSS4mr_Int, TB_NO_REVERSE }, { X86::VFMADDSD4rr, X86::VFMADDSD4mr, TB_ALIGN_NONE }, + { X86::VFMADDSD4rr_Int, X86::VFMADDSD4mr_Int, TB_NO_REVERSE }, { X86::VFMADDPS4rr, X86::VFMADDPS4mr, TB_ALIGN_NONE }, { X86::VFMADDPD4rr, X86::VFMADDPD4mr, TB_ALIGN_NONE }, - { X86::VFMADDPS4rrY, X86::VFMADDPS4mrY, TB_ALIGN_NONE }, - { X86::VFMADDPD4rrY, X86::VFMADDPD4mrY, TB_ALIGN_NONE }, + { 
X86::VFMADDPS4Yrr, X86::VFMADDPS4Ymr, TB_ALIGN_NONE }, + { X86::VFMADDPD4Yrr, X86::VFMADDPD4Ymr, TB_ALIGN_NONE }, { X86::VFNMADDSS4rr, X86::VFNMADDSS4mr, TB_ALIGN_NONE }, + { X86::VFNMADDSS4rr_Int, X86::VFNMADDSS4mr_Int, TB_NO_REVERSE }, { X86::VFNMADDSD4rr, X86::VFNMADDSD4mr, TB_ALIGN_NONE }, + { X86::VFNMADDSD4rr_Int, X86::VFNMADDSD4mr_Int, TB_NO_REVERSE }, { X86::VFNMADDPS4rr, X86::VFNMADDPS4mr, TB_ALIGN_NONE }, { X86::VFNMADDPD4rr, X86::VFNMADDPD4mr, TB_ALIGN_NONE }, - { X86::VFNMADDPS4rrY, X86::VFNMADDPS4mrY, TB_ALIGN_NONE }, - { X86::VFNMADDPD4rrY, X86::VFNMADDPD4mrY, TB_ALIGN_NONE }, + { X86::VFNMADDPS4Yrr, X86::VFNMADDPS4Ymr, TB_ALIGN_NONE }, + { X86::VFNMADDPD4Yrr, X86::VFNMADDPD4Ymr, TB_ALIGN_NONE }, { X86::VFMSUBSS4rr, X86::VFMSUBSS4mr, TB_ALIGN_NONE }, + { X86::VFMSUBSS4rr_Int, X86::VFMSUBSS4mr_Int, TB_NO_REVERSE }, { X86::VFMSUBSD4rr, X86::VFMSUBSD4mr, TB_ALIGN_NONE }, + { X86::VFMSUBSD4rr_Int, X86::VFMSUBSD4mr_Int, TB_NO_REVERSE }, { X86::VFMSUBPS4rr, X86::VFMSUBPS4mr, TB_ALIGN_NONE }, { X86::VFMSUBPD4rr, X86::VFMSUBPD4mr, TB_ALIGN_NONE }, - { X86::VFMSUBPS4rrY, X86::VFMSUBPS4mrY, TB_ALIGN_NONE }, - { X86::VFMSUBPD4rrY, X86::VFMSUBPD4mrY, TB_ALIGN_NONE }, + { X86::VFMSUBPS4Yrr, X86::VFMSUBPS4Ymr, TB_ALIGN_NONE }, + { X86::VFMSUBPD4Yrr, X86::VFMSUBPD4Ymr, TB_ALIGN_NONE }, { X86::VFNMSUBSS4rr, X86::VFNMSUBSS4mr, TB_ALIGN_NONE }, + { X86::VFNMSUBSS4rr_Int, X86::VFNMSUBSS4mr_Int, TB_NO_REVERSE }, { X86::VFNMSUBSD4rr, X86::VFNMSUBSD4mr, TB_ALIGN_NONE }, + { X86::VFNMSUBSD4rr_Int, X86::VFNMSUBSD4mr_Int, TB_NO_REVERSE }, { X86::VFNMSUBPS4rr, X86::VFNMSUBPS4mr, TB_ALIGN_NONE }, { X86::VFNMSUBPD4rr, X86::VFNMSUBPD4mr, TB_ALIGN_NONE }, - { X86::VFNMSUBPS4rrY, X86::VFNMSUBPS4mrY, TB_ALIGN_NONE }, - { X86::VFNMSUBPD4rrY, X86::VFNMSUBPD4mrY, TB_ALIGN_NONE }, + { X86::VFNMSUBPS4Yrr, X86::VFNMSUBPS4Ymr, TB_ALIGN_NONE }, + { X86::VFNMSUBPD4Yrr, X86::VFNMSUBPD4Ymr, TB_ALIGN_NONE }, { X86::VFMADDSUBPS4rr, X86::VFMADDSUBPS4mr, TB_ALIGN_NONE }, { X86::VFMADDSUBPD4rr, X86::VFMADDSUBPD4mr, TB_ALIGN_NONE }, - { X86::VFMADDSUBPS4rrY, X86::VFMADDSUBPS4mrY, TB_ALIGN_NONE }, - { X86::VFMADDSUBPD4rrY, X86::VFMADDSUBPD4mrY, TB_ALIGN_NONE }, + { X86::VFMADDSUBPS4Yrr, X86::VFMADDSUBPS4Ymr, TB_ALIGN_NONE }, + { X86::VFMADDSUBPD4Yrr, X86::VFMADDSUBPD4Ymr, TB_ALIGN_NONE }, { X86::VFMSUBADDPS4rr, X86::VFMSUBADDPS4mr, TB_ALIGN_NONE }, { X86::VFMSUBADDPD4rr, X86::VFMSUBADDPD4mr, TB_ALIGN_NONE }, - { X86::VFMSUBADDPS4rrY, X86::VFMSUBADDPS4mrY, TB_ALIGN_NONE }, - { X86::VFMSUBADDPD4rrY, X86::VFMSUBADDPD4mrY, TB_ALIGN_NONE }, + { X86::VFMSUBADDPS4Yrr, X86::VFMSUBADDPS4Ymr, TB_ALIGN_NONE }, + { X86::VFMSUBADDPD4Yrr, X86::VFMSUBADDPD4Ymr, TB_ALIGN_NONE }, // XOP foldable instructions { X86::VPCMOVrrr, X86::VPCMOVrmr, 0 }, @@ -1678,38 +1788,107 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::ADOX64rr, X86::ADOX64rm, 0 }, // AVX-512 foldable instructions - { X86::VADDPSZrr, X86::VADDPSZrm, 0 }, { X86::VADDPDZrr, X86::VADDPDZrm, 0 }, - { X86::VADDSSZrr, X86::VADDSSZrm, 0 }, - { X86::VADDSSZrr_Int, X86::VADDSSZrm_Int, 0 }, + { X86::VADDPSZrr, X86::VADDPSZrm, 0 }, { X86::VADDSDZrr, X86::VADDSDZrm, 0 }, - { X86::VADDSDZrr_Int, X86::VADDSDZrm_Int, 0 }, - { X86::VSUBPSZrr, X86::VSUBPSZrm, 0 }, - { X86::VSUBPDZrr, X86::VSUBPDZrm, 0 }, - { X86::VSUBSSZrr, X86::VSUBSSZrm, 0 }, - { X86::VSUBSSZrr_Int, X86::VSUBSSZrm_Int, 0 }, - { X86::VSUBSDZrr, X86::VSUBSDZrm, 0 }, - { X86::VSUBSDZrr_Int, X86::VSUBSDZrm_Int, 0 }, - { X86::VMULPSZrr, X86::VMULPSZrm, 0 }, - { X86::VMULPDZrr, X86::VMULPDZrm, 0 }, - { X86::VMULSSZrr, 
X86::VMULSSZrm, 0 }, - { X86::VMULSSZrr_Int, X86::VMULSSZrm_Int, 0 }, - { X86::VMULSDZrr, X86::VMULSDZrm, 0 }, - { X86::VMULSDZrr_Int, X86::VMULSDZrm_Int, 0 }, - { X86::VDIVPSZrr, X86::VDIVPSZrm, 0 }, + { X86::VADDSDZrr_Int, X86::VADDSDZrm_Int, TB_NO_REVERSE }, + { X86::VADDSSZrr, X86::VADDSSZrm, 0 }, + { X86::VADDSSZrr_Int, X86::VADDSSZrm_Int, TB_NO_REVERSE }, + { X86::VALIGNDZrri, X86::VALIGNDZrmi, 0 }, + { X86::VALIGNQZrri, X86::VALIGNQZrmi, 0 }, + { X86::VANDNPDZrr, X86::VANDNPDZrm, 0 }, + { X86::VANDNPSZrr, X86::VANDNPSZrm, 0 }, + { X86::VANDPDZrr, X86::VANDPDZrm, 0 }, + { X86::VANDPSZrr, X86::VANDPSZrm, 0 }, + { X86::VBROADCASTSSZrkz, X86::VBROADCASTSSZmkz, TB_NO_REVERSE }, + { X86::VBROADCASTSDZrkz, X86::VBROADCASTSDZmkz, TB_NO_REVERSE }, + { X86::VCMPPDZrri, X86::VCMPPDZrmi, 0 }, + { X86::VCMPPSZrri, X86::VCMPPSZrmi, 0 }, + { X86::VCMPSDZrr, X86::VCMPSDZrm, 0 }, + { X86::VCMPSDZrr_Int, X86::VCMPSDZrm_Int, TB_NO_REVERSE }, + { X86::VCMPSSZrr, X86::VCMPSSZrm, 0 }, + { X86::VCMPSSZrr_Int, X86::VCMPSSZrm_Int, TB_NO_REVERSE }, { X86::VDIVPDZrr, X86::VDIVPDZrm, 0 }, - { X86::VDIVSSZrr, X86::VDIVSSZrm, 0 }, - { X86::VDIVSSZrr_Int, X86::VDIVSSZrm_Int, 0 }, + { X86::VDIVPSZrr, X86::VDIVPSZrm, 0 }, { X86::VDIVSDZrr, X86::VDIVSDZrm, 0 }, - { X86::VDIVSDZrr_Int, X86::VDIVSDZrm_Int, 0 }, - { X86::VMINPSZrr, X86::VMINPSZrm, 0 }, - { X86::VMINPDZrr, X86::VMINPDZrm, 0 }, - { X86::VMAXPSZrr, X86::VMAXPSZrm, 0 }, + { X86::VDIVSDZrr_Int, X86::VDIVSDZrm_Int, TB_NO_REVERSE }, + { X86::VDIVSSZrr, X86::VDIVSSZrm, 0 }, + { X86::VDIVSSZrr_Int, X86::VDIVSSZrm_Int, TB_NO_REVERSE }, + { X86::VINSERTF32x4Zrr, X86::VINSERTF32x4Zrm, 0 }, + { X86::VINSERTF32x8Zrr, X86::VINSERTF32x8Zrm, 0 }, + { X86::VINSERTF64x2Zrr, X86::VINSERTF64x2Zrm, 0 }, + { X86::VINSERTF64x4Zrr, X86::VINSERTF64x4Zrm, 0 }, + { X86::VINSERTI32x4Zrr, X86::VINSERTI32x4Zrm, 0 }, + { X86::VINSERTI32x8Zrr, X86::VINSERTI32x8Zrm, 0 }, + { X86::VINSERTI64x2Zrr, X86::VINSERTI64x2Zrm, 0 }, + { X86::VINSERTI64x4Zrr, X86::VINSERTI64x4Zrm, 0 }, + { X86::VMAXCPDZrr, X86::VMAXCPDZrm, 0 }, + { X86::VMAXCPSZrr, X86::VMAXCPSZrm, 0 }, + { X86::VMAXCSDZrr, X86::VMAXCSDZrm, 0 }, + { X86::VMAXCSSZrr, X86::VMAXCSSZrm, 0 }, { X86::VMAXPDZrr, X86::VMAXPDZrm, 0 }, + { X86::VMAXPSZrr, X86::VMAXPSZrm, 0 }, + { X86::VMAXSDZrr, X86::VMAXSDZrm, 0 }, + { X86::VMAXSDZrr_Int, X86::VMAXSDZrm_Int, TB_NO_REVERSE }, + { X86::VMAXSSZrr, X86::VMAXSSZrm, 0 }, + { X86::VMAXSSZrr_Int, X86::VMAXSSZrm_Int, TB_NO_REVERSE }, + { X86::VMINCPDZrr, X86::VMINCPDZrm, 0 }, + { X86::VMINCPSZrr, X86::VMINCPSZrm, 0 }, + { X86::VMINCSDZrr, X86::VMINCSDZrm, 0 }, + { X86::VMINCSSZrr, X86::VMINCSSZrm, 0 }, + { X86::VMINPDZrr, X86::VMINPDZrm, 0 }, + { X86::VMINPSZrr, X86::VMINPSZrm, 0 }, + { X86::VMINSDZrr, X86::VMINSDZrm, 0 }, + { X86::VMINSDZrr_Int, X86::VMINSDZrm_Int, TB_NO_REVERSE }, + { X86::VMINSSZrr, X86::VMINSSZrm, 0 }, + { X86::VMINSSZrr_Int, X86::VMINSSZrm_Int, TB_NO_REVERSE }, + { X86::VMULPDZrr, X86::VMULPDZrm, 0 }, + { X86::VMULPSZrr, X86::VMULPSZrm, 0 }, + { X86::VMULSDZrr, X86::VMULSDZrm, 0 }, + { X86::VMULSDZrr_Int, X86::VMULSDZrm_Int, TB_NO_REVERSE }, + { X86::VMULSSZrr, X86::VMULSSZrm, 0 }, + { X86::VMULSSZrr_Int, X86::VMULSSZrm_Int, TB_NO_REVERSE }, + { X86::VORPDZrr, X86::VORPDZrm, 0 }, + { X86::VORPSZrr, X86::VORPSZrm, 0 }, + { X86::VPADDBZrr, X86::VPADDBZrm, 0 }, { X86::VPADDDZrr, X86::VPADDDZrm, 0 }, { X86::VPADDQZrr, X86::VPADDQZrm, 0 }, - { X86::VPERMPDZri, X86::VPERMPDZmi, 0 }, + { X86::VPADDSBZrr, X86::VPADDSBZrm, 0 }, + { X86::VPADDSWZrr, X86::VPADDSWZrm, 0 }, + { 
X86::VPADDUSBZrr, X86::VPADDUSBZrm, 0 }, + { X86::VPADDUSWZrr, X86::VPADDUSWZrm, 0 }, + { X86::VPADDWZrr, X86::VPADDWZrm, 0 }, + { X86::VPALIGNRZrri, X86::VPALIGNRZrmi, 0 }, + { X86::VPANDDZrr, X86::VPANDDZrm, 0 }, + { X86::VPANDNDZrr, X86::VPANDNDZrm, 0 }, + { X86::VPANDNQZrr, X86::VPANDNQZrm, 0 }, + { X86::VPANDQZrr, X86::VPANDQZrm, 0 }, + { X86::VPCMPBZrri, X86::VPCMPBZrmi, 0 }, + { X86::VPCMPDZrri, X86::VPCMPDZrmi, 0 }, + { X86::VPCMPEQBZrr, X86::VPCMPEQBZrm, 0 }, + { X86::VPCMPEQDZrr, X86::VPCMPEQDZrm, 0 }, + { X86::VPCMPEQQZrr, X86::VPCMPEQQZrm, 0 }, + { X86::VPCMPEQWZrr, X86::VPCMPEQWZrm, 0 }, + { X86::VPCMPGTBZrr, X86::VPCMPGTBZrm, 0 }, + { X86::VPCMPGTDZrr, X86::VPCMPGTDZrm, 0 }, + { X86::VPCMPGTQZrr, X86::VPCMPGTQZrm, 0 }, + { X86::VPCMPGTWZrr, X86::VPCMPGTWZrm, 0 }, + { X86::VPCMPQZrri, X86::VPCMPQZrmi, 0 }, + { X86::VPCMPUBZrri, X86::VPCMPUBZrmi, 0 }, + { X86::VPCMPUDZrri, X86::VPCMPUDZrmi, 0 }, + { X86::VPCMPUQZrri, X86::VPCMPUQZrmi, 0 }, + { X86::VPCMPUWZrri, X86::VPCMPUWZrmi, 0 }, + { X86::VPCMPWZrri, X86::VPCMPWZrmi, 0 }, + { X86::VPERMBZrr, X86::VPERMBZrm, 0 }, + { X86::VPERMDZrr, X86::VPERMDZrm, 0 }, + { X86::VPERMILPDZrr, X86::VPERMILPDZrm, 0 }, + { X86::VPERMILPSZrr, X86::VPERMILPSZrm, 0 }, + { X86::VPERMPDZrr, X86::VPERMPDZrm, 0 }, { X86::VPERMPSZrr, X86::VPERMPSZrm, 0 }, + { X86::VPERMQZrr, X86::VPERMQZrm, 0 }, + { X86::VPERMWZrr, X86::VPERMWZrm, 0 }, + { X86::VPMADDUBSWZrr, X86::VPMADDUBSWZrm, 0 }, + { X86::VPMADDWDZrr, X86::VPMADDWDZrm, 0 }, { X86::VPMAXSDZrr, X86::VPMAXSDZrm, 0 }, { X86::VPMAXSQZrr, X86::VPMAXSQZrm, 0 }, { X86::VPMAXUDZrr, X86::VPMAXUDZrm, 0 }, @@ -1719,31 +1898,297 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VPMINUDZrr, X86::VPMINUDZrm, 0 }, { X86::VPMINUQZrr, X86::VPMINUQZrm, 0 }, { X86::VPMULDQZrr, X86::VPMULDQZrm, 0 }, + { X86::VPMULUDQZrr, X86::VPMULUDQZrm, 0 }, + { X86::VPORDZrr, X86::VPORDZrm, 0 }, + { X86::VPORQZrr, X86::VPORQZrm, 0 }, + { X86::VPSHUFBZrr, X86::VPSHUFBZrm, 0 }, { X86::VPSLLVDZrr, X86::VPSLLVDZrm, 0 }, { X86::VPSLLVQZrr, X86::VPSLLVQZrm, 0 }, { X86::VPSRAVDZrr, X86::VPSRAVDZrm, 0 }, { X86::VPSRLVDZrr, X86::VPSRLVDZrm, 0 }, { X86::VPSRLVQZrr, X86::VPSRLVQZrm, 0 }, + { X86::VPSUBBZrr, X86::VPSUBBZrm, 0 }, { X86::VPSUBDZrr, X86::VPSUBDZrm, 0 }, { X86::VPSUBQZrr, X86::VPSUBQZrm, 0 }, + { X86::VPSUBSBZrr, X86::VPSUBSBZrm, 0 }, + { X86::VPSUBSWZrr, X86::VPSUBSWZrm, 0 }, + { X86::VPSUBUSBZrr, X86::VPSUBUSBZrm, 0 }, + { X86::VPSUBUSWZrr, X86::VPSUBUSWZrm, 0 }, + { X86::VPSUBWZrr, X86::VPSUBWZrm, 0 }, + { X86::VPUNPCKHBWZrr, X86::VPUNPCKHBWZrm, 0 }, + { X86::VPUNPCKHDQZrr, X86::VPUNPCKHDQZrm, 0 }, + { X86::VPUNPCKHQDQZrr, X86::VPUNPCKHQDQZrm, 0 }, + { X86::VPUNPCKHWDZrr, X86::VPUNPCKHWDZrm, 0 }, + { X86::VPUNPCKLBWZrr, X86::VPUNPCKLBWZrm, 0 }, + { X86::VPUNPCKLDQZrr, X86::VPUNPCKLDQZrm, 0 }, + { X86::VPUNPCKLQDQZrr, X86::VPUNPCKLQDQZrm, 0 }, + { X86::VPUNPCKLWDZrr, X86::VPUNPCKLWDZrm, 0 }, + { X86::VPXORDZrr, X86::VPXORDZrm, 0 }, + { X86::VPXORQZrr, X86::VPXORQZrm, 0 }, { X86::VSHUFPDZrri, X86::VSHUFPDZrmi, 0 }, { X86::VSHUFPSZrri, X86::VSHUFPSZrmi, 0 }, - { X86::VALIGNQZrri, X86::VALIGNQZrmi, 0 }, - { X86::VALIGNDZrri, X86::VALIGNDZrmi, 0 }, - { X86::VPMULUDQZrr, X86::VPMULUDQZrm, 0 }, - { X86::VBROADCASTSSZrkz, X86::VBROADCASTSSZmkz, TB_NO_REVERSE }, - { X86::VBROADCASTSDZrkz, X86::VBROADCASTSDZmkz, TB_NO_REVERSE }, - - // AVX-512{F,VL} foldable instructions - { X86::VBROADCASTSSZ256rkz, X86::VBROADCASTSSZ256mkz, TB_NO_REVERSE }, - { X86::VBROADCASTSDZ256rkz, X86::VBROADCASTSDZ256mkz, TB_NO_REVERSE }, - { 
X86::VBROADCASTSSZ128rkz, X86::VBROADCASTSSZ128mkz, TB_NO_REVERSE }, + { X86::VSUBPDZrr, X86::VSUBPDZrm, 0 }, + { X86::VSUBPSZrr, X86::VSUBPSZrm, 0 }, + { X86::VSUBSDZrr, X86::VSUBSDZrm, 0 }, + { X86::VSUBSDZrr_Int, X86::VSUBSDZrm_Int, TB_NO_REVERSE }, + { X86::VSUBSSZrr, X86::VSUBSSZrm, 0 }, + { X86::VSUBSSZrr_Int, X86::VSUBSSZrm_Int, TB_NO_REVERSE }, + { X86::VUNPCKHPDZrr, X86::VUNPCKHPDZrm, 0 }, + { X86::VUNPCKHPSZrr, X86::VUNPCKHPSZrm, 0 }, + { X86::VUNPCKLPDZrr, X86::VUNPCKLPDZrm, 0 }, + { X86::VUNPCKLPSZrr, X86::VUNPCKLPSZrm, 0 }, + { X86::VXORPDZrr, X86::VXORPDZrm, 0 }, + { X86::VXORPSZrr, X86::VXORPSZrm, 0 }, // AVX-512{F,VL} foldable instructions { X86::VADDPDZ128rr, X86::VADDPDZ128rm, 0 }, { X86::VADDPDZ256rr, X86::VADDPDZ256rm, 0 }, { X86::VADDPSZ128rr, X86::VADDPSZ128rm, 0 }, { X86::VADDPSZ256rr, X86::VADDPSZ256rm, 0 }, + { X86::VALIGNDZ128rri, X86::VALIGNDZ128rmi, 0 }, + { X86::VALIGNDZ256rri, X86::VALIGNDZ256rmi, 0 }, + { X86::VALIGNQZ128rri, X86::VALIGNQZ128rmi, 0 }, + { X86::VALIGNQZ256rri, X86::VALIGNQZ256rmi, 0 }, + { X86::VANDNPDZ128rr, X86::VANDNPDZ128rm, 0 }, + { X86::VANDNPDZ256rr, X86::VANDNPDZ256rm, 0 }, + { X86::VANDNPSZ128rr, X86::VANDNPSZ128rm, 0 }, + { X86::VANDNPSZ256rr, X86::VANDNPSZ256rm, 0 }, + { X86::VANDPDZ128rr, X86::VANDPDZ128rm, 0 }, + { X86::VANDPDZ256rr, X86::VANDPDZ256rm, 0 }, + { X86::VANDPSZ128rr, X86::VANDPSZ128rm, 0 }, + { X86::VANDPSZ256rr, X86::VANDPSZ256rm, 0 }, + { X86::VBROADCASTSSZ128rkz, X86::VBROADCASTSSZ128mkz, TB_NO_REVERSE }, + { X86::VBROADCASTSSZ256rkz, X86::VBROADCASTSSZ256mkz, TB_NO_REVERSE }, + { X86::VBROADCASTSDZ256rkz, X86::VBROADCASTSDZ256mkz, TB_NO_REVERSE }, + { X86::VCMPPDZ128rri, X86::VCMPPDZ128rmi, 0 }, + { X86::VCMPPDZ256rri, X86::VCMPPDZ256rmi, 0 }, + { X86::VCMPPSZ128rri, X86::VCMPPSZ128rmi, 0 }, + { X86::VCMPPSZ256rri, X86::VCMPPSZ256rmi, 0 }, + { X86::VDIVPDZ128rr, X86::VDIVPDZ128rm, 0 }, + { X86::VDIVPDZ256rr, X86::VDIVPDZ256rm, 0 }, + { X86::VDIVPSZ128rr, X86::VDIVPSZ128rm, 0 }, + { X86::VDIVPSZ256rr, X86::VDIVPSZ256rm, 0 }, + { X86::VINSERTF32x4Z256rr,X86::VINSERTF32x4Z256rm, 0 }, + { X86::VINSERTF64x2Z256rr,X86::VINSERTF64x2Z256rm, 0 }, + { X86::VINSERTI32x4Z256rr,X86::VINSERTI32x4Z256rm, 0 }, + { X86::VINSERTI64x2Z256rr,X86::VINSERTI64x2Z256rm, 0 }, + { X86::VMAXCPDZ128rr, X86::VMAXCPDZ128rm, 0 }, + { X86::VMAXCPDZ256rr, X86::VMAXCPDZ256rm, 0 }, + { X86::VMAXCPSZ128rr, X86::VMAXCPSZ128rm, 0 }, + { X86::VMAXCPSZ256rr, X86::VMAXCPSZ256rm, 0 }, + { X86::VMAXPDZ128rr, X86::VMAXPDZ128rm, 0 }, + { X86::VMAXPDZ256rr, X86::VMAXPDZ256rm, 0 }, + { X86::VMAXPSZ128rr, X86::VMAXPSZ128rm, 0 }, + { X86::VMAXPSZ256rr, X86::VMAXPSZ256rm, 0 }, + { X86::VMINCPDZ128rr, X86::VMINCPDZ128rm, 0 }, + { X86::VMINCPDZ256rr, X86::VMINCPDZ256rm, 0 }, + { X86::VMINCPSZ128rr, X86::VMINCPSZ128rm, 0 }, + { X86::VMINCPSZ256rr, X86::VMINCPSZ256rm, 0 }, + { X86::VMINPDZ128rr, X86::VMINPDZ128rm, 0 }, + { X86::VMINPDZ256rr, X86::VMINPDZ256rm, 0 }, + { X86::VMINPSZ128rr, X86::VMINPSZ128rm, 0 }, + { X86::VMINPSZ256rr, X86::VMINPSZ256rm, 0 }, + { X86::VMULPDZ128rr, X86::VMULPDZ128rm, 0 }, + { X86::VMULPDZ256rr, X86::VMULPDZ256rm, 0 }, + { X86::VMULPSZ128rr, X86::VMULPSZ128rm, 0 }, + { X86::VMULPSZ256rr, X86::VMULPSZ256rm, 0 }, + { X86::VORPDZ128rr, X86::VORPDZ128rm, 0 }, + { X86::VORPDZ256rr, X86::VORPDZ256rm, 0 }, + { X86::VORPSZ128rr, X86::VORPSZ128rm, 0 }, + { X86::VORPSZ256rr, X86::VORPSZ256rm, 0 }, + { X86::VPADDBZ128rr, X86::VPADDBZ128rm, 0 }, + { X86::VPADDBZ256rr, X86::VPADDBZ256rm, 0 }, + { X86::VPADDDZ128rr, X86::VPADDDZ128rm, 0 }, + { 
X86::VPADDDZ256rr, X86::VPADDDZ256rm, 0 }, + { X86::VPADDQZ128rr, X86::VPADDQZ128rm, 0 }, + { X86::VPADDQZ256rr, X86::VPADDQZ256rm, 0 }, + { X86::VPADDSBZ128rr, X86::VPADDSBZ128rm, 0 }, + { X86::VPADDSBZ256rr, X86::VPADDSBZ256rm, 0 }, + { X86::VPADDSWZ128rr, X86::VPADDSWZ128rm, 0 }, + { X86::VPADDSWZ256rr, X86::VPADDSWZ256rm, 0 }, + { X86::VPADDUSBZ128rr, X86::VPADDUSBZ128rm, 0 }, + { X86::VPADDUSBZ256rr, X86::VPADDUSBZ256rm, 0 }, + { X86::VPADDUSWZ128rr, X86::VPADDUSWZ128rm, 0 }, + { X86::VPADDUSWZ256rr, X86::VPADDUSWZ256rm, 0 }, + { X86::VPADDWZ128rr, X86::VPADDWZ128rm, 0 }, + { X86::VPADDWZ256rr, X86::VPADDWZ256rm, 0 }, + { X86::VPALIGNRZ128rri, X86::VPALIGNRZ128rmi, 0 }, + { X86::VPALIGNRZ256rri, X86::VPALIGNRZ256rmi, 0 }, + { X86::VPANDDZ128rr, X86::VPANDDZ128rm, 0 }, + { X86::VPANDDZ256rr, X86::VPANDDZ256rm, 0 }, + { X86::VPANDNDZ128rr, X86::VPANDNDZ128rm, 0 }, + { X86::VPANDNDZ256rr, X86::VPANDNDZ256rm, 0 }, + { X86::VPANDNQZ128rr, X86::VPANDNQZ128rm, 0 }, + { X86::VPANDNQZ256rr, X86::VPANDNQZ256rm, 0 }, + { X86::VPANDQZ128rr, X86::VPANDQZ128rm, 0 }, + { X86::VPANDQZ256rr, X86::VPANDQZ256rm, 0 }, + { X86::VPCMPBZ128rri, X86::VPCMPBZ128rmi, 0 }, + { X86::VPCMPBZ256rri, X86::VPCMPBZ256rmi, 0 }, + { X86::VPCMPDZ128rri, X86::VPCMPDZ128rmi, 0 }, + { X86::VPCMPDZ256rri, X86::VPCMPDZ256rmi, 0 }, + { X86::VPCMPEQBZ128rr, X86::VPCMPEQBZ128rm, 0 }, + { X86::VPCMPEQBZ256rr, X86::VPCMPEQBZ256rm, 0 }, + { X86::VPCMPEQDZ128rr, X86::VPCMPEQDZ128rm, 0 }, + { X86::VPCMPEQDZ256rr, X86::VPCMPEQDZ256rm, 0 }, + { X86::VPCMPEQQZ128rr, X86::VPCMPEQQZ128rm, 0 }, + { X86::VPCMPEQQZ256rr, X86::VPCMPEQQZ256rm, 0 }, + { X86::VPCMPEQWZ128rr, X86::VPCMPEQWZ128rm, 0 }, + { X86::VPCMPEQWZ256rr, X86::VPCMPEQWZ256rm, 0 }, + { X86::VPCMPGTBZ128rr, X86::VPCMPGTBZ128rm, 0 }, + { X86::VPCMPGTBZ256rr, X86::VPCMPGTBZ256rm, 0 }, + { X86::VPCMPGTDZ128rr, X86::VPCMPGTDZ128rm, 0 }, + { X86::VPCMPGTDZ256rr, X86::VPCMPGTDZ256rm, 0 }, + { X86::VPCMPGTQZ128rr, X86::VPCMPGTQZ128rm, 0 }, + { X86::VPCMPGTQZ256rr, X86::VPCMPGTQZ256rm, 0 }, + { X86::VPCMPGTWZ128rr, X86::VPCMPGTWZ128rm, 0 }, + { X86::VPCMPGTWZ256rr, X86::VPCMPGTWZ256rm, 0 }, + { X86::VPCMPQZ128rri, X86::VPCMPQZ128rmi, 0 }, + { X86::VPCMPQZ256rri, X86::VPCMPQZ256rmi, 0 }, + { X86::VPCMPUBZ128rri, X86::VPCMPUBZ128rmi, 0 }, + { X86::VPCMPUBZ256rri, X86::VPCMPUBZ256rmi, 0 }, + { X86::VPCMPUDZ128rri, X86::VPCMPUDZ128rmi, 0 }, + { X86::VPCMPUDZ256rri, X86::VPCMPUDZ256rmi, 0 }, + { X86::VPCMPUQZ128rri, X86::VPCMPUQZ128rmi, 0 }, + { X86::VPCMPUQZ256rri, X86::VPCMPUQZ256rmi, 0 }, + { X86::VPCMPUWZ128rri, X86::VPCMPUWZ128rmi, 0 }, + { X86::VPCMPUWZ256rri, X86::VPCMPUWZ256rmi, 0 }, + { X86::VPCMPWZ128rri, X86::VPCMPWZ128rmi, 0 }, + { X86::VPCMPWZ256rri, X86::VPCMPWZ256rmi, 0 }, + { X86::VPERMBZ128rr, X86::VPERMBZ128rm, 0 }, + { X86::VPERMBZ256rr, X86::VPERMBZ256rm, 0 }, + { X86::VPERMDZ256rr, X86::VPERMDZ256rm, 0 }, + { X86::VPERMILPDZ128rr, X86::VPERMILPDZ128rm, 0 }, + { X86::VPERMILPDZ256rr, X86::VPERMILPDZ256rm, 0 }, + { X86::VPERMILPSZ128rr, X86::VPERMILPSZ128rm, 0 }, + { X86::VPERMILPSZ256rr, X86::VPERMILPSZ256rm, 0 }, + { X86::VPERMPDZ256rr, X86::VPERMPDZ256rm, 0 }, + { X86::VPERMPSZ256rr, X86::VPERMPSZ256rm, 0 }, + { X86::VPERMQZ256rr, X86::VPERMQZ256rm, 0 }, + { X86::VPERMWZ128rr, X86::VPERMWZ128rm, 0 }, + { X86::VPERMWZ256rr, X86::VPERMWZ256rm, 0 }, + { X86::VPMADDUBSWZ128rr, X86::VPMADDUBSWZ128rm, 0 }, + { X86::VPMADDUBSWZ256rr, X86::VPMADDUBSWZ256rm, 0 }, + { X86::VPMADDWDZ128rr, X86::VPMADDWDZ128rm, 0 }, + { X86::VPMADDWDZ256rr, X86::VPMADDWDZ256rm, 0 }, + { 
X86::VPORDZ128rr, X86::VPORDZ128rm, 0 }, + { X86::VPORDZ256rr, X86::VPORDZ256rm, 0 }, + { X86::VPORQZ128rr, X86::VPORQZ128rm, 0 }, + { X86::VPORQZ256rr, X86::VPORQZ256rm, 0 }, + { X86::VPSHUFBZ128rr, X86::VPSHUFBZ128rm, 0 }, + { X86::VPSHUFBZ256rr, X86::VPSHUFBZ256rm, 0 }, + { X86::VPSUBBZ128rr, X86::VPSUBBZ128rm, 0 }, + { X86::VPSUBBZ256rr, X86::VPSUBBZ256rm, 0 }, + { X86::VPSUBDZ128rr, X86::VPSUBDZ128rm, 0 }, + { X86::VPSUBDZ256rr, X86::VPSUBDZ256rm, 0 }, + { X86::VPSUBQZ128rr, X86::VPSUBQZ128rm, 0 }, + { X86::VPSUBQZ256rr, X86::VPSUBQZ256rm, 0 }, + { X86::VPSUBSBZ128rr, X86::VPSUBSBZ128rm, 0 }, + { X86::VPSUBSBZ256rr, X86::VPSUBSBZ256rm, 0 }, + { X86::VPSUBSWZ128rr, X86::VPSUBSWZ128rm, 0 }, + { X86::VPSUBSWZ256rr, X86::VPSUBSWZ256rm, 0 }, + { X86::VPSUBUSBZ128rr, X86::VPSUBUSBZ128rm, 0 }, + { X86::VPSUBUSBZ256rr, X86::VPSUBUSBZ256rm, 0 }, + { X86::VPSUBUSWZ128rr, X86::VPSUBUSWZ128rm, 0 }, + { X86::VPSUBUSWZ256rr, X86::VPSUBUSWZ256rm, 0 }, + { X86::VPSUBWZ128rr, X86::VPSUBWZ128rm, 0 }, + { X86::VPSUBWZ256rr, X86::VPSUBWZ256rm, 0 }, + { X86::VPUNPCKHBWZ128rr, X86::VPUNPCKHBWZ128rm, 0 }, + { X86::VPUNPCKHBWZ256rr, X86::VPUNPCKHBWZ256rm, 0 }, + { X86::VPUNPCKHDQZ128rr, X86::VPUNPCKHDQZ128rm, 0 }, + { X86::VPUNPCKHDQZ256rr, X86::VPUNPCKHDQZ256rm, 0 }, + { X86::VPUNPCKHQDQZ128rr, X86::VPUNPCKHQDQZ128rm, 0 }, + { X86::VPUNPCKHQDQZ256rr, X86::VPUNPCKHQDQZ256rm, 0 }, + { X86::VPUNPCKHWDZ128rr, X86::VPUNPCKHWDZ128rm, 0 }, + { X86::VPUNPCKHWDZ256rr, X86::VPUNPCKHWDZ256rm, 0 }, + { X86::VPUNPCKLBWZ128rr, X86::VPUNPCKLBWZ128rm, 0 }, + { X86::VPUNPCKLBWZ256rr, X86::VPUNPCKLBWZ256rm, 0 }, + { X86::VPUNPCKLDQZ128rr, X86::VPUNPCKLDQZ128rm, 0 }, + { X86::VPUNPCKLDQZ256rr, X86::VPUNPCKLDQZ256rm, 0 }, + { X86::VPUNPCKLQDQZ128rr, X86::VPUNPCKLQDQZ128rm, 0 }, + { X86::VPUNPCKLQDQZ256rr, X86::VPUNPCKLQDQZ256rm, 0 }, + { X86::VPUNPCKLWDZ128rr, X86::VPUNPCKLWDZ128rm, 0 }, + { X86::VPUNPCKLWDZ256rr, X86::VPUNPCKLWDZ256rm, 0 }, + { X86::VPXORDZ128rr, X86::VPXORDZ128rm, 0 }, + { X86::VPXORDZ256rr, X86::VPXORDZ256rm, 0 }, + { X86::VPXORQZ128rr, X86::VPXORQZ128rm, 0 }, + { X86::VPXORQZ256rr, X86::VPXORQZ256rm, 0 }, + { X86::VSUBPDZ128rr, X86::VSUBPDZ128rm, 0 }, + { X86::VSUBPDZ256rr, X86::VSUBPDZ256rm, 0 }, + { X86::VSUBPSZ128rr, X86::VSUBPSZ128rm, 0 }, + { X86::VSUBPSZ256rr, X86::VSUBPSZ256rm, 0 }, + { X86::VUNPCKHPDZ128rr, X86::VUNPCKHPDZ128rm, 0 }, + { X86::VUNPCKHPDZ256rr, X86::VUNPCKHPDZ256rm, 0 }, + { X86::VUNPCKHPSZ128rr, X86::VUNPCKHPSZ128rm, 0 }, + { X86::VUNPCKHPSZ256rr, X86::VUNPCKHPSZ256rm, 0 }, + { X86::VUNPCKLPDZ128rr, X86::VUNPCKLPDZ128rm, 0 }, + { X86::VUNPCKLPDZ256rr, X86::VUNPCKLPDZ256rm, 0 }, + { X86::VUNPCKLPSZ128rr, X86::VUNPCKLPSZ128rm, 0 }, + { X86::VUNPCKLPSZ256rr, X86::VUNPCKLPSZ256rm, 0 }, + { X86::VXORPDZ128rr, X86::VXORPDZ128rm, 0 }, + { X86::VXORPDZ256rr, X86::VXORPDZ256rm, 0 }, + { X86::VXORPSZ128rr, X86::VXORPSZ128rm, 0 }, + { X86::VXORPSZ256rr, X86::VXORPSZ256rm, 0 }, + + // AVX-512 masked foldable instructions + { X86::VPERMILPDZrikz, X86::VPERMILPDZmikz, 0 }, + { X86::VPERMILPSZrikz, X86::VPERMILPSZmikz, 0 }, + { X86::VPERMPDZrikz, X86::VPERMPDZmikz, 0 }, + { X86::VPERMQZrikz, X86::VPERMQZmikz, 0 }, + { X86::VPMOVSXBDZrrkz, X86::VPMOVSXBDZrmkz, 0 }, + { X86::VPMOVSXBQZrrkz, X86::VPMOVSXBQZrmkz, TB_NO_REVERSE }, + { X86::VPMOVSXBWZrrkz, X86::VPMOVSXBWZrmkz, 0 }, + { X86::VPMOVSXDQZrrkz, X86::VPMOVSXDQZrmkz, 0 }, + { X86::VPMOVSXWDZrrkz, X86::VPMOVSXWDZrmkz, 0 }, + { X86::VPMOVSXWQZrrkz, X86::VPMOVSXWQZrmkz, 0 }, + { X86::VPMOVZXBDZrrkz, X86::VPMOVZXBDZrmkz, 0 }, + { 
X86::VPMOVZXBQZrrkz, X86::VPMOVZXBQZrmkz, TB_NO_REVERSE }, + { X86::VPMOVZXBWZrrkz, X86::VPMOVZXBWZrmkz, 0 }, + { X86::VPMOVZXDQZrrkz, X86::VPMOVZXDQZrmkz, 0 }, + { X86::VPMOVZXWDZrrkz, X86::VPMOVZXWDZrmkz, 0 }, + { X86::VPMOVZXWQZrrkz, X86::VPMOVZXWQZrmkz, 0 }, + { X86::VPSHUFDZrikz, X86::VPSHUFDZmikz, 0 }, + { X86::VPSHUFHWZrikz, X86::VPSHUFHWZmikz, 0 }, + { X86::VPSHUFLWZrikz, X86::VPSHUFLWZmikz, 0 }, + + // AVX-512VL 256-bit masked foldable instructions + { X86::VPERMILPDZ256rikz, X86::VPERMILPDZ256mikz, 0 }, + { X86::VPERMILPSZ256rikz, X86::VPERMILPSZ256mikz, 0 }, + { X86::VPERMPDZ256rikz, X86::VPERMPDZ256mikz, 0 }, + { X86::VPERMQZ256rikz, X86::VPERMQZ256mikz, 0 }, + { X86::VPMOVSXBDZ256rrkz, X86::VPMOVSXBDZ256rmkz, TB_NO_REVERSE }, + { X86::VPMOVSXBQZ256rrkz, X86::VPMOVSXBQZ256rmkz, TB_NO_REVERSE }, + { X86::VPMOVSXBWZ256rrkz, X86::VPMOVSXBWZ256rmkz, 0 }, + { X86::VPMOVSXDQZ256rrkz, X86::VPMOVSXDQZ256rmkz, 0 }, + { X86::VPMOVSXWDZ256rrkz, X86::VPMOVSXWDZ256rmkz, 0 }, + { X86::VPMOVSXWQZ256rrkz, X86::VPMOVSXWQZ256rmkz, TB_NO_REVERSE }, + { X86::VPMOVZXBDZ256rrkz, X86::VPMOVZXBDZ256rmkz, TB_NO_REVERSE }, + { X86::VPMOVZXBQZ256rrkz, X86::VPMOVZXBQZ256rmkz, TB_NO_REVERSE }, + { X86::VPMOVZXBWZ256rrkz, X86::VPMOVZXBWZ256rmkz, 0 }, + { X86::VPMOVZXDQZ256rrkz, X86::VPMOVZXDQZ256rmkz, 0 }, + { X86::VPMOVZXWDZ256rrkz, X86::VPMOVZXWDZ256rmkz, 0 }, + { X86::VPMOVZXWQZ256rrkz, X86::VPMOVZXWQZ256rmkz, TB_NO_REVERSE }, + { X86::VPSHUFDZ256rikz, X86::VPSHUFDZ256mikz, 0 }, + { X86::VPSHUFHWZ256rikz, X86::VPSHUFHWZ256mikz, 0 }, + { X86::VPSHUFLWZ256rikz, X86::VPSHUFLWZ256mikz, 0 }, + + // AVX-512VL 128-bit masked foldable instructions + { X86::VPERMILPDZ128rikz, X86::VPERMILPDZ128mikz, 0 }, + { X86::VPERMILPSZ128rikz, X86::VPERMILPSZ128mikz, 0 }, + { X86::VPMOVSXBDZ128rrkz, X86::VPMOVSXBDZ128rmkz, TB_NO_REVERSE }, + { X86::VPMOVSXBQZ128rrkz, X86::VPMOVSXBQZ128rmkz, TB_NO_REVERSE }, + { X86::VPMOVSXBWZ128rrkz, X86::VPMOVSXBWZ128rmkz, TB_NO_REVERSE }, + { X86::VPMOVSXDQZ128rrkz, X86::VPMOVSXDQZ128rmkz, TB_NO_REVERSE }, + { X86::VPMOVSXWDZ128rrkz, X86::VPMOVSXWDZ128rmkz, TB_NO_REVERSE }, + { X86::VPMOVSXWQZ128rrkz, X86::VPMOVSXWQZ128rmkz, TB_NO_REVERSE }, + { X86::VPMOVZXBDZ128rrkz, X86::VPMOVZXBDZ128rmkz, TB_NO_REVERSE }, + { X86::VPMOVZXBQZ128rrkz, X86::VPMOVZXBQZ128rmkz, TB_NO_REVERSE }, + { X86::VPMOVZXBWZ128rrkz, X86::VPMOVZXBWZ128rmkz, TB_NO_REVERSE }, + { X86::VPMOVZXDQZ128rrkz, X86::VPMOVZXDQZ128rmkz, TB_NO_REVERSE }, + { X86::VPMOVZXWDZ128rrkz, X86::VPMOVZXWDZ128rmkz, TB_NO_REVERSE }, + { X86::VPMOVZXWQZ128rrkz, X86::VPMOVZXWQZ128rmkz, TB_NO_REVERSE }, + { X86::VPSHUFDZ128rikz, X86::VPSHUFDZ128mikz, 0 }, + { X86::VPSHUFHWZ128rikz, X86::VPSHUFHWZ128mikz, 0 }, + { X86::VPSHUFLWZ128rikz, X86::VPSHUFLWZ128mikz, 0 }, // AES foldable instructions { X86::AESDECLASTrr, X86::AESDECLASTrm, TB_ALIGN_16 }, @@ -1773,170 +2218,47 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) } static const X86MemoryFoldTableEntry MemoryFoldTable3[] = { - // FMA foldable instructions - { X86::VFMADDSSr231r, X86::VFMADDSSr231m, TB_ALIGN_NONE }, - { X86::VFMADDSSr231r_Int, X86::VFMADDSSr231m_Int, TB_ALIGN_NONE }, - { X86::VFMADDSDr231r, X86::VFMADDSDr231m, TB_ALIGN_NONE }, - { X86::VFMADDSDr231r_Int, X86::VFMADDSDr231m_Int, TB_ALIGN_NONE }, - { X86::VFMADDSSr132r, X86::VFMADDSSr132m, TB_ALIGN_NONE }, - { X86::VFMADDSSr132r_Int, X86::VFMADDSSr132m_Int, TB_ALIGN_NONE }, - { X86::VFMADDSDr132r, X86::VFMADDSDr132m, TB_ALIGN_NONE }, - { X86::VFMADDSDr132r_Int, X86::VFMADDSDr132m_Int, TB_ALIGN_NONE }, - { X86::VFMADDSSr213r, 
X86::VFMADDSSr213m, TB_ALIGN_NONE }, - { X86::VFMADDSSr213r_Int, X86::VFMADDSSr213m_Int, TB_ALIGN_NONE }, - { X86::VFMADDSDr213r, X86::VFMADDSDr213m, TB_ALIGN_NONE }, - { X86::VFMADDSDr213r_Int, X86::VFMADDSDr213m_Int, TB_ALIGN_NONE }, - - { X86::VFMADDPSr231r, X86::VFMADDPSr231m, TB_ALIGN_NONE }, - { X86::VFMADDPDr231r, X86::VFMADDPDr231m, TB_ALIGN_NONE }, - { X86::VFMADDPSr132r, X86::VFMADDPSr132m, TB_ALIGN_NONE }, - { X86::VFMADDPDr132r, X86::VFMADDPDr132m, TB_ALIGN_NONE }, - { X86::VFMADDPSr213r, X86::VFMADDPSr213m, TB_ALIGN_NONE }, - { X86::VFMADDPDr213r, X86::VFMADDPDr213m, TB_ALIGN_NONE }, - { X86::VFMADDPSr231rY, X86::VFMADDPSr231mY, TB_ALIGN_NONE }, - { X86::VFMADDPDr231rY, X86::VFMADDPDr231mY, TB_ALIGN_NONE }, - { X86::VFMADDPSr132rY, X86::VFMADDPSr132mY, TB_ALIGN_NONE }, - { X86::VFMADDPDr132rY, X86::VFMADDPDr132mY, TB_ALIGN_NONE }, - { X86::VFMADDPSr213rY, X86::VFMADDPSr213mY, TB_ALIGN_NONE }, - { X86::VFMADDPDr213rY, X86::VFMADDPDr213mY, TB_ALIGN_NONE }, - - { X86::VFNMADDSSr231r, X86::VFNMADDSSr231m, TB_ALIGN_NONE }, - { X86::VFNMADDSSr231r_Int, X86::VFNMADDSSr231m_Int, TB_ALIGN_NONE }, - { X86::VFNMADDSDr231r, X86::VFNMADDSDr231m, TB_ALIGN_NONE }, - { X86::VFNMADDSDr231r_Int, X86::VFNMADDSDr231m_Int, TB_ALIGN_NONE }, - { X86::VFNMADDSSr132r, X86::VFNMADDSSr132m, TB_ALIGN_NONE }, - { X86::VFNMADDSSr132r_Int, X86::VFNMADDSSr132m_Int, TB_ALIGN_NONE }, - { X86::VFNMADDSDr132r, X86::VFNMADDSDr132m, TB_ALIGN_NONE }, - { X86::VFNMADDSDr132r_Int, X86::VFNMADDSDr132m_Int, TB_ALIGN_NONE }, - { X86::VFNMADDSSr213r, X86::VFNMADDSSr213m, TB_ALIGN_NONE }, - { X86::VFNMADDSSr213r_Int, X86::VFNMADDSSr213m_Int, TB_ALIGN_NONE }, - { X86::VFNMADDSDr213r, X86::VFNMADDSDr213m, TB_ALIGN_NONE }, - { X86::VFNMADDSDr213r_Int, X86::VFNMADDSDr213m_Int, TB_ALIGN_NONE }, - - { X86::VFNMADDPSr231r, X86::VFNMADDPSr231m, TB_ALIGN_NONE }, - { X86::VFNMADDPDr231r, X86::VFNMADDPDr231m, TB_ALIGN_NONE }, - { X86::VFNMADDPSr132r, X86::VFNMADDPSr132m, TB_ALIGN_NONE }, - { X86::VFNMADDPDr132r, X86::VFNMADDPDr132m, TB_ALIGN_NONE }, - { X86::VFNMADDPSr213r, X86::VFNMADDPSr213m, TB_ALIGN_NONE }, - { X86::VFNMADDPDr213r, X86::VFNMADDPDr213m, TB_ALIGN_NONE }, - { X86::VFNMADDPSr231rY, X86::VFNMADDPSr231mY, TB_ALIGN_NONE }, - { X86::VFNMADDPDr231rY, X86::VFNMADDPDr231mY, TB_ALIGN_NONE }, - { X86::VFNMADDPSr132rY, X86::VFNMADDPSr132mY, TB_ALIGN_NONE }, - { X86::VFNMADDPDr132rY, X86::VFNMADDPDr132mY, TB_ALIGN_NONE }, - { X86::VFNMADDPSr213rY, X86::VFNMADDPSr213mY, TB_ALIGN_NONE }, - { X86::VFNMADDPDr213rY, X86::VFNMADDPDr213mY, TB_ALIGN_NONE }, - - { X86::VFMSUBSSr231r, X86::VFMSUBSSr231m, TB_ALIGN_NONE }, - { X86::VFMSUBSSr231r_Int, X86::VFMSUBSSr231m_Int, TB_ALIGN_NONE }, - { X86::VFMSUBSDr231r, X86::VFMSUBSDr231m, TB_ALIGN_NONE }, - { X86::VFMSUBSDr231r_Int, X86::VFMSUBSDr231m_Int, TB_ALIGN_NONE }, - { X86::VFMSUBSSr132r, X86::VFMSUBSSr132m, TB_ALIGN_NONE }, - { X86::VFMSUBSSr132r_Int, X86::VFMSUBSSr132m_Int, TB_ALIGN_NONE }, - { X86::VFMSUBSDr132r, X86::VFMSUBSDr132m, TB_ALIGN_NONE }, - { X86::VFMSUBSDr132r_Int, X86::VFMSUBSDr132m_Int, TB_ALIGN_NONE }, - { X86::VFMSUBSSr213r, X86::VFMSUBSSr213m, TB_ALIGN_NONE }, - { X86::VFMSUBSSr213r_Int, X86::VFMSUBSSr213m_Int, TB_ALIGN_NONE }, - { X86::VFMSUBSDr213r, X86::VFMSUBSDr213m, TB_ALIGN_NONE }, - { X86::VFMSUBSDr213r_Int, X86::VFMSUBSDr213m_Int, TB_ALIGN_NONE }, - - { X86::VFMSUBPSr231r, X86::VFMSUBPSr231m, TB_ALIGN_NONE }, - { X86::VFMSUBPDr231r, X86::VFMSUBPDr231m, TB_ALIGN_NONE }, - { X86::VFMSUBPSr132r, X86::VFMSUBPSr132m, TB_ALIGN_NONE }, - { X86::VFMSUBPDr132r, 
X86::VFMSUBPDr132m, TB_ALIGN_NONE }, - { X86::VFMSUBPSr213r, X86::VFMSUBPSr213m, TB_ALIGN_NONE }, - { X86::VFMSUBPDr213r, X86::VFMSUBPDr213m, TB_ALIGN_NONE }, - { X86::VFMSUBPSr231rY, X86::VFMSUBPSr231mY, TB_ALIGN_NONE }, - { X86::VFMSUBPDr231rY, X86::VFMSUBPDr231mY, TB_ALIGN_NONE }, - { X86::VFMSUBPSr132rY, X86::VFMSUBPSr132mY, TB_ALIGN_NONE }, - { X86::VFMSUBPDr132rY, X86::VFMSUBPDr132mY, TB_ALIGN_NONE }, - { X86::VFMSUBPSr213rY, X86::VFMSUBPSr213mY, TB_ALIGN_NONE }, - { X86::VFMSUBPDr213rY, X86::VFMSUBPDr213mY, TB_ALIGN_NONE }, - - { X86::VFNMSUBSSr231r, X86::VFNMSUBSSr231m, TB_ALIGN_NONE }, - { X86::VFNMSUBSSr231r_Int, X86::VFNMSUBSSr231m_Int, TB_ALIGN_NONE }, - { X86::VFNMSUBSDr231r, X86::VFNMSUBSDr231m, TB_ALIGN_NONE }, - { X86::VFNMSUBSDr231r_Int, X86::VFNMSUBSDr231m_Int, TB_ALIGN_NONE }, - { X86::VFNMSUBSSr132r, X86::VFNMSUBSSr132m, TB_ALIGN_NONE }, - { X86::VFNMSUBSSr132r_Int, X86::VFNMSUBSSr132m_Int, TB_ALIGN_NONE }, - { X86::VFNMSUBSDr132r, X86::VFNMSUBSDr132m, TB_ALIGN_NONE }, - { X86::VFNMSUBSDr132r_Int, X86::VFNMSUBSDr132m_Int, TB_ALIGN_NONE }, - { X86::VFNMSUBSSr213r, X86::VFNMSUBSSr213m, TB_ALIGN_NONE }, - { X86::VFNMSUBSSr213r_Int, X86::VFNMSUBSSr213m_Int, TB_ALIGN_NONE }, - { X86::VFNMSUBSDr213r, X86::VFNMSUBSDr213m, TB_ALIGN_NONE }, - { X86::VFNMSUBSDr213r_Int, X86::VFNMSUBSDr213m_Int, TB_ALIGN_NONE }, - - { X86::VFNMSUBPSr231r, X86::VFNMSUBPSr231m, TB_ALIGN_NONE }, - { X86::VFNMSUBPDr231r, X86::VFNMSUBPDr231m, TB_ALIGN_NONE }, - { X86::VFNMSUBPSr132r, X86::VFNMSUBPSr132m, TB_ALIGN_NONE }, - { X86::VFNMSUBPDr132r, X86::VFNMSUBPDr132m, TB_ALIGN_NONE }, - { X86::VFNMSUBPSr213r, X86::VFNMSUBPSr213m, TB_ALIGN_NONE }, - { X86::VFNMSUBPDr213r, X86::VFNMSUBPDr213m, TB_ALIGN_NONE }, - { X86::VFNMSUBPSr231rY, X86::VFNMSUBPSr231mY, TB_ALIGN_NONE }, - { X86::VFNMSUBPDr231rY, X86::VFNMSUBPDr231mY, TB_ALIGN_NONE }, - { X86::VFNMSUBPSr132rY, X86::VFNMSUBPSr132mY, TB_ALIGN_NONE }, - { X86::VFNMSUBPDr132rY, X86::VFNMSUBPDr132mY, TB_ALIGN_NONE }, - { X86::VFNMSUBPSr213rY, X86::VFNMSUBPSr213mY, TB_ALIGN_NONE }, - { X86::VFNMSUBPDr213rY, X86::VFNMSUBPDr213mY, TB_ALIGN_NONE }, - - { X86::VFMADDSUBPSr231r, X86::VFMADDSUBPSr231m, TB_ALIGN_NONE }, - { X86::VFMADDSUBPDr231r, X86::VFMADDSUBPDr231m, TB_ALIGN_NONE }, - { X86::VFMADDSUBPSr132r, X86::VFMADDSUBPSr132m, TB_ALIGN_NONE }, - { X86::VFMADDSUBPDr132r, X86::VFMADDSUBPDr132m, TB_ALIGN_NONE }, - { X86::VFMADDSUBPSr213r, X86::VFMADDSUBPSr213m, TB_ALIGN_NONE }, - { X86::VFMADDSUBPDr213r, X86::VFMADDSUBPDr213m, TB_ALIGN_NONE }, - { X86::VFMADDSUBPSr231rY, X86::VFMADDSUBPSr231mY, TB_ALIGN_NONE }, - { X86::VFMADDSUBPDr231rY, X86::VFMADDSUBPDr231mY, TB_ALIGN_NONE }, - { X86::VFMADDSUBPSr132rY, X86::VFMADDSUBPSr132mY, TB_ALIGN_NONE }, - { X86::VFMADDSUBPDr132rY, X86::VFMADDSUBPDr132mY, TB_ALIGN_NONE }, - { X86::VFMADDSUBPSr213rY, X86::VFMADDSUBPSr213mY, TB_ALIGN_NONE }, - { X86::VFMADDSUBPDr213rY, X86::VFMADDSUBPDr213mY, TB_ALIGN_NONE }, - - { X86::VFMSUBADDPSr231r, X86::VFMSUBADDPSr231m, TB_ALIGN_NONE }, - { X86::VFMSUBADDPDr231r, X86::VFMSUBADDPDr231m, TB_ALIGN_NONE }, - { X86::VFMSUBADDPSr132r, X86::VFMSUBADDPSr132m, TB_ALIGN_NONE }, - { X86::VFMSUBADDPDr132r, X86::VFMSUBADDPDr132m, TB_ALIGN_NONE }, - { X86::VFMSUBADDPSr213r, X86::VFMSUBADDPSr213m, TB_ALIGN_NONE }, - { X86::VFMSUBADDPDr213r, X86::VFMSUBADDPDr213m, TB_ALIGN_NONE }, - { X86::VFMSUBADDPSr231rY, X86::VFMSUBADDPSr231mY, TB_ALIGN_NONE }, - { X86::VFMSUBADDPDr231rY, X86::VFMSUBADDPDr231mY, TB_ALIGN_NONE }, - { X86::VFMSUBADDPSr132rY, X86::VFMSUBADDPSr132mY, TB_ALIGN_NONE }, - { 
X86::VFMSUBADDPDr132rY, X86::VFMSUBADDPDr132mY, TB_ALIGN_NONE }, - { X86::VFMSUBADDPSr213rY, X86::VFMSUBADDPSr213mY, TB_ALIGN_NONE }, - { X86::VFMSUBADDPDr213rY, X86::VFMSUBADDPDr213mY, TB_ALIGN_NONE }, - // FMA4 foldable patterns { X86::VFMADDSS4rr, X86::VFMADDSS4rm, TB_ALIGN_NONE }, + { X86::VFMADDSS4rr_Int, X86::VFMADDSS4rm_Int, TB_NO_REVERSE }, { X86::VFMADDSD4rr, X86::VFMADDSD4rm, TB_ALIGN_NONE }, + { X86::VFMADDSD4rr_Int, X86::VFMADDSD4rm_Int, TB_NO_REVERSE }, { X86::VFMADDPS4rr, X86::VFMADDPS4rm, TB_ALIGN_NONE }, { X86::VFMADDPD4rr, X86::VFMADDPD4rm, TB_ALIGN_NONE }, - { X86::VFMADDPS4rrY, X86::VFMADDPS4rmY, TB_ALIGN_NONE }, - { X86::VFMADDPD4rrY, X86::VFMADDPD4rmY, TB_ALIGN_NONE }, + { X86::VFMADDPS4Yrr, X86::VFMADDPS4Yrm, TB_ALIGN_NONE }, + { X86::VFMADDPD4Yrr, X86::VFMADDPD4Yrm, TB_ALIGN_NONE }, { X86::VFNMADDSS4rr, X86::VFNMADDSS4rm, TB_ALIGN_NONE }, + { X86::VFNMADDSS4rr_Int, X86::VFNMADDSS4rm_Int, TB_NO_REVERSE }, { X86::VFNMADDSD4rr, X86::VFNMADDSD4rm, TB_ALIGN_NONE }, + { X86::VFNMADDSD4rr_Int, X86::VFNMADDSD4rm_Int, TB_NO_REVERSE }, { X86::VFNMADDPS4rr, X86::VFNMADDPS4rm, TB_ALIGN_NONE }, { X86::VFNMADDPD4rr, X86::VFNMADDPD4rm, TB_ALIGN_NONE }, - { X86::VFNMADDPS4rrY, X86::VFNMADDPS4rmY, TB_ALIGN_NONE }, - { X86::VFNMADDPD4rrY, X86::VFNMADDPD4rmY, TB_ALIGN_NONE }, + { X86::VFNMADDPS4Yrr, X86::VFNMADDPS4Yrm, TB_ALIGN_NONE }, + { X86::VFNMADDPD4Yrr, X86::VFNMADDPD4Yrm, TB_ALIGN_NONE }, { X86::VFMSUBSS4rr, X86::VFMSUBSS4rm, TB_ALIGN_NONE }, + { X86::VFMSUBSS4rr_Int, X86::VFMSUBSS4rm_Int, TB_NO_REVERSE }, { X86::VFMSUBSD4rr, X86::VFMSUBSD4rm, TB_ALIGN_NONE }, + { X86::VFMSUBSD4rr_Int, X86::VFMSUBSD4rm_Int, TB_NO_REVERSE }, { X86::VFMSUBPS4rr, X86::VFMSUBPS4rm, TB_ALIGN_NONE }, { X86::VFMSUBPD4rr, X86::VFMSUBPD4rm, TB_ALIGN_NONE }, - { X86::VFMSUBPS4rrY, X86::VFMSUBPS4rmY, TB_ALIGN_NONE }, - { X86::VFMSUBPD4rrY, X86::VFMSUBPD4rmY, TB_ALIGN_NONE }, + { X86::VFMSUBPS4Yrr, X86::VFMSUBPS4Yrm, TB_ALIGN_NONE }, + { X86::VFMSUBPD4Yrr, X86::VFMSUBPD4Yrm, TB_ALIGN_NONE }, { X86::VFNMSUBSS4rr, X86::VFNMSUBSS4rm, TB_ALIGN_NONE }, + { X86::VFNMSUBSS4rr_Int, X86::VFNMSUBSS4rm_Int, TB_NO_REVERSE }, { X86::VFNMSUBSD4rr, X86::VFNMSUBSD4rm, TB_ALIGN_NONE }, + { X86::VFNMSUBSD4rr_Int, X86::VFNMSUBSD4rm_Int, TB_NO_REVERSE }, { X86::VFNMSUBPS4rr, X86::VFNMSUBPS4rm, TB_ALIGN_NONE }, { X86::VFNMSUBPD4rr, X86::VFNMSUBPD4rm, TB_ALIGN_NONE }, - { X86::VFNMSUBPS4rrY, X86::VFNMSUBPS4rmY, TB_ALIGN_NONE }, - { X86::VFNMSUBPD4rrY, X86::VFNMSUBPD4rmY, TB_ALIGN_NONE }, + { X86::VFNMSUBPS4Yrr, X86::VFNMSUBPS4Yrm, TB_ALIGN_NONE }, + { X86::VFNMSUBPD4Yrr, X86::VFNMSUBPD4Yrm, TB_ALIGN_NONE }, { X86::VFMADDSUBPS4rr, X86::VFMADDSUBPS4rm, TB_ALIGN_NONE }, { X86::VFMADDSUBPD4rr, X86::VFMADDSUBPD4rm, TB_ALIGN_NONE }, - { X86::VFMADDSUBPS4rrY, X86::VFMADDSUBPS4rmY, TB_ALIGN_NONE }, - { X86::VFMADDSUBPD4rrY, X86::VFMADDSUBPD4rmY, TB_ALIGN_NONE }, + { X86::VFMADDSUBPS4Yrr, X86::VFMADDSUBPS4Yrm, TB_ALIGN_NONE }, + { X86::VFMADDSUBPD4Yrr, X86::VFMADDSUBPD4Yrm, TB_ALIGN_NONE }, { X86::VFMSUBADDPS4rr, X86::VFMSUBADDPS4rm, TB_ALIGN_NONE }, { X86::VFMSUBADDPD4rr, X86::VFMSUBADDPD4rm, TB_ALIGN_NONE }, - { X86::VFMSUBADDPS4rrY, X86::VFMSUBADDPS4rmY, TB_ALIGN_NONE }, - { X86::VFMSUBADDPD4rrY, X86::VFMSUBADDPD4rmY, TB_ALIGN_NONE }, + { X86::VFMSUBADDPS4Yrr, X86::VFMSUBADDPS4Yrm, TB_ALIGN_NONE }, + { X86::VFMSUBADDPD4Yrr, X86::VFMSUBADDPD4Yrm, TB_ALIGN_NONE }, // XOP foldable instructions { X86::VPCMOVrrr, X86::VPCMOVrrm, 0 }, @@ -1947,11 +2269,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VPERMIL2PSrrY, 
X86::VPERMIL2PSrmY, 0 }, { X86::VPPERMrrr, X86::VPPERMrrm, 0 }, - // AVX-512 VPERMI instructions with 3 source operands. - { X86::VPERMI2Drr, X86::VPERMI2Drm, 0 }, - { X86::VPERMI2Qrr, X86::VPERMI2Qrm, 0 }, - { X86::VPERMI2PSrr, X86::VPERMI2PSrm, 0 }, - { X86::VPERMI2PDrr, X86::VPERMI2PDrm, 0 }, + // AVX-512 instructions with 3 source operands. { X86::VBLENDMPDZrr, X86::VBLENDMPDZrm, 0 }, { X86::VBLENDMPSZrr, X86::VBLENDMPSZrm, 0 }, { X86::VPBLENDMDZrr, X86::VPBLENDMDZrm, 0 }, @@ -1961,45 +2279,349 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VBROADCASTSSZ256rk, X86::VBROADCASTSSZ256mk, TB_NO_REVERSE }, { X86::VBROADCASTSDZ256rk, X86::VBROADCASTSDZ256mk, TB_NO_REVERSE }, { X86::VBROADCASTSSZ128rk, X86::VBROADCASTSSZ128mk, TB_NO_REVERSE }, - // AVX-512 arithmetic instructions - { X86::VADDPSZrrkz, X86::VADDPSZrmkz, 0 }, + { X86::VPERMI2Brr, X86::VPERMI2Brm, 0 }, + { X86::VPERMI2Drr, X86::VPERMI2Drm, 0 }, + { X86::VPERMI2PSrr, X86::VPERMI2PSrm, 0 }, + { X86::VPERMI2PDrr, X86::VPERMI2PDrm, 0 }, + { X86::VPERMI2Qrr, X86::VPERMI2Qrm, 0 }, + { X86::VPERMI2Wrr, X86::VPERMI2Wrm, 0 }, + { X86::VPERMT2Brr, X86::VPERMT2Brm, 0 }, + { X86::VPERMT2Drr, X86::VPERMT2Drm, 0 }, + { X86::VPERMT2PSrr, X86::VPERMT2PSrm, 0 }, + { X86::VPERMT2PDrr, X86::VPERMT2PDrm, 0 }, + { X86::VPERMT2Qrr, X86::VPERMT2Qrm, 0 }, + { X86::VPERMT2Wrr, X86::VPERMT2Wrm, 0 }, + { X86::VPTERNLOGDZrri, X86::VPTERNLOGDZrmi, 0 }, + { X86::VPTERNLOGQZrri, X86::VPTERNLOGQZrmi, 0 }, + + // AVX-512VL 256-bit instructions with 3 source operands. + { X86::VPERMI2B256rr, X86::VPERMI2B256rm, 0 }, + { X86::VPERMI2D256rr, X86::VPERMI2D256rm, 0 }, + { X86::VPERMI2PD256rr, X86::VPERMI2PD256rm, 0 }, + { X86::VPERMI2PS256rr, X86::VPERMI2PS256rm, 0 }, + { X86::VPERMI2Q256rr, X86::VPERMI2Q256rm, 0 }, + { X86::VPERMI2W256rr, X86::VPERMI2W256rm, 0 }, + { X86::VPERMT2B256rr, X86::VPERMT2B256rm, 0 }, + { X86::VPERMT2D256rr, X86::VPERMT2D256rm, 0 }, + { X86::VPERMT2PD256rr, X86::VPERMT2PD256rm, 0 }, + { X86::VPERMT2PS256rr, X86::VPERMT2PS256rm, 0 }, + { X86::VPERMT2Q256rr, X86::VPERMT2Q256rm, 0 }, + { X86::VPERMT2W256rr, X86::VPERMT2W256rm, 0 }, + { X86::VPTERNLOGDZ256rri, X86::VPTERNLOGDZ256rmi, 0 }, + { X86::VPTERNLOGQZ256rri, X86::VPTERNLOGQZ256rmi, 0 }, + + // AVX-512VL 128-bit instructions with 3 source operands. 
+ { X86::VPERMI2B128rr, X86::VPERMI2B128rm, 0 }, + { X86::VPERMI2D128rr, X86::VPERMI2D128rm, 0 }, + { X86::VPERMI2PD128rr, X86::VPERMI2PD128rm, 0 }, + { X86::VPERMI2PS128rr, X86::VPERMI2PS128rm, 0 }, + { X86::VPERMI2Q128rr, X86::VPERMI2Q128rm, 0 }, + { X86::VPERMI2W128rr, X86::VPERMI2W128rm, 0 }, + { X86::VPERMT2B128rr, X86::VPERMT2B128rm, 0 }, + { X86::VPERMT2D128rr, X86::VPERMT2D128rm, 0 }, + { X86::VPERMT2PD128rr, X86::VPERMT2PD128rm, 0 }, + { X86::VPERMT2PS128rr, X86::VPERMT2PS128rm, 0 }, + { X86::VPERMT2Q128rr, X86::VPERMT2Q128rm, 0 }, + { X86::VPERMT2W128rr, X86::VPERMT2W128rm, 0 }, + { X86::VPTERNLOGDZ128rri, X86::VPTERNLOGDZ128rmi, 0 }, + { X86::VPTERNLOGQZ128rri, X86::VPTERNLOGQZ128rmi, 0 }, + + // AVX-512 masked instructions { X86::VADDPDZrrkz, X86::VADDPDZrmkz, 0 }, - { X86::VSUBPSZrrkz, X86::VSUBPSZrmkz, 0 }, - { X86::VSUBPDZrrkz, X86::VSUBPDZrmkz, 0 }, - { X86::VMULPSZrrkz, X86::VMULPSZrmkz, 0 }, - { X86::VMULPDZrrkz, X86::VMULPDZrmkz, 0 }, - { X86::VDIVPSZrrkz, X86::VDIVPSZrmkz, 0 }, + { X86::VADDPSZrrkz, X86::VADDPSZrmkz, 0 }, + { X86::VALIGNDZrrikz, X86::VALIGNDZrmikz, 0 }, + { X86::VALIGNQZrrikz, X86::VALIGNQZrmikz, 0 }, + { X86::VANDNPDZrrkz, X86::VANDNPDZrmkz, 0 }, + { X86::VANDNPSZrrkz, X86::VANDNPSZrmkz, 0 }, + { X86::VANDPDZrrkz, X86::VANDPDZrmkz, 0 }, + { X86::VANDPSZrrkz, X86::VANDPSZrmkz, 0 }, { X86::VDIVPDZrrkz, X86::VDIVPDZrmkz, 0 }, - { X86::VMINPSZrrkz, X86::VMINPSZrmkz, 0 }, - { X86::VMINPDZrrkz, X86::VMINPDZrmkz, 0 }, - { X86::VMAXPSZrrkz, X86::VMAXPSZrmkz, 0 }, + { X86::VDIVPSZrrkz, X86::VDIVPSZrmkz, 0 }, + { X86::VINSERTF32x4Zrrkz, X86::VINSERTF32x4Zrmkz, 0 }, + { X86::VINSERTF32x8Zrrkz, X86::VINSERTF32x8Zrmkz, 0 }, + { X86::VINSERTF64x2Zrrkz, X86::VINSERTF64x2Zrmkz, 0 }, + { X86::VINSERTF64x4Zrrkz, X86::VINSERTF64x4Zrmkz, 0 }, + { X86::VINSERTI32x4Zrrkz, X86::VINSERTI32x4Zrmkz, 0 }, + { X86::VINSERTI32x8Zrrkz, X86::VINSERTI32x8Zrmkz, 0 }, + { X86::VINSERTI64x2Zrrkz, X86::VINSERTI64x2Zrmkz, 0 }, + { X86::VINSERTI64x4Zrrkz, X86::VINSERTI64x4Zrmkz, 0 }, + { X86::VMAXCPDZrrkz, X86::VMAXCPDZrmkz, 0 }, + { X86::VMAXCPSZrrkz, X86::VMAXCPSZrmkz, 0 }, { X86::VMAXPDZrrkz, X86::VMAXPDZrmkz, 0 }, - // AVX-512{F,VL} arithmetic instructions 256-bit - { X86::VADDPSZ256rrkz, X86::VADDPSZ256rmkz, 0 }, + { X86::VMAXPSZrrkz, X86::VMAXPSZrmkz, 0 }, + { X86::VMINCPDZrrkz, X86::VMINCPDZrmkz, 0 }, + { X86::VMINCPSZrrkz, X86::VMINCPSZrmkz, 0 }, + { X86::VMINPDZrrkz, X86::VMINPDZrmkz, 0 }, + { X86::VMINPSZrrkz, X86::VMINPSZrmkz, 0 }, + { X86::VMULPDZrrkz, X86::VMULPDZrmkz, 0 }, + { X86::VMULPSZrrkz, X86::VMULPSZrmkz, 0 }, + { X86::VORPDZrrkz, X86::VORPDZrmkz, 0 }, + { X86::VORPSZrrkz, X86::VORPSZrmkz, 0 }, + { X86::VPADDBZrrkz, X86::VPADDBZrmkz, 0 }, + { X86::VPADDDZrrkz, X86::VPADDDZrmkz, 0 }, + { X86::VPADDQZrrkz, X86::VPADDQZrmkz, 0 }, + { X86::VPADDSBZrrkz, X86::VPADDSBZrmkz, 0 }, + { X86::VPADDSWZrrkz, X86::VPADDSWZrmkz, 0 }, + { X86::VPADDUSBZrrkz, X86::VPADDUSBZrmkz, 0 }, + { X86::VPADDUSWZrrkz, X86::VPADDUSWZrmkz, 0 }, + { X86::VPADDWZrrkz, X86::VPADDWZrmkz, 0 }, + { X86::VPALIGNRZrrikz, X86::VPALIGNRZrmikz, 0 }, + { X86::VPANDDZrrkz, X86::VPANDDZrmkz, 0 }, + { X86::VPANDNDZrrkz, X86::VPANDNDZrmkz, 0 }, + { X86::VPANDNQZrrkz, X86::VPANDNQZrmkz, 0 }, + { X86::VPANDQZrrkz, X86::VPANDQZrmkz, 0 }, + { X86::VPERMBZrrkz, X86::VPERMBZrmkz, 0 }, + { X86::VPERMDZrrkz, X86::VPERMDZrmkz, 0 }, + { X86::VPERMILPDZrrkz, X86::VPERMILPDZrmkz, 0 }, + { X86::VPERMILPSZrrkz, X86::VPERMILPSZrmkz, 0 }, + { X86::VPERMPDZrrkz, X86::VPERMPDZrmkz, 0 }, + { X86::VPERMPSZrrkz, X86::VPERMPSZrmkz, 0 
}, + { X86::VPERMQZrrkz, X86::VPERMQZrmkz, 0 }, + { X86::VPERMWZrrkz, X86::VPERMWZrmkz, 0 }, + { X86::VPMADDUBSWZrrkz, X86::VPMADDUBSWZrmkz, 0 }, + { X86::VPMADDWDZrrkz, X86::VPMADDWDZrmkz, 0 }, + { X86::VPORDZrrkz, X86::VPORDZrmkz, 0 }, + { X86::VPORQZrrkz, X86::VPORQZrmkz, 0 }, + { X86::VPSHUFBZrrkz, X86::VPSHUFBZrmkz, 0 }, + { X86::VPSUBBZrrkz, X86::VPSUBBZrmkz, 0 }, + { X86::VPSUBDZrrkz, X86::VPSUBDZrmkz, 0 }, + { X86::VPSUBQZrrkz, X86::VPSUBQZrmkz, 0 }, + { X86::VPSUBSBZrrkz, X86::VPSUBSBZrmkz, 0 }, + { X86::VPSUBSWZrrkz, X86::VPSUBSWZrmkz, 0 }, + { X86::VPSUBUSBZrrkz, X86::VPSUBUSBZrmkz, 0 }, + { X86::VPSUBUSWZrrkz, X86::VPSUBUSWZrmkz, 0 }, + { X86::VPSUBWZrrkz, X86::VPSUBWZrmkz, 0 }, + { X86::VPUNPCKHBWZrrkz, X86::VPUNPCKHBWZrmkz, 0 }, + { X86::VPUNPCKHDQZrrkz, X86::VPUNPCKHDQZrmkz, 0 }, + { X86::VPUNPCKHQDQZrrkz, X86::VPUNPCKHQDQZrmkz, 0 }, + { X86::VPUNPCKHWDZrrkz, X86::VPUNPCKHWDZrmkz, 0 }, + { X86::VPUNPCKLBWZrrkz, X86::VPUNPCKLBWZrmkz, 0 }, + { X86::VPUNPCKLDQZrrkz, X86::VPUNPCKLDQZrmkz, 0 }, + { X86::VPUNPCKLQDQZrrkz, X86::VPUNPCKLQDQZrmkz, 0 }, + { X86::VPUNPCKLWDZrrkz, X86::VPUNPCKLWDZrmkz, 0 }, + { X86::VPXORDZrrkz, X86::VPXORDZrmkz, 0 }, + { X86::VPXORQZrrkz, X86::VPXORQZrmkz, 0 }, + { X86::VSUBPDZrrkz, X86::VSUBPDZrmkz, 0 }, + { X86::VSUBPSZrrkz, X86::VSUBPSZrmkz, 0 }, + { X86::VUNPCKHPDZrrkz, X86::VUNPCKHPDZrmkz, 0 }, + { X86::VUNPCKHPSZrrkz, X86::VUNPCKHPSZrmkz, 0 }, + { X86::VUNPCKLPDZrrkz, X86::VUNPCKLPDZrmkz, 0 }, + { X86::VUNPCKLPSZrrkz, X86::VUNPCKLPSZrmkz, 0 }, + { X86::VXORPDZrrkz, X86::VXORPDZrmkz, 0 }, + { X86::VXORPSZrrkz, X86::VXORPSZrmkz, 0 }, + + // AVX-512{F,VL} masked arithmetic instructions 256-bit { X86::VADDPDZ256rrkz, X86::VADDPDZ256rmkz, 0 }, - { X86::VSUBPSZ256rrkz, X86::VSUBPSZ256rmkz, 0 }, - { X86::VSUBPDZ256rrkz, X86::VSUBPDZ256rmkz, 0 }, - { X86::VMULPSZ256rrkz, X86::VMULPSZ256rmkz, 0 }, - { X86::VMULPDZ256rrkz, X86::VMULPDZ256rmkz, 0 }, - { X86::VDIVPSZ256rrkz, X86::VDIVPSZ256rmkz, 0 }, + { X86::VADDPSZ256rrkz, X86::VADDPSZ256rmkz, 0 }, + { X86::VALIGNDZ256rrikz, X86::VALIGNDZ256rmikz, 0 }, + { X86::VALIGNQZ256rrikz, X86::VALIGNQZ256rmikz, 0 }, + { X86::VANDNPDZ256rrkz, X86::VANDNPDZ256rmkz, 0 }, + { X86::VANDNPSZ256rrkz, X86::VANDNPSZ256rmkz, 0 }, + { X86::VANDPDZ256rrkz, X86::VANDPDZ256rmkz, 0 }, + { X86::VANDPSZ256rrkz, X86::VANDPSZ256rmkz, 0 }, { X86::VDIVPDZ256rrkz, X86::VDIVPDZ256rmkz, 0 }, - { X86::VMINPSZ256rrkz, X86::VMINPSZ256rmkz, 0 }, - { X86::VMINPDZ256rrkz, X86::VMINPDZ256rmkz, 0 }, - { X86::VMAXPSZ256rrkz, X86::VMAXPSZ256rmkz, 0 }, + { X86::VDIVPSZ256rrkz, X86::VDIVPSZ256rmkz, 0 }, + { X86::VINSERTF32x4Z256rrkz, X86::VINSERTF32x4Z256rmkz, 0 }, + { X86::VINSERTF64x2Z256rrkz, X86::VINSERTF64x2Z256rmkz, 0 }, + { X86::VINSERTI32x4Z256rrkz, X86::VINSERTI32x4Z256rmkz, 0 }, + { X86::VINSERTI64x2Z256rrkz, X86::VINSERTI64x2Z256rmkz, 0 }, + { X86::VMAXCPDZ256rrkz, X86::VMAXCPDZ256rmkz, 0 }, + { X86::VMAXCPSZ256rrkz, X86::VMAXCPSZ256rmkz, 0 }, { X86::VMAXPDZ256rrkz, X86::VMAXPDZ256rmkz, 0 }, - // AVX-512{F,VL} arithmetic instructions 128-bit - { X86::VADDPSZ128rrkz, X86::VADDPSZ128rmkz, 0 }, + { X86::VMAXPSZ256rrkz, X86::VMAXPSZ256rmkz, 0 }, + { X86::VMINCPDZ256rrkz, X86::VMINCPDZ256rmkz, 0 }, + { X86::VMINCPSZ256rrkz, X86::VMINCPSZ256rmkz, 0 }, + { X86::VMINPDZ256rrkz, X86::VMINPDZ256rmkz, 0 }, + { X86::VMINPSZ256rrkz, X86::VMINPSZ256rmkz, 0 }, + { X86::VMULPDZ256rrkz, X86::VMULPDZ256rmkz, 0 }, + { X86::VMULPSZ256rrkz, X86::VMULPSZ256rmkz, 0 }, + { X86::VORPDZ256rrkz, X86::VORPDZ256rmkz, 0 }, + { X86::VORPSZ256rrkz, X86::VORPSZ256rmkz, 0 
}, + { X86::VPADDBZ256rrkz, X86::VPADDBZ256rmkz, 0 }, + { X86::VPADDDZ256rrkz, X86::VPADDDZ256rmkz, 0 }, + { X86::VPADDQZ256rrkz, X86::VPADDQZ256rmkz, 0 }, + { X86::VPADDSBZ256rrkz, X86::VPADDSBZ256rmkz, 0 }, + { X86::VPADDSWZ256rrkz, X86::VPADDSWZ256rmkz, 0 }, + { X86::VPADDUSBZ256rrkz, X86::VPADDUSBZ256rmkz, 0 }, + { X86::VPADDUSWZ256rrkz, X86::VPADDUSWZ256rmkz, 0 }, + { X86::VPADDWZ256rrkz, X86::VPADDWZ256rmkz, 0 }, + { X86::VPALIGNRZ256rrikz, X86::VPALIGNRZ256rmikz, 0 }, + { X86::VPANDDZ256rrkz, X86::VPANDDZ256rmkz, 0 }, + { X86::VPANDNDZ256rrkz, X86::VPANDNDZ256rmkz, 0 }, + { X86::VPANDNQZ256rrkz, X86::VPANDNQZ256rmkz, 0 }, + { X86::VPANDQZ256rrkz, X86::VPANDQZ256rmkz, 0 }, + { X86::VPERMBZ256rrkz, X86::VPERMBZ256rmkz, 0 }, + { X86::VPERMDZ256rrkz, X86::VPERMDZ256rmkz, 0 }, + { X86::VPERMILPDZ256rrkz, X86::VPERMILPDZ256rmkz, 0 }, + { X86::VPERMILPSZ256rrkz, X86::VPERMILPSZ256rmkz, 0 }, + { X86::VPERMPDZ256rrkz, X86::VPERMPDZ256rmkz, 0 }, + { X86::VPERMPSZ256rrkz, X86::VPERMPSZ256rmkz, 0 }, + { X86::VPERMQZ256rrkz, X86::VPERMQZ256rmkz, 0 }, + { X86::VPERMWZ256rrkz, X86::VPERMWZ256rmkz, 0 }, + { X86::VPMADDUBSWZ256rrkz, X86::VPMADDUBSWZ256rmkz, 0 }, + { X86::VPMADDWDZ256rrkz, X86::VPMADDWDZ256rmkz, 0 }, + { X86::VPORDZ256rrkz, X86::VPORDZ256rmkz, 0 }, + { X86::VPORQZ256rrkz, X86::VPORQZ256rmkz, 0 }, + { X86::VPSHUFBZ256rrkz, X86::VPSHUFBZ256rmkz, 0 }, + { X86::VPSUBBZ256rrkz, X86::VPSUBBZ256rmkz, 0 }, + { X86::VPSUBDZ256rrkz, X86::VPSUBDZ256rmkz, 0 }, + { X86::VPSUBQZ256rrkz, X86::VPSUBQZ256rmkz, 0 }, + { X86::VPSUBSBZ256rrkz, X86::VPSUBSBZ256rmkz, 0 }, + { X86::VPSUBSWZ256rrkz, X86::VPSUBSWZ256rmkz, 0 }, + { X86::VPSUBUSBZ256rrkz, X86::VPSUBUSBZ256rmkz, 0 }, + { X86::VPSUBUSWZ256rrkz, X86::VPSUBUSWZ256rmkz, 0 }, + { X86::VPSUBWZ256rrkz, X86::VPSUBWZ256rmkz, 0 }, + { X86::VPUNPCKHBWZ256rrkz, X86::VPUNPCKHBWZ256rmkz, 0 }, + { X86::VPUNPCKHDQZ256rrkz, X86::VPUNPCKHDQZ256rmkz, 0 }, + { X86::VPUNPCKHQDQZ256rrkz, X86::VPUNPCKHQDQZ256rmkz, 0 }, + { X86::VPUNPCKHWDZ256rrkz, X86::VPUNPCKHWDZ256rmkz, 0 }, + { X86::VPUNPCKLBWZ256rrkz, X86::VPUNPCKLBWZ256rmkz, 0 }, + { X86::VPUNPCKLDQZ256rrkz, X86::VPUNPCKLDQZ256rmkz, 0 }, + { X86::VPUNPCKLQDQZ256rrkz, X86::VPUNPCKLQDQZ256rmkz, 0 }, + { X86::VPUNPCKLWDZ256rrkz, X86::VPUNPCKLWDZ256rmkz, 0 }, + { X86::VPXORDZ256rrkz, X86::VPXORDZ256rmkz, 0 }, + { X86::VPXORQZ256rrkz, X86::VPXORQZ256rmkz, 0 }, + { X86::VSUBPDZ256rrkz, X86::VSUBPDZ256rmkz, 0 }, + { X86::VSUBPSZ256rrkz, X86::VSUBPSZ256rmkz, 0 }, + { X86::VUNPCKHPDZ256rrkz, X86::VUNPCKHPDZ256rmkz, 0 }, + { X86::VUNPCKHPSZ256rrkz, X86::VUNPCKHPSZ256rmkz, 0 }, + { X86::VUNPCKLPDZ256rrkz, X86::VUNPCKLPDZ256rmkz, 0 }, + { X86::VUNPCKLPSZ256rrkz, X86::VUNPCKLPSZ256rmkz, 0 }, + { X86::VXORPDZ256rrkz, X86::VXORPDZ256rmkz, 0 }, + { X86::VXORPSZ256rrkz, X86::VXORPSZ256rmkz, 0 }, + + // AVX-512{F,VL} masked arithmetic instructions 128-bit { X86::VADDPDZ128rrkz, X86::VADDPDZ128rmkz, 0 }, - { X86::VSUBPSZ128rrkz, X86::VSUBPSZ128rmkz, 0 }, - { X86::VSUBPDZ128rrkz, X86::VSUBPDZ128rmkz, 0 }, - { X86::VMULPSZ128rrkz, X86::VMULPSZ128rmkz, 0 }, - { X86::VMULPDZ128rrkz, X86::VMULPDZ128rmkz, 0 }, - { X86::VDIVPSZ128rrkz, X86::VDIVPSZ128rmkz, 0 }, + { X86::VADDPSZ128rrkz, X86::VADDPSZ128rmkz, 0 }, + { X86::VALIGNDZ128rrikz, X86::VALIGNDZ128rmikz, 0 }, + { X86::VALIGNQZ128rrikz, X86::VALIGNQZ128rmikz, 0 }, + { X86::VANDNPDZ128rrkz, X86::VANDNPDZ128rmkz, 0 }, + { X86::VANDNPSZ128rrkz, X86::VANDNPSZ128rmkz, 0 }, + { X86::VANDPDZ128rrkz, X86::VANDPDZ128rmkz, 0 }, + { X86::VANDPSZ128rrkz, X86::VANDPSZ128rmkz, 0 }, { 
X86::VDIVPDZ128rrkz, X86::VDIVPDZ128rmkz, 0 }, - { X86::VMINPSZ128rrkz, X86::VMINPSZ128rmkz, 0 }, - { X86::VMINPDZ128rrkz, X86::VMINPDZ128rmkz, 0 }, + { X86::VDIVPSZ128rrkz, X86::VDIVPSZ128rmkz, 0 }, + { X86::VMAXCPDZ128rrkz, X86::VMAXCPDZ128rmkz, 0 }, + { X86::VMAXCPSZ128rrkz, X86::VMAXCPSZ128rmkz, 0 }, + { X86::VMAXPDZ128rrkz, X86::VMAXPDZ128rmkz, 0 }, { X86::VMAXPSZ128rrkz, X86::VMAXPSZ128rmkz, 0 }, - { X86::VMAXPDZ128rrkz, X86::VMAXPDZ128rmkz, 0 } + { X86::VMINCPDZ128rrkz, X86::VMINCPDZ128rmkz, 0 }, + { X86::VMINCPSZ128rrkz, X86::VMINCPSZ128rmkz, 0 }, + { X86::VMINPDZ128rrkz, X86::VMINPDZ128rmkz, 0 }, + { X86::VMINPSZ128rrkz, X86::VMINPSZ128rmkz, 0 }, + { X86::VMULPDZ128rrkz, X86::VMULPDZ128rmkz, 0 }, + { X86::VMULPSZ128rrkz, X86::VMULPSZ128rmkz, 0 }, + { X86::VORPDZ128rrkz, X86::VORPDZ128rmkz, 0 }, + { X86::VORPSZ128rrkz, X86::VORPSZ128rmkz, 0 }, + { X86::VPADDBZ128rrkz, X86::VPADDBZ128rmkz, 0 }, + { X86::VPADDDZ128rrkz, X86::VPADDDZ128rmkz, 0 }, + { X86::VPADDQZ128rrkz, X86::VPADDQZ128rmkz, 0 }, + { X86::VPADDSBZ128rrkz, X86::VPADDSBZ128rmkz, 0 }, + { X86::VPADDSWZ128rrkz, X86::VPADDSWZ128rmkz, 0 }, + { X86::VPADDUSBZ128rrkz, X86::VPADDUSBZ128rmkz, 0 }, + { X86::VPADDUSWZ128rrkz, X86::VPADDUSWZ128rmkz, 0 }, + { X86::VPADDWZ128rrkz, X86::VPADDWZ128rmkz, 0 }, + { X86::VPALIGNRZ128rrikz, X86::VPALIGNRZ128rmikz, 0 }, + { X86::VPANDDZ128rrkz, X86::VPANDDZ128rmkz, 0 }, + { X86::VPANDNDZ128rrkz, X86::VPANDNDZ128rmkz, 0 }, + { X86::VPANDNQZ128rrkz, X86::VPANDNQZ128rmkz, 0 }, + { X86::VPANDQZ128rrkz, X86::VPANDQZ128rmkz, 0 }, + { X86::VPERMBZ128rrkz, X86::VPERMBZ128rmkz, 0 }, + { X86::VPERMILPDZ128rrkz, X86::VPERMILPDZ128rmkz, 0 }, + { X86::VPERMILPSZ128rrkz, X86::VPERMILPSZ128rmkz, 0 }, + { X86::VPERMWZ128rrkz, X86::VPERMWZ128rmkz, 0 }, + { X86::VPMADDUBSWZ128rrkz, X86::VPMADDUBSWZ128rmkz, 0 }, + { X86::VPMADDWDZ128rrkz, X86::VPMADDWDZ128rmkz, 0 }, + { X86::VPORDZ128rrkz, X86::VPORDZ128rmkz, 0 }, + { X86::VPORQZ128rrkz, X86::VPORQZ128rmkz, 0 }, + { X86::VPSHUFBZ128rrkz, X86::VPSHUFBZ128rmkz, 0 }, + { X86::VPSUBBZ128rrkz, X86::VPSUBBZ128rmkz, 0 }, + { X86::VPSUBDZ128rrkz, X86::VPSUBDZ128rmkz, 0 }, + { X86::VPSUBQZ128rrkz, X86::VPSUBQZ128rmkz, 0 }, + { X86::VPSUBSBZ128rrkz, X86::VPSUBSBZ128rmkz, 0 }, + { X86::VPSUBSWZ128rrkz, X86::VPSUBSWZ128rmkz, 0 }, + { X86::VPSUBUSBZ128rrkz, X86::VPSUBUSBZ128rmkz, 0 }, + { X86::VPSUBUSWZ128rrkz, X86::VPSUBUSWZ128rmkz, 0 }, + { X86::VPSUBWZ128rrkz, X86::VPSUBWZ128rmkz, 0 }, + { X86::VPUNPCKHBWZ128rrkz, X86::VPUNPCKHBWZ128rmkz, 0 }, + { X86::VPUNPCKHDQZ128rrkz, X86::VPUNPCKHDQZ128rmkz, 0 }, + { X86::VPUNPCKHQDQZ128rrkz, X86::VPUNPCKHQDQZ128rmkz, 0 }, + { X86::VPUNPCKHWDZ128rrkz, X86::VPUNPCKHWDZ128rmkz, 0 }, + { X86::VPUNPCKLBWZ128rrkz, X86::VPUNPCKLBWZ128rmkz, 0 }, + { X86::VPUNPCKLDQZ128rrkz, X86::VPUNPCKLDQZ128rmkz, 0 }, + { X86::VPUNPCKLQDQZ128rrkz, X86::VPUNPCKLQDQZ128rmkz, 0 }, + { X86::VPUNPCKLWDZ128rrkz, X86::VPUNPCKLWDZ128rmkz, 0 }, + { X86::VPXORDZ128rrkz, X86::VPXORDZ128rmkz, 0 }, + { X86::VPXORQZ128rrkz, X86::VPXORQZ128rmkz, 0 }, + { X86::VSUBPDZ128rrkz, X86::VSUBPDZ128rmkz, 0 }, + { X86::VSUBPSZ128rrkz, X86::VSUBPSZ128rmkz, 0 }, + { X86::VUNPCKHPDZ128rrkz, X86::VUNPCKHPDZ128rmkz, 0 }, + { X86::VUNPCKHPSZ128rrkz, X86::VUNPCKHPSZ128rmkz, 0 }, + { X86::VUNPCKLPDZ128rrkz, X86::VUNPCKLPDZ128rmkz, 0 }, + { X86::VUNPCKLPSZ128rrkz, X86::VUNPCKLPSZ128rmkz, 0 }, + { X86::VXORPDZ128rrkz, X86::VXORPDZ128rmkz, 0 }, + { X86::VXORPSZ128rrkz, X86::VXORPSZ128rmkz, 0 }, + + // AVX-512 masked foldable instructions + { X86::VPERMILPDZrik, X86::VPERMILPDZmik, 0 }, + 
{ X86::VPERMILPSZrik, X86::VPERMILPSZmik, 0 }, + { X86::VPERMPDZrik, X86::VPERMPDZmik, 0 }, + { X86::VPERMQZrik, X86::VPERMQZmik, 0 }, + { X86::VPMOVSXBDZrrk, X86::VPMOVSXBDZrmk, 0 }, + { X86::VPMOVSXBQZrrk, X86::VPMOVSXBQZrmk, TB_NO_REVERSE }, + { X86::VPMOVSXBWZrrk, X86::VPMOVSXBWZrmk, 0 }, + { X86::VPMOVSXDQZrrk, X86::VPMOVSXDQZrmk, 0 }, + { X86::VPMOVSXWDZrrk, X86::VPMOVSXWDZrmk, 0 }, + { X86::VPMOVSXWQZrrk, X86::VPMOVSXWQZrmk, 0 }, + { X86::VPMOVZXBDZrrk, X86::VPMOVZXBDZrmk, 0 }, + { X86::VPMOVZXBQZrrk, X86::VPMOVZXBQZrmk, TB_NO_REVERSE }, + { X86::VPMOVZXBWZrrk, X86::VPMOVZXBWZrmk, 0 }, + { X86::VPMOVZXDQZrrk, X86::VPMOVZXDQZrmk, 0 }, + { X86::VPMOVZXWDZrrk, X86::VPMOVZXWDZrmk, 0 }, + { X86::VPMOVZXWQZrrk, X86::VPMOVZXWQZrmk, 0 }, + { X86::VPSHUFDZrik, X86::VPSHUFDZmik, 0 }, + { X86::VPSHUFHWZrik, X86::VPSHUFHWZmik, 0 }, + { X86::VPSHUFLWZrik, X86::VPSHUFLWZmik, 0 }, + + // AVX-512VL 256-bit masked foldable instructions + { X86::VPERMILPDZ256rik, X86::VPERMILPDZ256mik, 0 }, + { X86::VPERMILPSZ256rik, X86::VPERMILPSZ256mik, 0 }, + { X86::VPERMPDZ256rik, X86::VPERMPDZ256mik, 0 }, + { X86::VPERMQZ256rik, X86::VPERMQZ256mik, 0 }, + { X86::VPMOVSXBDZ256rrk, X86::VPMOVSXBDZ256rmk, TB_NO_REVERSE }, + { X86::VPMOVSXBQZ256rrk, X86::VPMOVSXBQZ256rmk, TB_NO_REVERSE }, + { X86::VPMOVSXBWZ256rrk, X86::VPMOVSXBWZ256rmk, 0 }, + { X86::VPMOVSXDQZ256rrk, X86::VPMOVSXDQZ256rmk, 0 }, + { X86::VPMOVSXWDZ256rrk, X86::VPMOVSXWDZ256rmk, 0 }, + { X86::VPMOVSXWQZ256rrk, X86::VPMOVSXWQZ256rmk, TB_NO_REVERSE }, + { X86::VPMOVZXBDZ256rrk, X86::VPMOVZXBDZ256rmk, TB_NO_REVERSE }, + { X86::VPMOVZXBQZ256rrk, X86::VPMOVZXBQZ256rmk, TB_NO_REVERSE }, + { X86::VPMOVZXBWZ256rrk, X86::VPMOVZXBWZ256rmk, 0 }, + { X86::VPMOVZXDQZ256rrk, X86::VPMOVZXDQZ256rmk, 0 }, + { X86::VPMOVZXWDZ256rrk, X86::VPMOVZXWDZ256rmk, 0 }, + { X86::VPMOVZXWQZ256rrk, X86::VPMOVZXWQZ256rmk, TB_NO_REVERSE }, + { X86::VPSHUFDZ256rik, X86::VPSHUFDZ256mik, 0 }, + { X86::VPSHUFHWZ256rik, X86::VPSHUFHWZ256mik, 0 }, + { X86::VPSHUFLWZ256rik, X86::VPSHUFLWZ256mik, 0 }, + + // AVX-512VL 128-bit masked foldable instructions + { X86::VPERMILPDZ128rik, X86::VPERMILPDZ128mik, 0 }, + { X86::VPERMILPSZ128rik, X86::VPERMILPSZ128mik, 0 }, + { X86::VPMOVSXBDZ128rrk, X86::VPMOVSXBDZ128rmk, TB_NO_REVERSE }, + { X86::VPMOVSXBQZ128rrk, X86::VPMOVSXBQZ128rmk, TB_NO_REVERSE }, + { X86::VPMOVSXBWZ128rrk, X86::VPMOVSXBWZ128rmk, TB_NO_REVERSE }, + { X86::VPMOVSXDQZ128rrk, X86::VPMOVSXDQZ128rmk, TB_NO_REVERSE }, + { X86::VPMOVSXWDZ128rrk, X86::VPMOVSXWDZ128rmk, TB_NO_REVERSE }, + { X86::VPMOVSXWQZ128rrk, X86::VPMOVSXWQZ128rmk, TB_NO_REVERSE }, + { X86::VPMOVZXBDZ128rrk, X86::VPMOVZXBDZ128rmk, TB_NO_REVERSE }, + { X86::VPMOVZXBQZ128rrk, X86::VPMOVZXBQZ128rmk, TB_NO_REVERSE }, + { X86::VPMOVZXBWZ128rrk, X86::VPMOVZXBWZ128rmk, TB_NO_REVERSE }, + { X86::VPMOVZXDQZ128rrk, X86::VPMOVZXDQZ128rmk, TB_NO_REVERSE }, + { X86::VPMOVZXWDZ128rrk, X86::VPMOVZXWDZ128rmk, TB_NO_REVERSE }, + { X86::VPMOVZXWQZ128rrk, X86::VPMOVZXWQZ128rmk, TB_NO_REVERSE }, + { X86::VPSHUFDZ128rik, X86::VPSHUFDZ128mik, 0 }, + { X86::VPSHUFHWZ128rik, X86::VPSHUFHWZ128mik, 0 }, + { X86::VPSHUFLWZ128rik, X86::VPSHUFLWZ128mik, 0 }, }; for (X86MemoryFoldTableEntry Entry : MemoryFoldTable3) { @@ -2008,47 +2630,348 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) // Index 3, folded load Entry.Flags | TB_INDEX_3 | TB_FOLDED_LOAD); } + auto I = X86InstrFMA3Info::rm_begin(); + auto E = X86InstrFMA3Info::rm_end(); + for (; I != E; ++I) { + if (!I.getGroup()->isKMasked()) { + // Intrinsic forms need to pass TB_NO_REVERSE. 
+ if (I.getGroup()->isIntrinsic()) { + AddTableEntry(RegOp2MemOpTable3, MemOp2RegOpTable, + I.getRegOpcode(), I.getMemOpcode(), + TB_ALIGN_NONE | TB_INDEX_3 | TB_FOLDED_LOAD | TB_NO_REVERSE); + } else { + AddTableEntry(RegOp2MemOpTable3, MemOp2RegOpTable, + I.getRegOpcode(), I.getMemOpcode(), + TB_ALIGN_NONE | TB_INDEX_3 | TB_FOLDED_LOAD); + } + } + } static const X86MemoryFoldTableEntry MemoryFoldTable4[] = { - // AVX-512 foldable instructions - { X86::VADDPSZrrk, X86::VADDPSZrmk, 0 }, + // AVX-512 foldable masked instructions { X86::VADDPDZrrk, X86::VADDPDZrmk, 0 }, - { X86::VSUBPSZrrk, X86::VSUBPSZrmk, 0 }, - { X86::VSUBPDZrrk, X86::VSUBPDZrmk, 0 }, - { X86::VMULPSZrrk, X86::VMULPSZrmk, 0 }, - { X86::VMULPDZrrk, X86::VMULPDZrmk, 0 }, - { X86::VDIVPSZrrk, X86::VDIVPSZrmk, 0 }, + { X86::VADDPSZrrk, X86::VADDPSZrmk, 0 }, + { X86::VALIGNDZrrik, X86::VALIGNDZrmik, 0 }, + { X86::VALIGNQZrrik, X86::VALIGNQZrmik, 0 }, + { X86::VANDNPDZrrk, X86::VANDNPDZrmk, 0 }, + { X86::VANDNPSZrrk, X86::VANDNPSZrmk, 0 }, + { X86::VANDPDZrrk, X86::VANDPDZrmk, 0 }, + { X86::VANDPSZrrk, X86::VANDPSZrmk, 0 }, { X86::VDIVPDZrrk, X86::VDIVPDZrmk, 0 }, - { X86::VMINPSZrrk, X86::VMINPSZrmk, 0 }, - { X86::VMINPDZrrk, X86::VMINPDZrmk, 0 }, - { X86::VMAXPSZrrk, X86::VMAXPSZrmk, 0 }, + { X86::VDIVPSZrrk, X86::VDIVPSZrmk, 0 }, + { X86::VINSERTF32x4Zrrk, X86::VINSERTF32x4Zrmk, 0 }, + { X86::VINSERTF32x8Zrrk, X86::VINSERTF32x8Zrmk, 0 }, + { X86::VINSERTF64x2Zrrk, X86::VINSERTF64x2Zrmk, 0 }, + { X86::VINSERTF64x4Zrrk, X86::VINSERTF64x4Zrmk, 0 }, + { X86::VINSERTI32x4Zrrk, X86::VINSERTI32x4Zrmk, 0 }, + { X86::VINSERTI32x8Zrrk, X86::VINSERTI32x8Zrmk, 0 }, + { X86::VINSERTI64x2Zrrk, X86::VINSERTI64x2Zrmk, 0 }, + { X86::VINSERTI64x4Zrrk, X86::VINSERTI64x4Zrmk, 0 }, + { X86::VMAXCPDZrrk, X86::VMAXCPDZrmk, 0 }, + { X86::VMAXCPSZrrk, X86::VMAXCPSZrmk, 0 }, { X86::VMAXPDZrrk, X86::VMAXPDZrmk, 0 }, - // AVX-512{F,VL} foldable instructions 256-bit - { X86::VADDPSZ256rrk, X86::VADDPSZ256rmk, 0 }, + { X86::VMAXPSZrrk, X86::VMAXPSZrmk, 0 }, + { X86::VMINCPDZrrk, X86::VMINCPDZrmk, 0 }, + { X86::VMINCPSZrrk, X86::VMINCPSZrmk, 0 }, + { X86::VMINPDZrrk, X86::VMINPDZrmk, 0 }, + { X86::VMINPSZrrk, X86::VMINPSZrmk, 0 }, + { X86::VMULPDZrrk, X86::VMULPDZrmk, 0 }, + { X86::VMULPSZrrk, X86::VMULPSZrmk, 0 }, + { X86::VORPDZrrk, X86::VORPDZrmk, 0 }, + { X86::VORPSZrrk, X86::VORPSZrmk, 0 }, + { X86::VPADDBZrrk, X86::VPADDBZrmk, 0 }, + { X86::VPADDDZrrk, X86::VPADDDZrmk, 0 }, + { X86::VPADDQZrrk, X86::VPADDQZrmk, 0 }, + { X86::VPADDSBZrrk, X86::VPADDSBZrmk, 0 }, + { X86::VPADDSWZrrk, X86::VPADDSWZrmk, 0 }, + { X86::VPADDUSBZrrk, X86::VPADDUSBZrmk, 0 }, + { X86::VPADDUSWZrrk, X86::VPADDUSWZrmk, 0 }, + { X86::VPADDWZrrk, X86::VPADDWZrmk, 0 }, + { X86::VPALIGNRZrrik, X86::VPALIGNRZrmik, 0 }, + { X86::VPANDDZrrk, X86::VPANDDZrmk, 0 }, + { X86::VPANDNDZrrk, X86::VPANDNDZrmk, 0 }, + { X86::VPANDNQZrrk, X86::VPANDNQZrmk, 0 }, + { X86::VPANDQZrrk, X86::VPANDQZrmk, 0 }, + { X86::VPERMBZrrk, X86::VPERMBZrmk, 0 }, + { X86::VPERMDZrrk, X86::VPERMDZrmk, 0 }, + { X86::VPERMI2Brrk, X86::VPERMI2Brmk, 0 }, + { X86::VPERMI2Drrk, X86::VPERMI2Drmk, 0 }, + { X86::VPERMI2PSrrk, X86::VPERMI2PSrmk, 0 }, + { X86::VPERMI2PDrrk, X86::VPERMI2PDrmk, 0 }, + { X86::VPERMI2Qrrk, X86::VPERMI2Qrmk, 0 }, + { X86::VPERMI2Wrrk, X86::VPERMI2Wrmk, 0 }, + { X86::VPERMILPDZrrk, X86::VPERMILPDZrmk, 0 }, + { X86::VPERMILPSZrrk, X86::VPERMILPSZrmk, 0 }, + { X86::VPERMPDZrrk, X86::VPERMPDZrmk, 0 }, + { X86::VPERMPSZrrk, X86::VPERMPSZrmk, 0 }, + { X86::VPERMQZrrk, X86::VPERMQZrmk, 0 }, + { 
X86::VPERMT2Brrk, X86::VPERMT2Brmk, 0 }, + { X86::VPERMT2Drrk, X86::VPERMT2Drmk, 0 }, + { X86::VPERMT2PSrrk, X86::VPERMT2PSrmk, 0 }, + { X86::VPERMT2PDrrk, X86::VPERMT2PDrmk, 0 }, + { X86::VPERMT2Qrrk, X86::VPERMT2Qrmk, 0 }, + { X86::VPERMT2Wrrk, X86::VPERMT2Wrmk, 0 }, + { X86::VPERMWZrrk, X86::VPERMWZrmk, 0 }, + { X86::VPMADDUBSWZrrk, X86::VPMADDUBSWZrmk, 0 }, + { X86::VPMADDWDZrrk, X86::VPMADDWDZrmk, 0 }, + { X86::VPORDZrrk, X86::VPORDZrmk, 0 }, + { X86::VPORQZrrk, X86::VPORQZrmk, 0 }, + { X86::VPSHUFBZrrk, X86::VPSHUFBZrmk, 0 }, + { X86::VPSUBBZrrk, X86::VPSUBBZrmk, 0 }, + { X86::VPSUBDZrrk, X86::VPSUBDZrmk, 0 }, + { X86::VPSUBQZrrk, X86::VPSUBQZrmk, 0 }, + { X86::VPSUBSBZrrk, X86::VPSUBSBZrmk, 0 }, + { X86::VPSUBSWZrrk, X86::VPSUBSWZrmk, 0 }, + { X86::VPSUBUSBZrrk, X86::VPSUBUSBZrmk, 0 }, + { X86::VPSUBUSWZrrk, X86::VPSUBUSWZrmk, 0 }, + { X86::VPTERNLOGDZrrik, X86::VPTERNLOGDZrmik, 0 }, + { X86::VPTERNLOGQZrrik, X86::VPTERNLOGQZrmik, 0 }, + { X86::VPUNPCKHBWZrrk, X86::VPUNPCKHBWZrmk, 0 }, + { X86::VPUNPCKHDQZrrk, X86::VPUNPCKHDQZrmk, 0 }, + { X86::VPUNPCKHQDQZrrk, X86::VPUNPCKHQDQZrmk, 0 }, + { X86::VPUNPCKHWDZrrk, X86::VPUNPCKHWDZrmk, 0 }, + { X86::VPUNPCKLBWZrrk, X86::VPUNPCKLBWZrmk, 0 }, + { X86::VPUNPCKLDQZrrk, X86::VPUNPCKLDQZrmk, 0 }, + { X86::VPUNPCKLQDQZrrk, X86::VPUNPCKLQDQZrmk, 0 }, + { X86::VPUNPCKLWDZrrk, X86::VPUNPCKLWDZrmk, 0 }, + { X86::VPXORDZrrk, X86::VPXORDZrmk, 0 }, + { X86::VPXORQZrrk, X86::VPXORQZrmk, 0 }, + { X86::VSUBPDZrrk, X86::VSUBPDZrmk, 0 }, + { X86::VSUBPSZrrk, X86::VSUBPSZrmk, 0 }, + { X86::VUNPCKHPDZrrk, X86::VUNPCKHPDZrmk, 0 }, + { X86::VUNPCKHPSZrrk, X86::VUNPCKHPSZrmk, 0 }, + { X86::VUNPCKLPDZrrk, X86::VUNPCKLPDZrmk, 0 }, + { X86::VUNPCKLPSZrrk, X86::VUNPCKLPSZrmk, 0 }, + { X86::VXORPDZrrk, X86::VXORPDZrmk, 0 }, + { X86::VXORPSZrrk, X86::VXORPSZrmk, 0 }, + + // AVX-512{F,VL} foldable masked instructions 256-bit { X86::VADDPDZ256rrk, X86::VADDPDZ256rmk, 0 }, - { X86::VSUBPSZ256rrk, X86::VSUBPSZ256rmk, 0 }, - { X86::VSUBPDZ256rrk, X86::VSUBPDZ256rmk, 0 }, - { X86::VMULPSZ256rrk, X86::VMULPSZ256rmk, 0 }, - { X86::VMULPDZ256rrk, X86::VMULPDZ256rmk, 0 }, - { X86::VDIVPSZ256rrk, X86::VDIVPSZ256rmk, 0 }, + { X86::VADDPSZ256rrk, X86::VADDPSZ256rmk, 0 }, + { X86::VALIGNDZ256rrik, X86::VALIGNDZ256rmik, 0 }, + { X86::VALIGNQZ256rrik, X86::VALIGNQZ256rmik, 0 }, + { X86::VANDNPDZ256rrk, X86::VANDNPDZ256rmk, 0 }, + { X86::VANDNPSZ256rrk, X86::VANDNPSZ256rmk, 0 }, + { X86::VANDPDZ256rrk, X86::VANDPDZ256rmk, 0 }, + { X86::VANDPSZ256rrk, X86::VANDPSZ256rmk, 0 }, { X86::VDIVPDZ256rrk, X86::VDIVPDZ256rmk, 0 }, - { X86::VMINPSZ256rrk, X86::VMINPSZ256rmk, 0 }, - { X86::VMINPDZ256rrk, X86::VMINPDZ256rmk, 0 }, - { X86::VMAXPSZ256rrk, X86::VMAXPSZ256rmk, 0 }, + { X86::VDIVPSZ256rrk, X86::VDIVPSZ256rmk, 0 }, + { X86::VINSERTF32x4Z256rrk,X86::VINSERTF32x4Z256rmk, 0 }, + { X86::VINSERTF64x2Z256rrk,X86::VINSERTF64x2Z256rmk, 0 }, + { X86::VINSERTI32x4Z256rrk,X86::VINSERTI32x4Z256rmk, 0 }, + { X86::VINSERTI64x2Z256rrk,X86::VINSERTI64x2Z256rmk, 0 }, + { X86::VMAXCPDZ256rrk, X86::VMAXCPDZ256rmk, 0 }, + { X86::VMAXCPSZ256rrk, X86::VMAXCPSZ256rmk, 0 }, { X86::VMAXPDZ256rrk, X86::VMAXPDZ256rmk, 0 }, + { X86::VMAXPSZ256rrk, X86::VMAXPSZ256rmk, 0 }, + { X86::VMINCPDZ256rrk, X86::VMINCPDZ256rmk, 0 }, + { X86::VMINCPSZ256rrk, X86::VMINCPSZ256rmk, 0 }, + { X86::VMINPDZ256rrk, X86::VMINPDZ256rmk, 0 }, + { X86::VMINPSZ256rrk, X86::VMINPSZ256rmk, 0 }, + { X86::VMULPDZ256rrk, X86::VMULPDZ256rmk, 0 }, + { X86::VMULPSZ256rrk, X86::VMULPSZ256rmk, 0 }, + { X86::VORPDZ256rrk, X86::VORPDZ256rmk, 0 }, 
+ { X86::VORPSZ256rrk, X86::VORPSZ256rmk, 0 }, + { X86::VPADDBZ256rrk, X86::VPADDBZ256rmk, 0 }, + { X86::VPADDDZ256rrk, X86::VPADDDZ256rmk, 0 }, + { X86::VPADDQZ256rrk, X86::VPADDQZ256rmk, 0 }, + { X86::VPADDSBZ256rrk, X86::VPADDSBZ256rmk, 0 }, + { X86::VPADDSWZ256rrk, X86::VPADDSWZ256rmk, 0 }, + { X86::VPADDUSBZ256rrk, X86::VPADDUSBZ256rmk, 0 }, + { X86::VPADDUSWZ256rrk, X86::VPADDUSWZ256rmk, 0 }, + { X86::VPADDWZ256rrk, X86::VPADDWZ256rmk, 0 }, + { X86::VPALIGNRZ256rrik, X86::VPALIGNRZ256rmik, 0 }, + { X86::VPANDDZ256rrk, X86::VPANDDZ256rmk, 0 }, + { X86::VPANDNDZ256rrk, X86::VPANDNDZ256rmk, 0 }, + { X86::VPANDNQZ256rrk, X86::VPANDNQZ256rmk, 0 }, + { X86::VPANDQZ256rrk, X86::VPANDQZ256rmk, 0 }, + { X86::VPERMBZ256rrk, X86::VPERMBZ256rmk, 0 }, + { X86::VPERMDZ256rrk, X86::VPERMDZ256rmk, 0 }, + { X86::VPERMI2B256rrk, X86::VPERMI2B256rmk, 0 }, + { X86::VPERMI2D256rrk, X86::VPERMI2D256rmk, 0 }, + { X86::VPERMI2PD256rrk, X86::VPERMI2PD256rmk, 0 }, + { X86::VPERMI2PS256rrk, X86::VPERMI2PS256rmk, 0 }, + { X86::VPERMI2Q256rrk, X86::VPERMI2Q256rmk, 0 }, + { X86::VPERMI2W256rrk, X86::VPERMI2W256rmk, 0 }, + { X86::VPERMILPDZ256rrk, X86::VPERMILPDZ256rmk, 0 }, + { X86::VPERMILPSZ256rrk, X86::VPERMILPSZ256rmk, 0 }, + { X86::VPERMPDZ256rrk, X86::VPERMPDZ256rmk, 0 }, + { X86::VPERMPSZ256rrk, X86::VPERMPSZ256rmk, 0 }, + { X86::VPERMQZ256rrk, X86::VPERMQZ256rmk, 0 }, + { X86::VPERMT2B256rrk, X86::VPERMT2B256rmk, 0 }, + { X86::VPERMT2D256rrk, X86::VPERMT2D256rmk, 0 }, + { X86::VPERMT2PD256rrk, X86::VPERMT2PD256rmk, 0 }, + { X86::VPERMT2PS256rrk, X86::VPERMT2PS256rmk, 0 }, + { X86::VPERMT2Q256rrk, X86::VPERMT2Q256rmk, 0 }, + { X86::VPERMT2W256rrk, X86::VPERMT2W256rmk, 0 }, + { X86::VPERMWZ256rrk, X86::VPERMWZ256rmk, 0 }, + { X86::VPMADDUBSWZ256rrk, X86::VPMADDUBSWZ256rmk, 0 }, + { X86::VPMADDWDZ256rrk, X86::VPMADDWDZ256rmk, 0 }, + { X86::VPORDZ256rrk, X86::VPORDZ256rmk, 0 }, + { X86::VPORQZ256rrk, X86::VPORQZ256rmk, 0 }, + { X86::VPSHUFBZ256rrk, X86::VPSHUFBZ256rmk, 0 }, + { X86::VPSUBBZ256rrk, X86::VPSUBBZ256rmk, 0 }, + { X86::VPSUBDZ256rrk, X86::VPSUBDZ256rmk, 0 }, + { X86::VPSUBQZ256rrk, X86::VPSUBQZ256rmk, 0 }, + { X86::VPSUBSBZ256rrk, X86::VPSUBSBZ256rmk, 0 }, + { X86::VPSUBSWZ256rrk, X86::VPSUBSWZ256rmk, 0 }, + { X86::VPSUBUSBZ256rrk, X86::VPSUBUSBZ256rmk, 0 }, + { X86::VPSUBUSWZ256rrk, X86::VPSUBUSWZ256rmk, 0 }, + { X86::VPSUBWZ256rrk, X86::VPSUBWZ256rmk, 0 }, + { X86::VPTERNLOGDZ256rrik, X86::VPTERNLOGDZ256rmik, 0 }, + { X86::VPTERNLOGQZ256rrik, X86::VPTERNLOGQZ256rmik, 0 }, + { X86::VPUNPCKHBWZ256rrk, X86::VPUNPCKHBWZ256rmk, 0 }, + { X86::VPUNPCKHDQZ256rrk, X86::VPUNPCKHDQZ256rmk, 0 }, + { X86::VPUNPCKHQDQZ256rrk, X86::VPUNPCKHQDQZ256rmk, 0 }, + { X86::VPUNPCKHWDZ256rrk, X86::VPUNPCKHWDZ256rmk, 0 }, + { X86::VPUNPCKLBWZ256rrk, X86::VPUNPCKLBWZ256rmk, 0 }, + { X86::VPUNPCKLDQZ256rrk, X86::VPUNPCKLDQZ256rmk, 0 }, + { X86::VPUNPCKLQDQZ256rrk, X86::VPUNPCKLQDQZ256rmk, 0 }, + { X86::VPUNPCKLWDZ256rrk, X86::VPUNPCKLWDZ256rmk, 0 }, + { X86::VPXORDZ256rrk, X86::VPXORDZ256rmk, 0 }, + { X86::VPXORQZ256rrk, X86::VPXORQZ256rmk, 0 }, + { X86::VSUBPDZ256rrk, X86::VSUBPDZ256rmk, 0 }, + { X86::VSUBPSZ256rrk, X86::VSUBPSZ256rmk, 0 }, + { X86::VUNPCKHPDZ256rrk, X86::VUNPCKHPDZ256rmk, 0 }, + { X86::VUNPCKHPSZ256rrk, X86::VUNPCKHPSZ256rmk, 0 }, + { X86::VUNPCKLPDZ256rrk, X86::VUNPCKLPDZ256rmk, 0 }, + { X86::VUNPCKLPSZ256rrk, X86::VUNPCKLPSZ256rmk, 0 }, + { X86::VXORPDZ256rrk, X86::VXORPDZ256rmk, 0 }, + { X86::VXORPSZ256rrk, X86::VXORPSZ256rmk, 0 }, + // AVX-512{F,VL} foldable instructions 128-bit - { 
X86::VADDPSZ128rrk, X86::VADDPSZ128rmk, 0 }, { X86::VADDPDZ128rrk, X86::VADDPDZ128rmk, 0 }, - { X86::VSUBPSZ128rrk, X86::VSUBPSZ128rmk, 0 }, - { X86::VSUBPDZ128rrk, X86::VSUBPDZ128rmk, 0 }, - { X86::VMULPSZ128rrk, X86::VMULPSZ128rmk, 0 }, - { X86::VMULPDZ128rrk, X86::VMULPDZ128rmk, 0 }, - { X86::VDIVPSZ128rrk, X86::VDIVPSZ128rmk, 0 }, + { X86::VADDPSZ128rrk, X86::VADDPSZ128rmk, 0 }, + { X86::VALIGNDZ128rrik, X86::VALIGNDZ128rmik, 0 }, + { X86::VALIGNQZ128rrik, X86::VALIGNQZ128rmik, 0 }, + { X86::VANDNPDZ128rrk, X86::VANDNPDZ128rmk, 0 }, + { X86::VANDNPSZ128rrk, X86::VANDNPSZ128rmk, 0 }, + { X86::VANDPDZ128rrk, X86::VANDPDZ128rmk, 0 }, + { X86::VANDPSZ128rrk, X86::VANDPSZ128rmk, 0 }, { X86::VDIVPDZ128rrk, X86::VDIVPDZ128rmk, 0 }, - { X86::VMINPSZ128rrk, X86::VMINPSZ128rmk, 0 }, - { X86::VMINPDZ128rrk, X86::VMINPDZ128rmk, 0 }, + { X86::VDIVPSZ128rrk, X86::VDIVPSZ128rmk, 0 }, + { X86::VMAXCPDZ128rrk, X86::VMAXCPDZ128rmk, 0 }, + { X86::VMAXCPSZ128rrk, X86::VMAXCPSZ128rmk, 0 }, + { X86::VMAXPDZ128rrk, X86::VMAXPDZ128rmk, 0 }, { X86::VMAXPSZ128rrk, X86::VMAXPSZ128rmk, 0 }, - { X86::VMAXPDZ128rrk, X86::VMAXPDZ128rmk, 0 } + { X86::VMINCPDZ128rrk, X86::VMINCPDZ128rmk, 0 }, + { X86::VMINCPSZ128rrk, X86::VMINCPSZ128rmk, 0 }, + { X86::VMINPDZ128rrk, X86::VMINPDZ128rmk, 0 }, + { X86::VMINPSZ128rrk, X86::VMINPSZ128rmk, 0 }, + { X86::VMULPDZ128rrk, X86::VMULPDZ128rmk, 0 }, + { X86::VMULPSZ128rrk, X86::VMULPSZ128rmk, 0 }, + { X86::VORPDZ128rrk, X86::VORPDZ128rmk, 0 }, + { X86::VORPSZ128rrk, X86::VORPSZ128rmk, 0 }, + { X86::VPADDBZ128rrk, X86::VPADDBZ128rmk, 0 }, + { X86::VPADDDZ128rrk, X86::VPADDDZ128rmk, 0 }, + { X86::VPADDQZ128rrk, X86::VPADDQZ128rmk, 0 }, + { X86::VPADDSBZ128rrk, X86::VPADDSBZ128rmk, 0 }, + { X86::VPADDSWZ128rrk, X86::VPADDSWZ128rmk, 0 }, + { X86::VPADDUSBZ128rrk, X86::VPADDUSBZ128rmk, 0 }, + { X86::VPADDUSWZ128rrk, X86::VPADDUSWZ128rmk, 0 }, + { X86::VPADDWZ128rrk, X86::VPADDWZ128rmk, 0 }, + { X86::VPALIGNRZ128rrik, X86::VPALIGNRZ128rmik, 0 }, + { X86::VPANDDZ128rrk, X86::VPANDDZ128rmk, 0 }, + { X86::VPANDNDZ128rrk, X86::VPANDNDZ128rmk, 0 }, + { X86::VPANDNQZ128rrk, X86::VPANDNQZ128rmk, 0 }, + { X86::VPANDQZ128rrk, X86::VPANDQZ128rmk, 0 }, + { X86::VPERMBZ128rrk, X86::VPERMBZ128rmk, 0 }, + { X86::VPERMI2B128rrk, X86::VPERMI2B128rmk, 0 }, + { X86::VPERMI2D128rrk, X86::VPERMI2D128rmk, 0 }, + { X86::VPERMI2PD128rrk, X86::VPERMI2PD128rmk, 0 }, + { X86::VPERMI2PS128rrk, X86::VPERMI2PS128rmk, 0 }, + { X86::VPERMI2Q128rrk, X86::VPERMI2Q128rmk, 0 }, + { X86::VPERMI2W128rrk, X86::VPERMI2W128rmk, 0 }, + { X86::VPERMILPDZ128rrk, X86::VPERMILPDZ128rmk, 0 }, + { X86::VPERMILPSZ128rrk, X86::VPERMILPSZ128rmk, 0 }, + { X86::VPERMT2B128rrk, X86::VPERMT2B128rmk, 0 }, + { X86::VPERMT2D128rrk, X86::VPERMT2D128rmk, 0 }, + { X86::VPERMT2PD128rrk, X86::VPERMT2PD128rmk, 0 }, + { X86::VPERMT2PS128rrk, X86::VPERMT2PS128rmk, 0 }, + { X86::VPERMT2Q128rrk, X86::VPERMT2Q128rmk, 0 }, + { X86::VPERMT2W128rrk, X86::VPERMT2W128rmk, 0 }, + { X86::VPERMWZ128rrk, X86::VPERMWZ128rmk, 0 }, + { X86::VPMADDUBSWZ128rrk, X86::VPMADDUBSWZ128rmk, 0 }, + { X86::VPMADDWDZ128rrk, X86::VPMADDWDZ128rmk, 0 }, + { X86::VPORDZ128rrk, X86::VPORDZ128rmk, 0 }, + { X86::VPORQZ128rrk, X86::VPORQZ128rmk, 0 }, + { X86::VPSHUFBZ128rrk, X86::VPSHUFBZ128rmk, 0 }, + { X86::VPSUBBZ128rrk, X86::VPSUBBZ128rmk, 0 }, + { X86::VPSUBDZ128rrk, X86::VPSUBDZ128rmk, 0 }, + { X86::VPSUBQZ128rrk, X86::VPSUBQZ128rmk, 0 }, + { X86::VPSUBSBZ128rrk, X86::VPSUBSBZ128rmk, 0 }, + { X86::VPSUBSWZ128rrk, X86::VPSUBSWZ128rmk, 0 }, + { X86::VPSUBUSBZ128rrk, 
X86::VPSUBUSBZ128rmk, 0 }, + { X86::VPSUBUSWZ128rrk, X86::VPSUBUSWZ128rmk, 0 }, + { X86::VPSUBWZ128rrk, X86::VPSUBWZ128rmk, 0 }, + { X86::VPTERNLOGDZ128rrik, X86::VPTERNLOGDZ128rmik, 0 }, + { X86::VPTERNLOGQZ128rrik, X86::VPTERNLOGQZ128rmik, 0 }, + { X86::VPUNPCKHBWZ128rrk, X86::VPUNPCKHBWZ128rmk, 0 }, + { X86::VPUNPCKHDQZ128rrk, X86::VPUNPCKHDQZ128rmk, 0 }, + { X86::VPUNPCKHQDQZ128rrk, X86::VPUNPCKHQDQZ128rmk, 0 }, + { X86::VPUNPCKHWDZ128rrk, X86::VPUNPCKHWDZ128rmk, 0 }, + { X86::VPUNPCKLBWZ128rrk, X86::VPUNPCKLBWZ128rmk, 0 }, + { X86::VPUNPCKLDQZ128rrk, X86::VPUNPCKLDQZ128rmk, 0 }, + { X86::VPUNPCKLQDQZ128rrk, X86::VPUNPCKLQDQZ128rmk, 0 }, + { X86::VPUNPCKLWDZ128rrk, X86::VPUNPCKLWDZ128rmk, 0 }, + { X86::VPXORDZ128rrk, X86::VPXORDZ128rmk, 0 }, + { X86::VPXORQZ128rrk, X86::VPXORQZ128rmk, 0 }, + { X86::VSUBPDZ128rrk, X86::VSUBPDZ128rmk, 0 }, + { X86::VSUBPSZ128rrk, X86::VSUBPSZ128rmk, 0 }, + { X86::VUNPCKHPDZ128rrk, X86::VUNPCKHPDZ128rmk, 0 }, + { X86::VUNPCKHPSZ128rrk, X86::VUNPCKHPSZ128rmk, 0 }, + { X86::VUNPCKLPDZ128rrk, X86::VUNPCKLPDZ128rmk, 0 }, + { X86::VUNPCKLPSZ128rrk, X86::VUNPCKLPSZ128rmk, 0 }, + { X86::VXORPDZ128rrk, X86::VXORPDZ128rmk, 0 }, + { X86::VXORPSZ128rrk, X86::VXORPSZ128rmk, 0 }, + + // 512-bit three source instructions with zero masking. + { X86::VPERMI2Brrkz, X86::VPERMI2Brmkz, 0 }, + { X86::VPERMI2Drrkz, X86::VPERMI2Drmkz, 0 }, + { X86::VPERMI2PSrrkz, X86::VPERMI2PSrmkz, 0 }, + { X86::VPERMI2PDrrkz, X86::VPERMI2PDrmkz, 0 }, + { X86::VPERMI2Qrrkz, X86::VPERMI2Qrmkz, 0 }, + { X86::VPERMI2Wrrkz, X86::VPERMI2Wrmkz, 0 }, + { X86::VPERMT2Brrkz, X86::VPERMT2Brmkz, 0 }, + { X86::VPERMT2Drrkz, X86::VPERMT2Drmkz, 0 }, + { X86::VPERMT2PSrrkz, X86::VPERMT2PSrmkz, 0 }, + { X86::VPERMT2PDrrkz, X86::VPERMT2PDrmkz, 0 }, + { X86::VPERMT2Qrrkz, X86::VPERMT2Qrmkz, 0 }, + { X86::VPERMT2Wrrkz, X86::VPERMT2Wrmkz, 0 }, + { X86::VPTERNLOGDZrrikz, X86::VPTERNLOGDZrmikz, 0 }, + { X86::VPTERNLOGQZrrikz, X86::VPTERNLOGQZrmikz, 0 }, + + // 256-bit three source instructions with zero masking. + { X86::VPERMI2B256rrkz, X86::VPERMI2B256rmkz, 0 }, + { X86::VPERMI2D256rrkz, X86::VPERMI2D256rmkz, 0 }, + { X86::VPERMI2PD256rrkz, X86::VPERMI2PD256rmkz, 0 }, + { X86::VPERMI2PS256rrkz, X86::VPERMI2PS256rmkz, 0 }, + { X86::VPERMI2Q256rrkz, X86::VPERMI2Q256rmkz, 0 }, + { X86::VPERMI2W256rrkz, X86::VPERMI2W256rmkz, 0 }, + { X86::VPERMT2B256rrkz, X86::VPERMT2B256rmkz, 0 }, + { X86::VPERMT2D256rrkz, X86::VPERMT2D256rmkz, 0 }, + { X86::VPERMT2PD256rrkz, X86::VPERMT2PD256rmkz, 0 }, + { X86::VPERMT2PS256rrkz, X86::VPERMT2PS256rmkz, 0 }, + { X86::VPERMT2Q256rrkz, X86::VPERMT2Q256rmkz, 0 }, + { X86::VPERMT2W256rrkz, X86::VPERMT2W256rmkz, 0 }, + { X86::VPTERNLOGDZ256rrikz,X86::VPTERNLOGDZ256rmikz, 0 }, + { X86::VPTERNLOGQZ256rrikz,X86::VPTERNLOGQZ256rmikz, 0 }, + + // 128-bit three source instructions with zero masking. 
+ { X86::VPERMI2B128rrkz, X86::VPERMI2B128rmkz, 0 }, + { X86::VPERMI2D128rrkz, X86::VPERMI2D128rmkz, 0 }, + { X86::VPERMI2PD128rrkz, X86::VPERMI2PD128rmkz, 0 }, + { X86::VPERMI2PS128rrkz, X86::VPERMI2PS128rmkz, 0 }, + { X86::VPERMI2Q128rrkz, X86::VPERMI2Q128rmkz, 0 }, + { X86::VPERMI2W128rrkz, X86::VPERMI2W128rmkz, 0 }, + { X86::VPERMT2B128rrkz, X86::VPERMT2B128rmkz, 0 }, + { X86::VPERMT2D128rrkz, X86::VPERMT2D128rmkz, 0 }, + { X86::VPERMT2PD128rrkz, X86::VPERMT2PD128rmkz, 0 }, + { X86::VPERMT2PS128rrkz, X86::VPERMT2PS128rmkz, 0 }, + { X86::VPERMT2Q128rrkz, X86::VPERMT2Q128rmkz, 0 }, + { X86::VPERMT2W128rrkz, X86::VPERMT2W128rmkz, 0 }, + { X86::VPTERNLOGDZ128rrikz,X86::VPTERNLOGDZ128rmikz, 0 }, + { X86::VPTERNLOGQZ128rrikz,X86::VPTERNLOGQZ128rmikz, 0 }, }; for (X86MemoryFoldTableEntry Entry : MemoryFoldTable4) { @@ -2057,21 +2980,35 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) // Index 4, folded load Entry.Flags | TB_INDEX_4 | TB_FOLDED_LOAD); } + for (I = X86InstrFMA3Info::rm_begin(); I != E; ++I) { + if (I.getGroup()->isKMasked()) { + // Intrinsics need to pass TB_NO_REVERSE. + if (I.getGroup()->isIntrinsic()) { + AddTableEntry(RegOp2MemOpTable4, MemOp2RegOpTable, + I.getRegOpcode(), I.getMemOpcode(), + TB_ALIGN_NONE | TB_INDEX_4 | TB_FOLDED_LOAD | TB_NO_REVERSE); + } else { + AddTableEntry(RegOp2MemOpTable4, MemOp2RegOpTable, + I.getRegOpcode(), I.getMemOpcode(), + TB_ALIGN_NONE | TB_INDEX_4 | TB_FOLDED_LOAD); + } + } + } } void X86InstrInfo::AddTableEntry(RegOp2MemOpTableType &R2MTable, MemOp2RegOpTableType &M2RTable, uint16_t RegOp, uint16_t MemOp, uint16_t Flags) { - if ((Flags & TB_NO_FORWARD) == 0) { - assert(!R2MTable.count(RegOp) && "Duplicate entry!"); - R2MTable[RegOp] = std::make_pair(MemOp, Flags); - } - if ((Flags & TB_NO_REVERSE) == 0) { - assert(!M2RTable.count(MemOp) && - "Duplicated entries in unfolding maps?"); - M2RTable[MemOp] = std::make_pair(RegOp, Flags); - } + if ((Flags & TB_NO_FORWARD) == 0) { + assert(!R2MTable.count(RegOp) && "Duplicate entry!"); + R2MTable[RegOp] = std::make_pair(MemOp, Flags); + } + if ((Flags & TB_NO_REVERSE) == 0) { + assert(!M2RTable.count(MemOp) && + "Duplicated entries in unfolding maps?"); + M2RTable[MemOp] = std::make_pair(RegOp, Flags); + } } bool @@ -2235,9 +3172,13 @@ static bool isFrameLoadOpcode(int Opcode) { case X86::VMOVAPSZrm: case X86::VMOVAPSZ128rm: case X86::VMOVAPSZ256rm: + case X86::VMOVAPSZ128rm_NOVLX: + case X86::VMOVAPSZ256rm_NOVLX: case X86::VMOVUPSZrm: case X86::VMOVUPSZ128rm: case X86::VMOVUPSZ256rm: + case X86::VMOVUPSZ128rm_NOVLX: + case X86::VMOVUPSZ256rm_NOVLX: case X86::VMOVAPDZrm: case X86::VMOVAPDZ128rm: case X86::VMOVAPDZ256rm: @@ -2305,9 +3246,13 @@ static bool isFrameStoreOpcode(int Opcode) { case X86::VMOVUPSZmr: case X86::VMOVUPSZ128mr: case X86::VMOVUPSZ256mr: + case X86::VMOVUPSZ128mr_NOVLX: + case X86::VMOVUPSZ256mr_NOVLX: case X86::VMOVAPSZmr: case X86::VMOVAPSZ128mr: case X86::VMOVAPSZ256mr: + case X86::VMOVAPSZ128mr_NOVLX: + case X86::VMOVAPSZ256mr_NOVLX: case X86::VMOVUPDZmr: case X86::VMOVUPDZ128mr: case X86::VMOVUPDZ256mr: @@ -2409,6 +3354,7 @@ bool X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI, switch (MI.getOpcode()) { default: break; case X86::MOV8rm: + case X86::MOV8rm_NOREX: case X86::MOV16rm: case X86::MOV32rm: case X86::MOV64rm: @@ -2418,6 +3364,7 @@ bool X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI, case X86::MOVAPSrm: case X86::MOVUPSrm: case X86::MOVAPDrm: + case X86::MOVUPDrm: case X86::MOVDQArm: case X86::MOVDQUrm: case 
X86::VMOVSSrm: @@ -2425,25 +3372,27 @@ bool X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI, case X86::VMOVAPSrm: case X86::VMOVUPSrm: case X86::VMOVAPDrm: + case X86::VMOVUPDrm: case X86::VMOVDQArm: case X86::VMOVDQUrm: case X86::VMOVAPSYrm: case X86::VMOVUPSYrm: case X86::VMOVAPDYrm: + case X86::VMOVUPDYrm: case X86::VMOVDQAYrm: case X86::VMOVDQUYrm: case X86::MMX_MOVD64rm: case X86::MMX_MOVQ64rm: - case X86::FsVMOVAPSrm: - case X86::FsVMOVAPDrm: - case X86::FsMOVAPSrm: - case X86::FsMOVAPDrm: // AVX-512 + case X86::VMOVSSZrm: + case X86::VMOVSDZrm: case X86::VMOVAPDZ128rm: case X86::VMOVAPDZ256rm: case X86::VMOVAPDZrm: case X86::VMOVAPSZ128rm: case X86::VMOVAPSZ256rm: + case X86::VMOVAPSZ128rm_NOVLX: + case X86::VMOVAPSZ256rm_NOVLX: case X86::VMOVAPSZrm: case X86::VMOVDQA32Z128rm: case X86::VMOVDQA32Z256rm: @@ -2463,15 +3412,20 @@ bool X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI, case X86::VMOVDQU8Z128rm: case X86::VMOVDQU8Z256rm: case X86::VMOVDQU8Zrm: + case X86::VMOVUPDZ128rm: + case X86::VMOVUPDZ256rm: + case X86::VMOVUPDZrm: case X86::VMOVUPSZ128rm: case X86::VMOVUPSZ256rm: + case X86::VMOVUPSZ128rm_NOVLX: + case X86::VMOVUPSZ256rm_NOVLX: case X86::VMOVUPSZrm: { // Loads from constant pools are trivially rematerializable. if (MI.getOperand(1 + X86::AddrBaseReg).isReg() && MI.getOperand(1 + X86::AddrScaleAmt).isImm() && MI.getOperand(1 + X86::AddrIndexReg).isReg() && MI.getOperand(1 + X86::AddrIndexReg).getReg() == 0 && - MI.isInvariantLoad(AA)) { + MI.isDereferenceableInvariantLoad(AA)) { unsigned BaseReg = MI.getOperand(1 + X86::AddrBaseReg).getReg(); if (BaseReg == 0 || BaseReg == X86::RIP) return true; @@ -2694,24 +3648,8 @@ bool X86InstrInfo::classifyLEAReg(MachineInstr &MI, const MachineOperand &Src, ImplicitOp.setImplicit(); NewSrc = getX86SubSuperRegister(Src.getReg(), 64); - MachineBasicBlock::LivenessQueryResult LQR = - MI.getParent()->computeRegisterLiveness(&getRegisterInfo(), NewSrc, MI); - - switch (LQR) { - case MachineBasicBlock::LQR_Unknown: - // We can't give sane liveness flags to the instruction, abandon LEA - // formation. - return false; - case MachineBasicBlock::LQR_Live: - isKill = MI.killsRegister(SrcReg); - isUndef = false; - break; - default: - // The physreg itself is dead, so we have to use it as an <undef>. - isKill = false; - isUndef = true; - break; - } + isKill = Src.isKill(); + isUndef = Src.isUndef(); } else { // Virtual register of the wrong class, we have to create a temporary 64-bit // vreg to feed into the LEA. @@ -3079,7 +4017,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, NewMI = addOffset(BuildMI(MF, MI.getDebugLoc(), get(X86::LEA64r)) .addOperand(Dest) .addOperand(Src), - MI.getOperand(2).getImm()); + MI.getOperand(2)); break; case X86::ADD32ri: case X86::ADD32ri8: @@ -3102,7 +4040,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, if (ImplicitOp.getReg() != 0) MIB.addOperand(ImplicitOp); - NewMI = addOffset(MIB, MI.getOperand(2).getImm()); + NewMI = addOffset(MIB, MI.getOperand(2)); break; } case X86::ADD16ri: @@ -3116,7 +4054,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, NewMI = addOffset(BuildMI(MF, MI.getDebugLoc(), get(X86::LEA16r)) .addOperand(Dest) .addOperand(Src), - MI.getOperand(2).getImm()); + MI.getOperand(2)); break; } @@ -3133,156 +4071,236 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, return NewMI; } -/// Returns true if the given instruction opcode is FMA3. 
-/// Otherwise, returns false. -/// The second parameter is optional and is used as the second return from -/// the function. It is set to true if the given instruction has FMA3 opcode -/// that is used for lowering of scalar FMA intrinsics, and it is set to false -/// otherwise. -static bool isFMA3(unsigned Opcode, bool *IsIntrinsic = nullptr) { - if (IsIntrinsic) - *IsIntrinsic = false; +/// This determines which of three possible cases of a three source commute +/// the source indexes correspond to taking into account any mask operands. +/// All prevents commuting a passthru operand. Returns -1 if the commute isn't +/// possible. +/// Case 0 - Possible to commute the first and second operands. +/// Case 1 - Possible to commute the first and third operands. +/// Case 2 - Possible to commute the second and third operands. +static int getThreeSrcCommuteCase(uint64_t TSFlags, unsigned SrcOpIdx1, + unsigned SrcOpIdx2) { + // Put the lowest index to SrcOpIdx1 to simplify the checks below. + if (SrcOpIdx1 > SrcOpIdx2) + std::swap(SrcOpIdx1, SrcOpIdx2); - switch (Opcode) { - case X86::VFMADDSDr132r: case X86::VFMADDSDr132m: - case X86::VFMADDSSr132r: case X86::VFMADDSSr132m: - case X86::VFMSUBSDr132r: case X86::VFMSUBSDr132m: - case X86::VFMSUBSSr132r: case X86::VFMSUBSSr132m: - case X86::VFNMADDSDr132r: case X86::VFNMADDSDr132m: - case X86::VFNMADDSSr132r: case X86::VFNMADDSSr132m: - case X86::VFNMSUBSDr132r: case X86::VFNMSUBSDr132m: - case X86::VFNMSUBSSr132r: case X86::VFNMSUBSSr132m: - - case X86::VFMADDSDr213r: case X86::VFMADDSDr213m: - case X86::VFMADDSSr213r: case X86::VFMADDSSr213m: - case X86::VFMSUBSDr213r: case X86::VFMSUBSDr213m: - case X86::VFMSUBSSr213r: case X86::VFMSUBSSr213m: - case X86::VFNMADDSDr213r: case X86::VFNMADDSDr213m: - case X86::VFNMADDSSr213r: case X86::VFNMADDSSr213m: - case X86::VFNMSUBSDr213r: case X86::VFNMSUBSDr213m: - case X86::VFNMSUBSSr213r: case X86::VFNMSUBSSr213m: - - case X86::VFMADDSDr231r: case X86::VFMADDSDr231m: - case X86::VFMADDSSr231r: case X86::VFMADDSSr231m: - case X86::VFMSUBSDr231r: case X86::VFMSUBSDr231m: - case X86::VFMSUBSSr231r: case X86::VFMSUBSSr231m: - case X86::VFNMADDSDr231r: case X86::VFNMADDSDr231m: - case X86::VFNMADDSSr231r: case X86::VFNMADDSSr231m: - case X86::VFNMSUBSDr231r: case X86::VFNMSUBSDr231m: - case X86::VFNMSUBSSr231r: case X86::VFNMSUBSSr231m: - - case X86::VFMADDSUBPDr132r: case X86::VFMADDSUBPDr132m: - case X86::VFMADDSUBPSr132r: case X86::VFMADDSUBPSr132m: - case X86::VFMSUBADDPDr132r: case X86::VFMSUBADDPDr132m: - case X86::VFMSUBADDPSr132r: case X86::VFMSUBADDPSr132m: - case X86::VFMADDSUBPDr132rY: case X86::VFMADDSUBPDr132mY: - case X86::VFMADDSUBPSr132rY: case X86::VFMADDSUBPSr132mY: - case X86::VFMSUBADDPDr132rY: case X86::VFMSUBADDPDr132mY: - case X86::VFMSUBADDPSr132rY: case X86::VFMSUBADDPSr132mY: - - case X86::VFMADDPDr132r: case X86::VFMADDPDr132m: - case X86::VFMADDPSr132r: case X86::VFMADDPSr132m: - case X86::VFMSUBPDr132r: case X86::VFMSUBPDr132m: - case X86::VFMSUBPSr132r: case X86::VFMSUBPSr132m: - case X86::VFNMADDPDr132r: case X86::VFNMADDPDr132m: - case X86::VFNMADDPSr132r: case X86::VFNMADDPSr132m: - case X86::VFNMSUBPDr132r: case X86::VFNMSUBPDr132m: - case X86::VFNMSUBPSr132r: case X86::VFNMSUBPSr132m: - case X86::VFMADDPDr132rY: case X86::VFMADDPDr132mY: - case X86::VFMADDPSr132rY: case X86::VFMADDPSr132mY: - case X86::VFMSUBPDr132rY: case X86::VFMSUBPDr132mY: - case X86::VFMSUBPSr132rY: case X86::VFMSUBPSr132mY: - case X86::VFNMADDPDr132rY: case X86::VFNMADDPDr132mY: - case 
X86::VFNMADDPSr132rY: case X86::VFNMADDPSr132mY: - case X86::VFNMSUBPDr132rY: case X86::VFNMSUBPDr132mY: - case X86::VFNMSUBPSr132rY: case X86::VFNMSUBPSr132mY: - - case X86::VFMADDSUBPDr213r: case X86::VFMADDSUBPDr213m: - case X86::VFMADDSUBPSr213r: case X86::VFMADDSUBPSr213m: - case X86::VFMSUBADDPDr213r: case X86::VFMSUBADDPDr213m: - case X86::VFMSUBADDPSr213r: case X86::VFMSUBADDPSr213m: - case X86::VFMADDSUBPDr213rY: case X86::VFMADDSUBPDr213mY: - case X86::VFMADDSUBPSr213rY: case X86::VFMADDSUBPSr213mY: - case X86::VFMSUBADDPDr213rY: case X86::VFMSUBADDPDr213mY: - case X86::VFMSUBADDPSr213rY: case X86::VFMSUBADDPSr213mY: - - case X86::VFMADDPDr213r: case X86::VFMADDPDr213m: - case X86::VFMADDPSr213r: case X86::VFMADDPSr213m: - case X86::VFMSUBPDr213r: case X86::VFMSUBPDr213m: - case X86::VFMSUBPSr213r: case X86::VFMSUBPSr213m: - case X86::VFNMADDPDr213r: case X86::VFNMADDPDr213m: - case X86::VFNMADDPSr213r: case X86::VFNMADDPSr213m: - case X86::VFNMSUBPDr213r: case X86::VFNMSUBPDr213m: - case X86::VFNMSUBPSr213r: case X86::VFNMSUBPSr213m: - case X86::VFMADDPDr213rY: case X86::VFMADDPDr213mY: - case X86::VFMADDPSr213rY: case X86::VFMADDPSr213mY: - case X86::VFMSUBPDr213rY: case X86::VFMSUBPDr213mY: - case X86::VFMSUBPSr213rY: case X86::VFMSUBPSr213mY: - case X86::VFNMADDPDr213rY: case X86::VFNMADDPDr213mY: - case X86::VFNMADDPSr213rY: case X86::VFNMADDPSr213mY: - case X86::VFNMSUBPDr213rY: case X86::VFNMSUBPDr213mY: - case X86::VFNMSUBPSr213rY: case X86::VFNMSUBPSr213mY: - - case X86::VFMADDSUBPDr231r: case X86::VFMADDSUBPDr231m: - case X86::VFMADDSUBPSr231r: case X86::VFMADDSUBPSr231m: - case X86::VFMSUBADDPDr231r: case X86::VFMSUBADDPDr231m: - case X86::VFMSUBADDPSr231r: case X86::VFMSUBADDPSr231m: - case X86::VFMADDSUBPDr231rY: case X86::VFMADDSUBPDr231mY: - case X86::VFMADDSUBPSr231rY: case X86::VFMADDSUBPSr231mY: - case X86::VFMSUBADDPDr231rY: case X86::VFMSUBADDPDr231mY: - case X86::VFMSUBADDPSr231rY: case X86::VFMSUBADDPSr231mY: - - case X86::VFMADDPDr231r: case X86::VFMADDPDr231m: - case X86::VFMADDPSr231r: case X86::VFMADDPSr231m: - case X86::VFMSUBPDr231r: case X86::VFMSUBPDr231m: - case X86::VFMSUBPSr231r: case X86::VFMSUBPSr231m: - case X86::VFNMADDPDr231r: case X86::VFNMADDPDr231m: - case X86::VFNMADDPSr231r: case X86::VFNMADDPSr231m: - case X86::VFNMSUBPDr231r: case X86::VFNMSUBPDr231m: - case X86::VFNMSUBPSr231r: case X86::VFNMSUBPSr231m: - case X86::VFMADDPDr231rY: case X86::VFMADDPDr231mY: - case X86::VFMADDPSr231rY: case X86::VFMADDPSr231mY: - case X86::VFMSUBPDr231rY: case X86::VFMSUBPDr231mY: - case X86::VFMSUBPSr231rY: case X86::VFMSUBPSr231mY: - case X86::VFNMADDPDr231rY: case X86::VFNMADDPDr231mY: - case X86::VFNMADDPSr231rY: case X86::VFNMADDPSr231mY: - case X86::VFNMSUBPDr231rY: case X86::VFNMSUBPDr231mY: - case X86::VFNMSUBPSr231rY: case X86::VFNMSUBPSr231mY: - return true; + unsigned Op1 = 1, Op2 = 2, Op3 = 3; + if (X86II::isKMasked(TSFlags)) { + // The k-mask operand cannot be commuted. + if (SrcOpIdx1 == 2) + return -1; + + // For k-zero-masked operations it is Ok to commute the first vector + // operand. + // For regular k-masked operations a conservative choice is done as the + // elements of the first vector operand, for which the corresponding bit + // in the k-mask operand is set to 0, are copied to the result of the + // instruction. + // TODO/FIXME: The commute still may be legal if it is known that the + // k-mask operand is set to either all ones or all zeroes. 
+ // It is also Ok to commute the 1st operand if all users of MI use only + // the elements enabled by the k-mask operand. For example, + // v4 = VFMADD213PSZrk v1, k, v2, v3; // v1[i] = k[i] ? v2[i]*v1[i]+v3[i] + // : v1[i]; + // VMOVAPSZmrk <mem_addr>, k, v4; // this is the ONLY user of v4 -> + // // Ok, to commute v1 in FMADD213PSZrk. + if (X86II::isKMergeMasked(TSFlags) && SrcOpIdx1 == Op1) + return -1; + Op2++; + Op3++; + } + + if (SrcOpIdx1 == Op1 && SrcOpIdx2 == Op2) + return 0; + if (SrcOpIdx1 == Op1 && SrcOpIdx2 == Op3) + return 1; + if (SrcOpIdx1 == Op2 && SrcOpIdx2 == Op3) + return 2; + return -1; +} - case X86::VFMADDSDr132r_Int: case X86::VFMADDSDr132m_Int: - case X86::VFMADDSSr132r_Int: case X86::VFMADDSSr132m_Int: - case X86::VFMSUBSDr132r_Int: case X86::VFMSUBSDr132m_Int: - case X86::VFMSUBSSr132r_Int: case X86::VFMSUBSSr132m_Int: - case X86::VFNMADDSDr132r_Int: case X86::VFNMADDSDr132m_Int: - case X86::VFNMADDSSr132r_Int: case X86::VFNMADDSSr132m_Int: - case X86::VFNMSUBSDr132r_Int: case X86::VFNMSUBSDr132m_Int: - case X86::VFNMSUBSSr132r_Int: case X86::VFNMSUBSSr132m_Int: - - case X86::VFMADDSDr213r_Int: case X86::VFMADDSDr213m_Int: - case X86::VFMADDSSr213r_Int: case X86::VFMADDSSr213m_Int: - case X86::VFMSUBSDr213r_Int: case X86::VFMSUBSDr213m_Int: - case X86::VFMSUBSSr213r_Int: case X86::VFMSUBSSr213m_Int: - case X86::VFNMADDSDr213r_Int: case X86::VFNMADDSDr213m_Int: - case X86::VFNMADDSSr213r_Int: case X86::VFNMADDSSr213m_Int: - case X86::VFNMSUBSDr213r_Int: case X86::VFNMSUBSDr213m_Int: - case X86::VFNMSUBSSr213r_Int: case X86::VFNMSUBSSr213m_Int: - - case X86::VFMADDSDr231r_Int: case X86::VFMADDSDr231m_Int: - case X86::VFMADDSSr231r_Int: case X86::VFMADDSSr231m_Int: - case X86::VFMSUBSDr231r_Int: case X86::VFMSUBSDr231m_Int: - case X86::VFMSUBSSr231r_Int: case X86::VFMSUBSSr231m_Int: - case X86::VFNMADDSDr231r_Int: case X86::VFNMADDSDr231m_Int: - case X86::VFNMADDSSr231r_Int: case X86::VFNMADDSSr231m_Int: - case X86::VFNMSUBSDr231r_Int: case X86::VFNMSUBSDr231m_Int: - case X86::VFNMSUBSSr231r_Int: case X86::VFNMSUBSSr231m_Int: - if (IsIntrinsic) - *IsIntrinsic = true; - return true; - default: - return false; +unsigned X86InstrInfo::getFMA3OpcodeToCommuteOperands( + const MachineInstr &MI, unsigned SrcOpIdx1, unsigned SrcOpIdx2, + const X86InstrFMA3Group &FMA3Group) const { + + unsigned Opc = MI.getOpcode(); + + // Put the lowest index to SrcOpIdx1 to simplify the checks below. + if (SrcOpIdx1 > SrcOpIdx2) + std::swap(SrcOpIdx1, SrcOpIdx2); + + // TODO: Commuting the 1st operand of FMA*_Int requires some additional + // analysis. The commute optimization is legal only if all users of FMA*_Int + // use only the lowest element of the FMA*_Int instruction. Such analysis are + // not implemented yet. So, just return 0 in that case. + // When such analysis are available this place will be the right place for + // calling it. + if (FMA3Group.isIntrinsic() && SrcOpIdx1 == 1) + return 0; + + // Determine which case this commute is or if it can't be done. + int Case = getThreeSrcCommuteCase(MI.getDesc().TSFlags, SrcOpIdx1, SrcOpIdx2); + if (Case < 0) + return 0; + + // Define the FMA forms mapping array that helps to map input FMA form + // to output FMA form to preserve the operation semantics after + // commuting the operands. 
+ const unsigned Form132Index = 0; + const unsigned Form213Index = 1; + const unsigned Form231Index = 2; + static const unsigned FormMapping[][3] = { + // 0: SrcOpIdx1 == 1 && SrcOpIdx2 == 2; + // FMA132 A, C, b; ==> FMA231 C, A, b; + // FMA213 B, A, c; ==> FMA213 A, B, c; + // FMA231 C, A, b; ==> FMA132 A, C, b; + { Form231Index, Form213Index, Form132Index }, + // 1: SrcOpIdx1 == 1 && SrcOpIdx2 == 3; + // FMA132 A, c, B; ==> FMA132 B, c, A; + // FMA213 B, a, C; ==> FMA231 C, a, B; + // FMA231 C, a, B; ==> FMA213 B, a, C; + { Form132Index, Form231Index, Form213Index }, + // 2: SrcOpIdx1 == 2 && SrcOpIdx2 == 3; + // FMA132 a, C, B; ==> FMA213 a, B, C; + // FMA213 b, A, C; ==> FMA132 b, C, A; + // FMA231 c, A, B; ==> FMA231 c, B, A; + { Form213Index, Form132Index, Form231Index } + }; + + unsigned FMAForms[3]; + if (FMA3Group.isRegOpcodeFromGroup(Opc)) { + FMAForms[0] = FMA3Group.getReg132Opcode(); + FMAForms[1] = FMA3Group.getReg213Opcode(); + FMAForms[2] = FMA3Group.getReg231Opcode(); + } else { + FMAForms[0] = FMA3Group.getMem132Opcode(); + FMAForms[1] = FMA3Group.getMem213Opcode(); + FMAForms[2] = FMA3Group.getMem231Opcode(); + } + unsigned FormIndex; + for (FormIndex = 0; FormIndex < 3; FormIndex++) + if (Opc == FMAForms[FormIndex]) + break; + + // Everything is ready, just adjust the FMA opcode and return it. + FormIndex = FormMapping[Case][FormIndex]; + return FMAForms[FormIndex]; +} + +static bool commuteVPTERNLOG(MachineInstr &MI, unsigned SrcOpIdx1, + unsigned SrcOpIdx2) { + uint64_t TSFlags = MI.getDesc().TSFlags; + + // Determine which case this commute is or if it can't be done. + int Case = getThreeSrcCommuteCase(TSFlags, SrcOpIdx1, SrcOpIdx2); + if (Case < 0) + return false; + + // For each case we need to swap two pairs of bits in the final immediate. + static const uint8_t SwapMasks[3][4] = { + { 0x04, 0x10, 0x08, 0x20 }, // Swap bits 2/4 and 3/5. + { 0x02, 0x10, 0x08, 0x40 }, // Swap bits 1/4 and 3/6. + { 0x02, 0x04, 0x20, 0x40 }, // Swap bits 1/2 and 5/6. + }; + + uint8_t Imm = MI.getOperand(MI.getNumOperands()-1).getImm(); + // Clear out the bits we are swapping. + uint8_t NewImm = Imm & ~(SwapMasks[Case][0] | SwapMasks[Case][1] | + SwapMasks[Case][2] | SwapMasks[Case][3]); + // If the immediate had a bit of the pair set, then set the opposite bit. + if (Imm & SwapMasks[Case][0]) NewImm |= SwapMasks[Case][1]; + if (Imm & SwapMasks[Case][1]) NewImm |= SwapMasks[Case][0]; + if (Imm & SwapMasks[Case][2]) NewImm |= SwapMasks[Case][3]; + if (Imm & SwapMasks[Case][3]) NewImm |= SwapMasks[Case][2]; + MI.getOperand(MI.getNumOperands()-1).setImm(NewImm); + + return true; +} + +// Returns true if this is a VPERMI2 or VPERMT2 instrution that can be +// commuted. 
+static bool isCommutableVPERMV3Instruction(unsigned Opcode) { +#define VPERM_CASES(Suffix) \ + case X86::VPERMI2##Suffix##128rr: case X86::VPERMT2##Suffix##128rr: \ + case X86::VPERMI2##Suffix##256rr: case X86::VPERMT2##Suffix##256rr: \ + case X86::VPERMI2##Suffix##rr: case X86::VPERMT2##Suffix##rr: \ + case X86::VPERMI2##Suffix##128rm: case X86::VPERMT2##Suffix##128rm: \ + case X86::VPERMI2##Suffix##256rm: case X86::VPERMT2##Suffix##256rm: \ + case X86::VPERMI2##Suffix##rm: case X86::VPERMT2##Suffix##rm: \ + case X86::VPERMI2##Suffix##128rrkz: case X86::VPERMT2##Suffix##128rrkz: \ + case X86::VPERMI2##Suffix##256rrkz: case X86::VPERMT2##Suffix##256rrkz: \ + case X86::VPERMI2##Suffix##rrkz: case X86::VPERMT2##Suffix##rrkz: \ + case X86::VPERMI2##Suffix##128rmkz: case X86::VPERMT2##Suffix##128rmkz: \ + case X86::VPERMI2##Suffix##256rmkz: case X86::VPERMT2##Suffix##256rmkz: \ + case X86::VPERMI2##Suffix##rmkz: case X86::VPERMT2##Suffix##rmkz: + +#define VPERM_CASES_BROADCAST(Suffix) \ + VPERM_CASES(Suffix) \ + case X86::VPERMI2##Suffix##128rmb: case X86::VPERMT2##Suffix##128rmb: \ + case X86::VPERMI2##Suffix##256rmb: case X86::VPERMT2##Suffix##256rmb: \ + case X86::VPERMI2##Suffix##rmb: case X86::VPERMT2##Suffix##rmb: \ + case X86::VPERMI2##Suffix##128rmbkz: case X86::VPERMT2##Suffix##128rmbkz: \ + case X86::VPERMI2##Suffix##256rmbkz: case X86::VPERMT2##Suffix##256rmbkz: \ + case X86::VPERMI2##Suffix##rmbkz: case X86::VPERMT2##Suffix##rmbkz: + + switch (Opcode) { + default: return false; + VPERM_CASES(B) + VPERM_CASES_BROADCAST(D) + VPERM_CASES_BROADCAST(PD) + VPERM_CASES_BROADCAST(PS) + VPERM_CASES_BROADCAST(Q) + VPERM_CASES(W) + return true; } - llvm_unreachable("Opcode not handled by the switch"); +#undef VPERM_CASES_BROADCAST +#undef VPERM_CASES +} + +// Returns commuted opcode for VPERMI2 and VPERMT2 instructions by switching +// from the I opcod to the T opcode and vice versa. 
+static unsigned getCommutedVPERMV3Opcode(unsigned Opcode) { +#define VPERM_CASES(Orig, New) \ + case X86::Orig##128rr: return X86::New##128rr; \ + case X86::Orig##128rrkz: return X86::New##128rrkz; \ + case X86::Orig##128rm: return X86::New##128rm; \ + case X86::Orig##128rmkz: return X86::New##128rmkz; \ + case X86::Orig##256rr: return X86::New##256rr; \ + case X86::Orig##256rrkz: return X86::New##256rrkz; \ + case X86::Orig##256rm: return X86::New##256rm; \ + case X86::Orig##256rmkz: return X86::New##256rmkz; \ + case X86::Orig##rr: return X86::New##rr; \ + case X86::Orig##rrkz: return X86::New##rrkz; \ + case X86::Orig##rm: return X86::New##rm; \ + case X86::Orig##rmkz: return X86::New##rmkz; + +#define VPERM_CASES_BROADCAST(Orig, New) \ + VPERM_CASES(Orig, New) \ + case X86::Orig##128rmb: return X86::New##128rmb; \ + case X86::Orig##128rmbkz: return X86::New##128rmbkz; \ + case X86::Orig##256rmb: return X86::New##256rmb; \ + case X86::Orig##256rmbkz: return X86::New##256rmbkz; \ + case X86::Orig##rmb: return X86::New##rmb; \ + case X86::Orig##rmbkz: return X86::New##rmbkz; + + switch (Opcode) { + VPERM_CASES(VPERMI2B, VPERMT2B) + VPERM_CASES_BROADCAST(VPERMI2D, VPERMT2D) + VPERM_CASES_BROADCAST(VPERMI2PD, VPERMT2PD) + VPERM_CASES_BROADCAST(VPERMI2PS, VPERMT2PS) + VPERM_CASES_BROADCAST(VPERMI2Q, VPERMT2Q) + VPERM_CASES(VPERMI2W, VPERMT2W) + VPERM_CASES(VPERMT2B, VPERMI2B) + VPERM_CASES_BROADCAST(VPERMT2D, VPERMI2D) + VPERM_CASES_BROADCAST(VPERMT2PD, VPERMI2PD) + VPERM_CASES_BROADCAST(VPERMT2PS, VPERMI2PS) + VPERM_CASES_BROADCAST(VPERMT2Q, VPERMI2Q) + VPERM_CASES(VPERMT2W, VPERMI2W) + } + + llvm_unreachable("Unreachable!"); +#undef VPERM_CASES_BROADCAST +#undef VPERM_CASES } MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, @@ -3352,6 +4370,39 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, OpIdx1, OpIdx2); } + case X86::MOVSDrr: + case X86::MOVSSrr: + case X86::VMOVSDrr: + case X86::VMOVSSrr:{ + // On SSE41 or later we can commute a MOVSS/MOVSD to a BLENDPS/BLENDPD. + if (!Subtarget.hasSSE41()) + return nullptr; + + unsigned Mask, Opc; + switch (MI.getOpcode()) { + default: llvm_unreachable("Unreachable!"); + case X86::MOVSDrr: Opc = X86::BLENDPDrri; Mask = 0x02; break; + case X86::MOVSSrr: Opc = X86::BLENDPSrri; Mask = 0x0E; break; + case X86::VMOVSDrr: Opc = X86::VBLENDPDrri; Mask = 0x02; break; + case X86::VMOVSSrr: Opc = X86::VBLENDPSrri; Mask = 0x0E; break; + } + + // MOVSD/MOVSS's 2nd operand is a FR64/FR32 reg class - we need to copy + // this over to a VR128 class like the 1st operand to use a BLENDPD/BLENDPS. + auto &MRI = MI.getParent()->getParent()->getRegInfo(); + auto VR128RC = MRI.getRegClass(MI.getOperand(1).getReg()); + unsigned VR128 = MRI.createVirtualRegister(VR128RC); + BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(TargetOpcode::COPY), + VR128) + .addReg(MI.getOperand(2).getReg()); + + auto &WorkingMI = cloneIfNew(MI); + WorkingMI.setDesc(get(Opc)); + WorkingMI.getOperand(2).setReg(VR128); + WorkingMI.addOperand(MachineOperand::CreateImm(Mask)); + return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, + OpIdx1, OpIdx2); + } case X86::PCLMULQDQrr: case X86::VPCLMULQDQrr:{ // SRC1 64bits = Imm[0] ? 
SRC1[127:64] : SRC1[63:0] @@ -3364,12 +4415,24 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, OpIdx1, OpIdx2); } + case X86::CMPSDrr: + case X86::CMPSSrr: case X86::CMPPDrri: case X86::CMPPSrri: + case X86::VCMPSDrr: + case X86::VCMPSSrr: case X86::VCMPPDrri: case X86::VCMPPSrri: case X86::VCMPPDYrri: - case X86::VCMPPSYrri: { + case X86::VCMPPSYrri: + case X86::VCMPSDZrr: + case X86::VCMPSSZrr: + case X86::VCMPPDZrri: + case X86::VCMPPSZrri: + case X86::VCMPPDZ128rri: + case X86::VCMPPSZ128rri: + case X86::VCMPPDZ256rri: + case X86::VCMPPSZ256rri: { // Float comparison can be safely commuted for // Ordered/Unordered/Equal/NotEqual tests unsigned Imm = MI.getOperand(3).getImm() & 0x7; @@ -3383,6 +4446,37 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, return nullptr; } } + case X86::VPCMPBZ128rri: case X86::VPCMPUBZ128rri: + case X86::VPCMPBZ256rri: case X86::VPCMPUBZ256rri: + case X86::VPCMPBZrri: case X86::VPCMPUBZrri: + case X86::VPCMPDZ128rri: case X86::VPCMPUDZ128rri: + case X86::VPCMPDZ256rri: case X86::VPCMPUDZ256rri: + case X86::VPCMPDZrri: case X86::VPCMPUDZrri: + case X86::VPCMPQZ128rri: case X86::VPCMPUQZ128rri: + case X86::VPCMPQZ256rri: case X86::VPCMPUQZ256rri: + case X86::VPCMPQZrri: case X86::VPCMPUQZrri: + case X86::VPCMPWZ128rri: case X86::VPCMPUWZ128rri: + case X86::VPCMPWZ256rri: case X86::VPCMPUWZ256rri: + case X86::VPCMPWZrri: case X86::VPCMPUWZrri: { + // Flip comparison mode immediate (if necessary). + unsigned Imm = MI.getOperand(3).getImm() & 0x7; + switch (Imm) { + default: llvm_unreachable("Unreachable!"); + case 0x01: Imm = 0x06; break; // LT -> NLE + case 0x02: Imm = 0x05; break; // LE -> NLT + case 0x05: Imm = 0x02; break; // NLT -> LE + case 0x06: Imm = 0x01; break; // NLE -> LT + case 0x00: // EQ + case 0x03: // FALSE + case 0x04: // NE + case 0x07: // TRUE + break; + } + auto &WorkingMI = cloneIfNew(MI); + WorkingMI.getOperand(3).setImm(Imm); + return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, + OpIdx1, OpIdx2); + } case X86::VPCOMBri: case X86::VPCOMUBri: case X86::VPCOMDri: case X86::VPCOMUDri: case X86::VPCOMQri: case X86::VPCOMUQri: @@ -3390,6 +4484,7 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, // Flip comparison mode immediate (if necessary). 
unsigned Imm = MI.getOperand(3).getImm() & 0x7; switch (Imm) { + default: llvm_unreachable("Unreachable!"); case 0x00: Imm = 0x02; break; // LT -> GT case 0x01: Imm = 0x03; break; // LE -> GE case 0x02: Imm = 0x00; break; // GT -> LT @@ -3398,7 +4493,6 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, case 0x05: // NE case 0x06: // FALSE case 0x07: // TRUE - default: break; } auto &WorkingMI = cloneIfNew(MI); @@ -3417,6 +4511,22 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, OpIdx1, OpIdx2); } + case X86::MOVHLPSrr: + case X86::UNPCKHPDrr: { + if (!Subtarget.hasSSE2()) + return nullptr; + + unsigned Opc = MI.getOpcode(); + switch (Opc) { + default: llvm_unreachable("Unreachable!"); + case X86::MOVHLPSrr: Opc = X86::UNPCKHPDrr; break; + case X86::UNPCKHPDrr: Opc = X86::MOVHLPSrr; break; + } + auto &WorkingMI = cloneIfNew(MI); + WorkingMI.setDesc(get(Opc)); + return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, + OpIdx1, OpIdx2); + } case X86::CMOVB16rr: case X86::CMOVB32rr: case X86::CMOVB64rr: case X86::CMOVAE16rr: case X86::CMOVAE32rr: case X86::CMOVAE64rr: case X86::CMOVE16rr: case X86::CMOVE32rr: case X86::CMOVE64rr: @@ -3490,9 +4600,44 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, OpIdx1, OpIdx2); } - default: - if (isFMA3(MI.getOpcode())) { - unsigned Opc = getFMA3OpcodeToCommuteOperands(MI, OpIdx1, OpIdx2); + case X86::VPTERNLOGDZrri: case X86::VPTERNLOGDZrmi: + case X86::VPTERNLOGDZ128rri: case X86::VPTERNLOGDZ128rmi: + case X86::VPTERNLOGDZ256rri: case X86::VPTERNLOGDZ256rmi: + case X86::VPTERNLOGQZrri: case X86::VPTERNLOGQZrmi: + case X86::VPTERNLOGQZ128rri: case X86::VPTERNLOGQZ128rmi: + case X86::VPTERNLOGQZ256rri: case X86::VPTERNLOGQZ256rmi: + case X86::VPTERNLOGDZrrik: case X86::VPTERNLOGDZrmik: + case X86::VPTERNLOGDZ128rrik: case X86::VPTERNLOGDZ128rmik: + case X86::VPTERNLOGDZ256rrik: case X86::VPTERNLOGDZ256rmik: + case X86::VPTERNLOGQZrrik: case X86::VPTERNLOGQZrmik: + case X86::VPTERNLOGQZ128rrik: case X86::VPTERNLOGQZ128rmik: + case X86::VPTERNLOGQZ256rrik: case X86::VPTERNLOGQZ256rmik: + case X86::VPTERNLOGDZrrikz: case X86::VPTERNLOGDZrmikz: + case X86::VPTERNLOGDZ128rrikz: case X86::VPTERNLOGDZ128rmikz: + case X86::VPTERNLOGDZ256rrikz: case X86::VPTERNLOGDZ256rmikz: + case X86::VPTERNLOGQZrrikz: case X86::VPTERNLOGQZrmikz: + case X86::VPTERNLOGQZ128rrikz: case X86::VPTERNLOGQZ128rmikz: + case X86::VPTERNLOGQZ256rrikz: case X86::VPTERNLOGQZ256rmikz: { + auto &WorkingMI = cloneIfNew(MI); + if (!commuteVPTERNLOG(WorkingMI, OpIdx1, OpIdx2)) + return nullptr; + return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, + OpIdx1, OpIdx2); + } + default: { + if (isCommutableVPERMV3Instruction(MI.getOpcode())) { + unsigned Opc = getCommutedVPERMV3Opcode(MI.getOpcode()); + auto &WorkingMI = cloneIfNew(MI); + WorkingMI.setDesc(get(Opc)); + return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, + OpIdx1, OpIdx2); + } + + const X86InstrFMA3Group *FMA3Group = + X86InstrFMA3Info::getFMA3Group(MI.getOpcode()); + if (FMA3Group) { + unsigned Opc = + getFMA3OpcodeToCommuteOperands(MI, OpIdx1, OpIdx2, *FMA3Group); if (Opc == 0) return nullptr; auto &WorkingMI = cloneIfNew(MI); @@ -3503,22 +4648,54 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool 
NewMI, return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2); } + } } -bool X86InstrInfo::findFMA3CommutedOpIndices(MachineInstr &MI, - unsigned &SrcOpIdx1, - unsigned &SrcOpIdx2) const { +bool X86InstrInfo::findFMA3CommutedOpIndices( + const MachineInstr &MI, unsigned &SrcOpIdx1, unsigned &SrcOpIdx2, + const X86InstrFMA3Group &FMA3Group) const { - unsigned RegOpsNum = isMem(MI, 3) ? 2 : 3; + if (!findThreeSrcCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2)) + return false; + + // Check if we can adjust the opcode to preserve the semantics when + // commute the register operands. + return getFMA3OpcodeToCommuteOperands(MI, SrcOpIdx1, SrcOpIdx2, FMA3Group) != 0; +} + +bool X86InstrInfo::findThreeSrcCommutedOpIndices(const MachineInstr &MI, + unsigned &SrcOpIdx1, + unsigned &SrcOpIdx2) const { + uint64_t TSFlags = MI.getDesc().TSFlags; + + unsigned FirstCommutableVecOp = 1; + unsigned LastCommutableVecOp = 3; + unsigned KMaskOp = 0; + if (X86II::isKMasked(TSFlags)) { + // The k-mask operand has index = 2 for masked and zero-masked operations. + KMaskOp = 2; + + // The operand with index = 1 is used as a source for those elements for + // which the corresponding bit in the k-mask is set to 0. + if (X86II::isKMergeMasked(TSFlags)) + FirstCommutableVecOp = 3; + + LastCommutableVecOp++; + } + + if (isMem(MI, LastCommutableVecOp)) + LastCommutableVecOp--; // Only the first RegOpsNum operands are commutable. // Also, the value 'CommuteAnyOperandIndex' is valid here as it means // that the operand is not specified/fixed. if (SrcOpIdx1 != CommuteAnyOperandIndex && - (SrcOpIdx1 < 1 || SrcOpIdx1 > RegOpsNum)) + (SrcOpIdx1 < FirstCommutableVecOp || SrcOpIdx1 > LastCommutableVecOp || + SrcOpIdx1 == KMaskOp)) return false; if (SrcOpIdx2 != CommuteAnyOperandIndex && - (SrcOpIdx2 < 1 || SrcOpIdx2 > RegOpsNum)) + (SrcOpIdx2 < FirstCommutableVecOp || SrcOpIdx2 > LastCommutableVecOp || + SrcOpIdx2 == KMaskOp)) return false; // Look for two different register operands assumed to be commutable @@ -3533,7 +4710,7 @@ bool X86InstrInfo::findFMA3CommutedOpIndices(MachineInstr &MI, if (SrcOpIdx1 == SrcOpIdx2) // Both of operands are not fixed. By default set one of commutable // operands to the last register operand of the instruction. - CommutableOpIdx2 = RegOpsNum; + CommutableOpIdx2 = LastCommutableVecOp; else if (SrcOpIdx2 == CommuteAnyOperandIndex) // Only one of operands is not fixed. CommutableOpIdx2 = SrcOpIdx1; @@ -3541,7 +4718,12 @@ bool X86InstrInfo::findFMA3CommutedOpIndices(MachineInstr &MI, // CommutableOpIdx2 is well defined now. Let's choose another commutable // operand and assign its index to CommutableOpIdx1. unsigned Op2Reg = MI.getOperand(CommutableOpIdx2).getReg(); - for (CommutableOpIdx1 = RegOpsNum; CommutableOpIdx1 > 0; CommutableOpIdx1--) { + for (CommutableOpIdx1 = LastCommutableVecOp; + CommutableOpIdx1 >= FirstCommutableVecOp; CommutableOpIdx1--) { + // Just ignore and skip the k-mask operand. + if (CommutableOpIdx1 == KMaskOp) + continue; + // The commuted operands must have different registers. // Otherwise, the commute transformation does not change anything and // is useless then. @@ -3550,7 +4732,7 @@ bool X86InstrInfo::findFMA3CommutedOpIndices(MachineInstr &MI, } // No appropriate commutable operands were found. 
- if (CommutableOpIdx1 == 0) + if (CommutableOpIdx1 < FirstCommutableVecOp) return false; // Assign the found pair of commutable indices to SrcOpIdx1 and SrcOpidx2 @@ -3560,208 +4742,34 @@ bool X86InstrInfo::findFMA3CommutedOpIndices(MachineInstr &MI, return false; } - // Check if we can adjust the opcode to preserve the semantics when - // commute the register operands. - return getFMA3OpcodeToCommuteOperands(MI, SrcOpIdx1, SrcOpIdx2) != 0; -} - -unsigned X86InstrInfo::getFMA3OpcodeToCommuteOperands( - MachineInstr &MI, unsigned SrcOpIdx1, unsigned SrcOpIdx2) const { - unsigned Opc = MI.getOpcode(); - - // Define the array that holds FMA opcodes in groups - // of 3 opcodes(132, 213, 231) in each group. - static const uint16_t RegularOpcodeGroups[][3] = { - { X86::VFMADDSSr132r, X86::VFMADDSSr213r, X86::VFMADDSSr231r }, - { X86::VFMADDSDr132r, X86::VFMADDSDr213r, X86::VFMADDSDr231r }, - { X86::VFMADDPSr132r, X86::VFMADDPSr213r, X86::VFMADDPSr231r }, - { X86::VFMADDPDr132r, X86::VFMADDPDr213r, X86::VFMADDPDr231r }, - { X86::VFMADDPSr132rY, X86::VFMADDPSr213rY, X86::VFMADDPSr231rY }, - { X86::VFMADDPDr132rY, X86::VFMADDPDr213rY, X86::VFMADDPDr231rY }, - { X86::VFMADDSSr132m, X86::VFMADDSSr213m, X86::VFMADDSSr231m }, - { X86::VFMADDSDr132m, X86::VFMADDSDr213m, X86::VFMADDSDr231m }, - { X86::VFMADDPSr132m, X86::VFMADDPSr213m, X86::VFMADDPSr231m }, - { X86::VFMADDPDr132m, X86::VFMADDPDr213m, X86::VFMADDPDr231m }, - { X86::VFMADDPSr132mY, X86::VFMADDPSr213mY, X86::VFMADDPSr231mY }, - { X86::VFMADDPDr132mY, X86::VFMADDPDr213mY, X86::VFMADDPDr231mY }, - - { X86::VFMSUBSSr132r, X86::VFMSUBSSr213r, X86::VFMSUBSSr231r }, - { X86::VFMSUBSDr132r, X86::VFMSUBSDr213r, X86::VFMSUBSDr231r }, - { X86::VFMSUBPSr132r, X86::VFMSUBPSr213r, X86::VFMSUBPSr231r }, - { X86::VFMSUBPDr132r, X86::VFMSUBPDr213r, X86::VFMSUBPDr231r }, - { X86::VFMSUBPSr132rY, X86::VFMSUBPSr213rY, X86::VFMSUBPSr231rY }, - { X86::VFMSUBPDr132rY, X86::VFMSUBPDr213rY, X86::VFMSUBPDr231rY }, - { X86::VFMSUBSSr132m, X86::VFMSUBSSr213m, X86::VFMSUBSSr231m }, - { X86::VFMSUBSDr132m, X86::VFMSUBSDr213m, X86::VFMSUBSDr231m }, - { X86::VFMSUBPSr132m, X86::VFMSUBPSr213m, X86::VFMSUBPSr231m }, - { X86::VFMSUBPDr132m, X86::VFMSUBPDr213m, X86::VFMSUBPDr231m }, - { X86::VFMSUBPSr132mY, X86::VFMSUBPSr213mY, X86::VFMSUBPSr231mY }, - { X86::VFMSUBPDr132mY, X86::VFMSUBPDr213mY, X86::VFMSUBPDr231mY }, - - { X86::VFNMADDSSr132r, X86::VFNMADDSSr213r, X86::VFNMADDSSr231r }, - { X86::VFNMADDSDr132r, X86::VFNMADDSDr213r, X86::VFNMADDSDr231r }, - { X86::VFNMADDPSr132r, X86::VFNMADDPSr213r, X86::VFNMADDPSr231r }, - { X86::VFNMADDPDr132r, X86::VFNMADDPDr213r, X86::VFNMADDPDr231r }, - { X86::VFNMADDPSr132rY, X86::VFNMADDPSr213rY, X86::VFNMADDPSr231rY }, - { X86::VFNMADDPDr132rY, X86::VFNMADDPDr213rY, X86::VFNMADDPDr231rY }, - { X86::VFNMADDSSr132m, X86::VFNMADDSSr213m, X86::VFNMADDSSr231m }, - { X86::VFNMADDSDr132m, X86::VFNMADDSDr213m, X86::VFNMADDSDr231m }, - { X86::VFNMADDPSr132m, X86::VFNMADDPSr213m, X86::VFNMADDPSr231m }, - { X86::VFNMADDPDr132m, X86::VFNMADDPDr213m, X86::VFNMADDPDr231m }, - { X86::VFNMADDPSr132mY, X86::VFNMADDPSr213mY, X86::VFNMADDPSr231mY }, - { X86::VFNMADDPDr132mY, X86::VFNMADDPDr213mY, X86::VFNMADDPDr231mY }, - - { X86::VFNMSUBSSr132r, X86::VFNMSUBSSr213r, X86::VFNMSUBSSr231r }, - { X86::VFNMSUBSDr132r, X86::VFNMSUBSDr213r, X86::VFNMSUBSDr231r }, - { X86::VFNMSUBPSr132r, X86::VFNMSUBPSr213r, X86::VFNMSUBPSr231r }, - { X86::VFNMSUBPDr132r, X86::VFNMSUBPDr213r, X86::VFNMSUBPDr231r }, - { X86::VFNMSUBPSr132rY, X86::VFNMSUBPSr213rY, 
X86::VFNMSUBPSr231rY }, - { X86::VFNMSUBPDr132rY, X86::VFNMSUBPDr213rY, X86::VFNMSUBPDr231rY }, - { X86::VFNMSUBSSr132m, X86::VFNMSUBSSr213m, X86::VFNMSUBSSr231m }, - { X86::VFNMSUBSDr132m, X86::VFNMSUBSDr213m, X86::VFNMSUBSDr231m }, - { X86::VFNMSUBPSr132m, X86::VFNMSUBPSr213m, X86::VFNMSUBPSr231m }, - { X86::VFNMSUBPDr132m, X86::VFNMSUBPDr213m, X86::VFNMSUBPDr231m }, - { X86::VFNMSUBPSr132mY, X86::VFNMSUBPSr213mY, X86::VFNMSUBPSr231mY }, - { X86::VFNMSUBPDr132mY, X86::VFNMSUBPDr213mY, X86::VFNMSUBPDr231mY }, - - { X86::VFMADDSUBPSr132r, X86::VFMADDSUBPSr213r, X86::VFMADDSUBPSr231r }, - { X86::VFMADDSUBPDr132r, X86::VFMADDSUBPDr213r, X86::VFMADDSUBPDr231r }, - { X86::VFMADDSUBPSr132rY, X86::VFMADDSUBPSr213rY, X86::VFMADDSUBPSr231rY }, - { X86::VFMADDSUBPDr132rY, X86::VFMADDSUBPDr213rY, X86::VFMADDSUBPDr231rY }, - { X86::VFMADDSUBPSr132m, X86::VFMADDSUBPSr213m, X86::VFMADDSUBPSr231m }, - { X86::VFMADDSUBPDr132m, X86::VFMADDSUBPDr213m, X86::VFMADDSUBPDr231m }, - { X86::VFMADDSUBPSr132mY, X86::VFMADDSUBPSr213mY, X86::VFMADDSUBPSr231mY }, - { X86::VFMADDSUBPDr132mY, X86::VFMADDSUBPDr213mY, X86::VFMADDSUBPDr231mY }, - - { X86::VFMSUBADDPSr132r, X86::VFMSUBADDPSr213r, X86::VFMSUBADDPSr231r }, - { X86::VFMSUBADDPDr132r, X86::VFMSUBADDPDr213r, X86::VFMSUBADDPDr231r }, - { X86::VFMSUBADDPSr132rY, X86::VFMSUBADDPSr213rY, X86::VFMSUBADDPSr231rY }, - { X86::VFMSUBADDPDr132rY, X86::VFMSUBADDPDr213rY, X86::VFMSUBADDPDr231rY }, - { X86::VFMSUBADDPSr132m, X86::VFMSUBADDPSr213m, X86::VFMSUBADDPSr231m }, - { X86::VFMSUBADDPDr132m, X86::VFMSUBADDPDr213m, X86::VFMSUBADDPDr231m }, - { X86::VFMSUBADDPSr132mY, X86::VFMSUBADDPSr213mY, X86::VFMSUBADDPSr231mY }, - { X86::VFMSUBADDPDr132mY, X86::VFMSUBADDPDr213mY, X86::VFMSUBADDPDr231mY } - }; - - // Define the array that holds FMA*_Int opcodes in groups - // of 3 opcodes(132, 213, 231) in each group. 
- static const uint16_t IntrinOpcodeGroups[][3] = { - { X86::VFMADDSSr132r_Int, X86::VFMADDSSr213r_Int, X86::VFMADDSSr231r_Int }, - { X86::VFMADDSDr132r_Int, X86::VFMADDSDr213r_Int, X86::VFMADDSDr231r_Int }, - { X86::VFMADDSSr132m_Int, X86::VFMADDSSr213m_Int, X86::VFMADDSSr231m_Int }, - { X86::VFMADDSDr132m_Int, X86::VFMADDSDr213m_Int, X86::VFMADDSDr231m_Int }, - - { X86::VFMSUBSSr132r_Int, X86::VFMSUBSSr213r_Int, X86::VFMSUBSSr231r_Int }, - { X86::VFMSUBSDr132r_Int, X86::VFMSUBSDr213r_Int, X86::VFMSUBSDr231r_Int }, - { X86::VFMSUBSSr132m_Int, X86::VFMSUBSSr213m_Int, X86::VFMSUBSSr231m_Int }, - { X86::VFMSUBSDr132m_Int, X86::VFMSUBSDr213m_Int, X86::VFMSUBSDr231m_Int }, - - { X86::VFNMADDSSr132r_Int, X86::VFNMADDSSr213r_Int, X86::VFNMADDSSr231r_Int }, - { X86::VFNMADDSDr132r_Int, X86::VFNMADDSDr213r_Int, X86::VFNMADDSDr231r_Int }, - { X86::VFNMADDSSr132m_Int, X86::VFNMADDSSr213m_Int, X86::VFNMADDSSr231m_Int }, - { X86::VFNMADDSDr132m_Int, X86::VFNMADDSDr213m_Int, X86::VFNMADDSDr231m_Int }, - - { X86::VFNMSUBSSr132r_Int, X86::VFNMSUBSSr213r_Int, X86::VFNMSUBSSr231r_Int }, - { X86::VFNMSUBSDr132r_Int, X86::VFNMSUBSDr213r_Int, X86::VFNMSUBSDr231r_Int }, - { X86::VFNMSUBSSr132m_Int, X86::VFNMSUBSSr213m_Int, X86::VFNMSUBSSr231m_Int }, - { X86::VFNMSUBSDr132m_Int, X86::VFNMSUBSDr213m_Int, X86::VFNMSUBSDr231m_Int }, - }; - - const unsigned Form132Index = 0; - const unsigned Form213Index = 1; - const unsigned Form231Index = 2; - const unsigned FormsNum = 3; - - bool IsIntrinOpcode; - isFMA3(Opc, &IsIntrinOpcode); - - size_t GroupsNum; - const uint16_t (*OpcodeGroups)[3]; - if (IsIntrinOpcode) { - GroupsNum = array_lengthof(IntrinOpcodeGroups); - OpcodeGroups = IntrinOpcodeGroups; - } else { - GroupsNum = array_lengthof(RegularOpcodeGroups); - OpcodeGroups = RegularOpcodeGroups; - } - - const uint16_t *FoundOpcodesGroup = nullptr; - size_t FormIndex; - - // Look for the input opcode in the corresponding opcodes table. - for (size_t GroupIndex = 0; GroupIndex < GroupsNum && !FoundOpcodesGroup; - ++GroupIndex) { - for (FormIndex = 0; FormIndex < FormsNum; ++FormIndex) { - if (OpcodeGroups[GroupIndex][FormIndex] == Opc) { - FoundOpcodesGroup = OpcodeGroups[GroupIndex]; - break; - } - } - } - - // The input opcode does not match with any of the opcodes from the tables. - // The unsupported FMA opcode must be added to one of the two opcode groups - // defined above. - assert(FoundOpcodesGroup != nullptr && "Unexpected FMA3 opcode"); - - // Put the lowest index to SrcOpIdx1 to simplify the checks below. - if (SrcOpIdx1 > SrcOpIdx2) - std::swap(SrcOpIdx1, SrcOpIdx2); - - // TODO: Commuting the 1st operand of FMA*_Int requires some additional - // analysis. The commute optimization is legal only if all users of FMA*_Int - // use only the lowest element of the FMA*_Int instruction. Such analysis are - // not implemented yet. So, just return 0 in that case. - // When such analysis are available this place will be the right place for - // calling it. - if (IsIntrinOpcode && SrcOpIdx1 == 1) - return 0; - - unsigned Case; - if (SrcOpIdx1 == 1 && SrcOpIdx2 == 2) - Case = 0; - else if (SrcOpIdx1 == 1 && SrcOpIdx2 == 3) - Case = 1; - else if (SrcOpIdx1 == 2 && SrcOpIdx2 == 3) - Case = 2; - else - return 0; - - // Define the FMA forms mapping array that helps to map input FMA form - // to output FMA form to preserve the operation semantics after - // commuting the operands. 
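The FormMapping table removed just below (its role moves behind getFMA3OpcodeToCommuteOperands and the X86InstrFMA3Group tables in this patch) records which of the 132/213/231 FMA forms to switch to when a given pair of source operands is swapped, so the computed value is unchanged. A small self-contained model, with a made-up enum and an opcode-free eval function, that checks the mapping numerically:

#include <cassert>

enum Form { F132, F213, F231 };

// A, B, C play the roles of operands 1 (also the destination), 2 and 3.
static double eval(Form F, double A, double B, double C) {
  switch (F) {
  case F132: return A * C + B; // FMA132: dst = dst*src3 + src2
  case F213: return B * A + C; // FMA213: dst = src2*dst + src3
  case F231: return B * C + A; // FMA231: dst = src2*src3 + dst
  }
  return 0.0;
}

// Same data as the removed FormMapping table: row = which operand pair is
// swapped (1&2, 1&3, 2&3), column = original form, value = form after the swap.
static const Form FormMapping[3][3] = {
  {F231, F213, F132}, // swap operands 1 and 2
  {F132, F231, F213}, // swap operands 1 and 3
  {F213, F132, F231}, // swap operands 2 and 3
};

int main() {
  const double A = 2, B = 3, C = 5;
  // Swapping ops 2 and 3 of a 132 form must be re-encoded as a 213 form.
  assert(eval(F132, A, B, C) == eval(FormMapping[2][F132], A, C, B));
  // Swapping ops 1 and 2 of a 213 form keeps the 213 form.
  assert(eval(F213, A, B, C) == eval(FormMapping[0][F213], B, A, C));
  // Swapping ops 1 and 3 of a 231 form must be re-encoded as a 213 form.
  assert(eval(F231, A, B, C) == eval(FormMapping[1][F231], C, B, A));
  return 0;
}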
- static const unsigned FormMapping[][3] = { - // 0: SrcOpIdx1 == 1 && SrcOpIdx2 == 2; - // FMA132 A, C, b; ==> FMA231 C, A, b; - // FMA213 B, A, c; ==> FMA213 A, B, c; - // FMA231 C, A, b; ==> FMA132 A, C, b; - { Form231Index, Form213Index, Form132Index }, - // 1: SrcOpIdx1 == 1 && SrcOpIdx2 == 3; - // FMA132 A, c, B; ==> FMA132 B, c, A; - // FMA213 B, a, C; ==> FMA231 C, a, B; - // FMA231 C, a, B; ==> FMA213 B, a, C; - { Form132Index, Form231Index, Form213Index }, - // 2: SrcOpIdx1 == 2 && SrcOpIdx2 == 3; - // FMA132 a, C, B; ==> FMA213 a, B, C; - // FMA213 b, A, C; ==> FMA132 b, C, A; - // FMA231 c, A, B; ==> FMA231 c, B, A; - { Form213Index, Form132Index, Form231Index } - }; - - // Everything is ready, just adjust the FMA opcode and return it. - FormIndex = FormMapping[Case][FormIndex]; - return FoundOpcodesGroup[FormIndex]; + return true; } bool X86InstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1, unsigned &SrcOpIdx2) const { + const MCInstrDesc &Desc = MI.getDesc(); + if (!Desc.isCommutable()) + return false; + switch (MI.getOpcode()) { + case X86::CMPSDrr: + case X86::CMPSSrr: case X86::CMPPDrri: case X86::CMPPSrri: + case X86::VCMPSDrr: + case X86::VCMPSSrr: case X86::VCMPPDrri: case X86::VCMPPSrri: case X86::VCMPPDYrri: - case X86::VCMPPSYrri: { + case X86::VCMPPSYrri: + case X86::VCMPSDZrr: + case X86::VCMPSSZrr: + case X86::VCMPPDZrri: + case X86::VCMPPSZrri: + case X86::VCMPPDZ128rri: + case X86::VCMPPSZ128rri: + case X86::VCMPPDZ256rri: + case X86::VCMPPSZ256rri: { // Float comparison can be safely commuted for // Ordered/Unordered/Equal/NotEqual tests unsigned Imm = MI.getOperand(3).getImm() & 0x7; @@ -3776,9 +4784,73 @@ bool X86InstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1, } return false; } + case X86::MOVSDrr: + case X86::MOVSSrr: + case X86::VMOVSDrr: + case X86::VMOVSSrr: { + if (Subtarget.hasSSE41()) + return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2); + return false; + } + case X86::VPTERNLOGDZrri: case X86::VPTERNLOGDZrmi: + case X86::VPTERNLOGDZ128rri: case X86::VPTERNLOGDZ128rmi: + case X86::VPTERNLOGDZ256rri: case X86::VPTERNLOGDZ256rmi: + case X86::VPTERNLOGQZrri: case X86::VPTERNLOGQZrmi: + case X86::VPTERNLOGQZ128rri: case X86::VPTERNLOGQZ128rmi: + case X86::VPTERNLOGQZ256rri: case X86::VPTERNLOGQZ256rmi: + case X86::VPTERNLOGDZrrik: case X86::VPTERNLOGDZrmik: + case X86::VPTERNLOGDZ128rrik: case X86::VPTERNLOGDZ128rmik: + case X86::VPTERNLOGDZ256rrik: case X86::VPTERNLOGDZ256rmik: + case X86::VPTERNLOGQZrrik: case X86::VPTERNLOGQZrmik: + case X86::VPTERNLOGQZ128rrik: case X86::VPTERNLOGQZ128rmik: + case X86::VPTERNLOGQZ256rrik: case X86::VPTERNLOGQZ256rmik: + case X86::VPTERNLOGDZrrikz: case X86::VPTERNLOGDZrmikz: + case X86::VPTERNLOGDZ128rrikz: case X86::VPTERNLOGDZ128rmikz: + case X86::VPTERNLOGDZ256rrikz: case X86::VPTERNLOGDZ256rmikz: + case X86::VPTERNLOGQZrrikz: case X86::VPTERNLOGQZrmikz: + case X86::VPTERNLOGQZ128rrikz: case X86::VPTERNLOGQZ128rmikz: + case X86::VPTERNLOGQZ256rrikz: case X86::VPTERNLOGQZ256rmikz: + return findThreeSrcCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2); default: - if (isFMA3(MI.getOpcode())) - return findFMA3CommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2); + const X86InstrFMA3Group *FMA3Group = + X86InstrFMA3Info::getFMA3Group(MI.getOpcode()); + if (FMA3Group) + return findFMA3CommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2, *FMA3Group); + + // Handled masked instructions since we need to skip over the mask input + // and the preserved input. 
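For generic EVEX-masked instructions (handled in the block that follows), the default pair of commutable inputs starts just past the k-mask and is then nudged depending on zero-masking versus a tied pass-through operand. A minimal sketch of that adjustment, using plain parameters instead of the MCInstrDesc/TSFlags API:

#include <cassert>
#include <utility>

// Pick the default pair of commutable inputs for an EVEX-masked instruction.
// Operand NumDefs is the first input; the helper mirrors the index arithmetic
// in the EVEX_K branch of findCommutedOpIndices.
static std::pair<unsigned, unsigned>
defaultMaskedCommutePair(unsigned NumDefs, bool HasTiedInput, bool ZeroMasking) {
  // Assume the first input is the k-mask and start just past it.
  unsigned Idx1 = NumDefs + 1;
  unsigned Idx2 = NumDefs + 2;
  if (HasTiedInput) {
    if (ZeroMasking)
      --Idx1;   // zero-masking with a tied source: step back to the first input
    else {
      ++Idx1;   // merge-masking: the tied operand is the pass-through, skip it too
      ++Idx2;
    }
  }
  return {Idx1, Idx2};
}

int main() {
  // One def, no tied input: commute operands 2 and 3 (the mask is operand 1).
  assert(defaultMaskedCommutePair(1, false, false) == std::make_pair(2u, 3u));
  // One def, tied input, zero-masking: commute operands 1 and 3.
  assert(defaultMaskedCommutePair(1, true, true) == std::make_pair(1u, 3u));
  // One def, tied input, merge-masking: commute operands 3 and 4.
  assert(defaultMaskedCommutePair(1, true, false) == std::make_pair(3u, 4u));
  return 0;
}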
+ if (Desc.TSFlags & X86II::EVEX_K) { + // First assume that the first input is the mask operand and skip past it. + unsigned CommutableOpIdx1 = Desc.getNumDefs() + 1; + unsigned CommutableOpIdx2 = Desc.getNumDefs() + 2; + // Check if the first input is tied. If there isn't one then we only + // need to skip the mask operand which we did above. + if ((MI.getDesc().getOperandConstraint(Desc.getNumDefs(), + MCOI::TIED_TO) != -1)) { + // If this is zero masking instruction with a tied operand, we need to + // move the first index back to the first input since this must + // be a 3 input instruction and we want the first two non-mask inputs. + // Otherwise this is a 2 input instruction with a preserved input and + // mask, so we need to move the indices to skip one more input. + if (Desc.TSFlags & X86II::EVEX_Z) + --CommutableOpIdx1; + else { + ++CommutableOpIdx1; + ++CommutableOpIdx2; + } + } + + if (!fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, + CommutableOpIdx1, CommutableOpIdx2)) + return false; + + if (!MI.getOperand(SrcOpIdx1).isReg() || + !MI.getOperand(SrcOpIdx2).isReg()) + // No idea. + return false; + return true; + } + return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2); } return false; @@ -4296,7 +5368,10 @@ bool X86InstrInfo::analyzeBranchPredicate(MachineBasicBlock &MBB, return true; } -unsigned X86InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { +unsigned X86InstrInfo::removeBranch(MachineBasicBlock &MBB, + int *BytesRemoved) const { + assert(!BytesRemoved && "code size not handled"); + MachineBasicBlock::iterator I = MBB.end(); unsigned Count = 0; @@ -4316,15 +5391,17 @@ unsigned X86InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { return Count; } -unsigned X86InstrInfo::InsertBranch(MachineBasicBlock &MBB, +unsigned X86InstrInfo::insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond, - const DebugLoc &DL) const { + const DebugLoc &DL, + int *BytesAdded) const { // Shouldn't be a fall through. - assert(TBB && "InsertBranch must not be told to insert a fallthrough"); + assert(TBB && "insertBranch must not be told to insert a fallthrough"); assert((Cond.size() == 1 || Cond.size() == 0) && "X86 branch conditions have one component!"); + assert(!BytesAdded && "code size not handled"); if (Cond.empty()) { // Unconditional branch? @@ -4430,16 +5507,63 @@ static bool isHReg(unsigned Reg) { } // Try and copy between VR128/VR64 and GR64 registers. -static unsigned CopyToFromAsymmetricReg(unsigned DestReg, unsigned SrcReg, +static unsigned CopyToFromAsymmetricReg(unsigned &DestReg, unsigned &SrcReg, const X86Subtarget &Subtarget) { + bool HasAVX = Subtarget.hasAVX(); + bool HasAVX512 = Subtarget.hasAVX512(); + + // SrcReg(MaskReg) -> DestReg(GR64) + // SrcReg(MaskReg) -> DestReg(GR32) + // SrcReg(MaskReg) -> DestReg(GR16) + // SrcReg(MaskReg) -> DestReg(GR8) + + // All KMASK RegClasses hold the same k registers, can be tested against anyone. + if (X86::VK16RegClass.contains(SrcReg)) { + if (X86::GR64RegClass.contains(DestReg)) { + assert(Subtarget.hasBWI()); + return X86::KMOVQrk; + } + if (X86::GR32RegClass.contains(DestReg)) + return Subtarget.hasBWI() ? X86::KMOVDrk : X86::KMOVWrk; + if (X86::GR16RegClass.contains(DestReg)) { + DestReg = getX86SubSuperRegister(DestReg, 32); + return X86::KMOVWrk; + } + if (X86::GR8RegClass.contains(DestReg)) { + DestReg = getX86SubSuperRegister(DestReg, 32); + return Subtarget.hasDQI() ? 
X86::KMOVBrk : X86::KMOVWrk; + } + } + + // SrcReg(GR64) -> DestReg(MaskReg) + // SrcReg(GR32) -> DestReg(MaskReg) + // SrcReg(GR16) -> DestReg(MaskReg) + // SrcReg(GR8) -> DestReg(MaskReg) + + // All KMASK RegClasses hold the same k registers, can be tested against anyone. + if (X86::VK16RegClass.contains(DestReg)) { + if (X86::GR64RegClass.contains(SrcReg)) { + assert(Subtarget.hasBWI()); + return X86::KMOVQkr; + } + if (X86::GR32RegClass.contains(SrcReg)) + return Subtarget.hasBWI() ? X86::KMOVDkr : X86::KMOVWkr; + if (X86::GR16RegClass.contains(SrcReg)) { + SrcReg = getX86SubSuperRegister(SrcReg, 32); + return X86::KMOVWkr; + } + if (X86::GR8RegClass.contains(SrcReg)) { + SrcReg = getX86SubSuperRegister(SrcReg, 32); + return Subtarget.hasDQI() ? X86::KMOVBkr : X86::KMOVWkr; + } + } + // SrcReg(VR128) -> DestReg(GR64) // SrcReg(VR64) -> DestReg(GR64) // SrcReg(GR64) -> DestReg(VR128) // SrcReg(GR64) -> DestReg(VR64) - bool HasAVX = Subtarget.hasAVX(); - bool HasAVX512 = Subtarget.hasAVX512(); if (X86::GR64RegClass.contains(DestReg)) { if (X86::VR128XRegClass.contains(SrcReg)) // Copy from a VR128 register to a GR64 register. @@ -4479,96 +5603,13 @@ static unsigned CopyToFromAsymmetricReg(unsigned DestReg, unsigned SrcReg, return 0; } -static bool isMaskRegClass(const TargetRegisterClass *RC) { - // All KMASK RegClasses hold the same k registers, can be tested against anyone. - return X86::VK16RegClass.hasSubClassEq(RC); -} - -static bool MaskRegClassContains(unsigned Reg) { - // All KMASK RegClasses hold the same k registers, can be tested against anyone. - return X86::VK16RegClass.contains(Reg); -} - -static bool GRRegClassContains(unsigned Reg) { - return X86::GR64RegClass.contains(Reg) || - X86::GR32RegClass.contains(Reg) || - X86::GR16RegClass.contains(Reg) || - X86::GR8RegClass.contains(Reg); -} -static -unsigned copyPhysRegOpcode_AVX512_DQ(unsigned& DestReg, unsigned& SrcReg) { - if (MaskRegClassContains(SrcReg) && X86::GR8RegClass.contains(DestReg)) { - DestReg = getX86SubSuperRegister(DestReg, 32); - return X86::KMOVBrk; - } - if (MaskRegClassContains(DestReg) && X86::GR8RegClass.contains(SrcReg)) { - SrcReg = getX86SubSuperRegister(SrcReg, 32); - return X86::KMOVBkr; - } - return 0; -} - -static -unsigned copyPhysRegOpcode_AVX512_BW(unsigned& DestReg, unsigned& SrcReg) { - if (MaskRegClassContains(SrcReg) && MaskRegClassContains(DestReg)) - return X86::KMOVQkk; - if (MaskRegClassContains(SrcReg) && X86::GR32RegClass.contains(DestReg)) - return X86::KMOVDrk; - if (MaskRegClassContains(SrcReg) && X86::GR64RegClass.contains(DestReg)) - return X86::KMOVQrk; - if (MaskRegClassContains(DestReg) && X86::GR32RegClass.contains(SrcReg)) - return X86::KMOVDkr; - if (MaskRegClassContains(DestReg) && X86::GR64RegClass.contains(SrcReg)) - return X86::KMOVQkr; - return 0; -} - -static -unsigned copyPhysRegOpcode_AVX512(unsigned& DestReg, unsigned& SrcReg, - const X86Subtarget &Subtarget) -{ - if (Subtarget.hasDQI()) - if (auto Opc = copyPhysRegOpcode_AVX512_DQ(DestReg, SrcReg)) - return Opc; - if (Subtarget.hasBWI()) - if (auto Opc = copyPhysRegOpcode_AVX512_BW(DestReg, SrcReg)) - return Opc; - if (X86::VR128XRegClass.contains(DestReg, SrcReg)) { - if (Subtarget.hasVLX()) - return X86::VMOVAPSZ128rr; - DestReg = get512BitSuperRegister(DestReg); - SrcReg = get512BitSuperRegister(SrcReg); - return X86::VMOVAPSZrr; - } - if (X86::VR256XRegClass.contains(DestReg, SrcReg)) { - if (Subtarget.hasVLX()) - return X86::VMOVAPSZ256rr; - DestReg = get512BitSuperRegister(DestReg); - SrcReg = 
get512BitSuperRegister(SrcReg); - return X86::VMOVAPSZrr; - } - if (X86::VR512RegClass.contains(DestReg, SrcReg)) - return X86::VMOVAPSZrr; - if (MaskRegClassContains(DestReg) && MaskRegClassContains(SrcReg)) - return X86::KMOVWkk; - if (MaskRegClassContains(DestReg) && GRRegClassContains(SrcReg)) { - SrcReg = getX86SubSuperRegister(SrcReg, 32); - return X86::KMOVWkr; - } - if (GRRegClassContains(DestReg) && MaskRegClassContains(SrcReg)) { - DestReg = getX86SubSuperRegister(DestReg, 32); - return X86::KMOVWrk; - } - return 0; -} - void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, unsigned DestReg, unsigned SrcReg, bool KillSrc) const { // First deal with the normal symmetric copies. bool HasAVX = Subtarget.hasAVX(); - bool HasAVX512 = Subtarget.hasAVX512(); + bool HasVLX = Subtarget.hasVLX(); unsigned Opc = 0; if (X86::GR64RegClass.contains(DestReg, SrcReg)) Opc = X86::MOV64rr; @@ -4590,12 +5631,41 @@ void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB, } else if (X86::VR64RegClass.contains(DestReg, SrcReg)) Opc = X86::MMX_MOVQ64rr; - else if (HasAVX512) - Opc = copyPhysRegOpcode_AVX512(DestReg, SrcReg, Subtarget); - else if (X86::VR128RegClass.contains(DestReg, SrcReg)) - Opc = HasAVX ? X86::VMOVAPSrr : X86::MOVAPSrr; - else if (X86::VR256RegClass.contains(DestReg, SrcReg)) - Opc = X86::VMOVAPSYrr; + else if (X86::VR128XRegClass.contains(DestReg, SrcReg)) { + if (HasVLX) + Opc = X86::VMOVAPSZ128rr; + else if (X86::VR128RegClass.contains(DestReg, SrcReg)) + Opc = HasAVX ? X86::VMOVAPSrr : X86::MOVAPSrr; + else { + // If this an extended register and we don't have VLX we need to use a + // 512-bit move. + Opc = X86::VMOVAPSZrr; + const TargetRegisterInfo *TRI = &getRegisterInfo(); + DestReg = TRI->getMatchingSuperReg(DestReg, X86::sub_xmm, + &X86::VR512RegClass); + SrcReg = TRI->getMatchingSuperReg(SrcReg, X86::sub_xmm, + &X86::VR512RegClass); + } + } else if (X86::VR256XRegClass.contains(DestReg, SrcReg)) { + if (HasVLX) + Opc = X86::VMOVAPSZ256rr; + else if (X86::VR256RegClass.contains(DestReg, SrcReg)) + Opc = X86::VMOVAPSYrr; + else { + // If this an extended register and we don't have VLX we need to use a + // 512-bit move. + Opc = X86::VMOVAPSZrr; + const TargetRegisterInfo *TRI = &getRegisterInfo(); + DestReg = TRI->getMatchingSuperReg(DestReg, X86::sub_ymm, + &X86::VR512RegClass); + SrcReg = TRI->getMatchingSuperReg(SrcReg, X86::sub_ymm, + &X86::VR512RegClass); + } + } else if (X86::VR512RegClass.contains(DestReg, SrcReg)) + Opc = X86::VMOVAPSZrr; + // All KMASK RegClasses hold the same k registers, can be tested against anyone. + else if (X86::VK16RegClass.contains(DestReg, SrcReg)) + Opc = Subtarget.hasBWI() ? X86::KMOVQkk : X86::KMOVWkk; if (!Opc) Opc = CopyToFromAsymmetricReg(DestReg, SrcReg, Subtarget); @@ -4708,37 +5778,15 @@ void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB, llvm_unreachable("Cannot emit physreg copy instruction"); } -static unsigned getLoadStoreMaskRegOpcode(const TargetRegisterClass *RC, - bool load) { - switch (RC->getSize()) { - default: - llvm_unreachable("Unknown spill size"); - case 2: - return load ? X86::KMOVWkm : X86::KMOVWmk; - case 4: - return load ? X86::KMOVDkm : X86::KMOVDmk; - case 8: - return load ? 
X86::KMOVQkm : X86::KMOVQmk; - } -} - static unsigned getLoadStoreRegOpcode(unsigned Reg, const TargetRegisterClass *RC, bool isStackAligned, const X86Subtarget &STI, bool load) { - if (STI.hasAVX512()) { - if (isMaskRegClass(RC)) - return getLoadStoreMaskRegOpcode(RC, load); - if (RC->getSize() == 4 && X86::FR32XRegClass.hasSubClassEq(RC)) - return load ? X86::VMOVSSZrm : X86::VMOVSSZmr; - if (RC->getSize() == 8 && X86::FR64XRegClass.hasSubClassEq(RC)) - return load ? X86::VMOVSDZrm : X86::VMOVSDZmr; - if (X86::VR512RegClass.hasSubClassEq(RC)) - return load ? X86::VMOVUPSZrm : X86::VMOVUPSZmr; - } - bool HasAVX = STI.hasAVX(); + bool HasAVX512 = STI.hasAVX512(); + bool HasVLX = STI.hasVLX(); + switch (RC->getSize()) { default: llvm_unreachable("Unknown spill size"); @@ -4751,69 +5799,85 @@ static unsigned getLoadStoreRegOpcode(unsigned Reg, return load ? X86::MOV8rm_NOREX : X86::MOV8mr_NOREX; return load ? X86::MOV8rm : X86::MOV8mr; case 2: + if (X86::VK16RegClass.hasSubClassEq(RC)) + return load ? X86::KMOVWkm : X86::KMOVWmk; assert(X86::GR16RegClass.hasSubClassEq(RC) && "Unknown 2-byte regclass"); return load ? X86::MOV16rm : X86::MOV16mr; case 4: if (X86::GR32RegClass.hasSubClassEq(RC)) return load ? X86::MOV32rm : X86::MOV32mr; - if (X86::FR32RegClass.hasSubClassEq(RC)) + if (X86::FR32XRegClass.hasSubClassEq(RC)) return load ? - (HasAVX ? X86::VMOVSSrm : X86::MOVSSrm) : - (HasAVX ? X86::VMOVSSmr : X86::MOVSSmr); + (HasAVX512 ? X86::VMOVSSZrm : HasAVX ? X86::VMOVSSrm : X86::MOVSSrm) : + (HasAVX512 ? X86::VMOVSSZmr : HasAVX ? X86::VMOVSSmr : X86::MOVSSmr); if (X86::RFP32RegClass.hasSubClassEq(RC)) return load ? X86::LD_Fp32m : X86::ST_Fp32m; + if (X86::VK32RegClass.hasSubClassEq(RC)) + return load ? X86::KMOVDkm : X86::KMOVDmk; llvm_unreachable("Unknown 4-byte regclass"); case 8: if (X86::GR64RegClass.hasSubClassEq(RC)) return load ? X86::MOV64rm : X86::MOV64mr; - if (X86::FR64RegClass.hasSubClassEq(RC)) + if (X86::FR64XRegClass.hasSubClassEq(RC)) return load ? - (HasAVX ? X86::VMOVSDrm : X86::MOVSDrm) : - (HasAVX ? X86::VMOVSDmr : X86::MOVSDmr); + (HasAVX512 ? X86::VMOVSDZrm : HasAVX ? X86::VMOVSDrm : X86::MOVSDrm) : + (HasAVX512 ? X86::VMOVSDZmr : HasAVX ? X86::VMOVSDmr : X86::MOVSDmr); if (X86::VR64RegClass.hasSubClassEq(RC)) return load ? X86::MMX_MOVQ64rm : X86::MMX_MOVQ64mr; if (X86::RFP64RegClass.hasSubClassEq(RC)) return load ? X86::LD_Fp64m : X86::ST_Fp64m; + if (X86::VK64RegClass.hasSubClassEq(RC)) + return load ? X86::KMOVQkm : X86::KMOVQmk; llvm_unreachable("Unknown 8-byte regclass"); case 10: assert(X86::RFP80RegClass.hasSubClassEq(RC) && "Unknown 10-byte regclass"); return load ? X86::LD_Fp80m : X86::ST_FpP80m; case 16: { - assert((X86::VR128RegClass.hasSubClassEq(RC) || - X86::VR128XRegClass.hasSubClassEq(RC))&& "Unknown 16-byte regclass"); + assert(X86::VR128XRegClass.hasSubClassEq(RC) && "Unknown 16-byte regclass"); // If stack is realigned we can use aligned stores. - if (X86::VR128RegClass.hasSubClassEq(RC)) { - if (isStackAligned) - return load ? (HasAVX ? X86::VMOVAPSrm : X86::MOVAPSrm) - : (HasAVX ? X86::VMOVAPSmr : X86::MOVAPSmr); - else - return load ? (HasAVX ? X86::VMOVUPSrm : X86::MOVUPSrm) - : (HasAVX ? X86::VMOVUPSmr : X86::MOVUPSmr); - } - assert(STI.hasVLX() && "Using extended register requires VLX"); if (isStackAligned) - return load ? X86::VMOVAPSZ128rm : X86::VMOVAPSZ128mr; + return load ? + (HasVLX ? X86::VMOVAPSZ128rm : + HasAVX512 ? X86::VMOVAPSZ128rm_NOVLX : + HasAVX ? X86::VMOVAPSrm : + X86::MOVAPSrm): + (HasVLX ? X86::VMOVAPSZ128mr : + HasAVX512 ? 
X86::VMOVAPSZ128mr_NOVLX : + HasAVX ? X86::VMOVAPSmr : + X86::MOVAPSmr); else - return load ? X86::VMOVUPSZ128rm : X86::VMOVUPSZ128mr; + return load ? + (HasVLX ? X86::VMOVUPSZ128rm : + HasAVX512 ? X86::VMOVUPSZ128rm_NOVLX : + HasAVX ? X86::VMOVUPSrm : + X86::MOVUPSrm): + (HasVLX ? X86::VMOVUPSZ128mr : + HasAVX512 ? X86::VMOVUPSZ128mr_NOVLX : + HasAVX ? X86::VMOVUPSmr : + X86::MOVUPSmr); } case 32: - assert((X86::VR256RegClass.hasSubClassEq(RC) || - X86::VR256XRegClass.hasSubClassEq(RC)) && "Unknown 32-byte regclass"); + assert(X86::VR256XRegClass.hasSubClassEq(RC) && "Unknown 32-byte regclass"); // If stack is realigned we can use aligned stores. - if (X86::VR256RegClass.hasSubClassEq(RC)) { - if (isStackAligned) - return load ? X86::VMOVAPSYrm : X86::VMOVAPSYmr; - else - return load ? X86::VMOVUPSYrm : X86::VMOVUPSYmr; - } - assert(STI.hasVLX() && "Using extended register requires VLX"); if (isStackAligned) - return load ? X86::VMOVAPSZ256rm : X86::VMOVAPSZ256mr; + return load ? + (HasVLX ? X86::VMOVAPSZ256rm : + HasAVX512 ? X86::VMOVAPSZ256rm_NOVLX : + X86::VMOVAPSYrm) : + (HasVLX ? X86::VMOVAPSZ256mr : + HasAVX512 ? X86::VMOVAPSZ256mr_NOVLX : + X86::VMOVAPSYmr); else - return load ? X86::VMOVUPSZ256rm : X86::VMOVUPSZ256mr; + return load ? + (HasVLX ? X86::VMOVUPSZ256rm : + HasAVX512 ? X86::VMOVUPSZ256rm_NOVLX : + X86::VMOVUPSYrm) : + (HasVLX ? X86::VMOVUPSZ256mr : + HasAVX512 ? X86::VMOVUPSZ256mr_NOVLX : + X86::VMOVUPSYmr); case 64: assert(X86::VR512RegClass.hasSubClassEq(RC) && "Unknown 64-byte regclass"); - assert(STI.hasVLX() && "Using 512-bit register requires AVX512"); + assert(STI.hasAVX512() && "Using 512-bit register requires AVX512"); if (isStackAligned) return load ? X86::VMOVAPSZrm : X86::VMOVAPSZmr; else @@ -4851,8 +5915,7 @@ bool X86InstrInfo::getMemOpBaseRegImmOfs(MachineInstr &MemOp, unsigned &BaseReg, Offset = DispMO.getImm(); - return MemOp.getOperand(MemRefBegin + X86::AddrIndexReg).getReg() == - X86::NoRegister; + return true; } static unsigned getStoreRegOpcode(unsigned SrcReg, @@ -4876,7 +5939,7 @@ void X86InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const { const MachineFunction &MF = *MBB.getParent(); - assert(MF.getFrameInfo()->getObjectSize(FrameIdx) >= RC->getSize() && + assert(MF.getFrameInfo().getObjectSize(FrameIdx) >= RC->getSize() && "Stack slot too small for store"); unsigned Alignment = std::max<uint32_t>(RC->getSize(), 16); bool isAligned = @@ -4954,6 +6017,8 @@ bool X86InstrInfo::analyzeCompare(const MachineInstr &MI, unsigned &SrcReg, case X86::CMP16ri: case X86::CMP16ri8: case X86::CMP8ri: + if (!MI.getOperand(1).isImm()) + return false; SrcReg = MI.getOperand(0).getReg(); SrcReg2 = 0; CmpMask = ~0; @@ -4985,6 +6050,8 @@ bool X86InstrInfo::analyzeCompare(const MachineInstr &MI, unsigned &SrcReg, case X86::SUB16ri: case X86::SUB16ri8: case X86::SUB8ri: + if (!MI.getOperand(2).isImm()) + return false; SrcReg = MI.getOperand(1).getReg(); SrcReg2 = 0; CmpMask = ~0; @@ -5263,9 +6330,9 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg, // If the definition is in this basic block, RE points to the definition; // otherwise, RE is the rend of the basic block. MachineBasicBlock::reverse_iterator - RI = MachineBasicBlock::reverse_iterator(I), + RI = ++I.getReverse(), RE = CmpInstr.getParent() == MI->getParent() - ? MachineBasicBlock::reverse_iterator(++Def) /* points to MI */ + ? 
Def.getReverse() /* points to MI */ : CmpInstr.getParent()->rend(); MachineInstr *Movr0Inst = nullptr; for (; RI != RE; ++RI) { @@ -5411,9 +6478,8 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg, if (Movr0Inst) { // Look backwards until we find a def that doesn't use the current EFLAGS. Def = Sub; - MachineBasicBlock::reverse_iterator - InsertI = MachineBasicBlock::reverse_iterator(++Def), - InsertE = Sub->getParent()->rend(); + MachineBasicBlock::reverse_iterator InsertI = Def.getReverse(), + InsertE = Sub->getParent()->rend(); for (; InsertI != InsertE; ++InsertI) { MachineInstr *Instr = &*InsertI; if (!Instr->readsRegister(X86::EFLAGS, TRI) && @@ -5455,14 +6521,6 @@ MachineInstr *X86InstrInfo::optimizeLoadInstr(MachineInstr &MI, const MachineRegisterInfo *MRI, unsigned &FoldAsLoadDefReg, MachineInstr *&DefMI) const { - if (FoldAsLoadDefReg == 0) - return nullptr; - // To be conservative, if there exists another load, clear the load candidate. - if (MI.mayLoad()) { - FoldAsLoadDefReg = 0; - return nullptr; - } - // Check whether we can move DefMI here. DefMI = MRI->getVRegDef(FoldAsLoadDefReg); assert(DefMI); @@ -5471,27 +6529,24 @@ MachineInstr *X86InstrInfo::optimizeLoadInstr(MachineInstr &MI, return nullptr; // Collect information about virtual register operands of MI. - unsigned SrcOperandId = 0; - bool FoundSrcOperand = false; - for (unsigned i = 0, e = MI.getDesc().getNumOperands(); i != e; ++i) { + SmallVector<unsigned, 1> SrcOperandIds; + for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { MachineOperand &MO = MI.getOperand(i); if (!MO.isReg()) continue; unsigned Reg = MO.getReg(); if (Reg != FoldAsLoadDefReg) continue; - // Do not fold if we have a subreg use or a def or multiple uses. - if (MO.getSubReg() || MO.isDef() || FoundSrcOperand) + // Do not fold if we have a subreg use or a def. + if (MO.getSubReg() || MO.isDef()) return nullptr; - - SrcOperandId = i; - FoundSrcOperand = true; + SrcOperandIds.push_back(i); } - if (!FoundSrcOperand) + if (SrcOperandIds.empty()) return nullptr; // Check whether we can fold the def into SrcOperandId. - if (MachineInstr *FoldMI = foldMemoryOperand(MI, SrcOperandId, *DefMI)) { + if (MachineInstr *FoldMI = foldMemoryOperand(MI, SrcOperandIds, *DefMI)) { FoldAsLoadDefReg = 0; return FoldMI; } @@ -5553,7 +6608,9 @@ static bool expandMOV32r1(MachineInstrBuilder &MIB, const TargetInstrInfo &TII, return true; } -bool X86InstrInfo::ExpandMOVImmSExti8(MachineInstrBuilder &MIB) const { +static bool ExpandMOVImmSExti8(MachineInstrBuilder &MIB, + const TargetInstrInfo &TII, + const X86Subtarget &Subtarget) { MachineBasicBlock &MBB = *MIB->getParent(); DebugLoc DL = MIB->getDebugLoc(); int64_t Imm = MIB->getOperand(1).getImm(); @@ -5570,23 +6627,23 @@ bool X86InstrInfo::ExpandMOVImmSExti8(MachineInstrBuilder &MIB) const { X86MachineFunctionInfo *X86FI = MBB.getParent()->getInfo<X86MachineFunctionInfo>(); if (X86FI->getUsesRedZone()) { - MIB->setDesc(get(MIB->getOpcode() == X86::MOV32ImmSExti8 ? X86::MOV32ri - : X86::MOV64ri)); + MIB->setDesc(TII.get(MIB->getOpcode() == + X86::MOV32ImmSExti8 ? X86::MOV32ri : X86::MOV64ri)); return true; } // 64-bit mode doesn't have 32-bit push/pop, so use 64-bit operations and // widen the register if necessary. 
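Put differently, the MOV32ImmSExti8/MOV64ImmSExti8 expansion chooses between a plain mov and a push imm8 / pop pair, and only 64-bit push/pop exists in 64-bit mode. A compact sketch of that decision; the assembly strings are only schematic, and the red-zone restriction is the one checked just above:

#include <cassert>
#include <string>

struct Expansion {
  std::string Ops;      // rough shape of the emitted sequence
  unsigned StackAdjust; // bytes the stack temporarily grows by
};

// Materialize a sign-extended 8-bit immediate in few code bytes. If the
// function uses the red zone we must not touch memory below RSP, so the
// push/pop trick is off the table and a plain move is used instead.
static Expansion expandImmSExti8(bool Is64Bit, bool UsesRedZone) {
  if (Is64Bit && UsesRedZone)
    return {"mov{l|q} $imm32, %reg", 0};   // no stack traffic
  if (Is64Bit)
    return {"pushq $imm8; popq %reg", 8};  // no 32-bit push/pop in 64-bit mode
  return {"pushl $imm8; popl %reg", 4};
}

int main() {
  assert(expandImmSExti8(true, true).StackAdjust == 0);
  assert(expandImmSExti8(true, false).StackAdjust == 8);
  assert(expandImmSExti8(false, false).StackAdjust == 4);
  return 0;
}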
StackAdjustment = 8; - BuildMI(MBB, I, DL, get(X86::PUSH64i8)).addImm(Imm); - MIB->setDesc(get(X86::POP64r)); + BuildMI(MBB, I, DL, TII.get(X86::PUSH64i8)).addImm(Imm); + MIB->setDesc(TII.get(X86::POP64r)); MIB->getOperand(0) .setReg(getX86SubSuperRegister(MIB->getOperand(0).getReg(), 64)); } else { assert(MIB->getOpcode() == X86::MOV32ImmSExti8); StackAdjustment = 4; - BuildMI(MBB, I, DL, get(X86::PUSH32i8)).addImm(Imm); - MIB->setDesc(get(X86::POP32r)); + BuildMI(MBB, I, DL, TII.get(X86::PUSH32i8)).addImm(Imm); + MIB->setDesc(TII.get(X86::POP32r)); } // Build CFI if necessary. @@ -5616,7 +6673,9 @@ static void expandLoadStackGuard(MachineInstrBuilder &MIB, unsigned Reg = MIB->getOperand(0).getReg(); const GlobalValue *GV = cast<GlobalValue>((*MIB->memoperands_begin())->getValue()); - auto Flags = MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant; + auto Flags = MachineMemOperand::MOLoad | + MachineMemOperand::MODereferenceable | + MachineMemOperand::MOInvariant; MachineMemOperand *MMO = MBB.getParent()->getMachineMemOperand( MachinePointerInfo::getGOT(*MBB.getParent()), Flags, 8, 8); MachineBasicBlock::iterator I = MIB.getInstr(); @@ -5629,6 +6688,53 @@ static void expandLoadStackGuard(MachineInstrBuilder &MIB, MIB.addReg(Reg, RegState::Kill).addImm(1).addReg(0).addImm(0).addReg(0); } +// This is used to handle spills for 128/256-bit registers when we have AVX512, +// but not VLX. If it uses an extended register we need to use an instruction +// that loads the lower 128/256-bit, but is available with only AVX512F. +static bool expandNOVLXLoad(MachineInstrBuilder &MIB, + const TargetRegisterInfo *TRI, + const MCInstrDesc &LoadDesc, + const MCInstrDesc &BroadcastDesc, + unsigned SubIdx) { + unsigned DestReg = MIB->getOperand(0).getReg(); + // Check if DestReg is XMM16-31 or YMM16-31. + if (TRI->getEncodingValue(DestReg) < 16) { + // We can use a normal VEX encoded load. + MIB->setDesc(LoadDesc); + } else { + // Use a 128/256-bit VBROADCAST instruction. + MIB->setDesc(BroadcastDesc); + // Change the destination to a 512-bit register. + DestReg = TRI->getMatchingSuperReg(DestReg, SubIdx, &X86::VR512RegClass); + MIB->getOperand(0).setReg(DestReg); + } + return true; +} + +// This is used to handle spills for 128/256-bit registers when we have AVX512, +// but not VLX. If it uses an extended register we need to use an instruction +// that stores the lower 128/256-bit, but is available with only AVX512F. +static bool expandNOVLXStore(MachineInstrBuilder &MIB, + const TargetRegisterInfo *TRI, + const MCInstrDesc &StoreDesc, + const MCInstrDesc &ExtractDesc, + unsigned SubIdx) { + unsigned SrcReg = MIB->getOperand(X86::AddrNumOperands).getReg(); + // Check if DestReg is XMM16-31 or YMM16-31. + if (TRI->getEncodingValue(SrcReg) < 16) { + // We can use a normal VEX encoded store. + MIB->setDesc(StoreDesc); + } else { + // Use a VEXTRACTF instruction. + MIB->setDesc(ExtractDesc); + // Change the destination to a 512-bit register. + SrcReg = TRI->getMatchingSuperReg(SrcReg, SubIdx, &X86::VR512RegClass); + MIB->getOperand(X86::AddrNumOperands).setReg(SrcReg); + MIB.addImm(0x0); // Append immediate to extract from the lower bits. 
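The two expandNOVLX helpers choose between a normal VEX move and a 512-bit broadcast/extract purely from the register's encoding value. A schematic standalone sketch of the 128-bit case; the mnemonics are illustrative only, and the 256-bit case substitutes vbroadcastf64x4 / vextractf64x4:

#include <cassert>
#include <string>

// AVX512F without VLX has no 128/256-bit moves that can reach XMM16-YMM31,
// so a reload or spill of such a register is rewritten to go through the low
// part of the corresponding 512-bit register instead.
static std::string pickSpillReload(unsigned EncodingValue, bool IsLoad) {
  if (EncodingValue < 16)                     // xmm0-15: the VEX encoding works
    return IsLoad ? "vmovaps (mem), %xmmN" : "vmovaps %xmmN, (mem)";
  return IsLoad ? "vbroadcastf32x4 (mem), %zmmN"     // lane 0 gets the reloaded value
                : "vextractf32x4 $0, %zmmN, (mem)";  // stores only lane 0
}

int main() {
  assert(pickSpillReload(3, true) == "vmovaps (mem), %xmmN");
  assert(pickSpillReload(17, false) == "vextractf32x4 $0, %zmmN, (mem)");
  return 0;
}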
+ } + + return true; +} bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { bool HasAVX = Subtarget.hasAVX(); MachineInstrBuilder MIB(*MI.getParent()->getParent(), MI); @@ -5641,7 +6747,7 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { return expandMOV32r1(MIB, *this, /*MinusOne=*/ true); case X86::MOV32ImmSExti8: case X86::MOV64ImmSExti8: - return ExpandMOVImmSExti8(MIB); + return ExpandMOVImmSExti8(MIB, *this, Subtarget); case X86::SETB_C8r: return Expand2AddrUndef(MIB, get(X86::SBB8rr)); case X86::SETB_C16r: @@ -5663,6 +6769,9 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { return Expand2AddrUndef(MIB, get(X86::VPXORDZ256rr)); case X86::AVX512_512_SET0: return Expand2AddrUndef(MIB, get(X86::VPXORDZrr)); + case X86::AVX512_FsFLD0SS: + case X86::AVX512_FsFLD0SD: + return Expand2AddrUndef(MIB, get(X86::VXORPSZ128rr)); case X86::V_SETALLONES: return Expand2AddrUndef(MIB, get(HasAVX ? X86::VPCMPEQDrr : X86::PCMPEQDrr)); case X86::AVX2_SETALLONES: @@ -5676,6 +6785,45 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { .addReg(Reg, RegState::Undef).addImm(0xff); return true; } + case X86::AVX512_512_SEXT_MASK_32: + case X86::AVX512_512_SEXT_MASK_64: { + unsigned Reg = MIB->getOperand(0).getReg(); + unsigned MaskReg = MIB->getOperand(1).getReg(); + unsigned MaskState = getRegState(MIB->getOperand(1)); + unsigned Opc = (MI.getOpcode() == X86::AVX512_512_SEXT_MASK_64) ? + X86::VPTERNLOGQZrrikz : X86::VPTERNLOGDZrrikz; + MI.RemoveOperand(1); + MIB->setDesc(get(Opc)); + // VPTERNLOG needs 3 register inputs and an immediate. + // 0xff will return 1s for any input. + MIB.addReg(Reg, RegState::Undef).addReg(MaskReg, MaskState) + .addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef).addImm(0xff); + return true; + } + case X86::VMOVAPSZ128rm_NOVLX: + return expandNOVLXLoad(MIB, &getRegisterInfo(), get(X86::VMOVAPSrm), + get(X86::VBROADCASTF32X4rm), X86::sub_xmm); + case X86::VMOVUPSZ128rm_NOVLX: + return expandNOVLXLoad(MIB, &getRegisterInfo(), get(X86::VMOVUPSrm), + get(X86::VBROADCASTF32X4rm), X86::sub_xmm); + case X86::VMOVAPSZ256rm_NOVLX: + return expandNOVLXLoad(MIB, &getRegisterInfo(), get(X86::VMOVAPSYrm), + get(X86::VBROADCASTF64X4rm), X86::sub_ymm); + case X86::VMOVUPSZ256rm_NOVLX: + return expandNOVLXLoad(MIB, &getRegisterInfo(), get(X86::VMOVUPSYrm), + get(X86::VBROADCASTF64X4rm), X86::sub_ymm); + case X86::VMOVAPSZ128mr_NOVLX: + return expandNOVLXStore(MIB, &getRegisterInfo(), get(X86::VMOVAPSmr), + get(X86::VEXTRACTF32x4Zmr), X86::sub_xmm); + case X86::VMOVUPSZ128mr_NOVLX: + return expandNOVLXStore(MIB, &getRegisterInfo(), get(X86::VMOVUPSmr), + get(X86::VEXTRACTF32x4Zmr), X86::sub_xmm); + case X86::VMOVAPSZ256mr_NOVLX: + return expandNOVLXStore(MIB, &getRegisterInfo(), get(X86::VMOVAPSYmr), + get(X86::VEXTRACTF64x4Zmr), X86::sub_ymm); + case X86::VMOVUPSZ256mr_NOVLX: + return expandNOVLXStore(MIB, &getRegisterInfo(), get(X86::VMOVUPSYmr), + get(X86::VEXTRACTF64x4Zmr), X86::sub_ymm); case X86::TEST8ri_NOREX: MI.setDesc(get(X86::TEST8ri)); return true; @@ -5801,6 +6949,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandCustom( switch (MI.getOpcode()) { case X86::INSERTPSrr: case X86::VINSERTPSrr: + case X86::VINSERTPSZrr: // Attempt to convert the load of inserted vector into a fold load // of a single float. 
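The INSERTPS folding below rewrites the immediate when the register source becomes a memory operand. A self-contained sketch of that immediate/offset arithmetic (standard INSERTPS bit layout; the helper name is made up):

#include <cassert>
#include <utility>

// When the second source of (V)INSERTPSrr is turned into a memory operand the
// memory form reads a single float, so the pointer is advanced to the selected
// source element and the source-select bits are dropped from the immediate.
static std::pair<int, unsigned> foldInsertPSImm(unsigned Imm) {
  unsigned ZMask  = Imm & 0xf;        // bits 3:0 - lanes to zero in the result
  unsigned DstIdx = (Imm >> 4) & 0x3; // bits 5:4 - destination lane
  unsigned SrcIdx = (Imm >> 6) & 0x3; // bits 7:6 - lane taken from the source
  int PtrOffset   = SrcIdx * 4;       // point the load at that element
  unsigned NewImm = (DstIdx << 4) | ZMask;
  return {PtrOffset, NewImm};
}

int main() {
  // Take element 2 of the source, insert into element 1, zero lane 3:
  // Imm = (2 << 6) | (1 << 4) | 0b1000.
  std::pair<int, unsigned> R = foldInsertPSImm((2u << 6) | (1u << 4) | 0x8u);
  assert(R.first == 8 && R.second == ((1u << 4) | 0x8u));
  return 0;
}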
if (OpNum == 2) { @@ -5814,8 +6963,9 @@ MachineInstr *X86InstrInfo::foldMemoryOperandCustom( int PtrOffset = SrcIdx * 4; unsigned NewImm = (DstIdx << 4) | ZMask; unsigned NewOpCode = - (MI.getOpcode() == X86::VINSERTPSrr ? X86::VINSERTPSrm - : X86::INSERTPSrm); + (MI.getOpcode() == X86::VINSERTPSZrr) ? X86::VINSERTPSZrm : + (MI.getOpcode() == X86::VINSERTPSrr) ? X86::VINSERTPSrm : + X86::INSERTPSrm; MachineInstr *NewMI = FuseInst(MF, NewOpCode, OpNum, MOs, InsertPt, MI, *this, PtrOffset); NewMI->getOperand(NewMI->getNumOperands() - 1).setImm(NewImm); @@ -5825,6 +6975,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandCustom( break; case X86::MOVHLPSrr: case X86::VMOVHLPSrr: + case X86::VMOVHLPSZrr: // Move the upper 64-bits of the second operand to the lower 64-bits. // To fold the load, adjust the pointer to the upper and use (V)MOVLPS. // TODO: In most cases AVX doesn't have a 8-byte alignment requirement. @@ -5832,8 +6983,9 @@ MachineInstr *X86InstrInfo::foldMemoryOperandCustom( unsigned RCSize = getRegClass(MI.getDesc(), OpNum, &RI, MF)->getSize(); if (Size <= RCSize && 8 <= Align) { unsigned NewOpCode = - (MI.getOpcode() == X86::VMOVHLPSrr ? X86::VMOVLPSrm - : X86::MOVLPSrm); + (MI.getOpcode() == X86::VMOVHLPSZrr) ? X86::VMOVLPSZ128rm : + (MI.getOpcode() == X86::VMOVHLPSrr) ? X86::VMOVLPSrm : + X86::MOVLPSrm; MachineInstr *NewMI = FuseInst(MF, NewOpCode, OpNum, MOs, InsertPt, MI, *this, 8); return NewMI; @@ -6042,12 +7194,8 @@ static bool hasPartialRegUpdate(unsigned Opcode) { case X86::CVTSI2SD64rm: case X86::CVTSD2SSrr: case X86::CVTSD2SSrm: - case X86::Int_CVTSD2SSrr: - case X86::Int_CVTSD2SSrm: case X86::CVTSS2SDrr: case X86::CVTSS2SDrm: - case X86::Int_CVTSS2SDrr: - case X86::Int_CVTSS2SDrm: case X86::MOVHPDrm: case X86::MOVHPSrm: case X86::MOVLPDrm: @@ -6058,10 +7206,8 @@ static bool hasPartialRegUpdate(unsigned Opcode) { case X86::RCPSSm_Int: case X86::ROUNDSDr: case X86::ROUNDSDm: - case X86::ROUNDSDr_Int: case X86::ROUNDSSr: case X86::ROUNDSSm: - case X86::ROUNDSSr_Int: case X86::RSQRTSSr: case X86::RSQRTSSm: case X86::RSQRTSSr_Int: @@ -6134,28 +7280,95 @@ static bool hasUndefRegUpdate(unsigned Opcode) { case X86::Int_VCVTSS2SDrr: case X86::Int_VCVTSS2SDrm: case X86::VRCPSSr: + case X86::VRCPSSr_Int: case X86::VRCPSSm: case X86::VRCPSSm_Int: case X86::VROUNDSDr: case X86::VROUNDSDm: case X86::VROUNDSDr_Int: + case X86::VROUNDSDm_Int: case X86::VROUNDSSr: case X86::VROUNDSSm: case X86::VROUNDSSr_Int: + case X86::VROUNDSSm_Int: case X86::VRSQRTSSr: + case X86::VRSQRTSSr_Int: case X86::VRSQRTSSm: case X86::VRSQRTSSm_Int: case X86::VSQRTSSr: + case X86::VSQRTSSr_Int: case X86::VSQRTSSm: case X86::VSQRTSSm_Int: case X86::VSQRTSDr: + case X86::VSQRTSDr_Int: case X86::VSQRTSDm: case X86::VSQRTSDm_Int: - // AVX-512 + // AVX-512 + case X86::VCVTSI2SSZrr: + case X86::VCVTSI2SSZrm: + case X86::VCVTSI2SSZrr_Int: + case X86::VCVTSI2SSZrrb_Int: + case X86::VCVTSI2SSZrm_Int: + case X86::VCVTSI642SSZrr: + case X86::VCVTSI642SSZrm: + case X86::VCVTSI642SSZrr_Int: + case X86::VCVTSI642SSZrrb_Int: + case X86::VCVTSI642SSZrm_Int: + case X86::VCVTSI2SDZrr: + case X86::VCVTSI2SDZrm: + case X86::VCVTSI2SDZrr_Int: + case X86::VCVTSI2SDZrrb_Int: + case X86::VCVTSI2SDZrm_Int: + case X86::VCVTSI642SDZrr: + case X86::VCVTSI642SDZrm: + case X86::VCVTSI642SDZrr_Int: + case X86::VCVTSI642SDZrrb_Int: + case X86::VCVTSI642SDZrm_Int: + case X86::VCVTUSI2SSZrr: + case X86::VCVTUSI2SSZrm: + case X86::VCVTUSI2SSZrr_Int: + case X86::VCVTUSI2SSZrrb_Int: + case X86::VCVTUSI2SSZrm_Int: + case X86::VCVTUSI642SSZrr: + 
case X86::VCVTUSI642SSZrm: + case X86::VCVTUSI642SSZrr_Int: + case X86::VCVTUSI642SSZrrb_Int: + case X86::VCVTUSI642SSZrm_Int: + case X86::VCVTUSI2SDZrr: + case X86::VCVTUSI2SDZrm: + case X86::VCVTUSI2SDZrr_Int: + case X86::VCVTUSI2SDZrm_Int: + case X86::VCVTUSI642SDZrr: + case X86::VCVTUSI642SDZrm: + case X86::VCVTUSI642SDZrr_Int: + case X86::VCVTUSI642SDZrrb_Int: + case X86::VCVTUSI642SDZrm_Int: case X86::VCVTSD2SSZrr: + case X86::VCVTSD2SSZrrb: case X86::VCVTSD2SSZrm: case X86::VCVTSS2SDZrr: + case X86::VCVTSS2SDZrrb: case X86::VCVTSS2SDZrm: + case X86::VRNDSCALESDr: + case X86::VRNDSCALESDrb: + case X86::VRNDSCALESDm: + case X86::VRNDSCALESSr: + case X86::VRNDSCALESSrb: + case X86::VRNDSCALESSm: + case X86::VRCP14SSrr: + case X86::VRCP14SSrm: + case X86::VRSQRT14SSrr: + case X86::VRSQRT14SSrm: + case X86::VSQRTSSZr: + case X86::VSQRTSSZr_Int: + case X86::VSQRTSSZrb_Int: + case X86::VSQRTSSZm: + case X86::VSQRTSSZm_Int: + case X86::VSQRTSDZr: + case X86::VSQRTSDZr_Int: + case X86::VSQRTSDZrb_Int: + case X86::VSQRTSDZm: + case X86::VSQRTSDZm_Int: return true; } @@ -6233,9 +7446,17 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI, if (!MF.getFunction()->optForSize() && hasPartialRegUpdate(MI.getOpcode())) return nullptr; - const MachineFrameInfo *MFI = MF.getFrameInfo(); - unsigned Size = MFI->getObjectSize(FrameIndex); - unsigned Alignment = MFI->getObjectAlignment(FrameIndex); + // Don't fold subreg spills, or reloads that use a high subreg. + for (auto Op : Ops) { + MachineOperand &MO = MI.getOperand(Op); + auto SubReg = MO.getSubReg(); + if (SubReg && (MO.isDef() || SubReg == X86::sub_8bit_hi)) + return nullptr; + } + + const MachineFrameInfo &MFI = MF.getFrameInfo(); + unsigned Size = MFI.getObjectSize(FrameIndex); + unsigned Alignment = MFI.getObjectAlignment(FrameIndex); // If the function stack isn't realigned we don't want to fold instructions // that need increased alignment. if (!RI.needsStackRealignment(MF)) @@ -6295,15 +7516,26 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI, // instruction isn't scalar (SS). 
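The intent of isNonFoldablePartialRegisterLoad, simplified: a narrow scalar load may only be folded into users that read just the low element, because the user's memory form would otherwise read the full vector width from memory. A toy model of that rule (parameter names are made up):

#include <cassert>

// A 4-byte load like movss defines only the low element of the 128-bit
// register (zeroing the rest). Folding it is safe when the user's memory form
// also touches just that low element, as the *_Int scalar opcodes listed in
// these switches do.
static bool canFoldPartialLoad(unsigned LoadBytes, unsigned UserRegBytes,
                               bool UserReadsOnlyLowElement) {
  if (LoadBytes >= UserRegBytes)
    return true;                   // the load already covers the whole register
  return UserReadsOnlyLowElement;  // e.g. ADDSSrr_Int, VFMADD213SSr_Int, ...
}

int main() {
  assert(canFoldPartialLoad(4, 16, true));   // movss feeding addss: fine
  assert(!canFoldPartialLoad(4, 16, false)); // movss feeding addps: not fine
  return 0;
}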
switch (UserOpc) { case X86::ADDSSrr_Int: case X86::VADDSSrr_Int: case X86::VADDSSZrr_Int: + case X86::Int_CMPSSrr: case X86::Int_VCMPSSrr: case X86::VCMPSSZrr_Int: case X86::DIVSSrr_Int: case X86::VDIVSSrr_Int: case X86::VDIVSSZrr_Int: + case X86::MAXSSrr_Int: case X86::VMAXSSrr_Int: case X86::VMAXSSZrr_Int: + case X86::MINSSrr_Int: case X86::VMINSSrr_Int: case X86::VMINSSZrr_Int: case X86::MULSSrr_Int: case X86::VMULSSrr_Int: case X86::VMULSSZrr_Int: case X86::SUBSSrr_Int: case X86::VSUBSSrr_Int: case X86::VSUBSSZrr_Int: - case X86::VFMADDSSr132r_Int: case X86::VFNMADDSSr132r_Int: - case X86::VFMADDSSr213r_Int: case X86::VFNMADDSSr213r_Int: - case X86::VFMADDSSr231r_Int: case X86::VFNMADDSSr231r_Int: - case X86::VFMSUBSSr132r_Int: case X86::VFNMSUBSSr132r_Int: - case X86::VFMSUBSSr213r_Int: case X86::VFNMSUBSSr213r_Int: - case X86::VFMSUBSSr231r_Int: case X86::VFNMSUBSSr231r_Int: + case X86::VFMADDSS4rr_Int: case X86::VFNMADDSS4rr_Int: + case X86::VFMSUBSS4rr_Int: case X86::VFNMSUBSS4rr_Int: + case X86::VFMADD132SSr_Int: case X86::VFNMADD132SSr_Int: + case X86::VFMADD213SSr_Int: case X86::VFNMADD213SSr_Int: + case X86::VFMADD231SSr_Int: case X86::VFNMADD231SSr_Int: + case X86::VFMSUB132SSr_Int: case X86::VFNMSUB132SSr_Int: + case X86::VFMSUB213SSr_Int: case X86::VFNMSUB213SSr_Int: + case X86::VFMSUB231SSr_Int: case X86::VFNMSUB231SSr_Int: + case X86::VFMADD132SSZr_Int: case X86::VFNMADD132SSZr_Int: + case X86::VFMADD213SSZr_Int: case X86::VFNMADD213SSZr_Int: + case X86::VFMADD231SSZr_Int: case X86::VFNMADD231SSZr_Int: + case X86::VFMSUB132SSZr_Int: case X86::VFNMSUB132SSZr_Int: + case X86::VFMSUB213SSZr_Int: case X86::VFNMSUB213SSZr_Int: + case X86::VFMSUB231SSZr_Int: case X86::VFNMSUB231SSZr_Int: return false; default: return true; @@ -6317,15 +7549,26 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI, // instruction isn't scalar (SD). 
switch (UserOpc) { case X86::ADDSDrr_Int: case X86::VADDSDrr_Int: case X86::VADDSDZrr_Int: + case X86::Int_CMPSDrr: case X86::Int_VCMPSDrr: case X86::VCMPSDZrr_Int: case X86::DIVSDrr_Int: case X86::VDIVSDrr_Int: case X86::VDIVSDZrr_Int: + case X86::MAXSDrr_Int: case X86::VMAXSDrr_Int: case X86::VMAXSDZrr_Int: + case X86::MINSDrr_Int: case X86::VMINSDrr_Int: case X86::VMINSDZrr_Int: case X86::MULSDrr_Int: case X86::VMULSDrr_Int: case X86::VMULSDZrr_Int: case X86::SUBSDrr_Int: case X86::VSUBSDrr_Int: case X86::VSUBSDZrr_Int: - case X86::VFMADDSDr132r_Int: case X86::VFNMADDSDr132r_Int: - case X86::VFMADDSDr213r_Int: case X86::VFNMADDSDr213r_Int: - case X86::VFMADDSDr231r_Int: case X86::VFNMADDSDr231r_Int: - case X86::VFMSUBSDr132r_Int: case X86::VFNMSUBSDr132r_Int: - case X86::VFMSUBSDr213r_Int: case X86::VFNMSUBSDr213r_Int: - case X86::VFMSUBSDr231r_Int: case X86::VFNMSUBSDr231r_Int: + case X86::VFMADDSD4rr_Int: case X86::VFNMADDSD4rr_Int: + case X86::VFMSUBSD4rr_Int: case X86::VFNMSUBSD4rr_Int: + case X86::VFMADD132SDr_Int: case X86::VFNMADD132SDr_Int: + case X86::VFMADD213SDr_Int: case X86::VFNMADD213SDr_Int: + case X86::VFMADD231SDr_Int: case X86::VFNMADD231SDr_Int: + case X86::VFMSUB132SDr_Int: case X86::VFNMSUB132SDr_Int: + case X86::VFMSUB213SDr_Int: case X86::VFNMSUB213SDr_Int: + case X86::VFMSUB231SDr_Int: case X86::VFNMSUB231SDr_Int: + case X86::VFMADD132SDZr_Int: case X86::VFNMADD132SDZr_Int: + case X86::VFMADD213SDZr_Int: case X86::VFNMADD213SDZr_Int: + case X86::VFMADD231SDZr_Int: case X86::VFNMADD231SDZr_Int: + case X86::VFMSUB132SDZr_Int: case X86::VFNMSUB132SDZr_Int: + case X86::VFMSUB213SDZr_Int: case X86::VFNMSUB213SDZr_Int: + case X86::VFMSUB231SDZr_Int: case X86::VFNMSUB231SDZr_Int: return false; default: return true; @@ -6339,6 +7582,14 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops, MachineBasicBlock::iterator InsertPt, MachineInstr &LoadMI, LiveIntervals *LIS) const { + + // TODO: Support the case where LoadMI loads a wide register, but MI + // only uses a subreg. + for (auto Op : Ops) { + if (MI.getOperand(Op).getSubReg()) + return nullptr; + } + // If loading from a FrameIndex, fold directly from the FrameIndex. unsigned NumOps = LoadMI.getDesc().getNumOperands(); int FrameIndex; @@ -6376,9 +7627,11 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( Alignment = 16; break; case X86::FsFLD0SD: + case X86::AVX512_FsFLD0SD: Alignment = 8; break; case X86::FsFLD0SS: + case X86::AVX512_FsFLD0SS: Alignment = 4; break; default: @@ -6415,7 +7668,9 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( case X86::AVX512_512_SET0: case X86::AVX512_512_SETALLONES: case X86::FsFLD0SD: - case X86::FsFLD0SS: { + case X86::AVX512_FsFLD0SD: + case X86::FsFLD0SS: + case X86::AVX512_FsFLD0SS: { // Folding a V_SET0 or V_SETALLONES as a load, to ease register pressure. // Create a constant-pool entry and operands to load from it. 
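When one of these zero/all-ones pseudos is folded as a load, the value is materialized through the constant pool, and the entry's type and alignment depend on the pseudo. A tiny sketch of that mapping, keyed by name strings instead of real opcodes and covering only the cases visible in this hunk:

#include <cassert>
#include <string>

struct PoolEntry {
  std::string IRType; // type of the constant placed in the constant pool
  unsigned Align;     // alignment requested for the folded load
};

// Fold "materialize 0.0 / all-ones" pseudos as constant-pool loads to ease
// register pressure; scalar pseudos get a scalar constant and alignment.
static PoolEntry poolEntryFor(const std::string &Pseudo) {
  if (Pseudo == "FsFLD0SS" || Pseudo == "AVX512_FsFLD0SS")
    return {"float", 4};
  if (Pseudo == "FsFLD0SD" || Pseudo == "AVX512_FsFLD0SD")
    return {"double", 8};
  // The 512-bit zero / all-ones pseudos use a <16 x i32> constant; the real
  // code derives the alignment from the vector width (not modeled here).
  return {"<16 x i32>", 0};
}

int main() {
  assert(poolEntryFor("AVX512_FsFLD0SS").Align == 4);
  assert(poolEntryFor("FsFLD0SD").IRType == "double");
  return 0;
}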
@@ -6441,9 +7696,9 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( MachineConstantPool &MCP = *MF.getConstantPool(); Type *Ty; unsigned Opc = LoadMI.getOpcode(); - if (Opc == X86::FsFLD0SS) + if (Opc == X86::FsFLD0SS || Opc == X86::AVX512_FsFLD0SS) Ty = Type::getFloatTy(MF.getFunction()->getContext()); - else if (Opc == X86::FsFLD0SD) + else if (Opc == X86::FsFLD0SD || Opc == X86::AVX512_FsFLD0SD) Ty = Type::getDoubleTy(MF.getFunction()->getContext()); else if (Opc == X86::AVX512_512_SET0 || Opc == X86::AVX512_512_SETALLONES) Ty = VectorType::get(Type::getInt32Ty(MF.getFunction()->getContext()),16); @@ -6649,7 +7904,7 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N, return false; // FIXME: If a VR128 can have size 32, we should be checking if a 32-byte // memory access is slow above. - unsigned Alignment = RC->getSize() == 32 ? 32 : 16; + unsigned Alignment = std::max<uint32_t>(RC->getSize(), 16); bool isAligned = (*MMOs.first) && (*MMOs.first)->getAlignment() >= Alignment; Load = DAG.getMachineNode(getLoadRegOpcode(0, RC, isAligned, Subtarget), dl, @@ -6694,7 +7949,7 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N, return false; // FIXME: If a VR128 can have size 32, we should be checking if a 32-byte // memory access is slow above. - unsigned Alignment = RC->getSize() == 32 ? 32 : 16; + unsigned Alignment = std::max<uint32_t>(RC->getSize(), 16); bool isAligned = (*MMOs.first) && (*MMOs.first)->getAlignment() >= Alignment; SDNode *Store = @@ -6746,8 +8001,6 @@ X86InstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2, case X86::MOVSDrm: case X86::MMX_MOVD64rm: case X86::MMX_MOVQ64rm: - case X86::FsMOVAPSrm: - case X86::FsMOVAPDrm: case X86::MOVAPSrm: case X86::MOVUPSrm: case X86::MOVAPDrm: @@ -6757,8 +8010,6 @@ X86InstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2, // AVX load instructions case X86::VMOVSSrm: case X86::VMOVSDrm: - case X86::FsVMOVAPSrm: - case X86::FsVMOVAPDrm: case X86::VMOVAPSrm: case X86::VMOVUPSrm: case X86::VMOVAPDrm: @@ -6776,6 +8027,8 @@ X86InstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2, case X86::VMOVSDZrm: case X86::VMOVAPSZ128rm: case X86::VMOVUPSZ128rm: + case X86::VMOVAPSZ128rm_NOVLX: + case X86::VMOVUPSZ128rm_NOVLX: case X86::VMOVAPDZ128rm: case X86::VMOVUPDZ128rm: case X86::VMOVDQU8Z128rm: @@ -6786,6 +8039,8 @@ X86InstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2, case X86::VMOVDQU64Z128rm: case X86::VMOVAPSZ256rm: case X86::VMOVUPSZ256rm: + case X86::VMOVAPSZ256rm_NOVLX: + case X86::VMOVUPSZ256rm_NOVLX: case X86::VMOVAPDZ256rm: case X86::VMOVUPDZ256rm: case X86::VMOVDQU8Z256rm: @@ -6823,8 +8078,6 @@ X86InstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2, case X86::MOVSDrm: case X86::MMX_MOVD64rm: case X86::MMX_MOVQ64rm: - case X86::FsMOVAPSrm: - case X86::FsMOVAPDrm: case X86::MOVAPSrm: case X86::MOVUPSrm: case X86::MOVAPDrm: @@ -6834,8 +8087,6 @@ X86InstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2, // AVX load instructions case X86::VMOVSSrm: case X86::VMOVSDrm: - case X86::FsVMOVAPSrm: - case X86::FsVMOVAPDrm: case X86::VMOVAPSrm: case X86::VMOVUPSrm: case X86::VMOVAPDrm: @@ -6853,6 +8104,8 @@ X86InstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2, case X86::VMOVSDZrm: case X86::VMOVAPSZ128rm: case X86::VMOVUPSZ128rm: + case X86::VMOVAPSZ128rm_NOVLX: + case X86::VMOVUPSZ128rm_NOVLX: case X86::VMOVAPDZ128rm: case X86::VMOVUPDZ128rm: case X86::VMOVDQU8Z128rm: @@ -6863,6 +8116,8 @@ X86InstrInfo::areLoadsFromSameBasePtr(SDNode 
*Load1, SDNode *Load2, case X86::VMOVDQU64Z128rm: case X86::VMOVAPSZ256rm: case X86::VMOVUPSZ256rm: + case X86::VMOVAPSZ256rm_NOVLX: + case X86::VMOVUPSZ256rm_NOVLX: case X86::VMOVAPDZ256rm: case X86::VMOVUPDZ256rm: case X86::VMOVDQU8Z256rm: @@ -6960,8 +8215,8 @@ bool X86InstrInfo::shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2, return true; } -bool X86InstrInfo::shouldScheduleAdjacent(MachineInstr &First, - MachineInstr &Second) const { +bool X86InstrInfo::shouldScheduleAdjacent(const MachineInstr &First, + const MachineInstr &Second) const { // Check if this processor supports macro-fusion. Since this is a minor // heuristic, we haven't specifically reserved a feature. hasAVX is a decent // proxy for SandyBridge+. @@ -7120,7 +8375,7 @@ bool X86InstrInfo::shouldScheduleAdjacent(MachineInstr &First, } bool X86InstrInfo:: -ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const { +reverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const { assert(Cond.size() == 1 && "Invalid X86 branch condition!"); X86::CondCode CC = static_cast<X86::CondCode>(Cond[0].getImm()); Cond[0].setImm(GetOppositeBranchCondition(CC)); @@ -7168,7 +8423,10 @@ static const uint16_t ReplaceableInstrs[][3] = { { X86::MOVAPSrr, X86::MOVAPDrr, X86::MOVDQArr }, { X86::MOVUPSmr, X86::MOVUPDmr, X86::MOVDQUmr }, { X86::MOVUPSrm, X86::MOVUPDrm, X86::MOVDQUrm }, - { X86::MOVLPSmr, X86::MOVLPDmr, X86::MOVPQI2QImr }, + { X86::MOVLPSmr, X86::MOVLPDmr, X86::MOVPQI2QImr }, + { X86::MOVSSmr, X86::MOVSSmr, X86::MOVPDI2DImr }, + { X86::MOVSDrm, X86::MOVSDrm, X86::MOVQI2PQIrm }, + { X86::MOVSSrm, X86::MOVSSrm, X86::MOVDI2PDIrm }, { X86::MOVNTPSmr, X86::MOVNTPDmr, X86::MOVNTDQmr }, { X86::ANDNPSrm, X86::ANDNPDrm, X86::PANDNrm }, { X86::ANDNPSrr, X86::ANDNPDrr, X86::PANDNrr }, @@ -7184,7 +8442,10 @@ static const uint16_t ReplaceableInstrs[][3] = { { X86::VMOVAPSrr, X86::VMOVAPDrr, X86::VMOVDQArr }, { X86::VMOVUPSmr, X86::VMOVUPDmr, X86::VMOVDQUmr }, { X86::VMOVUPSrm, X86::VMOVUPDrm, X86::VMOVDQUrm }, - { X86::VMOVLPSmr, X86::VMOVLPDmr, X86::VMOVPQI2QImr }, + { X86::VMOVLPSmr, X86::VMOVLPDmr, X86::VMOVPQI2QImr }, + { X86::VMOVSSmr, X86::VMOVSSmr, X86::VMOVPDI2DImr }, + { X86::VMOVSDrm, X86::VMOVSDrm, X86::VMOVQI2PQIrm }, + { X86::VMOVSSrm, X86::VMOVSSrm, X86::VMOVDI2PDIrm }, { X86::VMOVNTPSmr, X86::VMOVNTPDmr, X86::VMOVNTDQmr }, { X86::VANDNPSrm, X86::VANDNPDrm, X86::VPANDNrm }, { X86::VANDNPSrr, X86::VANDNPDrr, X86::VPANDNrr }, @@ -7200,7 +8461,26 @@ static const uint16_t ReplaceableInstrs[][3] = { { X86::VMOVAPSYrr, X86::VMOVAPDYrr, X86::VMOVDQAYrr }, { X86::VMOVUPSYmr, X86::VMOVUPDYmr, X86::VMOVDQUYmr }, { X86::VMOVUPSYrm, X86::VMOVUPDYrm, X86::VMOVDQUYrm }, - { X86::VMOVNTPSYmr, X86::VMOVNTPDYmr, X86::VMOVNTDQYmr } + { X86::VMOVNTPSYmr, X86::VMOVNTPDYmr, X86::VMOVNTDQYmr }, + // AVX512 support + { X86::VMOVLPSZ128mr, X86::VMOVLPDZ128mr, X86::VMOVPQI2QIZmr }, + { X86::VMOVNTPSZ128mr, X86::VMOVNTPDZ128mr, X86::VMOVNTDQZ128mr }, + { X86::VMOVNTPSZ128mr, X86::VMOVNTPDZ128mr, X86::VMOVNTDQZ128mr }, + { X86::VMOVNTPSZmr, X86::VMOVNTPDZmr, X86::VMOVNTDQZmr }, + { X86::VMOVSDZmr, X86::VMOVSDZmr, X86::VMOVPQI2QIZmr }, + { X86::VMOVSSZmr, X86::VMOVSSZmr, X86::VMOVPDI2DIZmr }, + { X86::VMOVSDZrm, X86::VMOVSDZrm, X86::VMOVQI2PQIZrm }, + { X86::VMOVSSZrm, X86::VMOVSSZrm, X86::VMOVDI2PDIZrm }, + { X86::VBROADCASTSSZ128r, X86::VBROADCASTSSZ128r, X86::VPBROADCASTDZ128r }, + { X86::VBROADCASTSSZ128m, X86::VBROADCASTSSZ128m, X86::VPBROADCASTDZ128m }, + { X86::VBROADCASTSSZ256r, X86::VBROADCASTSSZ256r, X86::VPBROADCASTDZ256r 
}, + { X86::VBROADCASTSSZ256m, X86::VBROADCASTSSZ256m, X86::VPBROADCASTDZ256m }, + { X86::VBROADCASTSSZr, X86::VBROADCASTSSZr, X86::VPBROADCASTDZr }, + { X86::VBROADCASTSSZm, X86::VBROADCASTSSZm, X86::VPBROADCASTDZm }, + { X86::VBROADCASTSDZ256r, X86::VBROADCASTSDZ256r, X86::VPBROADCASTQZ256r }, + { X86::VBROADCASTSDZ256m, X86::VBROADCASTSDZ256m, X86::VPBROADCASTQZ256m }, + { X86::VBROADCASTSDZr, X86::VBROADCASTSDZr, X86::VPBROADCASTQZr }, + { X86::VBROADCASTSDZm, X86::VBROADCASTSDZm, X86::VPBROADCASTQZm }, }; static const uint16_t ReplaceableInstrsAVX2[][3] = { @@ -7224,22 +8504,257 @@ static const uint16_t ReplaceableInstrsAVX2[][3] = { { X86::VBROADCASTSSYrr, X86::VBROADCASTSSYrr, X86::VPBROADCASTDYrr}, { X86::VBROADCASTSSYrm, X86::VBROADCASTSSYrm, X86::VPBROADCASTDYrm}, { X86::VBROADCASTSDYrr, X86::VBROADCASTSDYrr, X86::VPBROADCASTQYrr}, - { X86::VBROADCASTSDYrm, X86::VBROADCASTSDYrm, X86::VPBROADCASTQYrm} + { X86::VBROADCASTSDYrm, X86::VBROADCASTSDYrm, X86::VPBROADCASTQYrm}, + { X86::VBROADCASTF128, X86::VBROADCASTF128, X86::VBROADCASTI128 }, +}; + +static const uint16_t ReplaceableInstrsAVX512[][4] = { + // Two integer columns for 64-bit and 32-bit elements. + //PackedSingle PackedDouble PackedInt PackedInt + { X86::VMOVAPSZ128mr, X86::VMOVAPDZ128mr, X86::VMOVDQA64Z128mr, X86::VMOVDQA32Z128mr }, + { X86::VMOVAPSZ128rm, X86::VMOVAPDZ128rm, X86::VMOVDQA64Z128rm, X86::VMOVDQA32Z128rm }, + { X86::VMOVAPSZ128rr, X86::VMOVAPDZ128rr, X86::VMOVDQA64Z128rr, X86::VMOVDQA32Z128rr }, + { X86::VMOVUPSZ128mr, X86::VMOVUPDZ128mr, X86::VMOVDQU64Z128mr, X86::VMOVDQU32Z128mr }, + { X86::VMOVUPSZ128rm, X86::VMOVUPDZ128rm, X86::VMOVDQU64Z128rm, X86::VMOVDQU32Z128rm }, + { X86::VMOVAPSZ256mr, X86::VMOVAPDZ256mr, X86::VMOVDQA64Z256mr, X86::VMOVDQA32Z256mr }, + { X86::VMOVAPSZ256rm, X86::VMOVAPDZ256rm, X86::VMOVDQA64Z256rm, X86::VMOVDQA32Z256rm }, + { X86::VMOVAPSZ256rr, X86::VMOVAPDZ256rr, X86::VMOVDQA64Z256rr, X86::VMOVDQA32Z256rr }, + { X86::VMOVUPSZ256mr, X86::VMOVUPDZ256mr, X86::VMOVDQU64Z256mr, X86::VMOVDQU32Z256mr }, + { X86::VMOVUPSZ256rm, X86::VMOVUPDZ256rm, X86::VMOVDQU64Z256rm, X86::VMOVDQU32Z256rm }, + { X86::VMOVAPSZmr, X86::VMOVAPDZmr, X86::VMOVDQA64Zmr, X86::VMOVDQA32Zmr }, + { X86::VMOVAPSZrm, X86::VMOVAPDZrm, X86::VMOVDQA64Zrm, X86::VMOVDQA32Zrm }, + { X86::VMOVAPSZrr, X86::VMOVAPDZrr, X86::VMOVDQA64Zrr, X86::VMOVDQA32Zrr }, + { X86::VMOVUPSZmr, X86::VMOVUPDZmr, X86::VMOVDQU64Zmr, X86::VMOVDQU32Zmr }, + { X86::VMOVUPSZrm, X86::VMOVUPDZrm, X86::VMOVDQU64Zrm, X86::VMOVDQU32Zrm }, +}; + +static const uint16_t ReplaceableInstrsAVX512DQ[][4] = { + // Two integer columns for 64-bit and 32-bit elements. 
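The AVX-512 replacement tables that follow carry four columns because the integer domain keeps separate 32-bit and 64-bit element forms. A hedged sketch of how a domain-fix pass could index such a row; the domain numbering and opcode values are stand-ins, not the actual ExecutionDomainFix convention:

#include <cassert>
#include <cstdint>

enum Domain { PackedSingle = 0, PackedDouble = 1, PackedInt = 2 };

// One row of a 4-column replacement table: equivalent opcodes for the same
// operation in different execution domains.
struct Row {
  uint16_t Single, Double, Int64, Int32;
};

// Pick the opcode for the requested domain; in the integer domain the AVX-512
// tables provide two columns so the element width (32 vs 64 bit) is preserved.
static uint16_t selectDomainOpcode(const Row &R, Domain D, bool Want64BitElts) {
  switch (D) {
  case PackedSingle: return R.Single;
  case PackedDouble: return R.Double;
  case PackedInt:    return Want64BitElts ? R.Int64 : R.Int32;
  }
  return 0;
}

int main() {
  // Made-up stand-in for { VANDPSZ128rr, VANDPDZ128rr, VPANDQZ128rr, VPANDDZ128rr }.
  const Row AndRow = {0x100, 0x101, 0x102, 0x103};
  assert(selectDomainOpcode(AndRow, PackedDouble, false) == 0x101);
  assert(selectDomainOpcode(AndRow, PackedInt, true) == 0x102);
  return 0;
}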
+ //PackedSingle PackedDouble PackedInt PackedInt + { X86::VANDNPSZ128rm, X86::VANDNPDZ128rm, X86::VPANDNQZ128rm, X86::VPANDNDZ128rm }, + { X86::VANDNPSZ128rr, X86::VANDNPDZ128rr, X86::VPANDNQZ128rr, X86::VPANDNDZ128rr }, + { X86::VANDPSZ128rm, X86::VANDPDZ128rm, X86::VPANDQZ128rm, X86::VPANDDZ128rm }, + { X86::VANDPSZ128rr, X86::VANDPDZ128rr, X86::VPANDQZ128rr, X86::VPANDDZ128rr }, + { X86::VORPSZ128rm, X86::VORPDZ128rm, X86::VPORQZ128rm, X86::VPORDZ128rm }, + { X86::VORPSZ128rr, X86::VORPDZ128rr, X86::VPORQZ128rr, X86::VPORDZ128rr }, + { X86::VXORPSZ128rm, X86::VXORPDZ128rm, X86::VPXORQZ128rm, X86::VPXORDZ128rm }, + { X86::VXORPSZ128rr, X86::VXORPDZ128rr, X86::VPXORQZ128rr, X86::VPXORDZ128rr }, + { X86::VANDNPSZ256rm, X86::VANDNPDZ256rm, X86::VPANDNQZ256rm, X86::VPANDNDZ256rm }, + { X86::VANDNPSZ256rr, X86::VANDNPDZ256rr, X86::VPANDNQZ256rr, X86::VPANDNDZ256rr }, + { X86::VANDPSZ256rm, X86::VANDPDZ256rm, X86::VPANDQZ256rm, X86::VPANDDZ256rm }, + { X86::VANDPSZ256rr, X86::VANDPDZ256rr, X86::VPANDQZ256rr, X86::VPANDDZ256rr }, + { X86::VORPSZ256rm, X86::VORPDZ256rm, X86::VPORQZ256rm, X86::VPORDZ256rm }, + { X86::VORPSZ256rr, X86::VORPDZ256rr, X86::VPORQZ256rr, X86::VPORDZ256rr }, + { X86::VXORPSZ256rm, X86::VXORPDZ256rm, X86::VPXORQZ256rm, X86::VPXORDZ256rm }, + { X86::VXORPSZ256rr, X86::VXORPDZ256rr, X86::VPXORQZ256rr, X86::VPXORDZ256rr }, + { X86::VANDNPSZrm, X86::VANDNPDZrm, X86::VPANDNQZrm, X86::VPANDNDZrm }, + { X86::VANDNPSZrr, X86::VANDNPDZrr, X86::VPANDNQZrr, X86::VPANDNDZrr }, + { X86::VANDPSZrm, X86::VANDPDZrm, X86::VPANDQZrm, X86::VPANDDZrm }, + { X86::VANDPSZrr, X86::VANDPDZrr, X86::VPANDQZrr, X86::VPANDDZrr }, + { X86::VORPSZrm, X86::VORPDZrm, X86::VPORQZrm, X86::VPORDZrm }, + { X86::VORPSZrr, X86::VORPDZrr, X86::VPORQZrr, X86::VPORDZrr }, + { X86::VXORPSZrm, X86::VXORPDZrm, X86::VPXORQZrm, X86::VPXORDZrm }, + { X86::VXORPSZrr, X86::VXORPDZrr, X86::VPXORQZrr, X86::VPXORDZrr }, +}; + +static const uint16_t ReplaceableInstrsAVX512DQMasked[][4] = { + // Two integer columns for 64-bit and 32-bit elements. 
+ //PackedSingle PackedDouble + //PackedInt PackedInt + { X86::VANDNPSZ128rmk, X86::VANDNPDZ128rmk, + X86::VPANDNQZ128rmk, X86::VPANDNDZ128rmk }, + { X86::VANDNPSZ128rmkz, X86::VANDNPDZ128rmkz, + X86::VPANDNQZ128rmkz, X86::VPANDNDZ128rmkz }, + { X86::VANDNPSZ128rrk, X86::VANDNPDZ128rrk, + X86::VPANDNQZ128rrk, X86::VPANDNDZ128rrk }, + { X86::VANDNPSZ128rrkz, X86::VANDNPDZ128rrkz, + X86::VPANDNQZ128rrkz, X86::VPANDNDZ128rrkz }, + { X86::VANDPSZ128rmk, X86::VANDPDZ128rmk, + X86::VPANDQZ128rmk, X86::VPANDDZ128rmk }, + { X86::VANDPSZ128rmkz, X86::VANDPDZ128rmkz, + X86::VPANDQZ128rmkz, X86::VPANDDZ128rmkz }, + { X86::VANDPSZ128rrk, X86::VANDPDZ128rrk, + X86::VPANDQZ128rrk, X86::VPANDDZ128rrk }, + { X86::VANDPSZ128rrkz, X86::VANDPDZ128rrkz, + X86::VPANDQZ128rrkz, X86::VPANDDZ128rrkz }, + { X86::VORPSZ128rmk, X86::VORPDZ128rmk, + X86::VPORQZ128rmk, X86::VPORDZ128rmk }, + { X86::VORPSZ128rmkz, X86::VORPDZ128rmkz, + X86::VPORQZ128rmkz, X86::VPORDZ128rmkz }, + { X86::VORPSZ128rrk, X86::VORPDZ128rrk, + X86::VPORQZ128rrk, X86::VPORDZ128rrk }, + { X86::VORPSZ128rrkz, X86::VORPDZ128rrkz, + X86::VPORQZ128rrkz, X86::VPORDZ128rrkz }, + { X86::VXORPSZ128rmk, X86::VXORPDZ128rmk, + X86::VPXORQZ128rmk, X86::VPXORDZ128rmk }, + { X86::VXORPSZ128rmkz, X86::VXORPDZ128rmkz, + X86::VPXORQZ128rmkz, X86::VPXORDZ128rmkz }, + { X86::VXORPSZ128rrk, X86::VXORPDZ128rrk, + X86::VPXORQZ128rrk, X86::VPXORDZ128rrk }, + { X86::VXORPSZ128rrkz, X86::VXORPDZ128rrkz, + X86::VPXORQZ128rrkz, X86::VPXORDZ128rrkz }, + { X86::VANDNPSZ256rmk, X86::VANDNPDZ256rmk, + X86::VPANDNQZ256rmk, X86::VPANDNDZ256rmk }, + { X86::VANDNPSZ256rmkz, X86::VANDNPDZ256rmkz, + X86::VPANDNQZ256rmkz, X86::VPANDNDZ256rmkz }, + { X86::VANDNPSZ256rrk, X86::VANDNPDZ256rrk, + X86::VPANDNQZ256rrk, X86::VPANDNDZ256rrk }, + { X86::VANDNPSZ256rrkz, X86::VANDNPDZ256rrkz, + X86::VPANDNQZ256rrkz, X86::VPANDNDZ256rrkz }, + { X86::VANDPSZ256rmk, X86::VANDPDZ256rmk, + X86::VPANDQZ256rmk, X86::VPANDDZ256rmk }, + { X86::VANDPSZ256rmkz, X86::VANDPDZ256rmkz, + X86::VPANDQZ256rmkz, X86::VPANDDZ256rmkz }, + { X86::VANDPSZ256rrk, X86::VANDPDZ256rrk, + X86::VPANDQZ256rrk, X86::VPANDDZ256rrk }, + { X86::VANDPSZ256rrkz, X86::VANDPDZ256rrkz, + X86::VPANDQZ256rrkz, X86::VPANDDZ256rrkz }, + { X86::VORPSZ256rmk, X86::VORPDZ256rmk, + X86::VPORQZ256rmk, X86::VPORDZ256rmk }, + { X86::VORPSZ256rmkz, X86::VORPDZ256rmkz, + X86::VPORQZ256rmkz, X86::VPORDZ256rmkz }, + { X86::VORPSZ256rrk, X86::VORPDZ256rrk, + X86::VPORQZ256rrk, X86::VPORDZ256rrk }, + { X86::VORPSZ256rrkz, X86::VORPDZ256rrkz, + X86::VPORQZ256rrkz, X86::VPORDZ256rrkz }, + { X86::VXORPSZ256rmk, X86::VXORPDZ256rmk, + X86::VPXORQZ256rmk, X86::VPXORDZ256rmk }, + { X86::VXORPSZ256rmkz, X86::VXORPDZ256rmkz, + X86::VPXORQZ256rmkz, X86::VPXORDZ256rmkz }, + { X86::VXORPSZ256rrk, X86::VXORPDZ256rrk, + X86::VPXORQZ256rrk, X86::VPXORDZ256rrk }, + { X86::VXORPSZ256rrkz, X86::VXORPDZ256rrkz, + X86::VPXORQZ256rrkz, X86::VPXORDZ256rrkz }, + { X86::VANDNPSZrmk, X86::VANDNPDZrmk, + X86::VPANDNQZrmk, X86::VPANDNDZrmk }, + { X86::VANDNPSZrmkz, X86::VANDNPDZrmkz, + X86::VPANDNQZrmkz, X86::VPANDNDZrmkz }, + { X86::VANDNPSZrrk, X86::VANDNPDZrrk, + X86::VPANDNQZrrk, X86::VPANDNDZrrk }, + { X86::VANDNPSZrrkz, X86::VANDNPDZrrkz, + X86::VPANDNQZrrkz, X86::VPANDNDZrrkz }, + { X86::VANDPSZrmk, X86::VANDPDZrmk, + X86::VPANDQZrmk, X86::VPANDDZrmk }, + { X86::VANDPSZrmkz, X86::VANDPDZrmkz, + X86::VPANDQZrmkz, X86::VPANDDZrmkz }, + { X86::VANDPSZrrk, X86::VANDPDZrrk, + X86::VPANDQZrrk, X86::VPANDDZrrk }, + { X86::VANDPSZrrkz, X86::VANDPDZrrkz, + X86::VPANDQZrrkz, 
X86::VPANDDZrrkz }, + { X86::VORPSZrmk, X86::VORPDZrmk, + X86::VPORQZrmk, X86::VPORDZrmk }, + { X86::VORPSZrmkz, X86::VORPDZrmkz, + X86::VPORQZrmkz, X86::VPORDZrmkz }, + { X86::VORPSZrrk, X86::VORPDZrrk, + X86::VPORQZrrk, X86::VPORDZrrk }, + { X86::VORPSZrrkz, X86::VORPDZrrkz, + X86::VPORQZrrkz, X86::VPORDZrrkz }, + { X86::VXORPSZrmk, X86::VXORPDZrmk, + X86::VPXORQZrmk, X86::VPXORDZrmk }, + { X86::VXORPSZrmkz, X86::VXORPDZrmkz, + X86::VPXORQZrmkz, X86::VPXORDZrmkz }, + { X86::VXORPSZrrk, X86::VXORPDZrrk, + X86::VPXORQZrrk, X86::VPXORDZrrk }, + { X86::VXORPSZrrkz, X86::VXORPDZrrkz, + X86::VPXORQZrrkz, X86::VPXORDZrrkz }, + // Broadcast loads can be handled the same as masked operations to avoid + // changing element size. + { X86::VANDNPSZ128rmb, X86::VANDNPDZ128rmb, + X86::VPANDNQZ128rmb, X86::VPANDNDZ128rmb }, + { X86::VANDPSZ128rmb, X86::VANDPDZ128rmb, + X86::VPANDQZ128rmb, X86::VPANDDZ128rmb }, + { X86::VORPSZ128rmb, X86::VORPDZ128rmb, + X86::VPORQZ128rmb, X86::VPORDZ128rmb }, + { X86::VXORPSZ128rmb, X86::VXORPDZ128rmb, + X86::VPXORQZ128rmb, X86::VPXORDZ128rmb }, + { X86::VANDNPSZ256rmb, X86::VANDNPDZ256rmb, + X86::VPANDNQZ256rmb, X86::VPANDNDZ256rmb }, + { X86::VANDPSZ256rmb, X86::VANDPDZ256rmb, + X86::VPANDQZ256rmb, X86::VPANDDZ256rmb }, + { X86::VORPSZ256rmb, X86::VORPDZ256rmb, + X86::VPORQZ256rmb, X86::VPORDZ256rmb }, + { X86::VXORPSZ256rmb, X86::VXORPDZ256rmb, + X86::VPXORQZ256rmb, X86::VPXORDZ256rmb }, + { X86::VANDNPSZrmb, X86::VANDNPDZrmb, + X86::VPANDNQZrmb, X86::VPANDNDZrmb }, + { X86::VANDPSZrmb, X86::VANDPDZrmb, + X86::VPANDQZrmb, X86::VPANDDZrmb }, + { X86::VANDPSZrmb, X86::VANDPDZrmb, + X86::VPANDQZrmb, X86::VPANDDZrmb }, + { X86::VORPSZrmb, X86::VORPDZrmb, + X86::VPORQZrmb, X86::VPORDZrmb }, + { X86::VXORPSZrmb, X86::VXORPDZrmb, + X86::VPXORQZrmb, X86::VPXORDZrmb }, + { X86::VANDNPSZ128rmbk, X86::VANDNPDZ128rmbk, + X86::VPANDNQZ128rmbk, X86::VPANDNDZ128rmbk }, + { X86::VANDPSZ128rmbk, X86::VANDPDZ128rmbk, + X86::VPANDQZ128rmbk, X86::VPANDDZ128rmbk }, + { X86::VORPSZ128rmbk, X86::VORPDZ128rmbk, + X86::VPORQZ128rmbk, X86::VPORDZ128rmbk }, + { X86::VXORPSZ128rmbk, X86::VXORPDZ128rmbk, + X86::VPXORQZ128rmbk, X86::VPXORDZ128rmbk }, + { X86::VANDNPSZ256rmbk, X86::VANDNPDZ256rmbk, + X86::VPANDNQZ256rmbk, X86::VPANDNDZ256rmbk }, + { X86::VANDPSZ256rmbk, X86::VANDPDZ256rmbk, + X86::VPANDQZ256rmbk, X86::VPANDDZ256rmbk }, + { X86::VORPSZ256rmbk, X86::VORPDZ256rmbk, + X86::VPORQZ256rmbk, X86::VPORDZ256rmbk }, + { X86::VXORPSZ256rmbk, X86::VXORPDZ256rmbk, + X86::VPXORQZ256rmbk, X86::VPXORDZ256rmbk }, + { X86::VANDNPSZrmbk, X86::VANDNPDZrmbk, + X86::VPANDNQZrmbk, X86::VPANDNDZrmbk }, + { X86::VANDPSZrmbk, X86::VANDPDZrmbk, + X86::VPANDQZrmbk, X86::VPANDDZrmbk }, + { X86::VANDPSZrmbk, X86::VANDPDZrmbk, + X86::VPANDQZrmbk, X86::VPANDDZrmbk }, + { X86::VORPSZrmbk, X86::VORPDZrmbk, + X86::VPORQZrmbk, X86::VPORDZrmbk }, + { X86::VXORPSZrmbk, X86::VXORPDZrmbk, + X86::VPXORQZrmbk, X86::VPXORDZrmbk }, + { X86::VANDNPSZ128rmbkz,X86::VANDNPDZ128rmbkz, + X86::VPANDNQZ128rmbkz,X86::VPANDNDZ128rmbkz}, + { X86::VANDPSZ128rmbkz, X86::VANDPDZ128rmbkz, + X86::VPANDQZ128rmbkz, X86::VPANDDZ128rmbkz }, + { X86::VORPSZ128rmbkz, X86::VORPDZ128rmbkz, + X86::VPORQZ128rmbkz, X86::VPORDZ128rmbkz }, + { X86::VXORPSZ128rmbkz, X86::VXORPDZ128rmbkz, + X86::VPXORQZ128rmbkz, X86::VPXORDZ128rmbkz }, + { X86::VANDNPSZ256rmbkz,X86::VANDNPDZ256rmbkz, + X86::VPANDNQZ256rmbkz,X86::VPANDNDZ256rmbkz}, + { X86::VANDPSZ256rmbkz, X86::VANDPDZ256rmbkz, + X86::VPANDQZ256rmbkz, X86::VPANDDZ256rmbkz }, + { X86::VORPSZ256rmbkz, 
X86::VORPDZ256rmbkz, + X86::VPORQZ256rmbkz, X86::VPORDZ256rmbkz }, + { X86::VXORPSZ256rmbkz, X86::VXORPDZ256rmbkz, + X86::VPXORQZ256rmbkz, X86::VPXORDZ256rmbkz }, + { X86::VANDNPSZrmbkz, X86::VANDNPDZrmbkz, + X86::VPANDNQZrmbkz, X86::VPANDNDZrmbkz }, + { X86::VANDPSZrmbkz, X86::VANDPDZrmbkz, + X86::VPANDQZrmbkz, X86::VPANDDZrmbkz }, + { X86::VANDPSZrmbkz, X86::VANDPDZrmbkz, + X86::VPANDQZrmbkz, X86::VPANDDZrmbkz }, + { X86::VORPSZrmbkz, X86::VORPDZrmbkz, + X86::VPORQZrmbkz, X86::VPORDZrmbkz }, + { X86::VXORPSZrmbkz, X86::VXORPDZrmbkz, + X86::VPXORQZrmbkz, X86::VPXORDZrmbkz }, }; // FIXME: Some shuffle and unpack instructions have equivalents in different // domains, but they require a bit more work than just switching opcodes. -static const uint16_t *lookup(unsigned opcode, unsigned domain) { - for (const uint16_t (&Row)[3] : ReplaceableInstrs) +static const uint16_t *lookup(unsigned opcode, unsigned domain, + ArrayRef<uint16_t[3]> Table) { + for (const uint16_t (&Row)[3] : Table) if (Row[domain-1] == opcode) return Row; return nullptr; } -static const uint16_t *lookupAVX2(unsigned opcode, unsigned domain) { - for (const uint16_t (&Row)[3] : ReplaceableInstrsAVX2) - if (Row[domain-1] == opcode) +static const uint16_t *lookupAVX512(unsigned opcode, unsigned domain, + ArrayRef<uint16_t[4]> Table) { + // If this is the integer domain make sure to check both integer columns. + for (const uint16_t (&Row)[4] : Table) + if (Row[domain-1] == opcode || (domain == 3 && Row[3] == opcode)) return Row; return nullptr; } @@ -7247,12 +8762,25 @@ static const uint16_t *lookupAVX2(unsigned opcode, unsigned domain) { std::pair<uint16_t, uint16_t> X86InstrInfo::getExecutionDomain(const MachineInstr &MI) const { uint16_t domain = (MI.getDesc().TSFlags >> X86II::SSEDomainShift) & 3; - bool hasAVX2 = Subtarget.hasAVX2(); + unsigned opcode = MI.getOpcode(); uint16_t validDomains = 0; - if (domain && lookup(MI.getOpcode(), domain)) - validDomains = 0xe; - else if (domain && lookupAVX2(MI.getOpcode(), domain)) - validDomains = hasAVX2 ? 0xe : 0x6; + if (domain) { + if (lookup(MI.getOpcode(), domain, ReplaceableInstrs)) { + validDomains = 0xe; + } else if (lookup(opcode, domain, ReplaceableInstrsAVX2)) { + validDomains = Subtarget.hasAVX2() ? 0xe : 0x6; + } else if (lookupAVX512(opcode, domain, ReplaceableInstrsAVX512)) { + validDomains = 0xe; + } else if (lookupAVX512(opcode, domain, ReplaceableInstrsAVX512DQ)) { + validDomains = Subtarget.hasDQI() ? 0xe : 0x8; + } else if (const uint16_t *table = lookupAVX512(opcode, domain, + ReplaceableInstrsAVX512DQMasked)) { + if (domain == 1 || (domain == 3 && table[3] == opcode)) + validDomains = Subtarget.hasDQI() ? 0xa : 0x8; + else + validDomains = Subtarget.hasDQI() ? 
0xc : 0x8; + } + } return std::make_pair(domain, validDomains); } @@ -7260,11 +8788,32 @@ void X86InstrInfo::setExecutionDomain(MachineInstr &MI, unsigned Domain) const { assert(Domain>0 && Domain<4 && "Invalid execution domain"); uint16_t dom = (MI.getDesc().TSFlags >> X86II::SSEDomainShift) & 3; assert(dom && "Not an SSE instruction"); - const uint16_t *table = lookup(MI.getOpcode(), dom); + const uint16_t *table = lookup(MI.getOpcode(), dom, ReplaceableInstrs); if (!table) { // try the other table assert((Subtarget.hasAVX2() || Domain < 3) && "256-bit vector operations only available in AVX2"); - table = lookupAVX2(MI.getOpcode(), dom); + table = lookup(MI.getOpcode(), dom, ReplaceableInstrsAVX2); + } + if (!table) { // try the AVX512 table + assert(Subtarget.hasAVX512() && "Requires AVX-512"); + table = lookupAVX512(MI.getOpcode(), dom, ReplaceableInstrsAVX512); + // Don't change integer Q instructions to D instructions. + if (table && Domain == 3 && table[3] == MI.getOpcode()) + Domain = 4; + } + if (!table) { // try the AVX512DQ table + assert((Subtarget.hasDQI() || Domain >= 3) && "Requires AVX-512DQ"); + table = lookupAVX512(MI.getOpcode(), dom, ReplaceableInstrsAVX512DQ); + // Don't change integer Q instructions to D instructions and + // use D intructions if we started with a PS instruction. + if (table && Domain == 3 && (dom == 1 || table[3] == MI.getOpcode())) + Domain = 4; + } + if (!table) { // try the AVX512DQMasked table + assert((Subtarget.hasDQI() || Domain >= 3) && "Requires AVX-512DQ"); + table = lookupAVX512(MI.getOpcode(), dom, ReplaceableInstrsAVX512DQMasked); + if (table && Domain == 3 && (dom == 1 || table[3] == MI.getOpcode())) + Domain = 4; } assert(table && "Cannot change domain"); MI.setDesc(get(table[Domain - 1])); @@ -7275,32 +8824,6 @@ void X86InstrInfo::getNoopForMachoTarget(MCInst &NopInst) const { NopInst.setOpcode(X86::NOOP); } -// This code must remain in sync with getJumpInstrTableEntryBound in this class! -// In particular, getJumpInstrTableEntryBound must always return an upper bound -// on the encoding lengths of the instructions generated by -// getUnconditionalBranch and getTrap. -void X86InstrInfo::getUnconditionalBranch( - MCInst &Branch, const MCSymbolRefExpr *BranchTarget) const { - Branch.setOpcode(X86::JMP_1); - Branch.addOperand(MCOperand::createExpr(BranchTarget)); -} - -// This code must remain in sync with getJumpInstrTableEntryBound in this class! -// In particular, getJumpInstrTableEntryBound must always return an upper bound -// on the encoding lengths of the instructions generated by -// getUnconditionalBranch and getTrap. -void X86InstrInfo::getTrap(MCInst &MI) const { - MI.setOpcode(X86::TRAP); -} - -// See getTrap and getUnconditionalBranch for conditions on the value returned -// by this function. -unsigned X86InstrInfo::getJumpInstrTableEntryBound() const { - // 5 bytes suffice: JMP_4 Symbol@PLT is uses 1 byte (E9) for the JMP_4 and 4 - // bytes for the symbol offset. And TRAP is ud2, which is two bytes (0F 0B). 
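The four-column tables above pair each packed-single opcode with its packed-double and packed-integer (64-bit and 32-bit element) equivalents, and getExecutionDomain reports the legal domains as a bitmask: bit d set means domain d is available (1 = PS, 2 = PD, 3 = PI), so 0xe allows all three, 0x6 only the two FP domains, 0x8 pins the instruction to the integer domain, and 0xa/0xc each add one FP domain on top of that. A minimal sketch of that lookup-plus-bitmask scheme, using made-up opcode values rather than the real X86:: enums and a deliberately tiny table:

    #include <array>
    #include <cstdint>
    #include <cstdio>

    // Hypothetical stand-ins for real opcodes; one row per instruction with
    // PS, PD, 64-bit-int and 32-bit-int columns, as in the tables above.
    enum : uint16_t { FAKE_ORPS = 1, FAKE_ORPD = 2, FAKE_PORQ = 3, FAKE_PORD = 4 };
    static const std::array<std::array<uint16_t, 4>, 1> Table = {{
        {FAKE_ORPS, FAKE_ORPD, FAKE_PORQ, FAKE_PORD},
    }};

    // Mirrors the shape of lookupAVX512: the integer domain (3) has to check
    // both integer columns, because the Q and D forms share one row.
    static const uint16_t *lookupRow(uint16_t Opcode, unsigned Domain) {
      for (const auto &Row : Table)
        if (Row[Domain - 1] == Opcode || (Domain == 3 && Row[3] == Opcode))
          return Row.data();
      return nullptr;
    }

    int main() {
      const unsigned Domain = 3;         // pretend the instruction is FAKE_PORD
      const uint16_t ValidDomains = 0xe; // bits 1..3: PS, PD and PI all legal
      if (const uint16_t *Row = lookupRow(FAKE_PORD, Domain))
        for (unsigned D = 1; D <= 3; ++D)
          if (ValidDomains & (1u << D))
            std::printf("domain %u -> opcode %u\n", D, Row[D - 1]);
    }

The Domain = 4 handling in setExecutionDomain extends the same idea by one column, so an integer instruction keeps its original element width instead of being switched between the Q and D forms.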
- return 5; -} - bool X86InstrInfo::isHighLatencyDef(int opc) const { switch (opc) { default: return false; @@ -7934,6 +9457,28 @@ X86InstrInfo::getSerializableDirectMachineOperandTargetFlags() const { return makeArrayRef(TargetFlags); } +bool X86InstrInfo::isTailCall(const MachineInstr &Inst) const { + switch (Inst.getOpcode()) { + case X86::TCRETURNdi: + case X86::TCRETURNmi: + case X86::TCRETURNri: + case X86::TCRETURNdi64: + case X86::TCRETURNmi64: + case X86::TCRETURNri64: + case X86::TAILJMPd: + case X86::TAILJMPm: + case X86::TAILJMPr: + case X86::TAILJMPd64: + case X86::TAILJMPm64: + case X86::TAILJMPr64: + case X86::TAILJMPm64_REX: + case X86::TAILJMPr64_REX: + return true; + default: + return false; + } +} + namespace { /// Create Global Base Reg pass. This initializes the PIC /// global base register for x86-32. @@ -7991,7 +9536,7 @@ namespace { return true; } - const char *getPassName() const override { + StringRef getPassName() const override { return "X86 PIC Global Base Reg Initialization"; } @@ -8105,7 +9650,7 @@ namespace { return Copy; } - const char *getPassName() const override { + StringRef getPassName() const override { return "Local Dynamic TLS Access Clean-up"; } diff --git a/contrib/llvm/lib/Target/X86/X86InstrInfo.h b/contrib/llvm/lib/Target/X86/X86InstrInfo.h index a8a9f62..acfdef4 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrInfo.h +++ b/contrib/llvm/lib/Target/X86/X86InstrInfo.h @@ -15,6 +15,7 @@ #define LLVM_LIB_TARGET_X86_X86INSTRINFO_H #include "MCTargetDesc/X86BaseInfo.h" +#include "X86InstrFMA3Info.h" #include "X86RegisterInfo.h" #include "llvm/ADT/DenseMap.h" #include "llvm/Target/TargetInstrInfo.h" @@ -265,7 +266,7 @@ public: unsigned &SrcOpIdx2) const override; /// Returns true if the routine could find two commutable operands - /// in the given FMA instruction. Otherwise, returns false. + /// in the given FMA instruction \p MI. Otherwise, returns false. /// /// \p SrcOpIdx1 and \p SrcOpIdx2 are INPUT and OUTPUT arguments. /// The output indices of the commuted operands are returned in these @@ -274,10 +275,12 @@ public: /// value 'CommuteAnyOperandIndex' which means that the corresponding /// operand index is not set and this method is free to pick any of /// available commutable operands. + /// The parameter \p FMA3Group keeps the reference to the group of relative + /// FMA3 opcodes including register/memory forms of 132/213/231 opcodes. /// /// For example, calling this method this way: /// unsigned Idx1 = 1, Idx2 = CommuteAnyOperandIndex; - /// findFMA3CommutedOpIndices(MI, Idx1, Idx2); + /// findFMA3CommutedOpIndices(MI, Idx1, Idx2, FMA3Group); /// can be interpreted as a query asking if the operand #1 can be swapped /// with any other available operand (e.g. operand #2, operand #3, etc.). /// @@ -286,21 +289,30 @@ public: /// FMA213 #1, #2, #3 /// results into instruction with adjusted opcode: /// FMA231 #3, #2, #1 - bool findFMA3CommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1, - unsigned &SrcOpIdx2) const; + bool findFMA3CommutedOpIndices(const MachineInstr &MI, + unsigned &SrcOpIdx1, + unsigned &SrcOpIdx2, + const X86InstrFMA3Group &FMA3Group) const; /// Returns an adjusted FMA opcode that must be used in FMA instruction that - /// performs the same computations as the given MI but which has the operands - /// \p SrcOpIdx1 and \p SrcOpIdx2 commuted. + /// performs the same computations as the given \p MI but which has the + /// operands \p SrcOpIdx1 and \p SrcOpIdx2 commuted. 
/// It may return 0 if it is unsafe to commute the operands. + /// Note that a machine instruction (instead of its opcode) is passed as the + /// first parameter to make it possible to analyze the instruction's uses and + /// commute the first operand of FMA even when it seems unsafe when you look + /// at the opcode. For example, it is Ok to commute the first operand of + /// VFMADD*SD_Int, if ONLY the lowest 64-bit element of the result is used. /// /// The returned FMA opcode may differ from the opcode in the given \p MI. /// For example, commuting the operands #1 and #3 in the following FMA /// FMA213 #1, #2, #3 /// results into instruction with adjusted opcode: /// FMA231 #3, #2, #1 - unsigned getFMA3OpcodeToCommuteOperands(MachineInstr &MI, unsigned SrcOpIdx1, - unsigned SrcOpIdx2) const; + unsigned getFMA3OpcodeToCommuteOperands(const MachineInstr &MI, + unsigned SrcOpIdx1, + unsigned SrcOpIdx2, + const X86InstrFMA3Group &FMA3Group) const; // Branch analysis. bool isUnpredicatedTerminator(const MachineInstr &MI) const override; @@ -316,10 +328,12 @@ public: TargetInstrInfo::MachineBranchPredicate &MBP, bool AllowModify = false) const override; - unsigned RemoveBranch(MachineBasicBlock &MBB) const override; - unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, + unsigned removeBranch(MachineBasicBlock &MBB, + int *BytesRemoved = nullptr) const override; + unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond, - const DebugLoc &DL) const override; + const DebugLoc &DL, + int *BytesAdded = nullptr) const override; bool canInsertSelect(const MachineBasicBlock&, ArrayRef<MachineOperand> Cond, unsigned, unsigned, int&, int&, int&) const override; void insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, @@ -357,6 +371,10 @@ public: bool expandPostRAPseudo(MachineInstr &MI) const override; + /// Check whether the target can fold a load that feeds a subreg operand + /// (or a subreg operand that feeds a store). + bool isSubregFoldable() const override { return true; } + /// foldMemoryOperand - If this target supports it, fold a load or store of /// the specified stack slot into the specified machine instruction for the /// specified operand(s). If this is possible, the target should perform the @@ -418,13 +436,13 @@ public: int64_t Offset1, int64_t Offset2, unsigned NumLoads) const override; - bool shouldScheduleAdjacent(MachineInstr &First, - MachineInstr &Second) const override; + bool shouldScheduleAdjacent(const MachineInstr &First, + const MachineInstr &Second) const override; void getNoopForMachoTarget(MCInst &NopInst) const override; bool - ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override; + reverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override; /// isSafeToMoveRegClassDefs - Return true if it's safe to move a machine /// instruction that defines the specified register class. 
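As the doc comments above describe, the legality of commuting FMA3 operands comes down to the multiply being commutative: the 132/213/231 suffix encodes which operand slots feed the multiply and which feeds the add, so swapping two operands can be compensated by renaming the opcode. A scalar model of that equivalence (my own simplification for illustration, not the LLVM helpers themselves):

    // Scalar stand-ins for the FMA3 forms; operand #1 is also the destination,
    // matching the numbering used in the doc comments above.
    double fma132(double Op1, double Op2, double Op3) { return Op1 * Op3 + Op2; }
    double fma213(double Op1, double Op2, double Op3) { return Op2 * Op1 + Op3; }
    double fma231(double Op1, double Op2, double Op3) { return Op2 * Op3 + Op1; }

    // Commuting operands #1 and #3 of a 213 form while switching to the 231
    // form preserves the result:
    //   fma213(a, b, c) == b*a + c == fma231(c, b, a)

getFMA3OpcodeToCommuteOperands is essentially that renaming applied across the register/memory 132/213/231 variants collected in the FMA3Group, which is why the commuted instruction can come back with a different opcode than the one passed in.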
@@ -467,14 +485,6 @@ public: unsigned Size, unsigned Alignment, bool AllowCommute) const; - void - getUnconditionalBranch(MCInst &Branch, - const MCSymbolRefExpr *BranchTarget) const override; - - void getTrap(MCInst &MI) const override; - - unsigned getJumpInstrTableEntryBound() const override; - bool isHighLatencyDef(int opc) const override; bool hasHighOperandLatency(const TargetSchedModel &SchedModel, @@ -529,6 +539,8 @@ public: ArrayRef<std::pair<unsigned, const char *>> getSerializableDirectMachineOperandTargetFlags() const override; + bool isTailCall(const MachineInstr &Inst) const override; + protected: /// Commutes the operands in the given instruction by changing the operands /// order and/or changing the instruction's opcode and/or the immediate value @@ -564,8 +576,24 @@ private: bool isFrameOperand(const MachineInstr &MI, unsigned int Op, int &FrameIndex) const; - /// Expand the MOVImmSExti8 pseudo-instructions. - bool ExpandMOVImmSExti8(MachineInstrBuilder &MIB) const; + /// Returns true iff the routine could find two commutable operands in the + /// given machine instruction with 3 vector inputs. + /// The 'SrcOpIdx1' and 'SrcOpIdx2' are INPUT and OUTPUT arguments. Their + /// input values can be re-defined in this method only if the input values + /// are not pre-defined, which is designated by the special value + /// 'CommuteAnyOperandIndex' assigned to it. + /// If both of indices are pre-defined and refer to some operands, then the + /// method simply returns true if the corresponding operands are commutable + /// and returns false otherwise. + /// + /// For example, calling this method this way: + /// unsigned Op1 = 1, Op2 = CommuteAnyOperandIndex; + /// findThreeSrcCommutedOpIndices(MI, Op1, Op2); + /// can be interpreted as a query asking to find an operand that would be + /// commutable with the operand#1. + bool findThreeSrcCommutedOpIndices(const MachineInstr &MI, + unsigned &SrcOpIdx1, + unsigned &SrcOpIdx2) const; }; } // End llvm namespace diff --git a/contrib/llvm/lib/Target/X86/X86InstrInfo.td b/contrib/llvm/lib/Target/X86/X86InstrInfo.td index b19a8f3..3803671 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrInfo.td +++ b/contrib/llvm/lib/Target/X86/X86InstrInfo.td @@ -765,6 +765,12 @@ def tls64baseaddr : ComplexPattern<i64, 5, "selectTLSADDRAddr", def vectoraddr : ComplexPattern<iPTR, 5, "selectVectorAddr", [],[SDNPWantParent]>; +// A relocatable immediate is either an immediate operand or an operand that can +// be relocated by the linker to an immediate, such as a regular symbol in +// non-PIC code. +def relocImm : ComplexPattern<iAny, 1, "selectRelocImm", [imm, X86Wrapper], [], + 0>; + //===----------------------------------------------------------------------===// // X86 Instruction Predicate Definitions. 
def TruePredicate : Predicate<"true">; @@ -832,6 +838,7 @@ def HasTBM : Predicate<"Subtarget->hasTBM()">; def HasMOVBE : Predicate<"Subtarget->hasMOVBE()">; def HasRDRAND : Predicate<"Subtarget->hasRDRAND()">; def HasF16C : Predicate<"Subtarget->hasF16C()">; +def NoF16C : Predicate<"!Subtarget->hasF16C()">; def HasFSGSBase : Predicate<"Subtarget->hasFSGSBase()">; def HasLZCNT : Predicate<"Subtarget->hasLZCNT()">; def HasBMI : Predicate<"Subtarget->hasBMI()">; @@ -876,8 +883,6 @@ def IsNaCl : Predicate<"Subtarget->isTargetNaCl()">; def NotNaCl : Predicate<"!Subtarget->isTargetNaCl()">; def SmallCode : Predicate<"TM.getCodeModel() == CodeModel::Small">; def KernelCode : Predicate<"TM.getCodeModel() == CodeModel::Kernel">; -def FarData : Predicate<"TM.getCodeModel() != CodeModel::Small &&" - "TM.getCodeModel() != CodeModel::Kernel">; def NearData : Predicate<"TM.getCodeModel() == CodeModel::Small ||" "TM.getCodeModel() == CodeModel::Kernel">; def IsNotPIC : Predicate<"!TM.isPositionIndependent()">; @@ -889,6 +894,7 @@ def CallImmAddr : Predicate<"Subtarget->isLegalToCallImmediateAddr()">; def FavorMemIndirectCall : Predicate<"!Subtarget->callRegIndirect()">; def NotSlowIncDec : Predicate<"!Subtarget->slowIncDec()">; def HasFastMem32 : Predicate<"!Subtarget->isUnalignedMem32Slow()">; +def HasFastLZCNT : Predicate<"Subtarget->hasFastLZCNT()">; def HasMFence : Predicate<"Subtarget->hasMFence()">; //===----------------------------------------------------------------------===// @@ -923,6 +929,7 @@ def X86_COND_S : PatLeaf<(i8 15)>; def i16immSExt8 : ImmLeaf<i16, [{ return isInt<8>(Imm); }]>; def i32immSExt8 : ImmLeaf<i32, [{ return isInt<8>(Imm); }]>; def i64immSExt8 : ImmLeaf<i64, [{ return isInt<8>(Imm); }]>; +def i64immSExt32 : ImmLeaf<i64, [{ return isInt<32>(Imm); }]>; // If we have multiple users of an immediate, it's much smaller to reuse // the register, rather than encode the immediate in every instruction. @@ -941,13 +948,16 @@ def i64immSExt8 : ImmLeaf<i64, [{ return isInt<8>(Imm); }]>; // Eventually, it would be nice to allow ConstantHoisting to merge constants // globally for potentially added savings. // -def imm8_su : PatLeaf<(i8 imm), [{ +def imm8_su : PatLeaf<(i8 relocImm), [{ + return !shouldAvoidImmediateInstFormsForSize(N); +}]>; +def imm16_su : PatLeaf<(i16 relocImm), [{ return !shouldAvoidImmediateInstFormsForSize(N); }]>; -def imm16_su : PatLeaf<(i16 imm), [{ +def imm32_su : PatLeaf<(i32 relocImm), [{ return !shouldAvoidImmediateInstFormsForSize(N); }]>; -def imm32_su : PatLeaf<(i32 imm), [{ +def i64immSExt32_su : PatLeaf<(i64immSExt32), [{ return !shouldAvoidImmediateInstFormsForSize(N); }]>; @@ -957,10 +967,9 @@ def i16immSExt8_su : PatLeaf<(i16immSExt8), [{ def i32immSExt8_su : PatLeaf<(i32immSExt8), [{ return !shouldAvoidImmediateInstFormsForSize(N); }]>; - - -def i64immSExt32 : ImmLeaf<i64, [{ return isInt<32>(Imm); }]>; - +def i64immSExt8_su : PatLeaf<(i64immSExt8), [{ + return !shouldAvoidImmediateInstFormsForSize(N); +}]>; // i64immZExt32 predicate - True if the 64-bit immediate fits in a 32-bit // unsigned field. 
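relocImm, as the comment above puts it, covers both a plain immediate and an operand the linker can resolve to one, and the *_su leaves then gate those immediates on shouldAvoidImmediateInstFormsForSize so a constant with many users can be kept in a register when optimizing for size. A small illustration of the symbol case, assuming non-PIC code and the small code model (the names here are invented for the example):

    // The address of a regular global in non-PIC, small-code-model x86 code can
    // be encoded as an absolute immediate that the linker patches, e.g.
    //     movl $counter, %eax
    // which is the "relocated to an immediate" case relocImm is meant to match.
    extern int counter;

    int *address_of_counter() {
      return &counter;  // typically lowers to a mov of a relocatable immediate
    }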
@@ -1375,7 +1384,7 @@ def MOV16ri : Ii16<0xB8, AddRegFrm, (outs GR16:$dst), (ins i16imm:$src), [(set GR16:$dst, imm:$src)], IIC_MOV>, OpSize16; def MOV32ri : Ii32<0xB8, AddRegFrm, (outs GR32:$dst), (ins i32imm:$src), "mov{l}\t{$src, $dst|$dst, $src}", - [(set GR32:$dst, imm:$src)], IIC_MOV>, OpSize32; + [(set GR32:$dst, relocImm:$src)], IIC_MOV>, OpSize32; def MOV64ri32 : RIi32S<0xC7, MRM0r, (outs GR64:$dst), (ins i64i32imm:$src), "mov{q}\t{$src, $dst|$dst, $src}", [(set GR64:$dst, i64immSExt32:$src)], IIC_MOV>; @@ -1383,7 +1392,7 @@ def MOV64ri32 : RIi32S<0xC7, MRM0r, (outs GR64:$dst), (ins i64i32imm:$src), let isReMaterializable = 1 in { def MOV64ri : RIi64<0xB8, AddRegFrm, (outs GR64:$dst), (ins i64imm:$src), "movabs{q}\t{$src, $dst|$dst, $src}", - [(set GR64:$dst, imm:$src)], IIC_MOV>; + [(set GR64:$dst, relocImm:$src)], IIC_MOV>; } // Longer forms that use a ModR/M byte. Needed for disassembler @@ -1409,7 +1418,7 @@ def MOV32mi : Ii32<0xC7, MRM0m, (outs), (ins i32mem:$dst, i32imm:$src), [(store (i32 imm32_su:$src), addr:$dst)], IIC_MOV_MEM>, OpSize32; def MOV64mi32 : RIi32S<0xC7, MRM0m, (outs), (ins i64mem:$dst, i64i32imm:$src), "mov{q}\t{$src, $dst|$dst, $src}", - [(store i64immSExt32:$src, addr:$dst)], IIC_MOV_MEM>; + [(store i64immSExt32_su:$src, addr:$dst)], IIC_MOV_MEM>; } // SchedRW let hasSideEffects = 0 in { @@ -2251,14 +2260,14 @@ let Predicates = [HasBMI] in { multiclass bmi_bextr_bzhi<bits<8> opc, string mnemonic, RegisterClass RC, X86MemOperand x86memop, Intrinsic Int, PatFrag ld_frag> { - def rr : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), + def rr : I<opc, MRMSrcReg4VOp3, (outs RC:$dst), (ins RC:$src1, RC:$src2), !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set RC:$dst, (Int RC:$src1, RC:$src2)), (implicit EFLAGS)]>, - T8PS, VEX_4VOp3; - def rm : I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src1, RC:$src2), + T8PS, VEX; + def rm : I<opc, MRMSrcMem4VOp3, (outs RC:$dst), (ins x86memop:$src1, RC:$src2), !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set RC:$dst, (Int (ld_frag addr:$src1), RC:$src2)), - (implicit EFLAGS)]>, T8PS, VEX_4VOp3; + (implicit EFLAGS)]>, T8PS, VEX; } let Predicates = [HasBMI], Defs = [EFLAGS] in { @@ -2626,6 +2635,12 @@ def : MnemonicAlias<"ret", "retw", "att">, Requires<[In16BitMode]>; def : MnemonicAlias<"ret", "retl", "att">, Requires<[In32BitMode]>; def : MnemonicAlias<"ret", "retq", "att">, Requires<[In64BitMode]>; +// Apply 'ret' behavior to 'retn' +def : MnemonicAlias<"retn", "retw", "att">, Requires<[In16BitMode]>; +def : MnemonicAlias<"retn", "retl", "att">, Requires<[In32BitMode]>; +def : MnemonicAlias<"retn", "retq", "att">, Requires<[In64BitMode]>; +def : MnemonicAlias<"retn", "ret", "intel">; + def : MnemonicAlias<"sal", "shl", "intel">; def : MnemonicAlias<"salb", "shlb", "att">; def : MnemonicAlias<"salw", "shlw", "att">; diff --git a/contrib/llvm/lib/Target/X86/X86InstrMMX.td b/contrib/llvm/lib/Target/X86/X86InstrMMX.td index 8d70691..0bb1068 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrMMX.td +++ b/contrib/llvm/lib/Target/X86/X86InstrMMX.td @@ -150,8 +150,9 @@ multiclass SS3I_unop_rm_int_mm<bits<8> opc, string OpcodeStr, /// Binary MMX instructions requiring SSSE3. 
let ImmT = NoImm, Constraints = "$src1 = $dst" in { multiclass SS3I_binop_rm_int_mm<bits<8> opc, string OpcodeStr, - Intrinsic IntId64, OpndItins itins> { - let isCommutable = 0 in + Intrinsic IntId64, OpndItins itins, + bit Commutable = 0> { + let isCommutable = Commutable in def rr64 : MMXSS38I<opc, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src1, VR64:$src2), !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), @@ -418,9 +419,9 @@ defm MMX_PMULHUW : MMXI_binop_rm_int<0xE4, "pmulhuw", int_x86_mmx_pmulhu_w, let Predicates = [HasSSE2] in defm MMX_PMULUDQ : MMXI_binop_rm_int<0xF4, "pmuludq", int_x86_mmx_pmulu_dq, MMX_PMUL_ITINS, 1>; -let isCommutable = 1 in defm MMX_PMULHRSW : SS3I_binop_rm_int_mm<0x0B, "pmulhrsw", - int_x86_ssse3_pmul_hr_sw, MMX_PMUL_ITINS>; + int_x86_ssse3_pmul_hr_sw, + MMX_PMUL_ITINS, 1>; // -- Miscellanea defm MMX_PMADDWD : MMXI_binop_rm_int<0xF5, "pmaddwd", int_x86_mmx_pmadd_wd, diff --git a/contrib/llvm/lib/Target/X86/X86InstrSSE.td b/contrib/llvm/lib/Target/X86/X86InstrSSE.td index f91764a..1812d01 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrSSE.td +++ b/contrib/llvm/lib/Target/X86/X86InstrSSE.td @@ -33,7 +33,6 @@ class ShiftOpndItins<InstrItinClass arg_rr, InstrItinClass arg_rm, InstrItinClass ri = arg_ri; } - // scalar let Sched = WriteFAdd in { def SSE_ALU_F32S : OpndItins< @@ -259,26 +258,24 @@ multiclass sse12_fp_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode, } /// sse12_fp_scalar_int - SSE 1 & 2 scalar instructions intrinsics class -multiclass sse12_fp_scalar_int<bits<8> opc, string OpcodeStr, RegisterClass RC, - string asm, string SSEVer, string FPSizeStr, - Operand memopr, ComplexPattern mem_cpat, - Domain d, OpndItins itins, bit Is2Addr = 1> { -let isCodeGenOnly = 1 in { +multiclass sse12_fp_scalar_int<bits<8> opc, string OpcodeStr, + SDPatternOperator Int, RegisterClass RC, + string asm, Operand memopr, + ComplexPattern mem_cpat, Domain d, + OpndItins itins, bit Is2Addr = 1> { +let isCodeGenOnly = 1, hasSideEffects = 0 in { def rr_Int : SI_Int<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), !if(Is2Addr, !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"), !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), - [(set RC:$dst, (!cast<Intrinsic>( - !strconcat("int_x86_sse", SSEVer, "_", OpcodeStr, FPSizeStr)) - RC:$src1, RC:$src2))], itins.rr, d>, + [(set RC:$dst, (Int RC:$src1, RC:$src2))], itins.rr, d>, Sched<[itins.Sched]>; + let mayLoad = 1 in def rm_Int : SI_Int<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, memopr:$src2), !if(Is2Addr, !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"), !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), - [(set RC:$dst, (!cast<Intrinsic>(!strconcat("int_x86_sse", - SSEVer, "_", OpcodeStr, FPSizeStr)) - RC:$src1, mem_cpat:$src2))], itins.rm, d>, + [(set RC:$dst, (Int RC:$src1, mem_cpat:$src2))], itins.rm, d>, Sched<[itins.Sched.Folded, ReadAfterLd]>; } } @@ -372,13 +369,9 @@ def : Pat<(insert_subvector undef, (v16i8 VR128:$src), (iPTR 0)), // Implicitly promote a 32-bit scalar to a vector. def : Pat<(v4f32 (scalar_to_vector FR32:$src)), (COPY_TO_REGCLASS FR32:$src, VR128)>; -def : Pat<(v8f32 (scalar_to_vector FR32:$src)), - (COPY_TO_REGCLASS FR32:$src, VR128)>; // Implicitly promote a 64-bit scalar to a vector. def : Pat<(v2f64 (scalar_to_vector FR64:$src)), (COPY_TO_REGCLASS FR64:$src, VR128)>; -def : Pat<(v4f64 (scalar_to_vector FR64:$src)), - (COPY_TO_REGCLASS FR64:$src, VR128)>; // Bitcasts between 128-bit vector types. 
Return the original type since // no instruction is needed for the conversion @@ -453,9 +446,9 @@ def : Pat<(v4f64 (bitconvert (v8f32 VR256:$src))), (v4f64 VR256:$src)>; let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, isPseudo = 1, SchedRW = [WriteZero] in { def FsFLD0SS : I<0, Pseudo, (outs FR32:$dst), (ins), "", - [(set FR32:$dst, fp32imm0)]>, Requires<[HasSSE1]>; + [(set FR32:$dst, fp32imm0)]>, Requires<[HasSSE1, NoVLX_Or_NoDQI]>; def FsFLD0SD : I<0, Pseudo, (outs FR64:$dst), (ins), "", - [(set FR64:$dst, fpimm0)]>, Requires<[HasSSE2]>; + [(set FR64:$dst, fpimm0)]>, Requires<[HasSSE2, NoVLX_Or_NoDQI]>; } //===----------------------------------------------------------------------===// @@ -512,6 +505,7 @@ let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, multiclass sse12_move_rr<RegisterClass RC, SDNode OpNode, ValueType vt, X86MemOperand x86memop, string base_opc, string asm_opr, Domain d = GenericDomain> { + let isCommutable = 1 in def rr : SI<0x10, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, RC:$src2), !strconcat(base_opc, asm_opr), @@ -590,6 +584,8 @@ let Predicates = [UseAVX] in { (COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128)>; def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))), (COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128)>; + def : Pat<(v4f32 (X86vzload addr:$src)), + (COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128)>; // MOVSDrm zeros the high parts of the register; represent this // with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0 @@ -609,6 +605,8 @@ let Predicates = [UseAVX] in { def : Pat<(v8f32 (X86vzmovl (insert_subvector undef, (v4f32 (scalar_to_vector (loadf32 addr:$src))), (iPTR 0)))), (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_xmm)>; + def : Pat<(v8f32 (X86vzload addr:$src)), + (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_xmm)>; def : Pat<(v4f64 (X86vzmovl (insert_subvector undef, (v2f64 (scalar_to_vector (loadf64 addr:$src))), (iPTR 0)))), (SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_xmm)>; @@ -697,6 +695,8 @@ let Predicates = [UseSSE1] in { (COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>; def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))), (COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>; + def : Pat<(v4f32 (X86vzload addr:$src)), + (COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>; } // Extract and store. 
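The new X86vzload patterns above let a scalar load whose upper vector lanes are implicitly zero be selected as a single (V)MOVSSrm rather than a load followed by a separate zeroing move. The intrinsic-level equivalent, shown only to illustrate the node's semantics:

    #include <xmmintrin.h>

    // _mm_load_ss loads one float into lane 0 and zeroes lanes 1-3, which is
    // the shape (v4f32 (X86vzload addr:$src)) describes; a lone movss load
    // already has that behaviour, so no extra zeroing instruction is needed.
    __m128 load_scalar_zero_upper(const float *p) {
      return _mm_load_ss(p);
    }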
@@ -771,13 +771,12 @@ def : InstAlias<"vmovsd\t{$src2, $src1, $dst|$dst, $src1, $src2}", multiclass sse12_mov_packed<bits<8> opc, RegisterClass RC, X86MemOperand x86memop, PatFrag ld_frag, string asm, Domain d, - OpndItins itins, - bit IsReMaterializable = 1> { + OpndItins itins> { let hasSideEffects = 0 in def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src), !strconcat(asm, "\t{$src, $dst|$dst, $src}"), [], itins.rr, d>, Sched<[WriteFShuffle]>; -let canFoldAsLoad = 1, isReMaterializable = IsReMaterializable in +let canFoldAsLoad = 1, isReMaterializable = 1 in def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), !strconcat(asm, "\t{$src, $dst|$dst, $src}"), [(set RC:$dst, (ld_frag addr:$src))], itins.rm, d>, @@ -795,7 +794,7 @@ defm VMOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32, "movups", SSEPackedSingle, SSE_MOVU_ITINS>, PS, VEX; defm VMOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64, - "movupd", SSEPackedDouble, SSE_MOVU_ITINS, 0>, + "movupd", SSEPackedDouble, SSE_MOVU_ITINS>, PD, VEX; defm VMOVAPSY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv8f32, @@ -808,7 +807,7 @@ defm VMOVUPSY : sse12_mov_packed<0x10, VR256, f256mem, loadv8f32, "movups", SSEPackedSingle, SSE_MOVU_ITINS>, PS, VEX, VEX_L; defm VMOVUPDY : sse12_mov_packed<0x10, VR256, f256mem, loadv4f64, - "movupd", SSEPackedDouble, SSE_MOVU_ITINS, 0>, + "movupd", SSEPackedDouble, SSE_MOVU_ITINS>, PD, VEX, VEX_L; } @@ -825,7 +824,7 @@ defm MOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64, "movapd", SSEPackedDouble, SSE_MOVA_ITINS>, PD; defm MOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64, - "movupd", SSEPackedDouble, SSE_MOVU_ITINS, 0>, + "movupd", SSEPackedDouble, SSE_MOVU_ITINS>, PD; } @@ -1028,7 +1027,7 @@ let Predicates = [HasAVX, NoVLX] in { (VMOVUPSmr addr:$dst, (v16i8 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>; } -let Predicates = [HasAVX, NoVLX_Or_NoBWI] in { +let Predicates = [HasAVX, NoVLX] in { // 128-bit load/store def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst), (VMOVAPSmr addr:$dst, VR128:$src)>; @@ -1077,29 +1076,6 @@ let Predicates = [UseSSE1] in { (MOVUPSmr addr:$dst, VR128:$src)>; } -// Alias instruction to load FR32 or FR64 from f128mem using movaps. Upper -// bits are disregarded. FIXME: Set encoding to pseudo! 
-let canFoldAsLoad = 1, isReMaterializable = 1, SchedRW = [WriteLoad] in { -let isCodeGenOnly = 1 in { - def FsVMOVAPSrm : VPSI<0x28, MRMSrcMem, (outs FR32:$dst), (ins f128mem:$src), - "movaps\t{$src, $dst|$dst, $src}", - [(set FR32:$dst, (alignedloadfsf32 addr:$src))], - IIC_SSE_MOVA_P_RM>, VEX; - def FsVMOVAPDrm : VPDI<0x28, MRMSrcMem, (outs FR64:$dst), (ins f128mem:$src), - "movapd\t{$src, $dst|$dst, $src}", - [(set FR64:$dst, (alignedloadfsf64 addr:$src))], - IIC_SSE_MOVA_P_RM>, VEX; - def FsMOVAPSrm : PSI<0x28, MRMSrcMem, (outs FR32:$dst), (ins f128mem:$src), - "movaps\t{$src, $dst|$dst, $src}", - [(set FR32:$dst, (alignedloadfsf32 addr:$src))], - IIC_SSE_MOVA_P_RM>; - def FsMOVAPDrm : PDI<0x28, MRMSrcMem, (outs FR64:$dst), (ins f128mem:$src), - "movapd\t{$src, $dst|$dst, $src}", - [(set FR64:$dst, (alignedloadfsf64 addr:$src))], - IIC_SSE_MOVA_P_RM>; -} -} - //===----------------------------------------------------------------------===// // SSE 1 & 2 - Move Low packed FP Instructions //===----------------------------------------------------------------------===// @@ -1300,6 +1276,7 @@ let Predicates = [UseAVX] in { def : Pat<(v2f64 (X86Unpckl VR128:$src1, (scalar_to_vector (loadf64 addr:$src2)))), (VMOVHPDrm VR128:$src1, addr:$src2)>; + // Also handle an i64 load because that may get selected as a faster way to // load the data. def : Pat<(v2f64 (X86Unpckl VR128:$src1, @@ -1307,6 +1284,11 @@ let Predicates = [UseAVX] in { (VMOVHPDrm VR128:$src1, addr:$src2)>; def : Pat<(store (f64 (extractelt + (bc_v2f64 (v4f32 (X86Movhlps VR128:$src, VR128:$src))), + (iPTR 0))), addr:$dst), + (VMOVHPDmr addr:$dst, VR128:$src)>; + + def : Pat<(store (f64 (extractelt (v2f64 (X86VPermilpi VR128:$src, (i8 1))), (iPTR 0))), addr:$dst), (VMOVHPDmr addr:$dst, VR128:$src)>; @@ -1332,6 +1314,7 @@ let Predicates = [UseSSE2] in { def : Pat<(v2f64 (X86Unpckl VR128:$src1, (scalar_to_vector (loadf64 addr:$src2)))), (MOVHPDrm VR128:$src1, addr:$src2)>; + // Also handle an i64 load because that may get selected as a faster way to // load the data. 
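The store patterns added in this hunk, together with their SSE2 counterparts just below, recognise "store the high f64 of an xmm register" whether the extract is written as a movhlps-style shuffle or a vpermilpd, and fold it into one (v)movhpd store. At the source level that access is simply the following (a sketch; the compiler may of course reach movhpd by other routes as well):

    #include <emmintrin.h>

    // Store only the upper double-precision element of a 128-bit vector; this
    // is the access the extractelt/X86Movhlps patterns above turn into a
    // single movhpd/vmovhpd store.
    void store_high_double(double *p, __m128d v) {
      _mm_storeh_pd(p, v);
    }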
def : Pat<(v2f64 (X86Unpckl VR128:$src1, @@ -1339,6 +1322,11 @@ let Predicates = [UseSSE2] in { (MOVHPDrm VR128:$src1, addr:$src2)>; def : Pat<(store (f64 (extractelt + (bc_v2f64 (v4f32 (X86Movhlps VR128:$src, VR128:$src))), + (iPTR 0))), addr:$dst), + (MOVHPDmr addr:$dst, VR128:$src)>; + + def : Pat<(store (f64 (extractelt (v2f64 (X86Shufp VR128:$src, VR128:$src, (i8 1))), (iPTR 0))), addr:$dst), (MOVHPDmr addr:$dst, VR128:$src)>; @@ -1371,6 +1359,7 @@ let Constraints = "$src1 = $dst", AddedComplexity = 20 in { [(set VR128:$dst, (v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))], IIC_SSE_MOV_LH>, Sched<[WriteFShuffle]>; + let isCommutable = 1 in def MOVHLPSrr : PSI<0x12, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), "movhlps\t{$src2, $dst|$dst, $src2}", @@ -1449,15 +1438,18 @@ multiclass sse12_cvt_s<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC, itins.rm>, Sched<[itins.Sched.Folded]>; } -multiclass sse12_cvt_p<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC, - X86MemOperand x86memop, string asm, Domain d, - OpndItins itins> { +multiclass sse12_cvt_p<bits<8> opc, RegisterClass RC, X86MemOperand x86memop, + ValueType DstTy, ValueType SrcTy, PatFrag ld_frag, + string asm, Domain d, OpndItins itins> { let hasSideEffects = 0 in { - def rr : I<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), asm, - [], itins.rr, d>, Sched<[itins.Sched]>; + def rr : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src), asm, + [(set RC:$dst, (DstTy (sint_to_fp (SrcTy RC:$src))))], + itins.rr, d>, Sched<[itins.Sched]>; let mayLoad = 1 in - def rm : I<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), asm, - [], itins.rm, d>, Sched<[itins.Sched.Folded]>; + def rm : I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), asm, + [(set RC:$dst, (DstTy (sint_to_fp + (SrcTy (bitconvert (ld_frag addr:$src))))))], + itins.rm, d>, Sched<[itins.Sched.Folded]>; } } @@ -1730,16 +1722,16 @@ defm CVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse_cvtss2si64, ssmem, sse_load_f32, "cvtss2si", SSE_CVT_SS2SI_64>, XS, REX_W; -defm VCVTDQ2PS : sse12_cvt_p<0x5B, VR128, VR128, i128mem, +defm VCVTDQ2PS : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, loadv2i64, "vcvtdq2ps\t{$src, $dst|$dst, $src}", SSEPackedSingle, SSE_CVT_PS>, - PS, VEX, Requires<[HasAVX]>; -defm VCVTDQ2PSY : sse12_cvt_p<0x5B, VR256, VR256, i256mem, + PS, VEX, Requires<[HasAVX, NoVLX]>; +defm VCVTDQ2PSY : sse12_cvt_p<0x5B, VR256, i256mem, v8f32, v8i32, loadv4i64, "vcvtdq2ps\t{$src, $dst|$dst, $src}", SSEPackedSingle, SSE_CVT_PS>, - PS, VEX, VEX_L, Requires<[HasAVX]>; + PS, VEX, VEX_L, Requires<[HasAVX, NoVLX]>; -defm CVTDQ2PS : sse12_cvt_p<0x5B, VR128, VR128, i128mem, +defm CVTDQ2PS : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, memopv2i64, "cvtdq2ps\t{$src, $dst|$dst, $src}", SSEPackedSingle, SSE_CVT_PS>, PS, Requires<[UseSSE2]>; @@ -1798,16 +1790,16 @@ def VCVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst), Sched<[WriteCvtF2FLd, ReadAfterLd]>; } -def : Pat<(f32 (fround FR64:$src)), (VCVTSD2SSrr FR64:$src, FR64:$src)>, +def : Pat<(f32 (fpround FR64:$src)), (VCVTSD2SSrr FR64:$src, FR64:$src)>, Requires<[UseAVX]>; def CVTSD2SSrr : SDI<0x5A, MRMSrcReg, (outs FR32:$dst), (ins FR64:$src), "cvtsd2ss\t{$src, $dst|$dst, $src}", - [(set FR32:$dst, (fround FR64:$src))], + [(set FR32:$dst, (fpround FR64:$src))], IIC_SSE_CVT_Scalar_RR>, Sched<[WriteCvtF2F]>; def CVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst), (ins f64mem:$src), "cvtsd2ss\t{$src, $dst|$dst, $src}", - [(set FR32:$dst, (fround (loadf64 addr:$src)))], + [(set FR32:$dst, 
(fpround (loadf64 addr:$src)))], IIC_SSE_CVT_Scalar_RM>, XD, Requires<[UseSSE2, OptForSize]>, Sched<[WriteCvtF2FLd]>; @@ -1864,9 +1856,9 @@ def VCVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), Sched<[WriteCvtF2FLd, ReadAfterLd]>; } -def : Pat<(f64 (fextend FR32:$src)), +def : Pat<(f64 (fpextend FR32:$src)), (VCVTSS2SDrr FR32:$src, FR32:$src)>, Requires<[UseAVX]>; -def : Pat<(fextend (loadf32 addr:$src)), +def : Pat<(fpextend (loadf32 addr:$src)), (VCVTSS2SDrm (f32 (IMPLICIT_DEF)), addr:$src)>, Requires<[UseAVX]>; def : Pat<(extloadf32 addr:$src), @@ -1878,7 +1870,7 @@ def : Pat<(extloadf32 addr:$src), def CVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR32:$src), "cvtss2sd\t{$src, $dst|$dst, $src}", - [(set FR64:$dst, (fextend FR32:$src))], + [(set FR64:$dst, (fpextend FR32:$src))], IIC_SSE_CVT_Scalar_RR>, XS, Requires<[UseSSE2]>, Sched<[WriteCvtF2F]>; def CVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), (ins f32mem:$src), @@ -1887,12 +1879,12 @@ def CVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), (ins f32mem:$src), IIC_SSE_CVT_Scalar_RM>, XS, Requires<[UseSSE2, OptForSize]>, Sched<[WriteCvtF2FLd]>; -// extload f32 -> f64. This matches load+fextend because we have a hack in +// extload f32 -> f64. This matches load+fpextend because we have a hack in // the isel (PreprocessForFPConvert) that can introduce loads after dag // combine. -// Since these loads aren't folded into the fextend, we have to match it +// Since these loads aren't folded into the fpextend, we have to match it // explicitly here. -def : Pat<(fextend (loadf32 addr:$src)), +def : Pat<(fpextend (loadf32 addr:$src)), (CVTSS2SDrm addr:$src)>, Requires<[UseSSE2]>; def : Pat<(extloadf32 addr:$src), (CVTSS2SDrr (MOVSSrm addr:$src))>, Requires<[UseSSE2, OptForSpeed]>; @@ -1930,6 +1922,79 @@ def Int_CVTSS2SDrm: I<0x5A, MRMSrcMem, } } // isCodeGenOnly = 1 +// Patterns used for matching (v)cvtsi2ss, (v)cvtsi2sd, (v)cvtsd2ss and +// (v)cvtss2sd intrinsic sequences from clang which produce unnecessary +// vmovs{s,d} instructions +let Predicates = [UseAVX] in { +def : Pat<(v4f32 (X86Movss + (v4f32 VR128:$dst), + (v4f32 (scalar_to_vector + (f32 (fpround (f64 (extractelt VR128:$src, (iPTR 0))))))))), + (Int_VCVTSD2SSrr VR128:$dst, VR128:$src)>; + +def : Pat<(v2f64 (X86Movsd + (v2f64 VR128:$dst), + (v2f64 (scalar_to_vector + (f64 (fpextend (f32 (extractelt VR128:$src, (iPTR 0))))))))), + (Int_VCVTSS2SDrr VR128:$dst, VR128:$src)>; + +def : Pat<(v4f32 (X86Movss + (v4f32 VR128:$dst), + (v4f32 (scalar_to_vector (f32 (sint_to_fp GR64:$src)))))), + (Int_VCVTSI2SS64rr VR128:$dst, GR64:$src)>; + +def : Pat<(v4f32 (X86Movss + (v4f32 VR128:$dst), + (v4f32 (scalar_to_vector (f32 (sint_to_fp GR32:$src)))))), + (Int_VCVTSI2SSrr VR128:$dst, GR32:$src)>; + +def : Pat<(v2f64 (X86Movsd + (v2f64 VR128:$dst), + (v2f64 (scalar_to_vector (f64 (sint_to_fp GR64:$src)))))), + (Int_VCVTSI2SD64rr VR128:$dst, GR64:$src)>; + +def : Pat<(v2f64 (X86Movsd + (v2f64 VR128:$dst), + (v2f64 (scalar_to_vector (f64 (sint_to_fp GR32:$src)))))), + (Int_VCVTSI2SDrr VR128:$dst, GR32:$src)>; +} // Predicates = [UseAVX] + +let Predicates = [UseSSE2] in { +def : Pat<(v4f32 (X86Movss + (v4f32 VR128:$dst), + (v4f32 (scalar_to_vector + (f32 (fpround (f64 (extractelt VR128:$src, (iPTR 0))))))))), + (Int_CVTSD2SSrr VR128:$dst, VR128:$src)>; + +def : Pat<(v2f64 (X86Movsd + (v2f64 VR128:$dst), + (v2f64 (scalar_to_vector + (f64 (fpextend (f32 (extractelt VR128:$src, (iPTR 0))))))))), + (Int_CVTSS2SDrr VR128:$dst, VR128:$src)>; + +def : Pat<(v2f64 (X86Movsd + (v2f64 VR128:$dst), 
+ (v2f64 (scalar_to_vector (f64 (sint_to_fp GR64:$src)))))), + (Int_CVTSI2SD64rr VR128:$dst, GR64:$src)>; + +def : Pat<(v2f64 (X86Movsd + (v2f64 VR128:$dst), + (v2f64 (scalar_to_vector (f64 (sint_to_fp GR32:$src)))))), + (Int_CVTSI2SDrr VR128:$dst, GR32:$src)>; +} // Predicates = [UseSSE2] + +let Predicates = [UseSSE1] in { +def : Pat<(v4f32 (X86Movss + (v4f32 VR128:$dst), + (v4f32 (scalar_to_vector (f32 (sint_to_fp GR64:$src)))))), + (Int_CVTSI2SS64rr VR128:$dst, GR64:$src)>; + +def : Pat<(v4f32 (X86Movss + (v4f32 VR128:$dst), + (v4f32 (scalar_to_vector (f32 (sint_to_fp GR32:$src)))))), + (Int_CVTSI2SSrr VR128:$dst, GR32:$src)>; +} // Predicates = [UseSSE1] + // Convert packed single/double fp to doubleword def VCVTPS2DQrr : VPDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvtps2dq\t{$src, $dst|$dst, $src}", @@ -1962,134 +2027,98 @@ def CVTPS2DQrm : PDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), // Convert Packed Double FP to Packed DW Integers -let Predicates = [HasAVX] in { +let Predicates = [HasAVX, NoVLX] in { // The assembler can recognize rr 256-bit instructions by seeing a ymm // register, but the same isn't true when using memory operands instead. // Provide other assembly rr and rm forms to address this explicitly. def VCVTPD2DQrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "vcvtpd2dq\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (int_x86_sse2_cvtpd2dq VR128:$src))]>, + [(set VR128:$dst, + (v4i32 (X86cvtp2Int (v2f64 VR128:$src))))]>, VEX, Sched<[WriteCvtF2I]>; // XMM only def : InstAlias<"vcvtpd2dqx\t{$src, $dst|$dst, $src}", (VCVTPD2DQrr VR128:$dst, VR128:$src), 0>; -def VCVTPD2DQXrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), - "vcvtpd2dqx\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, - (int_x86_sse2_cvtpd2dq (loadv2f64 addr:$src)))]>, VEX, - Sched<[WriteCvtF2ILd]>; +def VCVTPD2DQrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), + "vcvtpd2dq{x}\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (v4i32 (X86cvtp2Int (loadv2f64 addr:$src))))]>, VEX, + Sched<[WriteCvtF2ILd]>; +def : InstAlias<"vcvtpd2dqx\t{$src, $dst|$dst, $src}", + (VCVTPD2DQrm VR128:$dst, f128mem:$src), 0>; // YMM only def VCVTPD2DQYrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src), - "vcvtpd2dq{y}\t{$src, $dst|$dst, $src}", + "vcvtpd2dq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, - (int_x86_avx_cvt_pd2dq_256 VR256:$src))]>, VEX, VEX_L, - Sched<[WriteCvtF2I]>; + (v4i32 (X86cvtp2Int (v4f64 VR256:$src))))]>, + VEX, VEX_L, Sched<[WriteCvtF2I]>; def VCVTPD2DQYrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src), "vcvtpd2dq{y}\t{$src, $dst|$dst, $src}", [(set VR128:$dst, - (int_x86_avx_cvt_pd2dq_256 (loadv4f64 addr:$src)))]>, + (v4i32 (X86cvtp2Int (loadv4f64 addr:$src))))]>, VEX, VEX_L, Sched<[WriteCvtF2ILd]>; -def : InstAlias<"vcvtpd2dq\t{$src, $dst|$dst, $src}", +def : InstAlias<"vcvtpd2dqy\t{$src, $dst|$dst, $src}", (VCVTPD2DQYrr VR128:$dst, VR256:$src), 0>; +def : InstAlias<"vcvtpd2dqy\t{$src, $dst|$dst, $src}", + (VCVTPD2DQYrm VR128:$dst, f256mem:$src), 0>; } def CVTPD2DQrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), "cvtpd2dq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, - (int_x86_sse2_cvtpd2dq (memopv2f64 addr:$src)))], + (v4i32 (X86cvtp2Int (memopv2f64 addr:$src))))], IIC_SSE_CVT_PD_RM>, Sched<[WriteCvtF2ILd]>; def CVTPD2DQrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvtpd2dq\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (int_x86_sse2_cvtpd2dq VR128:$src))], + [(set VR128:$dst, 
+ (v4i32 (X86cvtp2Int (v2f64 VR128:$src))))], IIC_SSE_CVT_PD_RR>, Sched<[WriteCvtF2I]>; // Convert with truncation packed single/double fp to doubleword // SSE2 packed instructions with XS prefix +let Predicates = [HasAVX, NoVLX] in { def VCVTTPS2DQrr : VS2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvttps2dq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, - (int_x86_sse2_cvttps2dq VR128:$src))], + (v4i32 (fp_to_sint (v4f32 VR128:$src))))], IIC_SSE_CVT_PS_RR>, VEX, Sched<[WriteCvtF2I]>; def VCVTTPS2DQrm : VS2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), "cvttps2dq\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (int_x86_sse2_cvttps2dq - (loadv4f32 addr:$src)))], + [(set VR128:$dst, + (v4i32 (fp_to_sint (loadv4f32 addr:$src))))], IIC_SSE_CVT_PS_RM>, VEX, Sched<[WriteCvtF2ILd]>; def VCVTTPS2DQYrr : VS2SI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), "cvttps2dq\t{$src, $dst|$dst, $src}", [(set VR256:$dst, - (int_x86_avx_cvtt_ps2dq_256 VR256:$src))], + (v8i32 (fp_to_sint (v8f32 VR256:$src))))], IIC_SSE_CVT_PS_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>; def VCVTTPS2DQYrm : VS2SI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), "cvttps2dq\t{$src, $dst|$dst, $src}", - [(set VR256:$dst, (int_x86_avx_cvtt_ps2dq_256 - (loadv8f32 addr:$src)))], + [(set VR256:$dst, + (v8i32 (fp_to_sint (loadv8f32 addr:$src))))], IIC_SSE_CVT_PS_RM>, VEX, VEX_L, Sched<[WriteCvtF2ILd]>; +} def CVTTPS2DQrr : S2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvttps2dq\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (int_x86_sse2_cvttps2dq VR128:$src))], + [(set VR128:$dst, + (v4i32 (fp_to_sint (v4f32 VR128:$src))))], IIC_SSE_CVT_PS_RR>, Sched<[WriteCvtF2I]>; def CVTTPS2DQrm : S2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), "cvttps2dq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, - (int_x86_sse2_cvttps2dq (memopv4f32 addr:$src)))], + (v4i32 (fp_to_sint (memopv4f32 addr:$src))))], IIC_SSE_CVT_PS_RM>, Sched<[WriteCvtF2ILd]>; -let Predicates = [HasAVX] in { - def : Pat<(int_x86_sse2_cvtdq2ps VR128:$src), - (VCVTDQ2PSrr VR128:$src)>; - def : Pat<(int_x86_sse2_cvtdq2ps (bc_v4i32 (loadv2i64 addr:$src))), - (VCVTDQ2PSrm addr:$src)>; -} - -let Predicates = [HasAVX, NoVLX] in { - def : Pat<(v4f32 (sint_to_fp (v4i32 VR128:$src))), - (VCVTDQ2PSrr VR128:$src)>; - def : Pat<(v4f32 (sint_to_fp (bc_v4i32 (loadv2i64 addr:$src)))), - (VCVTDQ2PSrm addr:$src)>; - - def : Pat<(v4i32 (fp_to_sint (v4f32 VR128:$src))), - (VCVTTPS2DQrr VR128:$src)>; - def : Pat<(v4i32 (fp_to_sint (loadv4f32 addr:$src))), - (VCVTTPS2DQrm addr:$src)>; - - def : Pat<(v8f32 (sint_to_fp (v8i32 VR256:$src))), - (VCVTDQ2PSYrr VR256:$src)>; - def : Pat<(v8f32 (sint_to_fp (bc_v8i32 (loadv4i64 addr:$src)))), - (VCVTDQ2PSYrm addr:$src)>; - - def : Pat<(v8i32 (fp_to_sint (v8f32 VR256:$src))), - (VCVTTPS2DQYrr VR256:$src)>; - def : Pat<(v8i32 (fp_to_sint (loadv8f32 addr:$src))), - (VCVTTPS2DQYrm addr:$src)>; -} - -let Predicates = [UseSSE2] in { - def : Pat<(v4f32 (sint_to_fp (v4i32 VR128:$src))), - (CVTDQ2PSrr VR128:$src)>; - def : Pat<(v4f32 (sint_to_fp (bc_v4i32 (memopv2i64 addr:$src)))), - (CVTDQ2PSrm addr:$src)>; - - def : Pat<(int_x86_sse2_cvtdq2ps VR128:$src), - (CVTDQ2PSrr VR128:$src)>; - def : Pat<(int_x86_sse2_cvtdq2ps (bc_v4i32 (memopv2i64 addr:$src))), - (CVTDQ2PSrm addr:$src)>; - - def : Pat<(v4i32 (fp_to_sint (v4f32 VR128:$src))), - (CVTTPS2DQrr VR128:$src)>; - def : Pat<(v4i32 (fp_to_sint (memopv4f32 addr:$src))), - (CVTTPS2DQrm addr:$src)>; -} - +let Predicates = [HasAVX, NoVLX] in def VCVTTPD2DQrr : 
VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvttpd2dq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, - (int_x86_sse2_cvttpd2dq VR128:$src))], - IIC_SSE_CVT_PD_RR>, VEX, Sched<[WriteCvtF2I]>; + (v4i32 (X86cvttp2si (v2f64 VR128:$src))))], + IIC_SSE_CVT_PD_RR>, VEX, Sched<[WriteCvtF2I]>; // The assembler can recognize rr 256-bit instructions by seeing a ymm // register, but the same isn't true when using memory operands instead. @@ -2098,66 +2127,92 @@ def VCVTTPD2DQrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), // XMM only def : InstAlias<"vcvttpd2dqx\t{$src, $dst|$dst, $src}", (VCVTTPD2DQrr VR128:$dst, VR128:$src), 0>; -def VCVTTPD2DQXrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), - "cvttpd2dqx\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (int_x86_sse2_cvttpd2dq - (loadv2f64 addr:$src)))], - IIC_SSE_CVT_PD_RM>, VEX, Sched<[WriteCvtF2ILd]>; +let Predicates = [HasAVX, NoVLX] in +def VCVTTPD2DQrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), + "cvttpd2dq{x}\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (v4i32 (X86cvttp2si (loadv2f64 addr:$src))))], + IIC_SSE_CVT_PD_RM>, VEX, Sched<[WriteCvtF2ILd]>; +def : InstAlias<"vcvttpd2dqx\t{$src, $dst|$dst, $src}", + (VCVTTPD2DQrm VR128:$dst, f128mem:$src), 0>; // YMM only +let Predicates = [HasAVX, NoVLX] in { def VCVTTPD2DQYrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src), - "cvttpd2dq{y}\t{$src, $dst|$dst, $src}", + "cvttpd2dq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, - (int_x86_avx_cvtt_pd2dq_256 VR256:$src))], + (v4i32 (fp_to_sint (v4f64 VR256:$src))))], IIC_SSE_CVT_PD_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>; def VCVTTPD2DQYrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src), "cvttpd2dq{y}\t{$src, $dst|$dst, $src}", [(set VR128:$dst, - (int_x86_avx_cvtt_pd2dq_256 (loadv4f64 addr:$src)))], + (v4i32 (fp_to_sint (loadv4f64 addr:$src))))], IIC_SSE_CVT_PD_RM>, VEX, VEX_L, Sched<[WriteCvtF2ILd]>; -def : InstAlias<"vcvttpd2dq\t{$src, $dst|$dst, $src}", +} +def : InstAlias<"vcvttpd2dqy\t{$src, $dst|$dst, $src}", (VCVTTPD2DQYrr VR128:$dst, VR256:$src), 0>; +def : InstAlias<"vcvttpd2dqy\t{$src, $dst|$dst, $src}", + (VCVTTPD2DQYrm VR128:$dst, f256mem:$src), 0>; let Predicates = [HasAVX, NoVLX] in { - def : Pat<(v4i32 (fp_to_sint (v4f64 VR256:$src))), - (VCVTTPD2DQYrr VR256:$src)>; - def : Pat<(v4i32 (fp_to_sint (loadv4f64 addr:$src))), - (VCVTTPD2DQYrm addr:$src)>; + let AddedComplexity = 15 in { + def : Pat<(X86vzmovl (v2i64 (bitconvert + (v4i32 (X86cvtp2Int (v2f64 VR128:$src)))))), + (VCVTPD2DQrr VR128:$src)>; + def : Pat<(X86vzmovl (v2i64 (bitconvert + (v4i32 (X86cvttp2si (v2f64 VR128:$src)))))), + (VCVTTPD2DQrr VR128:$src)>; + } } // Predicates = [HasAVX] def CVTTPD2DQrr : PDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvttpd2dq\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (int_x86_sse2_cvttpd2dq VR128:$src))], + [(set VR128:$dst, + (v4i32 (X86cvttp2si (v2f64 VR128:$src))))], IIC_SSE_CVT_PD_RR>, Sched<[WriteCvtF2I]>; def CVTTPD2DQrm : PDI<0xE6, MRMSrcMem, (outs VR128:$dst),(ins f128mem:$src), "cvttpd2dq\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (int_x86_sse2_cvttpd2dq - (memopv2f64 addr:$src)))], - IIC_SSE_CVT_PD_RM>, - Sched<[WriteCvtF2ILd]>; + [(set VR128:$dst, + (v4i32 (X86cvttp2si (memopv2f64 addr:$src))))], + IIC_SSE_CVT_PD_RM>, Sched<[WriteCvtF2ILd]>; + +let Predicates = [UseSSE2] in { + let AddedComplexity = 15 in { + def : Pat<(X86vzmovl (v2i64 (bitconvert + (v4i32 (X86cvtp2Int (v2f64 VR128:$src)))))), + (CVTPD2DQrr VR128:$src)>; + 
def : Pat<(X86vzmovl (v2i64 (bitconvert + (v4i32 (X86cvttp2si (v2f64 VR128:$src)))))), + (CVTTPD2DQrr VR128:$src)>; + } +} // Predicates = [UseSSE2] // Convert packed single to packed double -let Predicates = [HasAVX] in { +let Predicates = [HasAVX, NoVLX] in { // SSE2 instructions without OpSize prefix def VCVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "vcvtps2pd\t{$src, $dst|$dst, $src}", - [], IIC_SSE_CVT_PD_RR>, PS, VEX, Sched<[WriteCvtF2F]>; + [(set VR128:$dst, (v2f64 (X86vfpext (v4f32 VR128:$src))))], + IIC_SSE_CVT_PD_RR>, PS, VEX, Sched<[WriteCvtF2F]>; def VCVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src), "vcvtps2pd\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))], IIC_SSE_CVT_PD_RM>, PS, VEX, Sched<[WriteCvtF2FLd]>; def VCVTPS2PDYrr : I<0x5A, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src), "vcvtps2pd\t{$src, $dst|$dst, $src}", - [], IIC_SSE_CVT_PD_RR>, PS, VEX, VEX_L, Sched<[WriteCvtF2F]>; + [(set VR256:$dst, (v4f64 (fpextend (v4f32 VR128:$src))))], + IIC_SSE_CVT_PD_RR>, PS, VEX, VEX_L, Sched<[WriteCvtF2F]>; def VCVTPS2PDYrm : I<0x5A, MRMSrcMem, (outs VR256:$dst), (ins f128mem:$src), "vcvtps2pd\t{$src, $dst|$dst, $src}", - [], IIC_SSE_CVT_PD_RM>, PS, VEX, VEX_L, Sched<[WriteCvtF2FLd]>; + [(set VR256:$dst, (v4f64 (extloadv4f32 addr:$src)))], + IIC_SSE_CVT_PD_RM>, PS, VEX, VEX_L, Sched<[WriteCvtF2FLd]>; } let Predicates = [UseSSE2] in { def CVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvtps2pd\t{$src, $dst|$dst, $src}", - [], IIC_SSE_CVT_PD_RR>, PS, Sched<[WriteCvtF2F]>; + [(set VR128:$dst, (v2f64 (X86vfpext (v4f32 VR128:$src))))], + IIC_SSE_CVT_PD_RR>, PS, Sched<[WriteCvtF2F]>; def CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src), "cvtps2pd\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))], @@ -2165,136 +2220,118 @@ def CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src), } // Convert Packed DW Integers to Packed Double FP -let Predicates = [HasAVX] in { +let Predicates = [HasAVX, NoVLX] in { let hasSideEffects = 0, mayLoad = 1 in def VCVTDQ2PDrm : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), "vcvtdq2pd\t{$src, $dst|$dst, $src}", - []>, VEX, Sched<[WriteCvtI2FLd]>; + [(set VR128:$dst, + (v2f64 (X86VSintToFP (bc_v4i32 (loadv2i64 addr:$src)))))]>, + VEX, Sched<[WriteCvtI2FLd]>; def VCVTDQ2PDrr : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "vcvtdq2pd\t{$src, $dst|$dst, $src}", - []>, VEX, Sched<[WriteCvtI2F]>; + [(set VR128:$dst, + (v2f64 (X86VSintToFP (v4i32 VR128:$src))))]>, + VEX, Sched<[WriteCvtI2F]>; def VCVTDQ2PDYrm : S2SI<0xE6, MRMSrcMem, (outs VR256:$dst), (ins i128mem:$src), "vcvtdq2pd\t{$src, $dst|$dst, $src}", - []>, VEX, VEX_L, Sched<[WriteCvtI2FLd]>; + [(set VR256:$dst, + (v4f64 (sint_to_fp (bc_v4i32 (loadv2i64 addr:$src)))))]>, + VEX, VEX_L, Sched<[WriteCvtI2FLd]>; def VCVTDQ2PDYrr : S2SI<0xE6, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src), "vcvtdq2pd\t{$src, $dst|$dst, $src}", - []>, VEX, VEX_L, Sched<[WriteCvtI2F]>; + [(set VR256:$dst, + (v4f64 (sint_to_fp (v4i32 VR128:$src))))]>, + VEX, VEX_L, Sched<[WriteCvtI2F]>; } let hasSideEffects = 0, mayLoad = 1 in def CVTDQ2PDrm : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), - "cvtdq2pd\t{$src, $dst|$dst, $src}", [], + "cvtdq2pd\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (v2f64 (X86VSintToFP (bc_v4i32 (loadv2i64 addr:$src)))))], IIC_SSE_CVT_PD_RR>, Sched<[WriteCvtI2FLd]>; def CVTDQ2PDrr : S2SI<0xE6, MRMSrcReg, 
(outs VR128:$dst), (ins VR128:$src), - "cvtdq2pd\t{$src, $dst|$dst, $src}", [], + "cvtdq2pd\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (v2f64 (X86VSintToFP (v4i32 VR128:$src))))], IIC_SSE_CVT_PD_RM>, Sched<[WriteCvtI2F]>; // AVX register conversion intrinsics -let Predicates = [HasAVX] in { - def : Pat<(v2f64 (X86cvtdq2pd (v4i32 VR128:$src))), - (VCVTDQ2PDrr VR128:$src)>; - def : Pat<(v2f64 (X86cvtdq2pd (bc_v4i32 (loadv2i64 addr:$src)))), - (VCVTDQ2PDrm addr:$src)>; - def : Pat<(v2f64 (X86cvtdq2pd (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), +let Predicates = [HasAVX, NoVLX] in { + def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), (VCVTDQ2PDrm addr:$src)>; - - def : Pat<(v4f64 (sint_to_fp (v4i32 VR128:$src))), - (VCVTDQ2PDYrr VR128:$src)>; - def : Pat<(v4f64 (sint_to_fp (bc_v4i32 (loadv2i64 addr:$src)))), - (VCVTDQ2PDYrm addr:$src)>; -} // Predicates = [HasAVX] +} // Predicates = [HasAVX, NoVLX] // SSE2 register conversion intrinsics -let Predicates = [HasSSE2] in { - def : Pat<(v2f64 (X86cvtdq2pd (v4i32 VR128:$src))), - (CVTDQ2PDrr VR128:$src)>; - def : Pat<(v2f64 (X86cvtdq2pd (bc_v4i32 (loadv2i64 addr:$src)))), - (CVTDQ2PDrm addr:$src)>; - def : Pat<(v2f64 (X86cvtdq2pd (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), +let Predicates = [UseSSE2] in { + def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), (CVTDQ2PDrm addr:$src)>; -} // Predicates = [HasSSE2] +} // Predicates = [UseSSE2] // Convert packed double to packed single // The assembler can recognize rr 256-bit instructions by seeing a ymm // register, but the same isn't true when using memory operands instead. // Provide other assembly rr and rm forms to address this explicitly. 
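// Illustrative note (added; not part of the upstream patch): with a memory
// source the operand size is ambiguous, which is why the suffixed aliases
// added below exist. A hypothetical AT&T-syntax example:
//   vcvtpd2ps   %ymm1, %xmm0     # ymm register source, size is implied
//   vcvtpd2psx  (%rax), %xmm0    # 128-bit (xmm-sized) memory source
//   vcvtpd2psy  (%rax), %xmm0    # 256-bit (ymm-sized) memory source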
+let Predicates = [HasAVX, NoVLX] in def VCVTPD2PSrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvtpd2ps\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (int_x86_sse2_cvtpd2ps VR128:$src))], + [(set VR128:$dst, (X86vfpround (v2f64 VR128:$src)))], IIC_SSE_CVT_PD_RR>, VEX, Sched<[WriteCvtF2F]>; // XMM only def : InstAlias<"vcvtpd2psx\t{$src, $dst|$dst, $src}", (VCVTPD2PSrr VR128:$dst, VR128:$src), 0>; -def VCVTPD2PSXrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), - "cvtpd2psx\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, - (int_x86_sse2_cvtpd2ps (loadv2f64 addr:$src)))], - IIC_SSE_CVT_PD_RM>, VEX, Sched<[WriteCvtF2FLd]>; +let Predicates = [HasAVX, NoVLX] in +def VCVTPD2PSrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), + "cvtpd2ps{x}\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (X86vfpround (loadv2f64 addr:$src)))], + IIC_SSE_CVT_PD_RM>, VEX, Sched<[WriteCvtF2FLd]>; +def : InstAlias<"vcvtpd2psx\t{$src, $dst|$dst, $src}", + (VCVTPD2PSrm VR128:$dst, f128mem:$src), 0>; // YMM only +let Predicates = [HasAVX, NoVLX] in { def VCVTPD2PSYrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src), - "cvtpd2ps{y}\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, - (int_x86_avx_cvt_pd2_ps_256 VR256:$src))], + "cvtpd2ps\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (fpround VR256:$src))], IIC_SSE_CVT_PD_RR>, VEX, VEX_L, Sched<[WriteCvtF2F]>; def VCVTPD2PSYrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src), "cvtpd2ps{y}\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, - (int_x86_avx_cvt_pd2_ps_256 (loadv4f64 addr:$src)))], + [(set VR128:$dst, (fpround (loadv4f64 addr:$src)))], IIC_SSE_CVT_PD_RM>, VEX, VEX_L, Sched<[WriteCvtF2FLd]>; -def : InstAlias<"vcvtpd2ps\t{$src, $dst|$dst, $src}", +} +def : InstAlias<"vcvtpd2psy\t{$src, $dst|$dst, $src}", (VCVTPD2PSYrr VR128:$dst, VR256:$src), 0>; +def : InstAlias<"vcvtpd2psy\t{$src, $dst|$dst, $src}", + (VCVTPD2PSYrm VR128:$dst, f256mem:$src), 0>; def CVTPD2PSrr : PDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvtpd2ps\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (int_x86_sse2_cvtpd2ps VR128:$src))], + [(set VR128:$dst, (X86vfpround (v2f64 VR128:$src)))], IIC_SSE_CVT_PD_RR>, Sched<[WriteCvtF2F]>; def CVTPD2PSrm : PDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), "cvtpd2ps\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, - (int_x86_sse2_cvtpd2ps (memopv2f64 addr:$src)))], + [(set VR128:$dst, (X86vfpround (memopv2f64 addr:$src)))], IIC_SSE_CVT_PD_RM>, Sched<[WriteCvtF2FLd]>; - // AVX 256-bit register conversion intrinsics // FIXME: Migrate SSE conversion intrinsics matching to use patterns as below // whenever possible to avoid declaring two versions of each one. 
-let Predicates = [HasAVX] in { - def : Pat<(int_x86_avx_cvtdq2_ps_256 VR256:$src), - (VCVTDQ2PSYrr VR256:$src)>; - def : Pat<(int_x86_avx_cvtdq2_ps_256 (bitconvert (loadv4i64 addr:$src))), - (VCVTDQ2PSYrm addr:$src)>; -} let Predicates = [HasAVX, NoVLX] in { - // Match fround and fextend for 128/256-bit conversions - def : Pat<(v4f32 (X86vfpround (v2f64 VR128:$src))), + // Match fpround and fpextend for 128/256-bit conversions + let AddedComplexity = 15 in + def : Pat<(X86vzmovl (v2f64 (bitconvert + (v4f32 (X86vfpround (v2f64 VR128:$src)))))), (VCVTPD2PSrr VR128:$src)>; - def : Pat<(v4f32 (X86vfpround (loadv2f64 addr:$src))), - (VCVTPD2PSXrm addr:$src)>; - def : Pat<(v4f32 (fround (v4f64 VR256:$src))), - (VCVTPD2PSYrr VR256:$src)>; - def : Pat<(v4f32 (fround (loadv4f64 addr:$src))), - (VCVTPD2PSYrm addr:$src)>; - - def : Pat<(v2f64 (X86vfpext (v4f32 VR128:$src))), - (VCVTPS2PDrr VR128:$src)>; - def : Pat<(v4f64 (fextend (v4f32 VR128:$src))), - (VCVTPS2PDYrr VR128:$src)>; - def : Pat<(v4f64 (extloadv4f32 addr:$src)), - (VCVTPS2PDYrm addr:$src)>; } let Predicates = [UseSSE2] in { - // Match fround and fextend for 128 conversions - def : Pat<(v4f32 (X86vfpround (v2f64 VR128:$src))), + // Match fpround and fpextend for 128 conversions + let AddedComplexity = 15 in + def : Pat<(X86vzmovl (v2f64 (bitconvert + (v4f32 (X86vfpround (v2f64 VR128:$src)))))), (CVTPD2PSrr VR128:$src)>; - def : Pat<(v4f32 (X86vfpround (memopv2f64 addr:$src))), - (CVTPD2PSrm addr:$src)>; - - def : Pat<(v2f64 (X86vfpext (v4f32 VR128:$src))), - (CVTPS2PDrr VR128:$src)>; } //===----------------------------------------------------------------------===// @@ -2306,6 +2343,7 @@ multiclass sse12_cmp_scalar<RegisterClass RC, X86MemOperand x86memop, Operand CC, SDNode OpNode, ValueType VT, PatFrag ld_frag, string asm, string asm_alt, OpndItins itins, ImmLeaf immLeaf> { + let isCommutable = 1 in def rr : SIi8<0xC2, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2, CC:$cc), asm, [(set RC:$dst, (OpNode (VT RC:$src1), RC:$src2, immLeaf:$cc))], @@ -2351,9 +2389,9 @@ let Constraints = "$src1 = $dst" in { SSE_ALU_F64S, i8immZExt3>, XD; } -multiclass sse12_cmp_scalar_int<X86MemOperand x86memop, Operand CC, +multiclass sse12_cmp_scalar_int<Operand memop, Operand CC, Intrinsic Int, string asm, OpndItins itins, - ImmLeaf immLeaf> { + ImmLeaf immLeaf, ComplexPattern mem_cpat> { def rr : SIi8<0xC2, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src, CC:$cc), asm, [(set VR128:$dst, (Int VR128:$src1, @@ -2361,30 +2399,30 @@ multiclass sse12_cmp_scalar_int<X86MemOperand x86memop, Operand CC, itins.rr>, Sched<[itins.Sched]>; def rm : SIi8<0xC2, MRMSrcMem, (outs VR128:$dst), - (ins VR128:$src1, x86memop:$src, CC:$cc), asm, + (ins VR128:$src1, memop:$src, CC:$cc), asm, [(set VR128:$dst, (Int VR128:$src1, - (load addr:$src), immLeaf:$cc))], + mem_cpat:$src, immLeaf:$cc))], itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>; } let isCodeGenOnly = 1 in { // Aliases to match intrinsics which expect XMM operand(s). 
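// Note (added for clarity; not in the patch itself): the defm changes just
// below switch these intrinsic forms from f32mem/f64mem plus a full
// 128-bit load to ssmem/sdmem with the sse_load_f32/sse_load_f64 complex
// patterns, so the folded memory operand is only matched against a genuine
// scalar 32-/64-bit load -- which is all that cmpss/cmpsd (and the
// comiss/ucomiss forms further down) actually read from memory.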
- defm Int_VCMPSS : sse12_cmp_scalar_int<f32mem, AVXCC, int_x86_sse_cmp_ss, + defm Int_VCMPSS : sse12_cmp_scalar_int<ssmem, AVXCC, int_x86_sse_cmp_ss, "cmp${cc}ss\t{$src, $src1, $dst|$dst, $src1, $src}", - SSE_ALU_F32S, i8immZExt5>, + SSE_ALU_F32S, i8immZExt5, sse_load_f32>, XS, VEX_4V; - defm Int_VCMPSD : sse12_cmp_scalar_int<f64mem, AVXCC, int_x86_sse2_cmp_sd, + defm Int_VCMPSD : sse12_cmp_scalar_int<sdmem, AVXCC, int_x86_sse2_cmp_sd, "cmp${cc}sd\t{$src, $src1, $dst|$dst, $src1, $src}", - SSE_ALU_F32S, i8immZExt5>, // same latency as f32 + SSE_ALU_F32S, i8immZExt5, sse_load_f64>, // same latency as f32 XD, VEX_4V; let Constraints = "$src1 = $dst" in { - defm Int_CMPSS : sse12_cmp_scalar_int<f32mem, SSECC, int_x86_sse_cmp_ss, + defm Int_CMPSS : sse12_cmp_scalar_int<ssmem, SSECC, int_x86_sse_cmp_ss, "cmp${cc}ss\t{$src, $dst|$dst, $src}", - SSE_ALU_F32S, i8immZExt3>, XS; - defm Int_CMPSD : sse12_cmp_scalar_int<f64mem, SSECC, int_x86_sse2_cmp_sd, + SSE_ALU_F32S, i8immZExt3, sse_load_f32>, XS; + defm Int_CMPSD : sse12_cmp_scalar_int<sdmem, SSECC, int_x86_sse2_cmp_sd, "cmp${cc}sd\t{$src, $dst|$dst, $src}", - SSE_ALU_F64S, i8immZExt3>, + SSE_ALU_F64S, i8immZExt3, sse_load_f64>, XD; } } @@ -2407,6 +2445,23 @@ multiclass sse12_ord_cmp<bits<8> opc, RegisterClass RC, SDNode OpNode, Sched<[WriteFAddLd, ReadAfterLd]>; } +// sse12_ord_cmp_int - Intrinsic version of sse12_ord_cmp +multiclass sse12_ord_cmp_int<bits<8> opc, RegisterClass RC, SDNode OpNode, + ValueType vt, Operand memop, + ComplexPattern mem_cpat, string OpcodeStr> { + def rr: SI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"), + [(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))], + IIC_SSE_COMIS_RR>, + Sched<[WriteFAdd]>; + def rm: SI<opc, MRMSrcMem, (outs), (ins RC:$src1, memop:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"), + [(set EFLAGS, (OpNode (vt RC:$src1), + mem_cpat:$src2))], + IIC_SSE_COMIS_RM>, + Sched<[WriteFAddLd, ReadAfterLd]>; +} + let Defs = [EFLAGS] in { defm VUCOMISS : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32, "ucomiss">, PS, VEX, VEX_LIG; @@ -2420,15 +2475,15 @@ let Defs = [EFLAGS] in { } let isCodeGenOnly = 1 in { - defm Int_VUCOMISS : sse12_ord_cmp<0x2E, VR128, X86ucomi, v4f32, f128mem, - load, "ucomiss">, PS, VEX; - defm Int_VUCOMISD : sse12_ord_cmp<0x2E, VR128, X86ucomi, v2f64, f128mem, - load, "ucomisd">, PD, VEX; - - defm Int_VCOMISS : sse12_ord_cmp<0x2F, VR128, X86comi, v4f32, f128mem, - load, "comiss">, PS, VEX; - defm Int_VCOMISD : sse12_ord_cmp<0x2F, VR128, X86comi, v2f64, f128mem, - load, "comisd">, PD, VEX; + defm Int_VUCOMISS : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v4f32, ssmem, + sse_load_f32, "ucomiss">, PS, VEX; + defm Int_VUCOMISD : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v2f64, sdmem, + sse_load_f64, "ucomisd">, PD, VEX; + + defm Int_VCOMISS : sse12_ord_cmp_int<0x2F, VR128, X86comi, v4f32, ssmem, + sse_load_f32, "comiss">, PS, VEX; + defm Int_VCOMISD : sse12_ord_cmp_int<0x2F, VR128, X86comi, v2f64, sdmem, + sse_load_f64, "comisd">, PD, VEX; } defm UCOMISS : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32, "ucomiss">, PS; @@ -2443,15 +2498,15 @@ let Defs = [EFLAGS] in { } let isCodeGenOnly = 1 in { - defm Int_UCOMISS : sse12_ord_cmp<0x2E, VR128, X86ucomi, v4f32, f128mem, - load, "ucomiss">, PS; - defm Int_UCOMISD : sse12_ord_cmp<0x2E, VR128, X86ucomi, v2f64, f128mem, - load, "ucomisd">, PD; - - defm Int_COMISS : sse12_ord_cmp<0x2F, VR128, X86comi, v4f32, f128mem, load, - "comiss">, PS; - defm Int_COMISD 
: sse12_ord_cmp<0x2F, VR128, X86comi, v2f64, f128mem, load, - "comisd">, PD; + defm Int_UCOMISS : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v4f32, ssmem, + sse_load_f32, "ucomiss">, PS; + defm Int_UCOMISD : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v2f64, sdmem, + sse_load_f64, "ucomisd">, PD; + + defm Int_COMISS : sse12_ord_cmp_int<0x2F, VR128, X86comi, v4f32, ssmem, + sse_load_f32, "comiss">, PS; + defm Int_COMISD : sse12_ord_cmp_int<0x2F, VR128, X86comi, v2f64, sdmem, + sse_load_f64, "comisd">, PD; } } // Defs = [EFLAGS] @@ -2641,7 +2696,8 @@ let Predicates = [UseSSE2] in { multiclass sse12_unpack_interleave<bits<8> opc, SDNode OpNode, ValueType vt, PatFrag mem_frag, RegisterClass RC, X86MemOperand x86memop, string asm, - Domain d> { + Domain d, bit IsCommutable = 0> { + let isCommutable = IsCommutable in def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), asm, [(set RC:$dst, @@ -2689,7 +2745,7 @@ let Constraints = "$src1 = $dst" in { SSEPackedSingle>, PS; defm UNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, memopv2f64, VR128, f128mem, "unpckhpd\t{$src2, $dst|$dst, $src2}", - SSEPackedDouble>, PD; + SSEPackedDouble, 1>, PD; defm UNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, memopv4f32, VR128, f128mem, "unpcklps\t{$src2, $dst|$dst, $src2}", SSEPackedSingle>, PS; @@ -2810,84 +2866,6 @@ defm PANDN : PDI_binop_all<0xDF, "pandn", X86andnp, v2i64, v4i64, // SSE 1 & 2 - Logical Instructions //===----------------------------------------------------------------------===// -// Multiclass for scalars using the X86 logical operation aliases for FP. -multiclass sse12_fp_packed_scalar_logical_alias< - bits<8> opc, string OpcodeStr, SDNode OpNode, OpndItins itins> { - defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, - FR32, f32, f128mem, loadf32_128, SSEPackedSingle, itins, 0>, - PS, VEX_4V; - - defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, - FR64, f64, f128mem, loadf64_128, SSEPackedDouble, itins, 0>, - PD, VEX_4V; - - let Constraints = "$src1 = $dst" in { - defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, FR32, - f32, f128mem, memopfsf32_128, SSEPackedSingle, itins>, PS; - - defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, FR64, - f64, f128mem, memopfsf64_128, SSEPackedDouble, itins>, PD; - } -} - -let isCodeGenOnly = 1 in { - defm FsAND : sse12_fp_packed_scalar_logical_alias<0x54, "and", X86fand, - SSE_BIT_ITINS_P>; - defm FsOR : sse12_fp_packed_scalar_logical_alias<0x56, "or", X86for, - SSE_BIT_ITINS_P>; - defm FsXOR : sse12_fp_packed_scalar_logical_alias<0x57, "xor", X86fxor, - SSE_BIT_ITINS_P>; - - let isCommutable = 0 in - defm FsANDN : sse12_fp_packed_scalar_logical_alias<0x55, "andn", X86fandn, - SSE_BIT_ITINS_P>; -} - -// Multiclass for vectors using the X86 logical operation aliases for FP. 
-multiclass sse12_fp_packed_vector_logical_alias< - bits<8> opc, string OpcodeStr, SDNode OpNode, OpndItins itins> { - let Predicates = [HasAVX, NoVLX_Or_NoDQI] in { - defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, - VR128, v4f32, f128mem, loadv4f32, SSEPackedSingle, itins, 0>, - PS, VEX_4V; - - defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, - VR128, v2f64, f128mem, loadv2f64, SSEPackedDouble, itins, 0>, - PD, VEX_4V; - - defm V#NAME#PSY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, - VR256, v8f32, f256mem, loadv8f32, SSEPackedSingle, itins, 0>, - PS, VEX_4V, VEX_L; - - defm V#NAME#PDY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, - VR256, v4f64, f256mem, loadv4f64, SSEPackedDouble, itins, 0>, - PD, VEX_4V, VEX_L; - } - - let Constraints = "$src1 = $dst" in { - defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR128, - v4f32, f128mem, memopv4f32, SSEPackedSingle, itins>, - PS; - - defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR128, - v2f64, f128mem, memopv2f64, SSEPackedDouble, itins>, - PD; - } -} - -let isCodeGenOnly = 1 in { - defm FvAND : sse12_fp_packed_vector_logical_alias<0x54, "and", X86fand, - SSE_BIT_ITINS_P>; - defm FvOR : sse12_fp_packed_vector_logical_alias<0x56, "or", X86for, - SSE_BIT_ITINS_P>; - defm FvXOR : sse12_fp_packed_vector_logical_alias<0x57, "xor", X86fxor, - SSE_BIT_ITINS_P>; - - let isCommutable = 0 in - defm FvANDN : sse12_fp_packed_vector_logical_alias<0x55, "andn", X86fandn, - SSE_BIT_ITINS_P>; -} - /// sse12_fp_packed_logical - SSE 1 & 2 packed FP logical ops /// multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr, @@ -2895,7 +2873,8 @@ multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr, let Predicates = [HasAVX, NoVLX] in { defm V#NAME#PSY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedSingle, !strconcat(OpcodeStr, "ps"), f256mem, - [(set VR256:$dst, (v4i64 (OpNode VR256:$src1, VR256:$src2)))], + [(set VR256:$dst, (OpNode (bc_v4i64 (v8f32 VR256:$src1)), + (bc_v4i64 (v8f32 VR256:$src2))))], [(set VR256:$dst, (OpNode (bc_v4i64 (v8f32 VR256:$src1)), (loadv4i64 addr:$src2)))], 0>, PS, VEX_4V, VEX_L; @@ -2907,12 +2886,10 @@ multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr, (loadv4i64 addr:$src2)))], 0>, PD, VEX_4V, VEX_L; - // In AVX no need to add a pattern for 128-bit logical rr ps, because they - // are all promoted to v2i64, and the patterns are covered by the int - // version. This is needed in SSE only, because v2i64 isn't supported on - // SSE1, but only on SSE2. 
defm V#NAME#PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle, - !strconcat(OpcodeStr, "ps"), f128mem, [], + !strconcat(OpcodeStr, "ps"), f128mem, + [(set VR128:$dst, (OpNode (bc_v2i64 (v4f32 VR128:$src1)), + (bc_v2i64 (v4f32 VR128:$src2))))], [(set VR128:$dst, (OpNode (bc_v2i64 (v4f32 VR128:$src1)), (loadv2i64 addr:$src2)))], 0>, PS, VEX_4V; @@ -2928,7 +2905,8 @@ multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr, let Constraints = "$src1 = $dst" in { defm PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle, !strconcat(OpcodeStr, "ps"), f128mem, - [(set VR128:$dst, (v2i64 (OpNode VR128:$src1, VR128:$src2)))], + [(set VR128:$dst, (OpNode (bc_v2i64 (v4f32 VR128:$src1)), + (bc_v2i64 (v4f32 VR128:$src2))))], [(set VR128:$dst, (OpNode (bc_v2i64 (v4f32 VR128:$src1)), (memopv2i64 addr:$src2)))]>, PS; @@ -2947,19 +2925,124 @@ defm XOR : sse12_fp_packed_logical<0x57, "xor", xor>; let isCommutable = 0 in defm ANDN : sse12_fp_packed_logical<0x55, "andn", X86andnp>; -// AVX1 requires type coercions in order to fold loads directly into logical -// operations. +// If only AVX1 is supported, we need to handle integer operations with +// floating point instructions since the integer versions aren't available. let Predicates = [HasAVX1Only] in { - def : Pat<(bc_v8f32 (and VR256:$src1, (loadv4i64 addr:$src2))), + def : Pat<(v4i64 (and VR256:$src1, VR256:$src2)), + (VANDPSYrr VR256:$src1, VR256:$src2)>; + def : Pat<(v4i64 (or VR256:$src1, VR256:$src2)), + (VORPSYrr VR256:$src1, VR256:$src2)>; + def : Pat<(v4i64 (xor VR256:$src1, VR256:$src2)), + (VXORPSYrr VR256:$src1, VR256:$src2)>; + def : Pat<(v4i64 (X86andnp VR256:$src1, VR256:$src2)), + (VANDNPSYrr VR256:$src1, VR256:$src2)>; + + def : Pat<(and VR256:$src1, (loadv4i64 addr:$src2)), (VANDPSYrm VR256:$src1, addr:$src2)>; - def : Pat<(bc_v8f32 (or VR256:$src1, (loadv4i64 addr:$src2))), + def : Pat<(or VR256:$src1, (loadv4i64 addr:$src2)), (VORPSYrm VR256:$src1, addr:$src2)>; - def : Pat<(bc_v8f32 (xor VR256:$src1, (loadv4i64 addr:$src2))), + def : Pat<(xor VR256:$src1, (loadv4i64 addr:$src2)), (VXORPSYrm VR256:$src1, addr:$src2)>; - def : Pat<(bc_v8f32 (X86andnp VR256:$src1, (loadv4i64 addr:$src2))), + def : Pat<(X86andnp VR256:$src1, (loadv4i64 addr:$src2)), (VANDNPSYrm VR256:$src1, addr:$src2)>; } +let Predicates = [HasAVX, NoVLX_Or_NoDQI] in { + // Use packed logical operations for scalar ops. 
+ def : Pat<(f64 (X86fand FR64:$src1, FR64:$src2)), + (COPY_TO_REGCLASS (VANDPDrr + (COPY_TO_REGCLASS FR64:$src1, VR128), + (COPY_TO_REGCLASS FR64:$src2, VR128)), FR64)>; + def : Pat<(f64 (X86for FR64:$src1, FR64:$src2)), + (COPY_TO_REGCLASS (VORPDrr + (COPY_TO_REGCLASS FR64:$src1, VR128), + (COPY_TO_REGCLASS FR64:$src2, VR128)), FR64)>; + def : Pat<(f64 (X86fxor FR64:$src1, FR64:$src2)), + (COPY_TO_REGCLASS (VXORPDrr + (COPY_TO_REGCLASS FR64:$src1, VR128), + (COPY_TO_REGCLASS FR64:$src2, VR128)), FR64)>; + def : Pat<(f64 (X86fandn FR64:$src1, FR64:$src2)), + (COPY_TO_REGCLASS (VANDNPDrr + (COPY_TO_REGCLASS FR64:$src1, VR128), + (COPY_TO_REGCLASS FR64:$src2, VR128)), FR64)>; + + def : Pat<(f32 (X86fand FR32:$src1, FR32:$src2)), + (COPY_TO_REGCLASS (VANDPSrr + (COPY_TO_REGCLASS FR32:$src1, VR128), + (COPY_TO_REGCLASS FR32:$src2, VR128)), FR32)>; + def : Pat<(f32 (X86for FR32:$src1, FR32:$src2)), + (COPY_TO_REGCLASS (VORPSrr + (COPY_TO_REGCLASS FR32:$src1, VR128), + (COPY_TO_REGCLASS FR32:$src2, VR128)), FR32)>; + def : Pat<(f32 (X86fxor FR32:$src1, FR32:$src2)), + (COPY_TO_REGCLASS (VXORPSrr + (COPY_TO_REGCLASS FR32:$src1, VR128), + (COPY_TO_REGCLASS FR32:$src2, VR128)), FR32)>; + def : Pat<(f32 (X86fandn FR32:$src1, FR32:$src2)), + (COPY_TO_REGCLASS (VANDNPSrr + (COPY_TO_REGCLASS FR32:$src1, VR128), + (COPY_TO_REGCLASS FR32:$src2, VR128)), FR32)>; +} + +let Predicates = [UseSSE1] in { + // Use packed logical operations for scalar ops. + def : Pat<(f32 (X86fand FR32:$src1, FR32:$src2)), + (COPY_TO_REGCLASS (ANDPSrr + (COPY_TO_REGCLASS FR32:$src1, VR128), + (COPY_TO_REGCLASS FR32:$src2, VR128)), FR32)>; + def : Pat<(f32 (X86for FR32:$src1, FR32:$src2)), + (COPY_TO_REGCLASS (ORPSrr + (COPY_TO_REGCLASS FR32:$src1, VR128), + (COPY_TO_REGCLASS FR32:$src2, VR128)), FR32)>; + def : Pat<(f32 (X86fxor FR32:$src1, FR32:$src2)), + (COPY_TO_REGCLASS (XORPSrr + (COPY_TO_REGCLASS FR32:$src1, VR128), + (COPY_TO_REGCLASS FR32:$src2, VR128)), FR32)>; + def : Pat<(f32 (X86fandn FR32:$src1, FR32:$src2)), + (COPY_TO_REGCLASS (ANDNPSrr + (COPY_TO_REGCLASS FR32:$src1, VR128), + (COPY_TO_REGCLASS FR32:$src2, VR128)), FR32)>; +} + +let Predicates = [UseSSE2] in { + // Use packed logical operations for scalar ops. + def : Pat<(f64 (X86fand FR64:$src1, FR64:$src2)), + (COPY_TO_REGCLASS (ANDPDrr + (COPY_TO_REGCLASS FR64:$src1, VR128), + (COPY_TO_REGCLASS FR64:$src2, VR128)), FR64)>; + def : Pat<(f64 (X86for FR64:$src1, FR64:$src2)), + (COPY_TO_REGCLASS (ORPDrr + (COPY_TO_REGCLASS FR64:$src1, VR128), + (COPY_TO_REGCLASS FR64:$src2, VR128)), FR64)>; + def : Pat<(f64 (X86fxor FR64:$src1, FR64:$src2)), + (COPY_TO_REGCLASS (XORPDrr + (COPY_TO_REGCLASS FR64:$src1, VR128), + (COPY_TO_REGCLASS FR64:$src2, VR128)), FR64)>; + def : Pat<(f64 (X86fandn FR64:$src1, FR64:$src2)), + (COPY_TO_REGCLASS (ANDNPDrr + (COPY_TO_REGCLASS FR64:$src1, VR128), + (COPY_TO_REGCLASS FR64:$src2, VR128)), FR64)>; +} + +// Patterns for packed operations when we don't have integer type available. 
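// Note (added for illustration; not part of the patch): the scalar patterns
// above replace the removed codegen-only FsAND/FvAND-style multiclasses by
// copying FR32/FR64 values into VR128 and reusing the packed ANDPS/ANDPD
// family. For example, an f32 X86fxor (typically from the sign-bit
// manipulation used when lowering fneg/fabs) is selected as:
//   (COPY_TO_REGCLASS (XORPSrr (COPY_TO_REGCLASS FR32:$src1, VR128),
//                              (COPY_TO_REGCLASS FR32:$src2, VR128)), FR32)
// The v4f32 patterns right below cover SSE1-only targets, where no legal
// 128-bit integer type is available for these bitwise operations.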
+def : Pat<(v4f32 (X86fand VR128:$src1, VR128:$src2)), + (ANDPSrr VR128:$src1, VR128:$src2)>; +def : Pat<(v4f32 (X86for VR128:$src1, VR128:$src2)), + (ORPSrr VR128:$src1, VR128:$src2)>; +def : Pat<(v4f32 (X86fxor VR128:$src1, VR128:$src2)), + (XORPSrr VR128:$src1, VR128:$src2)>; +def : Pat<(v4f32 (X86fandn VR128:$src1, VR128:$src2)), + (ANDNPSrr VR128:$src1, VR128:$src2)>; + +def : Pat<(X86fand VR128:$src1, (memopv4f32 addr:$src2)), + (ANDPSrm VR128:$src1, addr:$src2)>; +def : Pat<(X86for VR128:$src1, (memopv4f32 addr:$src2)), + (ORPSrm VR128:$src1, addr:$src2)>; +def : Pat<(X86fxor VR128:$src1, (memopv4f32 addr:$src2)), + (XORPSrm VR128:$src1, addr:$src2)>; +def : Pat<(X86fandn VR128:$src1, (memopv4f32 addr:$src2)), + (ANDNPSrm VR128:$src1, addr:$src2)>; + //===----------------------------------------------------------------------===// // SSE 1 & 2 - Arithmetic Instructions //===----------------------------------------------------------------------===// @@ -3025,20 +3108,22 @@ multiclass basic_sse12_fp_binop_s<bits<8> opc, string OpcodeStr, SDNode OpNode, } multiclass basic_sse12_fp_binop_s_int<bits<8> opc, string OpcodeStr, + SDPatternOperator IntSS, + SDPatternOperator IntSD, SizeItins itins> { - defm V#NAME#SS : sse12_fp_scalar_int<opc, OpcodeStr, VR128, - !strconcat(OpcodeStr, "ss"), "", "_ss", ssmem, sse_load_f32, + defm V#NAME#SS : sse12_fp_scalar_int<opc, OpcodeStr, IntSS, VR128, + !strconcat(OpcodeStr, "ss"), ssmem, sse_load_f32, SSEPackedSingle, itins.s, 0>, XS, VEX_4V, VEX_LIG; - defm V#NAME#SD : sse12_fp_scalar_int<opc, OpcodeStr, VR128, - !strconcat(OpcodeStr, "sd"), "2", "_sd", sdmem, sse_load_f64, + defm V#NAME#SD : sse12_fp_scalar_int<opc, OpcodeStr, IntSD, VR128, + !strconcat(OpcodeStr, "sd"), sdmem, sse_load_f64, SSEPackedDouble, itins.d, 0>, XD, VEX_4V, VEX_LIG; let Constraints = "$src1 = $dst" in { - defm SS : sse12_fp_scalar_int<opc, OpcodeStr, VR128, - !strconcat(OpcodeStr, "ss"), "", "_ss", ssmem, sse_load_f32, + defm SS : sse12_fp_scalar_int<opc, OpcodeStr, IntSS, VR128, + !strconcat(OpcodeStr, "ss"), ssmem, sse_load_f32, SSEPackedSingle, itins.s>, XS; - defm SD : sse12_fp_scalar_int<opc, OpcodeStr, VR128, - !strconcat(OpcodeStr, "sd"), "2", "_sd", sdmem, sse_load_f64, + defm SD : sse12_fp_scalar_int<opc, OpcodeStr, IntSD, VR128, + !strconcat(OpcodeStr, "sd"), sdmem, sse_load_f64, SSEPackedDouble, itins.d>, XD; } } @@ -3046,23 +3131,29 @@ multiclass basic_sse12_fp_binop_s_int<bits<8> opc, string OpcodeStr, // Binary Arithmetic instructions defm ADD : basic_sse12_fp_binop_p<0x58, "add", fadd, SSE_ALU_ITINS_P>, basic_sse12_fp_binop_s<0x58, "add", fadd, SSE_ALU_ITINS_S>, - basic_sse12_fp_binop_s_int<0x58, "add", SSE_ALU_ITINS_S>; + basic_sse12_fp_binop_s_int<0x58, "add", null_frag, null_frag, + SSE_ALU_ITINS_S>; defm MUL : basic_sse12_fp_binop_p<0x59, "mul", fmul, SSE_MUL_ITINS_P>, basic_sse12_fp_binop_s<0x59, "mul", fmul, SSE_MUL_ITINS_S>, - basic_sse12_fp_binop_s_int<0x59, "mul", SSE_MUL_ITINS_S>; + basic_sse12_fp_binop_s_int<0x59, "mul", null_frag, null_frag, + SSE_MUL_ITINS_S>; let isCommutable = 0 in { defm SUB : basic_sse12_fp_binop_p<0x5C, "sub", fsub, SSE_ALU_ITINS_P>, basic_sse12_fp_binop_s<0x5C, "sub", fsub, SSE_ALU_ITINS_S>, - basic_sse12_fp_binop_s_int<0x5C, "sub", SSE_ALU_ITINS_S>; + basic_sse12_fp_binop_s_int<0x5C, "sub", null_frag, null_frag, + SSE_ALU_ITINS_S>; defm DIV : basic_sse12_fp_binop_p<0x5E, "div", fdiv, SSE_DIV_ITINS_P>, basic_sse12_fp_binop_s<0x5E, "div", fdiv, SSE_DIV_ITINS_S>, - basic_sse12_fp_binop_s_int<0x5E, "div", SSE_DIV_ITINS_S>; + 
basic_sse12_fp_binop_s_int<0x5E, "div", null_frag, null_frag, + SSE_DIV_ITINS_S>; defm MAX : basic_sse12_fp_binop_p<0x5F, "max", X86fmax, SSE_ALU_ITINS_P>, basic_sse12_fp_binop_s<0x5F, "max", X86fmax, SSE_ALU_ITINS_S>, - basic_sse12_fp_binop_s_int<0x5F, "max", SSE_ALU_ITINS_S>; + basic_sse12_fp_binop_s_int<0x5F, "max", int_x86_sse_max_ss, + int_x86_sse2_max_sd, SSE_ALU_ITINS_S>; defm MIN : basic_sse12_fp_binop_p<0x5D, "min", X86fmin, SSE_ALU_ITINS_P>, basic_sse12_fp_binop_s<0x5D, "min", X86fmin, SSE_ALU_ITINS_S>, - basic_sse12_fp_binop_s_int<0x5D, "min", SSE_ALU_ITINS_S>; + basic_sse12_fp_binop_s_int<0x5D, "min", int_x86_sse_min_ss, + int_x86_sse2_min_sd, SSE_ALU_ITINS_S>; } let isCodeGenOnly = 1 in { @@ -3145,9 +3236,15 @@ multiclass scalar_math_f32_patterns<SDNode Op, string OpcPrefix> { } - // Repeat everything for AVX, except for the movss + scalar combo... - // because that one shouldn't occur with AVX codegen? - let Predicates = [HasAVX] in { + // Repeat everything for AVX. + let Predicates = [UseAVX] in { + // extracted scalar math op with insert via movss + def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), (v4f32 (scalar_to_vector + (Op (f32 (extractelt (v4f32 VR128:$dst), (iPTR 0))), + FR32:$src))))), + (!cast<I>("V"#OpcPrefix#SSrr_Int) v4f32:$dst, + (COPY_TO_REGCLASS FR32:$src, VR128))>; + // extracted scalar math op with insert via blend def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (Op (f32 (extractelt (v4f32 VR128:$dst), (iPTR 0))), @@ -3203,7 +3300,7 @@ multiclass scalar_math_f64_patterns<SDNode Op, string OpcPrefix> { } // Repeat everything for AVX. - let Predicates = [HasAVX] in { + let Predicates = [UseAVX] in { // extracted scalar math op with insert via movsd def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (Op (f64 (extractelt (v2f64 VR128:$dst), (iPTR 0))), @@ -3287,8 +3384,8 @@ def SSE_RCPS : OpndItins< /// the HW instructions are 2 operand / destructive. multiclass sse_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC, ValueType vt, ValueType ScalarVT, - X86MemOperand x86memop, Operand vec_memop, - ComplexPattern mem_cpat, Intrinsic Intr, + X86MemOperand x86memop, + Intrinsic Intr, SDNode OpNode, Domain d, OpndItins itins, Predicate target, string Suffix> { let hasSideEffects = 0 in { @@ -3308,23 +3405,17 @@ multiclass sse_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC, !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), []>, Sched<[itins.Sched.Folded, ReadAfterLd]>; let mayLoad = 1 in - def m_Int : I<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, vec_memop:$src2), + def m_Int : I<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, x86memop:$src2), !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), []>, Sched<[itins.Sched.Folded, ReadAfterLd]>; } } let Predicates = [target] in { - def : Pat<(vt (OpNode mem_cpat:$src)), - (vt (COPY_TO_REGCLASS (vt (!cast<Instruction>(NAME#Suffix##m_Int) - (vt (IMPLICIT_DEF)), mem_cpat:$src)), RC))>; // These are unary operations, but they are modeled as having 2 source operands // because the high elements of the destination are unchanged in SSE. def : Pat<(Intr VR128:$src), (!cast<Instruction>(NAME#Suffix##r_Int) VR128:$src, VR128:$src)>; - def : Pat<(Intr (load addr:$src)), - (vt (COPY_TO_REGCLASS(!cast<Instruction>(NAME#Suffix##m) - addr:$src), VR128))>; } // We don't want to fold scalar loads into these instructions unless // optimizing for size. 
This is because the folded instruction will have a @@ -3334,16 +3425,15 @@ multiclass sse_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC, // which has a clobber before the rcp, vs. // rcpss mem, %xmm0 let Predicates = [target, OptForSize] in { - def : Pat<(Intr mem_cpat:$src), + def : Pat<(Intr (scalar_to_vector (ScalarVT (load addr:$src2)))), (!cast<Instruction>(NAME#Suffix##m_Int) - (vt (IMPLICIT_DEF)), mem_cpat:$src)>; + (vt (IMPLICIT_DEF)), addr:$src2)>; } } multiclass avx_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC, ValueType vt, ValueType ScalarVT, - X86MemOperand x86memop, Operand vec_memop, - ComplexPattern mem_cpat, + X86MemOperand x86memop, Intrinsic Intr, SDNode OpNode, Domain d, OpndItins itins, string Suffix> { let hasSideEffects = 0 in { @@ -3361,7 +3451,7 @@ multiclass avx_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC, []>, Sched<[itins.Sched.Folded]>; let mayLoad = 1 in def m_Int : I<opc, MRMSrcMem, (outs VR128:$dst), - (ins VR128:$src1, vec_memop:$src2), + (ins VR128:$src1, x86memop:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, Sched<[itins.Sched.Folded, ReadAfterLd]>; } @@ -3382,21 +3472,18 @@ multiclass avx_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC, } let Predicates = [HasAVX] in { def : Pat<(Intr VR128:$src), - (!cast<Instruction>("V"#NAME#Suffix##r_Int) (vt (IMPLICIT_DEF)), + (!cast<Instruction>("V"#NAME#Suffix##r_Int) VR128:$src, VR128:$src)>; } let Predicates = [HasAVX, OptForSize] in { - def : Pat<(Intr mem_cpat:$src), + def : Pat<(Intr (scalar_to_vector (ScalarVT (load addr:$src2)))), (!cast<Instruction>("V"#NAME#Suffix##m_Int) - (vt (IMPLICIT_DEF)), mem_cpat:$src)>; + (vt (IMPLICIT_DEF)), addr:$src2)>; } let Predicates = [UseAVX, OptForSize] in { def : Pat<(ScalarVT (OpNode (load addr:$src))), (!cast<Instruction>("V"#NAME#Suffix##m) (ScalarVT (IMPLICIT_DEF)), addr:$src)>; - def : Pat<(vt (OpNode mem_cpat:$src)), - (!cast<Instruction>("V"#NAME#Suffix##m_Int) (vt (IMPLICIT_DEF)), - mem_cpat:$src)>; } } @@ -3475,11 +3562,10 @@ let Predicates = [HasAVX] in { multiclass sse1_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode, OpndItins itins> { defm SS : sse_fp_unop_s<opc, OpcodeStr##ss, FR32, v4f32, f32, f32mem, - ssmem, sse_load_f32, !cast<Intrinsic>("int_x86_sse_"##OpcodeStr##_ss), OpNode, SSEPackedSingle, itins, UseSSE1, "SS">, XS; defm V#NAME#SS : avx_fp_unop_s<opc, "v"#OpcodeStr##ss, FR32, v4f32, f32, - f32mem, ssmem, sse_load_f32, + f32mem, !cast<Intrinsic>("int_x86_sse_"##OpcodeStr##_ss), OpNode, SSEPackedSingle, itins, "SS">, XS, VEX_4V, VEX_LIG; } @@ -3487,11 +3573,10 @@ multiclass sse1_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode, multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode, OpndItins itins> { defm SD : sse_fp_unop_s<opc, OpcodeStr##sd, FR64, v2f64, f64, f64mem, - sdmem, sse_load_f64, !cast<Intrinsic>("int_x86_sse2_"##OpcodeStr##_sd), OpNode, SSEPackedDouble, itins, UseSSE2, "SD">, XD; defm V#NAME#SD : avx_fp_unop_s<opc, "v"#OpcodeStr##sd, FR64, v2f64, f64, - f64mem, sdmem, sse_load_f64, + f64mem, !cast<Intrinsic>("int_x86_sse2_"##OpcodeStr##_sd), OpNode, SSEPackedDouble, itins, "SD">, XD, VEX_4V, VEX_LIG; @@ -3805,13 +3890,14 @@ def VMOVDQUYmr : I<0x7F, MRMDestMem, (outs), (ins i256mem:$dst, VR256:$src), } let SchedRW = [WriteMove] in { -let hasSideEffects = 0 in +let hasSideEffects = 0 in { def MOVDQArr : PDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RR>; def 
MOVDQUrr : I<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "movdqu\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVU_P_RR>, XS, Requires<[UseSSE2]>; +} // For Disassembler let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in { @@ -3874,85 +3960,12 @@ def SSE_PMADD : OpndItins< let ExeDomain = SSEPackedInt in { // SSE integer instructions -multiclass PDI_binop_rm_int<bits<8> opc, string OpcodeStr, Intrinsic IntId, - RegisterClass RC, PatFrag memop_frag, - X86MemOperand x86memop, - OpndItins itins, - bit IsCommutable = 0, - bit Is2Addr = 1> { - let isCommutable = IsCommutable in - def rr : PDI<opc, MRMSrcReg, (outs RC:$dst), - (ins RC:$src1, RC:$src2), - !if(Is2Addr, - !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), - !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), - [(set RC:$dst, (IntId RC:$src1, RC:$src2))], itins.rr>, - Sched<[itins.Sched]>; - def rm : PDI<opc, MRMSrcMem, (outs RC:$dst), - (ins RC:$src1, x86memop:$src2), - !if(Is2Addr, - !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), - !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), - [(set RC:$dst, (IntId RC:$src1, (bitconvert (memop_frag addr:$src2))))], - itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>; -} - -multiclass PDI_binop_all_int<bits<8> opc, string OpcodeStr, Intrinsic IntId128, - Intrinsic IntId256, OpndItins itins, - bit IsCommutable = 0> { -let Predicates = [HasAVX] in - defm V#NAME : PDI_binop_rm_int<opc, !strconcat("v", OpcodeStr), IntId128, - VR128, loadv2i64, i128mem, itins, - IsCommutable, 0>, VEX_4V; - -let Constraints = "$src1 = $dst" in - defm NAME : PDI_binop_rm_int<opc, OpcodeStr, IntId128, VR128, memopv2i64, - i128mem, itins, IsCommutable, 1>; - -let Predicates = [HasAVX2] in - defm V#NAME#Y : PDI_binop_rm_int<opc, !strconcat("v", OpcodeStr), IntId256, - VR256, loadv4i64, i256mem, itins, - IsCommutable, 0>, VEX_4V, VEX_L; -} - -multiclass PDI_binop_rmi<bits<8> opc, bits<8> opc2, Format ImmForm, - string OpcodeStr, SDNode OpNode, - SDNode OpNode2, RegisterClass RC, - ValueType DstVT, ValueType SrcVT, - PatFrag ld_frag, ShiftOpndItins itins, - bit Is2Addr = 1> { - // src2 is always 128-bit - def rr : PDI<opc, MRMSrcReg, (outs RC:$dst), - (ins RC:$src1, VR128:$src2), - !if(Is2Addr, - !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), - !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), - [(set RC:$dst, (DstVT (OpNode RC:$src1, (SrcVT VR128:$src2))))], - itins.rr>, Sched<[WriteVecShift]>; - def rm : PDI<opc, MRMSrcMem, (outs RC:$dst), - (ins RC:$src1, i128mem:$src2), - !if(Is2Addr, - !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), - !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), - [(set RC:$dst, (DstVT (OpNode RC:$src1, - (SrcVT (bitconvert (ld_frag addr:$src2))))))], itins.rm>, - Sched<[WriteVecShiftLd, ReadAfterLd]>; - def ri : PDIi8<opc2, ImmForm, (outs RC:$dst), - (ins RC:$src1, u8imm:$src2), - !if(Is2Addr, - !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), - !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), - [(set RC:$dst, (DstVT (OpNode2 RC:$src1, (i8 imm:$src2))))], itins.ri>, - Sched<[WriteVecShift]>; -} - /// PDI_binop_rm2 - Simple SSE2 binary operator with different src and dst types multiclass PDI_binop_rm2<bits<8> opc, string OpcodeStr, SDNode OpNode, ValueType DstVT, ValueType SrcVT, RegisterClass RC, PatFrag memop_frag, X86MemOperand x86memop, - OpndItins itins, - bit IsCommutable = 0, bit Is2Addr = 1> { - let isCommutable = IsCommutable in + OpndItins 
itins, bit Is2Addr = 1> { + let isCommutable = 1 in def rr : PDI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), !if(Is2Addr, @@ -3984,9 +3997,9 @@ defm PADDSB : PDI_binop_all<0xEC, "paddsb", X86adds, v16i8, v32i8, defm PADDSW : PDI_binop_all<0xED, "paddsw", X86adds, v8i16, v16i16, SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>; defm PADDUSB : PDI_binop_all<0xDC, "paddusb", X86addus, v16i8, v32i8, - SSE_INTALU_ITINS_P, 0, NoVLX_Or_NoBWI>; + SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>; defm PADDUSW : PDI_binop_all<0xDD, "paddusw", X86addus, v8i16, v16i16, - SSE_INTALU_ITINS_P, 0, NoVLX_Or_NoBWI>; + SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>; defm PMULLW : PDI_binop_all<0xD5, "pmullw", mul, v8i16, v16i16, SSE_INTMUL_ITINS_P, 1, NoVLX_Or_NoBWI>; defm PMULHUW : PDI_binop_all<0xE4, "pmulhuw", mulhu, v8i16, v16i16, @@ -4022,184 +4035,141 @@ defm PAVGB : PDI_binop_all<0xE0, "pavgb", X86avg, v16i8, v32i8, defm PAVGW : PDI_binop_all<0xE3, "pavgw", X86avg, v8i16, v16i16, SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>; -// Intrinsic forms -defm PMADDWD : PDI_binop_all_int<0xF5, "pmaddwd", int_x86_sse2_pmadd_wd, - int_x86_avx2_pmadd_wd, SSE_PMADD, 1>; +let Predicates = [HasAVX, NoVLX_Or_NoBWI] in +defm VPMADDWD : PDI_binop_rm2<0xF5, "vpmaddwd", X86vpmaddwd, v4i32, v8i16, VR128, + loadv2i64, i128mem, SSE_PMADD, 0>, VEX_4V; + +let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in +defm VPMADDWDY : PDI_binop_rm2<0xF5, "vpmaddwd", X86vpmaddwd, v8i32, v16i16, + VR256, loadv4i64, i256mem, SSE_PMADD, + 0>, VEX_4V, VEX_L; +let Constraints = "$src1 = $dst" in +defm PMADDWD : PDI_binop_rm2<0xF5, "pmaddwd", X86vpmaddwd, v4i32, v8i16, VR128, + memopv2i64, i128mem, SSE_PMADD>; let Predicates = [HasAVX, NoVLX_Or_NoBWI] in defm VPSADBW : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v2i64, v16i8, VR128, - loadv2i64, i128mem, SSE_INTMUL_ITINS_P, 1, 0>, + loadv2i64, i128mem, SSE_INTMUL_ITINS_P, 0>, VEX_4V; let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in defm VPSADBWY : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v4i64, v32i8, VR256, - loadv4i64, i256mem, SSE_INTMUL_ITINS_P, 1, 0>, + loadv4i64, i256mem, SSE_INTMUL_ITINS_P, 0>, VEX_4V, VEX_L; let Constraints = "$src1 = $dst" in defm PSADBW : PDI_binop_rm2<0xF6, "psadbw", X86psadbw, v2i64, v16i8, VR128, - memopv2i64, i128mem, SSE_INTALU_ITINS_P, 1>; + memopv2i64, i128mem, SSE_INTALU_ITINS_P>; let Predicates = [HasAVX, NoVLX] in defm VPMULUDQ : PDI_binop_rm2<0xF4, "vpmuludq", X86pmuludq, v2i64, v4i32, VR128, - loadv2i64, i128mem, SSE_INTMUL_ITINS_P, 1, 0>, + loadv2i64, i128mem, SSE_INTMUL_ITINS_P, 0>, VEX_4V; let Predicates = [HasAVX2, NoVLX] in defm VPMULUDQY : PDI_binop_rm2<0xF4, "vpmuludq", X86pmuludq, v4i64, v8i32, VR256, loadv4i64, i256mem, - SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V, VEX_L; + SSE_INTMUL_ITINS_P, 0>, VEX_4V, VEX_L; let Constraints = "$src1 = $dst" in defm PMULUDQ : PDI_binop_rm2<0xF4, "pmuludq", X86pmuludq, v2i64, v4i32, VR128, - memopv2i64, i128mem, SSE_INTMUL_ITINS_P, 1>; + memopv2i64, i128mem, SSE_INTMUL_ITINS_P>; //===---------------------------------------------------------------------===// // SSE2 - Packed Integer Logical Instructions //===---------------------------------------------------------------------===// -let Predicates = [HasAVX, NoVLX] in { -defm VPSLLD : PDI_binop_rmi<0xF2, 0x72, MRM6r, "vpslld", X86vshl, X86vshli, - VR128, v4i32, v4i32, loadv2i64, - SSE_INTSHIFT_ITINS_P, 0>, VEX_4V; -defm VPSLLQ : PDI_binop_rmi<0xF3, 0x73, MRM6r, "vpsllq", X86vshl, X86vshli, - VR128, v2i64, v2i64, loadv2i64, - SSE_INTSHIFT_ITINS_P, 0>, VEX_4V; - -defm VPSRLD : PDI_binop_rmi<0xD2, 0x72, 
MRM2r, "vpsrld", X86vsrl, X86vsrli, - VR128, v4i32, v4i32, loadv2i64, - SSE_INTSHIFT_ITINS_P, 0>, VEX_4V; -defm VPSRLQ : PDI_binop_rmi<0xD3, 0x73, MRM2r, "vpsrlq", X86vsrl, X86vsrli, - VR128, v2i64, v2i64, loadv2i64, - SSE_INTSHIFT_ITINS_P, 0>, VEX_4V; - -defm VPSRAD : PDI_binop_rmi<0xE2, 0x72, MRM4r, "vpsrad", X86vsra, X86vsrai, - VR128, v4i32, v4i32, loadv2i64, - SSE_INTSHIFT_ITINS_P, 0>, VEX_4V; -} // Predicates = [HasAVX, NoVLX] +multiclass PDI_binop_rmi<bits<8> opc, bits<8> opc2, Format ImmForm, + string OpcodeStr, SDNode OpNode, + SDNode OpNode2, RegisterClass RC, + ValueType DstVT, ValueType SrcVT, + PatFrag ld_frag, bit Is2Addr = 1> { + // src2 is always 128-bit + def rr : PDI<opc, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, VR128:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (DstVT (OpNode RC:$src1, (SrcVT VR128:$src2))))], + SSE_INTSHIFT_ITINS_P.rr>, Sched<[WriteVecShift]>; + def rm : PDI<opc, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, i128mem:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (DstVT (OpNode RC:$src1, + (SrcVT (bitconvert (ld_frag addr:$src2))))))], + SSE_INTSHIFT_ITINS_P.rm>, Sched<[WriteVecShiftLd, ReadAfterLd]>; + def ri : PDIi8<opc2, ImmForm, (outs RC:$dst), + (ins RC:$src1, u8imm:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (DstVT (OpNode2 RC:$src1, (i8 imm:$src2))))], + SSE_INTSHIFT_ITINS_P.ri>, Sched<[WriteVecShift]>; +} -let Predicates = [HasAVX, NoVLX_Or_NoBWI] in { -defm VPSLLW : PDI_binop_rmi<0xF1, 0x71, MRM6r, "vpsllw", X86vshl, X86vshli, - VR128, v8i16, v8i16, loadv2i64, - SSE_INTSHIFT_ITINS_P, 0>, VEX_4V; -defm VPSRLW : PDI_binop_rmi<0xD1, 0x71, MRM2r, "vpsrlw", X86vsrl, X86vsrli, - VR128, v8i16, v8i16, loadv2i64, - SSE_INTSHIFT_ITINS_P, 0>, VEX_4V; -defm VPSRAW : PDI_binop_rmi<0xE1, 0x71, MRM4r, "vpsraw", X86vsra, X86vsrai, - VR128, v8i16, v8i16, loadv2i64, - SSE_INTSHIFT_ITINS_P, 0>, VEX_4V; -} // Predicates = [HasAVX, NoVLX_Or_NoBWI] - - -let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift] , - Predicates = [HasAVX, NoVLX_Or_NoBWI]in { - // 128-bit logical shifts. - def VPSLLDQri : PDIi8<0x73, MRM7r, - (outs VR128:$dst), (ins VR128:$src1, u8imm:$src2), - "vpslldq\t{$src2, $src1, $dst|$dst, $src1, $src2}", - [(set VR128:$dst, - (v16i8 (X86vshldq VR128:$src1, (i8 imm:$src2))))]>, - VEX_4V; - def VPSRLDQri : PDIi8<0x73, MRM3r, - (outs VR128:$dst), (ins VR128:$src1, u8imm:$src2), - "vpsrldq\t{$src2, $src1, $dst|$dst, $src1, $src2}", - [(set VR128:$dst, - (v16i8 (X86vshrdq VR128:$src1, (i8 imm:$src2))))]>, - VEX_4V; - // PSRADQri doesn't exist in SSE[1-3]. 
-} // Predicates = [HasAVX, NoVLX_Or_NoBWI] +multiclass PDI_binop_rmi_all<bits<8> opc, bits<8> opc2, Format ImmForm, + string OpcodeStr, SDNode OpNode, + SDNode OpNode2, ValueType DstVT128, + ValueType DstVT256, ValueType SrcVT, + Predicate prd> { +let Predicates = [HasAVX, prd] in + defm V#NAME : PDI_binop_rmi<opc, opc2, ImmForm, !strconcat("v", OpcodeStr), + OpNode, OpNode2, VR128, DstVT128, SrcVT, + loadv2i64, 0>, VEX_4V; +let Predicates = [HasAVX2, prd] in + defm V#NAME#Y : PDI_binop_rmi<opc, opc2, ImmForm, !strconcat("v", OpcodeStr), + OpNode, OpNode2, VR256, DstVT256, SrcVT, + loadv2i64, 0>, VEX_4V, VEX_L; +let Constraints = "$src1 = $dst" in + defm NAME : PDI_binop_rmi<opc, opc2, ImmForm, OpcodeStr, OpNode, OpNode2, + VR128, DstVT128, SrcVT, memopv2i64>; +} -let Predicates = [HasAVX2, NoVLX] in { -defm VPSLLDY : PDI_binop_rmi<0xF2, 0x72, MRM6r, "vpslld", X86vshl, X86vshli, - VR256, v8i32, v4i32, loadv2i64, - SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L; -defm VPSLLQY : PDI_binop_rmi<0xF3, 0x73, MRM6r, "vpsllq", X86vshl, X86vshli, - VR256, v4i64, v2i64, loadv2i64, - SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L; - -defm VPSRLDY : PDI_binop_rmi<0xD2, 0x72, MRM2r, "vpsrld", X86vsrl, X86vsrli, - VR256, v8i32, v4i32, loadv2i64, - SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L; -defm VPSRLQY : PDI_binop_rmi<0xD3, 0x73, MRM2r, "vpsrlq", X86vsrl, X86vsrli, - VR256, v4i64, v2i64, loadv2i64, - SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L; - -defm VPSRADY : PDI_binop_rmi<0xE2, 0x72, MRM4r, "vpsrad", X86vsra, X86vsrai, - VR256, v8i32, v4i32, loadv2i64, - SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L; -}// Predicates = [HasAVX2, NoVLX] +multiclass PDI_binop_ri<bits<8> opc, Format ImmForm, string OpcodeStr, + SDNode OpNode, RegisterClass RC, ValueType VT, + bit Is2Addr = 1> { + def ri : PDIi8<opc, ImmForm, (outs RC:$dst), (ins RC:$src1, u8imm:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (VT (OpNode RC:$src1, (i8 imm:$src2))))], + IIC_SSE_INTSHDQ_P_RI>, Sched<[WriteVecShift]>; +} -let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { -defm VPSLLWY : PDI_binop_rmi<0xF1, 0x71, MRM6r, "vpsllw", X86vshl, X86vshli, - VR256, v16i16, v8i16, loadv2i64, - SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L; -defm VPSRLWY : PDI_binop_rmi<0xD1, 0x71, MRM2r, "vpsrlw", X86vsrl, X86vsrli, - VR256, v16i16, v8i16, loadv2i64, - SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L; -defm VPSRAWY : PDI_binop_rmi<0xE1, 0x71, MRM4r, "vpsraw", X86vsra, X86vsrai, - VR256, v16i16, v8i16, loadv2i64, - SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L; -}// Predicates = [HasAVX2, NoVLX_Or_NoBWI] - -let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift], hasSideEffects = 0 , - Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { - // 256-bit logical shifts. - def VPSLLDQYri : PDIi8<0x73, MRM7r, - (outs VR256:$dst), (ins VR256:$src1, u8imm:$src2), - "vpslldq\t{$src2, $src1, $dst|$dst, $src1, $src2}", - [(set VR256:$dst, - (v32i8 (X86vshldq VR256:$src1, (i8 imm:$src2))))]>, - VEX_4V, VEX_L; - def VPSRLDQYri : PDIi8<0x73, MRM3r, - (outs VR256:$dst), (ins VR256:$src1, u8imm:$src2), - "vpsrldq\t{$src2, $src1, $dst|$dst, $src1, $src2}", - [(set VR256:$dst, - (v32i8 (X86vshrdq VR256:$src1, (i8 imm:$src2))))]>, - VEX_4V, VEX_L; - // PSRADQYri doesn't exist in SSE[1-3]. 
-} // Predicates = [HasAVX2, NoVLX_Or_NoBWI] +multiclass PDI_binop_ri_all<bits<8> opc, Format ImmForm, string OpcodeStr, + SDNode OpNode> { +let Predicates = [HasAVX, NoVLX_Or_NoBWI] in + defm V#NAME : PDI_binop_ri<opc, ImmForm, !strconcat("v", OpcodeStr), OpNode, + VR128, v16i8, 0>, VEX_4V; +let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in + defm V#NAME#Y : PDI_binop_ri<opc, ImmForm, !strconcat("v", OpcodeStr), OpNode, + VR256, v32i8, 0>, VEX_4V, VEX_L; +let Constraints = "$src1 = $dst" in + defm NAME : PDI_binop_ri<opc, ImmForm, OpcodeStr, OpNode, VR128, v16i8>; +} -let Constraints = "$src1 = $dst" in { -defm PSLLW : PDI_binop_rmi<0xF1, 0x71, MRM6r, "psllw", X86vshl, X86vshli, - VR128, v8i16, v8i16, memopv2i64, - SSE_INTSHIFT_ITINS_P>; -defm PSLLD : PDI_binop_rmi<0xF2, 0x72, MRM6r, "pslld", X86vshl, X86vshli, - VR128, v4i32, v4i32, memopv2i64, - SSE_INTSHIFT_ITINS_P>; -defm PSLLQ : PDI_binop_rmi<0xF3, 0x73, MRM6r, "psllq", X86vshl, X86vshli, - VR128, v2i64, v2i64, memopv2i64, - SSE_INTSHIFT_ITINS_P>; - -defm PSRLW : PDI_binop_rmi<0xD1, 0x71, MRM2r, "psrlw", X86vsrl, X86vsrli, - VR128, v8i16, v8i16, memopv2i64, - SSE_INTSHIFT_ITINS_P>; -defm PSRLD : PDI_binop_rmi<0xD2, 0x72, MRM2r, "psrld", X86vsrl, X86vsrli, - VR128, v4i32, v4i32, memopv2i64, - SSE_INTSHIFT_ITINS_P>; -defm PSRLQ : PDI_binop_rmi<0xD3, 0x73, MRM2r, "psrlq", X86vsrl, X86vsrli, - VR128, v2i64, v2i64, memopv2i64, - SSE_INTSHIFT_ITINS_P>; - -defm PSRAW : PDI_binop_rmi<0xE1, 0x71, MRM4r, "psraw", X86vsra, X86vsrai, - VR128, v8i16, v8i16, memopv2i64, - SSE_INTSHIFT_ITINS_P>; -defm PSRAD : PDI_binop_rmi<0xE2, 0x72, MRM4r, "psrad", X86vsra, X86vsrai, - VR128, v4i32, v4i32, memopv2i64, - SSE_INTSHIFT_ITINS_P>; - -let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift], hasSideEffects = 0 in { - // 128-bit logical shifts. - def PSLLDQri : PDIi8<0x73, MRM7r, - (outs VR128:$dst), (ins VR128:$src1, u8imm:$src2), - "pslldq\t{$src2, $dst|$dst, $src2}", - [(set VR128:$dst, - (v16i8 (X86vshldq VR128:$src1, (i8 imm:$src2))))], - IIC_SSE_INTSHDQ_P_RI>; - def PSRLDQri : PDIi8<0x73, MRM3r, - (outs VR128:$dst), (ins VR128:$src1, u8imm:$src2), - "psrldq\t{$src2, $dst|$dst, $src2}", - [(set VR128:$dst, - (v16i8 (X86vshrdq VR128:$src1, (i8 imm:$src2))))], - IIC_SSE_INTSHDQ_P_RI>; +let ExeDomain = SSEPackedInt in { + defm PSLLW : PDI_binop_rmi_all<0xF1, 0x71, MRM6r, "psllw", X86vshl, X86vshli, + v8i16, v16i16, v8i16, NoVLX_Or_NoBWI>; + defm PSLLD : PDI_binop_rmi_all<0xF2, 0x72, MRM6r, "pslld", X86vshl, X86vshli, + v4i32, v8i32, v4i32, NoVLX>; + defm PSLLQ : PDI_binop_rmi_all<0xF3, 0x73, MRM6r, "psllq", X86vshl, X86vshli, + v2i64, v4i64, v2i64, NoVLX>; + + defm PSRLW : PDI_binop_rmi_all<0xD1, 0x71, MRM2r, "psrlw", X86vsrl, X86vsrli, + v8i16, v16i16, v8i16, NoVLX_Or_NoBWI>; + defm PSRLD : PDI_binop_rmi_all<0xD2, 0x72, MRM2r, "psrld", X86vsrl, X86vsrli, + v4i32, v8i32, v4i32, NoVLX>; + defm PSRLQ : PDI_binop_rmi_all<0xD3, 0x73, MRM2r, "psrlq", X86vsrl, X86vsrli, + v2i64, v4i64, v2i64, NoVLX>; + + defm PSRAW : PDI_binop_rmi_all<0xE1, 0x71, MRM4r, "psraw", X86vsra, X86vsrai, + v8i16, v16i16, v8i16, NoVLX_Or_NoBWI>; + defm PSRAD : PDI_binop_rmi_all<0xE2, 0x72, MRM4r, "psrad", X86vsra, X86vsrai, + v4i32, v8i32, v4i32, NoVLX>; + + defm PSLLDQ : PDI_binop_ri_all<0x73, MRM7r, "pslldq", X86vshldq>; + defm PSRLDQ : PDI_binop_ri_all<0x73, MRM3r, "psrldq", X86vshrdq>; // PSRADQri doesn't exist in SSE[1-3]. 
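// Note (added for illustration; not part of the patch): the *_all
// multiclasses above consolidate the SSE/AVX/AVX2 shift definitions that
// were previously written out by hand, while keeping the existing
// instruction names. For instance, the single
//   defm PSLLW : PDI_binop_rmi_all<0xF1, 0x71, MRM6r, "psllw", X86vshl,
//                                  X86vshli, v8i16, v16i16, v8i16,
//                                  NoVLX_Or_NoBWI>;
// still produces PSLLW{rr,rm,ri}, VPSLLW{rr,rm,ri} and VPSLLWY{rr,rm,ri},
// under the same HasAVX/HasAVX2 plus NoVLX-style predicates as before.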
-} -} // Constraints = "$src1 = $dst" +} // ExeDomain = SSEPackedInt //===---------------------------------------------------------------------===// // SSE2 - Packed Integer Comparison Instructions @@ -4651,6 +4621,7 @@ def MASKMOVDQU64 : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask), //===---------------------------------------------------------------------===// // Move Int Doubleword to Packed Double Int // +let ExeDomain = SSEPackedInt in { def VMOVDI2PDIrr : VS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src), "movd\t{$src, $dst|$dst, $src}", [(set VR128:$dst, @@ -4701,11 +4672,12 @@ def MOV64toSDrr : RS2I<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src), "mov{d|q}\t{$src, $dst|$dst, $src}", [(set FR64:$dst, (bitconvert GR64:$src))], IIC_SSE_MOVDQ>, Sched<[WriteMove]>; +} // ExeDomain = SSEPackedInt //===---------------------------------------------------------------------===// // Move Int Doubleword to Single Scalar // -let isCodeGenOnly = 1 in { +let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in { def VMOVDI2SSrr : VS2I<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src), "movd\t{$src, $dst|$dst, $src}", [(set FR32:$dst, (bitconvert GR32:$src))], @@ -4725,11 +4697,12 @@ let isCodeGenOnly = 1 in { "movd\t{$src, $dst|$dst, $src}", [(set FR32:$dst, (bitconvert (loadi32 addr:$src)))], IIC_SSE_MOVDQ>, Sched<[WriteLoad]>; -} +} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1 //===---------------------------------------------------------------------===// // Move Packed Doubleword Int to Packed Double Int // +let ExeDomain = SSEPackedInt in { def VMOVPDI2DIrr : VS2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src), "movd\t{$src, $dst|$dst, $src}", [(set GR32:$dst, (extractelt (v4i32 VR128:$src), @@ -4751,6 +4724,7 @@ def MOVPDI2DImr : S2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR128:$src), [(store (i32 (extractelt (v4i32 VR128:$src), (iPTR 0))), addr:$dst)], IIC_SSE_MOVDQ>, Sched<[WriteStore]>; +} // ExeDomain = SSEPackedInt def : Pat<(v8i32 (X86Vinsert (v8i32 immAllZerosV), GR32:$src2, (iPTR 0))), (SUBREG_TO_REG (i32 0), (VMOVDI2PDIrr GR32:$src2), sub_xmm)>; @@ -4767,6 +4741,7 @@ def : Pat<(v4i64 (X86Vinsert undef, GR64:$src2, (iPTR 0))), //===---------------------------------------------------------------------===// // Move Packed Doubleword Int first element to Doubleword Int // +let ExeDomain = SSEPackedInt in { let SchedRW = [WriteMove] in { def VMOVPQIto64rr : VRS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src), "movq\t{$src, $dst|$dst, $src}", @@ -4791,11 +4766,12 @@ let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in def MOVPQIto64rm : RS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src), "mov{d|q}\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVDQ>, Sched<[WriteStore]>; +} // ExeDomain = SSEPackedInt //===---------------------------------------------------------------------===// // Bitcast FR64 <-> GR64 // -let isCodeGenOnly = 1 in { +let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in { let Predicates = [UseAVX] in def VMOV64toSDrm : VS2SI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src), "movq\t{$src, $dst|$dst, $src}", @@ -4822,12 +4798,12 @@ let isCodeGenOnly = 1 in { "movq\t{$src, $dst|$dst, $src}", [(store (i64 (bitconvert FR64:$src)), addr:$dst)], IIC_SSE_MOVDQ>, Sched<[WriteStore]>; -} +} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1 //===---------------------------------------------------------------------===// // Move Scalar Single to Double Int // -let isCodeGenOnly = 1 in { +let ExeDomain = 
SSEPackedInt, isCodeGenOnly = 1 in { def VMOVSS2DIrr : VS2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src), "movd\t{$src, $dst|$dst, $src}", [(set GR32:$dst, (bitconvert FR32:$src))], @@ -4844,7 +4820,7 @@ let isCodeGenOnly = 1 in { "movd\t{$src, $dst|$dst, $src}", [(store (i32 (bitconvert FR32:$src)), addr:$dst)], IIC_SSE_MOVDQ>, Sched<[WriteStore]>; -} +} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1 let Predicates = [UseAVX] in { let AddedComplexity = 15 in { @@ -4867,9 +4843,13 @@ let Predicates = [UseAVX] in { (VMOVDI2PDIrm addr:$src)>; def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))), (VMOVDI2PDIrm addr:$src)>; + def : Pat<(v4i32 (X86vzload addr:$src)), + (VMOVDI2PDIrm addr:$src)>; def : Pat<(v8i32 (X86vzmovl (insert_subvector undef, (v4i32 (scalar_to_vector (loadi32 addr:$src))), (iPTR 0)))), (SUBREG_TO_REG (i32 0), (VMOVDI2PDIrm addr:$src), sub_xmm)>; + def : Pat<(v8i32 (X86vzload addr:$src)), + (SUBREG_TO_REG (i64 0), (VMOVDI2PDIrm addr:$src), sub_xmm)>; } // Use regular 128-bit instructions to match 256-bit scalar_to_vec+zext. def : Pat<(v8i32 (X86vzmovl (insert_subvector undef, @@ -4892,6 +4872,8 @@ let Predicates = [UseSSE2] in { (MOVDI2PDIrm addr:$src)>; def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))), (MOVDI2PDIrm addr:$src)>; + def : Pat<(v4i32 (X86vzload addr:$src)), + (MOVDI2PDIrm addr:$src)>; } } @@ -4960,43 +4942,30 @@ def MOVPQI2QIrr : S2I<0xD6, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), def : InstAlias<"vmovq\t{$src, $dst|$dst, $src}", (VMOVPQI2QIrr VR128L:$dst, VR128H:$src), 0>; -//===---------------------------------------------------------------------===// -// Store / copy lower 64-bits of a XMM register. -// -let ExeDomain = SSEPackedInt, isCodeGenOnly = 1, AddedComplexity = 20 in { -def VMOVZQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), - "vmovq\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, - (v2i64 (X86vzmovl (v2i64 (scalar_to_vector - (loadi64 addr:$src))))))], - IIC_SSE_MOVDQ>, - XS, VEX, Requires<[UseAVX]>, Sched<[WriteLoad]>; - -def MOVZQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), - "movq\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, - (v2i64 (X86vzmovl (v2i64 (scalar_to_vector - (loadi64 addr:$src))))))], - IIC_SSE_MOVDQ>, - XS, Requires<[UseSSE2]>, Sched<[WriteLoad]>; -} // ExeDomain, isCodeGenOnly, AddedComplexity - let Predicates = [UseAVX], AddedComplexity = 20 in { + def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector (loadi64 addr:$src))))), + (VMOVQI2PQIrm addr:$src)>; + def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))), + (VMOVQI2PQIrm addr:$src)>; def : Pat<(v2i64 (X86vzmovl (bc_v2i64 (loadv4f32 addr:$src)))), - (VMOVZQI2PQIrm addr:$src)>; + (VMOVQI2PQIrm addr:$src)>; def : Pat<(v2i64 (X86vzload addr:$src)), - (VMOVZQI2PQIrm addr:$src)>; + (VMOVQI2PQIrm addr:$src)>; def : Pat<(v4i64 (X86vzmovl (insert_subvector undef, (v2i64 (scalar_to_vector (loadi64 addr:$src))), (iPTR 0)))), - (SUBREG_TO_REG (i64 0), (VMOVZQI2PQIrm addr:$src), sub_xmm)>; + (SUBREG_TO_REG (i64 0), (VMOVQI2PQIrm addr:$src), sub_xmm)>; def : Pat<(v4i64 (X86vzload addr:$src)), - (SUBREG_TO_REG (i64 0), (VMOVZQI2PQIrm addr:$src), sub_xmm)>; + (SUBREG_TO_REG (i64 0), (VMOVQI2PQIrm addr:$src), sub_xmm)>; } let Predicates = [UseSSE2], AddedComplexity = 20 in { + def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector (loadi64 addr:$src))))), + (MOVQI2PQIrm addr:$src)>; + def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))), + (MOVQI2PQIrm addr:$src)>; def : Pat<(v2i64 (X86vzmovl (bc_v2i64 (loadv4f32 
addr:$src)))), - (MOVZQI2PQIrm addr:$src)>; - def : Pat<(v2i64 (X86vzload addr:$src)), (MOVZQI2PQIrm addr:$src)>; + (MOVQI2PQIrm addr:$src)>; + def : Pat<(v2i64 (X86vzload addr:$src)), (MOVQI2PQIrm addr:$src)>; } //===---------------------------------------------------------------------===// @@ -5018,24 +4987,6 @@ def MOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), XS, Requires<[UseSSE2]>; } // ExeDomain, SchedRW -let ExeDomain = SSEPackedInt, isCodeGenOnly = 1, SchedRW = [WriteVecLogicLd] in { -let AddedComplexity = 20 in -def VMOVZPQILo2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), - "vmovq\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (v2i64 (X86vzmovl - (loadv2i64 addr:$src))))], - IIC_SSE_MOVDQ>, - XS, VEX, Requires<[UseAVX]>; -let AddedComplexity = 20 in { -def MOVZPQILo2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), - "movq\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (v2i64 (X86vzmovl - (loadv2i64 addr:$src))))], - IIC_SSE_MOVDQ>, - XS, Requires<[UseSSE2]>; -} -} // ExeDomain, isCodeGenOnly, SchedRW - let AddedComplexity = 20 in { let Predicates = [UseAVX] in { def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))), @@ -5167,12 +5118,12 @@ let Predicates = [HasAVX] in { (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>; } -let Predicates = [UseAVX, OptForSize] in { - def : Pat<(v2f64 (X86VBroadcast (loadf64 addr:$src))), - (VMOVDDUPrm addr:$src)>; - def : Pat<(v2i64 (X86VBroadcast (loadi64 addr:$src))), - (VMOVDDUPrm addr:$src)>; -} +let Predicates = [HasAVX, NoVLX] in +def : Pat<(v2f64 (X86VBroadcast (loadf64 addr:$src))), + (VMOVDDUPrm addr:$src)>; +let Predicates = [HasAVX1Only] in +def : Pat<(v2i64 (X86VBroadcast (loadi64 addr:$src))), + (VMOVDDUPrm addr:$src)>; let Predicates = [UseSSE3] in { def : Pat<(X86Movddup (memopv2f64 addr:$src)), @@ -5370,35 +5321,35 @@ let Constraints = "$src1 = $dst" in { /// SS3I_unop_rm_int - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}. multiclass SS3I_unop_rm<bits<8> opc, string OpcodeStr, ValueType vt, SDNode OpNode, PatFrag ld_frag> { - def rr128 : SS38I<opc, MRMSrcReg, (outs VR128:$dst), - (ins VR128:$src), - !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set VR128:$dst, (vt (OpNode VR128:$src)))], - IIC_SSE_PABS_RR>, Sched<[WriteVecALU]>; + def rr : SS38I<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (vt (OpNode VR128:$src)))], + IIC_SSE_PABS_RR>, Sched<[WriteVecALU]>; - def rm128 : SS38I<opc, MRMSrcMem, (outs VR128:$dst), - (ins i128mem:$src), - !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set VR128:$dst, - (vt (OpNode (bitconvert (ld_frag addr:$src)))))], - IIC_SSE_PABS_RM>, Sched<[WriteVecALULd]>; + def rm : SS38I<opc, MRMSrcMem, (outs VR128:$dst), + (ins i128mem:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, + (vt (OpNode (bitconvert (ld_frag addr:$src)))))], + IIC_SSE_PABS_RM>, Sched<[WriteVecALULd]>; } /// SS3I_unop_rm_int_y - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}. 
multiclass SS3I_unop_rm_y<bits<8> opc, string OpcodeStr, ValueType vt, SDNode OpNode> { - def rr256 : SS38I<opc, MRMSrcReg, (outs VR256:$dst), - (ins VR256:$src), - !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set VR256:$dst, (vt (OpNode VR256:$src)))]>, - Sched<[WriteVecALU]>; + def Yrr : SS38I<opc, MRMSrcReg, (outs VR256:$dst), + (ins VR256:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR256:$dst, (vt (OpNode VR256:$src)))]>, + Sched<[WriteVecALU]>; - def rm256 : SS38I<opc, MRMSrcMem, (outs VR256:$dst), - (ins i256mem:$src), - !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set VR256:$dst, - (vt (OpNode (bitconvert (loadv4i64 addr:$src)))))]>, - Sched<[WriteVecALULd]>; + def Yrm : SS38I<opc, MRMSrcMem, (outs VR256:$dst), + (ins i256mem:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR256:$dst, + (vt (OpNode (bitconvert (loadv4i64 addr:$src)))))]>, + Sched<[WriteVecALULd]>; } // Helper fragments to match sext vXi1 to vXiY. @@ -5419,19 +5370,21 @@ let Predicates = [HasAVX, NoVLX] in { defm VPABSD : SS3I_unop_rm<0x1E, "vpabsd", v4i32, X86Abs, loadv2i64>, VEX; } -let Predicates = [HasAVX] in { +let Predicates = [HasAVX, NoVLX_Or_NoBWI] in { def : Pat<(xor (bc_v2i64 (v16i1sextv16i8)), (bc_v2i64 (add (v16i8 VR128:$src), (v16i1sextv16i8)))), - (VPABSBrr128 VR128:$src)>; + (VPABSBrr VR128:$src)>; def : Pat<(xor (bc_v2i64 (v8i1sextv8i16)), (bc_v2i64 (add (v8i16 VR128:$src), (v8i1sextv8i16)))), - (VPABSWrr128 VR128:$src)>; + (VPABSWrr VR128:$src)>; +} +let Predicates = [HasAVX, NoVLX] in { def : Pat<(xor (bc_v2i64 (v4i1sextv4i32)), (bc_v2i64 (add (v4i32 VR128:$src), (v4i1sextv4i32)))), - (VPABSDrr128 VR128:$src)>; + (VPABSDrr VR128:$src)>; } let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { @@ -5442,19 +5395,21 @@ let Predicates = [HasAVX2, NoVLX] in { defm VPABSD : SS3I_unop_rm_y<0x1E, "vpabsd", v8i32, X86Abs>, VEX, VEX_L; } -let Predicates = [HasAVX2] in { +let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { def : Pat<(xor (bc_v4i64 (v32i1sextv32i8)), (bc_v4i64 (add (v32i8 VR256:$src), (v32i1sextv32i8)))), - (VPABSBrr256 VR256:$src)>; + (VPABSBYrr VR256:$src)>; def : Pat<(xor (bc_v4i64 (v16i1sextv16i16)), (bc_v4i64 (add (v16i16 VR256:$src), (v16i1sextv16i16)))), - (VPABSWrr256 VR256:$src)>; + (VPABSWYrr VR256:$src)>; +} +let Predicates = [HasAVX2, NoVLX] in { def : Pat<(xor (bc_v4i64 (v8i1sextv8i32)), (bc_v4i64 (add (v8i32 VR256:$src), (v8i1sextv8i32)))), - (VPABSDrr256 VR256:$src)>; + (VPABSDYrr VR256:$src)>; } defm PABSB : SS3I_unop_rm<0x1C, "pabsb", v16i8, X86Abs, memopv2i64>; @@ -5465,15 +5420,15 @@ let Predicates = [UseSSSE3] in { def : Pat<(xor (bc_v2i64 (v16i1sextv16i8)), (bc_v2i64 (add (v16i8 VR128:$src), (v16i1sextv16i8)))), - (PABSBrr128 VR128:$src)>; + (PABSBrr VR128:$src)>; def : Pat<(xor (bc_v2i64 (v8i1sextv8i16)), (bc_v2i64 (add (v8i16 VR128:$src), (v8i1sextv8i16)))), - (PABSWrr128 VR128:$src)>; + (PABSWrr VR128:$src)>; def : Pat<(xor (bc_v2i64 (v4i1sextv4i32)), (bc_v2i64 (add (v4i32 VR128:$src), (v4i1sextv4i32)))), - (PABSDrr128 VR128:$src)>; + (PABSDrr VR128:$src)>; } //===---------------------------------------------------------------------===// @@ -5506,16 +5461,16 @@ def SSE_PMULHRSW : OpndItins< /// SS3I_binop_rm - Simple SSSE3 bin op multiclass SS3I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, - ValueType OpVT, RegisterClass RC, PatFrag memop_frag, - X86MemOperand x86memop, OpndItins itins, - bit Is2Addr = 1> { + ValueType DstVT, ValueType OpVT, RegisterClass RC, + PatFrag memop_frag, X86MemOperand x86memop, 
+ OpndItins itins, bit Is2Addr = 1> { let isCommutable = 1 in def rr : SS38I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), !if(Is2Addr, !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), - [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))], itins.rr>, + [(set RC:$dst, (DstVT (OpNode (OpVT RC:$src1), RC:$src2)))], itins.rr>, Sched<[itins.Sched]>; def rm : SS38I<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2), @@ -5523,7 +5478,7 @@ multiclass SS3I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), [(set RC:$dst, - (OpVT (OpNode RC:$src1, + (DstVT (OpNode (OpVT RC:$src1), (bitconvert (memop_frag addr:$src2)))))], itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>; } @@ -5568,18 +5523,32 @@ multiclass SS3I_binop_rm_int_y<bits<8> opc, string OpcodeStr, Sched<[Sched.Folded, ReadAfterLd]>; } +let ImmT = NoImm, Predicates = [HasAVX, NoVLX_Or_NoBWI] in { +let isCommutable = 0 in { + defm VPSHUFB : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v16i8, v16i8, + VR128, loadv2i64, i128mem, + SSE_PSHUFB, 0>, VEX_4V; + defm VPMADDUBSW : SS3I_binop_rm<0x04, "vpmaddubsw", X86vpmaddubsw, v8i16, + v16i8, VR128, loadv2i64, i128mem, + SSE_PMADD, 0>, VEX_4V; +} +defm VPMULHRSW : SS3I_binop_rm<0x0B, "vpmulhrsw", X86mulhrs, v8i16, v8i16, + VR128, loadv2i64, i128mem, + SSE_PMULHRSW, 0>, VEX_4V; +} + let ImmT = NoImm, Predicates = [HasAVX] in { let isCommutable = 0 in { - defm VPHADDW : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v8i16, VR128, + defm VPHADDW : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v8i16, v8i16, VR128, loadv2i64, i128mem, SSE_PHADDSUBW, 0>, VEX_4V; - defm VPHADDD : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v4i32, VR128, + defm VPHADDD : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v4i32, v4i32, VR128, loadv2i64, i128mem, SSE_PHADDSUBD, 0>, VEX_4V; - defm VPHSUBW : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v8i16, VR128, + defm VPHSUBW : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v8i16, v8i16, VR128, loadv2i64, i128mem, SSE_PHADDSUBW, 0>, VEX_4V; - defm VPHSUBD : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v4i32, VR128, + defm VPHSUBD : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v4i32, v4i32, VR128, loadv2i64, i128mem, SSE_PHADDSUBD, 0>, VEX_4V; defm VPSIGNB : SS3I_binop_rm_int<0x08, "vpsignb", @@ -5591,36 +5560,41 @@ let isCommutable = 0 in { defm VPSIGND : SS3I_binop_rm_int<0x0A, "vpsignd", int_x86_ssse3_psign_d_128, SSE_PSIGN, loadv2i64, 0>, VEX_4V; - defm VPSHUFB : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v16i8, VR128, - loadv2i64, i128mem, - SSE_PSHUFB, 0>, VEX_4V; defm VPHADDSW : SS3I_binop_rm_int<0x03, "vphaddsw", int_x86_ssse3_phadd_sw_128, SSE_PHADDSUBSW, loadv2i64, 0>, VEX_4V; defm VPHSUBSW : SS3I_binop_rm_int<0x07, "vphsubsw", int_x86_ssse3_phsub_sw_128, SSE_PHADDSUBSW, loadv2i64, 0>, VEX_4V; - defm VPMADDUBSW : SS3I_binop_rm_int<0x04, "vpmaddubsw", - int_x86_ssse3_pmadd_ub_sw_128, - SSE_PMADD, loadv2i64, 0>, VEX_4V; } -defm VPMULHRSW : SS3I_binop_rm_int<0x0B, "vpmulhrsw", - int_x86_ssse3_pmul_hr_sw_128, - SSE_PMULHRSW, loadv2i64, 0>, VEX_4V; +} + +let ImmT = NoImm, Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { +let isCommutable = 0 in { + defm VPSHUFBY : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v32i8, v32i8, + VR256, loadv4i64, i256mem, + SSE_PSHUFB, 0>, VEX_4V, VEX_L; + defm VPMADDUBSWY : SS3I_binop_rm<0x04, "vpmaddubsw", X86vpmaddubsw, v16i16, + v32i8, VR256, loadv4i64, i256mem, + 
SSE_PMADD, 0>, VEX_4V, VEX_L; +} +defm VPMULHRSWY : SS3I_binop_rm<0x0B, "vpmulhrsw", X86mulhrs, v16i16, v16i16, + VR256, loadv4i64, i256mem, + SSE_PMULHRSW, 0>, VEX_4V, VEX_L; } let ImmT = NoImm, Predicates = [HasAVX2] in { let isCommutable = 0 in { - defm VPHADDWY : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v16i16, VR256, - loadv4i64, i256mem, + defm VPHADDWY : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v16i16, v16i16, + VR256, loadv4i64, i256mem, SSE_PHADDSUBW, 0>, VEX_4V, VEX_L; - defm VPHADDDY : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v8i32, VR256, + defm VPHADDDY : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v8i32, v8i32, VR256, loadv4i64, i256mem, SSE_PHADDSUBW, 0>, VEX_4V, VEX_L; - defm VPHSUBWY : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v16i16, VR256, - loadv4i64, i256mem, + defm VPHSUBWY : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v16i16, v16i16, + VR256, loadv4i64, i256mem, SSE_PHADDSUBW, 0>, VEX_4V, VEX_L; - defm VPHSUBDY : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v8i32, VR256, + defm VPHSUBDY : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v8i32, v8i32, VR256, loadv4i64, i256mem, SSE_PHADDSUBW, 0>, VEX_4V, VEX_L; defm VPSIGNBY : SS3I_binop_rm_int_y<0x08, "vpsignb", int_x86_avx2_psign_b, @@ -5629,34 +5603,25 @@ let isCommutable = 0 in { WriteVecALU>, VEX_4V, VEX_L; defm VPSIGNDY : SS3I_binop_rm_int_y<0x0A, "vpsignd", int_x86_avx2_psign_d, WriteVecALU>, VEX_4V, VEX_L; - defm VPSHUFBY : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v32i8, VR256, - loadv4i64, i256mem, - SSE_PSHUFB, 0>, VEX_4V, VEX_L; defm VPHADDSW : SS3I_binop_rm_int_y<0x03, "vphaddsw", int_x86_avx2_phadd_sw, WriteVecALU>, VEX_4V, VEX_L; defm VPHSUBSW : SS3I_binop_rm_int_y<0x07, "vphsubsw", int_x86_avx2_phsub_sw, WriteVecALU>, VEX_4V, VEX_L; - defm VPMADDUBSW : SS3I_binop_rm_int_y<0x04, "vpmaddubsw", - int_x86_avx2_pmadd_ub_sw, - WriteVecIMul>, VEX_4V, VEX_L; } -defm VPMULHRSW : SS3I_binop_rm_int_y<0x0B, "vpmulhrsw", - int_x86_avx2_pmul_hr_sw, - WriteVecIMul>, VEX_4V, VEX_L; } // None of these have i8 immediate fields. 
let ImmT = NoImm, Constraints = "$src1 = $dst" in { let isCommutable = 0 in { - defm PHADDW : SS3I_binop_rm<0x01, "phaddw", X86hadd, v8i16, VR128, + defm PHADDW : SS3I_binop_rm<0x01, "phaddw", X86hadd, v8i16, v8i16, VR128, memopv2i64, i128mem, SSE_PHADDSUBW>; - defm PHADDD : SS3I_binop_rm<0x02, "phaddd", X86hadd, v4i32, VR128, + defm PHADDD : SS3I_binop_rm<0x02, "phaddd", X86hadd, v4i32, v4i32, VR128, memopv2i64, i128mem, SSE_PHADDSUBD>; - defm PHSUBW : SS3I_binop_rm<0x05, "phsubw", X86hsub, v8i16, VR128, + defm PHSUBW : SS3I_binop_rm<0x05, "phsubw", X86hsub, v8i16, v8i16, VR128, memopv2i64, i128mem, SSE_PHADDSUBW>; - defm PHSUBD : SS3I_binop_rm<0x06, "phsubd", X86hsub, v4i32, VR128, + defm PHSUBD : SS3I_binop_rm<0x06, "phsubd", X86hsub, v4i32, v4i32, VR128, memopv2i64, i128mem, SSE_PHADDSUBD>; defm PSIGNB : SS3I_binop_rm_int<0x08, "psignb", int_x86_ssse3_psign_b_128, SSE_PSIGN, memopv2i64>; @@ -5664,7 +5629,7 @@ let isCommutable = 0 in { SSE_PSIGN, memopv2i64>; defm PSIGND : SS3I_binop_rm_int<0x0A, "psignd", int_x86_ssse3_psign_d_128, SSE_PSIGN, memopv2i64>; - defm PSHUFB : SS3I_binop_rm<0x00, "pshufb", X86pshufb, v16i8, VR128, + defm PSHUFB : SS3I_binop_rm<0x00, "pshufb", X86pshufb, v16i8, v16i8, VR128, memopv2i64, i128mem, SSE_PSHUFB>; defm PHADDSW : SS3I_binop_rm_int<0x03, "phaddsw", int_x86_ssse3_phadd_sw_128, @@ -5672,13 +5637,12 @@ let isCommutable = 0 in { defm PHSUBSW : SS3I_binop_rm_int<0x07, "phsubsw", int_x86_ssse3_phsub_sw_128, SSE_PHADDSUBSW, memopv2i64>; - defm PMADDUBSW : SS3I_binop_rm_int<0x04, "pmaddubsw", - int_x86_ssse3_pmadd_ub_sw_128, - SSE_PMADD, memopv2i64>; + defm PMADDUBSW : SS3I_binop_rm<0x04, "pmaddubsw", X86vpmaddubsw, v8i16, + v16i8, VR128, memopv2i64, i128mem, + SSE_PMADD>; } -defm PMULHRSW : SS3I_binop_rm_int<0x0B, "pmulhrsw", - int_x86_ssse3_pmul_hr_sw_128, - SSE_PMULHRSW, memopv2i64>; +defm PMULHRSW : SS3I_binop_rm<0x0B, "pmulhrsw", X86mulhrs, v8i16, v8i16, + VR128, memopv2i64, i128mem, SSE_PMULHRSW>; } //===---------------------------------------------------------------------===// @@ -5895,8 +5859,6 @@ multiclass SS41I_pmovx_avx2_patterns<string OpcPrefix, string ExtTy, SDNode ExtO (!cast<I>(OpcPrefix#BWYrm) addr:$src)>; def : Pat<(v16i16 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))), (!cast<I>(OpcPrefix#BWYrm) addr:$src)>; - def : Pat<(v16i16 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))), - (!cast<I>(OpcPrefix#BWYrm) addr:$src)>; } let Predicates = [HasAVX, NoVLX] in { def : Pat<(v8i32 (ExtOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), @@ -5923,8 +5885,6 @@ multiclass SS41I_pmovx_avx2_patterns<string OpcPrefix, string ExtTy, SDNode ExtO (!cast<I>(OpcPrefix#WDYrm) addr:$src)>; def : Pat<(v8i32 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))), (!cast<I>(OpcPrefix#WDYrm) addr:$src)>; - def : Pat<(v8i32 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))), - (!cast<I>(OpcPrefix#WDYrm) addr:$src)>; def : Pat<(v4i64 (ExtOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), (!cast<I>(OpcPrefix#WQYrm) addr:$src)>; @@ -5941,8 +5901,6 @@ multiclass SS41I_pmovx_avx2_patterns<string OpcPrefix, string ExtTy, SDNode ExtO (!cast<I>(OpcPrefix#DQYrm) addr:$src)>; def : Pat<(v4i64 (ExtOp (v4i32 (vzload_v2i64 addr:$src)))), (!cast<I>(OpcPrefix#DQYrm) addr:$src)>; - def : Pat<(v4i64 (ExtOp (bc_v4i32 (loadv2i64 addr:$src)))), - (!cast<I>(OpcPrefix#DQYrm) addr:$src)>; } } @@ -6342,10 +6300,10 @@ let Predicates = [UseAVX] in { // SSE4.1 - Round Instructions //===----------------------------------------------------------------------===// -multiclass sse41_fp_unop_rm<bits<8> 
opcps, bits<8> opcpd, string OpcodeStr, - X86MemOperand x86memop, RegisterClass RC, - PatFrag mem_frag32, PatFrag mem_frag64, - Intrinsic V4F32Int, Intrinsic V2F64Int> { +multiclass sse41_fp_unop_p<bits<8> opcps, bits<8> opcpd, string OpcodeStr, + X86MemOperand x86memop, RegisterClass RC, + PatFrag mem_frag32, PatFrag mem_frag64, + Intrinsic V4F32Int, Intrinsic V2F64Int> { let ExeDomain = SSEPackedSingle in { // Intrinsic operation, reg. // Vector intrinsic operation, reg @@ -6386,24 +6344,73 @@ let ExeDomain = SSEPackedDouble in { } // ExeDomain = SSEPackedDouble } -multiclass sse41_fp_binop_rm<bits<8> opcss, bits<8> opcsd, - string OpcodeStr, - Intrinsic F32Int, - Intrinsic F64Int, bit Is2Addr = 1> { -let ExeDomain = GenericDomain in { - // Operation, reg. - let hasSideEffects = 0 in +multiclass avx_fp_unop_rm<bits<8> opcss, bits<8> opcsd, + string OpcodeStr> { +let ExeDomain = GenericDomain, hasSideEffects = 0 in { def SSr : SS4AIi8<opcss, MRMSrcReg, - (outs FR32:$dst), (ins FR32:$src1, FR32:$src2, i32u8imm:$src3), - !if(Is2Addr, - !strconcat(OpcodeStr, - "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"), - !strconcat(OpcodeStr, - "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), + (outs FR32:$dst), (ins FR32:$src1, FR32:$src2, i32u8imm:$src3), + !strconcat(OpcodeStr, + "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), []>, Sched<[WriteFAdd]>; - // Intrinsic operation, reg. - let isCodeGenOnly = 1 in + let mayLoad = 1 in + def SSm : SS4AIi8<opcss, MRMSrcMem, + (outs FR32:$dst), (ins FR32:$src1, f32mem:$src2, i32u8imm:$src3), + !strconcat(OpcodeStr, + "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + []>, Sched<[WriteFAddLd, ReadAfterLd]>; + + def SDr : SS4AIi8<opcsd, MRMSrcReg, + (outs FR64:$dst), (ins FR64:$src1, FR64:$src2, i32u8imm:$src3), + !strconcat(OpcodeStr, + "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + []>, Sched<[WriteFAdd]>; + + let mayLoad = 1 in + def SDm : SS4AIi8<opcsd, MRMSrcMem, + (outs FR64:$dst), (ins FR64:$src1, f64mem:$src2, i32u8imm:$src3), + !strconcat(OpcodeStr, + "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + []>, Sched<[WriteFAddLd, ReadAfterLd]>; +} // ExeDomain = GenericDomain, hasSideEffects = 0 +} + +multiclass sse41_fp_unop_s<bits<8> opcss, bits<8> opcsd, + string OpcodeStr> { +let ExeDomain = GenericDomain, hasSideEffects = 0 in { + def SSr : SS4AIi8<opcss, MRMSrcReg, + (outs FR32:$dst), (ins FR32:$src1, i32u8imm:$src2), + !strconcat(OpcodeStr, + "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + []>, Sched<[WriteFAdd]>; + + let mayLoad = 1 in + def SSm : SS4AIi8<opcss, MRMSrcMem, + (outs FR32:$dst), (ins f32mem:$src1, i32u8imm:$src2), + !strconcat(OpcodeStr, + "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + []>, Sched<[WriteFAddLd, ReadAfterLd]>; + + def SDr : SS4AIi8<opcsd, MRMSrcReg, + (outs FR64:$dst), (ins FR64:$src1, i32u8imm:$src2), + !strconcat(OpcodeStr, + "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + []>, Sched<[WriteFAdd]>; + + let mayLoad = 1 in + def SDm : SS4AIi8<opcsd, MRMSrcMem, + (outs FR64:$dst), (ins f64mem:$src1, i32u8imm:$src2), + !strconcat(OpcodeStr, + "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + []>, Sched<[WriteFAddLd, ReadAfterLd]>; +} // ExeDomain = GenericDomain, hasSideEffects = 0 +} + +multiclass sse41_fp_binop_s<bits<8> opcss, bits<8> opcsd, + string OpcodeStr, + Intrinsic F32Int, + Intrinsic F64Int, bit Is2Addr = 1> { +let ExeDomain = GenericDomain, isCodeGenOnly = 1 in { def SSr_Int : SS4AIi8<opcss, MRMSrcReg, (outs VR128:$dst), (ins 
VR128:$src1, VR128:$src2, i32u8imm:$src3), !if(Is2Addr, @@ -6414,8 +6421,7 @@ let ExeDomain = GenericDomain in { [(set VR128:$dst, (F32Int VR128:$src1, VR128:$src2, imm:$src3))]>, Sched<[WriteFAdd]>; - // Intrinsic operation, mem. - def SSm : SS4AIi8<opcss, MRMSrcMem, + def SSm_Int : SS4AIi8<opcss, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2, i32u8imm:$src3), !if(Is2Addr, !strconcat(OpcodeStr, @@ -6426,19 +6432,6 @@ let ExeDomain = GenericDomain in { (F32Int VR128:$src1, sse_load_f32:$src2, imm:$src3))]>, Sched<[WriteFAddLd, ReadAfterLd]>; - // Operation, reg. - let hasSideEffects = 0 in - def SDr : SS4AIi8<opcsd, MRMSrcReg, - (outs FR64:$dst), (ins FR64:$src1, FR64:$src2, i32u8imm:$src3), - !if(Is2Addr, - !strconcat(OpcodeStr, - "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"), - !strconcat(OpcodeStr, - "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), - []>, Sched<[WriteFAdd]>; - - // Intrinsic operation, reg. - let isCodeGenOnly = 1 in def SDr_Int : SS4AIi8<opcsd, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32u8imm:$src3), !if(Is2Addr, @@ -6449,8 +6442,7 @@ let ExeDomain = GenericDomain in { [(set VR128:$dst, (F64Int VR128:$src1, VR128:$src2, imm:$src3))]>, Sched<[WriteFAdd]>; - // Intrinsic operation, mem. - def SDm : SS4AIi8<opcsd, MRMSrcMem, + def SDm_Int : SS4AIi8<opcsd, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2, i32u8imm:$src3), !if(Is2Addr, !strconcat(OpcodeStr, @@ -6460,23 +6452,24 @@ let ExeDomain = GenericDomain in { [(set VR128:$dst, (F64Int VR128:$src1, sse_load_f64:$src2, imm:$src3))]>, Sched<[WriteFAddLd, ReadAfterLd]>; -} // ExeDomain = GenericDomain +} // ExeDomain = GenericDomain, isCodeGenOnly = 1 } // FP round - roundss, roundps, roundsd, roundpd let Predicates = [HasAVX] in { // Intrinsic form - defm VROUND : sse41_fp_unop_rm<0x08, 0x09, "vround", f128mem, VR128, - loadv4f32, loadv2f64, - int_x86_sse41_round_ps, - int_x86_sse41_round_pd>, VEX; - defm VROUNDY : sse41_fp_unop_rm<0x08, 0x09, "vround", f256mem, VR256, - loadv8f32, loadv4f64, - int_x86_avx_round_ps_256, - int_x86_avx_round_pd_256>, VEX, VEX_L; - defm VROUND : sse41_fp_binop_rm<0x0A, 0x0B, "vround", - int_x86_sse41_round_ss, - int_x86_sse41_round_sd, 0>, VEX_4V, VEX_LIG; + defm VROUND : sse41_fp_unop_p<0x08, 0x09, "vround", f128mem, VR128, + loadv4f32, loadv2f64, + int_x86_sse41_round_ps, + int_x86_sse41_round_pd>, VEX; + defm VROUNDY : sse41_fp_unop_p<0x08, 0x09, "vround", f256mem, VR256, + loadv8f32, loadv4f64, + int_x86_avx_round_ps_256, + int_x86_avx_round_pd_256>, VEX, VEX_L; + defm VROUND : sse41_fp_binop_s<0x0A, 0x0B, "vround", + int_x86_sse41_round_ss, + int_x86_sse41_round_sd, 0>, VEX_4V, VEX_LIG; + defm VROUND : avx_fp_unop_rm<0x0A, 0x0B, "vround">, VEX_4V, VEX_LIG; } let Predicates = [UseAVX] in { @@ -6548,34 +6541,37 @@ let Predicates = [HasAVX] in { (VROUNDYPDr VR256:$src, (i32 0xB))>; } -defm ROUND : sse41_fp_unop_rm<0x08, 0x09, "round", f128mem, VR128, - memopv4f32, memopv2f64, - int_x86_sse41_round_ps, int_x86_sse41_round_pd>; +defm ROUND : sse41_fp_unop_p<0x08, 0x09, "round", f128mem, VR128, + memopv4f32, memopv2f64, int_x86_sse41_round_ps, + int_x86_sse41_round_pd>; + +defm ROUND : sse41_fp_unop_s<0x0A, 0x0B, "round">; + let Constraints = "$src1 = $dst" in -defm ROUND : sse41_fp_binop_rm<0x0A, 0x0B, "round", +defm ROUND : sse41_fp_binop_s<0x0A, 0x0B, "round", int_x86_sse41_round_ss, int_x86_sse41_round_sd>; let Predicates = [UseSSE41] in { def : Pat<(ffloor FR32:$src), - (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x9))>; + 
(ROUNDSSr FR32:$src, (i32 0x9))>; def : Pat<(f64 (ffloor FR64:$src)), - (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x9))>; + (ROUNDSDr FR64:$src, (i32 0x9))>; def : Pat<(f32 (fnearbyint FR32:$src)), - (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xC))>; + (ROUNDSSr FR32:$src, (i32 0xC))>; def : Pat<(f64 (fnearbyint FR64:$src)), - (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xC))>; + (ROUNDSDr FR64:$src, (i32 0xC))>; def : Pat<(f32 (fceil FR32:$src)), - (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xA))>; + (ROUNDSSr FR32:$src, (i32 0xA))>; def : Pat<(f64 (fceil FR64:$src)), - (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xA))>; + (ROUNDSDr FR64:$src, (i32 0xA))>; def : Pat<(f32 (frint FR32:$src)), - (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x4))>; + (ROUNDSSr FR32:$src, (i32 0x4))>; def : Pat<(f64 (frint FR64:$src)), - (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x4))>; + (ROUNDSDr FR64:$src, (i32 0x4))>; def : Pat<(f32 (ftrunc FR32:$src)), - (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xB))>; + (ROUNDSSr FR32:$src, (i32 0xB))>; def : Pat<(f64 (ftrunc FR64:$src)), - (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xB))>; + (ROUNDSDr FR64:$src, (i32 0xB))>; def : Pat<(v4f32 (ffloor VR128:$src)), (ROUNDPSr VR128:$src, (i32 0x9))>; @@ -6867,10 +6863,10 @@ let Constraints = "$src1 = $dst" in { let Predicates = [HasAVX, NoVLX] in { defm VPMULLD : SS48I_binop_rm<0x40, "vpmulld", mul, v4i32, VR128, - memopv2i64, i128mem, 0, SSE_PMULLD_ITINS>, + loadv2i64, i128mem, 0, SSE_PMULLD_ITINS>, VEX_4V; defm VPCMPEQQ : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v2i64, VR128, - memopv2i64, i128mem, 0, SSE_INTALU_ITINS_P>, + loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>, VEX_4V; } let Predicates = [HasAVX2] in { @@ -7029,22 +7025,22 @@ multiclass SS41I_quaternary_int_avx<bits<8> opc, string OpcodeStr, RegisterClass RC, X86MemOperand x86memop, PatFrag mem_frag, Intrinsic IntId, X86FoldableSchedWrite Sched> { - def rr : Ii8<opc, MRMSrcReg, (outs RC:$dst), + def rr : Ii8Reg<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2, RC:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set RC:$dst, (IntId RC:$src1, RC:$src2, RC:$src3))], - NoItinerary, SSEPackedInt>, TAPD, VEX_4V, VEX_I8IMM, + NoItinerary, SSEPackedInt>, TAPD, VEX_4V, Sched<[Sched]>; - def rm : Ii8<opc, MRMSrcMem, (outs RC:$dst), + def rm : Ii8Reg<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2, RC:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set RC:$dst, (IntId RC:$src1, (bitconvert (mem_frag addr:$src2)), RC:$src3))], - NoItinerary, SSEPackedInt>, TAPD, VEX_4V, VEX_I8IMM, + NoItinerary, SSEPackedInt>, TAPD, VEX_4V, Sched<[Sched.Folded, ReadAfterLd]>; } @@ -7139,17 +7135,6 @@ let Predicates = [UseAVX] in { (VBLENDPDYrri (v4f64 (AVX_SET0)), VR256:$src, (i8 1))>; } - def : Pat<(v8f32 (X86vzmovl (insert_subvector undef, - (v4f32 (scalar_to_vector FR32:$src)), (iPTR 0)))), - (SUBREG_TO_REG (i32 0), - (v4f32 (VMOVSSrr (v4f32 (V_SET0)), FR32:$src)), - sub_xmm)>; - def : Pat<(v4f64 (X86vzmovl (insert_subvector undef, - (v2f64 (scalar_to_vector FR64:$src)), (iPTR 0)))), - (SUBREG_TO_REG (i64 0), - (v2f64 (VMOVSDrr (v2f64 (V_SET0)), FR64:$src)), - sub_xmm)>; - // These will incur an FP/int domain crossing penalty, but it may be the only // way without AVX2. Do not add any complexity because we may be able to match // more optimal patterns defined earlier in this file. 
@@ -7744,6 +7729,7 @@ defm : pclmul_alias<"lqlq", 0x00>; let Predicates = [HasSSE4A] in { +let ExeDomain = SSEPackedInt in { let Constraints = "$src = $dst" in { def EXTRQI : Ii8<0x78, MRMXr, (outs VR128:$dst), (ins VR128:$src, u8imm:$len, u8imm:$idx), @@ -7767,6 +7753,7 @@ def INSERTQ : I<0x79, MRMSrcReg, (outs VR128:$dst), [(set VR128:$dst, (int_x86_sse4a_insertq VR128:$src, VR128:$mask))]>, XD; } +} // ExeDomain = SSEPackedInt // Non-temporal (unaligned) scalar stores. let AddedComplexity = 400 in { // Prefer non-temporal versions @@ -7832,23 +7819,50 @@ let ExeDomain = SSEPackedDouble, Predicates = [HasAVX2, NoVLX] in def VBROADCASTSDYrr : avx2_broadcast_rr<0x19, "vbroadcastsd", VR256, v4f64, v2f64, WriteFShuffle256>, VEX_L; +//===----------------------------------------------------------------------===// +// VBROADCAST*128 - Load from memory and broadcast 128-bit vector to both +// halves of a 256-bit vector. +// let mayLoad = 1, hasSideEffects = 0, Predicates = [HasAVX2] in def VBROADCASTI128 : AVX8I<0x5A, MRMSrcMem, (outs VR256:$dst), (ins i128mem:$src), "vbroadcasti128\t{$src, $dst|$dst, $src}", []>, Sched<[WriteLoad]>, VEX, VEX_L; +let mayLoad = 1, hasSideEffects = 0, Predicates = [HasAVX] in def VBROADCASTF128 : AVX8I<0x1A, MRMSrcMem, (outs VR256:$dst), (ins f128mem:$src), - "vbroadcastf128\t{$src, $dst|$dst, $src}", - [(set VR256:$dst, - (int_x86_avx_vbroadcastf128_pd_256 addr:$src))]>, + "vbroadcastf128\t{$src, $dst|$dst, $src}", []>, Sched<[WriteFShuffleLd]>, VEX, VEX_L; -let Predicates = [HasAVX] in -def : Pat<(int_x86_avx_vbroadcastf128_ps_256 addr:$src), +let Predicates = [HasAVX2, NoVLX] in { +def : Pat<(v4i64 (X86SubVBroadcast (loadv2i64 addr:$src))), + (VBROADCASTI128 addr:$src)>; +def : Pat<(v8i32 (X86SubVBroadcast (bc_v4i32 (loadv2i64 addr:$src)))), + (VBROADCASTI128 addr:$src)>; +def : Pat<(v16i16 (X86SubVBroadcast (bc_v8i16 (loadv2i64 addr:$src)))), + (VBROADCASTI128 addr:$src)>; +def : Pat<(v32i8 (X86SubVBroadcast (bc_v16i8 (loadv2i64 addr:$src)))), + (VBROADCASTI128 addr:$src)>; +} + +let Predicates = [HasAVX, NoVLX] in { +def : Pat<(v4f64 (X86SubVBroadcast (loadv2f64 addr:$src))), (VBROADCASTF128 addr:$src)>; +def : Pat<(v8f32 (X86SubVBroadcast (loadv4f32 addr:$src))), + (VBROADCASTF128 addr:$src)>; +} +let Predicates = [HasAVX1Only] in { +def : Pat<(v4i64 (X86SubVBroadcast (loadv2i64 addr:$src))), + (VBROADCASTF128 addr:$src)>; +def : Pat<(v8i32 (X86SubVBroadcast (bc_v4i32 (loadv2i64 addr:$src)))), + (VBROADCASTF128 addr:$src)>; +def : Pat<(v16i16 (X86SubVBroadcast (bc_v8i16 (loadv2i64 addr:$src)))), + (VBROADCASTF128 addr:$src)>; +def : Pat<(v32i8 (X86SubVBroadcast (bc_v16i8 (loadv2i64 addr:$src)))), + (VBROADCASTF128 addr:$src)>; +} //===----------------------------------------------------------------------===// // VINSERTF128 - Insert packed floating-point values @@ -7865,63 +7879,29 @@ def VINSERTF128rm : AVXAIi8<0x18, MRMSrcMem, (outs VR256:$dst), []>, Sched<[WriteFShuffleLd, ReadAfterLd]>, VEX_4V, VEX_L; } -let Predicates = [HasAVX, NoVLX] in { -def : Pat<(vinsert128_insert:$ins (v8f32 VR256:$src1), (v4f32 VR128:$src2), - (iPTR imm)), - (VINSERTF128rr VR256:$src1, VR128:$src2, - (INSERT_get_vinsert128_imm VR256:$ins))>; -def : Pat<(vinsert128_insert:$ins (v4f64 VR256:$src1), (v2f64 VR128:$src2), +multiclass vinsert_lowering<string InstrStr, ValueType From, ValueType To, + PatFrag memop_frag> { + def : Pat<(vinsert128_insert:$ins (To VR256:$src1), (From VR128:$src2), (iPTR imm)), - (VINSERTF128rr VR256:$src1, VR128:$src2, - (INSERT_get_vinsert128_imm 
VR256:$ins))>; + (!cast<Instruction>(InstrStr#rr) VR256:$src1, VR128:$src2, + (INSERT_get_vinsert128_imm VR256:$ins))>; + def : Pat<(vinsert128_insert:$ins (To VR256:$src1), + (From (bitconvert (memop_frag addr:$src2))), + (iPTR imm)), + (!cast<Instruction>(InstrStr#rm) VR256:$src1, addr:$src2, + (INSERT_get_vinsert128_imm VR256:$ins))>; +} -def : Pat<(vinsert128_insert:$ins (v8f32 VR256:$src1), (loadv4f32 addr:$src2), - (iPTR imm)), - (VINSERTF128rm VR256:$src1, addr:$src2, - (INSERT_get_vinsert128_imm VR256:$ins))>; -def : Pat<(vinsert128_insert:$ins (v4f64 VR256:$src1), (loadv2f64 addr:$src2), - (iPTR imm)), - (VINSERTF128rm VR256:$src1, addr:$src2, - (INSERT_get_vinsert128_imm VR256:$ins))>; +let Predicates = [HasAVX, NoVLX] in { + defm : vinsert_lowering<"VINSERTF128", v4f32, v8f32, loadv4f32>; + defm : vinsert_lowering<"VINSERTF128", v2f64, v4f64, loadv2f64>; } let Predicates = [HasAVX1Only] in { -def : Pat<(vinsert128_insert:$ins (v4i64 VR256:$src1), (v2i64 VR128:$src2), - (iPTR imm)), - (VINSERTF128rr VR256:$src1, VR128:$src2, - (INSERT_get_vinsert128_imm VR256:$ins))>; -def : Pat<(vinsert128_insert:$ins (v8i32 VR256:$src1), (v4i32 VR128:$src2), - (iPTR imm)), - (VINSERTF128rr VR256:$src1, VR128:$src2, - (INSERT_get_vinsert128_imm VR256:$ins))>; -def : Pat<(vinsert128_insert:$ins (v32i8 VR256:$src1), (v16i8 VR128:$src2), - (iPTR imm)), - (VINSERTF128rr VR256:$src1, VR128:$src2, - (INSERT_get_vinsert128_imm VR256:$ins))>; -def : Pat<(vinsert128_insert:$ins (v16i16 VR256:$src1), (v8i16 VR128:$src2), - (iPTR imm)), - (VINSERTF128rr VR256:$src1, VR128:$src2, - (INSERT_get_vinsert128_imm VR256:$ins))>; - -def : Pat<(vinsert128_insert:$ins (v4i64 VR256:$src1), (loadv2i64 addr:$src2), - (iPTR imm)), - (VINSERTF128rm VR256:$src1, addr:$src2, - (INSERT_get_vinsert128_imm VR256:$ins))>; -def : Pat<(vinsert128_insert:$ins (v8i32 VR256:$src1), - (bc_v4i32 (loadv2i64 addr:$src2)), - (iPTR imm)), - (VINSERTF128rm VR256:$src1, addr:$src2, - (INSERT_get_vinsert128_imm VR256:$ins))>; -def : Pat<(vinsert128_insert:$ins (v32i8 VR256:$src1), - (bc_v16i8 (loadv2i64 addr:$src2)), - (iPTR imm)), - (VINSERTF128rm VR256:$src1, addr:$src2, - (INSERT_get_vinsert128_imm VR256:$ins))>; -def : Pat<(vinsert128_insert:$ins (v16i16 VR256:$src1), - (bc_v8i16 (loadv2i64 addr:$src2)), - (iPTR imm)), - (VINSERTF128rm VR256:$src1, addr:$src2, - (INSERT_get_vinsert128_imm VR256:$ins))>; + defm : vinsert_lowering<"VINSERTF128", v2i64, v4i64, loadv2i64>; + defm : vinsert_lowering<"VINSERTF128", v4i32, v8i32, loadv2i64>; + defm : vinsert_lowering<"VINSERTF128", v8i16, v16i16, loadv2i64>; + defm : vinsert_lowering<"VINSERTF128", v16i8, v32i8, loadv2i64>; } //===----------------------------------------------------------------------===// @@ -7939,61 +7919,28 @@ def VEXTRACTF128mr : AVXAIi8<0x19, MRMDestMem, (outs), []>, Sched<[WriteStore]>, VEX, VEX_L; } +multiclass vextract_lowering<string InstrStr, ValueType From, ValueType To> { + def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)), + (To (!cast<Instruction>(InstrStr#rr) + (From VR256:$src1), + (EXTRACT_get_vextract128_imm VR128:$ext)))>; + def : Pat<(store (To (vextract128_extract:$ext (From VR256:$src1), + (iPTR imm))), addr:$dst), + (!cast<Instruction>(InstrStr#mr) addr:$dst, VR256:$src1, + (EXTRACT_get_vextract128_imm VR128:$ext))>; +} + // AVX1 patterns let Predicates = [HasAVX, NoVLX] in { -def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)), - (v4f32 (VEXTRACTF128rr - (v8f32 VR256:$src1), - (EXTRACT_get_vextract128_imm VR128:$ext)))>; -def : 
Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)), - (v2f64 (VEXTRACTF128rr - (v4f64 VR256:$src1), - (EXTRACT_get_vextract128_imm VR128:$ext)))>; - -def : Pat<(store (v4f32 (vextract128_extract:$ext (v8f32 VR256:$src1), - (iPTR imm))), addr:$dst), - (VEXTRACTF128mr addr:$dst, VR256:$src1, - (EXTRACT_get_vextract128_imm VR128:$ext))>; -def : Pat<(store (v2f64 (vextract128_extract:$ext (v4f64 VR256:$src1), - (iPTR imm))), addr:$dst), - (VEXTRACTF128mr addr:$dst, VR256:$src1, - (EXTRACT_get_vextract128_imm VR128:$ext))>; + defm : vextract_lowering<"VEXTRACTF128", v8f32, v4f32>; + defm : vextract_lowering<"VEXTRACTF128", v4f64, v2f64>; } let Predicates = [HasAVX1Only] in { -def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)), - (v2i64 (VEXTRACTF128rr - (v4i64 VR256:$src1), - (EXTRACT_get_vextract128_imm VR128:$ext)))>; -def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)), - (v4i32 (VEXTRACTF128rr - (v8i32 VR256:$src1), - (EXTRACT_get_vextract128_imm VR128:$ext)))>; -def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)), - (v8i16 (VEXTRACTF128rr - (v16i16 VR256:$src1), - (EXTRACT_get_vextract128_imm VR128:$ext)))>; -def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)), - (v16i8 (VEXTRACTF128rr - (v32i8 VR256:$src1), - (EXTRACT_get_vextract128_imm VR128:$ext)))>; - -def : Pat<(store (v2i64 (vextract128_extract:$ext (v4i64 VR256:$src1), - (iPTR imm))), addr:$dst), - (VEXTRACTF128mr addr:$dst, VR256:$src1, - (EXTRACT_get_vextract128_imm VR128:$ext))>; -def : Pat<(store (v4i32 (vextract128_extract:$ext (v8i32 VR256:$src1), - (iPTR imm))), addr:$dst), - (VEXTRACTF128mr addr:$dst, VR256:$src1, - (EXTRACT_get_vextract128_imm VR128:$ext))>; -def : Pat<(store (v8i16 (vextract128_extract:$ext (v16i16 VR256:$src1), - (iPTR imm))), addr:$dst), - (VEXTRACTF128mr addr:$dst, VR256:$src1, - (EXTRACT_get_vextract128_imm VR128:$ext))>; -def : Pat<(store (v16i8 (vextract128_extract:$ext (v32i8 VR256:$src1), - (iPTR imm))), addr:$dst), - (VEXTRACTF128mr addr:$dst, VR256:$src1, - (EXTRACT_get_vextract128_imm VR128:$ext))>; + defm : vextract_lowering<"VEXTRACTF128", v4i64, v2i64>; + defm : vextract_lowering<"VEXTRACTF128", v8i32, v4i32>; + defm : vextract_lowering<"VEXTRACTF128", v16i16, v8i16>; + defm : vextract_lowering<"VEXTRACTF128", v32i8, v16i8>; } //===----------------------------------------------------------------------===// @@ -8239,7 +8186,7 @@ let Predicates = [HasF16C] in { } // Patterns for matching conversions from float to half-float and vice versa. -let Predicates = [HasF16C] in { +let Predicates = [HasF16C, NoVLX] in { // Use MXCSR.RC for rounding instead of explicitly specifying the default // rounding mode (Nearest-Even, encoded as 0). Both are equivalent in the // configurations we support (the default). However, falling back to MXCSR is @@ -8334,7 +8281,7 @@ defm VPBROADCASTD : avx2_broadcast<0x58, "vpbroadcastd", i32mem, loadi32, defm VPBROADCASTQ : avx2_broadcast<0x59, "vpbroadcastq", i64mem, loadi64, v2i64, v4i64, NoVLX>; -let Predicates = [HasAVX2] in { +let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { // loadi16 is tricky to fold, because !isTypeDesirableForOp, justifiably. // This means we'll encounter truncated i32 loads; match that here. 
def : Pat<(v8i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))), @@ -8347,7 +8294,9 @@ let Predicates = [HasAVX2] in { def : Pat<(v16i16 (X86VBroadcast (i16 (trunc (i32 (zextloadi16 addr:$src)))))), (VPBROADCASTWYrm addr:$src)>; +} +let Predicates = [HasAVX2] in { // Provide aliases for broadcast from the same register class that // automatically does the extract. def : Pat<(v8f32 (X86VBroadcast (v8f32 VR256:$src))), @@ -8361,36 +8310,38 @@ let Predicates = [HasAVX2] in { let Predicates = [HasAVX2, NoVLX] in { // Provide fallback in case the load node that is used in the patterns above // is used by additional users, which prevents the pattern selection. - let AddedComplexity = 20 in { def : Pat<(v4f32 (X86VBroadcast FR32:$src)), (VBROADCASTSSrr (COPY_TO_REGCLASS FR32:$src, VR128))>; def : Pat<(v8f32 (X86VBroadcast FR32:$src)), (VBROADCASTSSYrr (COPY_TO_REGCLASS FR32:$src, VR128))>; def : Pat<(v4f64 (X86VBroadcast FR64:$src)), (VBROADCASTSDYrr (COPY_TO_REGCLASS FR64:$src, VR128))>; - } } -let Predicates = [HasAVX2, NoVLX_Or_NoBWI], AddedComplexity = 20 in { +let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { def : Pat<(v16i8 (X86VBroadcast GR8:$src)), (VPBROADCASTBrr (COPY_TO_REGCLASS - (i32 (SUBREG_TO_REG (i32 0), GR8:$src, sub_8bit)), + (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)), + GR8:$src, sub_8bit)), VR128))>; def : Pat<(v32i8 (X86VBroadcast GR8:$src)), (VPBROADCASTBYrr (COPY_TO_REGCLASS - (i32 (SUBREG_TO_REG (i32 0), GR8:$src, sub_8bit)), + (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)), + GR8:$src, sub_8bit)), VR128))>; def : Pat<(v8i16 (X86VBroadcast GR16:$src)), (VPBROADCASTWrr (COPY_TO_REGCLASS - (i32 (SUBREG_TO_REG (i32 0), GR16:$src, sub_16bit)), + (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)), + GR16:$src, sub_16bit)), VR128))>; def : Pat<(v16i16 (X86VBroadcast GR16:$src)), (VPBROADCASTWYrr (COPY_TO_REGCLASS - (i32 (SUBREG_TO_REG (i32 0), GR16:$src, sub_16bit)), + (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)), + GR16:$src, sub_16bit)), VR128))>; } -let Predicates = [HasAVX2, NoVLX], AddedComplexity = 20 in { +let Predicates = [HasAVX2, NoVLX] in { def : Pat<(v4i32 (X86VBroadcast GR32:$src)), (VBROADCASTSSrr (COPY_TO_REGCLASS GR32:$src, VR128))>; def : Pat<(v8i32 (X86VBroadcast GR32:$src)), @@ -8418,13 +8369,13 @@ def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))), // Provide fallback in case the load node that is used in the patterns above // is used by additional users, which prevents the pattern selection. 
-let Predicates = [HasAVX], AddedComplexity = 20 in { +let Predicates = [HasAVX, NoVLX] in { // 128bit broadcasts: def : Pat<(v2f64 (X86VBroadcast f64:$src)), (VMOVDDUPrr (COPY_TO_REGCLASS FR64:$src, VR128))>; } -let Predicates = [HasAVX, NoVLX], AddedComplexity = 20 in { +let Predicates = [HasAVX1Only] in { def : Pat<(v4f32 (X86VBroadcast FR32:$src)), (VPSHUFDri (COPY_TO_REGCLASS FR32:$src, VR128), 0)>; def : Pat<(v8f32 (X86VBroadcast FR32:$src)), @@ -8560,42 +8511,10 @@ def VINSERTI128rm : AVX2AIi8<0x38, MRMSrcMem, (outs VR256:$dst), } let Predicates = [HasAVX2, NoVLX] in { -def : Pat<(vinsert128_insert:$ins (v4i64 VR256:$src1), (v2i64 VR128:$src2), - (iPTR imm)), - (VINSERTI128rr VR256:$src1, VR128:$src2, - (INSERT_get_vinsert128_imm VR256:$ins))>; -def : Pat<(vinsert128_insert:$ins (v8i32 VR256:$src1), (v4i32 VR128:$src2), - (iPTR imm)), - (VINSERTI128rr VR256:$src1, VR128:$src2, - (INSERT_get_vinsert128_imm VR256:$ins))>; -def : Pat<(vinsert128_insert:$ins (v32i8 VR256:$src1), (v16i8 VR128:$src2), - (iPTR imm)), - (VINSERTI128rr VR256:$src1, VR128:$src2, - (INSERT_get_vinsert128_imm VR256:$ins))>; -def : Pat<(vinsert128_insert:$ins (v16i16 VR256:$src1), (v8i16 VR128:$src2), - (iPTR imm)), - (VINSERTI128rr VR256:$src1, VR128:$src2, - (INSERT_get_vinsert128_imm VR256:$ins))>; - -def : Pat<(vinsert128_insert:$ins (v4i64 VR256:$src1), (loadv2i64 addr:$src2), - (iPTR imm)), - (VINSERTI128rm VR256:$src1, addr:$src2, - (INSERT_get_vinsert128_imm VR256:$ins))>; -def : Pat<(vinsert128_insert:$ins (v8i32 VR256:$src1), - (bc_v4i32 (loadv2i64 addr:$src2)), - (iPTR imm)), - (VINSERTI128rm VR256:$src1, addr:$src2, - (INSERT_get_vinsert128_imm VR256:$ins))>; -def : Pat<(vinsert128_insert:$ins (v32i8 VR256:$src1), - (bc_v16i8 (loadv2i64 addr:$src2)), - (iPTR imm)), - (VINSERTI128rm VR256:$src1, addr:$src2, - (INSERT_get_vinsert128_imm VR256:$ins))>; -def : Pat<(vinsert128_insert:$ins (v16i16 VR256:$src1), - (bc_v8i16 (loadv2i64 addr:$src2)), - (iPTR imm)), - (VINSERTI128rm VR256:$src1, addr:$src2, - (INSERT_get_vinsert128_imm VR256:$ins))>; + defm : vinsert_lowering<"VINSERTI128", v2i64, v4i64, loadv2i64>; + defm : vinsert_lowering<"VINSERTI128", v4i32, v8i32, loadv2i64>; + defm : vinsert_lowering<"VINSERTI128", v8i16, v16i16, loadv2i64>; + defm : vinsert_lowering<"VINSERTI128", v16i8, v32i8, loadv2i64>; } //===----------------------------------------------------------------------===// @@ -8612,39 +8531,10 @@ def VEXTRACTI128mr : AVX2AIi8<0x39, MRMDestMem, (outs), Sched<[WriteStore]>, VEX, VEX_L; let Predicates = [HasAVX2, NoVLX] in { -def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)), - (v2i64 (VEXTRACTI128rr - (v4i64 VR256:$src1), - (EXTRACT_get_vextract128_imm VR128:$ext)))>; -def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)), - (v4i32 (VEXTRACTI128rr - (v8i32 VR256:$src1), - (EXTRACT_get_vextract128_imm VR128:$ext)))>; -def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)), - (v8i16 (VEXTRACTI128rr - (v16i16 VR256:$src1), - (EXTRACT_get_vextract128_imm VR128:$ext)))>; -def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)), - (v16i8 (VEXTRACTI128rr - (v32i8 VR256:$src1), - (EXTRACT_get_vextract128_imm VR128:$ext)))>; - -def : Pat<(store (v2i64 (vextract128_extract:$ext (v4i64 VR256:$src1), - (iPTR imm))), addr:$dst), - (VEXTRACTI128mr addr:$dst, VR256:$src1, - (EXTRACT_get_vextract128_imm VR128:$ext))>; -def : Pat<(store (v4i32 (vextract128_extract:$ext (v8i32 VR256:$src1), - (iPTR imm))), addr:$dst), - (VEXTRACTI128mr addr:$dst, VR256:$src1, - 
(EXTRACT_get_vextract128_imm VR128:$ext))>; -def : Pat<(store (v8i16 (vextract128_extract:$ext (v16i16 VR256:$src1), - (iPTR imm))), addr:$dst), - (VEXTRACTI128mr addr:$dst, VR256:$src1, - (EXTRACT_get_vextract128_imm VR128:$ext))>; -def : Pat<(store (v16i8 (vextract128_extract:$ext (v32i8 VR256:$src1), - (iPTR imm))), addr:$dst), - (VEXTRACTI128mr addr:$dst, VR256:$src1, - (EXTRACT_get_vextract128_imm VR128:$ext))>; + defm : vextract_lowering<"VEXTRACTI128", v4i64, v2i64>; + defm : vextract_lowering<"VEXTRACTI128", v8i32, v4i32>; + defm : vextract_lowering<"VEXTRACTI128", v16i16, v8i16>; + defm : vextract_lowering<"VEXTRACTI128", v32i8, v16i8>; } //===----------------------------------------------------------------------===// @@ -8689,12 +8579,12 @@ multiclass maskmov_lowering<string InstrStr, RegisterClass RC, ValueType VT, def: Pat<(X86mstore addr:$ptr, (MaskVT RC:$mask), (VT RC:$src)), (!cast<Instruction>(InstrStr#"mr") addr:$ptr, RC:$mask, RC:$src)>; // masked load - def: Pat<(VT (masked_load addr:$ptr, (MaskVT RC:$mask), undef)), + def: Pat<(VT (X86mload addr:$ptr, (MaskVT RC:$mask), undef)), (!cast<Instruction>(InstrStr#"rm") RC:$mask, addr:$ptr)>; - def: Pat<(VT (masked_load addr:$ptr, (MaskVT RC:$mask), + def: Pat<(VT (X86mload addr:$ptr, (MaskVT RC:$mask), (VT (bitconvert (ZeroVT immAllZerosV))))), (!cast<Instruction>(InstrStr#"rm") RC:$mask, addr:$ptr)>; - def: Pat<(VT (masked_load addr:$ptr, (MaskVT RC:$mask), (VT RC:$src0))), + def: Pat<(VT (X86mload addr:$ptr, (MaskVT RC:$mask), (VT RC:$src0))), (!cast<Instruction>(BlendStr#"rr") RC:$src0, (!cast<Instruction>(InstrStr#"rm") RC:$mask, addr:$ptr), @@ -8719,6 +8609,51 @@ let Predicates = [HasAVX2] in { defm : maskmov_lowering<"VPMASKMOVD", VR128, v4i32, v4i32, "VBLENDVPS", v4i32>; defm : maskmov_lowering<"VPMASKMOVQ", VR128, v2i64, v2i64, "VBLENDVPD", v4i32>; } + +//===----------------------------------------------------------------------===// +// SubVector Broadcasts +// Provide fallback in case the load node that is used in the patterns above +// is used by additional users, which prevents the pattern selection. 
+ +let Predicates = [HasAVX2, NoVLX] in { +def : Pat<(v4i64 (X86SubVBroadcast (v2i64 VR128:$src))), + (VINSERTI128rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128:$src, sub_xmm), + (v2i64 VR128:$src), 1)>; +def : Pat<(v8i32 (X86SubVBroadcast (v4i32 VR128:$src))), + (VINSERTI128rr (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128:$src, sub_xmm), + (v4i32 VR128:$src), 1)>; +def : Pat<(v16i16 (X86SubVBroadcast (v8i16 VR128:$src))), + (VINSERTI128rr (INSERT_SUBREG (v16i16 (IMPLICIT_DEF)), VR128:$src, sub_xmm), + (v8i16 VR128:$src), 1)>; +def : Pat<(v32i8 (X86SubVBroadcast (v16i8 VR128:$src))), + (VINSERTI128rr (INSERT_SUBREG (v32i8 (IMPLICIT_DEF)), VR128:$src, sub_xmm), + (v16i8 VR128:$src), 1)>; +} + +let Predicates = [HasAVX, NoVLX] in { +def : Pat<(v4f64 (X86SubVBroadcast (v2f64 VR128:$src))), + (VINSERTF128rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), VR128:$src, sub_xmm), + (v2f64 VR128:$src), 1)>; +def : Pat<(v8f32 (X86SubVBroadcast (v4f32 VR128:$src))), + (VINSERTF128rr (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), VR128:$src, sub_xmm), + (v4f32 VR128:$src), 1)>; +} + +let Predicates = [HasAVX1Only] in { +def : Pat<(v4i64 (X86SubVBroadcast (v2i64 VR128:$src))), + (VINSERTF128rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128:$src, sub_xmm), + (v2i64 VR128:$src), 1)>; +def : Pat<(v8i32 (X86SubVBroadcast (v4i32 VR128:$src))), + (VINSERTF128rr (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128:$src, sub_xmm), + (v4i32 VR128:$src), 1)>; +def : Pat<(v16i16 (X86SubVBroadcast (v8i16 VR128:$src))), + (VINSERTF128rr (INSERT_SUBREG (v16i16 (IMPLICIT_DEF)), VR128:$src, sub_xmm), + (v8i16 VR128:$src), 1)>; +def : Pat<(v32i8 (X86SubVBroadcast (v16i8 VR128:$src))), + (VINSERTF128rr (INSERT_SUBREG (v32i8 (IMPLICIT_DEF)), VR128:$src, sub_xmm), + (v16i8 VR128:$src), 1)>; +} + //===----------------------------------------------------------------------===// // Variable Bit Shifts // @@ -8758,23 +8693,35 @@ let Predicates = [HasAVX2, NoVLX] in { defm VPSRLVD : avx2_var_shift<0x45, "vpsrlvd", srl, v4i32, v8i32>; defm VPSRLVQ : avx2_var_shift<0x45, "vpsrlvq", srl, v2i64, v4i64>, VEX_W; defm VPSRAVD : avx2_var_shift<0x46, "vpsravd", sra, v4i32, v8i32>; - let isCodeGenOnly = 1 in - defm VPSRAVD_Int : avx2_var_shift<0x46, "vpsravd", X86vsrav, v4i32, v8i32>; + + def : Pat<(v4i32 (X86vsrav VR128:$src1, VR128:$src2)), + (VPSRAVDrr VR128:$src1, VR128:$src2)>; + def : Pat<(v4i32 (X86vsrav VR128:$src1, + (bitconvert (loadv2i64 addr:$src2)))), + (VPSRAVDrm VR128:$src1, addr:$src2)>; + def : Pat<(v8i32 (X86vsrav VR256:$src1, VR256:$src2)), + (VPSRAVDYrr VR256:$src1, VR256:$src2)>; + def : Pat<(v8i32 (X86vsrav VR256:$src1, + (bitconvert (loadv4i64 addr:$src2)))), + (VPSRAVDYrm VR256:$src1, addr:$src2)>; } + + + //===----------------------------------------------------------------------===// // VGATHER - GATHER Operations multiclass avx2_gather<bits<8> opc, string OpcodeStr, RegisterClass RC256, X86MemOperand memop128, X86MemOperand memop256> { - def rm : AVX28I<opc, MRMSrcMem, (outs VR128:$dst, VR128:$mask_wb), + def rm : AVX28I<opc, MRMSrcMem4VOp3, (outs VR128:$dst, VR128:$mask_wb), (ins VR128:$src1, memop128:$src2, VR128:$mask), !strconcat(OpcodeStr, "\t{$mask, $src2, $dst|$dst, $src2, $mask}"), - []>, VEX_4VOp3; - def Yrm : AVX28I<opc, MRMSrcMem, (outs RC256:$dst, RC256:$mask_wb), + []>, VEX; + def Yrm : AVX28I<opc, MRMSrcMem4VOp3, (outs RC256:$dst, RC256:$mask_wb), (ins RC256:$src1, memop256:$src2, RC256:$mask), !strconcat(OpcodeStr, "\t{$mask, $src2, $dst|$dst, $src2, $mask}"), - []>, VEX_4VOp3, VEX_L; + []>, VEX, VEX_L; } let mayLoad = 1, 
hasSideEffects = 0, Constraints diff --git a/contrib/llvm/lib/Target/X86/X86InstrShiftRotate.td b/contrib/llvm/lib/Target/X86/X86InstrShiftRotate.td index c1df978..e2be735 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrShiftRotate.td +++ b/contrib/llvm/lib/Target/X86/X86InstrShiftRotate.td @@ -591,37 +591,38 @@ def ROR64rCL : RI<0xD3, MRM1r, (outs GR64:$dst), (ins GR64:$src1), def ROR8ri : Ii8<0xC0, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1, u8imm:$src2), "ror{b}\t{$src2, $dst|$dst, $src2}", - [(set GR8:$dst, (rotr GR8:$src1, (i8 imm:$src2)))], IIC_SR>; + [(set GR8:$dst, (rotr GR8:$src1, (i8 relocImm:$src2)))], + IIC_SR>; def ROR16ri : Ii8<0xC1, MRM1r, (outs GR16:$dst), (ins GR16:$src1, u8imm:$src2), "ror{w}\t{$src2, $dst|$dst, $src2}", - [(set GR16:$dst, (rotr GR16:$src1, (i8 imm:$src2)))], + [(set GR16:$dst, (rotr GR16:$src1, (i8 relocImm:$src2)))], IIC_SR>, OpSize16; def ROR32ri : Ii8<0xC1, MRM1r, (outs GR32:$dst), (ins GR32:$src1, u8imm:$src2), "ror{l}\t{$src2, $dst|$dst, $src2}", - [(set GR32:$dst, (rotr GR32:$src1, (i8 imm:$src2)))], + [(set GR32:$dst, (rotr GR32:$src1, (i8 relocImm:$src2)))], IIC_SR>, OpSize32; def ROR64ri : RIi8<0xC1, MRM1r, (outs GR64:$dst), (ins GR64:$src1, u8imm:$src2), "ror{q}\t{$src2, $dst|$dst, $src2}", - [(set GR64:$dst, (rotr GR64:$src1, (i8 imm:$src2)))], + [(set GR64:$dst, (rotr GR64:$src1, (i8 relocImm:$src2)))], IIC_SR>; // Rotate by 1 def ROR8r1 : I<0xD0, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1), "ror{b}\t$dst", - [(set GR8:$dst, (rotr GR8:$src1, (i8 1)))], + [(set GR8:$dst, (rotl GR8:$src1, (i8 7)))], IIC_SR>; def ROR16r1 : I<0xD1, MRM1r, (outs GR16:$dst), (ins GR16:$src1), "ror{w}\t$dst", - [(set GR16:$dst, (rotr GR16:$src1, (i8 1)))], + [(set GR16:$dst, (rotl GR16:$src1, (i8 15)))], IIC_SR>, OpSize16; def ROR32r1 : I<0xD1, MRM1r, (outs GR32:$dst), (ins GR32:$src1), "ror{l}\t$dst", - [(set GR32:$dst, (rotr GR32:$src1, (i8 1)))], + [(set GR32:$dst, (rotl GR32:$src1, (i8 31)))], IIC_SR>, OpSize32; def ROR64r1 : RI<0xD1, MRM1r, (outs GR64:$dst), (ins GR64:$src1), "ror{q}\t$dst", - [(set GR64:$dst, (rotr GR64:$src1, (i8 1)))], + [(set GR64:$dst, (rotl GR64:$src1, (i8 63)))], IIC_SR>; } // Constraints = "$src = $dst", SchedRW @@ -873,19 +874,19 @@ let hasSideEffects = 0 in { multiclass bmi_shift<string asm, RegisterClass RC, X86MemOperand x86memop> { let hasSideEffects = 0 in { - def rr : I<0xF7, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), + def rr : I<0xF7, MRMSrcReg4VOp3, (outs RC:$dst), (ins RC:$src1, RC:$src2), !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, - VEX_4VOp3, Sched<[WriteShift]>; + VEX, Sched<[WriteShift]>; let mayLoad = 1 in - def rm : I<0xF7, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src1, RC:$src2), + def rm : I<0xF7, MRMSrcMem4VOp3, + (outs RC:$dst), (ins x86memop:$src1, RC:$src2), !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, - VEX_4VOp3, - Sched<[WriteShiftLd, - // x86memop:$src1 - ReadDefault, ReadDefault, ReadDefault, ReadDefault, - ReadDefault, - // RC:$src1 - ReadAfterLd]>; + VEX, Sched<[WriteShiftLd, + // x86memop:$src1 + ReadDefault, ReadDefault, ReadDefault, ReadDefault, + ReadDefault, + // RC:$src1 + ReadAfterLd]>; } } diff --git a/contrib/llvm/lib/Target/X86/X86InstrSystem.td b/contrib/llvm/lib/Target/X86/X86InstrSystem.td index 6667bd2..9265d64 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrSystem.td +++ b/contrib/llvm/lib/Target/X86/X86InstrSystem.td @@ -23,7 +23,7 @@ let Defs = [RAX, RCX, RDX] in // CPU flow control instructions -let isTerminator = 1, isBarrier = 1, 
hasCtrlDep = 1 in { +let mayLoad = 1, mayStore = 0, hasSideEffects = 1 in { def TRAP : I<0x0B, RawFrm, (outs), (ins), "ud2", [(trap)]>, TB; def UD2B : I<0xB9, RawFrm, (outs), (ins), "ud2b", []>, TB; } @@ -481,8 +481,11 @@ let Defs = [EDX, EAX], Uses = [ECX] in def XGETBV : I<0x01, MRM_D0, (outs), (ins), "xgetbv", []>, TB; let Uses = [EDX, EAX, ECX] in - def XSETBV : I<0x01, MRM_D1, (outs), (ins), "xsetbv", []>, TB; -} + def XSETBV : I<0x01, MRM_D1, (outs), (ins), + "xsetbv", + [(int_x86_xsetbv ECX, EDX, EAX)]>, TB; + +} // HasXSAVE let Uses = [EDX, EAX] in { let Predicates = [HasXSAVE] in { diff --git a/contrib/llvm/lib/Target/X86/X86InstrTablesInfo.h b/contrib/llvm/lib/Target/X86/X86InstrTablesInfo.h new file mode 100755 index 0000000..415a891 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86InstrTablesInfo.h @@ -0,0 +1,1162 @@ +//===-- X86InstrTablesInfo.h - X86 Instruction Tables -----------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains related X86 Instruction Information Tables. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_X86_X86INSTRTABLESINFO_H +#define LLVM_LIB_TARGET_X86_X86INSTRTABLESINFO_H + +using namespace llvm; + +struct X86EvexToVexCompressTableEntry { + uint16_t EvexOpcode; + uint16_t VexOpcode; +}; + + + +// X86 EVEX encoded instructions that have a VEX 128 encoding +// (table format: <EVEX opcode, VEX-128 opcode>). +static const X86EvexToVexCompressTableEntry X86EvexToVex128CompressTable[] = { + // EVEX scalar with corresponding VEX. 
+ { X86::Int_VCOMISDZrm , X86::Int_VCOMISDrm }, + { X86::Int_VCOMISDZrr , X86::Int_VCOMISDrr }, + { X86::Int_VCOMISSZrm , X86::Int_VCOMISSrm }, + { X86::Int_VCOMISSZrr , X86::Int_VCOMISSrr }, + { X86::Int_VUCOMISDZrm , X86::Int_VUCOMISDrm }, + { X86::Int_VUCOMISDZrr , X86::Int_VUCOMISDrr }, + { X86::Int_VUCOMISSZrm , X86::Int_VUCOMISSrm }, + { X86::Int_VUCOMISSZrr , X86::Int_VUCOMISSrr }, + { X86::VADDSDZrm , X86::VADDSDrm }, + { X86::VADDSDZrm_Int , X86::VADDSDrm_Int }, + { X86::VADDSDZrr , X86::VADDSDrr }, + { X86::VADDSDZrr_Int , X86::VADDSDrr_Int }, + { X86::VADDSSZrm , X86::VADDSSrm }, + { X86::VADDSSZrm_Int , X86::VADDSSrm_Int }, + { X86::VADDSSZrr , X86::VADDSSrr }, + { X86::VADDSSZrr_Int , X86::VADDSSrr_Int }, + { X86::VCOMISDZrm , X86::VCOMISDrm }, + { X86::VCOMISDZrr , X86::VCOMISDrr }, + { X86::VCOMISSZrm , X86::VCOMISSrm }, + { X86::VCOMISSZrr , X86::VCOMISSrr }, + { X86::VCVTSD2SI64Zrm , X86::VCVTSD2SI64rm }, + { X86::VCVTSD2SI64Zrr , X86::VCVTSD2SI64rr }, + { X86::VCVTSD2SIZrm , X86::VCVTSD2SIrm }, + { X86::VCVTSD2SIZrr , X86::VCVTSD2SIrr }, + { X86::VCVTSD2SSZrm , X86::VCVTSD2SSrm }, + { X86::VCVTSD2SSZrr , X86::VCVTSD2SSrr }, + { X86::VCVTSI2SDZrm , X86::VCVTSI2SDrm }, + { X86::VCVTSI2SDZrm_Int , X86::Int_VCVTSI2SDrm }, + { X86::VCVTSI2SDZrr , X86::VCVTSI2SDrr }, + { X86::VCVTSI2SDZrr_Int , X86::Int_VCVTSI2SDrr }, + { X86::VCVTSI2SSZrm , X86::VCVTSI2SSrm }, + { X86::VCVTSI2SSZrm_Int , X86::Int_VCVTSI2SSrm }, + { X86::VCVTSI2SSZrr , X86::VCVTSI2SSrr }, + { X86::VCVTSI2SSZrr_Int , X86::Int_VCVTSI2SSrr }, + { X86::VCVTSS2SDZrm , X86::VCVTSS2SDrm }, + { X86::VCVTSS2SDZrr , X86::VCVTSS2SDrr }, + { X86::VCVTSS2SI64Zrm , X86::VCVTSS2SI64rm }, + { X86::VCVTSS2SI64Zrr , X86::VCVTSS2SI64rr }, + { X86::VCVTSS2SIZrm , X86::VCVTSS2SIrm }, + { X86::VCVTSS2SIZrr , X86::VCVTSS2SIrr }, + { X86::VCVTTSD2SI64Zrm , X86::VCVTTSD2SI64rm }, + { X86::VCVTTSD2SI64Zrm_Int , X86::Int_VCVTTSD2SI64rm }, + { X86::VCVTTSD2SI64Zrr , X86::VCVTTSD2SI64rr }, + { X86::VCVTTSD2SI64Zrr_Int , X86::Int_VCVTTSD2SI64rr }, + { X86::VCVTTSD2SIZrm , X86::VCVTTSD2SIrm }, + { X86::VCVTTSD2SIZrm_Int , X86::Int_VCVTTSD2SIrm }, + { X86::VCVTTSD2SIZrr , X86::VCVTTSD2SIrr }, + { X86::VCVTTSD2SIZrr_Int , X86::Int_VCVTTSD2SIrr }, + { X86::VCVTTSS2SI64Zrm , X86::VCVTTSS2SI64rm }, + { X86::VCVTTSS2SI64Zrm_Int , X86::Int_VCVTTSS2SI64rm }, + { X86::VCVTTSS2SI64Zrr , X86::VCVTTSS2SI64rr }, + { X86::VCVTTSS2SI64Zrr_Int , X86::Int_VCVTTSS2SI64rr }, + { X86::VCVTTSS2SIZrm , X86::VCVTTSS2SIrm }, + { X86::VCVTTSS2SIZrm_Int , X86::Int_VCVTTSS2SIrm }, + { X86::VCVTTSS2SIZrr , X86::VCVTTSS2SIrr }, + { X86::VCVTTSS2SIZrr_Int , X86::Int_VCVTTSS2SIrr }, + { X86::VDIVSDZrm , X86::VDIVSDrm }, + { X86::VDIVSDZrm_Int , X86::VDIVSDrm_Int }, + { X86::VDIVSDZrr , X86::VDIVSDrr }, + { X86::VDIVSDZrr_Int , X86::VDIVSDrr_Int }, + { X86::VDIVSSZrm , X86::VDIVSSrm }, + { X86::VDIVSSZrm_Int , X86::VDIVSSrm_Int }, + { X86::VDIVSSZrr , X86::VDIVSSrr }, + { X86::VDIVSSZrr_Int , X86::VDIVSSrr_Int }, + { X86::VFMADD132SDZm , X86::VFMADD132SDm }, + { X86::VFMADD132SDZm_Int , X86::VFMADD132SDm_Int }, + { X86::VFMADD132SDZr , X86::VFMADD132SDr }, + { X86::VFMADD132SDZr_Int , X86::VFMADD132SDr_Int }, + { X86::VFMADD132SSZm , X86::VFMADD132SSm }, + { X86::VFMADD132SSZm_Int , X86::VFMADD132SSm_Int }, + { X86::VFMADD132SSZr , X86::VFMADD132SSr }, + { X86::VFMADD132SSZr_Int , X86::VFMADD132SSr_Int }, + { X86::VFMADD213SDZm , X86::VFMADD213SDm }, + { X86::VFMADD213SDZm_Int , X86::VFMADD213SDm_Int }, + { X86::VFMADD213SDZr , X86::VFMADD213SDr }, + { 
X86::VFMADD213SDZr_Int , X86::VFMADD213SDr_Int }, + { X86::VFMADD213SSZm , X86::VFMADD213SSm }, + { X86::VFMADD213SSZm_Int , X86::VFMADD213SSm_Int }, + { X86::VFMADD213SSZr , X86::VFMADD213SSr }, + { X86::VFMADD213SSZr_Int , X86::VFMADD213SSr_Int }, + { X86::VFMADD231SDZm , X86::VFMADD231SDm }, + { X86::VFMADD231SDZm_Int , X86::VFMADD231SDm_Int }, + { X86::VFMADD231SDZr , X86::VFMADD231SDr }, + { X86::VFMADD231SDZr_Int , X86::VFMADD231SDr_Int }, + { X86::VFMADD231SSZm , X86::VFMADD231SSm }, + { X86::VFMADD231SSZm_Int , X86::VFMADD231SSm_Int }, + { X86::VFMADD231SSZr , X86::VFMADD231SSr }, + { X86::VFMADD231SSZr_Int , X86::VFMADD231SSr_Int }, + { X86::VFMSUB132SDZm , X86::VFMSUB132SDm }, + { X86::VFMSUB132SDZm_Int , X86::VFMSUB132SDm_Int }, + { X86::VFMSUB132SDZr , X86::VFMSUB132SDr }, + { X86::VFMSUB132SDZr_Int , X86::VFMSUB132SDr_Int }, + { X86::VFMSUB132SSZm , X86::VFMSUB132SSm }, + { X86::VFMSUB132SSZm_Int , X86::VFMSUB132SSm_Int }, + { X86::VFMSUB132SSZr , X86::VFMSUB132SSr }, + { X86::VFMSUB132SSZr_Int , X86::VFMSUB132SSr_Int }, + { X86::VFMSUB213SDZm , X86::VFMSUB213SDm }, + { X86::VFMSUB213SDZm_Int , X86::VFMSUB213SDm_Int }, + { X86::VFMSUB213SDZr , X86::VFMSUB213SDr }, + { X86::VFMSUB213SDZr_Int , X86::VFMSUB213SDr_Int }, + { X86::VFMSUB213SSZm , X86::VFMSUB213SSm }, + { X86::VFMSUB213SSZm_Int , X86::VFMSUB213SSm_Int }, + { X86::VFMSUB213SSZr , X86::VFMSUB213SSr }, + { X86::VFMSUB213SSZr_Int , X86::VFMSUB213SSr_Int }, + { X86::VFMSUB231SDZm , X86::VFMSUB231SDm }, + { X86::VFMSUB231SDZm_Int , X86::VFMSUB231SDm_Int }, + { X86::VFMSUB231SDZr , X86::VFMSUB231SDr }, + { X86::VFMSUB231SDZr_Int , X86::VFMSUB231SDr_Int }, + { X86::VFMSUB231SSZm , X86::VFMSUB231SSm }, + { X86::VFMSUB231SSZm_Int , X86::VFMSUB231SSm_Int }, + { X86::VFMSUB231SSZr , X86::VFMSUB231SSr }, + { X86::VFMSUB231SSZr_Int , X86::VFMSUB231SSr_Int }, + { X86::VFNMADD132SDZm , X86::VFNMADD132SDm }, + { X86::VFNMADD132SDZm_Int , X86::VFNMADD132SDm_Int }, + { X86::VFNMADD132SDZr , X86::VFNMADD132SDr }, + { X86::VFNMADD132SDZr_Int , X86::VFNMADD132SDr_Int }, + { X86::VFNMADD132SSZm , X86::VFNMADD132SSm }, + { X86::VFNMADD132SSZm_Int , X86::VFNMADD132SSm_Int }, + { X86::VFNMADD132SSZr , X86::VFNMADD132SSr }, + { X86::VFNMADD132SSZr_Int , X86::VFNMADD132SSr_Int }, + { X86::VFNMADD213SDZm , X86::VFNMADD213SDm }, + { X86::VFNMADD213SDZm_Int , X86::VFNMADD213SDm_Int }, + { X86::VFNMADD213SDZr , X86::VFNMADD213SDr }, + { X86::VFNMADD213SDZr_Int , X86::VFNMADD213SDr_Int }, + { X86::VFNMADD213SSZm , X86::VFNMADD213SSm }, + { X86::VFNMADD213SSZm_Int , X86::VFNMADD213SSm_Int }, + { X86::VFNMADD213SSZr , X86::VFNMADD213SSr }, + { X86::VFNMADD213SSZr_Int , X86::VFNMADD213SSr_Int }, + { X86::VFNMADD231SDZm , X86::VFNMADD231SDm }, + { X86::VFNMADD231SDZm_Int , X86::VFNMADD231SDm_Int }, + { X86::VFNMADD231SDZr , X86::VFNMADD231SDr }, + { X86::VFNMADD231SDZr_Int , X86::VFNMADD231SDr_Int }, + { X86::VFNMADD231SSZm , X86::VFNMADD231SSm }, + { X86::VFNMADD231SSZm_Int , X86::VFNMADD231SSm_Int }, + { X86::VFNMADD231SSZr , X86::VFNMADD231SSr }, + { X86::VFNMADD231SSZr_Int , X86::VFNMADD231SSr_Int }, + { X86::VFNMSUB132SDZm , X86::VFNMSUB132SDm }, + { X86::VFNMSUB132SDZm_Int , X86::VFNMSUB132SDm_Int }, + { X86::VFNMSUB132SDZr , X86::VFNMSUB132SDr }, + { X86::VFNMSUB132SDZr_Int , X86::VFNMSUB132SDr_Int }, + { X86::VFNMSUB132SSZm , X86::VFNMSUB132SSm }, + { X86::VFNMSUB132SSZm_Int , X86::VFNMSUB132SSm_Int }, + { X86::VFNMSUB132SSZr , X86::VFNMSUB132SSr }, + { X86::VFNMSUB132SSZr_Int , X86::VFNMSUB132SSr_Int }, + { X86::VFNMSUB213SDZm , 
X86::VFNMSUB213SDm }, + { X86::VFNMSUB213SDZm_Int , X86::VFNMSUB213SDm_Int }, + { X86::VFNMSUB213SDZr , X86::VFNMSUB213SDr }, + { X86::VFNMSUB213SDZr_Int , X86::VFNMSUB213SDr_Int }, + { X86::VFNMSUB213SSZm , X86::VFNMSUB213SSm }, + { X86::VFNMSUB213SSZm_Int , X86::VFNMSUB213SSm_Int }, + { X86::VFNMSUB213SSZr , X86::VFNMSUB213SSr }, + { X86::VFNMSUB213SSZr_Int , X86::VFNMSUB213SSr_Int }, + { X86::VFNMSUB231SDZm , X86::VFNMSUB231SDm }, + { X86::VFNMSUB231SDZm_Int , X86::VFNMSUB231SDm_Int }, + { X86::VFNMSUB231SDZr , X86::VFNMSUB231SDr }, + { X86::VFNMSUB231SDZr_Int , X86::VFNMSUB231SDr_Int }, + { X86::VFNMSUB231SSZm , X86::VFNMSUB231SSm }, + { X86::VFNMSUB231SSZm_Int , X86::VFNMSUB231SSm_Int }, + { X86::VFNMSUB231SSZr , X86::VFNMSUB231SSr }, + { X86::VFNMSUB231SSZr_Int , X86::VFNMSUB231SSr_Int }, + { X86::VMAXCSDZrm , X86::VMAXCSDrm }, + { X86::VMAXCSDZrr , X86::VMAXCSDrr }, + { X86::VMAXCSSZrm , X86::VMAXCSSrm }, + { X86::VMAXCSSZrr , X86::VMAXCSSrr }, + { X86::VMAXSDZrm , X86::VMAXSDrm }, + { X86::VMAXSDZrm_Int , X86::VMAXSDrm_Int }, + { X86::VMAXSDZrr , X86::VMAXSDrr }, + { X86::VMAXSDZrr_Int , X86::VMAXSDrr_Int }, + { X86::VMAXSSZrm , X86::VMAXSSrm }, + { X86::VMAXSSZrm_Int , X86::VMAXSSrm_Int }, + { X86::VMAXSSZrr , X86::VMAXSSrr }, + { X86::VMAXSSZrr_Int , X86::VMAXSSrr_Int }, + { X86::VMINCSDZrm , X86::VMINCSDrm }, + { X86::VMINCSDZrr , X86::VMINCSDrr }, + { X86::VMINCSSZrm , X86::VMINCSSrm }, + { X86::VMINCSSZrr , X86::VMINCSSrr }, + { X86::VMINSDZrm , X86::VMINSDrm }, + { X86::VMINSDZrm_Int , X86::VMINSDrm_Int }, + { X86::VMINSDZrr , X86::VMINSDrr }, + { X86::VMINSDZrr_Int , X86::VMINSDrr_Int }, + { X86::VMINSSZrm , X86::VMINSSrm }, + { X86::VMINSSZrm_Int , X86::VMINSSrm_Int }, + { X86::VMINSSZrr , X86::VMINSSrr }, + { X86::VMINSSZrr_Int , X86::VMINSSrr_Int }, + { X86::VMOV64toSDZrr , X86::VMOV64toSDrr }, + { X86::VMOVDI2SSZrm , X86::VMOVDI2SSrm }, + { X86::VMOVDI2SSZrr , X86::VMOVDI2SSrr }, + { X86::VMOVSDZmr , X86::VMOVSDmr }, + { X86::VMOVSDZrm , X86::VMOVSDrm }, + { X86::VMOVSDZrr , X86::VMOVSDrr }, + { X86::VMOVSSZmr , X86::VMOVSSmr }, + { X86::VMOVSSZrm , X86::VMOVSSrm }, + { X86::VMOVSSZrr , X86::VMOVSSrr }, + { X86::VMOVSSZrr_REV , X86::VMOVSSrr_REV }, + { X86::VMULSDZrm , X86::VMULSDrm }, + { X86::VMULSDZrm_Int , X86::VMULSDrm_Int }, + { X86::VMULSDZrr , X86::VMULSDrr }, + { X86::VMULSDZrr_Int , X86::VMULSDrr_Int }, + { X86::VMULSSZrm , X86::VMULSSrm }, + { X86::VMULSSZrm_Int , X86::VMULSSrm_Int }, + { X86::VMULSSZrr , X86::VMULSSrr }, + { X86::VMULSSZrr_Int , X86::VMULSSrr_Int }, + { X86::VSQRTSDZm , X86::VSQRTSDm }, + { X86::VSQRTSDZm_Int , X86::VSQRTSDm_Int }, + { X86::VSQRTSDZr , X86::VSQRTSDr }, + { X86::VSQRTSDZr_Int , X86::VSQRTSDr_Int }, + { X86::VSQRTSSZm , X86::VSQRTSSm }, + { X86::VSQRTSSZm_Int , X86::VSQRTSSm_Int }, + { X86::VSQRTSSZr , X86::VSQRTSSr }, + { X86::VSQRTSSZr_Int , X86::VSQRTSSr_Int }, + { X86::VSUBSDZrm , X86::VSUBSDrm }, + { X86::VSUBSDZrm_Int , X86::VSUBSDrm_Int }, + { X86::VSUBSDZrr , X86::VSUBSDrr }, + { X86::VSUBSDZrr_Int , X86::VSUBSDrr_Int }, + { X86::VSUBSSZrm , X86::VSUBSSrm }, + { X86::VSUBSSZrm_Int , X86::VSUBSSrm_Int }, + { X86::VSUBSSZrr , X86::VSUBSSrr }, + { X86::VSUBSSZrr_Int , X86::VSUBSSrr_Int }, + { X86::VUCOMISDZrm , X86::VUCOMISDrm }, + { X86::VUCOMISDZrr , X86::VUCOMISDrr }, + { X86::VUCOMISSZrm , X86::VUCOMISSrm }, + { X86::VUCOMISSZrr , X86::VUCOMISSrr }, + + { X86::VMOV64toPQIZrr , X86::VMOV64toPQIrr }, + { X86::VMOV64toSDZrr , X86::VMOV64toSDrr }, + { X86::VMOVDI2PDIZrm , X86::VMOVDI2PDIrm }, + { X86::VMOVDI2PDIZrr , 
X86::VMOVDI2PDIrr }, + { X86::VMOVLHPSZrr , X86::VMOVLHPSrr }, + { X86::VMOVHLPSZrr , X86::VMOVHLPSrr }, + { X86::VMOVPDI2DIZmr , X86::VMOVPDI2DImr }, + { X86::VMOVPDI2DIZrr , X86::VMOVPDI2DIrr }, + { X86::VMOVPQI2QIZmr , X86::VMOVPQI2QImr }, + { X86::VMOVPQIto64Zrr , X86::VMOVPQIto64rr }, + { X86::VMOVQI2PQIZrm , X86::VMOVQI2PQIrm }, + { X86::VMOVZPQILo2PQIZrr , X86::VMOVZPQILo2PQIrr }, + + { X86::VPEXTRBZmr , X86::VPEXTRBmr }, + { X86::VPEXTRBZrr , X86::VPEXTRBrr }, + { X86::VPEXTRDZmr , X86::VPEXTRDmr }, + { X86::VPEXTRDZrr , X86::VPEXTRDrr }, + { X86::VPEXTRQZmr , X86::VPEXTRQmr }, + { X86::VPEXTRQZrr , X86::VPEXTRQrr }, + { X86::VPEXTRWZmr , X86::VPEXTRWmr }, + { X86::VPEXTRWZrr , X86::VPEXTRWri }, + + { X86::VPINSRBZrm , X86::VPINSRBrm }, + { X86::VPINSRBZrr , X86::VPINSRBrr }, + { X86::VPINSRDZrm , X86::VPINSRDrm }, + { X86::VPINSRDZrr , X86::VPINSRDrr }, + { X86::VPINSRQZrm , X86::VPINSRQrm }, + { X86::VPINSRQZrr , X86::VPINSRQrr }, + { X86::VPINSRWZrm , X86::VPINSRWrmi }, + { X86::VPINSRWZrr , X86::VPINSRWrri }, + + // EVEX 128 with corresponding VEX. + { X86::VADDPDZ128rm , X86::VADDPDrm }, + { X86::VADDPDZ128rr , X86::VADDPDrr }, + { X86::VADDPSZ128rm , X86::VADDPSrm }, + { X86::VADDPSZ128rr , X86::VADDPSrr }, + { X86::VANDNPDZ128rm , X86::VANDNPDrm }, + { X86::VANDNPDZ128rr , X86::VANDNPDrr }, + { X86::VANDNPSZ128rm , X86::VANDNPSrm }, + { X86::VANDNPSZ128rr , X86::VANDNPSrr }, + { X86::VANDPDZ128rm , X86::VANDPDrm }, + { X86::VANDPDZ128rr , X86::VANDPDrr }, + { X86::VANDPSZ128rm , X86::VANDPSrm }, + { X86::VANDPSZ128rr , X86::VANDPSrr }, + { X86::VBROADCASTSSZ128m , X86::VBROADCASTSSrm }, + { X86::VBROADCASTSSZ128r , X86::VBROADCASTSSrr }, + { X86::VBROADCASTSSZ128r_s , X86::VBROADCASTSSrr }, + { X86::VCVTDQ2PDZ128rm , X86::VCVTDQ2PDrm }, + { X86::VCVTDQ2PDZ128rr , X86::VCVTDQ2PDrr }, + { X86::VCVTDQ2PSZ128rm , X86::VCVTDQ2PSrm }, + { X86::VCVTDQ2PSZ128rr , X86::VCVTDQ2PSrr }, + { X86::VCVTPD2DQZ128rm , X86::VCVTPD2DQrm }, + { X86::VCVTPD2DQZ128rr , X86::VCVTPD2DQrr }, + { X86::VCVTPD2PSZ128rm , X86::VCVTPD2PSrm }, + { X86::VCVTPD2PSZ128rr , X86::VCVTPD2PSrr }, + { X86::VCVTPH2PSZ128rm , X86::VCVTPH2PSrm }, + { X86::VCVTPH2PSZ128rr , X86::VCVTPH2PSrr }, + { X86::VCVTPS2DQZ128rm , X86::VCVTPS2DQrm }, + { X86::VCVTPS2DQZ128rr , X86::VCVTPS2DQrr }, + { X86::VCVTPS2PDZ128rm , X86::VCVTPS2PDrm }, + { X86::VCVTPS2PDZ128rr , X86::VCVTPS2PDrr }, + { X86::VCVTPS2PHZ128mr , X86::VCVTPS2PHmr }, + { X86::VCVTPS2PHZ128rr , X86::VCVTPS2PHrr }, + { X86::VCVTTPD2DQZ128rm , X86::VCVTTPD2DQrm }, + { X86::VCVTTPD2DQZ128rr , X86::VCVTTPD2DQrr }, + { X86::VCVTTPS2DQZ128rm , X86::VCVTTPS2DQrm }, + { X86::VCVTTPS2DQZ128rr , X86::VCVTTPS2DQrr }, + { X86::VDIVPDZ128rm , X86::VDIVPDrm }, + { X86::VDIVPDZ128rr , X86::VDIVPDrr }, + { X86::VDIVPSZ128rm , X86::VDIVPSrm }, + { X86::VDIVPSZ128rr , X86::VDIVPSrr }, + { X86::VFMADD132PDZ128m , X86::VFMADD132PDm }, + { X86::VFMADD132PDZ128r , X86::VFMADD132PDr }, + { X86::VFMADD132PSZ128m , X86::VFMADD132PSm }, + { X86::VFMADD132PSZ128r , X86::VFMADD132PSr }, + { X86::VFMADD213PDZ128m , X86::VFMADD213PDm }, + { X86::VFMADD213PDZ128r , X86::VFMADD213PDr }, + { X86::VFMADD213PSZ128m , X86::VFMADD213PSm }, + { X86::VFMADD213PSZ128r , X86::VFMADD213PSr }, + { X86::VFMADD231PDZ128m , X86::VFMADD231PDm }, + { X86::VFMADD231PDZ128r , X86::VFMADD231PDr }, + { X86::VFMADD231PSZ128m , X86::VFMADD231PSm }, + { X86::VFMADD231PSZ128r , X86::VFMADD231PSr }, + { X86::VFMADDSUB132PDZ128m , X86::VFMADDSUB132PDm }, + { X86::VFMADDSUB132PDZ128r , X86::VFMADDSUB132PDr }, + { 
X86::VFMADDSUB132PSZ128m , X86::VFMADDSUB132PSm }, + { X86::VFMADDSUB132PSZ128r , X86::VFMADDSUB132PSr }, + { X86::VFMADDSUB213PDZ128m , X86::VFMADDSUB213PDm }, + { X86::VFMADDSUB213PDZ128r , X86::VFMADDSUB213PDr }, + { X86::VFMADDSUB213PSZ128m , X86::VFMADDSUB213PSm }, + { X86::VFMADDSUB213PSZ128r , X86::VFMADDSUB213PSr }, + { X86::VFMADDSUB231PDZ128m , X86::VFMADDSUB231PDm }, + { X86::VFMADDSUB231PDZ128r , X86::VFMADDSUB231PDr }, + { X86::VFMADDSUB231PSZ128m , X86::VFMADDSUB231PSm }, + { X86::VFMADDSUB231PSZ128r , X86::VFMADDSUB231PSr }, + { X86::VFMSUB132PDZ128m , X86::VFMSUB132PDm }, + { X86::VFMSUB132PDZ128r , X86::VFMSUB132PDr }, + { X86::VFMSUB132PSZ128m , X86::VFMSUB132PSm }, + { X86::VFMSUB132PSZ128r , X86::VFMSUB132PSr }, + { X86::VFMSUB213PDZ128m , X86::VFMSUB213PDm }, + { X86::VFMSUB213PDZ128r , X86::VFMSUB213PDr }, + { X86::VFMSUB213PSZ128m , X86::VFMSUB213PSm }, + { X86::VFMSUB213PSZ128r , X86::VFMSUB213PSr }, + { X86::VFMSUB231PDZ128m , X86::VFMSUB231PDm }, + { X86::VFMSUB231PDZ128r , X86::VFMSUB231PDr }, + { X86::VFMSUB231PSZ128m , X86::VFMSUB231PSm }, + { X86::VFMSUB231PSZ128r , X86::VFMSUB231PSr }, + { X86::VFMSUBADD132PDZ128m , X86::VFMSUBADD132PDm }, + { X86::VFMSUBADD132PDZ128r , X86::VFMSUBADD132PDr }, + { X86::VFMSUBADD132PSZ128m , X86::VFMSUBADD132PSm }, + { X86::VFMSUBADD132PSZ128r , X86::VFMSUBADD132PSr }, + { X86::VFMSUBADD213PDZ128m , X86::VFMSUBADD213PDm }, + { X86::VFMSUBADD213PDZ128r , X86::VFMSUBADD213PDr }, + { X86::VFMSUBADD213PSZ128m , X86::VFMSUBADD213PSm }, + { X86::VFMSUBADD213PSZ128r , X86::VFMSUBADD213PSr }, + { X86::VFMSUBADD231PDZ128m , X86::VFMSUBADD231PDm }, + { X86::VFMSUBADD231PDZ128r , X86::VFMSUBADD231PDr }, + { X86::VFMSUBADD231PSZ128m , X86::VFMSUBADD231PSm }, + { X86::VFMSUBADD231PSZ128r , X86::VFMSUBADD231PSr }, + { X86::VFNMADD132PDZ128m , X86::VFNMADD132PDm }, + { X86::VFNMADD132PDZ128r , X86::VFNMADD132PDr }, + { X86::VFNMADD132PSZ128m , X86::VFNMADD132PSm }, + { X86::VFNMADD132PSZ128r , X86::VFNMADD132PSr }, + { X86::VFNMADD213PDZ128m , X86::VFNMADD213PDm }, + { X86::VFNMADD213PDZ128r , X86::VFNMADD213PDr }, + { X86::VFNMADD213PSZ128m , X86::VFNMADD213PSm }, + { X86::VFNMADD213PSZ128r , X86::VFNMADD213PSr }, + { X86::VFNMADD231PDZ128m , X86::VFNMADD231PDm }, + { X86::VFNMADD231PDZ128r , X86::VFNMADD231PDr }, + { X86::VFNMADD231PSZ128m , X86::VFNMADD231PSm }, + { X86::VFNMADD231PSZ128r , X86::VFNMADD231PSr }, + { X86::VFNMSUB132PDZ128m , X86::VFNMSUB132PDm }, + { X86::VFNMSUB132PDZ128r , X86::VFNMSUB132PDr }, + { X86::VFNMSUB132PSZ128m , X86::VFNMSUB132PSm }, + { X86::VFNMSUB132PSZ128r , X86::VFNMSUB132PSr }, + { X86::VFNMSUB213PDZ128m , X86::VFNMSUB213PDm }, + { X86::VFNMSUB213PDZ128r , X86::VFNMSUB213PDr }, + { X86::VFNMSUB213PSZ128m , X86::VFNMSUB213PSm }, + { X86::VFNMSUB213PSZ128r , X86::VFNMSUB213PSr }, + { X86::VFNMSUB231PDZ128m , X86::VFNMSUB231PDm }, + { X86::VFNMSUB231PDZ128r , X86::VFNMSUB231PDr }, + { X86::VFNMSUB231PSZ128m , X86::VFNMSUB231PSm }, + { X86::VFNMSUB231PSZ128r , X86::VFNMSUB231PSr }, + { X86::VMAXCPDZ128rm , X86::VMAXCPDrm }, + { X86::VMAXCPDZ128rr , X86::VMAXCPDrr }, + { X86::VMAXCPSZ128rm , X86::VMAXCPSrm }, + { X86::VMAXCPSZ128rr , X86::VMAXCPSrr }, + { X86::VMAXPDZ128rm , X86::VMAXPDrm }, + { X86::VMAXPDZ128rr , X86::VMAXPDrr }, + { X86::VMAXPSZ128rm , X86::VMAXPSrm }, + { X86::VMAXPSZ128rr , X86::VMAXPSrr }, + { X86::VMINCPDZ128rm , X86::VMINCPDrm }, + { X86::VMINCPDZ128rr , X86::VMINCPDrr }, + { X86::VMINCPSZ128rm , X86::VMINCPSrm }, + { X86::VMINCPSZ128rr , X86::VMINCPSrr }, + { X86::VMINPDZ128rm , 
X86::VMINPDrm }, + { X86::VMINPDZ128rr , X86::VMINPDrr }, + { X86::VMINPSZ128rm , X86::VMINPSrm }, + { X86::VMINPSZ128rr , X86::VMINPSrr }, + { X86::VMOVAPDZ128mr , X86::VMOVAPDmr }, + { X86::VMOVAPDZ128rm , X86::VMOVAPDrm }, + { X86::VMOVAPDZ128rr , X86::VMOVAPDrr }, + { X86::VMOVAPDZ128rr_REV , X86::VMOVAPDrr_REV }, + { X86::VMOVAPSZ128mr , X86::VMOVAPSmr }, + { X86::VMOVAPSZ128rm , X86::VMOVAPSrm }, + { X86::VMOVAPSZ128rr , X86::VMOVAPSrr }, + { X86::VMOVAPSZ128rr_REV , X86::VMOVAPSrr_REV }, + { X86::VMOVDDUPZ128rm , X86::VMOVDDUPrm }, + { X86::VMOVDDUPZ128rr , X86::VMOVDDUPrr }, + { X86::VMOVDQA32Z128mr , X86::VMOVDQAmr }, + { X86::VMOVDQA32Z128rm , X86::VMOVDQArm }, + { X86::VMOVDQA32Z128rr , X86::VMOVDQArr }, + { X86::VMOVDQA32Z128rr_REV , X86::VMOVDQArr_REV }, + { X86::VMOVDQA64Z128mr , X86::VMOVDQAmr }, + { X86::VMOVDQA64Z128rm , X86::VMOVDQArm }, + { X86::VMOVDQA64Z128rr , X86::VMOVDQArr }, + { X86::VMOVDQA64Z128rr_REV , X86::VMOVDQArr_REV }, + { X86::VMOVDQU16Z128mr , X86::VMOVDQUmr }, + { X86::VMOVDQU16Z128rm , X86::VMOVDQUrm }, + { X86::VMOVDQU16Z128rr , X86::VMOVDQUrr }, + { X86::VMOVDQU16Z128rr_REV , X86::VMOVDQUrr_REV }, + { X86::VMOVDQU32Z128mr , X86::VMOVDQUmr }, + { X86::VMOVDQU32Z128rm , X86::VMOVDQUrm }, + { X86::VMOVDQU32Z128rr , X86::VMOVDQUrr }, + { X86::VMOVDQU32Z128rr_REV , X86::VMOVDQUrr_REV }, + { X86::VMOVDQU64Z128mr , X86::VMOVDQUmr }, + { X86::VMOVDQU64Z128rm , X86::VMOVDQUrm }, + { X86::VMOVDQU64Z128rr , X86::VMOVDQUrr }, + { X86::VMOVDQU64Z128rr_REV , X86::VMOVDQUrr_REV }, + { X86::VMOVDQU8Z128mr , X86::VMOVDQUmr }, + { X86::VMOVDQU8Z128rm , X86::VMOVDQUrm }, + { X86::VMOVDQU8Z128rr , X86::VMOVDQUrr }, + { X86::VMOVDQU8Z128rr_REV , X86::VMOVDQUrr_REV }, + { X86::VMOVHPDZ128mr , X86::VMOVHPDmr }, + { X86::VMOVHPDZ128rm , X86::VMOVHPDrm }, + { X86::VMOVHPSZ128mr , X86::VMOVHPSmr }, + { X86::VMOVHPSZ128rm , X86::VMOVHPSrm }, + { X86::VMOVLPDZ128mr , X86::VMOVLPDmr }, + { X86::VMOVLPDZ128rm , X86::VMOVLPDrm }, + { X86::VMOVLPSZ128mr , X86::VMOVLPSmr }, + { X86::VMOVLPSZ128rm , X86::VMOVLPSrm }, + { X86::VMOVNTDQAZ128rm , X86::VMOVNTDQArm }, + { X86::VMOVNTDQZ128mr , X86::VMOVNTDQmr }, + { X86::VMOVNTPDZ128mr , X86::VMOVNTPDmr }, + { X86::VMOVNTPSZ128mr , X86::VMOVNTPSmr }, + { X86::VMOVSHDUPZ128rm , X86::VMOVSHDUPrm }, + { X86::VMOVSHDUPZ128rr , X86::VMOVSHDUPrr }, + { X86::VMOVSLDUPZ128rm , X86::VMOVSLDUPrm }, + { X86::VMOVSLDUPZ128rr , X86::VMOVSLDUPrr }, + { X86::VMOVUPDZ128mr , X86::VMOVUPDmr }, + { X86::VMOVUPDZ128rm , X86::VMOVUPDrm }, + { X86::VMOVUPDZ128rr , X86::VMOVUPDrr }, + { X86::VMOVUPDZ128rr_REV , X86::VMOVUPDrr_REV }, + { X86::VMOVUPSZ128mr , X86::VMOVUPSmr }, + { X86::VMOVUPSZ128rm , X86::VMOVUPSrm }, + { X86::VMOVUPSZ128rr , X86::VMOVUPSrr }, + { X86::VMOVUPSZ128rr_REV , X86::VMOVUPSrr_REV }, + { X86::VMULPDZ128rm , X86::VMULPDrm }, + { X86::VMULPDZ128rr , X86::VMULPDrr }, + { X86::VMULPSZ128rm , X86::VMULPSrm }, + { X86::VMULPSZ128rr , X86::VMULPSrr }, + { X86::VORPDZ128rm , X86::VORPDrm }, + { X86::VORPDZ128rr , X86::VORPDrr }, + { X86::VORPSZ128rm , X86::VORPSrm }, + { X86::VORPSZ128rr , X86::VORPSrr }, + { X86::VPABSBZ128rm , X86::VPABSBrm }, + { X86::VPABSBZ128rr , X86::VPABSBrr }, + { X86::VPABSDZ128rm , X86::VPABSDrm }, + { X86::VPABSDZ128rr , X86::VPABSDrr }, + { X86::VPABSWZ128rm , X86::VPABSWrm }, + { X86::VPABSWZ128rr , X86::VPABSWrr }, + { X86::VPACKSSDWZ128rm , X86::VPACKSSDWrm }, + { X86::VPACKSSDWZ128rr , X86::VPACKSSDWrr }, + { X86::VPACKSSWBZ128rm , X86::VPACKSSWBrm }, + { X86::VPACKSSWBZ128rr , X86::VPACKSSWBrr }, + { 
X86::VPACKUSDWZ128rm , X86::VPACKUSDWrm }, + { X86::VPACKUSDWZ128rr , X86::VPACKUSDWrr }, + { X86::VPACKUSWBZ128rm , X86::VPACKUSWBrm }, + { X86::VPACKUSWBZ128rr , X86::VPACKUSWBrr }, + { X86::VPADDBZ128rm , X86::VPADDBrm }, + { X86::VPADDBZ128rr , X86::VPADDBrr }, + { X86::VPADDDZ128rm , X86::VPADDDrm }, + { X86::VPADDDZ128rr , X86::VPADDDrr }, + { X86::VPADDQZ128rm , X86::VPADDQrm }, + { X86::VPADDQZ128rr , X86::VPADDQrr }, + { X86::VPADDSBZ128rm , X86::VPADDSBrm }, + { X86::VPADDSBZ128rr , X86::VPADDSBrr }, + { X86::VPADDSWZ128rm , X86::VPADDSWrm }, + { X86::VPADDSWZ128rr , X86::VPADDSWrr }, + { X86::VPADDUSBZ128rm , X86::VPADDUSBrm }, + { X86::VPADDUSBZ128rr , X86::VPADDUSBrr }, + { X86::VPADDUSWZ128rm , X86::VPADDUSWrm }, + { X86::VPADDUSWZ128rr , X86::VPADDUSWrr }, + { X86::VPADDWZ128rm , X86::VPADDWrm }, + { X86::VPADDWZ128rr , X86::VPADDWrr }, + { X86::VPALIGNRZ128rmi , X86::VPALIGNRrmi }, + { X86::VPALIGNRZ128rri , X86::VPALIGNRrri }, + { X86::VPANDDZ128rm , X86::VPANDrm }, + { X86::VPANDDZ128rr , X86::VPANDrr }, + { X86::VPANDQZ128rm , X86::VPANDrm }, + { X86::VPANDQZ128rr , X86::VPANDrr }, + { X86::VPAVGBZ128rm , X86::VPAVGBrm }, + { X86::VPAVGBZ128rr , X86::VPAVGBrr }, + { X86::VPAVGWZ128rm , X86::VPAVGWrm }, + { X86::VPAVGWZ128rr , X86::VPAVGWrr }, + { X86::VPBROADCASTBZ128m , X86::VPBROADCASTBrm }, + { X86::VPBROADCASTBZ128r , X86::VPBROADCASTBrr }, + { X86::VPBROADCASTDZ128m , X86::VPBROADCASTDrm }, + { X86::VPBROADCASTDZ128r , X86::VPBROADCASTDrr }, + { X86::VPBROADCASTQZ128m , X86::VPBROADCASTQrm }, + { X86::VPBROADCASTQZ128r , X86::VPBROADCASTQrr }, + { X86::VPBROADCASTWZ128m , X86::VPBROADCASTWrm }, + { X86::VPBROADCASTWZ128r , X86::VPBROADCASTWrr }, + { X86::VPERMILPDZ128mi , X86::VPERMILPDmi }, + { X86::VPERMILPDZ128ri , X86::VPERMILPDri }, + { X86::VPERMILPDZ128rm , X86::VPERMILPDrm }, + { X86::VPERMILPDZ128rr , X86::VPERMILPDrr }, + { X86::VPERMILPSZ128mi , X86::VPERMILPSmi }, + { X86::VPERMILPSZ128ri , X86::VPERMILPSri }, + { X86::VPERMILPSZ128rm , X86::VPERMILPSrm }, + { X86::VPERMILPSZ128rr , X86::VPERMILPSrr }, + { X86::VPMADDUBSWZ128rm , X86::VPMADDUBSWrm }, + { X86::VPMADDUBSWZ128rr , X86::VPMADDUBSWrr }, + { X86::VPMADDWDZ128rm , X86::VPMADDWDrm }, + { X86::VPMADDWDZ128rr , X86::VPMADDWDrr }, + { X86::VPMAXSBZ128rm , X86::VPMAXSBrm }, + { X86::VPMAXSBZ128rr , X86::VPMAXSBrr }, + { X86::VPMAXSDZ128rm , X86::VPMAXSDrm }, + { X86::VPMAXSDZ128rr , X86::VPMAXSDrr }, + { X86::VPMAXSWZ128rm , X86::VPMAXSWrm }, + { X86::VPMAXSWZ128rr , X86::VPMAXSWrr }, + { X86::VPMAXUBZ128rm , X86::VPMAXUBrm }, + { X86::VPMAXUBZ128rr , X86::VPMAXUBrr }, + { X86::VPMAXUDZ128rm , X86::VPMAXUDrm }, + { X86::VPMAXUDZ128rr , X86::VPMAXUDrr }, + { X86::VPMAXUWZ128rm , X86::VPMAXUWrm }, + { X86::VPMAXUWZ128rr , X86::VPMAXUWrr }, + { X86::VPMINSBZ128rm , X86::VPMINSBrm }, + { X86::VPMINSBZ128rr , X86::VPMINSBrr }, + { X86::VPMINSDZ128rm , X86::VPMINSDrm }, + { X86::VPMINSDZ128rr , X86::VPMINSDrr }, + { X86::VPMINSWZ128rm , X86::VPMINSWrm }, + { X86::VPMINSWZ128rr , X86::VPMINSWrr }, + { X86::VPMINUBZ128rm , X86::VPMINUBrm }, + { X86::VPMINUBZ128rr , X86::VPMINUBrr }, + { X86::VPMINUDZ128rm , X86::VPMINUDrm }, + { X86::VPMINUDZ128rr , X86::VPMINUDrr }, + { X86::VPMINUWZ128rm , X86::VPMINUWrm }, + { X86::VPMINUWZ128rr , X86::VPMINUWrr }, + { X86::VPMOVSXBDZ128rm , X86::VPMOVSXBDrm }, + { X86::VPMOVSXBDZ128rr , X86::VPMOVSXBDrr }, + { X86::VPMOVSXBQZ128rm , X86::VPMOVSXBQrm }, + { X86::VPMOVSXBQZ128rr , X86::VPMOVSXBQrr }, + { X86::VPMOVSXBWZ128rm , X86::VPMOVSXBWrm }, + { X86::VPMOVSXBWZ128rr 
, X86::VPMOVSXBWrr }, + { X86::VPMOVSXDQZ128rm , X86::VPMOVSXDQrm }, + { X86::VPMOVSXDQZ128rr , X86::VPMOVSXDQrr }, + { X86::VPMOVSXWDZ128rm , X86::VPMOVSXWDrm }, + { X86::VPMOVSXWDZ128rr , X86::VPMOVSXWDrr }, + { X86::VPMOVSXWQZ128rm , X86::VPMOVSXWQrm }, + { X86::VPMOVSXWQZ128rr , X86::VPMOVSXWQrr }, + { X86::VPMOVZXBDZ128rm , X86::VPMOVZXBDrm }, + { X86::VPMOVZXBDZ128rr , X86::VPMOVZXBDrr }, + { X86::VPMOVZXBQZ128rm , X86::VPMOVZXBQrm }, + { X86::VPMOVZXBQZ128rr , X86::VPMOVZXBQrr }, + { X86::VPMOVZXBWZ128rm , X86::VPMOVZXBWrm }, + { X86::VPMOVZXBWZ128rr , X86::VPMOVZXBWrr }, + { X86::VPMOVZXDQZ128rm , X86::VPMOVZXDQrm }, + { X86::VPMOVZXDQZ128rr , X86::VPMOVZXDQrr }, + { X86::VPMOVZXWDZ128rm , X86::VPMOVZXWDrm }, + { X86::VPMOVZXWDZ128rr , X86::VPMOVZXWDrr }, + { X86::VPMOVZXWQZ128rm , X86::VPMOVZXWQrm }, + { X86::VPMOVZXWQZ128rr , X86::VPMOVZXWQrr }, + { X86::VPMULDQZ128rm , X86::VPMULDQrm }, + { X86::VPMULDQZ128rr , X86::VPMULDQrr }, + { X86::VPMULHRSWZ128rm , X86::VPMULHRSWrm }, + { X86::VPMULHRSWZ128rr , X86::VPMULHRSWrr }, + { X86::VPMULHUWZ128rm , X86::VPMULHUWrm }, + { X86::VPMULHUWZ128rr , X86::VPMULHUWrr }, + { X86::VPMULHWZ128rm , X86::VPMULHWrm }, + { X86::VPMULHWZ128rr , X86::VPMULHWrr }, + { X86::VPMULLDZ128rm , X86::VPMULLDrm }, + { X86::VPMULLDZ128rr , X86::VPMULLDrr }, + { X86::VPMULLWZ128rm , X86::VPMULLWrm }, + { X86::VPMULLWZ128rr , X86::VPMULLWrr }, + { X86::VPMULUDQZ128rm , X86::VPMULUDQrm }, + { X86::VPMULUDQZ128rr , X86::VPMULUDQrr }, + { X86::VPORDZ128rm , X86::VPORrm }, + { X86::VPORDZ128rr , X86::VPORrr }, + { X86::VPORQZ128rm , X86::VPORrm }, + { X86::VPORQZ128rr , X86::VPORrr }, + { X86::VPSADBWZ128rm , X86::VPSADBWrm }, + { X86::VPSADBWZ128rr , X86::VPSADBWrr }, + { X86::VPSHUFBZ128rm , X86::VPSHUFBrm }, + { X86::VPSHUFBZ128rr , X86::VPSHUFBrr }, + { X86::VPSHUFDZ128mi , X86::VPSHUFDmi }, + { X86::VPSHUFDZ128ri , X86::VPSHUFDri }, + { X86::VPSHUFHWZ128mi , X86::VPSHUFHWmi }, + { X86::VPSHUFHWZ128ri , X86::VPSHUFHWri }, + { X86::VPSHUFLWZ128mi , X86::VPSHUFLWmi }, + { X86::VPSHUFLWZ128ri , X86::VPSHUFLWri }, + { X86::VPSLLDQZ128rr , X86::VPSLLDQri }, + { X86::VPSLLDZ128ri , X86::VPSLLDri }, + { X86::VPSLLDZ128rm , X86::VPSLLDrm }, + { X86::VPSLLDZ128rr , X86::VPSLLDrr }, + { X86::VPSLLQZ128ri , X86::VPSLLQri }, + { X86::VPSLLQZ128rm , X86::VPSLLQrm }, + { X86::VPSLLQZ128rr , X86::VPSLLQrr }, + { X86::VPSLLVDZ128rm , X86::VPSLLVDrm }, + { X86::VPSLLVDZ128rr , X86::VPSLLVDrr }, + { X86::VPSLLVQZ128rm , X86::VPSLLVQrm }, + { X86::VPSLLVQZ128rr , X86::VPSLLVQrr }, + { X86::VPSLLWZ128ri , X86::VPSLLWri }, + { X86::VPSLLWZ128rm , X86::VPSLLWrm }, + { X86::VPSLLWZ128rr , X86::VPSLLWrr }, + { X86::VPSRADZ128ri , X86::VPSRADri }, + { X86::VPSRADZ128rm , X86::VPSRADrm }, + { X86::VPSRADZ128rr , X86::VPSRADrr }, + { X86::VPSRAVDZ128rm , X86::VPSRAVDrm }, + { X86::VPSRAVDZ128rr , X86::VPSRAVDrr }, + { X86::VPSRAWZ128ri , X86::VPSRAWri }, + { X86::VPSRAWZ128rm , X86::VPSRAWrm }, + { X86::VPSRAWZ128rr , X86::VPSRAWrr }, + { X86::VPSRLDQZ128rr , X86::VPSRLDQri }, + { X86::VPSRLDZ128ri , X86::VPSRLDri }, + { X86::VPSRLDZ128rm , X86::VPSRLDrm }, + { X86::VPSRLDZ128rr , X86::VPSRLDrr }, + { X86::VPSRLQZ128ri , X86::VPSRLQri }, + { X86::VPSRLQZ128rm , X86::VPSRLQrm }, + { X86::VPSRLQZ128rr , X86::VPSRLQrr }, + { X86::VPSRLVDZ128rm , X86::VPSRLVDrm }, + { X86::VPSRLVDZ128rr , X86::VPSRLVDrr }, + { X86::VPSRLVQZ128rm , X86::VPSRLVQrm }, + { X86::VPSRLVQZ128rr , X86::VPSRLVQrr }, + { X86::VPSRLWZ128ri , X86::VPSRLWri }, + { X86::VPSRLWZ128rm , X86::VPSRLWrm }, + { 
X86::VPSRLWZ128rr , X86::VPSRLWrr }, + { X86::VPSUBBZ128rm , X86::VPSUBBrm }, + { X86::VPSUBBZ128rr , X86::VPSUBBrr }, + { X86::VPSUBDZ128rm , X86::VPSUBDrm }, + { X86::VPSUBDZ128rr , X86::VPSUBDrr }, + { X86::VPSUBQZ128rm , X86::VPSUBQrm }, + { X86::VPSUBQZ128rr , X86::VPSUBQrr }, + { X86::VPSUBSBZ128rm , X86::VPSUBSBrm }, + { X86::VPSUBSBZ128rr , X86::VPSUBSBrr }, + { X86::VPSUBSWZ128rm , X86::VPSUBSWrm }, + { X86::VPSUBSWZ128rr , X86::VPSUBSWrr }, + { X86::VPSUBUSBZ128rm , X86::VPSUBUSBrm }, + { X86::VPSUBUSBZ128rr , X86::VPSUBUSBrr }, + { X86::VPSUBUSWZ128rm , X86::VPSUBUSWrm }, + { X86::VPSUBUSWZ128rr , X86::VPSUBUSWrr }, + { X86::VPSUBWZ128rm , X86::VPSUBWrm }, + { X86::VPSUBWZ128rr , X86::VPSUBWrr }, + { X86::VPUNPCKHBWZ128rm , X86::VPUNPCKHBWrm }, + { X86::VPUNPCKHBWZ128rr , X86::VPUNPCKHBWrr }, + { X86::VPUNPCKHDQZ128rm , X86::VPUNPCKHDQrm }, + { X86::VPUNPCKHDQZ128rr , X86::VPUNPCKHDQrr }, + { X86::VPUNPCKHQDQZ128rm , X86::VPUNPCKHQDQrm }, + { X86::VPUNPCKHQDQZ128rr , X86::VPUNPCKHQDQrr }, + { X86::VPUNPCKHWDZ128rm , X86::VPUNPCKHWDrm }, + { X86::VPUNPCKHWDZ128rr , X86::VPUNPCKHWDrr }, + { X86::VPUNPCKLBWZ128rm , X86::VPUNPCKLBWrm }, + { X86::VPUNPCKLBWZ128rr , X86::VPUNPCKLBWrr }, + { X86::VPUNPCKLDQZ128rm , X86::VPUNPCKLDQrm }, + { X86::VPUNPCKLDQZ128rr , X86::VPUNPCKLDQrr }, + { X86::VPUNPCKLQDQZ128rm , X86::VPUNPCKLQDQrm }, + { X86::VPUNPCKLQDQZ128rr , X86::VPUNPCKLQDQrr }, + { X86::VPUNPCKLWDZ128rm , X86::VPUNPCKLWDrm }, + { X86::VPUNPCKLWDZ128rr , X86::VPUNPCKLWDrr }, + { X86::VPXORDZ128rm , X86::VPXORrm }, + { X86::VPXORDZ128rr , X86::VPXORrr }, + { X86::VPXORQZ128rm , X86::VPXORrm }, + { X86::VPXORQZ128rr , X86::VPXORrr }, + { X86::VSHUFPDZ128rmi , X86::VSHUFPDrmi }, + { X86::VSHUFPDZ128rri , X86::VSHUFPDrri }, + { X86::VSHUFPSZ128rmi , X86::VSHUFPSrmi }, + { X86::VSHUFPSZ128rri , X86::VSHUFPSrri }, + { X86::VSQRTPDZ128m , X86::VSQRTPDm }, + { X86::VSQRTPDZ128r , X86::VSQRTPDr }, + { X86::VSQRTPSZ128m , X86::VSQRTPSm }, + { X86::VSQRTPSZ128r , X86::VSQRTPSr }, + { X86::VSUBPDZ128rm , X86::VSUBPDrm }, + { X86::VSUBPDZ128rr , X86::VSUBPDrr }, + { X86::VSUBPSZ128rm , X86::VSUBPSrm }, + { X86::VSUBPSZ128rr , X86::VSUBPSrr }, + { X86::VUNPCKHPDZ128rm , X86::VUNPCKHPDrm }, + { X86::VUNPCKHPDZ128rr , X86::VUNPCKHPDrr }, + { X86::VUNPCKHPSZ128rm , X86::VUNPCKHPSrm }, + { X86::VUNPCKHPSZ128rr , X86::VUNPCKHPSrr }, + { X86::VUNPCKLPDZ128rm , X86::VUNPCKLPDrm }, + { X86::VUNPCKLPDZ128rr , X86::VUNPCKLPDrr }, + { X86::VUNPCKLPSZ128rm , X86::VUNPCKLPSrm }, + { X86::VUNPCKLPSZ128rr , X86::VUNPCKLPSrr }, + { X86::VXORPDZ128rm , X86::VXORPDrm }, + { X86::VXORPDZ128rr , X86::VXORPDrr }, + { X86::VXORPSZ128rm , X86::VXORPSrm }, + { X86::VXORPSZ128rr , X86::VXORPSrr }, +}; + + +// X86 EVEX encoded instructions that have a VEX 256 encoding +// (table format: <EVEX opcode, VEX-256 opcode>). 
+ static const X86EvexToVexCompressTableEntry X86EvexToVex256CompressTable[] = { + { X86::VADDPDZ256rm , X86::VADDPDYrm }, + { X86::VADDPDZ256rr , X86::VADDPDYrr }, + { X86::VADDPSZ256rm , X86::VADDPSYrm }, + { X86::VADDPSZ256rr , X86::VADDPSYrr }, + { X86::VANDNPDZ256rm , X86::VANDNPDYrm }, + { X86::VANDNPDZ256rr , X86::VANDNPDYrr }, + { X86::VANDNPSZ256rm , X86::VANDNPSYrm }, + { X86::VANDNPSZ256rr , X86::VANDNPSYrr }, + { X86::VANDPDZ256rm , X86::VANDPDYrm }, + { X86::VANDPDZ256rr , X86::VANDPDYrr }, + { X86::VANDPSZ256rm , X86::VANDPSYrm }, + { X86::VANDPSZ256rr , X86::VANDPSYrr }, + { X86::VBROADCASTSDZ256m , X86::VBROADCASTSDYrm }, + { X86::VBROADCASTSDZ256r , X86::VBROADCASTSDYrr }, + { X86::VBROADCASTSDZ256r_s , X86::VBROADCASTSDYrr }, + { X86::VBROADCASTSSZ256m , X86::VBROADCASTSSYrm }, + { X86::VBROADCASTSSZ256r , X86::VBROADCASTSSYrr }, + { X86::VBROADCASTSSZ256r_s , X86::VBROADCASTSSYrr }, + { X86::VCVTDQ2PDZ256rm , X86::VCVTDQ2PDYrm }, + { X86::VCVTDQ2PDZ256rr , X86::VCVTDQ2PDYrr }, + { X86::VCVTDQ2PSZ256rm , X86::VCVTDQ2PSYrm }, + { X86::VCVTDQ2PSZ256rr , X86::VCVTDQ2PSYrr }, + { X86::VCVTPD2DQZ256rm , X86::VCVTPD2DQYrm }, + { X86::VCVTPD2DQZ256rr , X86::VCVTPD2DQYrr }, + { X86::VCVTPD2PSZ256rm , X86::VCVTPD2PSYrm }, + { X86::VCVTPD2PSZ256rr , X86::VCVTPD2PSYrr }, + { X86::VCVTPH2PSZ256rm , X86::VCVTPH2PSYrm }, + { X86::VCVTPH2PSZ256rr , X86::VCVTPH2PSYrr }, + { X86::VCVTPS2DQZ256rm , X86::VCVTPS2DQYrm }, + { X86::VCVTPS2DQZ256rr , X86::VCVTPS2DQYrr }, + { X86::VCVTPS2PDZ256rm , X86::VCVTPS2PDYrm }, + { X86::VCVTPS2PDZ256rr , X86::VCVTPS2PDYrr }, + { X86::VCVTPS2PHZ256mr , X86::VCVTPS2PHYmr }, + { X86::VCVTPS2PHZ256rr , X86::VCVTPS2PHYrr }, + { X86::VCVTTPD2DQZ256rm , X86::VCVTTPD2DQYrm }, + { X86::VCVTTPD2DQZ256rr , X86::VCVTTPD2DQYrr }, + { X86::VCVTTPS2DQZ256rm , X86::VCVTTPS2DQYrm }, + { X86::VCVTTPS2DQZ256rr , X86::VCVTTPS2DQYrr }, + { X86::VDIVPDZ256rm , X86::VDIVPDYrm }, + { X86::VDIVPDZ256rr , X86::VDIVPDYrr }, + { X86::VDIVPSZ256rm , X86::VDIVPSYrm }, + { X86::VDIVPSZ256rr , X86::VDIVPSYrr }, + { X86::VEXTRACTF32x4Z256mr , X86::VEXTRACTF128mr }, + { X86::VEXTRACTF64x2Z256mr , X86::VEXTRACTF128mr }, + { X86::VEXTRACTF32x4Z256rr , X86::VEXTRACTF128rr }, + { X86::VEXTRACTF64x2Z256rr , X86::VEXTRACTF128rr }, + { X86::VEXTRACTI32x4Z256mr , X86::VEXTRACTI128mr }, + { X86::VEXTRACTI64x2Z256mr , X86::VEXTRACTI128mr }, + { X86::VEXTRACTI32x4Z256rr , X86::VEXTRACTI128rr }, + { X86::VEXTRACTI64x2Z256rr , X86::VEXTRACTI128rr }, + { X86::VFMADD132PDZ256m , X86::VFMADD132PDYm }, + { X86::VFMADD132PDZ256r , X86::VFMADD132PDYr }, + { X86::VFMADD132PSZ256m , X86::VFMADD132PSYm }, + { X86::VFMADD132PSZ256r , X86::VFMADD132PSYr }, + { X86::VFMADD213PDZ256m , X86::VFMADD213PDYm }, + { X86::VFMADD213PDZ256r , X86::VFMADD213PDYr }, + { X86::VFMADD213PSZ256m , X86::VFMADD213PSYm }, + { X86::VFMADD213PSZ256r , X86::VFMADD213PSYr }, + { X86::VFMADD231PDZ256m , X86::VFMADD231PDYm }, + { X86::VFMADD231PDZ256r , X86::VFMADD231PDYr }, + { X86::VFMADD231PSZ256m , X86::VFMADD231PSYm }, + { X86::VFMADD231PSZ256r , X86::VFMADD231PSYr }, + { X86::VFMADDSUB132PDZ256m , X86::VFMADDSUB132PDYm }, + { X86::VFMADDSUB132PDZ256r , X86::VFMADDSUB132PDYr }, + { X86::VFMADDSUB132PSZ256m , X86::VFMADDSUB132PSYm }, + { X86::VFMADDSUB132PSZ256r , X86::VFMADDSUB132PSYr }, + { X86::VFMADDSUB213PDZ256m , X86::VFMADDSUB213PDYm }, + { X86::VFMADDSUB213PDZ256r , X86::VFMADDSUB213PDYr }, + { X86::VFMADDSUB213PSZ256m , X86::VFMADDSUB213PSYm }, + { X86::VFMADDSUB213PSZ256r , X86::VFMADDSUB213PSYr }, + { 
X86::VFMADDSUB231PDZ256m , X86::VFMADDSUB231PDYm }, + { X86::VFMADDSUB231PDZ256r , X86::VFMADDSUB231PDYr }, + { X86::VFMADDSUB231PSZ256m , X86::VFMADDSUB231PSYm }, + { X86::VFMADDSUB231PSZ256r , X86::VFMADDSUB231PSYr }, + { X86::VFMSUB132PDZ256m , X86::VFMSUB132PDYm }, + { X86::VFMSUB132PDZ256r , X86::VFMSUB132PDYr }, + { X86::VFMSUB132PSZ256m , X86::VFMSUB132PSYm }, + { X86::VFMSUB132PSZ256r , X86::VFMSUB132PSYr }, + { X86::VFMSUB213PDZ256m , X86::VFMSUB213PDYm }, + { X86::VFMSUB213PDZ256r , X86::VFMSUB213PDYr }, + { X86::VFMSUB213PSZ256m , X86::VFMSUB213PSYm }, + { X86::VFMSUB213PSZ256r , X86::VFMSUB213PSYr }, + { X86::VFMSUB231PDZ256m , X86::VFMSUB231PDYm }, + { X86::VFMSUB231PDZ256r , X86::VFMSUB231PDYr }, + { X86::VFMSUB231PSZ256m , X86::VFMSUB231PSYm }, + { X86::VFMSUB231PSZ256r , X86::VFMSUB231PSYr }, + { X86::VFMSUBADD132PDZ256m , X86::VFMSUBADD132PDYm }, + { X86::VFMSUBADD132PDZ256r , X86::VFMSUBADD132PDYr }, + { X86::VFMSUBADD132PSZ256m , X86::VFMSUBADD132PSYm }, + { X86::VFMSUBADD132PSZ256r , X86::VFMSUBADD132PSYr }, + { X86::VFMSUBADD213PDZ256m , X86::VFMSUBADD213PDYm }, + { X86::VFMSUBADD213PDZ256r , X86::VFMSUBADD213PDYr }, + { X86::VFMSUBADD213PSZ256m , X86::VFMSUBADD213PSYm }, + { X86::VFMSUBADD213PSZ256r , X86::VFMSUBADD213PSYr }, + { X86::VFMSUBADD231PDZ256m , X86::VFMSUBADD231PDYm }, + { X86::VFMSUBADD231PDZ256r , X86::VFMSUBADD231PDYr }, + { X86::VFMSUBADD231PSZ256m , X86::VFMSUBADD231PSYm }, + { X86::VFMSUBADD231PSZ256r , X86::VFMSUBADD231PSYr }, + { X86::VFNMADD132PDZ256m , X86::VFNMADD132PDYm }, + { X86::VFNMADD132PDZ256r , X86::VFNMADD132PDYr }, + { X86::VFNMADD132PSZ256m , X86::VFNMADD132PSYm }, + { X86::VFNMADD132PSZ256r , X86::VFNMADD132PSYr }, + { X86::VFNMADD213PDZ256m , X86::VFNMADD213PDYm }, + { X86::VFNMADD213PDZ256r , X86::VFNMADD213PDYr }, + { X86::VFNMADD213PSZ256m , X86::VFNMADD213PSYm }, + { X86::VFNMADD213PSZ256r , X86::VFNMADD213PSYr }, + { X86::VFNMADD231PDZ256m , X86::VFNMADD231PDYm }, + { X86::VFNMADD231PDZ256r , X86::VFNMADD231PDYr }, + { X86::VFNMADD231PSZ256m , X86::VFNMADD231PSYm }, + { X86::VFNMADD231PSZ256r , X86::VFNMADD231PSYr }, + { X86::VFNMSUB132PDZ256m , X86::VFNMSUB132PDYm }, + { X86::VFNMSUB132PDZ256r , X86::VFNMSUB132PDYr }, + { X86::VFNMSUB132PSZ256m , X86::VFNMSUB132PSYm }, + { X86::VFNMSUB132PSZ256r , X86::VFNMSUB132PSYr }, + { X86::VFNMSUB213PDZ256m , X86::VFNMSUB213PDYm }, + { X86::VFNMSUB213PDZ256r , X86::VFNMSUB213PDYr }, + { X86::VFNMSUB213PSZ256m , X86::VFNMSUB213PSYm }, + { X86::VFNMSUB213PSZ256r , X86::VFNMSUB213PSYr }, + { X86::VFNMSUB231PDZ256m , X86::VFNMSUB231PDYm }, + { X86::VFNMSUB231PDZ256r , X86::VFNMSUB231PDYr }, + { X86::VFNMSUB231PSZ256m , X86::VFNMSUB231PSYm }, + { X86::VFNMSUB231PSZ256r , X86::VFNMSUB231PSYr }, + { X86::VINSERTF32x4Z256rm , X86::VINSERTF128rm }, + { X86::VINSERTF64x2Z256rm , X86::VINSERTF128rm }, + { X86::VINSERTF32x4Z256rr , X86::VINSERTF128rr }, + { X86::VINSERTF64x2Z256rr , X86::VINSERTF128rr }, + { X86::VINSERTI32x4Z256rm , X86::VINSERTI128rm }, + { X86::VINSERTI64x2Z256rm , X86::VINSERTI128rm }, + { X86::VINSERTI32x4Z256rr , X86::VINSERTI128rr }, + { X86::VINSERTI64x2Z256rr , X86::VINSERTI128rr }, + { X86::VMAXCPDZ256rm , X86::VMAXCPDYrm }, + { X86::VMAXCPDZ256rr , X86::VMAXCPDYrr }, + { X86::VMAXCPSZ256rm , X86::VMAXCPSYrm }, + { X86::VMAXCPSZ256rr , X86::VMAXCPSYrr }, + { X86::VMAXPDZ256rm , X86::VMAXPDYrm }, + { X86::VMAXPDZ256rr , X86::VMAXPDYrr }, + { X86::VMAXPSZ256rm , X86::VMAXPSYrm }, + { X86::VMAXPSZ256rr , X86::VMAXPSYrr }, + { X86::VMINCPDZ256rm , X86::VMINCPDYrm }, + { 
X86::VMINCPDZ256rr , X86::VMINCPDYrr }, + { X86::VMINCPSZ256rm , X86::VMINCPSYrm }, + { X86::VMINCPSZ256rr , X86::VMINCPSYrr }, + { X86::VMINPDZ256rm , X86::VMINPDYrm }, + { X86::VMINPDZ256rr , X86::VMINPDYrr }, + { X86::VMINPSZ256rm , X86::VMINPSYrm }, + { X86::VMINPSZ256rr , X86::VMINPSYrr }, + { X86::VMOVAPDZ256mr , X86::VMOVAPDYmr }, + { X86::VMOVAPDZ256rm , X86::VMOVAPDYrm }, + { X86::VMOVAPDZ256rr , X86::VMOVAPDYrr }, + { X86::VMOVAPDZ256rr_REV , X86::VMOVAPDYrr_REV }, + { X86::VMOVAPSZ256mr , X86::VMOVAPSYmr }, + { X86::VMOVAPSZ256rm , X86::VMOVAPSYrm }, + { X86::VMOVAPSZ256rr , X86::VMOVAPSYrr }, + { X86::VMOVAPSZ256rr_REV , X86::VMOVAPSYrr_REV }, + { X86::VMOVDDUPZ256rm , X86::VMOVDDUPYrm }, + { X86::VMOVDDUPZ256rr , X86::VMOVDDUPYrr }, + { X86::VMOVDQA32Z256mr , X86::VMOVDQAYmr }, + { X86::VMOVDQA32Z256rm , X86::VMOVDQAYrm }, + { X86::VMOVDQA32Z256rr , X86::VMOVDQAYrr }, + { X86::VMOVDQA32Z256rr_REV , X86::VMOVDQAYrr_REV }, + { X86::VMOVDQA64Z256mr , X86::VMOVDQAYmr }, + { X86::VMOVDQA64Z256rm , X86::VMOVDQAYrm }, + { X86::VMOVDQA64Z256rr , X86::VMOVDQAYrr }, + { X86::VMOVDQA64Z256rr_REV , X86::VMOVDQAYrr_REV }, + { X86::VMOVDQU16Z256mr , X86::VMOVDQUYmr }, + { X86::VMOVDQU16Z256rm , X86::VMOVDQUYrm }, + { X86::VMOVDQU16Z256rr , X86::VMOVDQUYrr }, + { X86::VMOVDQU16Z256rr_REV , X86::VMOVDQUYrr_REV }, + { X86::VMOVDQU32Z256mr , X86::VMOVDQUYmr }, + { X86::VMOVDQU32Z256rm , X86::VMOVDQUYrm }, + { X86::VMOVDQU32Z256rr , X86::VMOVDQUYrr }, + { X86::VMOVDQU32Z256rr_REV , X86::VMOVDQUYrr_REV }, + { X86::VMOVDQU64Z256mr , X86::VMOVDQUYmr }, + { X86::VMOVDQU64Z256rm , X86::VMOVDQUYrm }, + { X86::VMOVDQU64Z256rr , X86::VMOVDQUYrr }, + { X86::VMOVDQU64Z256rr_REV , X86::VMOVDQUYrr_REV }, + { X86::VMOVDQU8Z256mr , X86::VMOVDQUYmr }, + { X86::VMOVDQU8Z256rm , X86::VMOVDQUYrm }, + { X86::VMOVDQU8Z256rr , X86::VMOVDQUYrr }, + { X86::VMOVDQU8Z256rr_REV , X86::VMOVDQUYrr_REV }, + { X86::VMOVNTDQAZ256rm , X86::VMOVNTDQAYrm }, + { X86::VMOVNTDQZ256mr , X86::VMOVNTDQYmr }, + { X86::VMOVNTPDZ256mr , X86::VMOVNTPDYmr }, + { X86::VMOVNTPSZ256mr , X86::VMOVNTPSYmr }, + { X86::VMOVSHDUPZ256rm , X86::VMOVSHDUPYrm }, + { X86::VMOVSHDUPZ256rr , X86::VMOVSHDUPYrr }, + { X86::VMOVSLDUPZ256rm , X86::VMOVSLDUPYrm }, + { X86::VMOVSLDUPZ256rr , X86::VMOVSLDUPYrr }, + { X86::VMOVUPDZ256mr , X86::VMOVUPDYmr }, + { X86::VMOVUPDZ256rm , X86::VMOVUPDYrm }, + { X86::VMOVUPDZ256rr , X86::VMOVUPDYrr }, + { X86::VMOVUPDZ256rr_REV , X86::VMOVUPDYrr_REV }, + { X86::VMOVUPSZ256mr , X86::VMOVUPSYmr }, + { X86::VMOVUPSZ256rm , X86::VMOVUPSYrm }, + { X86::VMOVUPSZ256rr , X86::VMOVUPSYrr }, + { X86::VMOVUPSZ256rr_REV , X86::VMOVUPSYrr_REV }, + { X86::VMULPDZ256rm , X86::VMULPDYrm }, + { X86::VMULPDZ256rr , X86::VMULPDYrr }, + { X86::VMULPSZ256rm , X86::VMULPSYrm }, + { X86::VMULPSZ256rr , X86::VMULPSYrr }, + { X86::VORPDZ256rm , X86::VORPDYrm }, + { X86::VORPDZ256rr , X86::VORPDYrr }, + { X86::VORPSZ256rm , X86::VORPSYrm }, + { X86::VORPSZ256rr , X86::VORPSYrr }, + { X86::VPABSBZ256rm , X86::VPABSBYrm }, + { X86::VPABSBZ256rr , X86::VPABSBYrr }, + { X86::VPABSDZ256rm , X86::VPABSDYrm }, + { X86::VPABSDZ256rr , X86::VPABSDYrr }, + { X86::VPABSWZ256rm , X86::VPABSWYrm }, + { X86::VPABSWZ256rr , X86::VPABSWYrr }, + { X86::VPACKSSDWZ256rm , X86::VPACKSSDWYrm }, + { X86::VPACKSSDWZ256rr , X86::VPACKSSDWYrr }, + { X86::VPACKSSWBZ256rm , X86::VPACKSSWBYrm }, + { X86::VPACKSSWBZ256rr , X86::VPACKSSWBYrr }, + { X86::VPACKUSDWZ256rm , X86::VPACKUSDWYrm }, + { X86::VPACKUSDWZ256rr , X86::VPACKUSDWYrr }, + { X86::VPACKUSWBZ256rm , 
X86::VPACKUSWBYrm }, + { X86::VPACKUSWBZ256rr , X86::VPACKUSWBYrr }, + { X86::VPADDBZ256rm , X86::VPADDBYrm }, + { X86::VPADDBZ256rr , X86::VPADDBYrr }, + { X86::VPADDDZ256rm , X86::VPADDDYrm }, + { X86::VPADDDZ256rr , X86::VPADDDYrr }, + { X86::VPADDQZ256rm , X86::VPADDQYrm }, + { X86::VPADDQZ256rr , X86::VPADDQYrr }, + { X86::VPADDSBZ256rm , X86::VPADDSBYrm }, + { X86::VPADDSBZ256rr , X86::VPADDSBYrr }, + { X86::VPADDSWZ256rm , X86::VPADDSWYrm }, + { X86::VPADDSWZ256rr , X86::VPADDSWYrr }, + { X86::VPADDUSBZ256rm , X86::VPADDUSBYrm }, + { X86::VPADDUSBZ256rr , X86::VPADDUSBYrr }, + { X86::VPADDUSWZ256rm , X86::VPADDUSWYrm }, + { X86::VPADDUSWZ256rr , X86::VPADDUSWYrr }, + { X86::VPADDWZ256rm , X86::VPADDWYrm }, + { X86::VPADDWZ256rr , X86::VPADDWYrr }, + { X86::VPALIGNRZ256rmi , X86::VPALIGNRYrmi }, + { X86::VPALIGNRZ256rri , X86::VPALIGNRYrri }, + { X86::VPANDDZ256rm , X86::VPANDYrm }, + { X86::VPANDDZ256rr , X86::VPANDYrr }, + { X86::VPANDQZ256rm , X86::VPANDYrm }, + { X86::VPANDQZ256rr , X86::VPANDYrr }, + { X86::VPAVGBZ256rm , X86::VPAVGBYrm }, + { X86::VPAVGBZ256rr , X86::VPAVGBYrr }, + { X86::VPAVGWZ256rm , X86::VPAVGWYrm }, + { X86::VPAVGWZ256rr , X86::VPAVGWYrr }, + { X86::VPBROADCASTBZ256m , X86::VPBROADCASTBYrm }, + { X86::VPBROADCASTBZ256r , X86::VPBROADCASTBYrr }, + { X86::VPBROADCASTDZ256m , X86::VPBROADCASTDYrm }, + { X86::VPBROADCASTDZ256r , X86::VPBROADCASTDYrr }, + { X86::VPBROADCASTQZ256m , X86::VPBROADCASTQYrm }, + { X86::VPBROADCASTQZ256r , X86::VPBROADCASTQYrr }, + { X86::VPBROADCASTWZ256m , X86::VPBROADCASTWYrm }, + { X86::VPBROADCASTWZ256r , X86::VPBROADCASTWYrr }, + { X86::VPERMDZ256rm , X86::VPERMDYrm }, + { X86::VPERMDZ256rr , X86::VPERMDYrr }, + { X86::VPERMILPDZ256mi , X86::VPERMILPDYmi }, + { X86::VPERMILPDZ256ri , X86::VPERMILPDYri }, + { X86::VPERMILPDZ256rm , X86::VPERMILPDYrm }, + { X86::VPERMILPDZ256rr , X86::VPERMILPDYrr }, + { X86::VPERMILPSZ256mi , X86::VPERMILPSYmi }, + { X86::VPERMILPSZ256ri , X86::VPERMILPSYri }, + { X86::VPERMILPSZ256rm , X86::VPERMILPSYrm }, + { X86::VPERMILPSZ256rr , X86::VPERMILPSYrr }, + { X86::VPERMPDZ256mi , X86::VPERMPDYmi }, + { X86::VPERMPDZ256ri , X86::VPERMPDYri }, + { X86::VPERMPSZ256rm , X86::VPERMPSYrm }, + { X86::VPERMPSZ256rr , X86::VPERMPSYrr }, + { X86::VPERMQZ256mi , X86::VPERMQYmi }, + { X86::VPERMQZ256ri , X86::VPERMQYri }, + { X86::VPMADDUBSWZ256rm , X86::VPMADDUBSWYrm }, + { X86::VPMADDUBSWZ256rr , X86::VPMADDUBSWYrr }, + { X86::VPMADDWDZ256rm , X86::VPMADDWDYrm }, + { X86::VPMADDWDZ256rr , X86::VPMADDWDYrr }, + { X86::VPMAXSBZ256rm , X86::VPMAXSBYrm }, + { X86::VPMAXSBZ256rr , X86::VPMAXSBYrr }, + { X86::VPMAXSDZ256rm , X86::VPMAXSDYrm }, + { X86::VPMAXSDZ256rr , X86::VPMAXSDYrr }, + { X86::VPMAXSWZ256rm , X86::VPMAXSWYrm }, + { X86::VPMAXSWZ256rr , X86::VPMAXSWYrr }, + { X86::VPMAXUBZ256rm , X86::VPMAXUBYrm }, + { X86::VPMAXUBZ256rr , X86::VPMAXUBYrr }, + { X86::VPMAXUDZ256rm , X86::VPMAXUDYrm }, + { X86::VPMAXUDZ256rr , X86::VPMAXUDYrr }, + { X86::VPMAXUWZ256rm , X86::VPMAXUWYrm }, + { X86::VPMAXUWZ256rr , X86::VPMAXUWYrr }, + { X86::VPMINSBZ256rm , X86::VPMINSBYrm }, + { X86::VPMINSBZ256rr , X86::VPMINSBYrr }, + { X86::VPMINSDZ256rm , X86::VPMINSDYrm }, + { X86::VPMINSDZ256rr , X86::VPMINSDYrr }, + { X86::VPMINSWZ256rm , X86::VPMINSWYrm }, + { X86::VPMINSWZ256rr , X86::VPMINSWYrr }, + { X86::VPMINUBZ256rm , X86::VPMINUBYrm }, + { X86::VPMINUBZ256rr , X86::VPMINUBYrr }, + { X86::VPMINUDZ256rm , X86::VPMINUDYrm }, + { X86::VPMINUDZ256rr , X86::VPMINUDYrr }, + { X86::VPMINUWZ256rm , X86::VPMINUWYrm }, + { 
X86::VPMINUWZ256rr , X86::VPMINUWYrr }, + { X86::VPMOVSXBDZ256rm , X86::VPMOVSXBDYrm }, + { X86::VPMOVSXBDZ256rr , X86::VPMOVSXBDYrr }, + { X86::VPMOVSXBQZ256rm , X86::VPMOVSXBQYrm }, + { X86::VPMOVSXBQZ256rr , X86::VPMOVSXBQYrr }, + { X86::VPMOVSXBWZ256rm , X86::VPMOVSXBWYrm }, + { X86::VPMOVSXBWZ256rr , X86::VPMOVSXBWYrr }, + { X86::VPMOVSXDQZ256rm , X86::VPMOVSXDQYrm }, + { X86::VPMOVSXDQZ256rr , X86::VPMOVSXDQYrr }, + { X86::VPMOVSXWDZ256rm , X86::VPMOVSXWDYrm }, + { X86::VPMOVSXWDZ256rr , X86::VPMOVSXWDYrr }, + { X86::VPMOVSXWQZ256rm , X86::VPMOVSXWQYrm }, + { X86::VPMOVSXWQZ256rr , X86::VPMOVSXWQYrr }, + { X86::VPMOVZXBDZ256rm , X86::VPMOVZXBDYrm }, + { X86::VPMOVZXBDZ256rr , X86::VPMOVZXBDYrr }, + { X86::VPMOVZXBQZ256rm , X86::VPMOVZXBQYrm }, + { X86::VPMOVZXBQZ256rr , X86::VPMOVZXBQYrr }, + { X86::VPMOVZXBWZ256rm , X86::VPMOVZXBWYrm }, + { X86::VPMOVZXBWZ256rr , X86::VPMOVZXBWYrr }, + { X86::VPMOVZXDQZ256rm , X86::VPMOVZXDQYrm }, + { X86::VPMOVZXDQZ256rr , X86::VPMOVZXDQYrr }, + { X86::VPMOVZXWDZ256rm , X86::VPMOVZXWDYrm }, + { X86::VPMOVZXWDZ256rr , X86::VPMOVZXWDYrr }, + { X86::VPMOVZXWQZ256rm , X86::VPMOVZXWQYrm }, + { X86::VPMOVZXWQZ256rr , X86::VPMOVZXWQYrr }, + { X86::VPMULDQZ256rm , X86::VPMULDQYrm }, + { X86::VPMULDQZ256rr , X86::VPMULDQYrr }, + { X86::VPMULHRSWZ256rm , X86::VPMULHRSWYrm }, + { X86::VPMULHRSWZ256rr , X86::VPMULHRSWYrr }, + { X86::VPMULHUWZ256rm , X86::VPMULHUWYrm }, + { X86::VPMULHUWZ256rr , X86::VPMULHUWYrr }, + { X86::VPMULHWZ256rm , X86::VPMULHWYrm }, + { X86::VPMULHWZ256rr , X86::VPMULHWYrr }, + { X86::VPMULLDZ256rm , X86::VPMULLDYrm }, + { X86::VPMULLDZ256rr , X86::VPMULLDYrr }, + { X86::VPMULLWZ256rm , X86::VPMULLWYrm }, + { X86::VPMULLWZ256rr , X86::VPMULLWYrr }, + { X86::VPMULUDQZ256rm , X86::VPMULUDQYrm }, + { X86::VPMULUDQZ256rr , X86::VPMULUDQYrr }, + { X86::VPORDZ256rm , X86::VPORYrm }, + { X86::VPORDZ256rr , X86::VPORYrr }, + { X86::VPORQZ256rm , X86::VPORYrm }, + { X86::VPORQZ256rr , X86::VPORYrr }, + { X86::VPSADBWZ256rm , X86::VPSADBWYrm }, + { X86::VPSADBWZ256rr , X86::VPSADBWYrr }, + { X86::VPSHUFBZ256rm , X86::VPSHUFBYrm }, + { X86::VPSHUFBZ256rr , X86::VPSHUFBYrr }, + { X86::VPSHUFDZ256mi , X86::VPSHUFDYmi }, + { X86::VPSHUFDZ256ri , X86::VPSHUFDYri }, + { X86::VPSHUFHWZ256mi , X86::VPSHUFHWYmi }, + { X86::VPSHUFHWZ256ri , X86::VPSHUFHWYri }, + { X86::VPSHUFLWZ256mi , X86::VPSHUFLWYmi }, + { X86::VPSHUFLWZ256ri , X86::VPSHUFLWYri }, + { X86::VPSLLDQZ256rr , X86::VPSLLDQYri }, + { X86::VPSLLDZ256ri , X86::VPSLLDYri }, + { X86::VPSLLDZ256rm , X86::VPSLLDYrm }, + { X86::VPSLLDZ256rr , X86::VPSLLDYrr }, + { X86::VPSLLQZ256ri , X86::VPSLLQYri }, + { X86::VPSLLQZ256rm , X86::VPSLLQYrm }, + { X86::VPSLLQZ256rr , X86::VPSLLQYrr }, + { X86::VPSLLVDZ256rm , X86::VPSLLVDYrm }, + { X86::VPSLLVDZ256rr , X86::VPSLLVDYrr }, + { X86::VPSLLVQZ256rm , X86::VPSLLVQYrm }, + { X86::VPSLLVQZ256rr , X86::VPSLLVQYrr }, + { X86::VPSLLWZ256ri , X86::VPSLLWYri }, + { X86::VPSLLWZ256rm , X86::VPSLLWYrm }, + { X86::VPSLLWZ256rr , X86::VPSLLWYrr }, + { X86::VPSRADZ256ri , X86::VPSRADYri }, + { X86::VPSRADZ256rm , X86::VPSRADYrm }, + { X86::VPSRADZ256rr , X86::VPSRADYrr }, + { X86::VPSRAVDZ256rm , X86::VPSRAVDYrm }, + { X86::VPSRAVDZ256rr , X86::VPSRAVDYrr }, + { X86::VPSRAWZ256ri , X86::VPSRAWYri }, + { X86::VPSRAWZ256rm , X86::VPSRAWYrm }, + { X86::VPSRAWZ256rr , X86::VPSRAWYrr }, + { X86::VPSRLDQZ256rr , X86::VPSRLDQYri }, + { X86::VPSRLDZ256ri , X86::VPSRLDYri }, + { X86::VPSRLDZ256rm , X86::VPSRLDYrm }, + { X86::VPSRLDZ256rr , X86::VPSRLDYrr }, + { 
X86::VPSRLQZ256ri , X86::VPSRLQYri }, + { X86::VPSRLQZ256rm , X86::VPSRLQYrm }, + { X86::VPSRLQZ256rr , X86::VPSRLQYrr }, + { X86::VPSRLVDZ256rm , X86::VPSRLVDYrm }, + { X86::VPSRLVDZ256rr , X86::VPSRLVDYrr }, + { X86::VPSRLVQZ256rm , X86::VPSRLVQYrm }, + { X86::VPSRLVQZ256rr , X86::VPSRLVQYrr }, + { X86::VPSRLWZ256ri , X86::VPSRLWYri }, + { X86::VPSRLWZ256rm , X86::VPSRLWYrm }, + { X86::VPSRLWZ256rr , X86::VPSRLWYrr }, + { X86::VPSUBBZ256rm , X86::VPSUBBYrm }, + { X86::VPSUBBZ256rr , X86::VPSUBBYrr }, + { X86::VPSUBDZ256rm , X86::VPSUBDYrm }, + { X86::VPSUBDZ256rr , X86::VPSUBDYrr }, + { X86::VPSUBQZ256rm , X86::VPSUBQYrm }, + { X86::VPSUBQZ256rr , X86::VPSUBQYrr }, + { X86::VPSUBSBZ256rm , X86::VPSUBSBYrm }, + { X86::VPSUBSBZ256rr , X86::VPSUBSBYrr }, + { X86::VPSUBSWZ256rm , X86::VPSUBSWYrm }, + { X86::VPSUBSWZ256rr , X86::VPSUBSWYrr }, + { X86::VPSUBUSBZ256rm , X86::VPSUBUSBYrm }, + { X86::VPSUBUSBZ256rr , X86::VPSUBUSBYrr }, + { X86::VPSUBUSWZ256rm , X86::VPSUBUSWYrm }, + { X86::VPSUBUSWZ256rr , X86::VPSUBUSWYrr }, + { X86::VPSUBWZ256rm , X86::VPSUBWYrm }, + { X86::VPSUBWZ256rr , X86::VPSUBWYrr }, + { X86::VPUNPCKHBWZ256rm , X86::VPUNPCKHBWYrm }, + { X86::VPUNPCKHBWZ256rr , X86::VPUNPCKHBWYrr }, + { X86::VPUNPCKHDQZ256rm , X86::VPUNPCKHDQYrm }, + { X86::VPUNPCKHDQZ256rr , X86::VPUNPCKHDQYrr }, + { X86::VPUNPCKHQDQZ256rm , X86::VPUNPCKHQDQYrm }, + { X86::VPUNPCKHQDQZ256rr , X86::VPUNPCKHQDQYrr }, + { X86::VPUNPCKHWDZ256rm , X86::VPUNPCKHWDYrm }, + { X86::VPUNPCKHWDZ256rr , X86::VPUNPCKHWDYrr }, + { X86::VPUNPCKLBWZ256rm , X86::VPUNPCKLBWYrm }, + { X86::VPUNPCKLBWZ256rr , X86::VPUNPCKLBWYrr }, + { X86::VPUNPCKLDQZ256rm , X86::VPUNPCKLDQYrm }, + { X86::VPUNPCKLDQZ256rr , X86::VPUNPCKLDQYrr }, + { X86::VPUNPCKLQDQZ256rm , X86::VPUNPCKLQDQYrm }, + { X86::VPUNPCKLQDQZ256rr , X86::VPUNPCKLQDQYrr }, + { X86::VPUNPCKLWDZ256rm , X86::VPUNPCKLWDYrm }, + { X86::VPUNPCKLWDZ256rr , X86::VPUNPCKLWDYrr }, + { X86::VPXORDZ256rm , X86::VPXORYrm }, + { X86::VPXORDZ256rr , X86::VPXORYrr }, + { X86::VPXORQZ256rm , X86::VPXORYrm }, + { X86::VPXORQZ256rr , X86::VPXORYrr }, + { X86::VSHUFPDZ256rmi , X86::VSHUFPDYrmi }, + { X86::VSHUFPDZ256rri , X86::VSHUFPDYrri }, + { X86::VSHUFPSZ256rmi , X86::VSHUFPSYrmi }, + { X86::VSHUFPSZ256rri , X86::VSHUFPSYrri }, + { X86::VSQRTPDZ256m , X86::VSQRTPDYm }, + { X86::VSQRTPDZ256r , X86::VSQRTPDYr }, + { X86::VSQRTPSZ256m , X86::VSQRTPSYm }, + { X86::VSQRTPSZ256r , X86::VSQRTPSYr }, + { X86::VSUBPDZ256rm , X86::VSUBPDYrm }, + { X86::VSUBPDZ256rr , X86::VSUBPDYrr }, + { X86::VSUBPSZ256rm , X86::VSUBPSYrm }, + { X86::VSUBPSZ256rr , X86::VSUBPSYrr }, + { X86::VUNPCKHPDZ256rm , X86::VUNPCKHPDYrm }, + { X86::VUNPCKHPDZ256rr , X86::VUNPCKHPDYrr }, + { X86::VUNPCKHPSZ256rm , X86::VUNPCKHPSYrm }, + { X86::VUNPCKHPSZ256rr , X86::VUNPCKHPSYrr }, + { X86::VUNPCKLPDZ256rm , X86::VUNPCKLPDYrm }, + { X86::VUNPCKLPDZ256rr , X86::VUNPCKLPDYrr }, + { X86::VUNPCKLPSZ256rm , X86::VUNPCKLPSYrm }, + { X86::VUNPCKLPSZ256rr , X86::VUNPCKLPSYrr }, + { X86::VXORPDZ256rm , X86::VXORPDYrm }, + { X86::VXORPDZ256rr , X86::VXORPDYrr }, + { X86::VXORPSZ256rm , X86::VXORPSYrm }, + { X86::VXORPSZ256rr , X86::VXORPSYrr }, +}; + +#endif diff --git a/contrib/llvm/lib/Target/X86/X86InstrXOP.td b/contrib/llvm/lib/Target/X86/X86InstrXOP.td index f49917b..2b296e1 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrXOP.td +++ b/contrib/llvm/lib/Target/X86/X86InstrXOP.td @@ -85,12 +85,12 @@ let ExeDomain = SSEPackedDouble in { multiclass xop3op<bits<8> opc, string OpcodeStr, SDNode OpNode, ValueType vt128> { - def 
rr : IXOP<opc, MRMSrcReg, (outs VR128:$dst), + def rr : IXOP<opc, MRMSrcReg4VOp3, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR128:$dst, (vt128 (OpNode (vt128 VR128:$src1), (vt128 VR128:$src2))))]>, - XOP_4VOp3, Sched<[WriteVarVecShift]>; + XOP, Sched<[WriteVarVecShift]>; def rm : IXOP<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), @@ -98,13 +98,20 @@ multiclass xop3op<bits<8> opc, string OpcodeStr, SDNode OpNode, (vt128 (OpNode (vt128 VR128:$src1), (vt128 (bitconvert (loadv2i64 addr:$src2))))))]>, XOP_4V, VEX_W, Sched<[WriteVarVecShift, ReadAfterLd]>; - def mr : IXOP<opc, MRMSrcMem, (outs VR128:$dst), + def mr : IXOP<opc, MRMSrcMem4VOp3, (outs VR128:$dst), (ins i128mem:$src1, VR128:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR128:$dst, (vt128 (OpNode (vt128 (bitconvert (loadv2i64 addr:$src1))), (vt128 VR128:$src2))))]>, - XOP_4VOp3, Sched<[WriteVarVecShift, ReadAfterLd]>; + XOP, Sched<[WriteVarVecShift, ReadAfterLd]>; + // For disassembler + let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in + def rr_REV : IXOP<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + []>, + XOP_4V, VEX_W, Sched<[WriteVarVecShift]>; } let ExeDomain = SSEPackedInt in { @@ -146,19 +153,19 @@ let ExeDomain = SSEPackedInt in { // Instruction where second source can be memory, but third must be register multiclass xop4opm2<bits<8> opc, string OpcodeStr, Intrinsic Int> { let isCommutable = 1 in - def rr : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst), + def rr : IXOPi8Reg<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, VR128:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set VR128:$dst, - (Int VR128:$src1, VR128:$src2, VR128:$src3))]>, XOP_4V, VEX_I8IMM; - def rm : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst), + (Int VR128:$src1, VR128:$src2, VR128:$src3))]>, XOP_4V; + def rm : IXOPi8Reg<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2, VR128:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set VR128:$dst, (Int VR128:$src1, (bitconvert (loadv2i64 addr:$src2)), - VR128:$src3))]>, XOP_4V, VEX_I8IMM; + VR128:$src3))]>, XOP_4V; } let ExeDomain = SSEPackedInt in { @@ -224,37 +231,37 @@ let ExeDomain = SSEPackedInt in { // SSE integer instructions multiclass xop4op<bits<8> opc, string OpcodeStr, SDNode OpNode, ValueType vt128> { - def rrr : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst), + def rrr : IXOPi8Reg<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, VR128:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set VR128:$dst, (vt128 (OpNode (vt128 VR128:$src1), (vt128 VR128:$src2), (vt128 VR128:$src3))))]>, - XOP_4V, VEX_I8IMM; - def rrm : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst), + XOP_4V; + def rrm : IXOPi8Reg<opc, MRMSrcMemOp4, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i128mem:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set VR128:$dst, (vt128 (OpNode (vt128 VR128:$src1), (vt128 VR128:$src2), (vt128 (bitconvert (loadv2i64 addr:$src3))))))]>, - XOP_4V, VEX_I8IMM, VEX_W, MemOp4; - def rmr : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst), + XOP_4V, VEX_W; + def rmr : IXOPi8Reg<opc, MRMSrcMem, (outs 
VR128:$dst), (ins VR128:$src1, i128mem:$src2, VR128:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set VR128:$dst, (v16i8 (OpNode (vt128 VR128:$src1), (vt128 (bitconvert (loadv2i64 addr:$src2))), (vt128 VR128:$src3))))]>, - XOP_4V, VEX_I8IMM; + XOP_4V; // For disassembler let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in - def rrr_REV : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst), + def rrr_REV : IXOPi8Reg<opc, MRMSrcRegOp4, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, VR128:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), - []>, XOP_4V, VEX_I8IMM, VEX_W, MemOp4; + []>, XOP_4V, VEX_W; } let ExeDomain = SSEPackedInt in { @@ -265,66 +272,66 @@ let ExeDomain = SSEPackedInt in { multiclass xop4op_int<bits<8> opc, string OpcodeStr, Intrinsic Int128, Intrinsic Int256> { // 128-bit Instruction - def rrr : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst), + def rrr : IXOPi8Reg<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, VR128:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set VR128:$dst, (Int128 VR128:$src1, VR128:$src2, VR128:$src3))]>, - XOP_4V, VEX_I8IMM; - def rrm : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst), + XOP_4V; + def rrm : IXOPi8Reg<opc, MRMSrcMemOp4, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i128mem:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set VR128:$dst, (Int128 VR128:$src1, VR128:$src2, (bitconvert (loadv2i64 addr:$src3))))]>, - XOP_4V, VEX_I8IMM, VEX_W, MemOp4; - def rmr : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst), + XOP_4V, VEX_W; + def rmr : IXOPi8Reg<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2, VR128:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set VR128:$dst, (Int128 VR128:$src1, (bitconvert (loadv2i64 addr:$src2)), VR128:$src3))]>, - XOP_4V, VEX_I8IMM; + XOP_4V; // For disassembler let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in - def rrr_REV : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst), + def rrr_REV : IXOPi8Reg<opc, MRMSrcRegOp4, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, VR128:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), - []>, XOP_4V, VEX_I8IMM, VEX_W, MemOp4; + []>, XOP_4V, VEX_W; // 256-bit Instruction - def rrrY : IXOPi8<opc, MRMSrcReg, (outs VR256:$dst), + def rrrY : IXOPi8Reg<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src1, VR256:$src2, VR256:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set VR256:$dst, (Int256 VR256:$src1, VR256:$src2, VR256:$src3))]>, - XOP_4V, VEX_I8IMM, VEX_L; - def rrmY : IXOPi8<opc, MRMSrcMem, (outs VR256:$dst), + XOP_4V, VEX_L; + def rrmY : IXOPi8Reg<opc, MRMSrcMemOp4, (outs VR256:$dst), (ins VR256:$src1, VR256:$src2, i256mem:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set VR256:$dst, (Int256 VR256:$src1, VR256:$src2, (bitconvert (loadv4i64 addr:$src3))))]>, - XOP_4V, VEX_I8IMM, VEX_W, MemOp4, VEX_L; - def rmrY : IXOPi8<opc, MRMSrcMem, (outs VR256:$dst), + XOP_4V, VEX_W, VEX_L; + def rmrY : IXOPi8Reg<opc, MRMSrcMem, (outs VR256:$dst), (ins VR256:$src1, f256mem:$src2, VR256:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set VR256:$dst, (Int256 VR256:$src1, (bitconvert (loadv4i64 addr:$src2)), VR256:$src3))]>, - XOP_4V, VEX_I8IMM, VEX_L; + XOP_4V, VEX_L; 
// For disassembler let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in - def rrrY_REV : IXOPi8<opc, MRMSrcReg, (outs VR256:$dst), + def rrrY_REV : IXOPi8Reg<opc, MRMSrcRegOp4, (outs VR256:$dst), (ins VR256:$src1, VR256:$src2, VR256:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), - []>, XOP_4V, VEX_I8IMM, VEX_W, MemOp4, VEX_L; + []>, XOP_4V, VEX_W, VEX_L; } let ExeDomain = SSEPackedInt in { @@ -353,7 +360,7 @@ multiclass xop5op<bits<8> opc, string OpcodeStr, SDNode OpNode, [(set VR128:$dst, (vt128 (OpNode (vt128 VR128:$src1), (vt128 VR128:$src2), (id128 VR128:$src3), (i8 imm:$src4))))]>; - def rm : IXOP5<opc, MRMSrcMem, (outs VR128:$dst), + def rm : IXOP5<opc, MRMSrcMemOp4, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i128mem:$src3, u8imm:$src4), !strconcat(OpcodeStr, "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"), @@ -361,7 +368,7 @@ multiclass xop5op<bits<8> opc, string OpcodeStr, SDNode OpNode, (vt128 (OpNode (vt128 VR128:$src1), (vt128 VR128:$src2), (id128 (bitconvert (loadv2i64 addr:$src3))), (i8 imm:$src4))))]>, - VEX_W, MemOp4; + VEX_W; def mr : IXOP5<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, f128mem:$src2, VR128:$src3, u8imm:$src4), !strconcat(OpcodeStr, @@ -372,11 +379,11 @@ multiclass xop5op<bits<8> opc, string OpcodeStr, SDNode OpNode, (id128 VR128:$src3), (i8 imm:$src4))))]>; // For disassembler let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in - def rr_REV : IXOP5<opc, MRMSrcReg, (outs VR128:$dst), + def rr_REV : IXOP5<opc, MRMSrcRegOp4, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, VR128:$src3, u8imm:$src4), !strconcat(OpcodeStr, "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"), - []>, VEX_W, MemOp4; + []>, VEX_W; def rrY : IXOP5<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src1, VR256:$src2, VR256:$src3, u8imm:$src4), @@ -385,14 +392,14 @@ multiclass xop5op<bits<8> opc, string OpcodeStr, SDNode OpNode, [(set VR256:$dst, (vt256 (OpNode (vt256 VR256:$src1), (vt256 VR256:$src2), (id256 VR256:$src3), (i8 imm:$src4))))]>, VEX_L; - def rmY : IXOP5<opc, MRMSrcMem, (outs VR256:$dst), + def rmY : IXOP5<opc, MRMSrcMemOp4, (outs VR256:$dst), (ins VR256:$src1, VR256:$src2, i256mem:$src3, u8imm:$src4), !strconcat(OpcodeStr, "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"), [(set VR256:$dst, (vt256 (OpNode (vt256 VR256:$src1), (vt256 VR256:$src2), (id256 (bitconvert (loadv4i64 addr:$src3))), - (i8 imm:$src4))))]>, VEX_W, MemOp4, VEX_L; + (i8 imm:$src4))))]>, VEX_W, VEX_L; def mrY : IXOP5<opc, MRMSrcMem, (outs VR256:$dst), (ins VR256:$src1, f256mem:$src2, VR256:$src3, u8imm:$src4), !strconcat(OpcodeStr, @@ -403,11 +410,11 @@ multiclass xop5op<bits<8> opc, string OpcodeStr, SDNode OpNode, (id256 VR256:$src3), (i8 imm:$src4))))]>, VEX_L; // For disassembler let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in - def rrY_REV : IXOP5<opc, MRMSrcReg, (outs VR256:$dst), + def rrY_REV : IXOP5<opc, MRMSrcRegOp4, (outs VR256:$dst), (ins VR256:$src1, VR256:$src2, VR256:$src3, u8imm:$src4), !strconcat(OpcodeStr, "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"), - []>, VEX_W, MemOp4, VEX_L; + []>, VEX_W, VEX_L; } let ExeDomain = SSEPackedDouble in diff --git a/contrib/llvm/lib/Target/X86/X86InterleavedAccess.cpp b/contrib/llvm/lib/Target/X86/X86InterleavedAccess.cpp new file mode 100644 index 0000000..d9edf46 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86InterleavedAccess.cpp @@ -0,0 +1,221 @@ 
+//===--------- X86InterleavedAccess.cpp ----------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file contains the X86 implementation of the interleaved accesses +/// optimization generating X86-specific instructions/intrinsics for +/// interleaved access groups. +/// +//===--------------------------------------------------------------------===// + +#include "X86ISelLowering.h" +#include "X86TargetMachine.h" + +using namespace llvm; + +/// \brief This class holds necessary information to represent an interleaved +/// access group and supports utilities to lower the group into +/// X86-specific instructions/intrinsics. +/// E.g. A group of interleaving access loads (Factor = 2; accessing every +/// other element) +/// %wide.vec = load <8 x i32>, <8 x i32>* %ptr +/// %v0 = shuffle <8 x i32> %wide.vec, <8 x i32> undef, <0, 2, 4, 6> +/// %v1 = shuffle <8 x i32> %wide.vec, <8 x i32> undef, <1, 3, 5, 7> + +class X86InterleavedAccessGroup { + /// \brief Reference to the wide-load instruction of an interleaved access + /// group. + Instruction *const Inst; + + /// \brief Reference to the shuffle(s), consumer(s) of the (load) 'Inst'. + ArrayRef<ShuffleVectorInst *> Shuffles; + + /// \brief Reference to the starting index of each user-shuffle. + ArrayRef<unsigned> Indices; + + /// \brief Reference to the interleaving stride in terms of elements. + const unsigned Factor; + + /// \brief Reference to the underlying target. + const X86Subtarget &Subtarget; + + const DataLayout &DL; + + IRBuilder<> &Builder; + + /// \brief Breaks down a vector \p 'Inst' of N elements into \p NumSubVectors + /// sub vectors of type \p T. Returns true and the sub-vectors in + /// \p DecomposedVectors if it decomposes the Inst, returns false otherwise. + bool decompose(Instruction *Inst, unsigned NumSubVectors, VectorType *T, + SmallVectorImpl<Instruction *> &DecomposedVectors); + + /// \brief Performs matrix transposition on a 4x4 matrix \p InputVectors and + /// returns the transposed-vectors in \p TransposedVectors. + /// E.g. + /// InputVectors: + /// In-V0 = p1, p2, p3, p4 + /// In-V1 = q1, q2, q3, q4 + /// In-V2 = r1, r2, r3, r4 + /// In-V3 = s1, s2, s3, s4 + /// OutputVectors: + /// Out-V0 = p1, q1, r1, s1 + /// Out-V1 = p2, q2, r2, s2 + /// Out-V2 = p3, q3, r3, s3 + /// Out-V3 = p4, q4, r4, s4 + void transpose_4x4(ArrayRef<Instruction *> InputVectors, + SmallVectorImpl<Value *> &TransposedVectors); + +public: + /// In order to form an interleaved access group X86InterleavedAccessGroup + /// requires a wide-load instruction \p 'I', a group of interleaved-vectors + /// \p Shuffs, reference to the first indices of each interleaved-vector + /// \p 'Ind' and the interleaving stride factor \p F. In order to generate + /// X86-specific instructions/intrinsics it also requires the underlying + /// target information \p STarget.
+ explicit X86InterleavedAccessGroup(Instruction *I, + ArrayRef<ShuffleVectorInst *> Shuffs, + ArrayRef<unsigned> Ind, + const unsigned F, + const X86Subtarget &STarget, + IRBuilder<> &B) + : Inst(I), Shuffles(Shuffs), Indices(Ind), Factor(F), Subtarget(STarget), + DL(Inst->getModule()->getDataLayout()), Builder(B) {} + + /// \brief Returns true if this interleaved access group can be lowered into + /// x86-specific instructions/intrinsics, false otherwise. + bool isSupported() const; + + /// \brief Lowers this interleaved access group into X86-specific + /// instructions/intrinsics. + bool lowerIntoOptimizedSequence(); +}; + +bool X86InterleavedAccessGroup::isSupported() const { + VectorType *ShuffleVecTy = Shuffles[0]->getType(); + uint64_t ShuffleVecSize = DL.getTypeSizeInBits(ShuffleVecTy); + Type *ShuffleEltTy = ShuffleVecTy->getVectorElementType(); + + if (DL.getTypeSizeInBits(Inst->getType()) < Factor * ShuffleVecSize) + return false; + + // Currently, lowering is supported for 64 bits on AVX. + if (!Subtarget.hasAVX() || ShuffleVecSize != 256 || + DL.getTypeSizeInBits(ShuffleEltTy) != 64 || Factor != 4) + return false; + + return true; +} + +bool X86InterleavedAccessGroup::decompose( + Instruction *VecInst, unsigned NumSubVectors, VectorType *SubVecTy, + SmallVectorImpl<Instruction *> &DecomposedVectors) { + Type *VecTy = VecInst->getType(); + (void)VecTy; + assert(VecTy->isVectorTy() && + DL.getTypeSizeInBits(VecTy) >= + DL.getTypeSizeInBits(SubVecTy) * NumSubVectors && + "Invalid Inst-size!!!"); + assert(VecTy->getVectorElementType() == SubVecTy->getVectorElementType() && + "Element type mismatched!!!"); + + if (!isa<LoadInst>(VecInst)) + return false; + + LoadInst *LI = cast<LoadInst>(VecInst); + Type *VecBasePtrTy = SubVecTy->getPointerTo(LI->getPointerAddressSpace()); + + Value *VecBasePtr = + Builder.CreateBitCast(LI->getPointerOperand(), VecBasePtrTy); + + // Generate N loads of T type + for (unsigned i = 0; i < NumSubVectors; i++) { + // TODO: Support inbounds GEP + Value *NewBasePtr = Builder.CreateGEP(VecBasePtr, Builder.getInt32(i)); + Instruction *NewLoad = + Builder.CreateAlignedLoad(NewBasePtr, LI->getAlignment()); + DecomposedVectors.push_back(NewLoad); + } + + return true; +} + +void X86InterleavedAccessGroup::transpose_4x4( + ArrayRef<Instruction *> Matrix, + SmallVectorImpl<Value *> &TransposedMatrix) { + assert(Matrix.size() == 4 && "Invalid matrix size"); + TransposedMatrix.resize(4); + + // dst = src1[0,1],src2[0,1] + uint32_t IntMask1[] = {0, 1, 4, 5}; + ArrayRef<uint32_t> Mask = makeArrayRef(IntMask1, 4); + Value *IntrVec1 = Builder.CreateShuffleVector(Matrix[0], Matrix[2], Mask); + Value *IntrVec2 = Builder.CreateShuffleVector(Matrix[1], Matrix[3], Mask); + + // dst = src1[2,3],src2[2,3] + uint32_t IntMask2[] = {2, 3, 6, 7}; + Mask = makeArrayRef(IntMask2, 4); + Value *IntrVec3 = Builder.CreateShuffleVector(Matrix[0], Matrix[2], Mask); + Value *IntrVec4 = Builder.CreateShuffleVector(Matrix[1], Matrix[3], Mask); + + // dst = src1[0],src2[0],src1[2],src2[2] + uint32_t IntMask3[] = {0, 4, 2, 6}; + Mask = makeArrayRef(IntMask3, 4); + TransposedMatrix[0] = Builder.CreateShuffleVector(IntrVec1, IntrVec2, Mask); + TransposedMatrix[2] = Builder.CreateShuffleVector(IntrVec3, IntrVec4, Mask); + + // dst = src1[1],src2[1],src1[3],src2[3] + uint32_t IntMask4[] = {1, 5, 3, 7}; + Mask = makeArrayRef(IntMask4, 4); + TransposedMatrix[1] = Builder.CreateShuffleVector(IntrVec1, IntrVec2, Mask); + TransposedMatrix[3] = Builder.CreateShuffleVector(IntrVec3, IntrVec4, Mask); +} 
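The shuffle masks used by transpose_4x4 above are easiest to sanity-check outside of IR. The following is a minimal standalone C++ sketch, not part of the patch: Row, shuffle() and the sample values are illustrative stand-ins for the <4 x i64> vectors and the IRBuilder CreateShuffleVector calls, with shuffle() following shufflevector's two-input mask convention (lane indices 0-3 select from the first operand, 4-7 from the second).

// Illustrative only -- not from the LLVM sources. Row stands in for a
// <4 x i64> vector and shuffle() mimics shufflevector's two-input mask
// semantics so the masks from transpose_4x4 can be checked on plain data.
#include <array>
#include <cassert>

using Row = std::array<int, 4>;

// Two-input shuffle: lane i of the result is A[M[i]] if M[i] < 4,
// otherwise B[M[i] - 4].
static Row shuffle(const Row &A, const Row &B, const std::array<int, 4> &M) {
  Row R{};
  for (int i = 0; i < 4; ++i)
    R[i] = M[i] < 4 ? A[M[i]] : B[M[i] - 4];
  return R;
}

int main() {
  // In-V0..In-V3 as in the class comment (p/q/r/s rows), encoded as
  // 10*row + column so mismatches are easy to spot.
  Row In0 = {11, 12, 13, 14}, In1 = {21, 22, 23, 24};
  Row In2 = {31, 32, 33, 34}, In3 = {41, 42, 43, 44};

  // Stage 1: pair the low and high halves of rows 0/2 and 1/3,
  // matching IntMask1 = {0,1,4,5} and IntMask2 = {2,3,6,7}.
  Row V1 = shuffle(In0, In2, {0, 1, 4, 5});
  Row V2 = shuffle(In1, In3, {0, 1, 4, 5});
  Row V3 = shuffle(In0, In2, {2, 3, 6, 7});
  Row V4 = shuffle(In1, In3, {2, 3, 6, 7});

  // Stage 2: gather even and odd lanes, matching IntMask3 = {0,4,2,6}
  // and IntMask4 = {1,5,3,7}.
  Row Out0 = shuffle(V1, V2, {0, 4, 2, 6});
  Row Out1 = shuffle(V1, V2, {1, 5, 3, 7});
  Row Out2 = shuffle(V3, V4, {0, 4, 2, 6});
  Row Out3 = shuffle(V3, V4, {1, 5, 3, 7});

  // Each output row now holds one original column: p1,q1,r1,s1 and so on.
  assert((Out0 == Row{11, 21, 31, 41}));
  assert((Out1 == Row{12, 22, 32, 42}));
  assert((Out2 == Row{13, 23, 33, 43}));
  assert((Out3 == Row{14, 24, 34, 44}));
  return 0;
}

Under those assumptions the two shuffle stages reproduce the p/q/r/s example from the class comment: each output row collects one original column, which is what lowerIntoOptimizedSequence later substitutes for the original stride-4 shufflevectors.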
+ +// Lowers this interleaved access group into X86-specific +// instructions/intrinsics. +bool X86InterleavedAccessGroup::lowerIntoOptimizedSequence() { + SmallVector<Instruction *, 4> DecomposedVectors; + VectorType *VecTy = Shuffles[0]->getType(); + // Try to generate target-sized register(/instruction). + if (!decompose(Inst, Factor, VecTy, DecomposedVectors)) + return false; + + SmallVector<Value *, 4> TransposedVectors; + // Perform matrix-transposition in order to compute interleaved + // results by generating some sort of (optimized) target-specific + // instructions. + transpose_4x4(DecomposedVectors, TransposedVectors); + + // Now replace the unoptimized-interleaved-vectors with the + // transposed-interleaved vectors. + for (unsigned i = 0; i < Shuffles.size(); i++) + Shuffles[i]->replaceAllUsesWith(TransposedVectors[Indices[i]]); + + return true; +} + +// Lower interleaved load(s) into target specific instructions/ +// intrinsics. Lowering sequence varies depending on the vector-types, factor, +// number of shuffles and ISA. +// Currently, lowering is supported for 4x64 bits with Factor = 4 on AVX. +bool X86TargetLowering::lowerInterleavedLoad( + LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles, + ArrayRef<unsigned> Indices, unsigned Factor) const { + assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && + "Invalid interleave factor"); + assert(!Shuffles.empty() && "Empty shufflevector input"); + assert(Shuffles.size() == Indices.size() && + "Unmatched number of shufflevectors and indices"); + + // Create an interleaved access group. + IRBuilder<> Builder(LI); + X86InterleavedAccessGroup Grp(LI, Shuffles, Indices, Factor, Subtarget, + Builder); + + return Grp.isSupported() && Grp.lowerIntoOptimizedSequence(); +} diff --git a/contrib/llvm/lib/Target/X86/X86IntrinsicsInfo.h b/contrib/llvm/lib/Target/X86/X86IntrinsicsInfo.h index b647d11..63a02af 100644 --- a/contrib/llvm/lib/Target/X86/X86IntrinsicsInfo.h +++ b/contrib/llvm/lib/Target/X86/X86IntrinsicsInfo.h @@ -21,9 +21,10 @@ namespace llvm { enum IntrinsicType : uint16_t { INTR_NO_TYPE, - GATHER, SCATTER, PREFETCH, RDSEED, RDRAND, RDPMC, RDTSC, XTEST, ADX, FPCLASS, FPCLASSS, - INTR_TYPE_1OP, INTR_TYPE_2OP, INTR_TYPE_2OP_IMM8, INTR_TYPE_3OP, INTR_TYPE_4OP, + GATHER, SCATTER, PREFETCH, RDSEED, RDRAND, RDPMC, RDTSC, XTEST, XGETBV, ADX, FPCLASS, FPCLASSS, + INTR_TYPE_1OP, INTR_TYPE_2OP, INTR_TYPE_3OP, INTR_TYPE_4OP, CMP_MASK, CMP_MASK_CC,CMP_MASK_SCALAR_CC, VSHIFT, COMI, COMI_RM, + CVTPD2PS, CVTPD2PS_MASK, INTR_TYPE_1OP_MASK, INTR_TYPE_1OP_MASK_RM, INTR_TYPE_2OP_MASK, INTR_TYPE_2OP_MASK_RM, INTR_TYPE_2OP_IMM8_MASK, INTR_TYPE_3OP_MASK, INTR_TYPE_3OP_MASK_RM, INTR_TYPE_3OP_IMM8_MASK, @@ -33,7 +34,7 @@ enum IntrinsicType : uint16_t { INTR_TYPE_SCALAR_MASK_RM, INTR_TYPE_3OP_SCALAR_MASK_RM, COMPRESS_EXPAND_IN_REG, COMPRESS_TO_MEM, BRCST_SUBVEC_TO_VEC, BRCST32x2_TO_VEC, TRUNCATE_TO_MEM_VI8, TRUNCATE_TO_MEM_VI16, TRUNCATE_TO_MEM_VI32, - EXPAND_FROM_MEM, INSERT_SUBVEC, + EXPAND_FROM_MEM, TERLOG_OP_MASK, TERLOG_OP_MASKZ, BROADCASTM, KUNPCK, FIXUPIMM, FIXUPIMM_MASKZ, FIXUPIMMS, FIXUPIMMS_MASKZ, CONVERT_MASK_TO_VEC, CONVERT_TO_MASK }; @@ -184,6 +185,79 @@ static const IntrinsicData IntrinsicsWithChain[] = { X86ISD::VTRUNC, 0), X86_INTRINSIC_DATA(avx512_mask_pmov_wb_mem_512, TRUNCATE_TO_MEM_VI8, X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_db_mem_128, TRUNCATE_TO_MEM_VI8, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_db_mem_256, TRUNCATE_TO_MEM_VI8, + X86ISD::VTRUNCS, 0), + 
X86_INTRINSIC_DATA(avx512_mask_pmovs_db_mem_512, TRUNCATE_TO_MEM_VI8, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_dw_mem_128, TRUNCATE_TO_MEM_VI16, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_dw_mem_256, TRUNCATE_TO_MEM_VI16, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_dw_mem_512, TRUNCATE_TO_MEM_VI16, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_qb_mem_128, TRUNCATE_TO_MEM_VI8, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_qb_mem_256, TRUNCATE_TO_MEM_VI8, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_qb_mem_512, TRUNCATE_TO_MEM_VI8, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_qd_mem_128, TRUNCATE_TO_MEM_VI32, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_qd_mem_256, TRUNCATE_TO_MEM_VI32, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_qd_mem_512, TRUNCATE_TO_MEM_VI32, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_qw_mem_128, TRUNCATE_TO_MEM_VI16, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_qw_mem_256, TRUNCATE_TO_MEM_VI16, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_qw_mem_512, TRUNCATE_TO_MEM_VI16, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_wb_mem_128, TRUNCATE_TO_MEM_VI8, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_wb_mem_256, TRUNCATE_TO_MEM_VI8, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_wb_mem_512, TRUNCATE_TO_MEM_VI8, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_db_mem_128, TRUNCATE_TO_MEM_VI8, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_db_mem_256, TRUNCATE_TO_MEM_VI8, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_db_mem_512, TRUNCATE_TO_MEM_VI8, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_dw_mem_128, TRUNCATE_TO_MEM_VI16, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_dw_mem_256, TRUNCATE_TO_MEM_VI16, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_dw_mem_512, TRUNCATE_TO_MEM_VI16, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_qb_mem_128, TRUNCATE_TO_MEM_VI8, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_qb_mem_256, TRUNCATE_TO_MEM_VI8, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_qb_mem_512, TRUNCATE_TO_MEM_VI8, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_qd_mem_128, TRUNCATE_TO_MEM_VI32, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_qd_mem_256, TRUNCATE_TO_MEM_VI32, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_qd_mem_512, TRUNCATE_TO_MEM_VI32, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_qw_mem_128, TRUNCATE_TO_MEM_VI16, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_qw_mem_256, TRUNCATE_TO_MEM_VI16, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_qw_mem_512, TRUNCATE_TO_MEM_VI16, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_wb_mem_128, TRUNCATE_TO_MEM_VI8, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_wb_mem_256, TRUNCATE_TO_MEM_VI8, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_wb_mem_512, TRUNCATE_TO_MEM_VI8, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_scatter_dpd_512, SCATTER, X86::VSCATTERDPDZmr, 0), X86_INTRINSIC_DATA(avx512_scatter_dpi_512, SCATTER, X86::VPSCATTERDDZmr, 0), X86_INTRINSIC_DATA(avx512_scatter_dpq_512, SCATTER, X86::VPSCATTERDQZmr, 0), @@ -228,6 +302,7 @@ static const 
IntrinsicData IntrinsicsWithChain[] = { X86_INTRINSIC_DATA(subborrow_u32, ADX, X86ISD::SBB, 0), X86_INTRINSIC_DATA(subborrow_u64, ADX, X86ISD::SBB, 0), + X86_INTRINSIC_DATA(xgetbv, XGETBV, X86::XGETBV, 0), X86_INTRINSIC_DATA(xtest, XTEST, X86ISD::XTEST, 0), }; @@ -250,6 +325,11 @@ static const IntrinsicData* getIntrinsicWithChain(uint16_t IntNo) { * the alphabetical order. */ static const IntrinsicData IntrinsicsWithoutChain[] = { + X86_INTRINSIC_DATA(avx_cvt_pd2_ps_256,CVTPD2PS, ISD::FP_ROUND, 0), + X86_INTRINSIC_DATA(avx_cvt_pd2dq_256, INTR_TYPE_1OP, X86ISD::CVTP2SI, 0), + X86_INTRINSIC_DATA(avx_cvtdq2_ps_256, INTR_TYPE_1OP, ISD::SINT_TO_FP, 0), + X86_INTRINSIC_DATA(avx_cvtt_pd2dq_256,INTR_TYPE_1OP, ISD::FP_TO_SINT, 0), + X86_INTRINSIC_DATA(avx_cvtt_ps2dq_256,INTR_TYPE_1OP, ISD::FP_TO_SINT, 0), X86_INTRINSIC_DATA(avx_hadd_pd_256, INTR_TYPE_2OP, X86ISD::FHADD, 0), X86_INTRINSIC_DATA(avx_hadd_ps_256, INTR_TYPE_2OP, X86ISD::FHADD, 0), X86_INTRINSIC_DATA(avx_hsub_pd_256, INTR_TYPE_2OP, X86ISD::FHSUB, 0), @@ -288,8 +368,11 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx2_phadd_w, INTR_TYPE_2OP, X86ISD::HADD, 0), X86_INTRINSIC_DATA(avx2_phsub_d, INTR_TYPE_2OP, X86ISD::HSUB, 0), X86_INTRINSIC_DATA(avx2_phsub_w, INTR_TYPE_2OP, X86ISD::HSUB, 0), + X86_INTRINSIC_DATA(avx2_pmadd_ub_sw, INTR_TYPE_2OP, X86ISD::VPMADDUBSW, 0), + X86_INTRINSIC_DATA(avx2_pmadd_wd, INTR_TYPE_2OP, X86ISD::VPMADDWD, 0), X86_INTRINSIC_DATA(avx2_pmovmskb, INTR_TYPE_1OP, X86ISD::MOVMSK, 0), X86_INTRINSIC_DATA(avx2_pmul_dq, INTR_TYPE_2OP, X86ISD::PMULDQ, 0), + X86_INTRINSIC_DATA(avx2_pmul_hr_sw, INTR_TYPE_2OP, X86ISD::MULHRS, 0), X86_INTRINSIC_DATA(avx2_pmulh_w, INTR_TYPE_2OP, ISD::MULHS, 0), X86_INTRINSIC_DATA(avx2_pmulhu_w, INTR_TYPE_2OP, ISD::MULHU, 0), X86_INTRINSIC_DATA(avx2_pmulu_dq, INTR_TYPE_2OP, X86ISD::PMULUDQ, 0), @@ -353,21 +436,20 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_cvtq2mask_128, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0), X86_INTRINSIC_DATA(avx512_cvtq2mask_256, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0), X86_INTRINSIC_DATA(avx512_cvtq2mask_512, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0), - X86_INTRINSIC_DATA(avx512_cvtsi2sd32, INTR_TYPE_3OP, X86ISD::SINT_TO_FP_RND, 0), - X86_INTRINSIC_DATA(avx512_cvtsi2sd64, INTR_TYPE_3OP, X86ISD::SINT_TO_FP_RND, 0), - X86_INTRINSIC_DATA(avx512_cvtsi2ss32, INTR_TYPE_3OP, X86ISD::SINT_TO_FP_RND, 0), - X86_INTRINSIC_DATA(avx512_cvtsi2ss64, INTR_TYPE_3OP, X86ISD::SINT_TO_FP_RND, 0), - X86_INTRINSIC_DATA(avx512_cvttsd2si, INTR_TYPE_2OP, X86ISD::FP_TO_SINT_RND, 0), - X86_INTRINSIC_DATA(avx512_cvttsd2si64, INTR_TYPE_2OP, X86ISD::FP_TO_SINT_RND, 0), - X86_INTRINSIC_DATA(avx512_cvttsd2usi, INTR_TYPE_2OP, X86ISD::FP_TO_UINT_RND, 0), - X86_INTRINSIC_DATA(avx512_cvttsd2usi64, INTR_TYPE_2OP, X86ISD::FP_TO_UINT_RND, 0), - X86_INTRINSIC_DATA(avx512_cvttss2si, INTR_TYPE_2OP, X86ISD::FP_TO_SINT_RND, 0), - X86_INTRINSIC_DATA(avx512_cvttss2si64, INTR_TYPE_2OP, X86ISD::FP_TO_SINT_RND, 0), - X86_INTRINSIC_DATA(avx512_cvttss2usi, INTR_TYPE_2OP, X86ISD::FP_TO_UINT_RND, 0), - X86_INTRINSIC_DATA(avx512_cvttss2usi64, INTR_TYPE_2OP, X86ISD::FP_TO_UINT_RND, 0), - X86_INTRINSIC_DATA(avx512_cvtusi2ss, INTR_TYPE_3OP, X86ISD::UINT_TO_FP_RND, 0), - X86_INTRINSIC_DATA(avx512_cvtusi642sd, INTR_TYPE_3OP, X86ISD::UINT_TO_FP_RND, 0), - X86_INTRINSIC_DATA(avx512_cvtusi642ss, INTR_TYPE_3OP, X86ISD::UINT_TO_FP_RND, 0), + X86_INTRINSIC_DATA(avx512_cvtsi2sd64, INTR_TYPE_3OP, X86ISD::SCALAR_SINT_TO_FP_RND, 0), + X86_INTRINSIC_DATA(avx512_cvtsi2ss32, 
INTR_TYPE_3OP, X86ISD::SCALAR_SINT_TO_FP_RND, 0), + X86_INTRINSIC_DATA(avx512_cvtsi2ss64, INTR_TYPE_3OP, X86ISD::SCALAR_SINT_TO_FP_RND, 0), + X86_INTRINSIC_DATA(avx512_cvttsd2si, INTR_TYPE_2OP, X86ISD::CVTTS2SI_RND, 0), + X86_INTRINSIC_DATA(avx512_cvttsd2si64, INTR_TYPE_2OP, X86ISD::CVTTS2SI_RND, 0), + X86_INTRINSIC_DATA(avx512_cvttsd2usi, INTR_TYPE_2OP, X86ISD::CVTTS2UI_RND, 0), + X86_INTRINSIC_DATA(avx512_cvttsd2usi64, INTR_TYPE_2OP, X86ISD::CVTTS2UI_RND, 0), + X86_INTRINSIC_DATA(avx512_cvttss2si, INTR_TYPE_2OP, X86ISD::CVTTS2SI_RND, 0), + X86_INTRINSIC_DATA(avx512_cvttss2si64, INTR_TYPE_2OP, X86ISD::CVTTS2SI_RND, 0), + X86_INTRINSIC_DATA(avx512_cvttss2usi, INTR_TYPE_2OP, X86ISD::CVTTS2UI_RND, 0), + X86_INTRINSIC_DATA(avx512_cvttss2usi64, INTR_TYPE_2OP, X86ISD::CVTTS2UI_RND, 0), + X86_INTRINSIC_DATA(avx512_cvtusi2ss, INTR_TYPE_3OP, X86ISD::SCALAR_UINT_TO_FP_RND, 0), + X86_INTRINSIC_DATA(avx512_cvtusi642sd, INTR_TYPE_3OP, X86ISD::SCALAR_UINT_TO_FP_RND, 0), + X86_INTRINSIC_DATA(avx512_cvtusi642ss, INTR_TYPE_3OP, X86ISD::SCALAR_UINT_TO_FP_RND, 0), X86_INTRINSIC_DATA(avx512_cvtw2mask_128, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0), X86_INTRINSIC_DATA(avx512_cvtw2mask_256, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0), X86_INTRINSIC_DATA(avx512_cvtw2mask_512, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0), @@ -377,30 +459,14 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_kunpck_dq, KUNPCK, ISD::CONCAT_VECTORS, 0), X86_INTRINSIC_DATA(avx512_kunpck_wd, KUNPCK, ISD::CONCAT_VECTORS, 0), - X86_INTRINSIC_DATA(avx512_mask_add_pd_128, INTR_TYPE_2OP_MASK, ISD::FADD, 0), - X86_INTRINSIC_DATA(avx512_mask_add_pd_256, INTR_TYPE_2OP_MASK, ISD::FADD, 0), X86_INTRINSIC_DATA(avx512_mask_add_pd_512, INTR_TYPE_2OP_MASK, ISD::FADD, X86ISD::FADD_RND), - X86_INTRINSIC_DATA(avx512_mask_add_ps_128, INTR_TYPE_2OP_MASK, ISD::FADD, 0), - X86_INTRINSIC_DATA(avx512_mask_add_ps_256, INTR_TYPE_2OP_MASK, ISD::FADD, 0), X86_INTRINSIC_DATA(avx512_mask_add_ps_512, INTR_TYPE_2OP_MASK, ISD::FADD, X86ISD::FADD_RND), - X86_INTRINSIC_DATA(avx512_mask_add_sd_round, INTR_TYPE_SCALAR_MASK_RM, ISD::FADD, - X86ISD::FADD_RND), - X86_INTRINSIC_DATA(avx512_mask_add_ss_round, INTR_TYPE_SCALAR_MASK_RM, ISD::FADD, - X86ISD::FADD_RND), - X86_INTRINSIC_DATA(avx512_mask_and_pd_128, INTR_TYPE_2OP_MASK, X86ISD::FAND, 0), - X86_INTRINSIC_DATA(avx512_mask_and_pd_256, INTR_TYPE_2OP_MASK, X86ISD::FAND, 0), - X86_INTRINSIC_DATA(avx512_mask_and_pd_512, INTR_TYPE_2OP_MASK, X86ISD::FAND, 0), - X86_INTRINSIC_DATA(avx512_mask_and_ps_128, INTR_TYPE_2OP_MASK, X86ISD::FAND, 0), - X86_INTRINSIC_DATA(avx512_mask_and_ps_256, INTR_TYPE_2OP_MASK, X86ISD::FAND, 0), - X86_INTRINSIC_DATA(avx512_mask_and_ps_512, INTR_TYPE_2OP_MASK, X86ISD::FAND, 0), - X86_INTRINSIC_DATA(avx512_mask_andn_pd_128, INTR_TYPE_2OP_MASK, X86ISD::FANDN, 0), - X86_INTRINSIC_DATA(avx512_mask_andn_pd_256, INTR_TYPE_2OP_MASK, X86ISD::FANDN, 0), - X86_INTRINSIC_DATA(avx512_mask_andn_pd_512, INTR_TYPE_2OP_MASK, X86ISD::FANDN, 0), - X86_INTRINSIC_DATA(avx512_mask_andn_ps_128, INTR_TYPE_2OP_MASK, X86ISD::FANDN, 0), - X86_INTRINSIC_DATA(avx512_mask_andn_ps_256, INTR_TYPE_2OP_MASK, X86ISD::FANDN, 0), - X86_INTRINSIC_DATA(avx512_mask_andn_ps_512, INTR_TYPE_2OP_MASK, X86ISD::FANDN, 0), + X86_INTRINSIC_DATA(avx512_mask_add_sd_round, INTR_TYPE_SCALAR_MASK_RM, + X86ISD::FADD_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_add_ss_round, INTR_TYPE_SCALAR_MASK_RM, + X86ISD::FADD_RND, 0), X86_INTRINSIC_DATA(avx512_mask_broadcastf32x2_256, BRCST32x2_TO_VEC, X86ISD::VBROADCAST, 0), 
X86_INTRINSIC_DATA(avx512_mask_broadcastf32x2_512, BRCST32x2_TO_VEC, @@ -452,10 +518,10 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_mask_cmp_q_128, CMP_MASK_CC, X86ISD::CMPM, 0), X86_INTRINSIC_DATA(avx512_mask_cmp_q_256, CMP_MASK_CC, X86ISD::CMPM, 0), X86_INTRINSIC_DATA(avx512_mask_cmp_q_512, CMP_MASK_CC, X86ISD::CMPM, 0), - X86_INTRINSIC_DATA(avx512_mask_cmp_sd, CMP_MASK_SCALAR_CC, X86ISD::FSETCC, - X86ISD::FSETCC), - X86_INTRINSIC_DATA(avx512_mask_cmp_ss, CMP_MASK_SCALAR_CC, X86ISD::FSETCC, - X86ISD::FSETCC), + X86_INTRINSIC_DATA(avx512_mask_cmp_sd, CMP_MASK_SCALAR_CC, + X86ISD::FSETCCM, X86ISD::FSETCCM_RND), + X86_INTRINSIC_DATA(avx512_mask_cmp_ss, CMP_MASK_SCALAR_CC, + X86ISD::FSETCCM, X86ISD::FSETCCM_RND), X86_INTRINSIC_DATA(avx512_mask_cmp_w_128, CMP_MASK_CC, X86ISD::CMPM, 0), X86_INTRINSIC_DATA(avx512_mask_cmp_w_256, CMP_MASK_CC, X86ISD::CMPM, 0), X86_INTRINSIC_DATA(avx512_mask_cmp_w_512, CMP_MASK_CC, X86ISD::CMPM, 0), @@ -495,184 +561,168 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86ISD::CONFLICT, 0), X86_INTRINSIC_DATA(avx512_mask_conflict_q_512, INTR_TYPE_1OP_MASK, X86ISD::CONFLICT, 0), - X86_INTRINSIC_DATA(avx512_mask_cvtdq2pd_128, INTR_TYPE_1OP_MASK, - X86ISD::CVTDQ2PD, 0), - X86_INTRINSIC_DATA(avx512_mask_cvtdq2pd_256, INTR_TYPE_1OP_MASK, - ISD::SINT_TO_FP, 0), - X86_INTRINSIC_DATA(avx512_mask_cvtdq2pd_512, INTR_TYPE_1OP_MASK, - ISD::SINT_TO_FP, 0), // no rm X86_INTRINSIC_DATA(avx512_mask_cvtdq2ps_128, INTR_TYPE_1OP_MASK, ISD::SINT_TO_FP, 0), X86_INTRINSIC_DATA(avx512_mask_cvtdq2ps_256, INTR_TYPE_1OP_MASK, ISD::SINT_TO_FP, 0), X86_INTRINSIC_DATA(avx512_mask_cvtdq2ps_512, INTR_TYPE_1OP_MASK, - ISD::SINT_TO_FP, ISD::SINT_TO_FP), //er + ISD::SINT_TO_FP, X86ISD::SINT_TO_FP_RND), //er X86_INTRINSIC_DATA(avx512_mask_cvtpd2dq_128, INTR_TYPE_1OP_MASK, - X86ISD::FP_TO_SINT_RND, 0), + X86ISD::CVTP2SI, 0), X86_INTRINSIC_DATA(avx512_mask_cvtpd2dq_256, INTR_TYPE_1OP_MASK, - X86ISD::FP_TO_SINT_RND, 0), + X86ISD::CVTP2SI, 0), X86_INTRINSIC_DATA(avx512_mask_cvtpd2dq_512, INTR_TYPE_1OP_MASK, - X86ISD::FP_TO_SINT_RND, X86ISD::FP_TO_SINT_RND), + X86ISD::CVTP2SI, X86ISD::CVTP2SI_RND), X86_INTRINSIC_DATA(avx512_mask_cvtpd2ps, INTR_TYPE_1OP_MASK, X86ISD::VFPROUND, 0), - X86_INTRINSIC_DATA(avx512_mask_cvtpd2ps_256, INTR_TYPE_1OP_MASK_RM, + X86_INTRINSIC_DATA(avx512_mask_cvtpd2ps_256, CVTPD2PS_MASK, ISD::FP_ROUND, 0), - X86_INTRINSIC_DATA(avx512_mask_cvtpd2ps_512, INTR_TYPE_1OP_MASK_RM, - ISD::FP_ROUND, X86ISD::VFPROUND), + X86_INTRINSIC_DATA(avx512_mask_cvtpd2ps_512, CVTPD2PS_MASK, + ISD::FP_ROUND, X86ISD::VFPROUND_RND), X86_INTRINSIC_DATA(avx512_mask_cvtpd2qq_128, INTR_TYPE_1OP_MASK, - X86ISD::FP_TO_SINT_RND, 0), + X86ISD::CVTP2SI, 0), X86_INTRINSIC_DATA(avx512_mask_cvtpd2qq_256, INTR_TYPE_1OP_MASK, - X86ISD::FP_TO_SINT_RND, 0), + X86ISD::CVTP2SI, 0), X86_INTRINSIC_DATA(avx512_mask_cvtpd2qq_512, INTR_TYPE_1OP_MASK, - X86ISD::FP_TO_SINT_RND, X86ISD::FP_TO_SINT_RND), + X86ISD::CVTP2SI, X86ISD::CVTP2SI_RND), X86_INTRINSIC_DATA(avx512_mask_cvtpd2udq_128, INTR_TYPE_1OP_MASK, - X86ISD::FP_TO_UINT_RND, 0), + X86ISD::CVTP2UI, 0), X86_INTRINSIC_DATA(avx512_mask_cvtpd2udq_256, INTR_TYPE_1OP_MASK, - X86ISD::FP_TO_UINT_RND, 0), + X86ISD::CVTP2UI, 0), X86_INTRINSIC_DATA(avx512_mask_cvtpd2udq_512, INTR_TYPE_1OP_MASK, - X86ISD::FP_TO_UINT_RND, X86ISD::FP_TO_UINT_RND), + X86ISD::CVTP2UI, X86ISD::CVTP2UI_RND), X86_INTRINSIC_DATA(avx512_mask_cvtpd2uqq_128, INTR_TYPE_1OP_MASK, - X86ISD::FP_TO_UINT_RND, 0), + X86ISD::CVTP2UI, 0), 
X86_INTRINSIC_DATA(avx512_mask_cvtpd2uqq_256, INTR_TYPE_1OP_MASK, - X86ISD::FP_TO_UINT_RND, 0), + X86ISD::CVTP2UI, 0), X86_INTRINSIC_DATA(avx512_mask_cvtpd2uqq_512, INTR_TYPE_1OP_MASK, - X86ISD::FP_TO_UINT_RND, X86ISD::FP_TO_UINT_RND), + X86ISD::CVTP2UI, X86ISD::CVTP2UI_RND), X86_INTRINSIC_DATA(avx512_mask_cvtps2dq_128, INTR_TYPE_1OP_MASK, - X86ISD::FP_TO_SINT_RND, 0), + X86ISD::CVTP2SI, 0), X86_INTRINSIC_DATA(avx512_mask_cvtps2dq_256, INTR_TYPE_1OP_MASK, - X86ISD::FP_TO_SINT_RND, 0), + X86ISD::CVTP2SI, 0), X86_INTRINSIC_DATA(avx512_mask_cvtps2dq_512, INTR_TYPE_1OP_MASK, - X86ISD::FP_TO_SINT_RND, X86ISD::FP_TO_SINT_RND), + X86ISD::CVTP2SI, X86ISD::CVTP2SI_RND), X86_INTRINSIC_DATA(avx512_mask_cvtps2pd_128, INTR_TYPE_1OP_MASK, X86ISD::VFPEXT, 0), X86_INTRINSIC_DATA(avx512_mask_cvtps2pd_256, INTR_TYPE_1OP_MASK, ISD::FP_EXTEND, 0), X86_INTRINSIC_DATA(avx512_mask_cvtps2pd_512, INTR_TYPE_1OP_MASK, - ISD::FP_EXTEND, X86ISD::VFPEXT), + ISD::FP_EXTEND, X86ISD::VFPEXT_RND), X86_INTRINSIC_DATA(avx512_mask_cvtps2qq_128, INTR_TYPE_1OP_MASK, - X86ISD::FP_TO_SINT_RND, 0), + X86ISD::CVTP2SI, 0), X86_INTRINSIC_DATA(avx512_mask_cvtps2qq_256, INTR_TYPE_1OP_MASK, - X86ISD::FP_TO_SINT_RND, 0), + X86ISD::CVTP2SI, 0), X86_INTRINSIC_DATA(avx512_mask_cvtps2qq_512, INTR_TYPE_1OP_MASK, - X86ISD::FP_TO_SINT_RND, X86ISD::FP_TO_SINT_RND), + X86ISD::CVTP2SI, X86ISD::CVTP2SI_RND), X86_INTRINSIC_DATA(avx512_mask_cvtps2udq_128, INTR_TYPE_1OP_MASK, - X86ISD::FP_TO_UINT_RND, 0), + X86ISD::CVTP2UI, 0), X86_INTRINSIC_DATA(avx512_mask_cvtps2udq_256, INTR_TYPE_1OP_MASK, - X86ISD::FP_TO_UINT_RND, 0), + X86ISD::CVTP2UI, 0), X86_INTRINSIC_DATA(avx512_mask_cvtps2udq_512, INTR_TYPE_1OP_MASK, - X86ISD::FP_TO_UINT_RND, X86ISD::FP_TO_UINT_RND), + X86ISD::CVTP2UI, X86ISD::CVTP2UI_RND), X86_INTRINSIC_DATA(avx512_mask_cvtps2uqq_128, INTR_TYPE_1OP_MASK, - X86ISD::FP_TO_UINT_RND, 0), + X86ISD::CVTP2UI, 0), X86_INTRINSIC_DATA(avx512_mask_cvtps2uqq_256, INTR_TYPE_1OP_MASK, - X86ISD::FP_TO_UINT_RND, 0), + X86ISD::CVTP2UI, 0), X86_INTRINSIC_DATA(avx512_mask_cvtps2uqq_512, INTR_TYPE_1OP_MASK, - X86ISD::FP_TO_UINT_RND, X86ISD::FP_TO_UINT_RND), + X86ISD::CVTP2UI, X86ISD::CVTP2UI_RND), X86_INTRINSIC_DATA(avx512_mask_cvtqq2pd_128, INTR_TYPE_1OP_MASK, ISD::SINT_TO_FP, 0), X86_INTRINSIC_DATA(avx512_mask_cvtqq2pd_256, INTR_TYPE_1OP_MASK, ISD::SINT_TO_FP, 0), X86_INTRINSIC_DATA(avx512_mask_cvtqq2pd_512, INTR_TYPE_1OP_MASK, - ISD::SINT_TO_FP, ISD::SINT_TO_FP), + ISD::SINT_TO_FP, X86ISD::SINT_TO_FP_RND), X86_INTRINSIC_DATA(avx512_mask_cvtqq2ps_128, INTR_TYPE_1OP_MASK, - ISD::SINT_TO_FP, 0), + X86ISD::CVTSI2P, 0), X86_INTRINSIC_DATA(avx512_mask_cvtqq2ps_256, INTR_TYPE_1OP_MASK, ISD::SINT_TO_FP, 0), X86_INTRINSIC_DATA(avx512_mask_cvtqq2ps_512, INTR_TYPE_1OP_MASK, - ISD::SINT_TO_FP, ISD::SINT_TO_FP), + ISD::SINT_TO_FP, X86ISD::SINT_TO_FP_RND), X86_INTRINSIC_DATA(avx512_mask_cvtsd2ss_round, INTR_TYPE_SCALAR_MASK_RM, - X86ISD::VFPROUND, 0), + X86ISD::VFPROUNDS_RND, 0), X86_INTRINSIC_DATA(avx512_mask_cvtss2sd_round, INTR_TYPE_SCALAR_MASK_RM, - X86ISD::VFPEXT, 0), + X86ISD::VFPEXTS_RND, 0), X86_INTRINSIC_DATA(avx512_mask_cvttpd2dq_128, INTR_TYPE_1OP_MASK, - ISD::FP_TO_SINT, 0), + X86ISD::CVTTP2SI, 0), X86_INTRINSIC_DATA(avx512_mask_cvttpd2dq_256, INTR_TYPE_1OP_MASK, ISD::FP_TO_SINT, 0), X86_INTRINSIC_DATA(avx512_mask_cvttpd2dq_512, INTR_TYPE_1OP_MASK, - ISD::FP_TO_SINT, ISD::FP_TO_SINT), + ISD::FP_TO_SINT, X86ISD::CVTTP2SI_RND), X86_INTRINSIC_DATA(avx512_mask_cvttpd2qq_128, INTR_TYPE_1OP_MASK, ISD::FP_TO_SINT, 0), X86_INTRINSIC_DATA(avx512_mask_cvttpd2qq_256, 
INTR_TYPE_1OP_MASK, ISD::FP_TO_SINT, 0), X86_INTRINSIC_DATA(avx512_mask_cvttpd2qq_512, INTR_TYPE_1OP_MASK, - ISD::FP_TO_SINT, ISD::FP_TO_SINT), + ISD::FP_TO_SINT, X86ISD::CVTTP2SI_RND), X86_INTRINSIC_DATA(avx512_mask_cvttpd2udq_128, INTR_TYPE_1OP_MASK, - ISD::FP_TO_UINT, 0), + X86ISD::CVTTP2UI, 0), X86_INTRINSIC_DATA(avx512_mask_cvttpd2udq_256, INTR_TYPE_1OP_MASK, ISD::FP_TO_UINT, 0), X86_INTRINSIC_DATA(avx512_mask_cvttpd2udq_512, INTR_TYPE_1OP_MASK, - ISD::FP_TO_UINT, ISD::FP_TO_UINT), + ISD::FP_TO_UINT, X86ISD::CVTTP2UI_RND), X86_INTRINSIC_DATA(avx512_mask_cvttpd2uqq_128, INTR_TYPE_1OP_MASK, ISD::FP_TO_UINT, 0), X86_INTRINSIC_DATA(avx512_mask_cvttpd2uqq_256, INTR_TYPE_1OP_MASK, ISD::FP_TO_UINT, 0), X86_INTRINSIC_DATA(avx512_mask_cvttpd2uqq_512, INTR_TYPE_1OP_MASK, - ISD::FP_TO_UINT, ISD::FP_TO_UINT), + ISD::FP_TO_UINT, X86ISD::CVTTP2UI_RND), X86_INTRINSIC_DATA(avx512_mask_cvttps2dq_128, INTR_TYPE_1OP_MASK, ISD::FP_TO_SINT, 0), X86_INTRINSIC_DATA(avx512_mask_cvttps2dq_256, INTR_TYPE_1OP_MASK, ISD::FP_TO_SINT, 0), X86_INTRINSIC_DATA(avx512_mask_cvttps2dq_512, INTR_TYPE_1OP_MASK, - ISD::FP_TO_SINT, ISD::FP_TO_SINT), + ISD::FP_TO_SINT, X86ISD::CVTTP2SI_RND), X86_INTRINSIC_DATA(avx512_mask_cvttps2qq_128, INTR_TYPE_1OP_MASK, - ISD::FP_TO_SINT, 0), + X86ISD::CVTTP2SI, 0), X86_INTRINSIC_DATA(avx512_mask_cvttps2qq_256, INTR_TYPE_1OP_MASK, ISD::FP_TO_SINT, 0), X86_INTRINSIC_DATA(avx512_mask_cvttps2qq_512, INTR_TYPE_1OP_MASK, - ISD::FP_TO_SINT, ISD::FP_TO_SINT), + ISD::FP_TO_SINT, X86ISD::CVTTP2SI_RND), X86_INTRINSIC_DATA(avx512_mask_cvttps2udq_128, INTR_TYPE_1OP_MASK, ISD::FP_TO_UINT, 0), X86_INTRINSIC_DATA(avx512_mask_cvttps2udq_256, INTR_TYPE_1OP_MASK, ISD::FP_TO_UINT, 0), X86_INTRINSIC_DATA(avx512_mask_cvttps2udq_512, INTR_TYPE_1OP_MASK, - ISD::FP_TO_UINT, ISD::FP_TO_UINT), + ISD::FP_TO_UINT, X86ISD::CVTTP2UI_RND), X86_INTRINSIC_DATA(avx512_mask_cvttps2uqq_128, INTR_TYPE_1OP_MASK, - ISD::FP_TO_UINT, 0), + X86ISD::CVTTP2UI, 0), X86_INTRINSIC_DATA(avx512_mask_cvttps2uqq_256, INTR_TYPE_1OP_MASK, ISD::FP_TO_UINT, 0), X86_INTRINSIC_DATA(avx512_mask_cvttps2uqq_512, INTR_TYPE_1OP_MASK, - ISD::FP_TO_UINT, ISD::FP_TO_UINT), - X86_INTRINSIC_DATA(avx512_mask_cvtudq2pd_128, INTR_TYPE_1OP_MASK, - X86ISD::CVTUDQ2PD, 0), - X86_INTRINSIC_DATA(avx512_mask_cvtudq2pd_256, INTR_TYPE_1OP_MASK, - ISD::UINT_TO_FP, 0), - X86_INTRINSIC_DATA(avx512_mask_cvtudq2pd_512, INTR_TYPE_1OP_MASK, - ISD::UINT_TO_FP, 0), // no rm + ISD::FP_TO_UINT, X86ISD::CVTTP2UI_RND), X86_INTRINSIC_DATA(avx512_mask_cvtudq2ps_128, INTR_TYPE_1OP_MASK, ISD::UINT_TO_FP, 0), X86_INTRINSIC_DATA(avx512_mask_cvtudq2ps_256, INTR_TYPE_1OP_MASK, ISD::UINT_TO_FP, 0), X86_INTRINSIC_DATA(avx512_mask_cvtudq2ps_512, INTR_TYPE_1OP_MASK, - ISD::UINT_TO_FP, ISD::UINT_TO_FP), + ISD::UINT_TO_FP, X86ISD::UINT_TO_FP_RND), X86_INTRINSIC_DATA(avx512_mask_cvtuqq2pd_128, INTR_TYPE_1OP_MASK, ISD::UINT_TO_FP, 0), X86_INTRINSIC_DATA(avx512_mask_cvtuqq2pd_256, INTR_TYPE_1OP_MASK, ISD::UINT_TO_FP, 0), X86_INTRINSIC_DATA(avx512_mask_cvtuqq2pd_512, INTR_TYPE_1OP_MASK, - ISD::UINT_TO_FP, ISD::UINT_TO_FP), + ISD::UINT_TO_FP, X86ISD::UINT_TO_FP_RND), X86_INTRINSIC_DATA(avx512_mask_cvtuqq2ps_128, INTR_TYPE_1OP_MASK, - ISD::UINT_TO_FP, 0), + X86ISD::CVTUI2P, 0), X86_INTRINSIC_DATA(avx512_mask_cvtuqq2ps_256, INTR_TYPE_1OP_MASK, ISD::UINT_TO_FP, 0), X86_INTRINSIC_DATA(avx512_mask_cvtuqq2ps_512, INTR_TYPE_1OP_MASK, - ISD::UINT_TO_FP, ISD::UINT_TO_FP), + ISD::UINT_TO_FP, X86ISD::UINT_TO_FP_RND), X86_INTRINSIC_DATA(avx512_mask_dbpsadbw_128, INTR_TYPE_3OP_IMM8_MASK, X86ISD::DBPSADBW, 0), 
X86_INTRINSIC_DATA(avx512_mask_dbpsadbw_256, INTR_TYPE_3OP_IMM8_MASK, X86ISD::DBPSADBW, 0), X86_INTRINSIC_DATA(avx512_mask_dbpsadbw_512, INTR_TYPE_3OP_IMM8_MASK, X86ISD::DBPSADBW, 0), - X86_INTRINSIC_DATA(avx512_mask_div_pd_128, INTR_TYPE_2OP_MASK, ISD::FDIV, 0), - X86_INTRINSIC_DATA(avx512_mask_div_pd_256, INTR_TYPE_2OP_MASK, ISD::FDIV, 0), X86_INTRINSIC_DATA(avx512_mask_div_pd_512, INTR_TYPE_2OP_MASK, ISD::FDIV, X86ISD::FDIV_RND), - X86_INTRINSIC_DATA(avx512_mask_div_ps_128, INTR_TYPE_2OP_MASK, ISD::FDIV, 0), - X86_INTRINSIC_DATA(avx512_mask_div_ps_256, INTR_TYPE_2OP_MASK, ISD::FDIV, 0), X86_INTRINSIC_DATA(avx512_mask_div_ps_512, INTR_TYPE_2OP_MASK, ISD::FDIV, X86ISD::FDIV_RND), - X86_INTRINSIC_DATA(avx512_mask_div_sd_round, INTR_TYPE_SCALAR_MASK_RM, ISD::FDIV, - X86ISD::FDIV_RND), - X86_INTRINSIC_DATA(avx512_mask_div_ss_round, INTR_TYPE_SCALAR_MASK_RM, ISD::FDIV, - X86ISD::FDIV_RND), + X86_INTRINSIC_DATA(avx512_mask_div_sd_round, INTR_TYPE_SCALAR_MASK_RM, + X86ISD::FDIV_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_div_ss_round, INTR_TYPE_SCALAR_MASK_RM, + X86ISD::FDIV_RND, 0), X86_INTRINSIC_DATA(avx512_mask_expand_d_128, COMPRESS_EXPAND_IN_REG, X86ISD::EXPAND, 0), X86_INTRINSIC_DATA(avx512_mask_expand_d_256, COMPRESS_EXPAND_IN_REG, @@ -726,9 +776,9 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_mask_getexp_ps_512, INTR_TYPE_1OP_MASK_RM, X86ISD::FGETEXP_RND, 0), X86_INTRINSIC_DATA(avx512_mask_getexp_sd, INTR_TYPE_SCALAR_MASK_RM, - X86ISD::FGETEXP_RND, 0), + X86ISD::FGETEXPS_RND, 0), X86_INTRINSIC_DATA(avx512_mask_getexp_ss, INTR_TYPE_SCALAR_MASK_RM, - X86ISD::FGETEXP_RND, 0), + X86ISD::FGETEXPS_RND, 0), X86_INTRINSIC_DATA(avx512_mask_getmant_pd_128, INTR_TYPE_2OP_MASK_RM, X86ISD::VGETMANT, 0), X86_INTRINSIC_DATA(avx512_mask_getmant_pd_256, INTR_TYPE_2OP_MASK_RM, @@ -742,33 +792,9 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_mask_getmant_ps_512, INTR_TYPE_2OP_MASK_RM, X86ISD::VGETMANT, 0), X86_INTRINSIC_DATA(avx512_mask_getmant_sd, INTR_TYPE_3OP_SCALAR_MASK_RM, - X86ISD::VGETMANT, 0), + X86ISD::VGETMANTS, 0), X86_INTRINSIC_DATA(avx512_mask_getmant_ss, INTR_TYPE_3OP_SCALAR_MASK_RM, - X86ISD::VGETMANT, 0), - X86_INTRINSIC_DATA(avx512_mask_insertf32x4_256, INSERT_SUBVEC, - ISD::INSERT_SUBVECTOR, 0), - X86_INTRINSIC_DATA(avx512_mask_insertf32x4_512, INSERT_SUBVEC, - ISD::INSERT_SUBVECTOR, 0), - X86_INTRINSIC_DATA(avx512_mask_insertf32x8_512, INSERT_SUBVEC, - ISD::INSERT_SUBVECTOR, 0), - X86_INTRINSIC_DATA(avx512_mask_insertf64x2_256, INSERT_SUBVEC, - ISD::INSERT_SUBVECTOR, 0), - X86_INTRINSIC_DATA(avx512_mask_insertf64x2_512, INSERT_SUBVEC, - ISD::INSERT_SUBVECTOR, 0), - X86_INTRINSIC_DATA(avx512_mask_insertf64x4_512, INSERT_SUBVEC, - ISD::INSERT_SUBVECTOR, 0), - X86_INTRINSIC_DATA(avx512_mask_inserti32x4_256, INSERT_SUBVEC, - ISD::INSERT_SUBVECTOR, 0), - X86_INTRINSIC_DATA(avx512_mask_inserti32x4_512, INSERT_SUBVEC, - ISD::INSERT_SUBVECTOR, 0), - X86_INTRINSIC_DATA(avx512_mask_inserti32x8_512, INSERT_SUBVEC, - ISD::INSERT_SUBVECTOR, 0), - X86_INTRINSIC_DATA(avx512_mask_inserti64x2_256, INSERT_SUBVEC, - ISD::INSERT_SUBVECTOR, 0), - X86_INTRINSIC_DATA(avx512_mask_inserti64x2_512, INSERT_SUBVEC, - ISD::INSERT_SUBVECTOR, 0), - X86_INTRINSIC_DATA(avx512_mask_inserti64x4_512, INSERT_SUBVEC, - ISD::INSERT_SUBVECTOR, 0), + X86ISD::VGETMANTS, 0), X86_INTRINSIC_DATA(avx512_mask_lzcnt_d_128, INTR_TYPE_1OP_MASK, ISD::CTLZ, 0), X86_INTRINSIC_DATA(avx512_mask_lzcnt_d_256, INTR_TYPE_1OP_MASK, @@ -790,9 +816,9 @@ static const 
IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_mask_max_ps_512, INTR_TYPE_2OP_MASK, X86ISD::FMAX, X86ISD::FMAX_RND), X86_INTRINSIC_DATA(avx512_mask_max_sd_round, INTR_TYPE_SCALAR_MASK_RM, - X86ISD::FMAX, X86ISD::FMAX_RND), + X86ISD::FMAX_RND, 0), X86_INTRINSIC_DATA(avx512_mask_max_ss_round, INTR_TYPE_SCALAR_MASK_RM, - X86ISD::FMAX, X86ISD::FMAX_RND), + X86ISD::FMAX_RND, 0), X86_INTRINSIC_DATA(avx512_mask_min_pd_128, INTR_TYPE_2OP_MASK, X86ISD::FMIN, 0), X86_INTRINSIC_DATA(avx512_mask_min_pd_256, INTR_TYPE_2OP_MASK, X86ISD::FMIN, 0), X86_INTRINSIC_DATA(avx512_mask_min_pd_512, INTR_TYPE_2OP_MASK, X86ISD::FMIN, @@ -802,31 +828,17 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_mask_min_ps_512, INTR_TYPE_2OP_MASK, X86ISD::FMIN, X86ISD::FMIN_RND), X86_INTRINSIC_DATA(avx512_mask_min_sd_round, INTR_TYPE_SCALAR_MASK_RM, - X86ISD::FMIN, X86ISD::FMIN_RND), + X86ISD::FMIN_RND, 0), X86_INTRINSIC_DATA(avx512_mask_min_ss_round, INTR_TYPE_SCALAR_MASK_RM, - X86ISD::FMIN, X86ISD::FMIN_RND), - X86_INTRINSIC_DATA(avx512_mask_move_sd, INTR_TYPE_SCALAR_MASK, - X86ISD::MOVSD, 0), - X86_INTRINSIC_DATA(avx512_mask_move_ss, INTR_TYPE_SCALAR_MASK, - X86ISD::MOVSS, 0), - X86_INTRINSIC_DATA(avx512_mask_mul_pd_128, INTR_TYPE_2OP_MASK, ISD::FMUL, 0), - X86_INTRINSIC_DATA(avx512_mask_mul_pd_256, INTR_TYPE_2OP_MASK, ISD::FMUL, 0), + X86ISD::FMIN_RND, 0), X86_INTRINSIC_DATA(avx512_mask_mul_pd_512, INTR_TYPE_2OP_MASK, ISD::FMUL, X86ISD::FMUL_RND), - X86_INTRINSIC_DATA(avx512_mask_mul_ps_128, INTR_TYPE_2OP_MASK, ISD::FMUL, 0), - X86_INTRINSIC_DATA(avx512_mask_mul_ps_256, INTR_TYPE_2OP_MASK, ISD::FMUL, 0), X86_INTRINSIC_DATA(avx512_mask_mul_ps_512, INTR_TYPE_2OP_MASK, ISD::FMUL, X86ISD::FMUL_RND), - X86_INTRINSIC_DATA(avx512_mask_mul_sd_round, INTR_TYPE_SCALAR_MASK_RM, ISD::FMUL, - X86ISD::FMUL_RND), - X86_INTRINSIC_DATA(avx512_mask_mul_ss_round, INTR_TYPE_SCALAR_MASK_RM, ISD::FMUL, - X86ISD::FMUL_RND), - X86_INTRINSIC_DATA(avx512_mask_or_pd_128, INTR_TYPE_2OP_MASK, X86ISD::FOR, 0), - X86_INTRINSIC_DATA(avx512_mask_or_pd_256, INTR_TYPE_2OP_MASK, X86ISD::FOR, 0), - X86_INTRINSIC_DATA(avx512_mask_or_pd_512, INTR_TYPE_2OP_MASK, X86ISD::FOR, 0), - X86_INTRINSIC_DATA(avx512_mask_or_ps_128, INTR_TYPE_2OP_MASK, X86ISD::FOR, 0), - X86_INTRINSIC_DATA(avx512_mask_or_ps_256, INTR_TYPE_2OP_MASK, X86ISD::FOR, 0), - X86_INTRINSIC_DATA(avx512_mask_or_ps_512, INTR_TYPE_2OP_MASK, X86ISD::FOR, 0), + X86_INTRINSIC_DATA(avx512_mask_mul_sd_round, INTR_TYPE_SCALAR_MASK_RM, + X86ISD::FMUL_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_mul_ss_round, INTR_TYPE_SCALAR_MASK_RM, + X86ISD::FMUL_RND, 0), X86_INTRINSIC_DATA(avx512_mask_pabs_b_128, INTR_TYPE_1OP_MASK, X86ISD::ABS, 0), X86_INTRINSIC_DATA(avx512_mask_pabs_b_256, INTR_TYPE_1OP_MASK, X86ISD::ABS, 0), X86_INTRINSIC_DATA(avx512_mask_pabs_b_512, INTR_TYPE_1OP_MASK, X86ISD::ABS, 0), @@ -851,18 +863,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_mask_packuswb_128, INTR_TYPE_2OP_MASK, X86ISD::PACKUS, 0), X86_INTRINSIC_DATA(avx512_mask_packuswb_256, INTR_TYPE_2OP_MASK, X86ISD::PACKUS, 0), X86_INTRINSIC_DATA(avx512_mask_packuswb_512, INTR_TYPE_2OP_MASK, X86ISD::PACKUS, 0), - X86_INTRINSIC_DATA(avx512_mask_padd_b_128, INTR_TYPE_2OP_MASK, ISD::ADD, 0), - X86_INTRINSIC_DATA(avx512_mask_padd_b_256, INTR_TYPE_2OP_MASK, ISD::ADD, 0), - X86_INTRINSIC_DATA(avx512_mask_padd_b_512, INTR_TYPE_2OP_MASK, ISD::ADD, 0), - X86_INTRINSIC_DATA(avx512_mask_padd_d_128, INTR_TYPE_2OP_MASK, ISD::ADD, 0), - 
X86_INTRINSIC_DATA(avx512_mask_padd_d_256, INTR_TYPE_2OP_MASK, ISD::ADD, 0), - X86_INTRINSIC_DATA(avx512_mask_padd_d_512, INTR_TYPE_2OP_MASK, ISD::ADD, 0), - X86_INTRINSIC_DATA(avx512_mask_padd_q_128, INTR_TYPE_2OP_MASK, ISD::ADD, 0), - X86_INTRINSIC_DATA(avx512_mask_padd_q_256, INTR_TYPE_2OP_MASK, ISD::ADD, 0), - X86_INTRINSIC_DATA(avx512_mask_padd_q_512, INTR_TYPE_2OP_MASK, ISD::ADD, 0), - X86_INTRINSIC_DATA(avx512_mask_padd_w_128, INTR_TYPE_2OP_MASK, ISD::ADD, 0), - X86_INTRINSIC_DATA(avx512_mask_padd_w_256, INTR_TYPE_2OP_MASK, ISD::ADD, 0), - X86_INTRINSIC_DATA(avx512_mask_padd_w_512, INTR_TYPE_2OP_MASK, ISD::ADD, 0), X86_INTRINSIC_DATA(avx512_mask_padds_b_128, INTR_TYPE_2OP_MASK, X86ISD::ADDS, 0), X86_INTRINSIC_DATA(avx512_mask_padds_b_256, INTR_TYPE_2OP_MASK, X86ISD::ADDS, 0), X86_INTRINSIC_DATA(avx512_mask_padds_b_512, INTR_TYPE_2OP_MASK, X86ISD::ADDS, 0), @@ -945,54 +945,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86ISD::VPMADDWD, 0), X86_INTRINSIC_DATA(avx512_mask_pmaddw_d_512, INTR_TYPE_2OP_MASK, X86ISD::VPMADDWD, 0), - X86_INTRINSIC_DATA(avx512_mask_pmaxs_b_128, INTR_TYPE_2OP_MASK, ISD::SMAX, 0), - X86_INTRINSIC_DATA(avx512_mask_pmaxs_b_256, INTR_TYPE_2OP_MASK, ISD::SMAX, 0), - X86_INTRINSIC_DATA(avx512_mask_pmaxs_b_512, INTR_TYPE_2OP_MASK, ISD::SMAX, 0), - X86_INTRINSIC_DATA(avx512_mask_pmaxs_d_128, INTR_TYPE_2OP_MASK, ISD::SMAX, 0), - X86_INTRINSIC_DATA(avx512_mask_pmaxs_d_256, INTR_TYPE_2OP_MASK, ISD::SMAX, 0), - X86_INTRINSIC_DATA(avx512_mask_pmaxs_d_512, INTR_TYPE_2OP_MASK, ISD::SMAX, 0), - X86_INTRINSIC_DATA(avx512_mask_pmaxs_q_128, INTR_TYPE_2OP_MASK, ISD::SMAX, 0), - X86_INTRINSIC_DATA(avx512_mask_pmaxs_q_256, INTR_TYPE_2OP_MASK, ISD::SMAX, 0), - X86_INTRINSIC_DATA(avx512_mask_pmaxs_q_512, INTR_TYPE_2OP_MASK, ISD::SMAX, 0), - X86_INTRINSIC_DATA(avx512_mask_pmaxs_w_128, INTR_TYPE_2OP_MASK, ISD::SMAX, 0), - X86_INTRINSIC_DATA(avx512_mask_pmaxs_w_256, INTR_TYPE_2OP_MASK, ISD::SMAX, 0), - X86_INTRINSIC_DATA(avx512_mask_pmaxs_w_512, INTR_TYPE_2OP_MASK, ISD::SMAX, 0), - X86_INTRINSIC_DATA(avx512_mask_pmaxu_b_128, INTR_TYPE_2OP_MASK, ISD::UMAX, 0), - X86_INTRINSIC_DATA(avx512_mask_pmaxu_b_256, INTR_TYPE_2OP_MASK, ISD::UMAX, 0), - X86_INTRINSIC_DATA(avx512_mask_pmaxu_b_512, INTR_TYPE_2OP_MASK, ISD::UMAX, 0), - X86_INTRINSIC_DATA(avx512_mask_pmaxu_d_128, INTR_TYPE_2OP_MASK, ISD::UMAX, 0), - X86_INTRINSIC_DATA(avx512_mask_pmaxu_d_256, INTR_TYPE_2OP_MASK, ISD::UMAX, 0), - X86_INTRINSIC_DATA(avx512_mask_pmaxu_d_512, INTR_TYPE_2OP_MASK, ISD::UMAX, 0), - X86_INTRINSIC_DATA(avx512_mask_pmaxu_q_128, INTR_TYPE_2OP_MASK, ISD::UMAX, 0), - X86_INTRINSIC_DATA(avx512_mask_pmaxu_q_256, INTR_TYPE_2OP_MASK, ISD::UMAX, 0), - X86_INTRINSIC_DATA(avx512_mask_pmaxu_q_512, INTR_TYPE_2OP_MASK, ISD::UMAX, 0), - X86_INTRINSIC_DATA(avx512_mask_pmaxu_w_128, INTR_TYPE_2OP_MASK, ISD::UMAX, 0), - X86_INTRINSIC_DATA(avx512_mask_pmaxu_w_256, INTR_TYPE_2OP_MASK, ISD::UMAX, 0), - X86_INTRINSIC_DATA(avx512_mask_pmaxu_w_512, INTR_TYPE_2OP_MASK, ISD::UMAX, 0), - X86_INTRINSIC_DATA(avx512_mask_pmins_b_128, INTR_TYPE_2OP_MASK, ISD::SMIN, 0), - X86_INTRINSIC_DATA(avx512_mask_pmins_b_256, INTR_TYPE_2OP_MASK, ISD::SMIN, 0), - X86_INTRINSIC_DATA(avx512_mask_pmins_b_512, INTR_TYPE_2OP_MASK, ISD::SMIN, 0), - X86_INTRINSIC_DATA(avx512_mask_pmins_d_128, INTR_TYPE_2OP_MASK, ISD::SMIN, 0), - X86_INTRINSIC_DATA(avx512_mask_pmins_d_256, INTR_TYPE_2OP_MASK, ISD::SMIN, 0), - X86_INTRINSIC_DATA(avx512_mask_pmins_d_512, INTR_TYPE_2OP_MASK, ISD::SMIN, 0), - X86_INTRINSIC_DATA(avx512_mask_pmins_q_128, 
INTR_TYPE_2OP_MASK, ISD::SMIN, 0), - X86_INTRINSIC_DATA(avx512_mask_pmins_q_256, INTR_TYPE_2OP_MASK, ISD::SMIN, 0), - X86_INTRINSIC_DATA(avx512_mask_pmins_q_512, INTR_TYPE_2OP_MASK, ISD::SMIN, 0), - X86_INTRINSIC_DATA(avx512_mask_pmins_w_128, INTR_TYPE_2OP_MASK, ISD::SMIN, 0), - X86_INTRINSIC_DATA(avx512_mask_pmins_w_256, INTR_TYPE_2OP_MASK, ISD::SMIN, 0), - X86_INTRINSIC_DATA(avx512_mask_pmins_w_512, INTR_TYPE_2OP_MASK, ISD::SMIN, 0), - X86_INTRINSIC_DATA(avx512_mask_pminu_b_128, INTR_TYPE_2OP_MASK, ISD::UMIN, 0), - X86_INTRINSIC_DATA(avx512_mask_pminu_b_256, INTR_TYPE_2OP_MASK, ISD::UMIN, 0), - X86_INTRINSIC_DATA(avx512_mask_pminu_b_512, INTR_TYPE_2OP_MASK, ISD::UMIN, 0), - X86_INTRINSIC_DATA(avx512_mask_pminu_d_128, INTR_TYPE_2OP_MASK, ISD::UMIN, 0), - X86_INTRINSIC_DATA(avx512_mask_pminu_d_256, INTR_TYPE_2OP_MASK, ISD::UMIN, 0), - X86_INTRINSIC_DATA(avx512_mask_pminu_d_512, INTR_TYPE_2OP_MASK, ISD::UMIN, 0), - X86_INTRINSIC_DATA(avx512_mask_pminu_q_128, INTR_TYPE_2OP_MASK, ISD::UMIN, 0), - X86_INTRINSIC_DATA(avx512_mask_pminu_q_256, INTR_TYPE_2OP_MASK, ISD::UMIN, 0), - X86_INTRINSIC_DATA(avx512_mask_pminu_q_512, INTR_TYPE_2OP_MASK, ISD::UMIN, 0), - X86_INTRINSIC_DATA(avx512_mask_pminu_w_128, INTR_TYPE_2OP_MASK, ISD::UMIN, 0), - X86_INTRINSIC_DATA(avx512_mask_pminu_w_256, INTR_TYPE_2OP_MASK, ISD::UMIN, 0), - X86_INTRINSIC_DATA(avx512_mask_pminu_w_512, INTR_TYPE_2OP_MASK, ISD::UMIN, 0), X86_INTRINSIC_DATA(avx512_mask_pmov_db_128, INTR_TYPE_1OP_MASK, X86ISD::VTRUNC, 0), X86_INTRINSIC_DATA(avx512_mask_pmov_db_256, INTR_TYPE_1OP_MASK, @@ -1065,42 +1017,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86ISD::VTRUNCS, 0), X86_INTRINSIC_DATA(avx512_mask_pmovs_wb_512, INTR_TYPE_1OP_MASK, X86ISD::VTRUNCS, 0), - X86_INTRINSIC_DATA(avx512_mask_pmovsxb_d_128, INTR_TYPE_1OP_MASK, - X86ISD::VSEXT, 0), - X86_INTRINSIC_DATA(avx512_mask_pmovsxb_d_256, INTR_TYPE_1OP_MASK, - X86ISD::VSEXT, 0), - X86_INTRINSIC_DATA(avx512_mask_pmovsxb_d_512, INTR_TYPE_1OP_MASK, - X86ISD::VSEXT, 0), - X86_INTRINSIC_DATA(avx512_mask_pmovsxb_q_128, INTR_TYPE_1OP_MASK, - X86ISD::VSEXT, 0), - X86_INTRINSIC_DATA(avx512_mask_pmovsxb_q_256, INTR_TYPE_1OP_MASK, - X86ISD::VSEXT, 0), - X86_INTRINSIC_DATA(avx512_mask_pmovsxb_q_512, INTR_TYPE_1OP_MASK, - X86ISD::VSEXT, 0), - X86_INTRINSIC_DATA(avx512_mask_pmovsxb_w_128, INTR_TYPE_1OP_MASK, - X86ISD::VSEXT, 0), - X86_INTRINSIC_DATA(avx512_mask_pmovsxb_w_256, INTR_TYPE_1OP_MASK, - X86ISD::VSEXT, 0), - X86_INTRINSIC_DATA(avx512_mask_pmovsxb_w_512, INTR_TYPE_1OP_MASK, - X86ISD::VSEXT, 0), - X86_INTRINSIC_DATA(avx512_mask_pmovsxd_q_128, INTR_TYPE_1OP_MASK, - X86ISD::VSEXT, 0), - X86_INTRINSIC_DATA(avx512_mask_pmovsxd_q_256, INTR_TYPE_1OP_MASK, - X86ISD::VSEXT, 0), - X86_INTRINSIC_DATA(avx512_mask_pmovsxd_q_512, INTR_TYPE_1OP_MASK, - X86ISD::VSEXT, 0), - X86_INTRINSIC_DATA(avx512_mask_pmovsxw_d_128, INTR_TYPE_1OP_MASK, - X86ISD::VSEXT, 0), - X86_INTRINSIC_DATA(avx512_mask_pmovsxw_d_256, INTR_TYPE_1OP_MASK, - X86ISD::VSEXT, 0), - X86_INTRINSIC_DATA(avx512_mask_pmovsxw_d_512, INTR_TYPE_1OP_MASK, - X86ISD::VSEXT, 0), - X86_INTRINSIC_DATA(avx512_mask_pmovsxw_q_128, INTR_TYPE_1OP_MASK, - X86ISD::VSEXT, 0), - X86_INTRINSIC_DATA(avx512_mask_pmovsxw_q_256, INTR_TYPE_1OP_MASK, - X86ISD::VSEXT, 0), - X86_INTRINSIC_DATA(avx512_mask_pmovsxw_q_512, INTR_TYPE_1OP_MASK, - X86ISD::VSEXT, 0), X86_INTRINSIC_DATA(avx512_mask_pmovus_db_128, INTR_TYPE_1OP_MASK, X86ISD::VTRUNCUS, 0), X86_INTRINSIC_DATA(avx512_mask_pmovus_db_256, INTR_TYPE_1OP_MASK, @@ -1137,48 +1053,6 @@ static const IntrinsicData 
IntrinsicsWithoutChain[] = { X86ISD::VTRUNCUS, 0), X86_INTRINSIC_DATA(avx512_mask_pmovus_wb_512, INTR_TYPE_1OP_MASK, X86ISD::VTRUNCUS, 0), - X86_INTRINSIC_DATA(avx512_mask_pmovzxb_d_128, INTR_TYPE_1OP_MASK, - X86ISD::VZEXT, 0), - X86_INTRINSIC_DATA(avx512_mask_pmovzxb_d_256, INTR_TYPE_1OP_MASK, - X86ISD::VZEXT, 0), - X86_INTRINSIC_DATA(avx512_mask_pmovzxb_d_512, INTR_TYPE_1OP_MASK, - X86ISD::VZEXT, 0), - X86_INTRINSIC_DATA(avx512_mask_pmovzxb_q_128, INTR_TYPE_1OP_MASK, - X86ISD::VZEXT, 0), - X86_INTRINSIC_DATA(avx512_mask_pmovzxb_q_256, INTR_TYPE_1OP_MASK, - X86ISD::VZEXT, 0), - X86_INTRINSIC_DATA(avx512_mask_pmovzxb_q_512, INTR_TYPE_1OP_MASK, - X86ISD::VZEXT, 0), - X86_INTRINSIC_DATA(avx512_mask_pmovzxb_w_128, INTR_TYPE_1OP_MASK, - X86ISD::VZEXT, 0), - X86_INTRINSIC_DATA(avx512_mask_pmovzxb_w_256, INTR_TYPE_1OP_MASK, - X86ISD::VZEXT, 0), - X86_INTRINSIC_DATA(avx512_mask_pmovzxb_w_512, INTR_TYPE_1OP_MASK, - X86ISD::VZEXT, 0), - X86_INTRINSIC_DATA(avx512_mask_pmovzxd_q_128, INTR_TYPE_1OP_MASK, - X86ISD::VZEXT, 0), - X86_INTRINSIC_DATA(avx512_mask_pmovzxd_q_256, INTR_TYPE_1OP_MASK, - X86ISD::VZEXT, 0), - X86_INTRINSIC_DATA(avx512_mask_pmovzxd_q_512, INTR_TYPE_1OP_MASK, - X86ISD::VZEXT, 0), - X86_INTRINSIC_DATA(avx512_mask_pmovzxw_d_128, INTR_TYPE_1OP_MASK, - X86ISD::VZEXT, 0), - X86_INTRINSIC_DATA(avx512_mask_pmovzxw_d_256, INTR_TYPE_1OP_MASK, - X86ISD::VZEXT, 0), - X86_INTRINSIC_DATA(avx512_mask_pmovzxw_d_512, INTR_TYPE_1OP_MASK, - X86ISD::VZEXT, 0), - X86_INTRINSIC_DATA(avx512_mask_pmovzxw_q_128, INTR_TYPE_1OP_MASK, - X86ISD::VZEXT, 0), - X86_INTRINSIC_DATA(avx512_mask_pmovzxw_q_256, INTR_TYPE_1OP_MASK, - X86ISD::VZEXT, 0), - X86_INTRINSIC_DATA(avx512_mask_pmovzxw_q_512, INTR_TYPE_1OP_MASK, - X86ISD::VZEXT, 0), - X86_INTRINSIC_DATA(avx512_mask_pmul_dq_128, INTR_TYPE_2OP_MASK, - X86ISD::PMULDQ, 0), - X86_INTRINSIC_DATA(avx512_mask_pmul_dq_256, INTR_TYPE_2OP_MASK, - X86ISD::PMULDQ, 0), - X86_INTRINSIC_DATA(avx512_mask_pmul_dq_512, INTR_TYPE_2OP_MASK, - X86ISD::PMULDQ, 0), X86_INTRINSIC_DATA(avx512_mask_pmul_hr_sw_128, INTR_TYPE_2OP_MASK, X86ISD::MULHRS, 0), X86_INTRINSIC_DATA(avx512_mask_pmul_hr_sw_256, INTR_TYPE_2OP_MASK, X86ISD::MULHRS, 0), X86_INTRINSIC_DATA(avx512_mask_pmul_hr_sw_512, INTR_TYPE_2OP_MASK, X86ISD::MULHRS, 0), @@ -1188,27 +1062,12 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_mask_pmulhu_w_128, INTR_TYPE_2OP_MASK, ISD::MULHU, 0), X86_INTRINSIC_DATA(avx512_mask_pmulhu_w_256, INTR_TYPE_2OP_MASK, ISD::MULHU, 0), X86_INTRINSIC_DATA(avx512_mask_pmulhu_w_512, INTR_TYPE_2OP_MASK, ISD::MULHU, 0), - X86_INTRINSIC_DATA(avx512_mask_pmull_d_128, INTR_TYPE_2OP_MASK, ISD::MUL, 0), - X86_INTRINSIC_DATA(avx512_mask_pmull_d_256, INTR_TYPE_2OP_MASK, ISD::MUL, 0), - X86_INTRINSIC_DATA(avx512_mask_pmull_d_512, INTR_TYPE_2OP_MASK, ISD::MUL, 0), - X86_INTRINSIC_DATA(avx512_mask_pmull_q_128, INTR_TYPE_2OP_MASK, ISD::MUL, 0), - X86_INTRINSIC_DATA(avx512_mask_pmull_q_256, INTR_TYPE_2OP_MASK, ISD::MUL, 0), - X86_INTRINSIC_DATA(avx512_mask_pmull_q_512, INTR_TYPE_2OP_MASK, ISD::MUL, 0), - X86_INTRINSIC_DATA(avx512_mask_pmull_w_128, INTR_TYPE_2OP_MASK, ISD::MUL, 0), - X86_INTRINSIC_DATA(avx512_mask_pmull_w_256, INTR_TYPE_2OP_MASK, ISD::MUL, 0), - X86_INTRINSIC_DATA(avx512_mask_pmull_w_512, INTR_TYPE_2OP_MASK, ISD::MUL, 0), X86_INTRINSIC_DATA(avx512_mask_pmultishift_qb_128, INTR_TYPE_2OP_MASK, X86ISD::MULTISHIFT, 0), X86_INTRINSIC_DATA(avx512_mask_pmultishift_qb_256, INTR_TYPE_2OP_MASK, X86ISD::MULTISHIFT, 0), X86_INTRINSIC_DATA(avx512_mask_pmultishift_qb_512, 
INTR_TYPE_2OP_MASK, X86ISD::MULTISHIFT, 0), - X86_INTRINSIC_DATA(avx512_mask_pmulu_dq_128, INTR_TYPE_2OP_MASK, - X86ISD::PMULUDQ, 0), - X86_INTRINSIC_DATA(avx512_mask_pmulu_dq_256, INTR_TYPE_2OP_MASK, - X86ISD::PMULUDQ, 0), - X86_INTRINSIC_DATA(avx512_mask_pmulu_dq_512, INTR_TYPE_2OP_MASK, - X86ISD::PMULUDQ, 0), X86_INTRINSIC_DATA(avx512_mask_prol_d_128, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VROTLI, 0), X86_INTRINSIC_DATA(avx512_mask_prol_d_256, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VROTLI, 0), X86_INTRINSIC_DATA(avx512_mask_prol_d_512, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VROTLI, 0), @@ -1233,105 +1092,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_mask_prorv_q_128, INTR_TYPE_2OP_MASK, ISD::ROTR, 0), X86_INTRINSIC_DATA(avx512_mask_prorv_q_256, INTR_TYPE_2OP_MASK, ISD::ROTR, 0), X86_INTRINSIC_DATA(avx512_mask_prorv_q_512, INTR_TYPE_2OP_MASK, ISD::ROTR, 0), - X86_INTRINSIC_DATA(avx512_mask_pshuf_b_128, INTR_TYPE_2OP_MASK, - X86ISD::PSHUFB, 0), - X86_INTRINSIC_DATA(avx512_mask_pshuf_b_256, INTR_TYPE_2OP_MASK, - X86ISD::PSHUFB, 0), - X86_INTRINSIC_DATA(avx512_mask_pshuf_b_512, INTR_TYPE_2OP_MASK, - X86ISD::PSHUFB, 0), - X86_INTRINSIC_DATA(avx512_mask_psll_d, INTR_TYPE_2OP_MASK, X86ISD::VSHL, 0), - X86_INTRINSIC_DATA(avx512_mask_psll_d_128, INTR_TYPE_2OP_MASK, X86ISD::VSHL, 0), - X86_INTRINSIC_DATA(avx512_mask_psll_d_256, INTR_TYPE_2OP_MASK, X86ISD::VSHL, 0), - X86_INTRINSIC_DATA(avx512_mask_psll_di_128, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VSHLI, 0), - X86_INTRINSIC_DATA(avx512_mask_psll_di_256, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VSHLI, 0), - X86_INTRINSIC_DATA(avx512_mask_psll_di_512, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VSHLI, 0), - X86_INTRINSIC_DATA(avx512_mask_psll_q, INTR_TYPE_2OP_MASK, X86ISD::VSHL, 0), - X86_INTRINSIC_DATA(avx512_mask_psll_q_128, INTR_TYPE_2OP_MASK, X86ISD::VSHL, 0), - X86_INTRINSIC_DATA(avx512_mask_psll_q_256, INTR_TYPE_2OP_MASK, X86ISD::VSHL, 0), - X86_INTRINSIC_DATA(avx512_mask_psll_qi_128, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VSHLI, 0), - X86_INTRINSIC_DATA(avx512_mask_psll_qi_256, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VSHLI, 0), - X86_INTRINSIC_DATA(avx512_mask_psll_qi_512, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VSHLI, 0), - X86_INTRINSIC_DATA(avx512_mask_psll_w_128, INTR_TYPE_2OP_MASK, X86ISD::VSHL, 0), - X86_INTRINSIC_DATA(avx512_mask_psll_w_256, INTR_TYPE_2OP_MASK, X86ISD::VSHL, 0), - X86_INTRINSIC_DATA(avx512_mask_psll_w_512, INTR_TYPE_2OP_MASK, X86ISD::VSHL, 0), - X86_INTRINSIC_DATA(avx512_mask_psll_wi_128, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VSHLI, 0), - X86_INTRINSIC_DATA(avx512_mask_psll_wi_256, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VSHLI, 0), - X86_INTRINSIC_DATA(avx512_mask_psll_wi_512, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VSHLI, 0), - X86_INTRINSIC_DATA(avx512_mask_psllv_d, INTR_TYPE_2OP_MASK, ISD::SHL, 0), - X86_INTRINSIC_DATA(avx512_mask_psllv_q, INTR_TYPE_2OP_MASK, ISD::SHL, 0), - X86_INTRINSIC_DATA(avx512_mask_psllv16_hi, INTR_TYPE_2OP_MASK, ISD::SHL, 0), - X86_INTRINSIC_DATA(avx512_mask_psllv2_di, INTR_TYPE_2OP_MASK, ISD::SHL, 0), - X86_INTRINSIC_DATA(avx512_mask_psllv32hi, INTR_TYPE_2OP_MASK, ISD::SHL, 0), - X86_INTRINSIC_DATA(avx512_mask_psllv4_di, INTR_TYPE_2OP_MASK, ISD::SHL, 0), - X86_INTRINSIC_DATA(avx512_mask_psllv4_si, INTR_TYPE_2OP_MASK, ISD::SHL, 0), - X86_INTRINSIC_DATA(avx512_mask_psllv8_hi, INTR_TYPE_2OP_MASK, ISD::SHL, 0), - X86_INTRINSIC_DATA(avx512_mask_psllv8_si, INTR_TYPE_2OP_MASK, ISD::SHL, 0), - X86_INTRINSIC_DATA(avx512_mask_psra_d, INTR_TYPE_2OP_MASK, X86ISD::VSRA, 0), - X86_INTRINSIC_DATA(avx512_mask_psra_d_128, INTR_TYPE_2OP_MASK, 
X86ISD::VSRA, 0), - X86_INTRINSIC_DATA(avx512_mask_psra_d_256, INTR_TYPE_2OP_MASK, X86ISD::VSRA, 0), - X86_INTRINSIC_DATA(avx512_mask_psra_di_128, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VSRAI, 0), - X86_INTRINSIC_DATA(avx512_mask_psra_di_256, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VSRAI, 0), - X86_INTRINSIC_DATA(avx512_mask_psra_di_512, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VSRAI, 0), - X86_INTRINSIC_DATA(avx512_mask_psra_q, INTR_TYPE_2OP_MASK, X86ISD::VSRA, 0), - X86_INTRINSIC_DATA(avx512_mask_psra_q_128, INTR_TYPE_2OP_MASK, X86ISD::VSRA, 0), - X86_INTRINSIC_DATA(avx512_mask_psra_q_256, INTR_TYPE_2OP_MASK, X86ISD::VSRA, 0), - X86_INTRINSIC_DATA(avx512_mask_psra_qi_128, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VSRAI, 0), - X86_INTRINSIC_DATA(avx512_mask_psra_qi_256, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VSRAI, 0), - X86_INTRINSIC_DATA(avx512_mask_psra_qi_512, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VSRAI, 0), - X86_INTRINSIC_DATA(avx512_mask_psra_w_128, INTR_TYPE_2OP_MASK, X86ISD::VSRA, 0), - X86_INTRINSIC_DATA(avx512_mask_psra_w_256, INTR_TYPE_2OP_MASK, X86ISD::VSRA, 0), - X86_INTRINSIC_DATA(avx512_mask_psra_w_512, INTR_TYPE_2OP_MASK, X86ISD::VSRA, 0), - X86_INTRINSIC_DATA(avx512_mask_psra_wi_128, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VSRAI, 0), - X86_INTRINSIC_DATA(avx512_mask_psra_wi_256, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VSRAI, 0), - X86_INTRINSIC_DATA(avx512_mask_psra_wi_512, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VSRAI, 0), - X86_INTRINSIC_DATA(avx512_mask_psrav_d, INTR_TYPE_2OP_MASK, X86ISD::VSRAV, 0), - X86_INTRINSIC_DATA(avx512_mask_psrav_q, INTR_TYPE_2OP_MASK, X86ISD::VSRAV, 0), - X86_INTRINSIC_DATA(avx512_mask_psrav_q_128, INTR_TYPE_2OP_MASK, X86ISD::VSRAV, 0), - X86_INTRINSIC_DATA(avx512_mask_psrav_q_256, INTR_TYPE_2OP_MASK, X86ISD::VSRAV, 0), - X86_INTRINSIC_DATA(avx512_mask_psrav16_hi, INTR_TYPE_2OP_MASK, X86ISD::VSRAV, 0), - X86_INTRINSIC_DATA(avx512_mask_psrav32_hi, INTR_TYPE_2OP_MASK, X86ISD::VSRAV, 0), - X86_INTRINSIC_DATA(avx512_mask_psrav4_si, INTR_TYPE_2OP_MASK, X86ISD::VSRAV, 0), - X86_INTRINSIC_DATA(avx512_mask_psrav8_hi, INTR_TYPE_2OP_MASK, X86ISD::VSRAV, 0), - X86_INTRINSIC_DATA(avx512_mask_psrav8_si, INTR_TYPE_2OP_MASK, X86ISD::VSRAV, 0), - X86_INTRINSIC_DATA(avx512_mask_psrl_d, INTR_TYPE_2OP_MASK, X86ISD::VSRL, 0), - X86_INTRINSIC_DATA(avx512_mask_psrl_d_128, INTR_TYPE_2OP_MASK, X86ISD::VSRL, 0), - X86_INTRINSIC_DATA(avx512_mask_psrl_d_256, INTR_TYPE_2OP_MASK, X86ISD::VSRL, 0), - X86_INTRINSIC_DATA(avx512_mask_psrl_di_128, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VSRLI, 0), - X86_INTRINSIC_DATA(avx512_mask_psrl_di_256, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VSRLI, 0), - X86_INTRINSIC_DATA(avx512_mask_psrl_di_512, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VSRLI, 0), - X86_INTRINSIC_DATA(avx512_mask_psrl_q, INTR_TYPE_2OP_MASK, X86ISD::VSRL, 0), - X86_INTRINSIC_DATA(avx512_mask_psrl_q_128, INTR_TYPE_2OP_MASK, X86ISD::VSRL, 0), - X86_INTRINSIC_DATA(avx512_mask_psrl_q_256, INTR_TYPE_2OP_MASK, X86ISD::VSRL, 0), - X86_INTRINSIC_DATA(avx512_mask_psrl_qi_128, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VSRLI, 0), - X86_INTRINSIC_DATA(avx512_mask_psrl_qi_256, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VSRLI, 0), - X86_INTRINSIC_DATA(avx512_mask_psrl_qi_512, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VSRLI, 0), - X86_INTRINSIC_DATA(avx512_mask_psrl_w_128, INTR_TYPE_2OP_MASK, X86ISD::VSRL, 0), - X86_INTRINSIC_DATA(avx512_mask_psrl_w_256, INTR_TYPE_2OP_MASK, X86ISD::VSRL, 0), - X86_INTRINSIC_DATA(avx512_mask_psrl_w_512, INTR_TYPE_2OP_MASK, X86ISD::VSRL, 0), - X86_INTRINSIC_DATA(avx512_mask_psrl_wi_128, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VSRLI, 0), - 
X86_INTRINSIC_DATA(avx512_mask_psrl_wi_256, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VSRLI, 0), - X86_INTRINSIC_DATA(avx512_mask_psrl_wi_512, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VSRLI, 0), - X86_INTRINSIC_DATA(avx512_mask_psrlv_d, INTR_TYPE_2OP_MASK, ISD::SRL, 0), - X86_INTRINSIC_DATA(avx512_mask_psrlv_q, INTR_TYPE_2OP_MASK, ISD::SRL, 0), - X86_INTRINSIC_DATA(avx512_mask_psrlv16_hi, INTR_TYPE_2OP_MASK, ISD::SRL, 0), - X86_INTRINSIC_DATA(avx512_mask_psrlv2_di, INTR_TYPE_2OP_MASK, ISD::SRL, 0), - X86_INTRINSIC_DATA(avx512_mask_psrlv32hi, INTR_TYPE_2OP_MASK, ISD::SRL, 0), - X86_INTRINSIC_DATA(avx512_mask_psrlv4_di, INTR_TYPE_2OP_MASK, ISD::SRL, 0), - X86_INTRINSIC_DATA(avx512_mask_psrlv4_si, INTR_TYPE_2OP_MASK, ISD::SRL, 0), - X86_INTRINSIC_DATA(avx512_mask_psrlv8_hi, INTR_TYPE_2OP_MASK, ISD::SRL, 0), - X86_INTRINSIC_DATA(avx512_mask_psrlv8_si, INTR_TYPE_2OP_MASK, ISD::SRL, 0), - X86_INTRINSIC_DATA(avx512_mask_psub_b_128, INTR_TYPE_2OP_MASK, ISD::SUB, 0), - X86_INTRINSIC_DATA(avx512_mask_psub_b_256, INTR_TYPE_2OP_MASK, ISD::SUB, 0), - X86_INTRINSIC_DATA(avx512_mask_psub_b_512, INTR_TYPE_2OP_MASK, ISD::SUB, 0), - X86_INTRINSIC_DATA(avx512_mask_psub_d_128, INTR_TYPE_2OP_MASK, ISD::SUB, 0), - X86_INTRINSIC_DATA(avx512_mask_psub_d_256, INTR_TYPE_2OP_MASK, ISD::SUB, 0), - X86_INTRINSIC_DATA(avx512_mask_psub_d_512, INTR_TYPE_2OP_MASK, ISD::SUB, 0), - X86_INTRINSIC_DATA(avx512_mask_psub_q_128, INTR_TYPE_2OP_MASK, ISD::SUB, 0), - X86_INTRINSIC_DATA(avx512_mask_psub_q_256, INTR_TYPE_2OP_MASK, ISD::SUB, 0), - X86_INTRINSIC_DATA(avx512_mask_psub_q_512, INTR_TYPE_2OP_MASK, ISD::SUB, 0), - X86_INTRINSIC_DATA(avx512_mask_psub_w_128, INTR_TYPE_2OP_MASK, ISD::SUB, 0), - X86_INTRINSIC_DATA(avx512_mask_psub_w_256, INTR_TYPE_2OP_MASK, ISD::SUB, 0), - X86_INTRINSIC_DATA(avx512_mask_psub_w_512, INTR_TYPE_2OP_MASK, ISD::SUB, 0), X86_INTRINSIC_DATA(avx512_mask_psubs_b_128, INTR_TYPE_2OP_MASK, X86ISD::SUBS, 0), X86_INTRINSIC_DATA(avx512_mask_psubs_b_256, INTR_TYPE_2OP_MASK, X86ISD::SUBS, 0), X86_INTRINSIC_DATA(avx512_mask_psubs_b_512, INTR_TYPE_2OP_MASK, X86ISD::SUBS, 0), @@ -1370,8 +1130,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_mask_reduce_ps_128, INTR_TYPE_2OP_MASK_RM, X86ISD::VREDUCE, 0), X86_INTRINSIC_DATA(avx512_mask_reduce_ps_256, INTR_TYPE_2OP_MASK_RM, X86ISD::VREDUCE, 0), X86_INTRINSIC_DATA(avx512_mask_reduce_ps_512, INTR_TYPE_2OP_MASK_RM, X86ISD::VREDUCE, 0), - X86_INTRINSIC_DATA(avx512_mask_reduce_sd, INTR_TYPE_SCALAR_MASK_RM, X86ISD::VREDUCE, 0), - X86_INTRINSIC_DATA(avx512_mask_reduce_ss, INTR_TYPE_SCALAR_MASK_RM, X86ISD::VREDUCE, 0), + X86_INTRINSIC_DATA(avx512_mask_reduce_sd, INTR_TYPE_SCALAR_MASK_RM, X86ISD::VREDUCES, 0), + X86_INTRINSIC_DATA(avx512_mask_reduce_ss, INTR_TYPE_SCALAR_MASK_RM, X86ISD::VREDUCES, 0), X86_INTRINSIC_DATA(avx512_mask_rndscale_pd_128, INTR_TYPE_2OP_MASK_RM, X86ISD::VRNDSCALE, 0), X86_INTRINSIC_DATA(avx512_mask_rndscale_pd_256, INTR_TYPE_2OP_MASK_RM, X86ISD::VRNDSCALE, 0), X86_INTRINSIC_DATA(avx512_mask_rndscale_pd_512, INTR_TYPE_2OP_MASK_RM, X86ISD::VRNDSCALE, 0), @@ -1379,9 +1139,9 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_mask_rndscale_ps_256, INTR_TYPE_2OP_MASK_RM, X86ISD::VRNDSCALE, 0), X86_INTRINSIC_DATA(avx512_mask_rndscale_ps_512, INTR_TYPE_2OP_MASK_RM, X86ISD::VRNDSCALE, 0), X86_INTRINSIC_DATA(avx512_mask_rndscale_sd, INTR_TYPE_SCALAR_MASK_RM, - X86ISD::VRNDSCALE, 0), + X86ISD::VRNDSCALES, 0), X86_INTRINSIC_DATA(avx512_mask_rndscale_ss, INTR_TYPE_SCALAR_MASK_RM, - X86ISD::VRNDSCALE, 0), + 
X86ISD::VRNDSCALES, 0), X86_INTRINSIC_DATA(avx512_mask_scalef_pd_128, INTR_TYPE_2OP_MASK_RM, X86ISD::SCALEF, 0), X86_INTRINSIC_DATA(avx512_mask_scalef_pd_256, INTR_TYPE_2OP_MASK_RM, @@ -1414,42 +1174,26 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86ISD::SHUF128, 0), X86_INTRINSIC_DATA(avx512_mask_shuf_i64x2_256, INTR_TYPE_3OP_IMM8_MASK, X86ISD::SHUF128, 0), - X86_INTRINSIC_DATA(avx512_mask_shuf_pd_128, INTR_TYPE_3OP_IMM8_MASK, - X86ISD::SHUFP, 0), - X86_INTRINSIC_DATA(avx512_mask_shuf_pd_256, INTR_TYPE_3OP_IMM8_MASK, - X86ISD::SHUFP, 0), - X86_INTRINSIC_DATA(avx512_mask_shuf_pd_512, INTR_TYPE_3OP_IMM8_MASK, - X86ISD::SHUFP, 0), - X86_INTRINSIC_DATA(avx512_mask_shuf_ps_128, INTR_TYPE_3OP_IMM8_MASK, - X86ISD::SHUFP, 0), - X86_INTRINSIC_DATA(avx512_mask_shuf_ps_256, INTR_TYPE_3OP_IMM8_MASK, - X86ISD::SHUFP, 0), - X86_INTRINSIC_DATA(avx512_mask_shuf_ps_512, INTR_TYPE_3OP_IMM8_MASK, - X86ISD::SHUFP, 0), X86_INTRINSIC_DATA(avx512_mask_sqrt_pd_128, INTR_TYPE_1OP_MASK, ISD::FSQRT, 0), X86_INTRINSIC_DATA(avx512_mask_sqrt_pd_256, INTR_TYPE_1OP_MASK, ISD::FSQRT, 0), - X86_INTRINSIC_DATA(avx512_mask_sqrt_pd_512, INTR_TYPE_1OP_MASK_RM, ISD::FSQRT, + X86_INTRINSIC_DATA(avx512_mask_sqrt_pd_512, INTR_TYPE_1OP_MASK, ISD::FSQRT, X86ISD::FSQRT_RND), X86_INTRINSIC_DATA(avx512_mask_sqrt_ps_128, INTR_TYPE_1OP_MASK, ISD::FSQRT, 0), X86_INTRINSIC_DATA(avx512_mask_sqrt_ps_256, INTR_TYPE_1OP_MASK, ISD::FSQRT, 0), - X86_INTRINSIC_DATA(avx512_mask_sqrt_ps_512, INTR_TYPE_1OP_MASK_RM, ISD::FSQRT, + X86_INTRINSIC_DATA(avx512_mask_sqrt_ps_512, INTR_TYPE_1OP_MASK, ISD::FSQRT, X86ISD::FSQRT_RND), X86_INTRINSIC_DATA(avx512_mask_sqrt_sd, INTR_TYPE_SCALAR_MASK_RM, - X86ISD::FSQRT_RND, 0), + X86ISD::FSQRTS_RND, 0), X86_INTRINSIC_DATA(avx512_mask_sqrt_ss, INTR_TYPE_SCALAR_MASK_RM, - X86ISD::FSQRT_RND, 0), - X86_INTRINSIC_DATA(avx512_mask_sub_pd_128, INTR_TYPE_2OP_MASK, ISD::FSUB, 0), - X86_INTRINSIC_DATA(avx512_mask_sub_pd_256, INTR_TYPE_2OP_MASK, ISD::FSUB, 0), + X86ISD::FSQRTS_RND, 0), X86_INTRINSIC_DATA(avx512_mask_sub_pd_512, INTR_TYPE_2OP_MASK, ISD::FSUB, X86ISD::FSUB_RND), - X86_INTRINSIC_DATA(avx512_mask_sub_ps_128, INTR_TYPE_2OP_MASK, ISD::FSUB, 0), - X86_INTRINSIC_DATA(avx512_mask_sub_ps_256, INTR_TYPE_2OP_MASK, ISD::FSUB, 0), X86_INTRINSIC_DATA(avx512_mask_sub_ps_512, INTR_TYPE_2OP_MASK, ISD::FSUB, X86ISD::FSUB_RND), - X86_INTRINSIC_DATA(avx512_mask_sub_sd_round, INTR_TYPE_SCALAR_MASK_RM, ISD::FSUB, - X86ISD::FSUB_RND), - X86_INTRINSIC_DATA(avx512_mask_sub_ss_round, INTR_TYPE_SCALAR_MASK_RM, ISD::FSUB, - X86ISD::FSUB_RND), + X86_INTRINSIC_DATA(avx512_mask_sub_sd_round, INTR_TYPE_SCALAR_MASK_RM, + X86ISD::FSUB_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_sub_ss_round, INTR_TYPE_SCALAR_MASK_RM, + X86ISD::FSUB_RND, 0), X86_INTRINSIC_DATA(avx512_mask_ucmp_b_128, CMP_MASK_CC, X86ISD::CMPMU, 0), X86_INTRINSIC_DATA(avx512_mask_ucmp_b_256, CMP_MASK_CC, X86ISD::CMPMU, 0), X86_INTRINSIC_DATA(avx512_mask_ucmp_b_512, CMP_MASK_CC, X86ISD::CMPMU, 0), @@ -1462,30 +1206,18 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_mask_ucmp_w_128, CMP_MASK_CC, X86ISD::CMPMU, 0), X86_INTRINSIC_DATA(avx512_mask_ucmp_w_256, CMP_MASK_CC, X86ISD::CMPMU, 0), X86_INTRINSIC_DATA(avx512_mask_ucmp_w_512, CMP_MASK_CC, X86ISD::CMPMU, 0), - X86_INTRINSIC_DATA(avx512_mask_valign_d_128, INTR_TYPE_3OP_IMM8_MASK, - X86ISD::VALIGN, 0), - X86_INTRINSIC_DATA(avx512_mask_valign_d_256, INTR_TYPE_3OP_IMM8_MASK, - X86ISD::VALIGN, 0), - X86_INTRINSIC_DATA(avx512_mask_valign_d_512, INTR_TYPE_3OP_IMM8_MASK, - 
X86ISD::VALIGN, 0), - X86_INTRINSIC_DATA(avx512_mask_valign_q_128, INTR_TYPE_3OP_IMM8_MASK, - X86ISD::VALIGN, 0), - X86_INTRINSIC_DATA(avx512_mask_valign_q_256, INTR_TYPE_3OP_IMM8_MASK, - X86ISD::VALIGN, 0), - X86_INTRINSIC_DATA(avx512_mask_valign_q_512, INTR_TYPE_3OP_IMM8_MASK, - X86ISD::VALIGN, 0), X86_INTRINSIC_DATA(avx512_mask_vcvtph2ps_128, INTR_TYPE_1OP_MASK_RM, - ISD::FP16_TO_FP, 0), + X86ISD::CVTPH2PS, 0), X86_INTRINSIC_DATA(avx512_mask_vcvtph2ps_256, INTR_TYPE_1OP_MASK_RM, - ISD::FP16_TO_FP, 0), + X86ISD::CVTPH2PS, 0), X86_INTRINSIC_DATA(avx512_mask_vcvtph2ps_512, INTR_TYPE_1OP_MASK_RM, - ISD::FP16_TO_FP, 0), - X86_INTRINSIC_DATA(avx512_mask_vcvtps2ph_128, INTR_TYPE_2OP_MASK_RM, - ISD::FP_TO_FP16, 0), - X86_INTRINSIC_DATA(avx512_mask_vcvtps2ph_256, INTR_TYPE_2OP_MASK_RM, - ISD::FP_TO_FP16, 0), - X86_INTRINSIC_DATA(avx512_mask_vcvtps2ph_512, INTR_TYPE_2OP_MASK_RM, - ISD::FP_TO_FP16, 0), + X86ISD::CVTPH2PS, 0), + X86_INTRINSIC_DATA(avx512_mask_vcvtps2ph_128, INTR_TYPE_2OP_MASK, + X86ISD::CVTPS2PH, 0), + X86_INTRINSIC_DATA(avx512_mask_vcvtps2ph_256, INTR_TYPE_2OP_MASK, + X86ISD::CVTPS2PH, 0), + X86_INTRINSIC_DATA(avx512_mask_vcvtps2ph_512, INTR_TYPE_2OP_MASK, + X86ISD::CVTPS2PH, 0), X86_INTRINSIC_DATA(avx512_mask_vfmadd_pd_128, FMA_OP_MASK, X86ISD::FMADD, 0), X86_INTRINSIC_DATA(avx512_mask_vfmadd_pd_256, FMA_OP_MASK, X86ISD::FMADD, 0), X86_INTRINSIC_DATA(avx512_mask_vfmadd_pd_512, FMA_OP_MASK, X86ISD::FMADD, @@ -1495,8 +1227,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_mask_vfmadd_ps_512, FMA_OP_MASK, X86ISD::FMADD, X86ISD::FMADD_RND), - X86_INTRINSIC_DATA(avx512_mask_vfmadd_sd, FMA_OP_SCALAR_MASK, X86ISD::FMADD_RND, 0), - X86_INTRINSIC_DATA(avx512_mask_vfmadd_ss, FMA_OP_SCALAR_MASK, X86ISD::FMADD_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_vfmadd_sd, FMA_OP_SCALAR_MASK, X86ISD::FMADDS1_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_vfmadd_ss, FMA_OP_SCALAR_MASK, X86ISD::FMADDS1_RND, 0), X86_INTRINSIC_DATA(avx512_mask_vfmaddsub_pd_128, FMA_OP_MASK, X86ISD::FMADDSUB, 0), X86_INTRINSIC_DATA(avx512_mask_vfmaddsub_pd_256, FMA_OP_MASK, X86ISD::FMADDSUB, 0), X86_INTRINSIC_DATA(avx512_mask_vfmaddsub_pd_512, FMA_OP_MASK, X86ISD::FMADDSUB, @@ -1555,23 +1287,11 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_mask_vpermi2var_q_512, VPERM_3OP_MASK, X86ISD::VPERMIV3, 0), X86_INTRINSIC_DATA(avx512_mask_vpermi2var_qi_128, VPERM_3OP_MASK, - X86ISD::VPERMV3, 0), + X86ISD::VPERMIV3, 0), X86_INTRINSIC_DATA(avx512_mask_vpermi2var_qi_256, VPERM_3OP_MASK, - X86ISD::VPERMV3, 0), + X86ISD::VPERMIV3, 0), X86_INTRINSIC_DATA(avx512_mask_vpermi2var_qi_512, VPERM_3OP_MASK, - X86ISD::VPERMV3, 0), - X86_INTRINSIC_DATA(avx512_mask_vpermilvar_pd_128, INTR_TYPE_2OP_MASK, - X86ISD::VPERMILPV, 0), - X86_INTRINSIC_DATA(avx512_mask_vpermilvar_pd_256, INTR_TYPE_2OP_MASK, - X86ISD::VPERMILPV, 0), - X86_INTRINSIC_DATA(avx512_mask_vpermilvar_pd_512, INTR_TYPE_2OP_MASK, - X86ISD::VPERMILPV, 0), - X86_INTRINSIC_DATA(avx512_mask_vpermilvar_ps_128, INTR_TYPE_2OP_MASK, - X86ISD::VPERMILPV, 0), - X86_INTRINSIC_DATA(avx512_mask_vpermilvar_ps_256, INTR_TYPE_2OP_MASK, - X86ISD::VPERMILPV, 0), - X86_INTRINSIC_DATA(avx512_mask_vpermilvar_ps_512, INTR_TYPE_2OP_MASK, - X86ISD::VPERMILPV, 0), + X86ISD::VPERMIV3, 0), X86_INTRINSIC_DATA(avx512_mask_vpermt2var_d_128, VPERM_3OP_MASK, X86ISD::VPERMV3, 0), X86_INTRINSIC_DATA(avx512_mask_vpermt2var_d_256, VPERM_3OP_MASK, @@ -1620,12 +1340,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86ISD::VPMADD52L, 0), 
X86_INTRINSIC_DATA(avx512_mask_vpmadd52l_uq_512 , FMA_OP_MASK, X86ISD::VPMADD52L, 0), - X86_INTRINSIC_DATA(avx512_mask_xor_pd_128, INTR_TYPE_2OP_MASK, X86ISD::FXOR, 0), - X86_INTRINSIC_DATA(avx512_mask_xor_pd_256, INTR_TYPE_2OP_MASK, X86ISD::FXOR, 0), - X86_INTRINSIC_DATA(avx512_mask_xor_pd_512, INTR_TYPE_2OP_MASK, X86ISD::FXOR, 0), - X86_INTRINSIC_DATA(avx512_mask_xor_ps_128, INTR_TYPE_2OP_MASK, X86ISD::FXOR, 0), - X86_INTRINSIC_DATA(avx512_mask_xor_ps_256, INTR_TYPE_2OP_MASK, X86ISD::FXOR, 0), - X86_INTRINSIC_DATA(avx512_mask_xor_ps_512, INTR_TYPE_2OP_MASK, X86ISD::FXOR, 0), X86_INTRINSIC_DATA(avx512_mask3_vfmadd_pd_128, FMA_OP_MASK3, X86ISD::FMADD, 0), X86_INTRINSIC_DATA(avx512_mask3_vfmadd_pd_256, FMA_OP_MASK3, X86ISD::FMADD, 0), X86_INTRINSIC_DATA(avx512_mask3_vfmadd_pd_512, FMA_OP_MASK3, X86ISD::FMADD, @@ -1635,8 +1349,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_mask3_vfmadd_ps_512, FMA_OP_MASK3, X86ISD::FMADD, X86ISD::FMADD_RND), - X86_INTRINSIC_DATA(avx512_mask3_vfmadd_sd, FMA_OP_SCALAR_MASK3, X86ISD::FMADD_RND, 0), - X86_INTRINSIC_DATA(avx512_mask3_vfmadd_ss, FMA_OP_SCALAR_MASK3, X86ISD::FMADD_RND, 0), + X86_INTRINSIC_DATA(avx512_mask3_vfmadd_sd, FMA_OP_SCALAR_MASK3, X86ISD::FMADDS3_RND, 0), + X86_INTRINSIC_DATA(avx512_mask3_vfmadd_ss, FMA_OP_SCALAR_MASK3, X86ISD::FMADDS3_RND, 0), X86_INTRINSIC_DATA(avx512_mask3_vfmaddsub_pd_128, FMA_OP_MASK3, X86ISD::FMADDSUB, 0), X86_INTRINSIC_DATA(avx512_mask3_vfmaddsub_pd_256, FMA_OP_MASK3, X86ISD::FMADDSUB, 0), X86_INTRINSIC_DATA(avx512_mask3_vfmaddsub_pd_512, FMA_OP_MASK3, X86ISD::FMADDSUB, @@ -1654,6 +1368,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_mask3_vfmsub_ps_256, FMA_OP_MASK3, X86ISD::FMSUB, 0), X86_INTRINSIC_DATA(avx512_mask3_vfmsub_ps_512, FMA_OP_MASK3, X86ISD::FMSUB, X86ISD::FMSUB_RND), + X86_INTRINSIC_DATA(avx512_mask3_vfmsub_sd, FMA_OP_SCALAR_MASK3, X86ISD::FMSUBS3_RND, 0), + X86_INTRINSIC_DATA(avx512_mask3_vfmsub_ss, FMA_OP_SCALAR_MASK3, X86ISD::FMSUBS3_RND, 0), X86_INTRINSIC_DATA(avx512_mask3_vfmsubadd_pd_128, FMA_OP_MASK3, X86ISD::FMSUBADD, 0), X86_INTRINSIC_DATA(avx512_mask3_vfmsubadd_pd_256, FMA_OP_MASK3, X86ISD::FMSUBADD, 0), @@ -1672,6 +1388,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_ps_256, FMA_OP_MASK3, X86ISD::FNMSUB, 0), X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_ps_512, FMA_OP_MASK3, X86ISD::FNMSUB, X86ISD::FNMSUB_RND), + X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_sd, FMA_OP_SCALAR_MASK3, X86ISD::FNMSUBS3_RND, 0), + X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_ss, FMA_OP_SCALAR_MASK3, X86ISD::FNMSUBS3_RND, 0), X86_INTRINSIC_DATA(avx512_maskz_fixupimm_pd_128, FIXUPIMM_MASKZ, X86ISD::VFIXUPIMM, 0), X86_INTRINSIC_DATA(avx512_maskz_fixupimm_pd_256, FIXUPIMM_MASKZ, @@ -1709,8 +1427,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_maskz_vfmadd_ps_512, FMA_OP_MASKZ, X86ISD::FMADD, X86ISD::FMADD_RND), - X86_INTRINSIC_DATA(avx512_maskz_vfmadd_sd, FMA_OP_SCALAR_MASKZ, X86ISD::FMADD_RND, 0), - X86_INTRINSIC_DATA(avx512_maskz_vfmadd_ss, FMA_OP_SCALAR_MASKZ, X86ISD::FMADD_RND, 0), + X86_INTRINSIC_DATA(avx512_maskz_vfmadd_sd, FMA_OP_SCALAR_MASKZ, X86ISD::FMADDS1_RND, 0), + X86_INTRINSIC_DATA(avx512_maskz_vfmadd_ss, FMA_OP_SCALAR_MASKZ, X86ISD::FMADDS1_RND, 0), X86_INTRINSIC_DATA(avx512_maskz_vfmaddsub_pd_128, FMA_OP_MASKZ, X86ISD::FMADDSUB, 0), X86_INTRINSIC_DATA(avx512_maskz_vfmaddsub_pd_256, FMA_OP_MASKZ, X86ISD::FMADDSUB, 0), 
X86_INTRINSIC_DATA(avx512_maskz_vfmaddsub_pd_512, FMA_OP_MASKZ, X86ISD::FMADDSUB, @@ -1768,7 +1486,49 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86ISD::VPMADD52L, 0), X86_INTRINSIC_DATA(avx512_maskz_vpmadd52l_uq_512, FMA_OP_MASKZ, X86ISD::VPMADD52L, 0), + X86_INTRINSIC_DATA(avx512_pmul_dq_512, INTR_TYPE_2OP, X86ISD::PMULDQ, 0), + X86_INTRINSIC_DATA(avx512_pmulu_dq_512, INTR_TYPE_2OP, X86ISD::PMULUDQ, 0), X86_INTRINSIC_DATA(avx512_psad_bw_512, INTR_TYPE_2OP, X86ISD::PSADBW, 0), + X86_INTRINSIC_DATA(avx512_pshuf_b_512, INTR_TYPE_2OP, X86ISD::PSHUFB, 0), + X86_INTRINSIC_DATA(avx512_psll_d_512, INTR_TYPE_2OP, X86ISD::VSHL, 0), + X86_INTRINSIC_DATA(avx512_psll_q_512, INTR_TYPE_2OP, X86ISD::VSHL, 0), + X86_INTRINSIC_DATA(avx512_psll_w_512, INTR_TYPE_2OP, X86ISD::VSHL, 0), + X86_INTRINSIC_DATA(avx512_pslli_d_512, VSHIFT, X86ISD::VSHLI, 0), + X86_INTRINSIC_DATA(avx512_pslli_q_512, VSHIFT, X86ISD::VSHLI, 0), + X86_INTRINSIC_DATA(avx512_pslli_w_512, VSHIFT, X86ISD::VSHLI, 0), + X86_INTRINSIC_DATA(avx512_psllv_d_512, INTR_TYPE_2OP, ISD::SHL, 0), + X86_INTRINSIC_DATA(avx512_psllv_q_512, INTR_TYPE_2OP, ISD::SHL, 0), + X86_INTRINSIC_DATA(avx512_psllv_w_128, INTR_TYPE_2OP, ISD::SHL, 0), + X86_INTRINSIC_DATA(avx512_psllv_w_256, INTR_TYPE_2OP, ISD::SHL, 0), + X86_INTRINSIC_DATA(avx512_psllv_w_512, INTR_TYPE_2OP, ISD::SHL, 0), + X86_INTRINSIC_DATA(avx512_psra_d_512, INTR_TYPE_2OP, X86ISD::VSRA, 0), + X86_INTRINSIC_DATA(avx512_psra_q_128, INTR_TYPE_2OP, X86ISD::VSRA, 0), + X86_INTRINSIC_DATA(avx512_psra_q_256, INTR_TYPE_2OP, X86ISD::VSRA, 0), + X86_INTRINSIC_DATA(avx512_psra_q_512, INTR_TYPE_2OP, X86ISD::VSRA, 0), + X86_INTRINSIC_DATA(avx512_psra_w_512, INTR_TYPE_2OP, X86ISD::VSRA, 0), + X86_INTRINSIC_DATA(avx512_psrai_d_512, VSHIFT, X86ISD::VSRAI, 0), + X86_INTRINSIC_DATA(avx512_psrai_q_128, VSHIFT, X86ISD::VSRAI, 0), + X86_INTRINSIC_DATA(avx512_psrai_q_256, VSHIFT, X86ISD::VSRAI, 0), + X86_INTRINSIC_DATA(avx512_psrai_q_512, VSHIFT, X86ISD::VSRAI, 0), + X86_INTRINSIC_DATA(avx512_psrai_w_512, VSHIFT, X86ISD::VSRAI, 0), + X86_INTRINSIC_DATA(avx512_psrav_d_512, INTR_TYPE_2OP, X86ISD::VSRAV, 0), + X86_INTRINSIC_DATA(avx512_psrav_q_128, INTR_TYPE_2OP, X86ISD::VSRAV, 0), + X86_INTRINSIC_DATA(avx512_psrav_q_256, INTR_TYPE_2OP, X86ISD::VSRAV, 0), + X86_INTRINSIC_DATA(avx512_psrav_q_512, INTR_TYPE_2OP, X86ISD::VSRAV, 0), + X86_INTRINSIC_DATA(avx512_psrav_w_128, INTR_TYPE_2OP, X86ISD::VSRAV, 0), + X86_INTRINSIC_DATA(avx512_psrav_w_256, INTR_TYPE_2OP, X86ISD::VSRAV, 0), + X86_INTRINSIC_DATA(avx512_psrav_w_512, INTR_TYPE_2OP, X86ISD::VSRAV, 0), + X86_INTRINSIC_DATA(avx512_psrl_d_512, INTR_TYPE_2OP, X86ISD::VSRL, 0), + X86_INTRINSIC_DATA(avx512_psrl_q_512, INTR_TYPE_2OP, X86ISD::VSRL, 0), + X86_INTRINSIC_DATA(avx512_psrl_w_512, INTR_TYPE_2OP, X86ISD::VSRL, 0), + X86_INTRINSIC_DATA(avx512_psrli_d_512, VSHIFT, X86ISD::VSRLI, 0), + X86_INTRINSIC_DATA(avx512_psrli_q_512, VSHIFT, X86ISD::VSRLI, 0), + X86_INTRINSIC_DATA(avx512_psrli_w_512, VSHIFT, X86ISD::VSRLI, 0), + X86_INTRINSIC_DATA(avx512_psrlv_d_512, INTR_TYPE_2OP, ISD::SRL, 0), + X86_INTRINSIC_DATA(avx512_psrlv_q_512, INTR_TYPE_2OP, ISD::SRL, 0), + X86_INTRINSIC_DATA(avx512_psrlv_w_128, INTR_TYPE_2OP, ISD::SRL, 0), + X86_INTRINSIC_DATA(avx512_psrlv_w_256, INTR_TYPE_2OP, ISD::SRL, 0), + X86_INTRINSIC_DATA(avx512_psrlv_w_512, INTR_TYPE_2OP, ISD::SRL, 0), X86_INTRINSIC_DATA(avx512_ptestm_b_128, CMP_MASK, X86ISD::TESTM, 0), X86_INTRINSIC_DATA(avx512_ptestm_b_256, CMP_MASK, X86ISD::TESTM, 0), X86_INTRINSIC_DATA(avx512_ptestm_b_512, CMP_MASK, X86ISD::TESTM, 
0), @@ -1803,8 +1563,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_rcp14_ss, INTR_TYPE_SCALAR_MASK, X86ISD::FRCPS, 0), X86_INTRINSIC_DATA(avx512_rcp28_pd, INTR_TYPE_1OP_MASK_RM, X86ISD::RCP28, 0), X86_INTRINSIC_DATA(avx512_rcp28_ps, INTR_TYPE_1OP_MASK_RM, X86ISD::RCP28, 0), - X86_INTRINSIC_DATA(avx512_rcp28_sd, INTR_TYPE_SCALAR_MASK_RM, X86ISD::RCP28, 0), - X86_INTRINSIC_DATA(avx512_rcp28_ss, INTR_TYPE_SCALAR_MASK_RM, X86ISD::RCP28, 0), + X86_INTRINSIC_DATA(avx512_rcp28_sd, INTR_TYPE_SCALAR_MASK_RM, X86ISD::RCP28S, 0), + X86_INTRINSIC_DATA(avx512_rcp28_ss, INTR_TYPE_SCALAR_MASK_RM, X86ISD::RCP28S, 0), X86_INTRINSIC_DATA(avx512_rsqrt14_pd_128, INTR_TYPE_1OP_MASK, X86ISD::FRSQRT, 0), X86_INTRINSIC_DATA(avx512_rsqrt14_pd_256, INTR_TYPE_1OP_MASK, X86ISD::FRSQRT, 0), X86_INTRINSIC_DATA(avx512_rsqrt14_pd_512, INTR_TYPE_1OP_MASK, X86ISD::FRSQRT, 0), @@ -1815,26 +1575,20 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_rsqrt14_ss, INTR_TYPE_SCALAR_MASK, X86ISD::FRSQRTS, 0), X86_INTRINSIC_DATA(avx512_rsqrt28_pd, INTR_TYPE_1OP_MASK_RM,X86ISD::RSQRT28, 0), X86_INTRINSIC_DATA(avx512_rsqrt28_ps, INTR_TYPE_1OP_MASK_RM,X86ISD::RSQRT28, 0), - X86_INTRINSIC_DATA(avx512_rsqrt28_sd, INTR_TYPE_SCALAR_MASK_RM,X86ISD::RSQRT28, 0), - X86_INTRINSIC_DATA(avx512_rsqrt28_ss, INTR_TYPE_SCALAR_MASK_RM,X86ISD::RSQRT28, 0), + X86_INTRINSIC_DATA(avx512_rsqrt28_sd, INTR_TYPE_SCALAR_MASK_RM,X86ISD::RSQRT28S, 0), + X86_INTRINSIC_DATA(avx512_rsqrt28_ss, INTR_TYPE_SCALAR_MASK_RM,X86ISD::RSQRT28S, 0), X86_INTRINSIC_DATA(avx512_vcomi_sd, COMI_RM, X86ISD::COMI, X86ISD::UCOMI), X86_INTRINSIC_DATA(avx512_vcomi_ss, COMI_RM, X86ISD::COMI, X86ISD::UCOMI), - X86_INTRINSIC_DATA(avx512_vcvtsd2si32, INTR_TYPE_2OP, - X86ISD::SCALAR_FP_TO_SINT_RND, 0), - X86_INTRINSIC_DATA(avx512_vcvtsd2si64, INTR_TYPE_2OP, - X86ISD::SCALAR_FP_TO_SINT_RND, 0), - X86_INTRINSIC_DATA(avx512_vcvtsd2usi32, INTR_TYPE_2OP, - X86ISD::SCALAR_FP_TO_UINT_RND, 0), - X86_INTRINSIC_DATA(avx512_vcvtsd2usi64, INTR_TYPE_2OP, - X86ISD::SCALAR_FP_TO_UINT_RND, 0), - X86_INTRINSIC_DATA(avx512_vcvtss2si32, INTR_TYPE_2OP, - X86ISD::SCALAR_FP_TO_SINT_RND, 0), - X86_INTRINSIC_DATA(avx512_vcvtss2si64, INTR_TYPE_2OP, - X86ISD::SCALAR_FP_TO_SINT_RND, 0), - X86_INTRINSIC_DATA(avx512_vcvtss2usi32, INTR_TYPE_2OP, - X86ISD::SCALAR_FP_TO_UINT_RND, 0), - X86_INTRINSIC_DATA(avx512_vcvtss2usi64, INTR_TYPE_2OP, - X86ISD::SCALAR_FP_TO_UINT_RND, 0), + X86_INTRINSIC_DATA(avx512_vcvtsd2si32, INTR_TYPE_2OP, X86ISD::CVTS2SI_RND, 0), + X86_INTRINSIC_DATA(avx512_vcvtsd2si64, INTR_TYPE_2OP, X86ISD::CVTS2SI_RND, 0), + X86_INTRINSIC_DATA(avx512_vcvtsd2usi32, INTR_TYPE_2OP, X86ISD::CVTS2UI_RND, 0), + X86_INTRINSIC_DATA(avx512_vcvtsd2usi64, INTR_TYPE_2OP, X86ISD::CVTS2UI_RND, 0), + X86_INTRINSIC_DATA(avx512_vcvtss2si32, INTR_TYPE_2OP, X86ISD::CVTS2SI_RND, 0), + X86_INTRINSIC_DATA(avx512_vcvtss2si64, INTR_TYPE_2OP, X86ISD::CVTS2SI_RND, 0), + X86_INTRINSIC_DATA(avx512_vcvtss2usi32, INTR_TYPE_2OP, X86ISD::CVTS2UI_RND, 0), + X86_INTRINSIC_DATA(avx512_vcvtss2usi64, INTR_TYPE_2OP, X86ISD::CVTS2UI_RND, 0), + X86_INTRINSIC_DATA(avx512_vpermilvar_pd_512, INTR_TYPE_2OP, X86ISD::VPERMILPV, 0), + X86_INTRINSIC_DATA(avx512_vpermilvar_ps_512, INTR_TYPE_2OP, X86ISD::VPERMILPV, 0), X86_INTRINSIC_DATA(fma_vfmadd_pd, INTR_TYPE_3OP, X86ISD::FMADD, 0), X86_INTRINSIC_DATA(fma_vfmadd_pd_256, INTR_TYPE_3OP, X86ISD::FMADD, 0), X86_INTRINSIC_DATA(fma_vfmadd_ps, INTR_TYPE_3OP, X86ISD::FMADD, 0), @@ -1883,6 +1637,11 @@ static const IntrinsicData 
IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(sse2_comile_sd, COMI, X86ISD::COMI, ISD::SETLE), X86_INTRINSIC_DATA(sse2_comilt_sd, COMI, X86ISD::COMI, ISD::SETLT), X86_INTRINSIC_DATA(sse2_comineq_sd, COMI, X86ISD::COMI, ISD::SETNE), + X86_INTRINSIC_DATA(sse2_cvtdq2ps, INTR_TYPE_1OP, ISD::SINT_TO_FP, 0), + X86_INTRINSIC_DATA(sse2_cvtpd2dq, INTR_TYPE_1OP, X86ISD::CVTP2SI, 0), + X86_INTRINSIC_DATA(sse2_cvtpd2ps, INTR_TYPE_1OP, X86ISD::VFPROUND, 0), + X86_INTRINSIC_DATA(sse2_cvttpd2dq, INTR_TYPE_1OP, X86ISD::CVTTP2SI, 0), + X86_INTRINSIC_DATA(sse2_cvttps2dq, INTR_TYPE_1OP, ISD::FP_TO_SINT, 0), X86_INTRINSIC_DATA(sse2_max_pd, INTR_TYPE_2OP, X86ISD::FMAX, 0), X86_INTRINSIC_DATA(sse2_min_pd, INTR_TYPE_2OP, X86ISD::FMIN, 0), X86_INTRINSIC_DATA(sse2_movmsk_pd, INTR_TYPE_1OP, X86ISD::MOVMSK, 0), @@ -1895,6 +1654,7 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(sse2_paddus_w, INTR_TYPE_2OP, X86ISD::ADDUS, 0), X86_INTRINSIC_DATA(sse2_pavg_b, INTR_TYPE_2OP, X86ISD::AVG, 0), X86_INTRINSIC_DATA(sse2_pavg_w, INTR_TYPE_2OP, X86ISD::AVG, 0), + X86_INTRINSIC_DATA(sse2_pmadd_wd, INTR_TYPE_2OP, X86ISD::VPMADDWD, 0), X86_INTRINSIC_DATA(sse2_pmovmskb_128, INTR_TYPE_1OP, X86ISD::MOVMSK, 0), X86_INTRINSIC_DATA(sse2_pmulh_w, INTR_TYPE_2OP, ISD::MULHS, 0), X86_INTRINSIC_DATA(sse2_pmulhu_w, INTR_TYPE_2OP, ISD::MULHU, 0), @@ -1943,6 +1703,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(ssse3_phadd_w_128, INTR_TYPE_2OP, X86ISD::HADD, 0), X86_INTRINSIC_DATA(ssse3_phsub_d_128, INTR_TYPE_2OP, X86ISD::HSUB, 0), X86_INTRINSIC_DATA(ssse3_phsub_w_128, INTR_TYPE_2OP, X86ISD::HSUB, 0), + X86_INTRINSIC_DATA(ssse3_pmadd_ub_sw_128, INTR_TYPE_2OP, X86ISD::VPMADDUBSW, 0), + X86_INTRINSIC_DATA(ssse3_pmul_hr_sw_128, INTR_TYPE_2OP, X86ISD::MULHRS, 0), X86_INTRINSIC_DATA(ssse3_pshuf_b_128, INTR_TYPE_2OP, X86ISD::PSHUFB, 0), X86_INTRINSIC_DATA(xop_vpcomb, INTR_TYPE_3OP, X86ISD::VPCOM, 0), X86_INTRINSIC_DATA(xop_vpcomd, INTR_TYPE_3OP, X86ISD::VPCOM, 0), diff --git a/contrib/llvm/lib/Target/X86/X86MCInstLower.cpp b/contrib/llvm/lib/Target/X86/X86MCInstLower.cpp index 906e342..feeb2fd 100644 --- a/contrib/llvm/lib/Target/X86/X86MCInstLower.cpp +++ b/contrib/llvm/lib/Target/X86/X86MCInstLower.cpp @@ -16,6 +16,7 @@ #include "X86RegisterInfo.h" #include "X86ShuffleDecodeConstantPool.h" #include "InstPrinter/X86ATTInstPrinter.h" +#include "InstPrinter/X86InstComments.h" #include "MCTargetDesc/X86BaseInfo.h" #include "Utils/X86ShuffleDecode.h" #include "llvm/ADT/Optional.h" @@ -41,6 +42,7 @@ #include "llvm/MC/MCSymbol.h" #include "llvm/MC/MCSymbolELF.h" #include "llvm/MC/MCSectionELF.h" +#include "llvm/MC/MCSectionMachO.h" #include "llvm/Support/TargetRegistry.h" #include "llvm/Support/ELF.h" #include "llvm/Target/TargetLoweringObjectFile.h" @@ -68,9 +70,6 @@ public: private: MachineModuleInfoMachO &getMachOMMI() const; - Mangler *getMang() const { - return AsmPrinter.Mang; - } }; } // end anonymous namespace @@ -499,18 +498,13 @@ ReSimplify: break; } - // TAILJMPd, TAILJMPd64 - Lower to the correct jump instructions. - case X86::TAILJMPr: + // TAILJMPd, TAILJMPd64 - Lower to the correct jump instruction. 
+ { unsigned Opcode; + case X86::TAILJMPr: Opcode = X86::JMP32r; goto SetTailJmpOpcode; case X86::TAILJMPd: - case X86::TAILJMPd64: { - unsigned Opcode; - switch (OutMI.getOpcode()) { - default: llvm_unreachable("Invalid opcode"); - case X86::TAILJMPr: Opcode = X86::JMP32r; break; - case X86::TAILJMPd: - case X86::TAILJMPd64: Opcode = X86::JMP_1; break; - } + case X86::TAILJMPd64: Opcode = X86::JMP_1; goto SetTailJmpOpcode; + SetTailJmpOpcode: MCOperand Saved = OutMI.getOperand(0); OutMI = MCInst(); OutMI.setOpcode(Opcode); @@ -979,8 +973,7 @@ void X86AsmPrinter::LowerPATCHPOINT(const MachineInstr &MI, PatchPointOpers opers(&MI); unsigned ScratchIdx = opers.getNextScratchIdx(); unsigned EncodedBytes = 0; - const MachineOperand &CalleeMO = - opers.getMetaOper(PatchPointOpers::TargetPos); + const MachineOperand &CalleeMO = opers.getCallTarget(); // Check for null target. If target is non-null (i.e. is non-zero or is // symbolic) then emit a call. @@ -1016,7 +1009,7 @@ void X86AsmPrinter::LowerPATCHPOINT(const MachineInstr &MI, } // Emit padding. - unsigned NumBytes = opers.getMetaOper(PatchPointOpers::NBytesPos).getImm(); + unsigned NumBytes = opers.getNumPatchBytes(); assert(NumBytes >= EncodedBytes && "Patchpoint can't request size less than the length of a call."); @@ -1024,22 +1017,12 @@ void X86AsmPrinter::LowerPATCHPOINT(const MachineInstr &MI, getSubtargetInfo()); } -void X86AsmPrinter::recordSled(MCSymbol *Sled, const MachineInstr &MI, - SledKind Kind) { - auto Fn = MI.getParent()->getParent()->getFunction(); - auto Attr = Fn->getFnAttribute("function-instrument"); - bool AlwaysInstrument = - Attr.isStringAttribute() && Attr.getValueAsString() == "xray-always"; - Sleds.emplace_back( - XRayFunctionEntry{Sled, CurrentFnSym, Kind, AlwaysInstrument, Fn}); -} - void X86AsmPrinter::LowerPATCHABLE_FUNCTION_ENTER(const MachineInstr &MI, X86MCInstLower &MCIL) { // We want to emit the following pattern: // + // .p2align 1, ... // .Lxray_sled_N: - // .palign 2, ... // jmp .tmpN // # 9 bytes worth of noops // .tmpN @@ -1051,8 +1034,8 @@ void X86AsmPrinter::LowerPATCHABLE_FUNCTION_ENTER(const MachineInstr &MI, // call <relative offset, 32-bits> // 5 bytes // auto CurSled = OutContext.createTempSymbol("xray_sled_", true); + OutStreamer->EmitCodeAlignment(2); OutStreamer->EmitLabel(CurSled); - OutStreamer->EmitCodeAlignment(4); auto Target = OutContext.createTempSymbol(); // Use a two-byte `jmp`. This version of JMP takes an 8-bit relative offset as @@ -1074,12 +1057,14 @@ void X86AsmPrinter::LowerPATCHABLE_RET(const MachineInstr &MI, // // We should emit the RET followed by sleds. // + // .p2align 1, ... // .Lxray_sled_N: // ret # or equivalent instruction // # 10 bytes worth of noops // // This just makes sure that the alignment for the next instruction is 2. 
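For reference, a self-contained sketch of the sled layout this lowering emits, using plain std types and a hypothetical buildXRaySled helper rather than the MCStreamer calls: a two-byte short JMP over nine one-byte NOPs, leaving an 11-byte region for the runtime patcher to overwrite later.

#include <cassert>
#include <cstdint>
#include <vector>

// buildXRaySled is a hypothetical stand-in for the streamer calls in the
// patch: EmitBytes("\xeb\x09") followed by EmitNops(9).
static std::vector<uint8_t> buildXRaySled() {
  std::vector<uint8_t> Sled;
  Sled.push_back(0xEB);             // JMP rel8 opcode
  Sled.push_back(0x09);             // displacement: hop over the padding below
  Sled.insert(Sled.end(), 9, 0x90); // 9 bytes worth of NOPs
  return Sled;
}

int main() {
  std::vector<uint8_t> Sled = buildXRaySled();
  // rel8 is measured from the end of the two-byte JMP, so control resumes at
  // the first byte after the sled: 2 + 9 == 11 bytes in total.
  assert(Sled.size() == 2 + static_cast<std::size_t>(Sled[1]));
  return 0;
}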
auto CurSled = OutContext.createTempSymbol("xray_sled_", true); + OutStreamer->EmitCodeAlignment(2); OutStreamer->EmitLabel(CurSled); unsigned OpCode = MI.getOperand(0).getImm(); MCInst Ret; @@ -1092,29 +1077,37 @@ void X86AsmPrinter::LowerPATCHABLE_RET(const MachineInstr &MI, recordSled(CurSled, MI, SledKind::FUNCTION_EXIT); } -void X86AsmPrinter::EmitXRayTable() { - if (Sleds.empty()) - return; - if (Subtarget->isTargetELF()) { - auto *Section = OutContext.getELFSection( - "xray_instr_map", ELF::SHT_PROGBITS, - ELF::SHF_ALLOC | ELF::SHF_GROUP | ELF::SHF_MERGE, 0, - CurrentFnSym->getName()); - auto PrevSection = OutStreamer->getCurrentSectionOnly(); - OutStreamer->SwitchSection(Section); - for (const auto &Sled : Sleds) { - OutStreamer->EmitSymbolValue(Sled.Sled, 8); - OutStreamer->EmitSymbolValue(CurrentFnSym, 8); - auto Kind = static_cast<uint8_t>(Sled.Kind); - OutStreamer->EmitBytes( - StringRef(reinterpret_cast<const char *>(&Kind), 1)); - OutStreamer->EmitBytes( - StringRef(reinterpret_cast<const char *>(&Sled.AlwaysInstrument), 1)); - OutStreamer->EmitZeros(14); - } - OutStreamer->SwitchSection(PrevSection); - } - Sleds.clear(); +void X86AsmPrinter::LowerPATCHABLE_TAIL_CALL(const MachineInstr &MI, X86MCInstLower &MCIL) { + // Like PATCHABLE_RET, we have the actual instruction in the operands to this + // instruction so we lower that particular instruction and its operands. + // Unlike PATCHABLE_RET though, we put the sled before the JMP, much like how + // we do it for PATCHABLE_FUNCTION_ENTER. The sled should be very similar to + // the PATCHABLE_FUNCTION_ENTER case, followed by the lowering of the actual + // tail call much like how we have it in PATCHABLE_RET. + auto CurSled = OutContext.createTempSymbol("xray_sled_", true); + OutStreamer->EmitCodeAlignment(2); + OutStreamer->EmitLabel(CurSled); + auto Target = OutContext.createTempSymbol(); + + // Use a two-byte `jmp`. This version of JMP takes an 8-bit relative offset as + // an operand (computed as an offset from the jmp instruction). + // FIXME: Find another less hacky way do force the relative jump. + OutStreamer->EmitBytes("\xeb\x09"); + EmitNops(*OutStreamer, 9, Subtarget->is64Bit(), getSubtargetInfo()); + OutStreamer->EmitLabel(Target); + recordSled(CurSled, MI, SledKind::TAIL_CALL); + + unsigned OpCode = MI.getOperand(0).getImm(); + MCInst TC; + TC.setOpcode(OpCode); + + // Before emitting the instruction, add a comment to indicate that this is + // indeed a tail call. + OutStreamer->AddComment("TAILCALL"); + for (auto &MO : make_range(MI.operands_begin() + 1, MI.operands_end())) + if (auto MaybeOperand = MCIL.LowerMachineOperand(&MI, MO)) + TC.addOperand(MaybeOperand.getValue()); + OutStreamer->EmitInstruction(TC, getSubtargetInfo()); } // Returns instruction preceding MBBI in MachineFunction. @@ -1152,9 +1145,9 @@ static const Constant *getConstantFromPool(const MachineInstr &MI, return C; } -static std::string getShuffleComment(const MachineOperand &DstOp, - const MachineOperand &SrcOp1, - const MachineOperand &SrcOp2, +static std::string getShuffleComment(const MachineInstr *MI, + unsigned SrcOp1Idx, + unsigned SrcOp2Idx, ArrayRef<int> Mask) { std::string Comment; @@ -1167,7 +1160,10 @@ static std::string getShuffleComment(const MachineOperand &DstOp, return X86ATTInstPrinter::getRegisterName(RegNum); }; - // TODO: Add support for specifying an AVX512 style mask register in the comment. 
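As an aside, a simplified standalone sketch of the comment text assembled by the shuffle-comment code in this hunk (hypothetical function name, deliberately simplified per-element rendering): destination register, an optional AVX512 write mask, an optional {z} for the zeroing form, then the shuffle indices.

#include <cstddef>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

// Hypothetical re-creation of the prefix format "zmmX {%kY} {z} = ...".
static std::string shuffleComment(const std::string &Dst,
                                  const std::string &WriteMask, bool Zeroing,
                                  const std::vector<int> &Mask) {
  std::ostringstream CS;
  CS << Dst;
  if (!WriteMask.empty()) {
    CS << " {%" << WriteMask << "}";
    if (Zeroing)
      CS << " {z}";
  }
  CS << " = ";
  for (std::size_t i = 0, e = Mask.size(); i != e; ++i) {
    if (i != 0)
      CS << ",";
    CS << Mask[i]; // simplified; the real comment names source registers too
  }
  return CS.str();
}

int main() {
  // Prints: zmm0 {%k1} {z} = 0,1,18,19
  std::cout << shuffleComment("zmm0", "k1", true, {0, 1, 18, 19}) << "\n";
  return 0;
}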
+ const MachineOperand &DstOp = MI->getOperand(0); + const MachineOperand &SrcOp1 = MI->getOperand(SrcOp1Idx); + const MachineOperand &SrcOp2 = MI->getOperand(SrcOp2Idx); + StringRef DstName = DstOp.isReg() ? GetRegisterName(DstOp.getReg()) : "mem"; StringRef Src1Name = SrcOp1.isReg() ? GetRegisterName(SrcOp1.getReg()) : "mem"; @@ -1182,7 +1178,26 @@ static std::string getShuffleComment(const MachineOperand &DstOp, ShuffleMask[i] -= e; raw_string_ostream CS(Comment); - CS << DstName << " = "; + CS << DstName; + + // Handle AVX512 MASK/MASXZ write mask comments. + // MASK: zmmX {%kY} + // MASKZ: zmmX {%kY} {z} + if (SrcOp1Idx > 1) { + assert((SrcOp1Idx == 2 || SrcOp1Idx == 3) && "Unexpected writemask"); + + const MachineOperand &WriteMaskOp = MI->getOperand(SrcOp1Idx - 1); + if (WriteMaskOp.isReg()) { + CS << " {%" << GetRegisterName(WriteMaskOp.getReg()) << "}"; + + if (SrcOp1Idx == 2) { + CS << " {z}"; + } + } + } + + CS << " = "; + for (int i = 0, e = ShuffleMask.size(); i != e; ++i) { if (i != 0) CS << ","; @@ -1221,6 +1236,13 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { X86MCInstLower MCInstLowering(*MF, *this); const X86RegisterInfo *RI = MF->getSubtarget<X86Subtarget>().getRegisterInfo(); + // Add a comment about EVEX-2-VEX compression for AVX-512 instrs that + // are compressed from EVEX encoding to VEX encoding. + if (TM.Options.MCOptions.ShowMCEncoding) { + if (MI->getAsmPrinterFlags() & AC_EVEX_2_VEX) + OutStreamer->AddComment("EVEX TO VEX Compression ", false); + } + switch (MI->getOpcode()) { case TargetOpcode::DBG_VALUE: llvm_unreachable("Should be handled target independently"); @@ -1259,7 +1281,6 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { case X86::TAILJMPd64: case X86::TAILJMPr64_REX: case X86::TAILJMPm64_REX: - case X86::TAILJMPd64_REX: // Lower these as normal, but add some comments. 
OutStreamer->AddComment("TAILCALL"); break; @@ -1364,6 +1385,9 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { case TargetOpcode::PATCHABLE_RET: return LowerPATCHABLE_RET(*MI, MCInstLowering); + case TargetOpcode::PATCHABLE_TAIL_CALL: + return LowerPATCHABLE_TAIL_CALL(*MI, MCInstLowering); + case X86::MORESTACK_RET: EmitAndCountInstruction(MCInstBuilder(getRetOpcode(*Subtarget))); return; @@ -1377,37 +1401,45 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { return; case X86::SEH_PushReg: + assert(MF->hasWinCFI() && "SEH_ instruction in function without WinCFI?"); OutStreamer->EmitWinCFIPushReg(RI->getSEHRegNum(MI->getOperand(0).getImm())); return; case X86::SEH_SaveReg: + assert(MF->hasWinCFI() && "SEH_ instruction in function without WinCFI?"); OutStreamer->EmitWinCFISaveReg(RI->getSEHRegNum(MI->getOperand(0).getImm()), MI->getOperand(1).getImm()); return; case X86::SEH_SaveXMM: + assert(MF->hasWinCFI() && "SEH_ instruction in function without WinCFI?"); OutStreamer->EmitWinCFISaveXMM(RI->getSEHRegNum(MI->getOperand(0).getImm()), MI->getOperand(1).getImm()); return; case X86::SEH_StackAlloc: + assert(MF->hasWinCFI() && "SEH_ instruction in function without WinCFI?"); OutStreamer->EmitWinCFIAllocStack(MI->getOperand(0).getImm()); return; case X86::SEH_SetFrame: + assert(MF->hasWinCFI() && "SEH_ instruction in function without WinCFI?"); OutStreamer->EmitWinCFISetFrame(RI->getSEHRegNum(MI->getOperand(0).getImm()), MI->getOperand(1).getImm()); return; case X86::SEH_PushFrame: + assert(MF->hasWinCFI() && "SEH_ instruction in function without WinCFI?"); OutStreamer->EmitWinCFIPushFrame(MI->getOperand(0).getImm()); return; case X86::SEH_EndPrologue: + assert(MF->hasWinCFI() && "SEH_ instruction in function without WinCFI?"); OutStreamer->EmitWinCFIEndProlog(); return; case X86::SEH_Epilogue: { + assert(MF->hasWinCFI() && "SEH_ instruction in function without WinCFI?"); MachineBasicBlock::const_iterator MBBI(MI); // Check if preceded by a call and emit nop if so. 
for (MBBI = PrevCrossBBInst(MBBI); @@ -1463,59 +1495,84 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { assert(MI->getNumOperands() >= 6 && "We should always have at least 6 operands!"); - const MachineOperand &DstOp = MI->getOperand(0); - const MachineOperand &SrcOp = MI->getOperand(SrcIdx); - const MachineOperand &MaskOp = MI->getOperand(MaskIdx); + const MachineOperand &MaskOp = MI->getOperand(MaskIdx); if (auto *C = getConstantFromPool(*MI, MaskOp)) { - SmallVector<int, 16> Mask; + SmallVector<int, 64> Mask; DecodePSHUFBMask(C, Mask); if (!Mask.empty()) - OutStreamer->AddComment(getShuffleComment(DstOp, SrcOp, SrcOp, Mask)); + OutStreamer->AddComment(getShuffleComment(MI, SrcIdx, SrcIdx, Mask)); } break; } + case X86::VPERMILPSrm: + case X86::VPERMILPSYrm: + case X86::VPERMILPSZ128rm: + case X86::VPERMILPSZ128rmk: + case X86::VPERMILPSZ128rmkz: + case X86::VPERMILPSZ256rm: + case X86::VPERMILPSZ256rmk: + case X86::VPERMILPSZ256rmkz: + case X86::VPERMILPSZrm: + case X86::VPERMILPSZrmk: + case X86::VPERMILPSZrmkz: case X86::VPERMILPDrm: case X86::VPERMILPDYrm: case X86::VPERMILPDZ128rm: + case X86::VPERMILPDZ128rmk: + case X86::VPERMILPDZ128rmkz: case X86::VPERMILPDZ256rm: - case X86::VPERMILPDZrm: { + case X86::VPERMILPDZ256rmk: + case X86::VPERMILPDZ256rmkz: + case X86::VPERMILPDZrm: + case X86::VPERMILPDZrmk: + case X86::VPERMILPDZrmkz: { if (!OutStreamer->isVerboseAsm()) break; - assert(MI->getNumOperands() > 5 && - "We should always have at least 5 operands!"); - const MachineOperand &DstOp = MI->getOperand(0); - const MachineOperand &SrcOp = MI->getOperand(1); - const MachineOperand &MaskOp = MI->getOperand(5); - - if (auto *C = getConstantFromPool(*MI, MaskOp)) { - SmallVector<int, 8> Mask; - DecodeVPERMILPMask(C, 64, Mask); - if (!Mask.empty()) - OutStreamer->AddComment(getShuffleComment(DstOp, SrcOp, SrcOp, Mask)); + unsigned SrcIdx, MaskIdx; + unsigned ElSize; + switch (MI->getOpcode()) { + default: llvm_unreachable("Invalid opcode"); + case X86::VPERMILPSrm: + case X86::VPERMILPSYrm: + case X86::VPERMILPSZ128rm: + case X86::VPERMILPSZ256rm: + case X86::VPERMILPSZrm: + SrcIdx = 1; MaskIdx = 5; ElSize = 32; break; + case X86::VPERMILPSZ128rmkz: + case X86::VPERMILPSZ256rmkz: + case X86::VPERMILPSZrmkz: + SrcIdx = 2; MaskIdx = 6; ElSize = 32; break; + case X86::VPERMILPSZ128rmk: + case X86::VPERMILPSZ256rmk: + case X86::VPERMILPSZrmk: + SrcIdx = 3; MaskIdx = 7; ElSize = 32; break; + case X86::VPERMILPDrm: + case X86::VPERMILPDYrm: + case X86::VPERMILPDZ128rm: + case X86::VPERMILPDZ256rm: + case X86::VPERMILPDZrm: + SrcIdx = 1; MaskIdx = 5; ElSize = 64; break; + case X86::VPERMILPDZ128rmkz: + case X86::VPERMILPDZ256rmkz: + case X86::VPERMILPDZrmkz: + SrcIdx = 2; MaskIdx = 6; ElSize = 64; break; + case X86::VPERMILPDZ128rmk: + case X86::VPERMILPDZ256rmk: + case X86::VPERMILPDZrmk: + SrcIdx = 3; MaskIdx = 7; ElSize = 64; break; } - break; - } - case X86::VPERMILPSrm: - case X86::VPERMILPSYrm: - case X86::VPERMILPSZ128rm: - case X86::VPERMILPSZ256rm: - case X86::VPERMILPSZrm: { - if (!OutStreamer->isVerboseAsm()) - break; - assert(MI->getNumOperands() > 5 && - "We should always have at least 5 operands!"); - const MachineOperand &DstOp = MI->getOperand(0); - const MachineOperand &SrcOp = MI->getOperand(1); - const MachineOperand &MaskOp = MI->getOperand(5); + assert(MI->getNumOperands() >= 6 && + "We should always have at least 6 operands!"); + const MachineOperand &MaskOp = MI->getOperand(MaskIdx); if (auto *C = getConstantFromPool(*MI, MaskOp)) { SmallVector<int, 
16> Mask; - DecodeVPERMILPMask(C, 32, Mask); + DecodeVPERMILPMask(C, ElSize, Mask); if (!Mask.empty()) - OutStreamer->AddComment(getShuffleComment(DstOp, SrcOp, SrcOp, Mask)); + OutStreamer->AddComment(getShuffleComment(MI, SrcIdx, SrcIdx, Mask)); } break; } @@ -1526,14 +1583,10 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { case X86::VPERMIL2PSrmY: { if (!OutStreamer->isVerboseAsm()) break; - assert(MI->getNumOperands() > 7 && - "We should always have at least 7 operands!"); - const MachineOperand &DstOp = MI->getOperand(0); - const MachineOperand &SrcOp1 = MI->getOperand(1); - const MachineOperand &SrcOp2 = MI->getOperand(2); - const MachineOperand &MaskOp = MI->getOperand(6); - const MachineOperand &CtrlOp = MI->getOperand(MI->getNumOperands() - 1); + assert(MI->getNumOperands() >= 8 && + "We should always have at least 8 operands!"); + const MachineOperand &CtrlOp = MI->getOperand(MI->getNumOperands() - 1); if (!CtrlOp.isImm()) break; @@ -1544,11 +1597,12 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { case X86::VPERMIL2PDrm: case X86::VPERMIL2PDrmY: ElSize = 64; break; } + const MachineOperand &MaskOp = MI->getOperand(6); if (auto *C = getConstantFromPool(*MI, MaskOp)) { SmallVector<int, 16> Mask; DecodeVPERMIL2PMask(C, (unsigned)CtrlOp.getImm(), ElSize, Mask); if (!Mask.empty()) - OutStreamer->AddComment(getShuffleComment(DstOp, SrcOp1, SrcOp2, Mask)); + OutStreamer->AddComment(getShuffleComment(MI, 1, 2, Mask)); } break; } @@ -1556,18 +1610,15 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { case X86::VPPERMrrm: { if (!OutStreamer->isVerboseAsm()) break; - assert(MI->getNumOperands() > 6 && - "We should always have at least 6 operands!"); - const MachineOperand &DstOp = MI->getOperand(0); - const MachineOperand &SrcOp1 = MI->getOperand(1); - const MachineOperand &SrcOp2 = MI->getOperand(2); - const MachineOperand &MaskOp = MI->getOperand(6); + assert(MI->getNumOperands() >= 7 && + "We should always have at least 7 operands!"); + const MachineOperand &MaskOp = MI->getOperand(6); if (auto *C = getConstantFromPool(*MI, MaskOp)) { SmallVector<int, 16> Mask; DecodeVPPERMMask(C, Mask); if (!Mask.empty()) - OutStreamer->AddComment(getShuffleComment(DstOp, SrcOp1, SrcOp2, Mask)); + OutStreamer->AddComment(getShuffleComment(MI, 1, 2, Mask)); } break; } @@ -1605,7 +1656,8 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { CASE_ALL_MOV_RM() if (!OutStreamer->isVerboseAsm()) break; - if (MI->getNumOperands() > 4) + if (MI->getNumOperands() <= 4) + break; if (auto *C = getConstantFromPool(*MI, MI->getOperand(4))) { std::string Comment; raw_string_ostream CS(Comment); diff --git a/contrib/llvm/lib/Target/X86/X86OptimizeLEAs.cpp b/contrib/llvm/lib/Target/X86/X86OptimizeLEAs.cpp index 4da0fdd..e144700 100644 --- a/contrib/llvm/lib/Target/X86/X86OptimizeLEAs.cpp +++ b/contrib/llvm/lib/Target/X86/X86OptimizeLEAs.cpp @@ -44,12 +44,6 @@ static cl::opt<bool> STATISTIC(NumSubstLEAs, "Number of LEA instruction substitutions"); STATISTIC(NumRedundantLEAs, "Number of redundant LEA instructions removed"); -class MemOpKey; - -/// \brief Returns a hash table key based on memory operands of \p MI. The -/// number of the first memory operand of \p MI is specified through \p N. -static inline MemOpKey getMemOpKey(const MachineInstr &MI, unsigned N); - /// \brief Returns true if two machine operands are identical and they are not /// physical registers. 
static inline bool isIdenticalOp(const MachineOperand &MO1, @@ -63,6 +57,7 @@ static bool isSimilarDispOp(const MachineOperand &MO1, /// \brief Returns true if the instruction is LEA. static inline bool isLEA(const MachineInstr &MI); +namespace { /// A key based on instruction's memory operands. class MemOpKey { public: @@ -95,6 +90,7 @@ public: // Address' displacement operand. const MachineOperand *Disp; }; +} // end anonymous namespace /// Provide DenseMapInfo for MemOpKey. namespace llvm { @@ -168,6 +164,8 @@ template <> struct DenseMapInfo<MemOpKey> { }; } +/// \brief Returns a hash table key based on memory operands of \p MI. The +/// number of the first memory operand of \p MI is specified through \p N. static inline MemOpKey getMemOpKey(const MachineInstr &MI, unsigned N) { assert((isLEA(MI) || MI.mayLoadOrStore()) && "The instruction must be a LEA, a load or a store"); @@ -221,7 +219,7 @@ class OptimizeLEAPass : public MachineFunctionPass { public: OptimizeLEAPass() : MachineFunctionPass(ID) {} - const char *getPassName() const override { return "X86 LEA Optimize"; } + StringRef getPassName() const override { return "X86 LEA Optimize"; } /// \brief Loop over all of the basic blocks, replacing address /// calculations in load and store instructions, if it's already @@ -237,7 +235,7 @@ private: /// \brief Choose the best \p LEA instruction from the \p List to replace /// address calculation in \p MI instruction. Return the address displacement - /// and the distance between \p MI and the choosen \p BestLEA in + /// and the distance between \p MI and the chosen \p BestLEA in /// \p AddrDispShift and \p Dist. bool chooseBestLEA(const SmallVectorImpl<MachineInstr *> &List, const MachineInstr &MI, MachineInstr *&BestLEA, @@ -551,10 +549,10 @@ bool OptimizeLEAPass::removeRedundantLEAs(MemOpMap &LEAs) { MachineInstr &Last = **I2; int64_t AddrDispShift; - // LEAs should be in occurence order in the list, so we can freely + // LEAs should be in occurrence order in the list, so we can freely // replace later LEAs with earlier ones. assert(calcInstrDist(First, Last) > 0 && - "LEAs must be in occurence order in the list"); + "LEAs must be in occurrence order in the list"); // Check that the Last LEA instruction can be replaced by the First. 
if (!isReplaceable(First, Last, AddrDispShift)) { diff --git a/contrib/llvm/lib/Target/X86/X86PadShortFunction.cpp b/contrib/llvm/lib/Target/X86/X86PadShortFunction.cpp index 62a9aaf..3069d1f 100644 --- a/contrib/llvm/lib/Target/X86/X86PadShortFunction.cpp +++ b/contrib/llvm/lib/Target/X86/X86PadShortFunction.cpp @@ -57,10 +57,10 @@ namespace { MachineFunctionProperties getRequiredProperties() const override { return MachineFunctionProperties().set( - MachineFunctionProperties::Property::AllVRegsAllocated); + MachineFunctionProperties::Property::NoVRegs); } - const char *getPassName() const override { + StringRef getPassName() const override { return "X86 Atom pad short functions"; } diff --git a/contrib/llvm/lib/Target/X86/X86RegisterInfo.cpp b/contrib/llvm/lib/Target/X86/X86RegisterInfo.cpp index 8675063..65f438f 100644 --- a/contrib/llvm/lib/Target/X86/X86RegisterInfo.cpp +++ b/contrib/llvm/lib/Target/X86/X86RegisterInfo.cpp @@ -128,21 +128,44 @@ X86RegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC, if (RC == &X86::GR8_NOREXRegClass) return RC; + const X86Subtarget &Subtarget = MF.getSubtarget<X86Subtarget>(); + const TargetRegisterClass *Super = RC; TargetRegisterClass::sc_iterator I = RC->getSuperClasses(); do { switch (Super->getID()) { + case X86::FR32RegClassID: + case X86::FR64RegClassID: + // If AVX-512 isn't supported we should only inflate to these classes. + if (!Subtarget.hasAVX512() && Super->getSize() == RC->getSize()) + return Super; + break; + case X86::VR128RegClassID: + case X86::VR256RegClassID: + // If VLX isn't supported we should only inflate to these classes. + if (!Subtarget.hasVLX() && Super->getSize() == RC->getSize()) + return Super; + break; + case X86::VR128XRegClassID: + case X86::VR256XRegClassID: + // If VLX isn't support we shouldn't inflate to these classes. + if (Subtarget.hasVLX() && Super->getSize() == RC->getSize()) + return Super; + break; + case X86::FR32XRegClassID: + case X86::FR64XRegClassID: + // If AVX-512 isn't support we shouldn't inflate to these classes. + if (Subtarget.hasAVX512() && Super->getSize() == RC->getSize()) + return Super; + break; case X86::GR8RegClassID: case X86::GR16RegClassID: case X86::GR32RegClassID: case X86::GR64RegClassID: - case X86::FR32RegClassID: - case X86::FR64RegClassID: case X86::RFP32RegClassID: case X86::RFP64RegClassID: case X86::RFP80RegClassID: - case X86::VR128RegClassID: - case X86::VR256RegClassID: + case X86::VR512RegClassID: // Don't return a super-class that would shrink the spill size. // That can happen with the vector and float classes. if (Super->getSize() == RC->getSize()) @@ -241,13 +264,14 @@ X86RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, const MCPhysReg * X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { + assert(MF && "MachineFunction required"); + const X86Subtarget &Subtarget = MF->getSubtarget<X86Subtarget>(); bool HasSSE = Subtarget.hasSSE1(); bool HasAVX = Subtarget.hasAVX(); bool HasAVX512 = Subtarget.hasAVX512(); - bool CallsEHReturn = MF->getMMI().callsEHReturn(); + bool CallsEHReturn = MF->callsEHReturn(); - assert(MF && "MachineFunction required"); switch (MF->getFunction()->getCallingConv()) { case CallingConv::GHC: case CallingConv::HiPE: @@ -282,11 +306,26 @@ X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { } case CallingConv::HHVM: return CSR_64_HHVM_SaveList; + case CallingConv::X86_RegCall: + if (Is64Bit) { + if (IsWin64) { + return (HasSSE ? 
CSR_Win64_RegCall_SaveList : + CSR_Win64_RegCall_NoSSE_SaveList); + } else { + return (HasSSE ? CSR_SysV64_RegCall_SaveList : + CSR_SysV64_RegCall_NoSSE_SaveList); + } + } else { + return (HasSSE ? CSR_32_RegCall_SaveList : + CSR_32_RegCall_NoSSE_SaveList); + } case CallingConv::Cold: if (Is64Bit) return CSR_64_MostRegs_SaveList; break; case CallingConv::X86_64_Win64: + if (!HasSSE) + return CSR_Win64_NoSSE_SaveList; return CSR_Win64_SaveList; case CallingConv::X86_64_SysV: if (CallsEHReturn) @@ -313,8 +352,11 @@ X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { } if (Is64Bit) { - if (IsWin64) + if (IsWin64) { + if (!HasSSE) + return CSR_Win64_NoSSE_SaveList; return CSR_Win64_SaveList; + } if (CallsEHReturn) return CSR_64EHRet_SaveList; if (Subtarget.getTargetLowering()->supportSwiftError() && @@ -378,6 +420,19 @@ X86RegisterInfo::getCallPreservedMask(const MachineFunction &MF, } case CallingConv::HHVM: return CSR_64_HHVM_RegMask; + case CallingConv::X86_RegCall: + if (Is64Bit) { + if (IsWin64) { + return (HasSSE ? CSR_Win64_RegCall_RegMask : + CSR_Win64_RegCall_NoSSE_RegMask); + } else { + return (HasSSE ? CSR_SysV64_RegCall_RegMask : + CSR_SysV64_RegCall_NoSSE_RegMask); + } + } else { + return (HasSSE ? CSR_32_RegCall_RegMask : + CSR_32_RegCall_NoSSE_RegMask); + } case CallingConv::Cold: if (Is64Bit) return CSR_64_MostRegs_RegMask; @@ -503,6 +558,8 @@ BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const { } } + assert(checkAllSuperRegsMarked(Reserved, + {X86::SIL, X86::DIL, X86::BPL, X86::SPL})); return Reserved; } @@ -526,12 +583,12 @@ void X86RegisterInfo::adjustStackMapLiveOutMask(uint32_t *Mask) const { // Stack Frame Processing methods //===----------------------------------------------------------------------===// -static bool CantUseSP(const MachineFrameInfo *MFI) { - return MFI->hasVarSizedObjects() || MFI->hasOpaqueSPAdjustment(); +static bool CantUseSP(const MachineFrameInfo &MFI) { + return MFI.hasVarSizedObjects() || MFI.hasOpaqueSPAdjustment(); } bool X86RegisterInfo::hasBasePointer(const MachineFunction &MF) const { - const MachineFrameInfo *MFI = MF.getFrameInfo(); + const MachineFrameInfo &MFI = MF.getFrameInfo(); if (!EnableBasePointer) return false; @@ -549,7 +606,7 @@ bool X86RegisterInfo::canRealignStack(const MachineFunction &MF) const { if (!TargetRegisterInfo::canRealignStack(MF)) return false; - const MachineFrameInfo *MFI = MF.getFrameInfo(); + const MachineFrameInfo &MFI = MF.getFrameInfo(); const MachineRegisterInfo *MRI = &MF.getRegInfo(); // Stack realignment requires a frame pointer. If we already started @@ -571,6 +628,35 @@ bool X86RegisterInfo::hasReservedSpillSlot(const MachineFunction &MF, llvm_unreachable("Unused function on X86. Otherwise need a test case."); } +// tryOptimizeLEAtoMOV - helper function that tries to replace a LEA instruction +// of the form 'lea (%esp), %ebx' --> 'mov %esp, %ebx'. +// TODO: In this case we should be really trying first to entirely eliminate +// this instruction which is a plain copy. 
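A minimal standalone model of the pattern check described in the comment above, with plain integers standing in for MachineOperand and X86::NoRegister: an LEA whose address reduces to just the base register (scale 1, no index, zero displacement, no segment) computes nothing and can be rewritten as a register-to-register MOV.

#include <cassert>
#include <cstdint>

// Hypothetical, self-contained model of the operand layout checked by the
// helper below; 0 stands in for "no register".
struct LEAAddr {
  unsigned Base = 0;
  int64_t Scale = 1;
  unsigned Index = 0;
  int64_t Disp = 0;
  unsigned Segment = 0;
};

static bool isPlainCopy(const LEAAddr &A) {
  return A.Scale == 1 && A.Index == 0 && A.Disp == 0 && A.Segment == 0;
}

int main() {
  LEAAddr JustESP;            // models 'lea (%esp), %ebx'
  JustESP.Base = 4;           // arbitrary stand-in for ESP's register number
  assert(isPlainCopy(JustESP));

  LEAAddr WithDisp = JustESP; // 'lea 8(%esp), %ebx' still needs the add
  WithDisp.Disp = 8;
  assert(!isPlainCopy(WithDisp));
  return 0;
}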
+static bool tryOptimizeLEAtoMOV(MachineBasicBlock::iterator II) { + MachineInstr &MI = *II; + unsigned Opc = II->getOpcode(); + // Check if this is a LEA of the form 'lea (%esp), %ebx' + if ((Opc != X86::LEA32r && Opc != X86::LEA64r && Opc != X86::LEA64_32r) || + MI.getOperand(2).getImm() != 1 || + MI.getOperand(3).getReg() != X86::NoRegister || + MI.getOperand(4).getImm() != 0 || + MI.getOperand(5).getReg() != X86::NoRegister) + return false; + unsigned BasePtr = MI.getOperand(1).getReg(); + // In X32 mode, ensure the base-pointer is a 32-bit operand, so the LEA will + // be replaced with a 32-bit operand MOV which will zero extend the upper + // 32-bits of the super register. + if (Opc == X86::LEA64_32r) + BasePtr = getX86SubSuperRegister(BasePtr, 32); + unsigned NewDestReg = MI.getOperand(0).getReg(); + const X86InstrInfo *TII = + MI.getParent()->getParent()->getSubtarget<X86Subtarget>().getInstrInfo(); + TII->copyPhysReg(*MI.getParent(), II, MI.getDebugLoc(), NewDestReg, BasePtr, + MI.getOperand(1).isKill()); + MI.eraseFromParent(); + return true; +} + void X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj, unsigned FIOperandNum, @@ -611,19 +697,21 @@ X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, // For LEA64_32r when BasePtr is 32-bits (X32) we can use full-size 64-bit // register as source operand, semantic is the same and destination is // 32-bits. It saves one byte per lea in code since 0x67 prefix is avoided. + // Don't change BasePtr since it is used later for stack adjustment. + unsigned MachineBasePtr = BasePtr; if (Opc == X86::LEA64_32r && X86::GR32RegClass.contains(BasePtr)) - BasePtr = getX86SubSuperRegister(BasePtr, 64); + MachineBasePtr = getX86SubSuperRegister(BasePtr, 64); // This must be part of a four operand memory reference. Replace the - // FrameIndex with base register with EBP. Add an offset to the offset. - MI.getOperand(FIOperandNum).ChangeToRegister(BasePtr, false); + // FrameIndex with base register. Add an offset to the offset. + MI.getOperand(FIOperandNum).ChangeToRegister(MachineBasePtr, false); // Now add the frame object offset to the offset from EBP. int FIOffset; if (AfterFPPop) { // Tail call jmp happens after FP is popped. - const MachineFrameInfo *MFI = MF.getFrameInfo(); - FIOffset = MFI->getObjectOffset(FrameIndex) - TFI->getOffsetOfLocalArea(); + const MachineFrameInfo &MFI = MF.getFrameInfo(); + FIOffset = MFI.getObjectOffset(FrameIndex) - TFI->getOffsetOfLocalArea(); } else FIOffset = TFI->getFrameIndexReference(MF, FrameIndex, IgnoredFrameReg); @@ -645,7 +733,8 @@ X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, int Offset = FIOffset + Imm; assert((!Is64Bit || isInt<32>((long long)FIOffset + Imm)) && "Requesting 64-bit offset in 32-bit immediate!"); - MI.getOperand(FIOperandNum + 3).ChangeToImmediate(Offset); + if (Offset != 0 || !tryOptimizeLEAtoMOV(II)) + MI.getOperand(FIOperandNum + 3).ChangeToImmediate(Offset); } else { // Offset is symbolic. This is extremely rare. 
uint64_t Offset = FIOffset + @@ -667,13 +756,3 @@ X86RegisterInfo::getPtrSizedFrameRegister(const MachineFunction &MF) const { FrameReg = getX86SubSuperRegister(FrameReg, 32); return FrameReg; } - -unsigned llvm::get512BitSuperRegister(unsigned Reg) { - if (Reg >= X86::XMM0 && Reg <= X86::XMM31) - return X86::ZMM0 + (Reg - X86::XMM0); - if (Reg >= X86::YMM0 && Reg <= X86::YMM31) - return X86::ZMM0 + (Reg - X86::YMM0); - if (Reg >= X86::ZMM0 && Reg <= X86::ZMM31) - return Reg; - llvm_unreachable("Unexpected SIMD register"); -} diff --git a/contrib/llvm/lib/Target/X86/X86RegisterInfo.h b/contrib/llvm/lib/Target/X86/X86RegisterInfo.h index 8d0094c..58fa31e 100644 --- a/contrib/llvm/lib/Target/X86/X86RegisterInfo.h +++ b/contrib/llvm/lib/Target/X86/X86RegisterInfo.h @@ -100,7 +100,7 @@ public: const MCPhysReg * getCalleeSavedRegs(const MachineFunction* MF) const override; const MCPhysReg * - getCalleeSavedRegsViaCopy(const MachineFunction *MF) const override; + getCalleeSavedRegsViaCopy(const MachineFunction *MF) const; const uint32_t *getCallPreservedMask(const MachineFunction &MF, CallingConv::ID) const override; const uint32_t *getNoPreservedMask() const override; @@ -137,9 +137,6 @@ public: unsigned getSlotSize() const { return SlotSize; } }; -//get512BitRegister - X86 utility - returns 512-bit super register -unsigned get512BitSuperRegister(unsigned Reg); - } // End llvm namespace #endif diff --git a/contrib/llvm/lib/Target/X86/X86RegisterInfo.td b/contrib/llvm/lib/Target/X86/X86RegisterInfo.td index 373f9b4..372a15a 100644 --- a/contrib/llvm/lib/Target/X86/X86RegisterInfo.td +++ b/contrib/llvm/lib/Target/X86/X86RegisterInfo.td @@ -345,6 +345,8 @@ def GR32 : RegisterClass<"X86", [i32], 32, // GR64 - 64-bit GPRs. This oddly includes RIP, which isn't accurate, since // RIP isn't really a register and it can't be used anywhere except in an // address, but it doesn't cause trouble. +// FIXME: it *does* cause trouble - CheckBaseRegAndIndexReg() has extra +// tests because of the inclusion of RIP in this register class. def GR64 : RegisterClass<"X86", [i64], 64, (add RAX, RCX, RDX, RSI, RDI, R8, R9, R10, R11, RBX, R14, R15, R12, R13, RBP, RSP, RIP)>; diff --git a/contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp b/contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp index d02859b..f031a28 100644 --- a/contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp +++ b/contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp @@ -31,8 +31,8 @@ bool X86SelectionDAGInfo::isBaseRegConflictPossible( // alignment requirements. Fall back to generic code if there are any // dynamic stack adjustments (hopefully rare) and the base pointer would // conflict if we had to use it. 
- MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); - if (!MFI->hasVarSizedObjects() && !MFI->hasOpaqueSPAdjustment()) + MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); + if (!MFI.hasVarSizedObjects() && !MFI.hasOpaqueSPAdjustment()) return false; const X86RegisterInfo *TRI = static_cast<const X86RegisterInfo *>( diff --git a/contrib/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp b/contrib/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp index 1adc92c..1111552 100644 --- a/contrib/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp +++ b/contrib/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp @@ -14,6 +14,7 @@ #include "X86ShuffleDecodeConstantPool.h" #include "Utils/X86ShuffleDecode.h" +#include "llvm/ADT/SmallBitVector.h" #include "llvm/CodeGen/MachineValueType.h" #include "llvm/IR/Constants.h" @@ -23,10 +24,12 @@ namespace llvm { -void DecodePSHUFBMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask) { - Type *MaskTy = C->getType(); - // It is not an error for the PSHUFB mask to not be a vector of i8 because the - // constant pool uniques constants by their bit representation. +static bool extractConstantMask(const Constant *C, unsigned MaskEltSizeInBits, + SmallBitVector &UndefElts, + SmallVectorImpl<uint64_t> &RawMask) { + // It is not an error for shuffle masks to not be a vector of + // MaskEltSizeInBits because the constant pool uniques constants by their + // bit representation. // e.g. the following take up the same space in the constant pool: // i128 -170141183420855150465331762880109871104 // @@ -34,165 +37,161 @@ void DecodePSHUFBMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask) { // // <4 x i32> <i32 -2147483648, i32 -2147483648, // i32 -2147483648, i32 -2147483648> + Type *CstTy = C->getType(); + if (!CstTy->isVectorTy()) + return false; + + Type *CstEltTy = CstTy->getVectorElementType(); + if (!CstEltTy->isIntegerTy()) + return false; + + unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits(); + unsigned CstEltSizeInBits = CstTy->getScalarSizeInBits(); + unsigned NumCstElts = CstTy->getVectorNumElements(); + + // Extract all the undef/constant element data and pack into single bitsets. + APInt UndefBits(CstSizeInBits, 0); + APInt MaskBits(CstSizeInBits, 0); + for (unsigned i = 0; i != NumCstElts; ++i) { + Constant *COp = C->getAggregateElement(i); + if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp))) + return false; -#ifndef NDEBUG - unsigned MaskTySize = MaskTy->getPrimitiveSizeInBits(); - assert(MaskTySize == 128 || MaskTySize == 256 || MaskTySize == 512); -#endif + if (isa<UndefValue>(COp)) { + APInt EltUndef = APInt::getLowBitsSet(CstSizeInBits, CstEltSizeInBits); + UndefBits |= EltUndef.shl(i * CstEltSizeInBits); + continue; + } - if (!MaskTy->isVectorTy()) - return; - int NumElts = MaskTy->getVectorNumElements(); + APInt EltBits = cast<ConstantInt>(COp)->getValue(); + EltBits = EltBits.zextOrTrunc(CstSizeInBits); + MaskBits |= EltBits.shl(i * CstEltSizeInBits); + } - Type *EltTy = MaskTy->getVectorElementType(); - if (!EltTy->isIntegerTy()) - return; + // Now extract the undef/constant bit data into the raw shuffle masks. + assert((CstSizeInBits % MaskEltSizeInBits) == 0 && + "Unaligned shuffle mask size"); - // The shuffle mask requires a byte vector - decode cases with - // wider elements as well. 
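A standalone sketch of the bit-repacking idea behind the extractConstantMask helper added in this hunk, with plain 64-bit words standing in for the Constant and APInt machinery: because the constant pool uniques constants by bit pattern, the mask may be typed as i128, <2 x i64> or <4 x i32>, so the decoder slices the raw bits at whatever element width the shuffle needs.

#include <cassert>
#include <cstdint>
#include <vector>

// Slice a constant, given as little-endian 64-bit words, into elements of
// EltBits bits each, independent of how the constant was originally typed.
static std::vector<uint64_t> sliceBits(const std::vector<uint64_t> &Words,
                                       unsigned EltBits) {
  assert(EltBits > 0 && EltBits <= 64 && 64 % EltBits == 0);
  std::vector<uint64_t> Elts;
  uint64_t Mask = (EltBits == 64) ? ~0ULL : ((1ULL << EltBits) - 1);
  for (uint64_t W : Words)
    for (unsigned Shift = 0; Shift != 64; Shift += EltBits)
      Elts.push_back((W >> Shift) & Mask);
  return Elts;
}

int main() {
  // The same 128 bits decode to the same sixteen mask bytes whether they came
  // from an i128, a <2 x i64>, or a <4 x i32> constant.
  std::vector<uint64_t> Bits = {0x0706050403020100ULL, 0x0F0E0D0C0B0A0908ULL};
  std::vector<uint64_t> Bytes = sliceBits(Bits, 8);
  assert(Bytes.size() == 16 && Bytes[0] == 0x00 && Bytes[15] == 0x0F);
  return 0;
}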
- unsigned BitWidth = cast<IntegerType>(EltTy)->getBitWidth(); - if ((BitWidth % 8) != 0) + unsigned NumMaskElts = CstSizeInBits / MaskEltSizeInBits; + UndefElts = SmallBitVector(NumMaskElts, false); + RawMask.resize(NumMaskElts, 0); + + for (unsigned i = 0; i != NumMaskElts; ++i) { + APInt EltUndef = UndefBits.lshr(i * MaskEltSizeInBits); + EltUndef = EltUndef.zextOrTrunc(MaskEltSizeInBits); + + // Only treat the element as UNDEF if all bits are UNDEF, otherwise + // treat it as zero. + if (EltUndef.isAllOnesValue()) { + UndefElts[i] = true; + RawMask[i] = 0; + continue; + } + + APInt EltBits = MaskBits.lshr(i * MaskEltSizeInBits); + EltBits = EltBits.zextOrTrunc(MaskEltSizeInBits); + RawMask[i] = EltBits.getZExtValue(); + } + + return true; +} + +void DecodePSHUFBMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask) { + Type *MaskTy = C->getType(); + unsigned MaskTySize = MaskTy->getPrimitiveSizeInBits(); + (void)MaskTySize; + assert((MaskTySize == 128 || MaskTySize == 256 || MaskTySize == 512) && + "Unexpected vector size."); + + // The shuffle mask requires a byte vector. + SmallBitVector UndefElts; + SmallVector<uint64_t, 32> RawMask; + if (!extractConstantMask(C, 8, UndefElts, RawMask)) return; - int Scale = BitWidth / 8; - int NumBytes = NumElts * Scale; - ShuffleMask.reserve(NumBytes); + unsigned NumElts = RawMask.size(); + assert((NumElts == 16 || NumElts == 32 || NumElts == 64) && + "Unexpected number of vector elements."); - for (int i = 0; i != NumElts; ++i) { - Constant *COp = C->getAggregateElement(i); - if (!COp) { - ShuffleMask.clear(); - return; - } else if (isa<UndefValue>(COp)) { - ShuffleMask.append(Scale, SM_SentinelUndef); + for (unsigned i = 0; i != NumElts; ++i) { + if (UndefElts[i]) { + ShuffleMask.push_back(SM_SentinelUndef); continue; } - APInt APElt = cast<ConstantInt>(COp)->getValue(); - for (int j = 0; j != Scale; ++j) { + uint64_t Element = RawMask[i]; + // If the high bit (7) of the byte is set, the element is zeroed. + if (Element & (1 << 7)) + ShuffleMask.push_back(SM_SentinelZero); + else { // For AVX vectors with 32 bytes the base of the shuffle is the 16-byte // lane of the vector we're inside. - int Base = ((i * Scale) + j) & ~0xf; - - uint64_t Element = APElt.getLoBits(8).getZExtValue(); - APElt = APElt.lshr(8); - - // If the high bit (7) of the byte is set, the element is zeroed. - if (Element & (1 << 7)) - ShuffleMask.push_back(SM_SentinelZero); - else { - // Only the least significant 4 bits of the byte are used. - int Index = Base + (Element & 0xf); - ShuffleMask.push_back(Index); - } + unsigned Base = i & ~0xf; + + // Only the least significant 4 bits of the byte are used. + int Index = Base + (Element & 0xf); + ShuffleMask.push_back(Index); } } - - assert(NumBytes == (int)ShuffleMask.size() && "Unexpected shuffle mask size"); } void DecodeVPERMILPMask(const Constant *C, unsigned ElSize, SmallVectorImpl<int> &ShuffleMask) { Type *MaskTy = C->getType(); - // It is not an error for the PSHUFB mask to not be a vector of i8 because the - // constant pool uniques constants by their bit representation. - // e.g. 
the following take up the same space in the constant pool: - // i128 -170141183420855150465331762880109871104 - // - // <2 x i64> <i64 -9223372034707292160, i64 -9223372034707292160> - // - // <4 x i32> <i32 -2147483648, i32 -2147483648, - // i32 -2147483648, i32 -2147483648> - - if (ElSize != 32 && ElSize != 64) - return; - unsigned MaskTySize = MaskTy->getPrimitiveSizeInBits(); - if (MaskTySize != 128 && MaskTySize != 256 && MaskTySize != 512) - return; - - // Only support vector types. - if (!MaskTy->isVectorTy()) + (void)MaskTySize; + assert((MaskTySize == 128 || MaskTySize == 256 || MaskTySize == 512) && + "Unexpected vector size."); + assert((ElSize == 32 || ElSize == 64) && "Unexpected vector element size."); + + // The shuffle mask requires elements the same size as the target. + SmallBitVector UndefElts; + SmallVector<uint64_t, 8> RawMask; + if (!extractConstantMask(C, ElSize, UndefElts, RawMask)) return; - // Make sure its an integer type. - Type *VecEltTy = MaskTy->getVectorElementType(); - if (!VecEltTy->isIntegerTy()) - return; - - // Support any element type from byte up to element size. - // This is necessary primarily because 64-bit elements get split to 32-bit - // in the constant pool on 32-bit target. - unsigned EltTySize = VecEltTy->getIntegerBitWidth(); - if (EltTySize < 8 || EltTySize > ElSize) - return; - - unsigned NumElements = MaskTySize / ElSize; - assert((NumElements == 2 || NumElements == 4 || NumElements == 8 || - NumElements == 16) && + unsigned NumElts = RawMask.size(); + unsigned NumEltsPerLane = 128 / ElSize; + assert((NumElts == 2 || NumElts == 4 || NumElts == 8 || NumElts == 16) && "Unexpected number of vector elements."); - ShuffleMask.reserve(NumElements); - unsigned NumElementsPerLane = 128 / ElSize; - unsigned Factor = ElSize / EltTySize; - for (unsigned i = 0; i < NumElements; ++i) { - Constant *COp = C->getAggregateElement(i * Factor); - if (!COp) { - ShuffleMask.clear(); - return; - } else if (isa<UndefValue>(COp)) { + for (unsigned i = 0; i != NumElts; ++i) { + if (UndefElts[i]) { ShuffleMask.push_back(SM_SentinelUndef); continue; } - int Index = i & ~(NumElementsPerLane - 1); - uint64_t Element = cast<ConstantInt>(COp)->getZExtValue(); + + int Index = i & ~(NumEltsPerLane - 1); + uint64_t Element = RawMask[i]; if (ElSize == 64) Index += (Element >> 1) & 0x1; else Index += Element & 0x3; + ShuffleMask.push_back(Index); } - - // TODO: Handle funny-looking vectors too. } void DecodeVPERMIL2PMask(const Constant *C, unsigned M2Z, unsigned ElSize, SmallVectorImpl<int> &ShuffleMask) { Type *MaskTy = C->getType(); - unsigned MaskTySize = MaskTy->getPrimitiveSizeInBits(); - if (MaskTySize != 128 && MaskTySize != 256) - return; + (void)MaskTySize; + assert((MaskTySize == 128 || MaskTySize == 256) && "Unexpected vector size."); - // Only support vector types. - if (!MaskTy->isVectorTy()) + // The shuffle mask requires elements the same size as the target. + SmallBitVector UndefElts; + SmallVector<uint64_t, 8> RawMask; + if (!extractConstantMask(C, ElSize, UndefElts, RawMask)) return; - // Make sure its an integer type. - Type *VecEltTy = MaskTy->getVectorElementType(); - if (!VecEltTy->isIntegerTy()) - return; - - // Support any element type from byte up to element size. - // This is necessary primarily because 64-bit elements get split to 32-bit - // in the constant pool on 32-bit target. 
- unsigned EltTySize = VecEltTy->getIntegerBitWidth(); - if (EltTySize < 8 || EltTySize > ElSize) - return; - - unsigned NumElements = MaskTySize / ElSize; - assert((NumElements == 2 || NumElements == 4 || NumElements == 8) && + unsigned NumElts = RawMask.size(); + unsigned NumEltsPerLane = 128 / ElSize; + assert((NumElts == 2 || NumElts == 4 || NumElts == 8) && "Unexpected number of vector elements."); - ShuffleMask.reserve(NumElements); - unsigned NumElementsPerLane = 128 / ElSize; - unsigned Factor = ElSize / EltTySize; - for (unsigned i = 0; i < NumElements; ++i) { - Constant *COp = C->getAggregateElement(i * Factor); - if (!COp) { - ShuffleMask.clear(); - return; - } else if (isa<UndefValue>(COp)) { + for (unsigned i = 0; i != NumElts; ++i) { + if (UndefElts[i]) { ShuffleMask.push_back(SM_SentinelUndef); continue; } @@ -201,7 +200,7 @@ void DecodeVPERMIL2PMask(const Constant *C, unsigned M2Z, unsigned ElSize, // Bits[3] - Match Bit. // Bits[2:1] - (Per Lane) PD Shuffle Mask. // Bits[2:0] - (Per Lane) PS Shuffle Mask. - uint64_t Selector = cast<ConstantInt>(COp)->getZExtValue(); + uint64_t Selector = RawMask[i]; unsigned MatchBit = (Selector >> 3) & 0x1; // M2Z[0:1] MatchBit @@ -215,51 +214,34 @@ void DecodeVPERMIL2PMask(const Constant *C, unsigned M2Z, unsigned ElSize, continue; } - int Index = i & ~(NumElementsPerLane - 1); + int Index = i & ~(NumEltsPerLane - 1); if (ElSize == 64) Index += (Selector >> 1) & 0x1; else Index += Selector & 0x3; int Src = (Selector >> 2) & 0x1; - Index += Src * NumElements; + Index += Src * NumElts; ShuffleMask.push_back(Index); } - - // TODO: Handle funny-looking vectors too. } void DecodeVPPERMMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask) { - Type *MaskTy = C->getType(); - assert(MaskTy->getPrimitiveSizeInBits() == 128); - - // Only support vector types. - if (!MaskTy->isVectorTy()) - return; - - // Make sure its an integer type. - Type *VecEltTy = MaskTy->getVectorElementType(); - if (!VecEltTy->isIntegerTy()) - return; + assert(C->getType()->getPrimitiveSizeInBits() == 128 && + "Unexpected vector size."); - // The shuffle mask requires a byte vector - decode cases with - // wider elements as well. - unsigned BitWidth = cast<IntegerType>(VecEltTy)->getBitWidth(); - if ((BitWidth % 8) != 0) + // The shuffle mask requires a byte vector. + SmallBitVector UndefElts; + SmallVector<uint64_t, 32> RawMask; + if (!extractConstantMask(C, 8, UndefElts, RawMask)) return; - int NumElts = MaskTy->getVectorNumElements(); - int Scale = BitWidth / 8; - int NumBytes = NumElts * Scale; - ShuffleMask.reserve(NumBytes); + unsigned NumElts = RawMask.size(); + assert(NumElts == 16 && "Unexpected number of vector elements."); - for (int i = 0; i != NumElts; ++i) { - Constant *COp = C->getAggregateElement(i); - if (!COp) { - ShuffleMask.clear(); - return; - } else if (isa<UndefValue>(COp)) { - ShuffleMask.append(Scale, SM_SentinelUndef); + for (unsigned i = 0; i != NumElts; ++i) { + if (UndefElts[i]) { + ShuffleMask.push_back(SM_SentinelUndef); continue; } @@ -275,82 +257,77 @@ void DecodeVPPERMMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask) { // 4 - 00h (zero - fill). // 5 - FFh (ones - fill). // 6 - Most significant bit of source byte replicated in all bit positions. - // 7 - Invert most significant bit of source byte and replicate in all bit positions. 
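Circling back to the VPERMIL2P decode just above: the four selector bits encode the within-lane element, the source operand, and the match bit that is later tested against the M2Z immediate. The sketch below restates that bit slicing with made-up names (it is not the LLVM routine); how the match bit and M2Z combine to zero an element is left out, since the real decoder handles that in the branch that follows.

#include <cstdint>

// Sketch of one VPERMIL2PS selector, mirroring the bit layout documented
// above: Bits[3] match bit, Bits[2] source select, Bits[1:0] element within
// the 128-bit lane. The source choice is folded into a single index, so
// values in [0, NumElts) address the first source and [NumElts, 2*NumElts)
// address the second, as in the patch.
struct Vpermil2Elt {
  unsigned MatchBit; // Bits[3], later compared against the M2Z immediate
  int Index;         // combined source + element index
};

static Vpermil2Elt decodeVPERMIL2PSSel(uint64_t Selector, unsigned EltNo,
                                       unsigned NumElts) {
  const unsigned NumEltsPerLane = 128 / 32; // PS: 4 elements per 128-bit lane
  Vpermil2Elt R;
  R.MatchBit = (Selector >> 3) & 0x1;
  R.Index = EltNo & ~(NumEltsPerLane - 1);      // base of this lane
  R.Index += Selector & 0x3;                    // Bits[1:0]: element in lane
  R.Index += ((Selector >> 2) & 0x1) * NumElts; // Bits[2]: source select
  return R;
}

// e.g. for element 5 of a v8f32 operation, selector 0b0110 picks element 2 of
// the upper lane of the second source: Index = 4 + 2 + 8 = 14.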
- APInt MaskElt = cast<ConstantInt>(COp)->getValue(); - for (int j = 0; j != Scale; ++j) { - APInt Index = MaskElt.getLoBits(5); - APInt PermuteOp = MaskElt.lshr(5).getLoBits(3); - MaskElt = MaskElt.lshr(8); - - if (PermuteOp == 4) { - ShuffleMask.push_back(SM_SentinelZero); - continue; - } - if (PermuteOp != 0) { - ShuffleMask.clear(); - return; - } - ShuffleMask.push_back((int)Index.getZExtValue()); + // 7 - Invert most significant bit of source byte and replicate in all bit + // positions. + uint64_t Element = RawMask[i]; + uint64_t Index = Element & 0x1F; + uint64_t PermuteOp = (Element >> 5) & 0x7; + + if (PermuteOp == 4) { + ShuffleMask.push_back(SM_SentinelZero); + continue; + } + if (PermuteOp != 0) { + ShuffleMask.clear(); + return; } + ShuffleMask.push_back((int)Index); } - - assert(NumBytes == (int)ShuffleMask.size() && "Unexpected shuffle mask size"); } -void DecodeVPERMVMask(const Constant *C, MVT VT, +void DecodeVPERMVMask(const Constant *C, unsigned ElSize, SmallVectorImpl<int> &ShuffleMask) { Type *MaskTy = C->getType(); - if (MaskTy->isVectorTy()) { - unsigned NumElements = MaskTy->getVectorNumElements(); - if (NumElements == VT.getVectorNumElements()) { - unsigned EltMaskSize = Log2_64(NumElements); - for (unsigned i = 0; i < NumElements; ++i) { - Constant *COp = C->getAggregateElement(i); - if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp))) { - ShuffleMask.clear(); - return; - } - if (isa<UndefValue>(COp)) - ShuffleMask.push_back(SM_SentinelUndef); - else { - APInt Element = cast<ConstantInt>(COp)->getValue(); - Element = Element.getLoBits(EltMaskSize); - ShuffleMask.push_back(Element.getZExtValue()); - } - } - } + unsigned MaskTySize = MaskTy->getPrimitiveSizeInBits(); + (void)MaskTySize; + assert((MaskTySize == 128 || MaskTySize == 256 || MaskTySize == 512) && + "Unexpected vector size."); + assert((ElSize == 8 || ElSize == 16 || ElSize == 32 || ElSize == 64) && + "Unexpected vector element size."); + + // The shuffle mask requires elements the same size as the target. 
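The VPPERM decode above reduces to a small amount of bit slicing per control byte: the low five bits address one of the 32 source bytes (two 16-byte operands), and the top three bits select the permute operation, of which only the plain copy and the zero fill can be represented in a shuffle mask. A standalone illustration follows; the function name and sentinel return values are assumptions of the sketch, not the LLVM API.

#include <cstdint>

// Sketch of one VPPERM control byte, matching the decode above.
// Returns the selected source byte index, -1 for a zeroed byte, or -2 when
// the permute op modifies the data (ones fill, MSB replication, and so on)
// and therefore cannot be modeled as a plain shuffle.
static int decodeVPPERMByte(uint8_t Ctrl) {
  unsigned Index = Ctrl & 0x1F;           // bits[4:0]: source byte (0-31)
  unsigned PermuteOp = (Ctrl >> 5) & 0x7; // bits[7:5]: operation
  if (PermuteOp == 4)
    return -1;                            // 00h fill -> zero sentinel
  if (PermuteOp != 0)
    return -2;                            // data-modifying op: give up
  return (int)Index;                      // plain byte copy
}

// e.g. 0x93 has PermuteOp 4, so that result byte is zeroed; 0x12 copies
// source byte 18, i.e. byte 2 of the second source operand.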
+ SmallBitVector UndefElts; + SmallVector<uint64_t, 8> RawMask; + if (!extractConstantMask(C, ElSize, UndefElts, RawMask)) return; + + unsigned NumElts = RawMask.size(); + + for (unsigned i = 0; i != NumElts; ++i) { + if (UndefElts[i]) { + ShuffleMask.push_back(SM_SentinelUndef); + continue; + } + int Index = RawMask[i] & (NumElts - 1); + ShuffleMask.push_back(Index); } - // Scalar value; just broadcast it - if (!isa<ConstantInt>(C)) - return; - uint64_t Element = cast<ConstantInt>(C)->getZExtValue(); - int NumElements = VT.getVectorNumElements(); - Element &= (1 << NumElements) - 1; - for (int i = 0; i < NumElements; ++i) - ShuffleMask.push_back(Element); } -void DecodeVPERMV3Mask(const Constant *C, MVT VT, +void DecodeVPERMV3Mask(const Constant *C, unsigned ElSize, SmallVectorImpl<int> &ShuffleMask) { Type *MaskTy = C->getType(); - unsigned NumElements = MaskTy->getVectorNumElements(); - if (NumElements == VT.getVectorNumElements()) { - unsigned EltMaskSize = Log2_64(NumElements * 2); - for (unsigned i = 0; i < NumElements; ++i) { - Constant *COp = C->getAggregateElement(i); - if (!COp) { - ShuffleMask.clear(); - return; - } - if (isa<UndefValue>(COp)) - ShuffleMask.push_back(SM_SentinelUndef); - else { - APInt Element = cast<ConstantInt>(COp)->getValue(); - Element = Element.getLoBits(EltMaskSize); - ShuffleMask.push_back(Element.getZExtValue()); - } + unsigned MaskTySize = MaskTy->getPrimitiveSizeInBits(); + (void)MaskTySize; + assert((MaskTySize == 128 || MaskTySize == 256 || MaskTySize == 512) && + "Unexpected vector size."); + assert((ElSize == 8 || ElSize == 16 || ElSize == 32 || ElSize == 64) && + "Unexpected vector element size."); + + // The shuffle mask requires elements the same size as the target. + SmallBitVector UndefElts; + SmallVector<uint64_t, 8> RawMask; + if (!extractConstantMask(C, ElSize, UndefElts, RawMask)) + return; + + unsigned NumElts = RawMask.size(); + + for (unsigned i = 0; i != NumElts; ++i) { + if (UndefElts[i]) { + ShuffleMask.push_back(SM_SentinelUndef); + continue; } + int Index = RawMask[i] & (NumElts*2 - 1); + ShuffleMask.push_back(Index); } } } // llvm namespace diff --git a/contrib/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.h b/contrib/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.h index d2565b8..b703cbb 100644 --- a/contrib/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.h +++ b/contrib/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.h @@ -40,11 +40,11 @@ void DecodeVPERMIL2PMask(const Constant *C, unsigned MatchImm, unsigned ElSize, void DecodeVPPERMMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask); /// Decode a VPERM W/D/Q/PS/PD mask from an IR-level vector constant. -void DecodeVPERMVMask(const Constant *C, MVT VT, +void DecodeVPERMVMask(const Constant *C, unsigned ElSize, SmallVectorImpl<int> &ShuffleMask); /// Decode a VPERMT2 W/D/Q/PS/PD mask from an IR-level vector constant. -void DecodeVPERMV3Mask(const Constant *C, MVT VT, +void DecodeVPERMV3Mask(const Constant *C, unsigned ElSize, SmallVectorImpl<int> &ShuffleMask); } // llvm namespace diff --git a/contrib/llvm/lib/Target/X86/X86Subtarget.cpp b/contrib/llvm/lib/Target/X86/X86Subtarget.cpp index 8f77682..586bb7b 100644 --- a/contrib/llvm/lib/Target/X86/X86Subtarget.cpp +++ b/contrib/llvm/lib/Target/X86/X86Subtarget.cpp @@ -92,6 +92,10 @@ unsigned char X86Subtarget::classifyGlobalReference(const GlobalValue *GV, if (TM.getCodeModel() == CodeModel::Large) return X86II::MO_NO_FLAG; + // Absolute symbols can be referenced directly. 
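One more note on the VPERMV/VPERMV3 rewrites above before the subtarget changes: with the raw ElSize-bit selectors already extracted, the only arithmetic left is masking each selector down to the valid index range, NumElts - 1 for the one-source form and 2 * NumElts - 1 for the two-source (VPERMT2) form, whose index space spans both inputs. A minimal sketch under that assumption, with illustrative names only:

#include <cstdint>
#include <vector>

// Illustrative only: the index masking used for VPERMV (one source) versus
// VPERMV3/VPERMT2 (two sources, hence twice the index space).
static std::vector<int> decodeVPERMV(const std::vector<uint64_t> &RawMask,
                                     bool TwoSources) {
  std::vector<int> ShuffleMask;
  unsigned NumElts = RawMask.size();
  unsigned IndexMask = (TwoSources ? 2 * NumElts : NumElts) - 1;
  for (uint64_t Sel : RawMask)
    ShuffleMask.push_back((int)(Sel & IndexMask));
  return ShuffleMask;
}

// e.g. for a v8i64 VPERMT2Q the selectors are masked with 15, so values 8-15
// address elements of the second source vector.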
+ if (GV && GV->isAbsoluteSymbolRef()) + return X86II::MO_NO_FLAG; + if (TM.shouldAssumeDSOLocal(M, GV)) return classifyLocalReference(GV); @@ -275,6 +279,7 @@ void X86Subtarget::initializeEnvironment() { HasMWAITX = false; HasMPX = false; IsBTMemSlow = false; + IsPMULLDSlow = false; IsSHLDSlow = false; IsUAMem16Slow = false; IsUAMem32Slow = false; @@ -282,6 +287,9 @@ void X86Subtarget::initializeEnvironment() { HasCmpxchg16b = false; UseLeaForSP = false; HasFastPartialYMMWrite = false; + HasFastScalarFSQRT = false; + HasFastVectorFSQRT = false; + HasFastLZCNT = false; HasSlowDivide32 = false; HasSlowDivide64 = false; PadShortFunctions = false; @@ -328,6 +336,26 @@ X86Subtarget::X86Subtarget(const Triple &TT, StringRef CPU, StringRef FS, setPICStyle(PICStyles::GOT); } +const CallLowering *X86Subtarget::getCallLowering() const { + assert(GISel && "Access to GlobalISel APIs not set"); + return GISel->getCallLowering(); +} + +const InstructionSelector *X86Subtarget::getInstructionSelector() const { + assert(GISel && "Access to GlobalISel APIs not set"); + return GISel->getInstructionSelector(); +} + +const LegalizerInfo *X86Subtarget::getLegalizerInfo() const { + assert(GISel && "Access to GlobalISel APIs not set"); + return GISel->getLegalizerInfo(); +} + +const RegisterBankInfo *X86Subtarget::getRegBankInfo() const { + assert(GISel && "Access to GlobalISel APIs not set"); + return GISel->getRegBankInfo(); +} + bool X86Subtarget::enableEarlyIfConversion() const { return hasCMov() && X86EarlyIfConv; } diff --git a/contrib/llvm/lib/Target/X86/X86Subtarget.h b/contrib/llvm/lib/Target/X86/X86Subtarget.h index a274b79..d80dc4a 100644 --- a/contrib/llvm/lib/Target/X86/X86Subtarget.h +++ b/contrib/llvm/lib/Target/X86/X86Subtarget.h @@ -19,6 +19,7 @@ #include "X86InstrInfo.h" #include "X86SelectionDAGInfo.h" #include "llvm/ADT/Triple.h" +#include "llvm/CodeGen/GlobalISel/GISelAccessor.h" #include "llvm/IR/CallingConv.h" #include "llvm/Target/TargetSubtargetInfo.h" #include <string> @@ -177,6 +178,10 @@ protected: /// True if SHLD instructions are slow. bool IsSHLDSlow; + /// True if the PMULLD instruction is slow compared to PMULLW/PMULHW and + // PMULUDQ. + bool IsPMULLDSlow; + /// True if unaligned memory accesses of 16-bytes are slow. bool IsUAMem16Slow; @@ -199,14 +204,25 @@ protected: /// of a YMM register without clearing the upper part. bool HasFastPartialYMMWrite; + /// True if hardware SQRTSS instruction is at least as fast (latency) as + /// RSQRTSS followed by a Newton-Raphson iteration. + bool HasFastScalarFSQRT; + + /// True if hardware SQRTPS/VSQRTPS instructions are at least as fast + /// (throughput) as RSQRTPS/VRSQRTPS followed by a Newton-Raphson iteration. + bool HasFastVectorFSQRT; + /// True if 8-bit divisions are significantly faster than /// 32-bit divisions and should be used when possible. bool HasSlowDivide32; - /// True if 16-bit divides are significantly faster than + /// True if 32-bit divides are significantly faster than /// 64-bit divisions and should be used when possible. bool HasSlowDivide64; + /// True if LZCNT instruction is fast. + bool HasFastLZCNT; + /// True if the short functions should be padded to prevent /// a stall when returning too early. bool PadShortFunctions; @@ -287,6 +303,10 @@ protected: /// Instruction itineraries for scheduling InstrItineraryData InstrItins; + /// Gather the accessor points to GlobalISel-related APIs. + /// This is used to avoid ifndefs spreading around while GISel is + /// an optional library. 
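The GISel accessor indirection described above is a small pImpl-style seam: the subtarget owns an abstract accessor whose getters answer "not implemented" by default, and a build configured with GlobalISel installs a concrete accessor that owns the real objects. The sketch below shows the shape of that pattern with invented stand-in names; it is not the LLVM class itself.

#include <cassert>
#include <memory>

// Hypothetical stand-in for one GlobalISel interface.
class CallLoweringLike {};

// Default accessor: every query returns null, so no #ifdefs leak to callers.
struct OptionalISelAccessor {
  virtual ~OptionalISelAccessor() = default;
  virtual const CallLoweringLike *getCallLowering() const { return nullptr; }
};

// Concrete accessor, only constructed when the optional library is built in.
struct ActualISelAccessor : OptionalISelAccessor {
  std::unique_ptr<CallLoweringLike> CL;
  explicit ActualISelAccessor(CallLoweringLike *CL) : CL(CL) {}
  const CallLoweringLike *getCallLowering() const override { return CL.get(); }
};

struct SubtargetLike {
  std::unique_ptr<OptionalISelAccessor> ISel;
  // Mirrors the patch: callers go through one virtual hop instead of #ifdefs.
  const CallLoweringLike *getCallLowering() const {
    assert(ISel && "accessor not installed");
    return ISel->getCallLowering();
  }
};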
+ std::unique_ptr<GISelAccessor> GISel; private: /// Override the stack alignment. @@ -315,6 +335,9 @@ public: X86Subtarget(const Triple &TT, StringRef CPU, StringRef FS, const X86TargetMachine &TM, unsigned StackAlignOverride); + /// This object will take onwership of \p GISelAccessor. + void setGISelAccessor(GISelAccessor &GISel) { this->GISel.reset(&GISel); } + const X86TargetLowering *getTargetLowering() const override { return &TLInfo; } @@ -342,6 +365,11 @@ public: /// subtarget options. Definition of function is auto generated by tblgen. void ParseSubtargetFeatures(StringRef CPU, StringRef FS); + /// Methods used by Global ISel + const CallLowering *getCallLowering() const override; + const InstructionSelector *getInstructionSelector() const override; + const LegalizerInfo *getLegalizerInfo() const override; + const RegisterBankInfo *getRegBankInfo() const override; private: /// Initialize the full set of dependencies so we can use an initializer /// list for X86Subtarget. @@ -428,12 +456,16 @@ public: bool hasMWAITX() const { return HasMWAITX; } bool isBTMemSlow() const { return IsBTMemSlow; } bool isSHLDSlow() const { return IsSHLDSlow; } + bool isPMULLDSlow() const { return IsPMULLDSlow; } bool isUnalignedMem16Slow() const { return IsUAMem16Slow; } bool isUnalignedMem32Slow() const { return IsUAMem32Slow; } bool hasSSEUnalignedMem() const { return HasSSEUnalignedMem; } bool hasCmpxchg16b() const { return HasCmpxchg16b; } bool useLeaForSP() const { return UseLeaForSP; } bool hasFastPartialYMMWrite() const { return HasFastPartialYMMWrite; } + bool hasFastScalarFSQRT() const { return HasFastScalarFSQRT; } + bool hasFastVectorFSQRT() const { return HasFastVectorFSQRT; } + bool hasFastLZCNT() const { return HasFastLZCNT; } bool hasSlowDivide32() const { return HasSlowDivide32; } bool hasSlowDivide64() const { return HasSlowDivide64; } bool padShortFunctions() const { return PadShortFunctions; } @@ -450,6 +482,8 @@ public: bool hasPKU() const { return HasPKU; } bool hasMPX() const { return HasMPX; } + virtual bool isXRaySupported() const override { return is64Bit(); } + bool isAtom() const { return X86ProcFamily == IntelAtom; } bool isSLM() const { return X86ProcFamily == IntelSLM; } bool useSoftFloat() const { return UseSoftFloat; } @@ -465,7 +499,7 @@ public: bool isTargetFreeBSD() const { return TargetTriple.isOSFreeBSD(); } bool isTargetDragonFly() const { return TargetTriple.isOSDragonFly(); } bool isTargetSolaris() const { return TargetTriple.isOSSolaris(); } - bool isTargetPS4() const { return TargetTriple.isPS4(); } + bool isTargetPS4() const { return TargetTriple.isPS4CPU(); } bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); } bool isTargetCOFF() const { return TargetTriple.isOSBinFormatCOFF(); } diff --git a/contrib/llvm/lib/Target/X86/X86TargetMachine.cpp b/contrib/llvm/lib/Target/X86/X86TargetMachine.cpp index 50c9c25..aa5cfc6 100644 --- a/contrib/llvm/lib/Target/X86/X86TargetMachine.cpp +++ b/contrib/llvm/lib/Target/X86/X86TargetMachine.cpp @@ -13,8 +13,12 @@ #include "X86TargetMachine.h" #include "X86.h" +#include "X86CallLowering.h" #include "X86TargetObjectFile.h" #include "X86TargetTransformInfo.h" +#include "llvm/CodeGen/GlobalISel/GISelAccessor.h" +#include "llvm/CodeGen/GlobalISel/IRTranslator.h" +#include "llvm/CodeGen/MachineScheduler.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/Function.h" @@ -35,12 +39,14 @@ void initializeWinEHStatePassPass(PassRegistry &); extern "C" void 
LLVMInitializeX86Target() { // Register the target. - RegisterTargetMachine<X86TargetMachine> X(TheX86_32Target); - RegisterTargetMachine<X86TargetMachine> Y(TheX86_64Target); + RegisterTargetMachine<X86TargetMachine> X(getTheX86_32Target()); + RegisterTargetMachine<X86TargetMachine> Y(getTheX86_64Target()); PassRegistry &PR = *PassRegistry::getPassRegistry(); + initializeGlobalISel(PR); initializeWinEHStatePassPass(PR); initializeFixupBWInstPassPass(PR); + initializeEvexToVexInstPassPass(PR); } static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) { @@ -50,8 +56,12 @@ static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) { return make_unique<TargetLoweringObjectFileMachO>(); } + if (TT.isOSFreeBSD()) + return make_unique<X86FreeBSDTargetObjectFile>(); if (TT.isOSLinux() || TT.isOSNaCl()) return make_unique<X86LinuxNaClTargetObjectFile>(); + if (TT.isOSFuchsia()) + return make_unique<X86FuchsiaTargetObjectFile>(); if (TT.isOSBinFormatELF()) return make_unique<X86ELFTargetObjectFile>(); if (TT.isKnownWindowsMSVCEnvironment() || TT.isWindowsCoreCLREnvironment()) @@ -151,32 +161,47 @@ X86TargetMachine::X86TargetMachine(const Target &T, const Triple &TT, CodeModel::Model CM, CodeGenOpt::Level OL) : LLVMTargetMachine(T, computeDataLayout(TT), TT, CPU, FS, Options, getEffectiveRelocModel(TT, RM), CM, OL), - TLOF(createTLOF(getTargetTriple())), - Subtarget(TT, CPU, FS, *this, Options.StackAlignmentOverride) { + TLOF(createTLOF(getTargetTriple())) { // Windows stack unwinder gets confused when execution flow "falls through" // after a call to 'noreturn' function. // To prevent that, we emit a trap for 'unreachable' IR instructions. // (which on X86, happens to be the 'ud2' instruction) // On PS4, the "return address" of a 'noreturn' call must still be within // the calling function, and TrapUnreachable is an easy way to get that. - if (Subtarget.isTargetWin64() || Subtarget.isTargetPS4()) + // The check here for 64-bit windows is a bit icky, but as we're unlikely + // to ever want to mix 32 and 64-bit windows code in a single module + // this should be fine. + if ((TT.isOSWindows() && TT.getArch() == Triple::x86_64) || TT.isPS4()) this->Options.TrapUnreachable = true; - // By default (and when -ffast-math is on), enable estimate codegen for - // everything except scalar division. By default, use 1 refinement step for - // all operations. Defaults may be overridden by using command-line options. - // Scalar division estimates are disabled because they break too much - // real-world code. These defaults match GCC behavior. - this->Options.Reciprocals.setDefaults("sqrtf", true, 1); - this->Options.Reciprocals.setDefaults("divf", false, 1); - this->Options.Reciprocals.setDefaults("vec-sqrtf", true, 1); - this->Options.Reciprocals.setDefaults("vec-divf", true, 1); - initAsmInfo(); } X86TargetMachine::~X86TargetMachine() {} +#ifdef LLVM_BUILD_GLOBAL_ISEL +namespace { +struct X86GISelActualAccessor : public GISelAccessor { + std::unique_ptr<CallLowering> CL; + X86GISelActualAccessor(CallLowering* CL): CL(CL) {} + const CallLowering *getCallLowering() const override { + return CL.get(); + } + const InstructionSelector *getInstructionSelector() const override { + //TODO: Implement + return nullptr; + } + const LegalizerInfo *getLegalizerInfo() const override { + //TODO: Implement + return nullptr; + } + const RegisterBankInfo *getRegBankInfo() const override { + //TODO: Implement + return nullptr; + } +}; +} // End anonymous namespace. 
+#endif const X86Subtarget * X86TargetMachine::getSubtargetImpl(const Function &F) const { Attribute CPUAttr = F.getFnAttribute("target-cpu"); @@ -216,6 +241,13 @@ X86TargetMachine::getSubtargetImpl(const Function &F) const { resetTargetOptions(F); I = llvm::make_unique<X86Subtarget>(TargetTriple, CPU, FS, *this, Options.StackAlignmentOverride); +#ifndef LLVM_BUILD_GLOBAL_ISEL + GISelAccessor *GISel = new GISelAccessor(); +#else + X86GISelActualAccessor *GISel = new X86GISelActualAccessor( + new X86CallLowering(*I->getTargetLowering())); +#endif + I->setGISelAccessor(*GISel); } return I.get(); } @@ -254,9 +286,22 @@ public: return getTM<X86TargetMachine>(); } + ScheduleDAGInstrs * + createMachineScheduler(MachineSchedContext *C) const override { + ScheduleDAGMILive *DAG = createGenericSchedLive(C); + DAG->addMutation(createMacroFusionDAGMutation(DAG->TII)); + return DAG; + } + void addIRPasses() override; bool addInstSelector() override; - bool addILPOpts() override; +#ifdef LLVM_BUILD_GLOBAL_ISEL + bool addIRTranslator() override; + bool addLegalizeMachineIR() override; + bool addRegBankSelect() override; + bool addGlobalInstructionSelect() override; +#endif +bool addILPOpts() override; bool addPreISel() override; void addPreRegAlloc() override; void addPostRegAlloc() override; @@ -273,6 +318,9 @@ void X86PassConfig::addIRPasses() { addPass(createAtomicExpandPass(&getX86TargetMachine())); TargetPassConfig::addIRPasses(); + + if (TM->getOptLevel() != CodeGenOpt::None) + addPass(createInterleavedAccessPass(TM)); } bool X86PassConfig::addInstSelector() { @@ -288,6 +336,28 @@ bool X86PassConfig::addInstSelector() { return false; } +#ifdef LLVM_BUILD_GLOBAL_ISEL +bool X86PassConfig::addIRTranslator() { + addPass(new IRTranslator()); + return false; +} + +bool X86PassConfig::addLegalizeMachineIR() { + //TODO: Implement + return false; +} + +bool X86PassConfig::addRegBankSelect() { + //TODO: Implement + return false; +} + +bool X86PassConfig::addGlobalInstructionSelect() { + //TODO: Implement + return false; +} +#endif + bool X86PassConfig::addILPOpts() { addPass(&EarlyIfConverterID); if (EnableMachineCombinerPass) @@ -321,7 +391,7 @@ void X86PassConfig::addPreSched2() { addPass(createX86ExpandPseudoPass()); } void X86PassConfig::addPreEmitPass() { if (getOptLevel() != CodeGenOpt::None) - addPass(createExecutionDependencyFixPass(&X86::VR128RegClass)); + addPass(createExecutionDependencyFixPass(&X86::VR128XRegClass)); if (UseVZeroUpper) addPass(createX86IssueVZeroUpperPass()); @@ -330,5 +400,6 @@ void X86PassConfig::addPreEmitPass() { addPass(createX86FixupBWInsts()); addPass(createX86PadShortFunctions()); addPass(createX86FixupLEAs()); + addPass(createX86EvexToVexInsts()); } } diff --git a/contrib/llvm/lib/Target/X86/X86TargetMachine.h b/contrib/llvm/lib/Target/X86/X86TargetMachine.h index 4734a44..d756d07 100644 --- a/contrib/llvm/lib/Target/X86/X86TargetMachine.h +++ b/contrib/llvm/lib/Target/X86/X86TargetMachine.h @@ -24,8 +24,6 @@ class StringRef; class X86TargetMachine final : public LLVMTargetMachine { std::unique_ptr<TargetLoweringObjectFile> TLOF; - X86Subtarget Subtarget; - mutable StringMap<std::unique_ptr<X86Subtarget>> SubtargetMap; public: diff --git a/contrib/llvm/lib/Target/X86/X86TargetObjectFile.cpp b/contrib/llvm/lib/Target/X86/X86TargetObjectFile.cpp index d664cff..7f70829 100644 --- a/contrib/llvm/lib/Target/X86/X86TargetObjectFile.cpp +++ b/contrib/llvm/lib/Target/X86/X86TargetObjectFile.cpp @@ -24,14 +24,13 @@ using namespace llvm; using namespace dwarf; const MCExpr 
*X86_64MachoTargetObjectFile::getTTypeGlobalReference( - const GlobalValue *GV, unsigned Encoding, Mangler &Mang, - const TargetMachine &TM, MachineModuleInfo *MMI, - MCStreamer &Streamer) const { + const GlobalValue *GV, unsigned Encoding, const TargetMachine &TM, + MachineModuleInfo *MMI, MCStreamer &Streamer) const { // On Darwin/X86-64, we can reference dwarf symbols with foo@GOTPCREL+4, which // is an indirect pc-relative reference. if ((Encoding & DW_EH_PE_indirect) && (Encoding & DW_EH_PE_pcrel)) { - const MCSymbol *Sym = TM.getSymbol(GV, Mang); + const MCSymbol *Sym = TM.getSymbol(GV); const MCExpr *Res = MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_GOTPCREL, getContext()); const MCExpr *Four = MCConstantExpr::create(4, getContext()); @@ -39,13 +38,13 @@ const MCExpr *X86_64MachoTargetObjectFile::getTTypeGlobalReference( } return TargetLoweringObjectFileMachO::getTTypeGlobalReference( - GV, Encoding, Mang, TM, MMI, Streamer); + GV, Encoding, TM, MMI, Streamer); } MCSymbol *X86_64MachoTargetObjectFile::getCFIPersonalitySymbol( - const GlobalValue *GV, Mangler &Mang, const TargetMachine &TM, + const GlobalValue *GV, const TargetMachine &TM, MachineModuleInfo *MMI) const { - return TM.getSymbol(GV, Mang); + return TM.getSymbol(GV); } const MCExpr *X86_64MachoTargetObjectFile::getIndirectSymViaGOTPCRel( @@ -67,6 +66,20 @@ const MCExpr *X86ELFTargetObjectFile::getDebugThreadLocalSymbol( } void +X86FreeBSDTargetObjectFile::Initialize(MCContext &Ctx, + const TargetMachine &TM) { + TargetLoweringObjectFileELF::Initialize(Ctx, TM); + InitializeELF(TM.Options.UseInitArray); +} + +void +X86FuchsiaTargetObjectFile::Initialize(MCContext &Ctx, + const TargetMachine &TM) { + TargetLoweringObjectFileELF::Initialize(Ctx, TM); + InitializeELF(TM.Options.UseInitArray); +} + +void X86LinuxNaClTargetObjectFile::Initialize(MCContext &Ctx, const TargetMachine &TM) { TargetLoweringObjectFileELF::Initialize(Ctx, TM); @@ -74,7 +87,7 @@ X86LinuxNaClTargetObjectFile::Initialize(MCContext &Ctx, } const MCExpr *X86WindowsTargetObjectFile::lowerRelativeReference( - const GlobalValue *LHS, const GlobalValue *RHS, Mangler &Mang, + const GlobalValue *LHS, const GlobalValue *RHS, const TargetMachine &TM) const { // Our symbols should exist in address space zero, cowardly no-op if // otherwise. @@ -95,8 +108,9 @@ const MCExpr *X86WindowsTargetObjectFile::lowerRelativeReference( cast<GlobalVariable>(RHS)->hasInitializer() || RHS->hasSection()) return nullptr; - return MCSymbolRefExpr::create( - TM.getSymbol(LHS, Mang), MCSymbolRefExpr::VK_COFF_IMGREL32, getContext()); + return MCSymbolRefExpr::create(TM.getSymbol(LHS), + MCSymbolRefExpr::VK_COFF_IMGREL32, + getContext()); } static std::string APIntToHexString(const APInt &AI) { diff --git a/contrib/llvm/lib/Target/X86/X86TargetObjectFile.h b/contrib/llvm/lib/Target/X86/X86TargetObjectFile.h index 2e703f1..39d2e84 100644 --- a/contrib/llvm/lib/Target/X86/X86TargetObjectFile.h +++ b/contrib/llvm/lib/Target/X86/X86TargetObjectFile.h @@ -19,15 +19,15 @@ namespace llvm { /// x86-64. 
class X86_64MachoTargetObjectFile : public TargetLoweringObjectFileMachO { public: - const MCExpr * - getTTypeGlobalReference(const GlobalValue *GV, unsigned Encoding, - Mangler &Mang, const TargetMachine &TM, - MachineModuleInfo *MMI, - MCStreamer &Streamer) const override; + const MCExpr *getTTypeGlobalReference(const GlobalValue *GV, + unsigned Encoding, + const TargetMachine &TM, + MachineModuleInfo *MMI, + MCStreamer &Streamer) const override; // getCFIPersonalitySymbol - The symbol that gets passed to // .cfi_personality. - MCSymbol *getCFIPersonalitySymbol(const GlobalValue *GV, Mangler &Mang, + MCSymbol *getCFIPersonalitySymbol(const GlobalValue *GV, const TargetMachine &TM, MachineModuleInfo *MMI) const override; @@ -49,6 +49,17 @@ namespace llvm { const MCExpr *getDebugThreadLocalSymbol(const MCSymbol *Sym) const override; }; + /// X86FreeBSDTargetObjectFile - This implementation is used for FreeBSD + /// on x86 and x86-64. + class X86FreeBSDTargetObjectFile : public X86ELFTargetObjectFile { + void Initialize(MCContext &Ctx, const TargetMachine &TM) override; + }; + + /// \brief This implementation is used for Fuchsia on x86-64. + class X86FuchsiaTargetObjectFile : public X86ELFTargetObjectFile { + void Initialize(MCContext &Ctx, const TargetMachine &TM) override; + }; + /// X86LinuxNaClTargetObjectFile - This implementation is used for linux and /// Native Client on x86 and x86-64. class X86LinuxNaClTargetObjectFile : public X86ELFTargetObjectFile { @@ -59,7 +70,6 @@ namespace llvm { class X86WindowsTargetObjectFile : public TargetLoweringObjectFileCOFF { const MCExpr * lowerRelativeReference(const GlobalValue *LHS, const GlobalValue *RHS, - Mangler &Mang, const TargetMachine &TM) const override; /// \brief Given a mergeable constant with the specified size and relocation diff --git a/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp index f44a8c6..5715d82 100644 --- a/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -13,6 +13,31 @@ /// independent and default TTI implementations handle the rest. /// //===----------------------------------------------------------------------===// +/// About Cost Model numbers used below it's necessary to say the following: +/// the numbers correspond to some "generic" X86 CPU instead of usage of +/// concrete CPU model. Usually the numbers correspond to CPU where the feature +/// apeared at the first time. For example, if we do Subtarget.hasSSE42() in +/// the lookups below the cost is based on Nehalem as that was the first CPU +/// to support that feature level and thus has most likely the worst case cost. +/// Some examples of other technologies/CPUs: +/// SSE 3 - Pentium4 / Athlon64 +/// SSE 4.1 - Penryn +/// SSE 4.2 - Nehalem +/// AVX - Sandy Bridge +/// AVX2 - Haswell +/// AVX-512 - Xeon Phi / Skylake +/// And some examples of instruction target dependent costs (latency) +/// divss sqrtss rsqrtss +/// AMD K7 11-16 19 3 +/// Piledriver 9-24 13-15 5 +/// Jaguar 14 16 2 +/// Pentium II,III 18 30 2 +/// Nehalem 7-14 7-18 3 +/// Haswell 10-13 11 5 +/// TODO: Develop and implement the target dependent cost model and +/// specialize cost numbers for different Cost Model Targets such as throughput, +/// code size, latency and uop count. 
+//===----------------------------------------------------------------------===// #include "X86TargetTransformInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" @@ -55,9 +80,12 @@ unsigned X86TTIImpl::getNumberOfRegisters(bool Vector) { unsigned X86TTIImpl::getRegisterBitWidth(bool Vector) { if (Vector) { - if (ST->hasAVX512()) return 512; - if (ST->hasAVX()) return 256; - if (ST->hasSSE1()) return 128; + if (ST->hasAVX512()) + return 512; + if (ST->hasAVX()) + return 256; + if (ST->hasSSE1()) + return 128; return 0; } @@ -86,15 +114,62 @@ unsigned X86TTIImpl::getMaxInterleaveFactor(unsigned VF) { } int X86TTIImpl::getArithmeticInstrCost( - unsigned Opcode, Type *Ty, TTI::OperandValueKind Op1Info, - TTI::OperandValueKind Op2Info, TTI::OperandValueProperties Opd1PropInfo, - TTI::OperandValueProperties Opd2PropInfo) { + unsigned Opcode, Type *Ty, + TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info, + TTI::OperandValueProperties Opd1PropInfo, + TTI::OperandValueProperties Opd2PropInfo, + ArrayRef<const Value *> Args) { // Legalize the type. std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); int ISD = TLI->InstructionOpcodeToISD(Opcode); assert(ISD && "Invalid opcode"); + static const CostTblEntry SLMCostTable[] = { + { ISD::MUL, MVT::v4i32, 11 }, // pmulld + { ISD::MUL, MVT::v8i16, 2 }, // pmullw + { ISD::MUL, MVT::v16i8, 14 }, // extend/pmullw/trunc sequence. + { ISD::FMUL, MVT::f64, 2 }, // mulsd + { ISD::FMUL, MVT::v2f64, 4 }, // mulpd + { ISD::FMUL, MVT::v4f32, 2 }, // mulps + { ISD::FDIV, MVT::f32, 17 }, // divss + { ISD::FDIV, MVT::v4f32, 39 }, // divps + { ISD::FDIV, MVT::f64, 32 }, // divsd + { ISD::FDIV, MVT::v2f64, 69 }, // divpd + { ISD::FADD, MVT::v2f64, 2 }, // addpd + { ISD::FSUB, MVT::v2f64, 2 }, // subpd + // v2i64/v4i64 mul is custom lowered as a series of long + // multiplies(3), shifts(3) and adds(2). + // slm muldq version throughput is 2 + { ISD::MUL, MVT::v2i64, 11 }, + }; + + if (ST->isSLM()) { + if (Args.size() == 2 && ISD == ISD::MUL && LT.second == MVT::v4i32) { + // Check if the operands can be shrinked into a smaller datatype. + bool Op1Signed = false; + unsigned Op1MinSize = BaseT::minRequiredElementSize(Args[0], Op1Signed); + bool Op2Signed = false; + unsigned Op2MinSize = BaseT::minRequiredElementSize(Args[1], Op2Signed); + + bool signedMode = Op1Signed | Op2Signed; + unsigned OpMinSize = std::max(Op1MinSize, Op2MinSize); + + if (OpMinSize <= 7) + return LT.first * 3; // pmullw/sext + if (!signedMode && OpMinSize <= 8) + return LT.first * 3; // pmullw/zext + if (OpMinSize <= 15) + return LT.first * 5; // pmullw/pmulhw/pshuf + if (!signedMode && OpMinSize <= 16) + return LT.first * 5; // pmullw/pmulhw/pshuf + } + if (const auto *Entry = CostTableLookup(SLMCostTable, ISD, + LT.second)) { + return LT.first * Entry->Cost; + } + } + if (ISD == ISD::SDIV && Op2Info == TargetTransformInfo::OK_UniformConstantValue && Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) { @@ -115,7 +190,39 @@ int X86TTIImpl::getArithmeticInstrCost( return Cost; } + static const CostTblEntry AVX512BWUniformConstCostTable[] = { + { ISD::SHL, MVT::v64i8, 2 }, // psllw + pand. + { ISD::SRL, MVT::v64i8, 2 }, // psrlw + pand. + { ISD::SRA, MVT::v64i8, 4 }, // psrlw, pand, pxor, psubb. 
+ + { ISD::SDIV, MVT::v32i16, 6 }, // vpmulhw sequence + { ISD::UDIV, MVT::v32i16, 6 }, // vpmulhuw sequence + }; + + if (Op2Info == TargetTransformInfo::OK_UniformConstantValue && + ST->hasBWI()) { + if (const auto *Entry = CostTableLookup(AVX512BWUniformConstCostTable, ISD, + LT.second)) + return LT.first * Entry->Cost; + } + + static const CostTblEntry AVX512UniformConstCostTable[] = { + { ISD::SDIV, MVT::v16i32, 15 }, // vpmuldq sequence + { ISD::UDIV, MVT::v16i32, 15 }, // vpmuludq sequence + }; + + if (Op2Info == TargetTransformInfo::OK_UniformConstantValue && + ST->hasAVX512()) { + if (const auto *Entry = CostTableLookup(AVX512UniformConstCostTable, ISD, + LT.second)) + return LT.first * Entry->Cost; + } + static const CostTblEntry AVX2UniformConstCostTable[] = { + { ISD::SHL, MVT::v32i8, 2 }, // psllw + pand. + { ISD::SRL, MVT::v32i8, 2 }, // psrlw + pand. + { ISD::SRA, MVT::v32i8, 4 }, // psrlw, pand, pxor, psubb. + { ISD::SRA, MVT::v4i64, 4 }, // 2 x psrad + shuffle. { ISD::SDIV, MVT::v16i16, 6 }, // vpmulhw sequence @@ -131,21 +238,136 @@ int X86TTIImpl::getArithmeticInstrCost( return LT.first * Entry->Cost; } + static const CostTblEntry SSE2UniformConstCostTable[] = { + { ISD::SHL, MVT::v16i8, 2 }, // psllw + pand. + { ISD::SRL, MVT::v16i8, 2 }, // psrlw + pand. + { ISD::SRA, MVT::v16i8, 4 }, // psrlw, pand, pxor, psubb. + + { ISD::SHL, MVT::v32i8, 4 }, // 2*(psllw + pand). + { ISD::SRL, MVT::v32i8, 4 }, // 2*(psrlw + pand). + { ISD::SRA, MVT::v32i8, 8 }, // 2*(psrlw, pand, pxor, psubb). + + { ISD::SDIV, MVT::v16i16, 12 }, // pmulhw sequence + { ISD::SDIV, MVT::v8i16, 6 }, // pmulhw sequence + { ISD::UDIV, MVT::v16i16, 12 }, // pmulhuw sequence + { ISD::UDIV, MVT::v8i16, 6 }, // pmulhuw sequence + { ISD::SDIV, MVT::v8i32, 38 }, // pmuludq sequence + { ISD::SDIV, MVT::v4i32, 19 }, // pmuludq sequence + { ISD::UDIV, MVT::v8i32, 30 }, // pmuludq sequence + { ISD::UDIV, MVT::v4i32, 15 }, // pmuludq sequence + }; + + if (Op2Info == TargetTransformInfo::OK_UniformConstantValue && + ST->hasSSE2()) { + // pmuldq sequence. + if (ISD == ISD::SDIV && LT.second == MVT::v8i32 && ST->hasAVX()) + return LT.first * 30; + if (ISD == ISD::SDIV && LT.second == MVT::v4i32 && ST->hasSSE41()) + return LT.first * 15; + + if (const auto *Entry = CostTableLookup(SSE2UniformConstCostTable, ISD, + LT.second)) + return LT.first * Entry->Cost; + } + + static const CostTblEntry AVX2UniformCostTable[] = { + // Uniform splats are cheaper for the following instructions. + { ISD::SHL, MVT::v16i16, 1 }, // psllw. + { ISD::SRL, MVT::v16i16, 1 }, // psrlw. + { ISD::SRA, MVT::v16i16, 1 }, // psraw. + }; + + if (ST->hasAVX2() && + ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) || + (Op2Info == TargetTransformInfo::OK_UniformValue))) { + if (const auto *Entry = + CostTableLookup(AVX2UniformCostTable, ISD, LT.second)) + return LT.first * Entry->Cost; + } + + static const CostTblEntry SSE2UniformCostTable[] = { + // Uniform splats are cheaper for the following instructions. + { ISD::SHL, MVT::v8i16, 1 }, // psllw. + { ISD::SHL, MVT::v4i32, 1 }, // pslld + { ISD::SHL, MVT::v2i64, 1 }, // psllq. + + { ISD::SRL, MVT::v8i16, 1 }, // psrlw. + { ISD::SRL, MVT::v4i32, 1 }, // psrld. + { ISD::SRL, MVT::v2i64, 1 }, // psrlq. + + { ISD::SRA, MVT::v8i16, 1 }, // psraw. + { ISD::SRA, MVT::v4i32, 1 }, // psrad. 
+ }; + + if (ST->hasSSE2() && + ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) || + (Op2Info == TargetTransformInfo::OK_UniformValue))) { + if (const auto *Entry = + CostTableLookup(SSE2UniformCostTable, ISD, LT.second)) + return LT.first * Entry->Cost; + } + + static const CostTblEntry AVX512DQCostTable[] = { + { ISD::MUL, MVT::v2i64, 1 }, + { ISD::MUL, MVT::v4i64, 1 }, + { ISD::MUL, MVT::v8i64, 1 } + }; + + // Look for AVX512DQ lowering tricks for custom cases. + if (ST->hasDQI()) + if (const auto *Entry = CostTableLookup(AVX512DQCostTable, ISD, LT.second)) + return LT.first * Entry->Cost; + + static const CostTblEntry AVX512BWCostTable[] = { + { ISD::SHL, MVT::v32i16, 1 }, // vpsllvw + { ISD::SRL, MVT::v32i16, 1 }, // vpsrlvw + { ISD::SRA, MVT::v32i16, 1 }, // vpsravw + + { ISD::SHL, MVT::v64i8, 11 }, // vpblendvb sequence. + { ISD::SRL, MVT::v64i8, 11 }, // vpblendvb sequence. + { ISD::SRA, MVT::v64i8, 24 }, // vpblendvb sequence. + + { ISD::MUL, MVT::v64i8, 11 }, // extend/pmullw/trunc sequence. + { ISD::MUL, MVT::v32i8, 4 }, // extend/pmullw/trunc sequence. + { ISD::MUL, MVT::v16i8, 4 }, // extend/pmullw/trunc sequence. + + // Vectorizing division is a bad idea. See the SSE2 table for more comments. + { ISD::SDIV, MVT::v64i8, 64*20 }, + { ISD::SDIV, MVT::v32i16, 32*20 }, + { ISD::UDIV, MVT::v64i8, 64*20 }, + { ISD::UDIV, MVT::v32i16, 32*20 } + }; + + // Look for AVX512BW lowering tricks for custom cases. + if (ST->hasBWI()) + if (const auto *Entry = CostTableLookup(AVX512BWCostTable, ISD, LT.second)) + return LT.first * Entry->Cost; + static const CostTblEntry AVX512CostTable[] = { - { ISD::SHL, MVT::v16i32, 1 }, - { ISD::SRL, MVT::v16i32, 1 }, - { ISD::SRA, MVT::v16i32, 1 }, - { ISD::SHL, MVT::v8i64, 1 }, - { ISD::SRL, MVT::v8i64, 1 }, - { ISD::SRA, MVT::v8i64, 1 }, + { ISD::SHL, MVT::v16i32, 1 }, + { ISD::SRL, MVT::v16i32, 1 }, + { ISD::SRA, MVT::v16i32, 1 }, + { ISD::SHL, MVT::v8i64, 1 }, + { ISD::SRL, MVT::v8i64, 1 }, + { ISD::SRA, MVT::v8i64, 1 }, + + { ISD::MUL, MVT::v32i8, 13 }, // extend/pmullw/trunc sequence. + { ISD::MUL, MVT::v16i8, 5 }, // extend/pmullw/trunc sequence. + { ISD::MUL, MVT::v16i32, 1 }, // pmulld + { ISD::MUL, MVT::v8i64, 8 }, // 3*pmuludq/3*shift/2*add + + // Vectorizing division is a bad idea. See the SSE2 table for more comments. + { ISD::SDIV, MVT::v16i32, 16*20 }, + { ISD::SDIV, MVT::v8i64, 8*20 }, + { ISD::UDIV, MVT::v16i32, 16*20 }, + { ISD::UDIV, MVT::v8i64, 8*20 } }; - if (ST->hasAVX512()) { + if (ST->hasAVX512()) if (const auto *Entry = CostTableLookup(AVX512CostTable, ISD, LT.second)) return LT.first * Entry->Cost; - } - static const CostTblEntry AVX2CostTable[] = { + static const CostTblEntry AVX2ShiftCostTable[] = { // Shifts on v4i64/v8i32 on AVX2 is legal even though we declare to // customize them to detect the cases where shift amount is a scalar one. { ISD::SHL, MVT::v4i32, 1 }, @@ -169,11 +391,11 @@ int X86TTIImpl::getArithmeticInstrCost( // is lowered into a vector multiply (vpmullw). return LT.first; - if (const auto *Entry = CostTableLookup(AVX2CostTable, ISD, LT.second)) + if (const auto *Entry = CostTableLookup(AVX2ShiftCostTable, ISD, LT.second)) return LT.first * Entry->Cost; } - static const CostTblEntry XOPCostTable[] = { + static const CostTblEntry XOPShiftCostTable[] = { // 128bit shifts take 1cy, but right shifts require negation beforehand. { ISD::SHL, MVT::v16i8, 1 }, { ISD::SRL, MVT::v16i8, 2 }, @@ -203,87 +425,31 @@ int X86TTIImpl::getArithmeticInstrCost( }; // Look for XOP lowering tricks. 
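All of the cost tables introduced in this hunk are consumed the same way: the type is legalized first, the (ISD opcode, legal MVT) pair is looked up in the most specific table the subtarget qualifies for, and the entry cost is scaled by the legalization split factor LT.first. The sketch below restates that pattern with simplified enums and a hand-rolled lookup; none of these names are the LLVM helpers, and the fallback cost is a placeholder for the BaseT default.

// Simplified stand-ins for the real enums/helpers.
enum class Op { Mul, Shl };
enum class LegalVT { v4i32, v8i16 };

struct CostEntry { Op ISD; LegalVT VT; int Cost; };

// Analogue of CostTableLookup: linear scan for a matching (opcode, type) row.
template <unsigned N>
static const CostEntry *lookup(const CostEntry (&Tbl)[N], Op ISD, LegalVT VT) {
  for (const CostEntry &E : Tbl)
    if (E.ISD == ISD && E.VT == VT)
      return &E;
  return nullptr;
}

static int getArithCost(Op ISD, LegalVT VT, int SplitFactor /* LT.first */) {
  static const CostEntry SSE41Table[] = {
      {Op::Mul, LegalVT::v4i32, 1}, // pmulld
      {Op::Shl, LegalVT::v8i16, 1}, // psllw
  };
  if (const CostEntry *E = lookup(SSE41Table, ISD, VT))
    return SplitFactor * E->Cost; // e.g. a v8i32 mul split into 2 x v4i32 -> 2
  return 20;                      // placeholder for the generic fallback
}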
- if (ST->hasXOP()) { - if (const auto *Entry = CostTableLookup(XOPCostTable, ISD, LT.second)) - return LT.first * Entry->Cost; - } - - static const CostTblEntry AVX2CustomCostTable[] = { - { ISD::SHL, MVT::v32i8, 11 }, // vpblendvb sequence. - { ISD::SHL, MVT::v16i16, 10 }, // extend/vpsrlvd/pack sequence. - - { ISD::SRL, MVT::v32i8, 11 }, // vpblendvb sequence. - { ISD::SRL, MVT::v16i16, 10 }, // extend/vpsrlvd/pack sequence. - - { ISD::SRA, MVT::v32i8, 24 }, // vpblendvb sequence. - { ISD::SRA, MVT::v16i16, 10 }, // extend/vpsravd/pack sequence. - { ISD::SRA, MVT::v2i64, 4 }, // srl/xor/sub sequence. - { ISD::SRA, MVT::v4i64, 4 }, // srl/xor/sub sequence. - - // Vectorizing division is a bad idea. See the SSE2 table for more comments. - { ISD::SDIV, MVT::v32i8, 32*20 }, - { ISD::SDIV, MVT::v16i16, 16*20 }, - { ISD::SDIV, MVT::v8i32, 8*20 }, - { ISD::SDIV, MVT::v4i64, 4*20 }, - { ISD::UDIV, MVT::v32i8, 32*20 }, - { ISD::UDIV, MVT::v16i16, 16*20 }, - { ISD::UDIV, MVT::v8i32, 8*20 }, - { ISD::UDIV, MVT::v4i64, 4*20 }, - }; - - // Look for AVX2 lowering tricks for custom cases. - if (ST->hasAVX2()) { - if (const auto *Entry = CostTableLookup(AVX2CustomCostTable, ISD, - LT.second)) + if (ST->hasXOP()) + if (const auto *Entry = CostTableLookup(XOPShiftCostTable, ISD, LT.second)) return LT.first * Entry->Cost; - } - static const CostTblEntry - SSE2UniformConstCostTable[] = { - // We don't correctly identify costs of casts because they are marked as - // custom. - // Constant splats are cheaper for the following instructions. - { ISD::SHL, MVT::v16i8, 1 }, // psllw. - { ISD::SHL, MVT::v32i8, 2 }, // psllw. - { ISD::SHL, MVT::v8i16, 1 }, // psllw. + static const CostTblEntry SSE2UniformShiftCostTable[] = { + // Uniform splats are cheaper for the following instructions. { ISD::SHL, MVT::v16i16, 2 }, // psllw. - { ISD::SHL, MVT::v4i32, 1 }, // pslld { ISD::SHL, MVT::v8i32, 2 }, // pslld - { ISD::SHL, MVT::v2i64, 1 }, // psllq. { ISD::SHL, MVT::v4i64, 2 }, // psllq. - { ISD::SRL, MVT::v16i8, 1 }, // psrlw. - { ISD::SRL, MVT::v32i8, 2 }, // psrlw. - { ISD::SRL, MVT::v8i16, 1 }, // psrlw. { ISD::SRL, MVT::v16i16, 2 }, // psrlw. - { ISD::SRL, MVT::v4i32, 1 }, // psrld. { ISD::SRL, MVT::v8i32, 2 }, // psrld. - { ISD::SRL, MVT::v2i64, 1 }, // psrlq. { ISD::SRL, MVT::v4i64, 2 }, // psrlq. - { ISD::SRA, MVT::v16i8, 4 }, // psrlw, pand, pxor, psubb. - { ISD::SRA, MVT::v32i8, 8 }, // psrlw, pand, pxor, psubb. - { ISD::SRA, MVT::v8i16, 1 }, // psraw. { ISD::SRA, MVT::v16i16, 2 }, // psraw. - { ISD::SRA, MVT::v4i32, 1 }, // psrad. { ISD::SRA, MVT::v8i32, 2 }, // psrad. { ISD::SRA, MVT::v2i64, 4 }, // 2 x psrad + shuffle. { ISD::SRA, MVT::v4i64, 8 }, // 2 x psrad + shuffle. - - { ISD::SDIV, MVT::v8i16, 6 }, // pmulhw sequence - { ISD::UDIV, MVT::v8i16, 6 }, // pmulhuw sequence - { ISD::SDIV, MVT::v4i32, 19 }, // pmuludq sequence - { ISD::UDIV, MVT::v4i32, 15 }, // pmuludq sequence }; - if (Op2Info == TargetTransformInfo::OK_UniformConstantValue && - ST->hasSSE2()) { - // pmuldq sequence. 
- if (ISD == ISD::SDIV && LT.second == MVT::v4i32 && ST->hasSSE41()) - return LT.first * 15; - - if (const auto *Entry = CostTableLookup(SSE2UniformConstCostTable, ISD, - LT.second)) + if (ST->hasSSE2() && + ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) || + (Op2Info == TargetTransformInfo::OK_UniformValue))) { + if (const auto *Entry = + CostTableLookup(SSE2UniformShiftCostTable, ISD, LT.second)) return LT.first * Entry->Cost; } @@ -291,60 +457,170 @@ int X86TTIImpl::getArithmeticInstrCost( Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) { MVT VT = LT.second; // Vector shift left by non uniform constant can be lowered - // into vector multiply (pmullw/pmulld). - if ((VT == MVT::v8i16 && ST->hasSSE2()) || - (VT == MVT::v4i32 && ST->hasSSE41())) - return LT.first; - - // v16i16 and v8i32 shifts by non-uniform constants are lowered into a - // sequence of extract + two vector multiply + insert. - if ((VT == MVT::v8i32 || VT == MVT::v16i16) && - (ST->hasAVX() && !ST->hasAVX2())) - ISD = ISD::MUL; - - // A vector shift left by non uniform constant is converted - // into a vector multiply; the new multiply is eventually - // lowered into a sequence of shuffles and 2 x pmuludq. - if (VT == MVT::v4i32 && ST->hasSSE2()) + // into vector multiply. + if (((VT == MVT::v8i16 || VT == MVT::v4i32) && ST->hasSSE2()) || + ((VT == MVT::v16i16 || VT == MVT::v8i32) && ST->hasAVX())) ISD = ISD::MUL; } + static const CostTblEntry AVX2CostTable[] = { + { ISD::SHL, MVT::v32i8, 11 }, // vpblendvb sequence. + { ISD::SHL, MVT::v16i16, 10 }, // extend/vpsrlvd/pack sequence. + + { ISD::SRL, MVT::v32i8, 11 }, // vpblendvb sequence. + { ISD::SRL, MVT::v16i16, 10 }, // extend/vpsrlvd/pack sequence. + + { ISD::SRA, MVT::v32i8, 24 }, // vpblendvb sequence. + { ISD::SRA, MVT::v16i16, 10 }, // extend/vpsravd/pack sequence. + { ISD::SRA, MVT::v2i64, 4 }, // srl/xor/sub sequence. + { ISD::SRA, MVT::v4i64, 4 }, // srl/xor/sub sequence. + + { ISD::SUB, MVT::v32i8, 1 }, // psubb + { ISD::ADD, MVT::v32i8, 1 }, // paddb + { ISD::SUB, MVT::v16i16, 1 }, // psubw + { ISD::ADD, MVT::v16i16, 1 }, // paddw + { ISD::SUB, MVT::v8i32, 1 }, // psubd + { ISD::ADD, MVT::v8i32, 1 }, // paddd + { ISD::SUB, MVT::v4i64, 1 }, // psubq + { ISD::ADD, MVT::v4i64, 1 }, // paddq + + { ISD::MUL, MVT::v32i8, 17 }, // extend/pmullw/trunc sequence. + { ISD::MUL, MVT::v16i8, 7 }, // extend/pmullw/trunc sequence. + { ISD::MUL, MVT::v16i16, 1 }, // pmullw + { ISD::MUL, MVT::v8i32, 1 }, // pmulld + { ISD::MUL, MVT::v4i64, 8 }, // 3*pmuludq/3*shift/2*add + + { ISD::FDIV, MVT::f32, 7 }, // Haswell from http://www.agner.org/ + { ISD::FDIV, MVT::v4f32, 7 }, // Haswell from http://www.agner.org/ + { ISD::FDIV, MVT::v8f32, 14 }, // Haswell from http://www.agner.org/ + { ISD::FDIV, MVT::f64, 14 }, // Haswell from http://www.agner.org/ + { ISD::FDIV, MVT::v2f64, 14 }, // Haswell from http://www.agner.org/ + { ISD::FDIV, MVT::v4f64, 28 }, // Haswell from http://www.agner.org/ + }; + + // Look for AVX2 lowering tricks for custom cases. + if (ST->hasAVX2()) + if (const auto *Entry = CostTableLookup(AVX2CostTable, ISD, LT.second)) + return LT.first * Entry->Cost; + + static const CostTblEntry AVX1CostTable[] = { + // We don't have to scalarize unsupported ops. We can issue two half-sized + // operations and we only need to extract the upper YMM half. + // Two ops + 1 extract + 1 insert = 4. 
+ { ISD::MUL, MVT::v16i16, 4 }, + { ISD::MUL, MVT::v8i32, 4 }, + { ISD::SUB, MVT::v32i8, 4 }, + { ISD::ADD, MVT::v32i8, 4 }, + { ISD::SUB, MVT::v16i16, 4 }, + { ISD::ADD, MVT::v16i16, 4 }, + { ISD::SUB, MVT::v8i32, 4 }, + { ISD::ADD, MVT::v8i32, 4 }, + { ISD::SUB, MVT::v4i64, 4 }, + { ISD::ADD, MVT::v4i64, 4 }, + + // A v4i64 multiply is custom lowered as two split v2i64 vectors that then + // are lowered as a series of long multiplies(3), shifts(3) and adds(2) + // Because we believe v4i64 to be a legal type, we must also include the + // extract+insert in the cost table. Therefore, the cost here is 18 + // instead of 8. + { ISD::MUL, MVT::v4i64, 18 }, + + { ISD::MUL, MVT::v32i8, 26 }, // extend/pmullw/trunc sequence. + + { ISD::FDIV, MVT::f32, 14 }, // SNB from http://www.agner.org/ + { ISD::FDIV, MVT::v4f32, 14 }, // SNB from http://www.agner.org/ + { ISD::FDIV, MVT::v8f32, 28 }, // SNB from http://www.agner.org/ + { ISD::FDIV, MVT::f64, 22 }, // SNB from http://www.agner.org/ + { ISD::FDIV, MVT::v2f64, 22 }, // SNB from http://www.agner.org/ + { ISD::FDIV, MVT::v4f64, 44 }, // SNB from http://www.agner.org/ + + // Vectorizing division is a bad idea. See the SSE2 table for more comments. + { ISD::SDIV, MVT::v32i8, 32*20 }, + { ISD::SDIV, MVT::v16i16, 16*20 }, + { ISD::SDIV, MVT::v8i32, 8*20 }, + { ISD::SDIV, MVT::v4i64, 4*20 }, + { ISD::UDIV, MVT::v32i8, 32*20 }, + { ISD::UDIV, MVT::v16i16, 16*20 }, + { ISD::UDIV, MVT::v8i32, 8*20 }, + { ISD::UDIV, MVT::v4i64, 4*20 }, + }; + + if (ST->hasAVX()) + if (const auto *Entry = CostTableLookup(AVX1CostTable, ISD, LT.second)) + return LT.first * Entry->Cost; + + static const CostTblEntry SSE42CostTable[] = { + { ISD::FDIV, MVT::f32, 14 }, // Nehalem from http://www.agner.org/ + { ISD::FDIV, MVT::v4f32, 14 }, // Nehalem from http://www.agner.org/ + { ISD::FDIV, MVT::f64, 22 }, // Nehalem from http://www.agner.org/ + { ISD::FDIV, MVT::v2f64, 22 }, // Nehalem from http://www.agner.org/ + }; + + if (ST->hasSSE42()) + if (const auto *Entry = CostTableLookup(SSE42CostTable, ISD, LT.second)) + return LT.first * Entry->Cost; + + static const CostTblEntry SSE41CostTable[] = { + { ISD::SHL, MVT::v16i8, 11 }, // pblendvb sequence. + { ISD::SHL, MVT::v32i8, 2*11 }, // pblendvb sequence. + { ISD::SHL, MVT::v8i16, 14 }, // pblendvb sequence. + { ISD::SHL, MVT::v16i16, 2*14 }, // pblendvb sequence. + { ISD::SHL, MVT::v4i32, 4 }, // pslld/paddd/cvttps2dq/pmulld + { ISD::SHL, MVT::v8i32, 2*4 }, // pslld/paddd/cvttps2dq/pmulld + + { ISD::SRL, MVT::v16i8, 12 }, // pblendvb sequence. + { ISD::SRL, MVT::v32i8, 2*12 }, // pblendvb sequence. + { ISD::SRL, MVT::v8i16, 14 }, // pblendvb sequence. + { ISD::SRL, MVT::v16i16, 2*14 }, // pblendvb sequence. + { ISD::SRL, MVT::v4i32, 11 }, // Shift each lane + blend. + { ISD::SRL, MVT::v8i32, 2*11 }, // Shift each lane + blend. + + { ISD::SRA, MVT::v16i8, 24 }, // pblendvb sequence. + { ISD::SRA, MVT::v32i8, 2*24 }, // pblendvb sequence. + { ISD::SRA, MVT::v8i16, 14 }, // pblendvb sequence. + { ISD::SRA, MVT::v16i16, 2*14 }, // pblendvb sequence. + { ISD::SRA, MVT::v4i32, 12 }, // Shift each lane + blend. + { ISD::SRA, MVT::v8i32, 2*12 }, // Shift each lane + blend. + + { ISD::MUL, MVT::v4i32, 1 } // pmulld + }; + + if (ST->hasSSE41()) + if (const auto *Entry = CostTableLookup(SSE41CostTable, ISD, LT.second)) + return LT.first * Entry->Cost; + static const CostTblEntry SSE2CostTable[] = { // We don't correctly identify costs of casts because they are marked as // custom. 
- // For some cases, where the shift amount is a scalar we would be able - // to generate better code. Unfortunately, when this is the case the value - // (the splat) will get hoisted out of the loop, thereby making it invisible - // to ISel. The cost model must return worst case assumptions because it is - // used for vectorization and we don't want to make vectorized code worse - // than scalar code. { ISD::SHL, MVT::v16i8, 26 }, // cmpgtb sequence. - { ISD::SHL, MVT::v32i8, 2*26 }, // cmpgtb sequence. { ISD::SHL, MVT::v8i16, 32 }, // cmpgtb sequence. - { ISD::SHL, MVT::v16i16, 2*32 }, // cmpgtb sequence. { ISD::SHL, MVT::v4i32, 2*5 }, // We optimized this using mul. { ISD::SHL, MVT::v8i32, 2*2*5 }, // We optimized this using mul. { ISD::SHL, MVT::v2i64, 4 }, // splat+shuffle sequence. { ISD::SHL, MVT::v4i64, 2*4 }, // splat+shuffle sequence. { ISD::SRL, MVT::v16i8, 26 }, // cmpgtb sequence. - { ISD::SRL, MVT::v32i8, 2*26 }, // cmpgtb sequence. { ISD::SRL, MVT::v8i16, 32 }, // cmpgtb sequence. - { ISD::SRL, MVT::v16i16, 2*32 }, // cmpgtb sequence. { ISD::SRL, MVT::v4i32, 16 }, // Shift each lane + blend. - { ISD::SRL, MVT::v8i32, 2*16 }, // Shift each lane + blend. { ISD::SRL, MVT::v2i64, 4 }, // splat+shuffle sequence. { ISD::SRL, MVT::v4i64, 2*4 }, // splat+shuffle sequence. { ISD::SRA, MVT::v16i8, 54 }, // unpacked cmpgtb sequence. - { ISD::SRA, MVT::v32i8, 2*54 }, // unpacked cmpgtb sequence. { ISD::SRA, MVT::v8i16, 32 }, // cmpgtb sequence. - { ISD::SRA, MVT::v16i16, 2*32 }, // cmpgtb sequence. { ISD::SRA, MVT::v4i32, 16 }, // Shift each lane + blend. - { ISD::SRA, MVT::v8i32, 2*16 }, // Shift each lane + blend. { ISD::SRA, MVT::v2i64, 12 }, // srl/xor/sub sequence. { ISD::SRA, MVT::v4i64, 2*12 }, // srl/xor/sub sequence. + { ISD::MUL, MVT::v16i8, 12 }, // extend/pmullw/trunc sequence. + { ISD::MUL, MVT::v8i16, 1 }, // pmullw + { ISD::MUL, MVT::v4i32, 6 }, // 3*pmuludq/4*shuffle + { ISD::MUL, MVT::v2i64, 8 }, // 3*pmuludq/3*shift/2*add + + { ISD::FDIV, MVT::f32, 23 }, // Pentium IV from http://www.agner.org/ + { ISD::FDIV, MVT::v4f32, 39 }, // Pentium IV from http://www.agner.org/ + { ISD::FDIV, MVT::f64, 38 }, // Pentium IV from http://www.agner.org/ + { ISD::FDIV, MVT::v2f64, 69 }, // Pentium IV from http://www.agner.org/ + // It is not a good idea to vectorize division. We have to scalarize it and // in the process we will often end up having to spilling regular // registers. The overhead of division is going to dominate most kernels @@ -352,61 +628,27 @@ int X86TTIImpl::getArithmeticInstrCost( // generally a bad idea. Assume somewhat arbitrarily that we have to be able // to hide "20 cycles" for each lane. { ISD::SDIV, MVT::v16i8, 16*20 }, - { ISD::SDIV, MVT::v8i16, 8*20 }, - { ISD::SDIV, MVT::v4i32, 4*20 }, - { ISD::SDIV, MVT::v2i64, 2*20 }, + { ISD::SDIV, MVT::v8i16, 8*20 }, + { ISD::SDIV, MVT::v4i32, 4*20 }, + { ISD::SDIV, MVT::v2i64, 2*20 }, { ISD::UDIV, MVT::v16i8, 16*20 }, - { ISD::UDIV, MVT::v8i16, 8*20 }, - { ISD::UDIV, MVT::v4i32, 4*20 }, - { ISD::UDIV, MVT::v2i64, 2*20 }, + { ISD::UDIV, MVT::v8i16, 8*20 }, + { ISD::UDIV, MVT::v4i32, 4*20 }, + { ISD::UDIV, MVT::v2i64, 2*20 }, }; - if (ST->hasSSE2()) { + if (ST->hasSSE2()) if (const auto *Entry = CostTableLookup(SSE2CostTable, ISD, LT.second)) return LT.first * Entry->Cost; - } - static const CostTblEntry AVX1CostTable[] = { - // We don't have to scalarize unsupported ops. We can issue two half-sized - // operations and we only need to extract the upper YMM half. - // Two ops + 1 extract + 1 insert = 4. 
- { ISD::MUL, MVT::v16i16, 4 }, - { ISD::MUL, MVT::v8i32, 4 }, - { ISD::SUB, MVT::v8i32, 4 }, - { ISD::ADD, MVT::v8i32, 4 }, - { ISD::SUB, MVT::v4i64, 4 }, - { ISD::ADD, MVT::v4i64, 4 }, - // A v4i64 multiply is custom lowered as two split v2i64 vectors that then - // are lowered as a series of long multiplies(3), shifts(4) and adds(2) - // Because we believe v4i64 to be a legal type, we must also include the - // split factor of two in the cost table. Therefore, the cost here is 18 - // instead of 9. - { ISD::MUL, MVT::v4i64, 18 }, + static const CostTblEntry SSE1CostTable[] = { + { ISD::FDIV, MVT::f32, 17 }, // Pentium III from http://www.agner.org/ + { ISD::FDIV, MVT::v4f32, 34 }, // Pentium III from http://www.agner.org/ }; - // Look for AVX1 lowering tricks. - if (ST->hasAVX() && !ST->hasAVX2()) { - MVT VT = LT.second; - - if (const auto *Entry = CostTableLookup(AVX1CostTable, ISD, VT)) + if (ST->hasSSE1()) + if (const auto *Entry = CostTableLookup(SSE1CostTable, ISD, LT.second)) return LT.first * Entry->Cost; - } - - // Custom lowering of vectors. - static const CostTblEntry CustomLowered[] = { - // A v2i64/v4i64 and multiply is custom lowered as a series of long - // multiplies(3), shifts(4) and adds(2). - { ISD::MUL, MVT::v2i64, 9 }, - { ISD::MUL, MVT::v4i64, 9 }, - }; - if (const auto *Entry = CostTableLookup(CustomLowered, ISD, LT.second)) - return LT.first * Entry->Cost; - - // Special lowering of v4i32 mul on sse2, sse3: Lower v4i32 mul as 2x shuffle, - // 2x pmuludq, 2x shuffle. - if (ISD == ISD::MUL && LT.second == MVT::v4i32 && ST->hasSSE2() && - !ST->hasSSE41()) - return LT.first * 6; // Fallback to the default implementation. return BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info); @@ -414,112 +656,252 @@ int X86TTIImpl::getArithmeticInstrCost( int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp) { - // We only estimate the cost of reverse and alternate shuffles. - if (Kind != TTI::SK_Reverse && Kind != TTI::SK_Alternate) - return BaseT::getShuffleCost(Kind, Tp, Index, SubTp); + // 64-bit packed float vectors (v2f32) are widened to type v4f32. + // 64-bit packed integer vectors (v2i32) are promoted to type v2i64. + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp); + + // For Broadcasts we are splatting the first element from the first input + // register, so only need to reference that input and all the output + // registers are the same. + if (Kind == TTI::SK_Broadcast) + LT.first = 1; + + // We are going to permute multiple sources and the result will be in multiple + // destinations. Providing an accurate cost only for splits where the element + // type remains the same. 
+ if (Kind == TTI::SK_PermuteSingleSrc && LT.first != 1) { + MVT LegalVT = LT.second; + if (LegalVT.getVectorElementType().getSizeInBits() == + Tp->getVectorElementType()->getPrimitiveSizeInBits() && + LegalVT.getVectorNumElements() < Tp->getVectorNumElements()) { + + unsigned VecTySize = DL.getTypeStoreSize(Tp); + unsigned LegalVTSize = LegalVT.getStoreSize(); + // Number of source vectors after legalization: + unsigned NumOfSrcs = (VecTySize + LegalVTSize - 1) / LegalVTSize; + // Number of destination vectors after legalization: + unsigned NumOfDests = LT.first; + + Type *SingleOpTy = VectorType::get(Tp->getVectorElementType(), + LegalVT.getVectorNumElements()); + + unsigned NumOfShuffles = (NumOfSrcs - 1) * NumOfDests; + return NumOfShuffles * + getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy, 0, nullptr); + } - if (Kind == TTI::SK_Reverse) { - std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp); - int Cost = 1; - if (LT.second.getSizeInBits() > 128) - Cost = 3; // Extract + insert + copy. + return BaseT::getShuffleCost(Kind, Tp, Index, SubTp); + } - // Multiple by the number of parts. - return Cost * LT.first; + // For 2-input shuffles, we must account for splitting the 2 inputs into many. + if (Kind == TTI::SK_PermuteTwoSrc && LT.first != 1) { + // We assume that source and destination have the same vector type. + int NumOfDests = LT.first; + int NumOfShufflesPerDest = LT.first * 2 - 1; + LT.first = NumOfDests * NumOfShufflesPerDest; } - if (Kind == TTI::SK_Alternate) { - // 64-bit packed float vectors (v2f32) are widened to type v4f32. - // 64-bit packed integer vectors (v2i32) are promoted to type v2i64. - std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp); + static const CostTblEntry AVX512VBMIShuffleTbl[] = { + { TTI::SK_Reverse, MVT::v64i8, 1 }, // vpermb + { TTI::SK_Reverse, MVT::v32i8, 1 }, // vpermb - // The backend knows how to generate a single VEX.256 version of - // instruction VPBLENDW if the target supports AVX2. 
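To make the split accounting above concrete: for a single-source permute whose vector legalizes into several registers, each of the NumOfDests output registers may need data from every one of the NumOfSrcs input registers, giving (NumOfSrcs - 1) * NumOfDests two-source shuffles at the legal width; the two-source case similarly charges NumOfDests * (2 * LT.first - 1) legal-width shuffles. A short worked sketch, with illustrative numbers only:

// Worked example of the split-shuffle accounting used above.
// Suppose a 512-bit single-source permute legalizes to LT.first = 2 halves of
// 256 bits with the same element type:
//   NumOfSrcs  = ceil(512 / 256) = 2
//   NumOfDests = LT.first        = 2
//   cost = (NumOfSrcs - 1) * NumOfDests = 2 two-source shuffles at 256 bits.
// For a two-source permute with the same split:
//   cost = NumOfDests * (2 * LT.first - 1) = 2 * 3 = 6 legal-width shuffles.
static int splitPermuteCost(int NumOfSrcs, int NumOfDests, bool TwoSrc,
                            int LTFirst) {
  return TwoSrc ? NumOfDests * (2 * LTFirst - 1)
                : (NumOfSrcs - 1) * NumOfDests;
}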
- if (ST->hasAVX2() && LT.second == MVT::v16i16) - return LT.first; + { TTI::SK_PermuteSingleSrc, MVT::v64i8, 1 }, // vpermb + { TTI::SK_PermuteSingleSrc, MVT::v32i8, 1 }, // vpermb - static const CostTblEntry AVXAltShuffleTbl[] = { - {ISD::VECTOR_SHUFFLE, MVT::v4i64, 1}, // vblendpd - {ISD::VECTOR_SHUFFLE, MVT::v4f64, 1}, // vblendpd + { TTI::SK_PermuteTwoSrc, MVT::v64i8, 1 }, // vpermt2b + { TTI::SK_PermuteTwoSrc, MVT::v32i8, 1 }, // vpermt2b + { TTI::SK_PermuteTwoSrc, MVT::v16i8, 1 } // vpermt2b + }; + + if (ST->hasVBMI()) + if (const auto *Entry = + CostTableLookup(AVX512VBMIShuffleTbl, Kind, LT.second)) + return LT.first * Entry->Cost; - {ISD::VECTOR_SHUFFLE, MVT::v8i32, 1}, // vblendps - {ISD::VECTOR_SHUFFLE, MVT::v8f32, 1}, // vblendps + static const CostTblEntry AVX512BWShuffleTbl[] = { + { TTI::SK_Broadcast, MVT::v32i16, 1 }, // vpbroadcastw + { TTI::SK_Broadcast, MVT::v64i8, 1 }, // vpbroadcastb + + { TTI::SK_Reverse, MVT::v32i16, 1 }, // vpermw + { TTI::SK_Reverse, MVT::v16i16, 1 }, // vpermw + { TTI::SK_Reverse, MVT::v64i8, 2 }, // pshufb + vshufi64x2 + + { TTI::SK_PermuteSingleSrc, MVT::v32i16, 1 }, // vpermw + { TTI::SK_PermuteSingleSrc, MVT::v16i16, 1 }, // vpermw + { TTI::SK_PermuteSingleSrc, MVT::v8i16, 1 }, // vpermw + { TTI::SK_PermuteSingleSrc, MVT::v64i8, 8 }, // extend to v32i16 + { TTI::SK_PermuteSingleSrc, MVT::v32i8, 3 }, // vpermw + zext/trunc + + { TTI::SK_PermuteTwoSrc, MVT::v32i16, 1 }, // vpermt2w + { TTI::SK_PermuteTwoSrc, MVT::v16i16, 1 }, // vpermt2w + { TTI::SK_PermuteTwoSrc, MVT::v8i16, 1 }, // vpermt2w + { TTI::SK_PermuteTwoSrc, MVT::v32i8, 3 }, // zext + vpermt2w + trunc + { TTI::SK_PermuteTwoSrc, MVT::v64i8, 19 }, // 6 * v32i8 + 1 + { TTI::SK_PermuteTwoSrc, MVT::v16i8, 3 } // zext + vpermt2w + trunc + }; - // This shuffle is custom lowered into a sequence of: - // 2x vextractf128 , 2x vpblendw , 1x vinsertf128 - {ISD::VECTOR_SHUFFLE, MVT::v16i16, 5}, + if (ST->hasBWI()) + if (const auto *Entry = + CostTableLookup(AVX512BWShuffleTbl, Kind, LT.second)) + return LT.first * Entry->Cost; - // This shuffle is custom lowered into a long sequence of: - // 2x vextractf128 , 4x vpshufb , 2x vpor , 1x vinsertf128 - {ISD::VECTOR_SHUFFLE, MVT::v32i8, 9} - }; + static const CostTblEntry AVX512ShuffleTbl[] = { + { TTI::SK_Broadcast, MVT::v8f64, 1 }, // vbroadcastpd + { TTI::SK_Broadcast, MVT::v16f32, 1 }, // vbroadcastps + { TTI::SK_Broadcast, MVT::v8i64, 1 }, // vpbroadcastq + { TTI::SK_Broadcast, MVT::v16i32, 1 }, // vpbroadcastd + + { TTI::SK_Reverse, MVT::v8f64, 1 }, // vpermpd + { TTI::SK_Reverse, MVT::v16f32, 1 }, // vpermps + { TTI::SK_Reverse, MVT::v8i64, 1 }, // vpermq + { TTI::SK_Reverse, MVT::v16i32, 1 }, // vpermd + + { TTI::SK_PermuteSingleSrc, MVT::v8f64, 1 }, // vpermpd + { TTI::SK_PermuteSingleSrc, MVT::v4f64, 1 }, // vpermpd + { TTI::SK_PermuteSingleSrc, MVT::v2f64, 1 }, // vpermpd + { TTI::SK_PermuteSingleSrc, MVT::v16f32, 1 }, // vpermps + { TTI::SK_PermuteSingleSrc, MVT::v8f32, 1 }, // vpermps + { TTI::SK_PermuteSingleSrc, MVT::v4f32, 1 }, // vpermps + { TTI::SK_PermuteSingleSrc, MVT::v8i64, 1 }, // vpermq + { TTI::SK_PermuteSingleSrc, MVT::v4i64, 1 }, // vpermq + { TTI::SK_PermuteSingleSrc, MVT::v2i64, 1 }, // vpermq + { TTI::SK_PermuteSingleSrc, MVT::v16i32, 1 }, // vpermd + { TTI::SK_PermuteSingleSrc, MVT::v8i32, 1 }, // vpermd + { TTI::SK_PermuteSingleSrc, MVT::v4i32, 1 }, // vpermd + { TTI::SK_PermuteSingleSrc, MVT::v16i8, 1 }, // pshufb + + { TTI::SK_PermuteTwoSrc, MVT::v8f64, 1 }, // vpermt2pd + { TTI::SK_PermuteTwoSrc, MVT::v16f32, 1 }, // 
vpermt2ps + { TTI::SK_PermuteTwoSrc, MVT::v8i64, 1 }, // vpermt2q + { TTI::SK_PermuteTwoSrc, MVT::v16i32, 1 }, // vpermt2d + { TTI::SK_PermuteTwoSrc, MVT::v4f64, 1 }, // vpermt2pd + { TTI::SK_PermuteTwoSrc, MVT::v8f32, 1 }, // vpermt2ps + { TTI::SK_PermuteTwoSrc, MVT::v4i64, 1 }, // vpermt2q + { TTI::SK_PermuteTwoSrc, MVT::v8i32, 1 }, // vpermt2d + { TTI::SK_PermuteTwoSrc, MVT::v2f64, 1 }, // vpermt2pd + { TTI::SK_PermuteTwoSrc, MVT::v4f32, 1 }, // vpermt2ps + { TTI::SK_PermuteTwoSrc, MVT::v2i64, 1 }, // vpermt2q + { TTI::SK_PermuteTwoSrc, MVT::v4i32, 1 } // vpermt2d + }; - if (ST->hasAVX()) - if (const auto *Entry = CostTableLookup(AVXAltShuffleTbl, - ISD::VECTOR_SHUFFLE, LT.second)) - return LT.first * Entry->Cost; + if (ST->hasAVX512()) + if (const auto *Entry = CostTableLookup(AVX512ShuffleTbl, Kind, LT.second)) + return LT.first * Entry->Cost; - static const CostTblEntry SSE41AltShuffleTbl[] = { - // These are lowered into movsd. - {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1}, - {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1}, + static const CostTblEntry AVX2ShuffleTbl[] = { + { TTI::SK_Broadcast, MVT::v4f64, 1 }, // vbroadcastpd + { TTI::SK_Broadcast, MVT::v8f32, 1 }, // vbroadcastps + { TTI::SK_Broadcast, MVT::v4i64, 1 }, // vpbroadcastq + { TTI::SK_Broadcast, MVT::v8i32, 1 }, // vpbroadcastd + { TTI::SK_Broadcast, MVT::v16i16, 1 }, // vpbroadcastw + { TTI::SK_Broadcast, MVT::v32i8, 1 }, // vpbroadcastb + + { TTI::SK_Reverse, MVT::v4f64, 1 }, // vpermpd + { TTI::SK_Reverse, MVT::v8f32, 1 }, // vpermps + { TTI::SK_Reverse, MVT::v4i64, 1 }, // vpermq + { TTI::SK_Reverse, MVT::v8i32, 1 }, // vpermd + { TTI::SK_Reverse, MVT::v16i16, 2 }, // vperm2i128 + pshufb + { TTI::SK_Reverse, MVT::v32i8, 2 }, // vperm2i128 + pshufb + + { TTI::SK_Alternate, MVT::v16i16, 1 }, // vpblendw + { TTI::SK_Alternate, MVT::v32i8, 1 } // vpblendvb + }; - // packed float vectors with four elements are lowered into BLENDI dag - // nodes. A v4i32/v4f32 BLENDI generates a single 'blendps'/'blendpd'. - {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1}, - {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1}, + if (ST->hasAVX2()) + if (const auto *Entry = CostTableLookup(AVX2ShuffleTbl, Kind, LT.second)) + return LT.first * Entry->Cost; - // This shuffle generates a single pshufw. 
- {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1}, + static const CostTblEntry AVX1ShuffleTbl[] = { + { TTI::SK_Broadcast, MVT::v4f64, 2 }, // vperm2f128 + vpermilpd + { TTI::SK_Broadcast, MVT::v8f32, 2 }, // vperm2f128 + vpermilps + { TTI::SK_Broadcast, MVT::v4i64, 2 }, // vperm2f128 + vpermilpd + { TTI::SK_Broadcast, MVT::v8i32, 2 }, // vperm2f128 + vpermilps + { TTI::SK_Broadcast, MVT::v16i16, 3 }, // vpshuflw + vpshufd + vinsertf128 + { TTI::SK_Broadcast, MVT::v32i8, 2 }, // vpshufb + vinsertf128 + + { TTI::SK_Reverse, MVT::v4f64, 2 }, // vperm2f128 + vpermilpd + { TTI::SK_Reverse, MVT::v8f32, 2 }, // vperm2f128 + vpermilps + { TTI::SK_Reverse, MVT::v4i64, 2 }, // vperm2f128 + vpermilpd + { TTI::SK_Reverse, MVT::v8i32, 2 }, // vperm2f128 + vpermilps + { TTI::SK_Reverse, MVT::v16i16, 4 }, // vextractf128 + 2*pshufb + // + vinsertf128 + { TTI::SK_Reverse, MVT::v32i8, 4 }, // vextractf128 + 2*pshufb + // + vinsertf128 + + { TTI::SK_Alternate, MVT::v4i64, 1 }, // vblendpd + { TTI::SK_Alternate, MVT::v4f64, 1 }, // vblendpd + { TTI::SK_Alternate, MVT::v8i32, 1 }, // vblendps + { TTI::SK_Alternate, MVT::v8f32, 1 }, // vblendps + { TTI::SK_Alternate, MVT::v16i16, 3 }, // vpand + vpandn + vpor + { TTI::SK_Alternate, MVT::v32i8, 3 } // vpand + vpandn + vpor + }; - // There is no instruction that matches a v16i8 alternate shuffle. - // The backend will expand it into the sequence 'pshufb + pshufb + or'. - {ISD::VECTOR_SHUFFLE, MVT::v16i8, 3} - }; + if (ST->hasAVX()) + if (const auto *Entry = CostTableLookup(AVX1ShuffleTbl, Kind, LT.second)) + return LT.first * Entry->Cost; - if (ST->hasSSE41()) - if (const auto *Entry = CostTableLookup(SSE41AltShuffleTbl, ISD::VECTOR_SHUFFLE, - LT.second)) - return LT.first * Entry->Cost; + static const CostTblEntry SSE41ShuffleTbl[] = { + { TTI::SK_Alternate, MVT::v2i64, 1 }, // pblendw + { TTI::SK_Alternate, MVT::v2f64, 1 }, // movsd + { TTI::SK_Alternate, MVT::v4i32, 1 }, // pblendw + { TTI::SK_Alternate, MVT::v4f32, 1 }, // blendps + { TTI::SK_Alternate, MVT::v8i16, 1 }, // pblendw + { TTI::SK_Alternate, MVT::v16i8, 1 } // pblendvb + }; - static const CostTblEntry SSSE3AltShuffleTbl[] = { - {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1}, // movsd - {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1}, // movsd + if (ST->hasSSE41()) + if (const auto *Entry = CostTableLookup(SSE41ShuffleTbl, Kind, LT.second)) + return LT.first * Entry->Cost; - // SSE3 doesn't have 'blendps'. 
The following shuffles are expanded into - // the sequence 'shufps + pshufd' - {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2}, - {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2}, + static const CostTblEntry SSSE3ShuffleTbl[] = { + { TTI::SK_Broadcast, MVT::v8i16, 1 }, // pshufb + { TTI::SK_Broadcast, MVT::v16i8, 1 }, // pshufb - {ISD::VECTOR_SHUFFLE, MVT::v8i16, 3}, // pshufb + pshufb + or - {ISD::VECTOR_SHUFFLE, MVT::v16i8, 3} // pshufb + pshufb + or - }; + { TTI::SK_Reverse, MVT::v8i16, 1 }, // pshufb + { TTI::SK_Reverse, MVT::v16i8, 1 }, // pshufb - if (ST->hasSSSE3()) - if (const auto *Entry = CostTableLookup(SSSE3AltShuffleTbl, - ISD::VECTOR_SHUFFLE, LT.second)) - return LT.first * Entry->Cost; + { TTI::SK_Alternate, MVT::v8i16, 3 }, // pshufb + pshufb + por + { TTI::SK_Alternate, MVT::v16i8, 3 } // pshufb + pshufb + por + }; - static const CostTblEntry SSEAltShuffleTbl[] = { - {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1}, // movsd - {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1}, // movsd + if (ST->hasSSSE3()) + if (const auto *Entry = CostTableLookup(SSSE3ShuffleTbl, Kind, LT.second)) + return LT.first * Entry->Cost; - {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2}, // shufps + pshufd - {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2}, // shufps + pshufd + static const CostTblEntry SSE2ShuffleTbl[] = { + { TTI::SK_Broadcast, MVT::v2f64, 1 }, // shufpd + { TTI::SK_Broadcast, MVT::v2i64, 1 }, // pshufd + { TTI::SK_Broadcast, MVT::v4i32, 1 }, // pshufd + { TTI::SK_Broadcast, MVT::v8i16, 2 }, // pshuflw + pshufd + { TTI::SK_Broadcast, MVT::v16i8, 3 }, // unpck + pshuflw + pshufd + + { TTI::SK_Reverse, MVT::v2f64, 1 }, // shufpd + { TTI::SK_Reverse, MVT::v2i64, 1 }, // pshufd + { TTI::SK_Reverse, MVT::v4i32, 1 }, // pshufd + { TTI::SK_Reverse, MVT::v8i16, 3 }, // pshuflw + pshufhw + pshufd + { TTI::SK_Reverse, MVT::v16i8, 9 }, // 2*pshuflw + 2*pshufhw + // + 2*pshufd + 2*unpck + packus + + { TTI::SK_Alternate, MVT::v2i64, 1 }, // movsd + { TTI::SK_Alternate, MVT::v2f64, 1 }, // movsd + { TTI::SK_Alternate, MVT::v4i32, 2 }, // 2*shufps + { TTI::SK_Alternate, MVT::v8i16, 3 }, // pand + pandn + por + { TTI::SK_Alternate, MVT::v16i8, 3 } // pand + pandn + por + }; - // This is expanded into a long sequence of four extract + four insert. - {ISD::VECTOR_SHUFFLE, MVT::v8i16, 8}, // 4 x pextrw + 4 pinsrw. + if (ST->hasSSE2()) + if (const auto *Entry = CostTableLookup(SSE2ShuffleTbl, Kind, LT.second)) + return LT.first * Entry->Cost; - // 8 x (pinsrw + pextrw + and + movb + movzb + or) - {ISD::VECTOR_SHUFFLE, MVT::v16i8, 48} - }; + static const CostTblEntry SSE1ShuffleTbl[] = { + { TTI::SK_Broadcast, MVT::v4f32, 1 }, // shufps + { TTI::SK_Reverse, MVT::v4f32, 1 }, // shufps + { TTI::SK_Alternate, MVT::v4f32, 2 } // 2*shufps + }; - // Fall-back (SSE3 and SSE2). - if (const auto *Entry = CostTableLookup(SSEAltShuffleTbl, - ISD::VECTOR_SHUFFLE, LT.second)) + if (ST->hasSSE1()) + if (const auto *Entry = CostTableLookup(SSE1ShuffleTbl, Kind, LT.second)) return LT.first * Entry->Cost; - return BaseT::getShuffleCost(Kind, Tp, Index, SubTp); - } return BaseT::getShuffleCost(Kind, Tp, Index, SubTp); } @@ -532,6 +914,13 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) { // potential massive combinations (elem_num x src_type x dst_type). 
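Before the conversion tables that follow, a minimal caller-side sketch of how they are reached through the public TargetTransformInfo interface; the helper name is assumed and is not part of the patch.

#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instruction.h"
using namespace llvm;

// Illustrative only: with AVX512DQ, the table below prices an
// <8 x i64> -> <8 x double> sitofp at 1 (a single vcvtqq2pd).
static int exampleSIToFPCost(const TargetTransformInfo &TTI, LLVMContext &Ctx) {
  Type *V8I64 = VectorType::get(Type::getInt64Ty(Ctx), 8);
  Type *V8F64 = VectorType::get(Type::getDoubleTy(Ctx), 8);
  return TTI.getCastInstrCost(Instruction::SIToFP, V8F64, V8I64);
}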
static const TypeConversionCostTblEntry AVX512DQConversionTbl[] = { + { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, 1 }, + { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 }, + { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i64, 1 }, + { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i64, 1 }, + { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i64, 1 }, + { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i64, 1 }, + { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 1 }, { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 }, { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, 1 }, @@ -539,12 +928,19 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) { { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, 1 }, { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 1 }, - { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 1 }, - { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f32, 1 }, - { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f32, 1 }, - { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 }, - { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f64, 1 }, - { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f64, 1 }, + { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 1 }, + { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f32, 1 }, + { ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f32, 1 }, + { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 }, + { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f64, 1 }, + { ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f64, 1 }, + + { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 1 }, + { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f32, 1 }, + { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f32, 1 }, + { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 }, + { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f64, 1 }, + { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f64, 1 }, }; // TODO: For AVX512DQ + AVX512VL, we also have cheap casts for 128-bit and @@ -779,6 +1175,8 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) { { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 2*10 }, { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 }, + { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 3 }, + { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i8, 1 }, { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i8, 6 }, { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2 }, @@ -945,6 +1343,12 @@ int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) { int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, ArrayRef<Type *> Tys, FastMathFlags FMF) { + // Costs should match the codegen from: + // BITREVERSE: llvm\test\CodeGen\X86\vector-bitreverse.ll + // BSWAP: llvm\test\CodeGen\X86\bswap-vector.ll + // CTLZ: llvm\test\CodeGen\X86\vector-lzcnt-*.ll + // CTPOP: llvm\test\CodeGen\X86\vector-popcnt-*.ll + // CTTZ: llvm\test\CodeGen\X86\vector-tzcnt-*.ll static const CostTblEntry XOPCostTbl[] = { { ISD::BITREVERSE, MVT::v4i64, 4 }, { ISD::BITREVERSE, MVT::v8i32, 4 }, @@ -966,7 +1370,25 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, { ISD::BITREVERSE, MVT::v32i8, 5 }, { ISD::BSWAP, MVT::v4i64, 1 }, { ISD::BSWAP, MVT::v8i32, 1 }, - { ISD::BSWAP, MVT::v16i16, 1 } + { ISD::BSWAP, MVT::v16i16, 1 }, + { ISD::CTLZ, MVT::v4i64, 23 }, + { ISD::CTLZ, MVT::v8i32, 18 }, + { ISD::CTLZ, MVT::v16i16, 14 }, + { ISD::CTLZ, MVT::v32i8, 9 }, + { ISD::CTPOP, MVT::v4i64, 7 }, + { ISD::CTPOP, MVT::v8i32, 11 }, + { ISD::CTPOP, MVT::v16i16, 9 }, + { ISD::CTPOP, MVT::v32i8, 6 }, + { ISD::CTTZ, MVT::v4i64, 10 }, + { ISD::CTTZ, MVT::v8i32, 14 }, + { ISD::CTTZ, MVT::v16i16, 12 }, + { ISD::CTTZ, MVT::v32i8, 9 }, + { ISD::FSQRT, MVT::f32, 7 }, // Haswell from http://www.agner.org/ + { ISD::FSQRT, MVT::v4f32, 7 }, // Haswell from http://www.agner.org/ + { ISD::FSQRT, MVT::v8f32, 14 }, // Haswell 
from http://www.agner.org/ + { ISD::FSQRT, MVT::f64, 14 }, // Haswell from http://www.agner.org/ + { ISD::FSQRT, MVT::v2f64, 14 }, // Haswell from http://www.agner.org/ + { ISD::FSQRT, MVT::v4f64, 28 }, // Haswell from http://www.agner.org/ }; static const CostTblEntry AVX1CostTbl[] = { { ISD::BITREVERSE, MVT::v4i64, 10 }, @@ -975,7 +1397,29 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, { ISD::BITREVERSE, MVT::v32i8, 10 }, { ISD::BSWAP, MVT::v4i64, 4 }, { ISD::BSWAP, MVT::v8i32, 4 }, - { ISD::BSWAP, MVT::v16i16, 4 } + { ISD::BSWAP, MVT::v16i16, 4 }, + { ISD::CTLZ, MVT::v4i64, 46 }, + { ISD::CTLZ, MVT::v8i32, 36 }, + { ISD::CTLZ, MVT::v16i16, 28 }, + { ISD::CTLZ, MVT::v32i8, 18 }, + { ISD::CTPOP, MVT::v4i64, 14 }, + { ISD::CTPOP, MVT::v8i32, 22 }, + { ISD::CTPOP, MVT::v16i16, 18 }, + { ISD::CTPOP, MVT::v32i8, 12 }, + { ISD::CTTZ, MVT::v4i64, 20 }, + { ISD::CTTZ, MVT::v8i32, 28 }, + { ISD::CTTZ, MVT::v16i16, 24 }, + { ISD::CTTZ, MVT::v32i8, 18 }, + { ISD::FSQRT, MVT::f32, 14 }, // SNB from http://www.agner.org/ + { ISD::FSQRT, MVT::v4f32, 14 }, // SNB from http://www.agner.org/ + { ISD::FSQRT, MVT::v8f32, 28 }, // SNB from http://www.agner.org/ + { ISD::FSQRT, MVT::f64, 21 }, // SNB from http://www.agner.org/ + { ISD::FSQRT, MVT::v2f64, 21 }, // SNB from http://www.agner.org/ + { ISD::FSQRT, MVT::v4f64, 43 }, // SNB from http://www.agner.org/ + }; + static const CostTblEntry SSE42CostTbl[] = { + { ISD::FSQRT, MVT::f32, 18 }, // Nehalem from http://www.agner.org/ + { ISD::FSQRT, MVT::v4f32, 18 }, // Nehalem from http://www.agner.org/ }; static const CostTblEntry SSSE3CostTbl[] = { { ISD::BITREVERSE, MVT::v2i64, 5 }, @@ -984,12 +1428,42 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, { ISD::BITREVERSE, MVT::v16i8, 5 }, { ISD::BSWAP, MVT::v2i64, 1 }, { ISD::BSWAP, MVT::v4i32, 1 }, - { ISD::BSWAP, MVT::v8i16, 1 } + { ISD::BSWAP, MVT::v8i16, 1 }, + { ISD::CTLZ, MVT::v2i64, 23 }, + { ISD::CTLZ, MVT::v4i32, 18 }, + { ISD::CTLZ, MVT::v8i16, 14 }, + { ISD::CTLZ, MVT::v16i8, 9 }, + { ISD::CTPOP, MVT::v2i64, 7 }, + { ISD::CTPOP, MVT::v4i32, 11 }, + { ISD::CTPOP, MVT::v8i16, 9 }, + { ISD::CTPOP, MVT::v16i8, 6 }, + { ISD::CTTZ, MVT::v2i64, 10 }, + { ISD::CTTZ, MVT::v4i32, 14 }, + { ISD::CTTZ, MVT::v8i16, 12 }, + { ISD::CTTZ, MVT::v16i8, 9 } }; static const CostTblEntry SSE2CostTbl[] = { { ISD::BSWAP, MVT::v2i64, 7 }, { ISD::BSWAP, MVT::v4i32, 7 }, - { ISD::BSWAP, MVT::v8i16, 7 } + { ISD::BSWAP, MVT::v8i16, 7 }, + { ISD::CTLZ, MVT::v2i64, 25 }, + { ISD::CTLZ, MVT::v4i32, 26 }, + { ISD::CTLZ, MVT::v8i16, 20 }, + { ISD::CTLZ, MVT::v16i8, 17 }, + { ISD::CTPOP, MVT::v2i64, 12 }, + { ISD::CTPOP, MVT::v4i32, 15 }, + { ISD::CTPOP, MVT::v8i16, 13 }, + { ISD::CTPOP, MVT::v16i8, 10 }, + { ISD::CTTZ, MVT::v2i64, 14 }, + { ISD::CTTZ, MVT::v4i32, 18 }, + { ISD::CTTZ, MVT::v8i16, 16 }, + { ISD::CTTZ, MVT::v16i8, 13 }, + { ISD::FSQRT, MVT::f64, 32 }, // Nehalem from http://www.agner.org/ + { ISD::FSQRT, MVT::v2f64, 32 }, // Nehalem from http://www.agner.org/ + }; + static const CostTblEntry SSE1CostTbl[] = { + { ISD::FSQRT, MVT::f32, 28 }, // Pentium III from http://www.agner.org/ + { ISD::FSQRT, MVT::v4f32, 56 }, // Pentium III from http://www.agner.org/ }; unsigned ISD = ISD::DELETED_NODE; @@ -1002,6 +1476,18 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, case Intrinsic::bswap: ISD = ISD::BSWAP; break; + case Intrinsic::ctlz: + ISD = ISD::CTLZ; + break; + case Intrinsic::ctpop: + ISD = ISD::CTPOP; + break; + case Intrinsic::cttz: + ISD = 
ISD::CTTZ; + break; + case Intrinsic::sqrt: + ISD = ISD::FSQRT; + break; } // Legalize the type. @@ -1021,6 +1507,10 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy)) return LT.first * Entry->Cost; + if (ST->hasSSE42()) + if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy)) + return LT.first * Entry->Cost; + if (ST->hasSSSE3()) if (const auto *Entry = CostTableLookup(SSSE3CostTbl, ISD, MTy)) return LT.first * Entry->Cost; @@ -1029,6 +1519,10 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy)) return LT.first * Entry->Cost; + if (ST->hasSSE1()) + if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy)) + return LT.first * Entry->Cost; + return BaseT::getIntrinsicInstrCost(IID, RetTy, Tys, FMF); } @@ -1177,17 +1671,29 @@ int X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy, return Cost+LT.first; } -int X86TTIImpl::getAddressComputationCost(Type *Ty, bool IsComplex) { +int X86TTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE, + const SCEV *Ptr) { // Address computations in vectorized code with non-consecutive addresses will // likely result in more instructions compared to scalar code where the // computation can more often be merged into the index mode. The resulting // extra micro-ops can significantly decrease throughput. unsigned NumVectorInstToHideOverhead = 10; - if (Ty->isVectorTy() && IsComplex) - return NumVectorInstToHideOverhead; + // Cost modeling of Strided Access Computation is hidden by the indexing + // modes of X86 regardless of the stride value. We dont believe that there + // is a difference between constant strided access in gerenal and constant + // strided value which is less than or equal to 64. + // Even in the case of (loop invariant) stride whose value is not known at + // compile time, the address computation will not incur more than one extra + // ADD instruction. + if (Ty->isVectorTy() && SE) { + if (!BaseT::isStridedAccess(Ptr)) + return NumVectorInstToHideOverhead; + if (!BaseT::getConstantStrideStep(SE, Ptr)) + return 1; + } - return BaseT::getAddressComputationCost(Ty, IsComplex); + return BaseT::getAddressComputationCost(Ty, SE, Ptr); } int X86TTIImpl::getReductionCost(unsigned Opcode, Type *ValTy, @@ -1352,7 +1858,7 @@ int X86TTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, // immediates here as the normal path expects bit 31 to be sign extended. if (Idx == 1 && Imm.getBitWidth() == 64 && isUInt<32>(Imm.getZExtValue())) return TTI::TCC_Free; - // Fallthrough + LLVM_FALLTHROUGH; case Instruction::Add: case Instruction::Sub: case Instruction::Mul: @@ -1556,13 +2062,14 @@ int X86TTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *SrcVTy, // Vector-4 of gather/scatter instruction does not exist on KNL. // We can extend it to 8 elements, but zeroing upper bits of // the mask vector will add more instructions. Right now we give the scalar - // cost of vector-4 for KNL. TODO: Check, maybe the gather/scatter instruction is - // better in the VariableMask case. + // cost of vector-4 for KNL. TODO: Check, maybe the gather/scatter instruction + // is better in the VariableMask case. 
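Referring back to the address-computation hunk above, and before the gather/scatter sizing check that follows, a hedged caller-side sketch; the helper name is assumed, and the ScalarEvolution and pointer are whatever loop context the client already has.

#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Value.h"
using namespace llvm;

// Illustrative only. With the new hook: a non-strided vector address still
// costs NumVectorInstToHideOverhead (10); a strided access whose step is
// loop-invariant but unknown at compile time costs 1; a constant-stride
// access falls through to the base implementation.
static int exampleAddrCost(const TargetTransformInfo &TTI, ScalarEvolution &SE,
                           Type *VecTy, Value *Ptr) {
  return TTI.getAddressComputationCost(VecTy, &SE, SE.getSCEV(Ptr));
}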
if (VF == 2 || (VF == 4 && !ST->hasVLX())) Scalarize = true; if (Scalarize) - return getGSScalarCost(Opcode, SrcVTy, VariableMask, Alignment, AddressSpace); + return getGSScalarCost(Opcode, SrcVTy, VariableMask, Alignment, + AddressSpace); return getGSVectorCost(Opcode, SrcVTy, Ptr, Alignment, AddressSpace); } @@ -1572,8 +2079,8 @@ bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy) { int DataWidth = isa<PointerType>(ScalarTy) ? DL.getPointerSizeInBits() : ScalarTy->getPrimitiveSizeInBits(); - return (DataWidth >= 32 && ST->hasAVX()) || - (DataWidth >= 8 && ST->hasBWI()); + return ((DataWidth == 32 || DataWidth == 64) && ST->hasAVX()) || + ((DataWidth == 8 || DataWidth == 16) && ST->hasBWI()); } bool X86TTIImpl::isLegalMaskedStore(Type *DataType) { @@ -1598,7 +2105,7 @@ bool X86TTIImpl::isLegalMaskedGather(Type *DataTy) { DL.getPointerSizeInBits() : ScalarTy->getPrimitiveSizeInBits(); // AVX-512 allows gather and scatter - return DataWidth >= 32 && ST->hasAVX512(); + return (DataWidth == 32 || DataWidth == 64) && ST->hasAVX512(); } bool X86TTIImpl::isLegalMaskedScatter(Type *DataType) { @@ -1620,3 +2127,122 @@ bool X86TTIImpl::areInlineCompatible(const Function *Caller, // correct. return (CallerBits & CalleeBits) == CalleeBits; } + +bool X86TTIImpl::enableInterleavedAccessVectorization() { + // TODO: We expect this to be beneficial regardless of arch, + // but there are currently some unexplained performance artifacts on Atom. + // As a temporary solution, disable on Atom. + return !(ST->isAtom() || ST->isSLM()); +} + +// Get estimation for interleaved load/store operations and strided load. +// \p Indices contains indices for strided load. +// \p Factor - the factor of interleaving. +// AVX-512 provides 3-src shuffles that significantly reduces the cost. +int X86TTIImpl::getInterleavedMemoryOpCostAVX512(unsigned Opcode, Type *VecTy, + unsigned Factor, + ArrayRef<unsigned> Indices, + unsigned Alignment, + unsigned AddressSpace) { + + // VecTy for interleave memop is <VF*Factor x Elt>. + // So, for VF=4, Interleave Factor = 3, Element type = i32 we have + // VecTy = <12 x i32>. + + // Calculate the number of memory operations (NumOfMemOps), required + // for load/store the VecTy. + MVT LegalVT = getTLI()->getTypeLegalizationCost(DL, VecTy).second; + unsigned VecTySize = DL.getTypeStoreSize(VecTy); + unsigned LegalVTSize = LegalVT.getStoreSize(); + unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize; + + // Get the cost of one memory operation. + Type *SingleMemOpTy = VectorType::get(VecTy->getVectorElementType(), + LegalVT.getVectorNumElements()); + unsigned MemOpCost = + getMemoryOpCost(Opcode, SingleMemOpTy, Alignment, AddressSpace); + + if (Opcode == Instruction::Load) { + // Kind of shuffle depends on number of loaded values. + // If we load the entire data in one register, we can use a 1-src shuffle. + // Otherwise, we'll merge 2 sources in each operation. + TTI::ShuffleKind ShuffleKind = + (NumOfMemOps > 1) ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc; + + unsigned ShuffleCost = + getShuffleCost(ShuffleKind, SingleMemOpTy, 0, nullptr); + + unsigned NumOfLoadsInInterleaveGrp = + Indices.size() ? Indices.size() : Factor; + Type *ResultTy = VectorType::get(VecTy->getVectorElementType(), + VecTy->getVectorNumElements() / Factor); + unsigned NumOfResults = + getTLI()->getTypeLegalizationCost(DL, ResultTy).first * + NumOfLoadsInInterleaveGrp; + + // About a half of the loads may be folded in shuffles when we have only + // one result. 
If we have more than one result, we do not fold loads at all. + unsigned NumOfUnfoldedLoads = + NumOfResults > 1 ? NumOfMemOps : NumOfMemOps / 2; + + // Get a number of shuffle operations per result. + unsigned NumOfShufflesPerResult = + std::max((unsigned)1, (unsigned)(NumOfMemOps - 1)); + + // The SK_MergeTwoSrc shuffle clobbers one of src operands. + // When we have more than one destination, we need additional instructions + // to keep sources. + unsigned NumOfMoves = 0; + if (NumOfResults > 1 && ShuffleKind == TTI::SK_PermuteTwoSrc) + NumOfMoves = NumOfResults * NumOfShufflesPerResult / 2; + + int Cost = NumOfResults * NumOfShufflesPerResult * ShuffleCost + + NumOfUnfoldedLoads * MemOpCost + NumOfMoves; + + return Cost; + } + + // Store. + assert(Opcode == Instruction::Store && + "Expected Store Instruction at this point"); + + // There is no strided stores meanwhile. And store can't be folded in + // shuffle. + unsigned NumOfSources = Factor; // The number of values to be merged. + unsigned ShuffleCost = + getShuffleCost(TTI::SK_PermuteTwoSrc, SingleMemOpTy, 0, nullptr); + unsigned NumOfShufflesPerStore = NumOfSources - 1; + + // The SK_MergeTwoSrc shuffle clobbers one of src operands. + // We need additional instructions to keep sources. + unsigned NumOfMoves = NumOfMemOps * NumOfShufflesPerStore / 2; + int Cost = NumOfMemOps * (MemOpCost + NumOfShufflesPerStore * ShuffleCost) + + NumOfMoves; + return Cost; +} + +int X86TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, + unsigned Factor, + ArrayRef<unsigned> Indices, + unsigned Alignment, + unsigned AddressSpace) { + auto isSupportedOnAVX512 = [](Type *VecTy, bool &RequiresBW) { + RequiresBW = false; + Type *EltTy = VecTy->getVectorElementType(); + if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(64) || + EltTy->isIntegerTy(32) || EltTy->isPointerTy()) + return true; + if (EltTy->isIntegerTy(16) || EltTy->isIntegerTy(8)) { + RequiresBW = true; + return true; + } + return false; + }; + bool RequiresBW; + bool HasAVX512Solution = isSupportedOnAVX512(VecTy, RequiresBW); + if (ST->hasAVX512() && HasAVX512Solution && (!RequiresBW || ST->hasBWI())) + return getInterleavedMemoryOpCostAVX512(Opcode, VecTy, Factor, Indices, + Alignment, AddressSpace); + return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, + Alignment, AddressSpace); +} diff --git a/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.h b/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.h index ab8046b..ecaaf95 100644 --- a/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.h +++ b/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.h @@ -43,13 +43,6 @@ public: : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)), TLI(ST->getTargetLowering()) {} - // Provide value semantics. MSVC requires that we spell all of these out. 
- X86TTIImpl(const X86TTIImpl &Arg) - : BaseT(static_cast<const BaseT &>(Arg)), ST(Arg.ST), TLI(Arg.TLI) {} - X86TTIImpl(X86TTIImpl &&Arg) - : BaseT(std::move(static_cast<BaseT &>(Arg))), ST(std::move(Arg.ST)), - TLI(std::move(Arg.TLI)) {} - /// \name Scalar TTI Implementations /// @{ TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth); @@ -67,7 +60,8 @@ public: TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue, TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None, - TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None); + TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None, + ArrayRef<const Value *> Args = ArrayRef<const Value *>()); int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp); int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src); int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy); @@ -78,7 +72,8 @@ public: unsigned AddressSpace); int getGatherScatterOpCost(unsigned Opcode, Type *DataTy, Value *Ptr, bool VariableMask, unsigned Alignment); - int getAddressComputationCost(Type *PtrTy, bool IsComplex); + int getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE, + const SCEV *Ptr); int getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, ArrayRef<Type *> Tys, FastMathFlags FMF); @@ -87,6 +82,13 @@ public: int getReductionCost(unsigned Opcode, Type *Ty, bool IsPairwiseForm); + int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, + unsigned Factor, ArrayRef<unsigned> Indices, + unsigned Alignment, unsigned AddressSpace); + int getInterleavedMemoryOpCostAVX512(unsigned Opcode, Type *VecTy, + unsigned Factor, ArrayRef<unsigned> Indices, + unsigned Alignment, unsigned AddressSpace); + int getIntImmCost(int64_t); int getIntImmCost(const APInt &Imm, Type *Ty); @@ -100,6 +102,8 @@ public: bool isLegalMaskedScatter(Type *DataType); bool areInlineCompatible(const Function *Caller, const Function *Callee) const; + + bool enableInterleavedAccessVectorization(); private: int getGSScalarCost(unsigned Opcode, Type *DataTy, bool VariableMask, unsigned Alignment, unsigned AddressSpace); diff --git a/contrib/llvm/lib/Target/X86/X86VZeroUpper.cpp b/contrib/llvm/lib/Target/X86/X86VZeroUpper.cpp index 9320e1e..9766b84 100644 --- a/contrib/llvm/lib/Target/X86/X86VZeroUpper.cpp +++ b/contrib/llvm/lib/Target/X86/X86VZeroUpper.cpp @@ -40,9 +40,9 @@ namespace { bool runOnMachineFunction(MachineFunction &MF) override; MachineFunctionProperties getRequiredProperties() const override { return MachineFunctionProperties().set( - MachineFunctionProperties::Property::AllVRegsAllocated); + MachineFunctionProperties::Property::NoVRegs); } - const char *getPassName() const override {return "X86 vzeroupper inserter";} + StringRef getPassName() const override { return "X86 vzeroupper inserter"; } private: diff --git a/contrib/llvm/lib/Target/X86/X86WinAllocaExpander.cpp b/contrib/llvm/lib/Target/X86/X86WinAllocaExpander.cpp index cc82074..fc08f15 100644 --- a/contrib/llvm/lib/Target/X86/X86WinAllocaExpander.cpp +++ b/contrib/llvm/lib/Target/X86/X86WinAllocaExpander.cpp @@ -63,7 +63,7 @@ private: unsigned SlotSize; int64_t StackProbeSize; - const char *getPassName() const override { return "X86 WinAlloca Expander"; } + StringRef getPassName() const override { return "X86 WinAlloca Expander"; } static char ID; }; @@ -225,6 +225,7 @@ void X86WinAllocaExpander::lower(MachineInstr* MI, Lowering L) { break; // Fall through to make any remaining adjustment. 
+    LLVM_FALLTHROUGH;
   case Sub:
     assert(Amount > 0);
     if (Amount == SlotSize) {
diff --git a/contrib/llvm/lib/Target/X86/X86WinEHState.cpp b/contrib/llvm/lib/Target/X86/X86WinEHState.cpp
index 99387ed..bc14630 100644
--- a/contrib/llvm/lib/Target/X86/X86WinEHState.cpp
+++ b/contrib/llvm/lib/Target/X86/X86WinEHState.cpp
@@ -57,7 +57,7 @@ public:
   void getAnalysisUsage(AnalysisUsage &AU) const override;
-  const char *getPassName() const override {
+  StringRef getPassName() const override {
     return "Windows 32-bit x86 EH state insertion";
   }
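The closing hunks repeat two mechanical migrations across the X86 passes: getPassName() now returns StringRef instead of const char *, and intentional fall-throughs are annotated with LLVM_FALLTHROUGH. A minimal sketch of the resulting pattern; the pass class and its name are made up for illustration.

#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/Support/Compiler.h" // LLVM_FALLTHROUGH
using namespace llvm;

namespace {
struct ExamplePass : public MachineFunctionPass {
  static char ID;
  ExamplePass() : MachineFunctionPass(ID) {}

  bool runOnMachineFunction(MachineFunction &MF) override {
    switch (MF.size()) {
    case 0:
      // Annotated fall-through, as in the X86WinAllocaExpander hunk.
      LLVM_FALLTHROUGH;
    default:
      return false;
    }
  }

  // New-style override: StringRef rather than const char *.
  StringRef getPassName() const override { return "X86 example pass"; }
};
char ExamplePass::ID = 0;
} // end anonymous namespace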