Diffstat (limited to 'contrib/llvm/lib/Target/X86')
-rw-r--r--  contrib/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp  585
-rw-r--r--  contrib/llvm/lib/Target/X86/AsmParser/X86Operand.h  4
-rw-r--r--  contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp  24
-rw-r--r--  contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp  12
-rw-r--r--  contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h  2
-rw-r--r--  contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp  3
-rw-r--r--  contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.cpp  254
-rw-r--r--  contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.h  5
-rw-r--r--  contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp  3
-rw-r--r--  contrib/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp  18
-rw-r--r--  contrib/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h  244
-rw-r--r--  contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp  3
-rw-r--r--  contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp  195
-rw-r--r--  contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp  6
-rw-r--r--  contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h  10
-rw-r--r--  contrib/llvm/lib/Target/X86/TargetInfo/X86TargetInfo.cpp  17
-rw-r--r--  contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.cpp  26
-rw-r--r--  contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.h  2
-rw-r--r--  contrib/llvm/lib/Target/X86/X86.h  7
-rw-r--r--  contrib/llvm/lib/Target/X86/X86.td  75
-rw-r--r--  contrib/llvm/lib/Target/X86/X86AsmPrinter.cpp  18
-rw-r--r--  contrib/llvm/lib/Target/X86/X86AsmPrinter.h  27
-rw-r--r--  contrib/llvm/lib/Target/X86/X86CallFrameOptimization.cpp  14
-rw-r--r--  contrib/llvm/lib/Target/X86/X86CallLowering.cpp  46
-rw-r--r--  contrib/llvm/lib/Target/X86/X86CallLowering.h  39
-rw-r--r--  contrib/llvm/lib/Target/X86/X86CallingConv.cpp  208
-rw-r--r--  contrib/llvm/lib/Target/X86/X86CallingConv.h  38
-rw-r--r--  contrib/llvm/lib/Target/X86/X86CallingConv.td  276
-rwxr-xr-x  contrib/llvm/lib/Target/X86/X86EvexToVex.cpp  213
-rw-r--r--  contrib/llvm/lib/Target/X86/X86ExpandPseudo.cpp  24
-rw-r--r--  contrib/llvm/lib/Target/X86/X86FastISel.cpp  331
-rw-r--r--  contrib/llvm/lib/Target/X86/X86FixupBWInsts.cpp  10
-rw-r--r--  contrib/llvm/lib/Target/X86/X86FixupLEAs.cpp  4
-rw-r--r--  contrib/llvm/lib/Target/X86/X86FixupSetCC.cpp  5
-rw-r--r--  contrib/llvm/lib/Target/X86/X86FloatingPoint.cpp  51
-rw-r--r--  contrib/llvm/lib/Target/X86/X86FrameLowering.cpp  273
-rw-r--r--  contrib/llvm/lib/Target/X86/X86FrameLowering.h  30
-rw-r--r--  contrib/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp  140
-rw-r--r--  contrib/llvm/lib/Target/X86/X86ISelLowering.cpp  8595
-rw-r--r--  contrib/llvm/lib/Target/X86/X86ISelLowering.h  209
-rw-r--r--  contrib/llvm/lib/Target/X86/X86InstrAVX512.td  3131
-rw-r--r--  contrib/llvm/lib/Target/X86/X86InstrArithmetic.td  2
-rw-r--r--  contrib/llvm/lib/Target/X86/X86InstrBuilder.h  74
-rw-r--r--  contrib/llvm/lib/Target/X86/X86InstrCompiler.td  74
-rw-r--r--  contrib/llvm/lib/Target/X86/X86InstrControl.td  22
-rw-r--r--  contrib/llvm/lib/Target/X86/X86InstrFMA.td  176
-rw-r--r--  contrib/llvm/lib/Target/X86/X86InstrFMA3Info.cpp  285
-rw-r--r--  contrib/llvm/lib/Target/X86/X86InstrFMA3Info.h  315
-rw-r--r--  contrib/llvm/lib/Target/X86/X86InstrFPStack.td  12
-rw-r--r--  contrib/llvm/lib/Target/X86/X86InstrFormats.td  149
-rw-r--r--  contrib/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td  311
-rw-r--r--  contrib/llvm/lib/Target/X86/X86InstrInfo.cpp  3727
-rw-r--r--  contrib/llvm/lib/Target/X86/X86InstrInfo.h  76
-rw-r--r--  contrib/llvm/lib/Target/X86/X86InstrInfo.td  47
-rw-r--r--  contrib/llvm/lib/Target/X86/X86InstrMMX.td  9
-rw-r--r--  contrib/llvm/lib/Target/X86/X86InstrSSE.td  2029
-rw-r--r--  contrib/llvm/lib/Target/X86/X86InstrShiftRotate.td  37
-rw-r--r--  contrib/llvm/lib/Target/X86/X86InstrSystem.td  9
-rwxr-xr-x  contrib/llvm/lib/Target/X86/X86InstrTablesInfo.h  1162
-rw-r--r--  contrib/llvm/lib/Target/X86/X86InstrXOP.td  87
-rw-r--r--  contrib/llvm/lib/Target/X86/X86InterleavedAccess.cpp  221
-rw-r--r--  contrib/llvm/lib/Target/X86/X86IntrinsicsInfo.h  782
-rw-r--r--  contrib/llvm/lib/Target/X86/X86MCInstLower.cpp  264
-rw-r--r--  contrib/llvm/lib/Target/X86/X86OptimizeLEAs.cpp  18
-rw-r--r--  contrib/llvm/lib/Target/X86/X86PadShortFunction.cpp  4
-rw-r--r--  contrib/llvm/lib/Target/X86/X86RegisterInfo.cpp  133
-rw-r--r--  contrib/llvm/lib/Target/X86/X86RegisterInfo.h  5
-rw-r--r--  contrib/llvm/lib/Target/X86/X86RegisterInfo.td  2
-rw-r--r--  contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp  4
-rw-r--r--  contrib/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp  403
-rw-r--r--  contrib/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.h  4
-rw-r--r--  contrib/llvm/lib/Target/X86/X86Subtarget.cpp  28
-rw-r--r--  contrib/llvm/lib/Target/X86/X86Subtarget.h  38
-rw-r--r--  contrib/llvm/lib/Target/X86/X86TargetMachine.cpp  105
-rw-r--r--  contrib/llvm/lib/Target/X86/X86TargetMachine.h  2
-rw-r--r--  contrib/llvm/lib/Target/X86/X86TargetObjectFile.cpp  34
-rw-r--r--  contrib/llvm/lib/Target/X86/X86TargetObjectFile.h  24
-rw-r--r--  contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp  1144
-rw-r--r--  contrib/llvm/lib/Target/X86/X86TargetTransformInfo.h  22
-rw-r--r--  contrib/llvm/lib/Target/X86/X86VZeroUpper.cpp  4
-rw-r--r--  contrib/llvm/lib/Target/X86/X86WinAllocaExpander.cpp  3
-rw-r--r--  contrib/llvm/lib/Target/X86/X86WinEHState.cpp  2
82 files changed, 18583 insertions, 8443 deletions
diff --git a/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
index 4e0ad8bf..e692118 100644
--- a/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
+++ b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
@@ -59,6 +59,7 @@ class X86AsmParser : public MCTargetAsmParser {
const MCInstrInfo &MII;
ParseInstructionInfo *InstInfo;
std::unique_ptr<X86AsmInstrumentation> Instrumentation;
+ bool Code16GCC;
private:
SMLoc consumeToken() {
@@ -68,6 +69,19 @@ private:
return Result;
}
+ unsigned MatchInstruction(const OperandVector &Operands, MCInst &Inst,
+ uint64_t &ErrorInfo, bool matchingInlineAsm,
+ unsigned VariantID = 0) {
+ // In Code16GCC mode, match as 32-bit.
+ if (Code16GCC)
+ SwitchMode(X86::Mode32Bit);
+ unsigned rv = MatchInstructionImpl(Operands, Inst, ErrorInfo,
+ matchingInlineAsm, VariantID);
+ if (Code16GCC)
+ SwitchMode(X86::Mode16Bit);
+ return rv;
+ }
+
enum InfixCalculatorTok {
IC_OR = 0,
IC_XOR,
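
The new MatchInstruction wrapper above temporarily matches instructions as if in 32-bit mode while the .code16gcc flag is set, then restores 16-bit mode before returning. A minimal standalone sketch of that pattern, not the LLVM API; Mode, MatcherState and matchImpl are hypothetical stand-ins:

    enum class Mode { Bit16, Bit32 };

    struct MatcherState {
      Mode CurrentMode = Mode::Bit16;
      bool Code16GCC = false;

      // Stand-in for MatchInstructionImpl: report which mode was active.
      unsigned matchImpl() const { return CurrentMode == Mode::Bit32 ? 32u : 16u; }

      // Mirrors the wrapper: under .code16gcc, match as 32-bit, then drop
      // back to 16-bit mode so encoding still happens for 16-bit code.
      unsigned match() {
        if (Code16GCC)
          CurrentMode = Mode::Bit32;
        unsigned Result = matchImpl();
        if (Code16GCC)
          CurrentMode = Mode::Bit16;
        return Result;
      }
    };

    int main() {
      MatcherState M;
      M.Code16GCC = true;
      // Matched as 32-bit, but the parser is back in 16-bit mode afterwards.
      return (M.match() == 32 && M.CurrentMode == Mode::Bit16) ? 0 : 1;
    }
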
@@ -659,20 +673,15 @@ private:
}
};
- bool Error(SMLoc L, const Twine &Msg,
- ArrayRef<SMRange> Ranges = None,
+ bool Error(SMLoc L, const Twine &Msg, SMRange Range = None,
bool MatchingInlineAsm = false) {
MCAsmParser &Parser = getParser();
- if (MatchingInlineAsm) return true;
- return Parser.Error(L, Msg, Ranges);
- }
-
- bool ErrorAndEatStatement(SMLoc L, const Twine &Msg,
- ArrayRef<SMRange> Ranges = None,
- bool MatchingInlineAsm = false) {
- MCAsmParser &Parser = getParser();
- Parser.eatToEndOfStatement();
- return Error(L, Msg, Ranges, MatchingInlineAsm);
+ if (MatchingInlineAsm) {
+ if (!getLexer().isAtStartOfStatement())
+ Parser.eatToEndOfStatement();
+ return false;
+ }
+ return Parser.Error(L, Msg, Range);
}
std::nullptr_t ErrorOperand(SMLoc Loc, StringRef Msg) {
@@ -698,14 +707,11 @@ private:
std::unique_ptr<X86Operand> ParseIntelOperator(unsigned OpKind);
std::unique_ptr<X86Operand>
ParseIntelSegmentOverride(unsigned SegReg, SMLoc Start, unsigned Size);
- std::unique_ptr<X86Operand>
- ParseIntelMemOperand(int64_t ImmDisp, SMLoc StartLoc, unsigned Size);
std::unique_ptr<X86Operand> ParseRoundingModeOp(SMLoc Start, SMLoc End);
bool ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End);
- std::unique_ptr<X86Operand> ParseIntelBracExpression(unsigned SegReg,
- SMLoc Start,
- int64_t ImmDisp,
- unsigned Size);
+ std::unique_ptr<X86Operand>
+ ParseIntelBracExpression(unsigned SegReg, SMLoc Start, int64_t ImmDisp,
+ bool isSymbol, unsigned Size);
bool ParseIntelIdentifier(const MCExpr *&Val, StringRef &Identifier,
InlineAsmIdentifierInfo &Info,
bool IsUnevaluatedOperand, SMLoc &End);
@@ -716,7 +722,8 @@ private:
CreateMemForInlineAsm(unsigned SegReg, const MCExpr *Disp, unsigned BaseReg,
unsigned IndexReg, unsigned Scale, SMLoc Start,
SMLoc End, unsigned Size, StringRef Identifier,
- InlineAsmIdentifierInfo &Info);
+ InlineAsmIdentifierInfo &Info,
+ bool AllowBetterSizeMatch = false);
bool parseDirectiveEven(SMLoc L);
bool ParseDirectiveWord(unsigned Size, SMLoc L);
@@ -753,10 +760,17 @@ private:
/// Parses AVX512 specific operand primitives: masked registers ({%k<NUM>}, {z})
/// and memory broadcasting ({1to<NUM>}) primitives, updating Operands vector if required.
- /// \return \c true if no parsing errors occurred, \c false otherwise.
+ /// return false if no parsing errors occurred, true otherwise.
bool HandleAVX512Operand(OperandVector &Operands,
const MCParsedAsmOperand &Op);
+ bool ParseZ(std::unique_ptr<X86Operand> &Z, const SMLoc &StartLoc);
+
+ /// MS-compatibility:
+ /// Obtain an appropriate size qualifier, when one is absent, for an
+ /// AVX512 vector/broadcast memory operand
+ unsigned AdjustAVX512Mem(unsigned Size, X86Operand* UnsizedMemOpNext);
+
bool is64BitMode() const {
// FIXME: Can tablegen auto-generate this?
return getSTI().getFeatureBits()[X86::Mode64Bit];
@@ -802,7 +816,8 @@ private:
public:
X86AsmParser(const MCSubtargetInfo &sti, MCAsmParser &Parser,
const MCInstrInfo &mii, const MCTargetOptions &Options)
- : MCTargetAsmParser(Options, sti), MII(mii), InstInfo(nullptr) {
+ : MCTargetAsmParser(Options, sti), MII(mii), InstInfo(nullptr),
+ Code16GCC(false) {
// Initialize the set of available features.
setAvailableFeatures(ComputeAvailableFeatures(getSTI().getFeatureBits()));
@@ -833,6 +848,11 @@ static bool CheckBaseRegAndIndexReg(unsigned BaseReg, unsigned IndexReg,
// If we have both a base register and an index register make sure they are
// both 64-bit or 32-bit registers.
// To support VSIB, IndexReg can be 128-bit or 256-bit registers.
+
+ if ((BaseReg == X86::RIP && IndexReg != 0) || (IndexReg == X86::RIP)) {
+ ErrMsg = "invalid base+index expression";
+ return true;
+ }
if (BaseReg != 0 && IndexReg != 0) {
if (X86MCRegisterClasses[X86::GR64RegClassID].contains(BaseReg) &&
(X86MCRegisterClasses[X86::GR16RegClassID].contains(IndexReg) ||
@@ -907,8 +927,7 @@ bool X86AsmParser::ParseRegister(unsigned &RegNo,
if (RegNo == X86::RIZ ||
X86MCRegisterClasses[X86::GR64RegClassID].contains(RegNo) ||
X86II::isX86_64NonExtLowByteReg(RegNo) ||
- X86II::isX86_64ExtendedReg(RegNo) ||
- X86II::is32ExtendedReg(RegNo))
+ X86II::isX86_64ExtendedReg(RegNo))
return Error(StartLoc, "register %"
+ Tok.getString() + " is only available in 64-bit mode",
SMRange(StartLoc, EndLoc));
@@ -992,20 +1011,20 @@ void X86AsmParser::SetFrameRegister(unsigned RegNo) {
}
std::unique_ptr<X86Operand> X86AsmParser::DefaultMemSIOperand(SMLoc Loc) {
- unsigned basereg =
- is64BitMode() ? X86::RSI : (is32BitMode() ? X86::ESI : X86::SI);
+ bool Parse32 = is32BitMode() || Code16GCC;
+ unsigned Basereg = is64BitMode() ? X86::RSI : (Parse32 ? X86::ESI : X86::SI);
const MCExpr *Disp = MCConstantExpr::create(0, getContext());
return X86Operand::CreateMem(getPointerWidth(), /*SegReg=*/0, Disp,
- /*BaseReg=*/basereg, /*IndexReg=*/0, /*Scale=*/1,
+ /*BaseReg=*/Basereg, /*IndexReg=*/0, /*Scale=*/1,
Loc, Loc, 0);
}
std::unique_ptr<X86Operand> X86AsmParser::DefaultMemDIOperand(SMLoc Loc) {
- unsigned basereg =
- is64BitMode() ? X86::RDI : (is32BitMode() ? X86::EDI : X86::DI);
+ bool Parse32 = is32BitMode() || Code16GCC;
+ unsigned Basereg = is64BitMode() ? X86::RDI : (Parse32 ? X86::EDI : X86::DI);
const MCExpr *Disp = MCConstantExpr::create(0, getContext());
return X86Operand::CreateMem(getPointerWidth(), /*SegReg=*/0, Disp,
- /*BaseReg=*/basereg, /*IndexReg=*/0, /*Scale=*/1,
+ /*BaseReg=*/Basereg, /*IndexReg=*/0, /*Scale=*/1,
Loc, Loc, 0);
}
@@ -1159,7 +1178,7 @@ static unsigned getIntelMemOperandSize(StringRef OpStr) {
std::unique_ptr<X86Operand> X86AsmParser::CreateMemForInlineAsm(
unsigned SegReg, const MCExpr *Disp, unsigned BaseReg, unsigned IndexReg,
unsigned Scale, SMLoc Start, SMLoc End, unsigned Size, StringRef Identifier,
- InlineAsmIdentifierInfo &Info) {
+ InlineAsmIdentifierInfo &Info, bool AllowBetterSizeMatch) {
// If we found a decl other than a VarDecl, then assume it is a FuncDecl or
// some other label reference.
if (isa<MCSymbolRefExpr>(Disp) && Info.OpDecl && !Info.IsVarDecl) {
@@ -1188,6 +1207,13 @@ std::unique_ptr<X86Operand> X86AsmParser::CreateMemForInlineAsm(
if (Size)
InstInfo->AsmRewrites->emplace_back(AOK_SizeDirective, Start,
/*Len=*/0, Size);
+ if (AllowBetterSizeMatch)
+ // Handle cases where size qualifier is absent, upon an indirect symbol
+ // reference - e.g. "vaddps zmm1, zmm2, [var]"
+ // set Size to zero to allow matching mechansim to try and find a better
+ // size qualifier than our initial guess, based on available variants of
+ // the given instruction
+ Size = 0;
}
}
@@ -1271,7 +1297,7 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) {
// The period in the dot operator (e.g., [ebx].foo.bar) is parsed as an
// identifier. Don't try to parse it as a register.
- if (Tok.getString().startswith("."))
+ if (PrevTK != AsmToken::Error && Tok.getString().startswith("."))
break;
// If we're parsing an immediate expression, we don't expect a '['.
@@ -1386,7 +1412,8 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) {
std::unique_ptr<X86Operand>
X86AsmParser::ParseIntelBracExpression(unsigned SegReg, SMLoc Start,
- int64_t ImmDisp, unsigned Size) {
+ int64_t ImmDisp, bool isSymbol,
+ unsigned Size) {
MCAsmParser &Parser = getParser();
const AsmToken &Tok = Parser.getTok();
SMLoc BracLoc = Tok.getLoc(), End = Tok.getEndLoc();
@@ -1436,6 +1463,21 @@ X86AsmParser::ParseIntelBracExpression(unsigned SegReg, SMLoc Start,
Disp = NewDisp;
}
+ if (isSymbol) {
+ if (SM.getSym()) {
+ Error(Start, "cannot use more than one symbol in memory operand");
+ return nullptr;
+ }
+ if (SM.getBaseReg()) {
+ Error(Start, "cannot use base register with variable reference");
+ return nullptr;
+ }
+ if (SM.getIndexReg()) {
+ Error(Start, "cannot use index register with variable reference");
+ return nullptr;
+ }
+ }
+
int BaseReg = SM.getBaseReg();
int IndexReg = SM.getIndexReg();
int Scale = SM.getScale();
@@ -1458,7 +1500,8 @@ X86AsmParser::ParseIntelBracExpression(unsigned SegReg, SMLoc Start,
InlineAsmIdentifierInfo &Info = SM.getIdentifierInfo();
return CreateMemForInlineAsm(SegReg, Disp, BaseReg, IndexReg, Scale, Start,
- End, Size, SM.getSymName(), Info);
+ End, Size, SM.getSymName(), Info,
+ isParsingInlineAsm());
}
// Inline assembly may use variable names with namespace alias qualifiers.
@@ -1541,7 +1584,7 @@ X86AsmParser::ParseIntelSegmentOverride(unsigned SegReg, SMLoc Start,
}
if (getLexer().is(AsmToken::LBrac))
- return ParseIntelBracExpression(SegReg, Start, ImmDisp, Size);
+ return ParseIntelBracExpression(SegReg, Start, ImmDisp, false, Size);
const MCExpr *Val;
SMLoc End;
@@ -1598,66 +1641,6 @@ X86AsmParser::ParseRoundingModeOp(SMLoc Start, SMLoc End) {
}
return ErrorOperand(Tok.getLoc(), "unknown token in expression");
}
-/// ParseIntelMemOperand - Parse intel style memory operand.
-std::unique_ptr<X86Operand> X86AsmParser::ParseIntelMemOperand(int64_t ImmDisp,
- SMLoc Start,
- unsigned Size) {
- MCAsmParser &Parser = getParser();
- const AsmToken &Tok = Parser.getTok();
- SMLoc End;
-
- // Parse ImmDisp [ BaseReg + Scale*IndexReg + Disp ].
- if (getLexer().is(AsmToken::LBrac))
- return ParseIntelBracExpression(/*SegReg=*/0, Start, ImmDisp, Size);
- assert(ImmDisp == 0);
-
- const MCExpr *Val;
- if (!isParsingInlineAsm()) {
- if (getParser().parsePrimaryExpr(Val, End))
- return ErrorOperand(Tok.getLoc(), "unknown token in expression");
-
- return X86Operand::CreateMem(getPointerWidth(), Val, Start, End, Size);
- }
-
- InlineAsmIdentifierInfo Info;
- StringRef Identifier = Tok.getString();
- if (ParseIntelIdentifier(Val, Identifier, Info,
- /*Unevaluated=*/false, End))
- return nullptr;
-
- if (!getLexer().is(AsmToken::LBrac))
- return CreateMemForInlineAsm(/*SegReg=*/0, Val, /*BaseReg=*/0, /*IndexReg=*/0,
- /*Scale=*/1, Start, End, Size, Identifier, Info);
-
- Parser.Lex(); // Eat '['
-
- // Parse Identifier [ ImmDisp ]
- IntelExprStateMachine SM(/*ImmDisp=*/0, /*StopOnLBrac=*/true,
- /*AddImmPrefix=*/false);
- if (ParseIntelExpression(SM, End))
- return nullptr;
-
- if (SM.getSym()) {
- Error(Start, "cannot use more than one symbol in memory operand");
- return nullptr;
- }
- if (SM.getBaseReg()) {
- Error(Start, "cannot use base register with variable reference");
- return nullptr;
- }
- if (SM.getIndexReg()) {
- Error(Start, "cannot use index register with variable reference");
- return nullptr;
- }
-
- const MCExpr *Disp = MCConstantExpr::create(SM.getImm(), getContext());
- // BaseReg is non-zero to avoid assertions. In the context of inline asm,
- // we're pointing to a local variable in memory, so the base register is
- // really the frame or stack pointer.
- return X86Operand::CreateMem(getPointerWidth(), /*SegReg=*/0, Disp,
- /*BaseReg=*/1, /*IndexReg=*/0, /*Scale=*/1,
- Start, End, Size, Identifier, Info.OpDecl);
-}
/// Parse the '.' operator.
bool X86AsmParser::ParseIntelDotOperator(const MCExpr *Disp,
@@ -1725,8 +1708,9 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOffsetOfOperator() {
// The offset operator will have an 'r' constraint, thus we need to create
// register operand to ensure proper matching. Just pick a GPR based on
// the size of a pointer.
- unsigned RegNo =
- is64BitMode() ? X86::RBX : (is32BitMode() ? X86::EBX : X86::BX);
+ bool Parse32 = is32BitMode() || Code16GCC;
+ unsigned RegNo = is64BitMode() ? X86::RBX : (Parse32 ? X86::EBX : X86::BX);
+
return X86Operand::CreateReg(RegNo, Start, End, /*GetAddress=*/true,
OffsetOfLoc, Identifier, Info.OpDecl);
}
@@ -1804,49 +1788,8 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOperand() {
Parser.Lex(); // Eat ptr.
PtrInOperand = true;
}
- Start = Tok.getLoc();
- // Immediate.
- if (getLexer().is(AsmToken::Integer) || getLexer().is(AsmToken::Minus) ||
- getLexer().is(AsmToken::Tilde) || getLexer().is(AsmToken::LParen)) {
- AsmToken StartTok = Tok;
- IntelExprStateMachine SM(/*Imm=*/0, /*StopOnLBrac=*/true,
- /*AddImmPrefix=*/false);
- if (ParseIntelExpression(SM, End))
- return nullptr;
-
- int64_t Imm = SM.getImm();
- if (isParsingInlineAsm()) {
- unsigned Len = Tok.getLoc().getPointer() - Start.getPointer();
- if (StartTok.getString().size() == Len)
- // Just add a prefix if this wasn't a complex immediate expression.
- InstInfo->AsmRewrites->emplace_back(AOK_ImmPrefix, Start);
- else
- // Otherwise, rewrite the complex expression as a single immediate.
- InstInfo->AsmRewrites->emplace_back(AOK_Imm, Start, Len, Imm);
- }
-
- if (getLexer().isNot(AsmToken::LBrac)) {
- // If a directional label (ie. 1f or 2b) was parsed above from
- // ParseIntelExpression() then SM.getSym() was set to a pointer to
- // to the MCExpr with the directional local symbol and this is a
- // memory operand not an immediate operand.
- if (SM.getSym())
- return X86Operand::CreateMem(getPointerWidth(), SM.getSym(), Start, End,
- Size);
-
- const MCExpr *ImmExpr = MCConstantExpr::create(Imm, getContext());
- return X86Operand::CreateImm(ImmExpr, Start, End);
- }
-
- // Only positive immediates are valid.
- if (Imm < 0)
- return ErrorOperand(Start, "expected a positive immediate displacement "
- "before bracketed expr.");
-
- // Parse ImmDisp [ BaseReg + Scale*IndexReg + Disp ].
- return ParseIntelMemOperand(Imm, Start, Size);
- }
+ Start = Tok.getLoc();
// rounding mode token
if (getSTI().getFeatureBits()[X86::FeatureAVX512] &&
@@ -1855,24 +1798,78 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOperand() {
// Register.
unsigned RegNo = 0;
- if (!ParseRegister(RegNo, Start, End)) {
+ if (getLexer().is(AsmToken::Identifier) &&
+ !ParseRegister(RegNo, Start, End)) {
// If this is a segment register followed by a ':', then this is the start
// of a segment override, otherwise this is a normal register reference.
- // In case it is a normal register and there is ptr in the operand this
+ // In case it is a normal register and there is ptr in the operand this
// is an error
- if (getLexer().isNot(AsmToken::Colon)){
- if (PtrInOperand){
+ if (RegNo == X86::RIP)
+ return ErrorOperand(Start, "rip can only be used as a base register");
+ if (getLexer().isNot(AsmToken::Colon)) {
+ if (PtrInOperand) {
return ErrorOperand(Start, "expected memory operand after "
"'ptr', found register operand instead");
}
return X86Operand::CreateReg(RegNo, Start, End);
}
-
return ParseIntelSegmentOverride(/*SegReg=*/RegNo, Start, Size);
}
- // Memory operand.
- return ParseIntelMemOperand(/*Disp=*/0, Start, Size);
+ // Immediates and Memory
+
+ // Parse [ BaseReg + Scale*IndexReg + Disp ].
+ if (getLexer().is(AsmToken::LBrac))
+ return ParseIntelBracExpression(/*SegReg=*/0, Start, /*ImmDisp=*/0, false,
+ Size);
+
+ AsmToken StartTok = Tok;
+ IntelExprStateMachine SM(/*Imm=*/0, /*StopOnLBrac=*/true,
+ /*AddImmPrefix=*/false);
+ if (ParseIntelExpression(SM, End))
+ return nullptr;
+
+ bool isSymbol = SM.getSym() && SM.getSym()->getKind() != MCExpr::Constant;
+ int64_t Imm = SM.getImm();
+ if (SM.getSym() && SM.getSym()->getKind() == MCExpr::Constant)
+ SM.getSym()->evaluateAsAbsolute(Imm);
+
+ if (StartTok.isNot(AsmToken::Identifier) &&
+ StartTok.isNot(AsmToken::String) && isParsingInlineAsm()) {
+ unsigned Len = Tok.getLoc().getPointer() - Start.getPointer();
+ if (StartTok.getString().size() == Len)
+ // Just add a prefix if this wasn't a complex immediate expression.
+ InstInfo->AsmRewrites->emplace_back(AOK_ImmPrefix, Start);
+ else
+ // Otherwise, rewrite the complex expression as a single immediate.
+ InstInfo->AsmRewrites->emplace_back(AOK_Imm, Start, Len, Imm);
+ }
+
+ if (getLexer().isNot(AsmToken::LBrac)) {
+ // If a directional label (ie. 1f or 2b) was parsed above from
+ // ParseIntelExpression() then SM.getSym() was set to a pointer to
+ // to the MCExpr with the directional local symbol and this is a
+ // memory operand not an immediate operand.
+ if (isSymbol) {
+ if (isParsingInlineAsm())
+ return CreateMemForInlineAsm(/*SegReg=*/0, SM.getSym(), /*BaseReg=*/0,
+ /*IndexReg=*/0,
+ /*Scale=*/1, Start, End, Size,
+ SM.getSymName(), SM.getIdentifierInfo());
+ return X86Operand::CreateMem(getPointerWidth(), SM.getSym(), Start, End,
+ Size);
+ }
+
+ const MCExpr *ImmExpr = MCConstantExpr::create(Imm, getContext());
+ return X86Operand::CreateImm(ImmExpr, Start, End);
+ }
+
+ // Only positive immediates are valid.
+ if (Imm < 0)
+ return ErrorOperand(Start, "expected a positive immediate displacement "
+ "before bracketed expr.");
+
+ return ParseIntelBracExpression(/*SegReg=*/0, Start, Imm, isSymbol, Size);
}
std::unique_ptr<X86Operand> X86AsmParser::ParseATTOperand() {
@@ -1891,6 +1888,11 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseATTOperand() {
SMRange(Start, End));
return nullptr;
}
+ if (RegNo == X86::RIP) {
+ Error(Start, "%rip can only be used as a base register",
+ SMRange(Start, End));
+ return nullptr;
+ }
// If this is a segment register followed by a ':', then this is the start
// of a memory reference, otherwise this is a normal register reference.
@@ -1916,11 +1918,33 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseATTOperand() {
SMLoc Start = Parser.getTok().getLoc(), End;
if (getSTI().getFeatureBits()[X86::FeatureAVX512])
return ParseRoundingModeOp(Start, End);
- return ErrorOperand(Start, "unknown token in expression");
+ return ErrorOperand(Start, "Unexpected '{' in expression");
}
}
}
+// true on failure, false otherwise
+// If no {z} mark was found - Parser doesn't advance
+bool X86AsmParser::ParseZ(std::unique_ptr<X86Operand> &Z,
+ const SMLoc &StartLoc) {
+ MCAsmParser &Parser = getParser();
+ // Assuming we have just passed the '{' mark, query the next token.
+ // If no {z} is found, return false, as no parsing error was
+ // encountered.
+ if (!(getLexer().is(AsmToken::Identifier) &&
+ (getLexer().getTok().getIdentifier() == "z")))
+ return false;
+ Parser.Lex(); // Eat z
+ // Query and eat the '}' mark
+ if (!getLexer().is(AsmToken::RCurly))
+ return Error(getLexer().getLoc(), "Expected } at this point");
+ Parser.Lex(); // Eat '}'
+ // Assign Z the {z} mark operand
+ Z = X86Operand::CreateToken("{z}", StartLoc);
+ return false;
+}
+
+// true on failure, false otherwise
bool X86AsmParser::HandleAVX512Operand(OperandVector &Operands,
const MCParsedAsmOperand &Op) {
MCAsmParser &Parser = getParser();
@@ -1932,13 +1956,11 @@ bool X86AsmParser::HandleAVX512Operand(OperandVector &Operands,
if(getLexer().is(AsmToken::Integer)) {
// Parse memory broadcasting ({1to<NUM>}).
if (getLexer().getTok().getIntVal() != 1)
- return !ErrorAndEatStatement(getLexer().getLoc(),
- "Expected 1to<NUM> at this point");
+ return TokError("Expected 1to<NUM> at this point");
Parser.Lex(); // Eat "1" of 1to8
if (!getLexer().is(AsmToken::Identifier) ||
!getLexer().getTok().getIdentifier().startswith("to"))
- return !ErrorAndEatStatement(getLexer().getLoc(),
- "Expected 1to<NUM> at this point");
+ return TokError("Expected 1to<NUM> at this point");
// Recognize only reasonable suffixes.
const char *BroadcastPrimitive =
StringSwitch<const char*>(getLexer().getTok().getIdentifier())
@@ -1948,46 +1970,57 @@ bool X86AsmParser::HandleAVX512Operand(OperandVector &Operands,
.Case("to16", "{1to16}")
.Default(nullptr);
if (!BroadcastPrimitive)
- return !ErrorAndEatStatement(getLexer().getLoc(),
- "Invalid memory broadcast primitive.");
+ return TokError("Invalid memory broadcast primitive.");
Parser.Lex(); // Eat "toN" of 1toN
if (!getLexer().is(AsmToken::RCurly))
- return !ErrorAndEatStatement(getLexer().getLoc(),
- "Expected } at this point");
+ return TokError("Expected } at this point");
Parser.Lex(); // Eat "}"
Operands.push_back(X86Operand::CreateToken(BroadcastPrimitive,
consumedToken));
// No AVX512 specific primitives can pass
// after memory broadcasting, so return.
- return true;
+ return false;
} else {
- // Parse mask register {%k1}
- Operands.push_back(X86Operand::CreateToken("{", consumedToken));
- if (std::unique_ptr<X86Operand> Op = ParseOperand()) {
- Operands.push_back(std::move(Op));
- if (!getLexer().is(AsmToken::RCurly))
- return !ErrorAndEatStatement(getLexer().getLoc(),
- "Expected } at this point");
- Operands.push_back(X86Operand::CreateToken("}", consumeToken()));
-
- // Parse "zeroing non-masked" semantic {z}
- if (getLexer().is(AsmToken::LCurly)) {
- Operands.push_back(X86Operand::CreateToken("{z}", consumeToken()));
- if (!getLexer().is(AsmToken::Identifier) ||
- getLexer().getTok().getIdentifier() != "z")
- return !ErrorAndEatStatement(getLexer().getLoc(),
- "Expected z at this point");
- Parser.Lex(); // Eat the z
+ // Parse either {k}{z}, {z}{k}, {k} or {z}
+ // last one has no meaning, but GCC accepts it
+ // At this point we have just passed a '{' mark
+ std::unique_ptr<X86Operand> Z;
+ if (ParseZ(Z, consumedToken))
+ return true;
+ // Reaching here means that parsing of the alleged '{z}' mark yielded
+ // no errors.
+ // Query for the need of further parsing for a {%k<NUM>} mark
+ if (!Z || getLexer().is(AsmToken::LCurly)) {
+ const SMLoc StartLoc = Z ? consumeToken() : consumedToken;
+ // Parse an op-mask register mark ({%k<NUM>}), which is now to be
+ // expected
+ if (std::unique_ptr<X86Operand> Op = ParseOperand()) {
if (!getLexer().is(AsmToken::RCurly))
- return !ErrorAndEatStatement(getLexer().getLoc(),
- "Expected } at this point");
- Parser.Lex(); // Eat the }
+ return Error(getLexer().getLoc(), "Expected } at this point");
+ Operands.push_back(X86Operand::CreateToken("{", StartLoc));
+ Operands.push_back(std::move(Op));
+ Operands.push_back(X86Operand::CreateToken("}", consumeToken()));
+ } else
+ return Error(getLexer().getLoc(),
+ "Expected an op-mask register at this point");
+ // {%k<NUM>} mark is found, inquire for {z}
+ if (getLexer().is(AsmToken::LCurly) && !Z) {
+ // If we've found a parsing error, or found no (expected) {z} mark,
+ // report an error
+ if (ParseZ(Z, consumeToken()) || !Z)
+ return true;
+
}
+ // '{z}' on its own is meaningless, hence should be ignored.
+ // On the contrary, had it been accompanied by a K register,
+ // allow it.
+ if (Z)
+ Operands.push_back(std::move(Z));
}
}
}
}
- return true;
+ return false;
}
/// ParseMemOperand: segment: disp(basereg, indexreg, scale). The '%ds:' prefix
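
The rewritten HandleAVX512Operand/ParseZ pair above accepts the write-mask decorations {k<NUM>}, {z}, {k<NUM>}{z} and {z}{k<NUM>} after an operand; a lone {z} is tolerated because GCC accepts it, even though it has no effect without a mask register. A minimal sketch of that acceptance rule over pre-lexed decoration tokens, independent of the real MCAsmLexer; acceptsMaskDecorations is an illustrative name, not an LLVM function:

    #include <string>
    #include <vector>

    // Accept at most one {k<N>} and at most one {z}, in either order.
    static bool acceptsMaskDecorations(const std::vector<std::string> &Toks) {
      bool SawK = false, SawZ = false;
      for (const std::string &T : Toks) {
        if (T == "{z}") {
          if (SawZ)
            return false;                 // duplicate {z}
          SawZ = true;
        } else if (T.size() >= 4 && T.compare(0, 2, "{k") == 0 && T.back() == '}') {
          if (SawK)
            return false;                 // duplicate mask register
          SawK = true;
        } else {
          return false;                   // unknown decoration
        }
      }
      return true;
    }

    int main() {
      bool OK = acceptsMaskDecorations({"{k1}", "{z}"}) &&
                acceptsMaskDecorations({"{z}", "{k1}"}) &&
                acceptsMaskDecorations({"{z}"}) &&
                !acceptsMaskDecorations({"{z}", "{z}"});
      return OK ? 0 : 1;
    }
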
@@ -2077,7 +2110,16 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseMemOperand(unsigned SegReg,
// like "1(%eax,,1)", the assembler doesn't. Use "eiz" or "riz" for this.
if (getLexer().is(AsmToken::Percent)) {
SMLoc L;
- if (ParseRegister(IndexReg, L, L)) return nullptr;
+ if (ParseRegister(IndexReg, L, L))
+ return nullptr;
+ if (BaseReg == X86::RIP) {
+ Error(IndexLoc, "%rip as base register can not have an index register");
+ return nullptr;
+ }
+ if (IndexReg == X86::RIP) {
+ Error(IndexLoc, "%rip is not allowed as an index register");
+ return nullptr;
+ }
if (getLexer().isNot(AsmToken::RParen)) {
// Parse the scale amount:
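
The hunk above adds %rip addressing constraints to AT&T memory-operand parsing: %rip may appear only as a base register, and only without an index. A small standalone sketch of the rule being enforced, with register names as plain strings purely for illustration:

    #include <string>

    static bool isValidRIPAddressing(const std::string &Base,
                                     const std::string &Index) {
      if (Index == "rip")
        return false;                 // %rip is never an index register
      if (Base == "rip" && !Index.empty())
        return false;                 // %rip as base cannot take an index
      return true;
    }

    int main() {
      bool OK = isValidRIPAddressing("rip", "") &&      // disp(%rip)
                !isValidRIPAddressing("rip", "rax") &&  // disp(%rip,%rax)
                !isValidRIPAddressing("rax", "rip");    // disp(%rax,%rip)
      return OK ? 0 : 1;
    }
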
@@ -2169,6 +2211,20 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
InstInfo = &Info;
StringRef PatchedName = Name;
+ if (Name == "jmp" && isParsingIntelSyntax() && isParsingInlineAsm()) {
+ StringRef NextTok = Parser.getTok().getString();
+ if (NextTok == "short") {
+ SMLoc NameEndLoc =
+ NameLoc.getFromPointer(NameLoc.getPointer() + Name.size());
+ // Eat the short keyword
+ Parser.Lex();
+ // MS ignores the short keyword, it determines the jmp type based
+ // on the distance of the label
+ InstInfo->AsmRewrites->emplace_back(AOK_Skip, NameEndLoc,
+ NextTok.size() + 1);
+ }
+ }
+
// FIXME: Hack to recognize setneb as setne.
if (PatchedName.startswith("set") && PatchedName.endswith("b") &&
PatchedName != "setb" && PatchedName != "setnb")
@@ -2321,10 +2377,9 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
while(1) {
if (std::unique_ptr<X86Operand> Op = ParseOperand()) {
Operands.push_back(std::move(Op));
- if (!HandleAVX512Operand(Operands, *Operands.back()))
+ if (HandleAVX512Operand(Operands, *Operands.back()))
return true;
} else {
- Parser.eatToEndOfStatement();
return true;
}
// check for comma and eat it
@@ -2340,8 +2395,7 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
isParsingIntelSyntax() && isParsingInlineAsm() &&
(getLexer().is(AsmToken::LCurly) || getLexer().is(AsmToken::RCurly));
if (getLexer().isNot(AsmToken::EndOfStatement) && !CurlyAsEndOfStatement)
- return ErrorAndEatStatement(getLexer().getLoc(),
- "unexpected token in argument list");
+ return TokError("unexpected token in argument list");
}
// Consume the EndOfStatement or the prefix separator Slash
@@ -2367,6 +2421,30 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
static_cast<X86Operand &>(*Operands[0]).setTokenValue(Repl);
}
+ // Moving a 32 or 16 bit value into a segment register has the same
+ // behavior. Modify such instructions to always take shorter form.
+ if ((Name == "mov" || Name == "movw" || Name == "movl") &&
+ (Operands.size() == 3)) {
+ X86Operand &Op1 = (X86Operand &)*Operands[1];
+ X86Operand &Op2 = (X86Operand &)*Operands[2];
+ SMLoc Loc = Op1.getEndLoc();
+ if (Op1.isReg() && Op2.isReg() &&
+ X86MCRegisterClasses[X86::SEGMENT_REGRegClassID].contains(
+ Op2.getReg()) &&
+ (X86MCRegisterClasses[X86::GR16RegClassID].contains(Op1.getReg()) ||
+ X86MCRegisterClasses[X86::GR32RegClassID].contains(Op1.getReg()))) {
+ // Change instruction name to match new instruction.
+ if (Name != "mov" && Name[3] == (is16BitMode() ? 'l' : 'w')) {
+ Name = is16BitMode() ? "movw" : "movl";
+ Operands[0] = X86Operand::CreateToken(Name, NameLoc);
+ }
+ // Select the correct equivalent 16-/32-bit source register.
+ unsigned Reg =
+ getX86SubSuperRegisterOrZero(Op1.getReg(), is16BitMode() ? 16 : 32);
+ Operands[1] = X86Operand::CreateReg(Reg, Loc, Loc);
+ }
+ }
+
// This is a terrible hack to handle "out[s]?[bwl]? %al, (%dx)" ->
// "outb %al, %dx". Out doesn't take a memory form, but this is a widely
// documented form in various unofficial manuals, so a lot of code uses it.
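
The mov-to-segment-register hunk above rewrites the mnemonic and source register so the shorter form is always selected, since the 16-bit and 32-bit sources behave the same. A sketch of the normalization; subSuperReg is a hypothetical stand-in for getX86SubSuperRegisterOrZero and only a few registers are mapped:

    #include <map>
    #include <string>
    #include <utility>

    // Pick the 16- or 32-bit spelling of a GPR so "mov %eax, %ds" and
    // "mov %ax, %ds" collapse to the single form matching the current mode.
    static std::string subSuperReg(const std::string &Reg, unsigned Bits) {
      static const std::map<std::string, std::pair<std::string, std::string>>
          Pairs = {{"ax", {"ax", "eax"}}, {"eax", {"ax", "eax"}},
                   {"bx", {"bx", "ebx"}}, {"ebx", {"bx", "ebx"}}};
      auto It = Pairs.find(Reg);
      return It == Pairs.end()
                 ? std::string()
                 : (Bits == 16 ? It->second.first : It->second.second);
    }

    int main() {
      bool In16BitMode = false;
      std::string Mnemonic = In16BitMode ? "movw" : "movl";
      std::string Src = subSuperReg("ax", In16BitMode ? 16 : 32);
      // In 32-bit mode both spellings become "movl %eax, %ds".
      return (Mnemonic == "movl" && Src == "eax") ? 0 : 1;
    }
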
@@ -2472,7 +2550,7 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
(Name == "smov" || Name == "smovb" || Name == "smovw" ||
Name == "smovl" || Name == "smovd" || Name == "smovq"))) &&
(Operands.size() == 1 || Operands.size() == 3)) {
- if (Name == "movsd" && Operands.size() == 1)
+ if (Name == "movsd" && Operands.size() == 1 && !isParsingIntelSyntax())
Operands.back() = X86Operand::CreateToken("movsl", NameLoc);
AddDefaultSrcDestOperands(TmpOperands, DefaultMemSIOperand(NameLoc),
DefaultMemDIOperand(NameLoc));
@@ -2583,7 +2661,6 @@ void X86AsmParser::MatchFPUWaitAlias(SMLoc IDLoc, X86Operand &Op,
bool X86AsmParser::ErrorMissingFeature(SMLoc IDLoc, uint64_t ErrorInfo,
bool MatchingInlineAsm) {
assert(ErrorInfo && "Unknown missing feature!");
- ArrayRef<SMRange> EmptyRanges = None;
SmallString<126> Msg;
raw_svector_ostream OS(Msg);
OS << "instruction requires:";
@@ -2593,7 +2670,7 @@ bool X86AsmParser::ErrorMissingFeature(SMLoc IDLoc, uint64_t ErrorInfo,
OS << ' ' << getSubtargetFeatureName(ErrorInfo & Mask);
Mask <<= 1;
}
- return Error(IDLoc, OS.str(), EmptyRanges, MatchingInlineAsm);
+ return Error(IDLoc, OS.str(), SMRange(), MatchingInlineAsm);
}
bool X86AsmParser::MatchAndEmitATTInstruction(SMLoc IDLoc, unsigned &Opcode,
@@ -2604,7 +2681,7 @@ bool X86AsmParser::MatchAndEmitATTInstruction(SMLoc IDLoc, unsigned &Opcode,
assert(!Operands.empty() && "Unexpect empty operand list!");
X86Operand &Op = static_cast<X86Operand &>(*Operands[0]);
assert(Op.isToken() && "Leading operand should always be a mnemonic!");
- ArrayRef<SMRange> EmptyRanges = None;
+ SMRange EmptyRange = None;
// First, handle aliases that expand to multiple instructions.
MatchFPUWaitAlias(IDLoc, Op, Operands, Out, MatchingInlineAsm);
@@ -2613,9 +2690,8 @@ bool X86AsmParser::MatchAndEmitATTInstruction(SMLoc IDLoc, unsigned &Opcode,
MCInst Inst;
// First, try a direct match.
- switch (MatchInstructionImpl(Operands, Inst,
- ErrorInfo, MatchingInlineAsm,
- isParsingIntelSyntax())) {
+ switch (MatchInstruction(Operands, Inst, ErrorInfo, MatchingInlineAsm,
+ isParsingIntelSyntax())) {
default: llvm_unreachable("Unexpected match result!");
case Match_Success:
// Some instructions need post-processing to, for example, tweak which
@@ -2666,8 +2742,8 @@ bool X86AsmParser::MatchAndEmitATTInstruction(SMLoc IDLoc, unsigned &Opcode,
for (unsigned I = 0, E = array_lengthof(Match); I != E; ++I) {
Tmp.back() = Suffixes[I];
- Match[I] = MatchInstructionImpl(Operands, Inst, ErrorInfoIgnore,
- MatchingInlineAsm, isParsingIntelSyntax());
+ Match[I] = MatchInstruction(Operands, Inst, ErrorInfoIgnore,
+ MatchingInlineAsm, isParsingIntelSyntax());
// If this returned as a missing feature failure, remember that.
if (Match[I] == Match_MissingFeature)
ErrorInfoMissingFeature = ErrorInfoIgnore;
@@ -2711,7 +2787,7 @@ bool X86AsmParser::MatchAndEmitATTInstruction(SMLoc IDLoc, unsigned &Opcode,
OS << "'" << Base << MatchChars[i] << "'";
}
OS << ")";
- Error(IDLoc, OS.str(), EmptyRanges, MatchingInlineAsm);
+ Error(IDLoc, OS.str(), EmptyRange, MatchingInlineAsm);
return true;
}
@@ -2721,17 +2797,15 @@ bool X86AsmParser::MatchAndEmitATTInstruction(SMLoc IDLoc, unsigned &Opcode,
// mnemonic was invalid.
if (std::count(std::begin(Match), std::end(Match), Match_MnemonicFail) == 4) {
if (!WasOriginallyInvalidOperand) {
- ArrayRef<SMRange> Ranges =
- MatchingInlineAsm ? EmptyRanges : Op.getLocRange();
return Error(IDLoc, "invalid instruction mnemonic '" + Base + "'",
- Ranges, MatchingInlineAsm);
+ Op.getLocRange(), MatchingInlineAsm);
}
// Recover location info for the operand if we know which was the problem.
if (ErrorInfo != ~0ULL) {
if (ErrorInfo >= Operands.size())
- return Error(IDLoc, "too few operands for instruction",
- EmptyRanges, MatchingInlineAsm);
+ return Error(IDLoc, "too few operands for instruction", EmptyRange,
+ MatchingInlineAsm);
X86Operand &Operand = (X86Operand &)*Operands[ErrorInfo];
if (Operand.getStartLoc().isValid()) {
@@ -2741,7 +2815,7 @@ bool X86AsmParser::MatchAndEmitATTInstruction(SMLoc IDLoc, unsigned &Opcode,
}
}
- return Error(IDLoc, "invalid operand for instruction", EmptyRanges,
+ return Error(IDLoc, "invalid operand for instruction", EmptyRange,
MatchingInlineAsm);
}
@@ -2758,16 +2832,33 @@ bool X86AsmParser::MatchAndEmitATTInstruction(SMLoc IDLoc, unsigned &Opcode,
// operand failure.
if (std::count(std::begin(Match), std::end(Match),
Match_InvalidOperand) == 1) {
- return Error(IDLoc, "invalid operand for instruction", EmptyRanges,
+ return Error(IDLoc, "invalid operand for instruction", EmptyRange,
MatchingInlineAsm);
}
// If all of these were an outright failure, report it in a useless way.
Error(IDLoc, "unknown use of instruction mnemonic without a size suffix",
- EmptyRanges, MatchingInlineAsm);
+ EmptyRange, MatchingInlineAsm);
return true;
}
+unsigned X86AsmParser::AdjustAVX512Mem(unsigned Size,
+ X86Operand* UnsizedMemOpNext) {
+ // Check for the existence of an AVX512 platform
+ if (!getSTI().getFeatureBits()[X86::FeatureAVX512])
+ return 0;
+ // Allow adjusting upon a (x|y|z)mm
+ if (Size == 512 || Size == 256 || Size == 128)
+ return Size;
+ // This is allegedly a broadcasting mem op adjustment;
+ // do some more checking to validate it
+ if (Size == 64 || Size == 32)
+ return UnsizedMemOpNext && UnsizedMemOpNext->isToken() &&
+ UnsizedMemOpNext->getToken().substr(0, 4).equals("{1to") ? Size : 0;
+ // Do not allow any other type of adjustments
+ return 0;
+}
+
bool X86AsmParser::MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode,
OperandVector &Operands,
MCStreamer &Out,
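
AdjustAVX512Mem above decides whether the size matched for an MS-style unsized AVX-512 memory operand may be kept. A standalone sketch of the same decision; adjustAVX512Mem here is illustrative and operates on plain strings rather than X86Operand:

    #include <string>

    // Keep a full 128/256/512-bit vector size as-is; keep a 32/64-bit element
    // size only when a {1to<N>} broadcast token follows; otherwise (or without
    // AVX-512) make no adjustment.
    static unsigned adjustAVX512Mem(unsigned Size, const std::string *NextToken,
                                    bool HasAVX512) {
      if (!HasAVX512)
        return 0;
      if (Size == 512 || Size == 256 || Size == 128)
        return Size;
      if (Size == 64 || Size == 32)
        return NextToken && NextToken->compare(0, 4, "{1to") == 0 ? Size : 0;
      return 0;
    }

    int main() {
      std::string Broadcast = "{1to8}";
      bool OK = adjustAVX512Mem(512, nullptr, true) == 512 &&
                adjustAVX512Mem(64, &Broadcast, true) == 64 &&
                adjustAVX512Mem(64, nullptr, true) == 0 &&
                adjustAVX512Mem(512, nullptr, false) == 0;
      return OK ? 0 : 1;
    }
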
@@ -2777,7 +2868,8 @@ bool X86AsmParser::MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode,
X86Operand &Op = static_cast<X86Operand &>(*Operands[0]);
assert(Op.isToken() && "Leading operand should always be a mnemonic!");
StringRef Mnemonic = Op.getToken();
- ArrayRef<SMRange> EmptyRanges = None;
+ SMRange EmptyRange = None;
+ StringRef Base = Op.getToken();
// First, handle aliases that expand to multiple instructions.
MatchFPUWaitAlias(IDLoc, Op, Operands, Out, MatchingInlineAsm);
@@ -2786,8 +2878,17 @@ bool X86AsmParser::MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode,
// Find one unsized memory operand, if present.
X86Operand *UnsizedMemOp = nullptr;
+ // If unsized memory operand was found - obtain following operand.
+ // For use in AdjustAVX512Mem
+ X86Operand *UnsizedMemOpNext = nullptr;
for (const auto &Op : Operands) {
X86Operand *X86Op = static_cast<X86Operand *>(Op.get());
+ if (UnsizedMemOp) {
+ UnsizedMemOpNext = X86Op;
+ // Have we found an unqualified memory operand,
+ // break. IA allows only one memory operand.
+ break;
+ }
if (X86Op->isMemUnsized())
UnsizedMemOp = X86Op;
}
@@ -2804,26 +2905,58 @@ bool X86AsmParser::MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode,
}
}
+ SmallVector<unsigned, 8> Match;
+ uint64_t ErrorInfoMissingFeature = 0;
+
+ // If an unsized push has an immediate operand, default to the pointer
+ // size for the operand size.
+ if (Mnemonic == "push" && Operands.size() == 2) {
+ auto *X86Op = static_cast<X86Operand *>(Operands[1].get());
+ if (X86Op->isImm()) {
+ // If it's not a constant fall through and let remainder take care of it.
+ const auto *CE = dyn_cast<MCConstantExpr>(X86Op->getImm());
+ unsigned Size = getPointerWidth();
+ if (CE &&
+ (isIntN(Size, CE->getValue()) || isUIntN(Size, CE->getValue()))) {
+ SmallString<16> Tmp;
+ Tmp += Base;
+ Tmp += (is64BitMode())
+ ? "q"
+ : (is32BitMode()) ? "l" : (is16BitMode()) ? "w" : " ";
+ Op.setTokenValue(Tmp);
+ // Do match in ATT mode to allow explicit suffix usage.
+ Match.push_back(MatchInstruction(Operands, Inst, ErrorInfo,
+ MatchingInlineAsm,
+ false /*isParsingIntelSyntax()*/));
+ Op.setTokenValue(Base);
+ }
+ }
+ }
+
// If an unsized memory operand is present, try to match with each memory
// operand size. In Intel assembly, the size is not part of the instruction
// mnemonic.
- SmallVector<unsigned, 8> Match;
- uint64_t ErrorInfoMissingFeature = 0;
+ unsigned MatchedSize = 0;
if (UnsizedMemOp && UnsizedMemOp->isMemUnsized()) {
static const unsigned MopSizes[] = {8, 16, 32, 64, 80, 128, 256, 512};
for (unsigned Size : MopSizes) {
UnsizedMemOp->Mem.Size = Size;
uint64_t ErrorInfoIgnore;
unsigned LastOpcode = Inst.getOpcode();
- unsigned M =
- MatchInstructionImpl(Operands, Inst, ErrorInfoIgnore,
- MatchingInlineAsm, isParsingIntelSyntax());
+ unsigned M = MatchInstruction(Operands, Inst, ErrorInfoIgnore,
+ MatchingInlineAsm, isParsingIntelSyntax());
if (Match.empty() || LastOpcode != Inst.getOpcode())
Match.push_back(M);
// If this returned as a missing feature failure, remember that.
if (Match.back() == Match_MissingFeature)
ErrorInfoMissingFeature = ErrorInfoIgnore;
+ if (M == Match_Success)
+ // MS-compatibility:
+ // Adjust AVX512 vector/broadcast memory operand,
+ // when facing the absence of a size qualifier.
+ // Match GCC behavior on respective cases.
+ MatchedSize = AdjustAVX512Mem(Size, UnsizedMemOpNext);
}
// Restore the size of the unsized memory operand if we modified it.
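
The push-with-immediate block above defaults the operand size from the pointer width when no size was written, then matches in AT&T mode where explicit suffixes are legal. A rough standalone sketch of the two pieces involved; fitsInPointerWidth and suffixedPush are illustrative names, and the range test stands in for isIntN/isUIntN while plain bools replace the mode queries:

    #include <cstdint>
    #include <string>

    // True if V fits in Bits as either a signed or an unsigned integer.
    static bool fitsInPointerWidth(int64_t V, unsigned Bits) {
      if (Bits >= 64)
        return true;
      const int64_t SignedMin = -(int64_t(1) << (Bits - 1));
      const int64_t UnsignedMax = (int64_t(1) << Bits) - 1;
      return V >= SignedMin && V <= UnsignedMax;
    }

    // Suffix the mnemonic from the current mode: pushq / pushl / pushw.
    static std::string suffixedPush(bool Is64, bool Is32, bool Is16) {
      std::string Name = "push";
      Name += Is64 ? "q" : Is32 ? "l" : Is16 ? "w" : " ";
      return Name;
    }

    int main() {
      bool OK = suffixedPush(false, true, false) == "pushl" &&
                fitsInPointerWidth(0xFFFF, 16) &&    // unsigned 16-bit value
                !fitsInPointerWidth(0x10000, 16);
      return OK ? 0 : 1;
    }
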
@@ -2835,9 +2968,8 @@ bool X86AsmParser::MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode,
// operation. There shouldn't be any ambiguity in our mnemonic table, so try
// matching with the unsized operand.
if (Match.empty()) {
- Match.push_back(MatchInstructionImpl(Operands, Inst, ErrorInfo,
- MatchingInlineAsm,
- isParsingIntelSyntax()));
+ Match.push_back(MatchInstruction(
+ Operands, Inst, ErrorInfo, MatchingInlineAsm, isParsingIntelSyntax()));
// If this returned as a missing feature failure, remember that.
if (Match.back() == Match_MissingFeature)
ErrorInfoMissingFeature = ErrorInfo;
@@ -2849,10 +2981,8 @@ bool X86AsmParser::MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode,
// If it's a bad mnemonic, all results will be the same.
if (Match.back() == Match_MnemonicFail) {
- ArrayRef<SMRange> Ranges =
- MatchingInlineAsm ? EmptyRanges : Op.getLocRange();
return Error(IDLoc, "invalid instruction mnemonic '" + Mnemonic + "'",
- Ranges, MatchingInlineAsm);
+ Op.getLocRange(), MatchingInlineAsm);
}
// If exactly one matched, then we treat that as a successful match (and the
@@ -2861,6 +2991,14 @@ bool X86AsmParser::MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode,
unsigned NumSuccessfulMatches =
std::count(std::begin(Match), std::end(Match), Match_Success);
if (NumSuccessfulMatches == 1) {
+ if (MatchedSize && isParsingInlineAsm() && isParsingIntelSyntax())
+ // MS compatibility -
+ // Fix the rewrite according to the matched memory size
+ // MS inline assembly only
+ for (AsmRewrite &AR : *InstInfo->AsmRewrites)
+ if ((AR.Loc.getPointer() == UnsizedMemOp->StartLoc.getPointer()) &&
+ (AR.Kind == AOK_SizeDirective))
+ AR.Val = MatchedSize;
// Some instructions need post-processing to, for example, tweak which
// encoding is selected. Loop on it while changes happen so the individual
// transformations can chain off each other.
@@ -2875,11 +3013,9 @@ bool X86AsmParser::MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode,
} else if (NumSuccessfulMatches > 1) {
assert(UnsizedMemOp &&
"multiple matches only possible with unsized memory operands");
- ArrayRef<SMRange> Ranges =
- MatchingInlineAsm ? EmptyRanges : UnsizedMemOp->getLocRange();
return Error(UnsizedMemOp->getStartLoc(),
"ambiguous operand size for instruction '" + Mnemonic + "\'",
- Ranges, MatchingInlineAsm);
+ UnsizedMemOp->getLocRange(), MatchingInlineAsm);
}
// If one instruction matched with a missing feature, report this as a
@@ -2895,12 +3031,12 @@ bool X86AsmParser::MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode,
// operand failure.
if (std::count(std::begin(Match), std::end(Match),
Match_InvalidOperand) == 1) {
- return Error(IDLoc, "invalid operand for instruction", EmptyRanges,
+ return Error(IDLoc, "invalid operand for instruction", EmptyRange,
MatchingInlineAsm);
}
// If all of these were an outright failure, report it in a useless way.
- return Error(IDLoc, "unknown instruction mnemonic", EmptyRanges,
+ return Error(IDLoc, "unknown instruction mnemonic", EmptyRange,
MatchingInlineAsm);
}
@@ -2945,14 +3081,14 @@ bool X86AsmParser::ParseDirective(AsmToken DirectiveID) {
/// parseDirectiveEven
/// ::= .even
bool X86AsmParser::parseDirectiveEven(SMLoc L) {
- const MCSection *Section = getStreamer().getCurrentSection().first;
if (getLexer().isNot(AsmToken::EndOfStatement)) {
TokError("unexpected token in directive");
return false;
}
+ const MCSection *Section = getStreamer().getCurrentSectionOnly();
if (!Section) {
getStreamer().InitSections(false);
- Section = getStreamer().getCurrentSection().first;
+ Section = getStreamer().getCurrentSectionOnly();
}
if (Section->UseCodeAlign())
getStreamer().EmitCodeAlignment(2, 0);
@@ -3001,12 +3137,21 @@ bool X86AsmParser::ParseDirectiveWord(unsigned Size, SMLoc L) {
/// ::= .code16 | .code32 | .code64
bool X86AsmParser::ParseDirectiveCode(StringRef IDVal, SMLoc L) {
MCAsmParser &Parser = getParser();
+ Code16GCC = false;
if (IDVal == ".code16") {
Parser.Lex();
if (!is16BitMode()) {
SwitchMode(X86::Mode16Bit);
getParser().getStreamer().EmitAssemblerFlag(MCAF_Code16);
}
+ } else if (IDVal == ".code16gcc") {
+ // .code16gcc parses as if in 32-bit mode, but emits code in 16-bit mode.
+ Parser.Lex();
+ Code16GCC = true;
+ if (!is16BitMode()) {
+ SwitchMode(X86::Mode16Bit);
+ getParser().getStreamer().EmitAssemblerFlag(MCAF_Code16);
+ }
} else if (IDVal == ".code32") {
Parser.Lex();
if (!is32BitMode()) {
@@ -3029,8 +3174,8 @@ bool X86AsmParser::ParseDirectiveCode(StringRef IDVal, SMLoc L) {
// Force static initialization.
extern "C" void LLVMInitializeX86AsmParser() {
- RegisterMCAsmParser<X86AsmParser> X(TheX86_32Target);
- RegisterMCAsmParser<X86AsmParser> Y(TheX86_64Target);
+ RegisterMCAsmParser<X86AsmParser> X(getTheX86_32Target());
+ RegisterMCAsmParser<X86AsmParser> Y(getTheX86_64Target());
}
#define GET_REGISTER_MATCHER
diff --git a/contrib/llvm/lib/Target/X86/AsmParser/X86Operand.h b/contrib/llvm/lib/Target/X86/AsmParser/X86Operand.h
index a04c2f5..9db1a84 100644
--- a/contrib/llvm/lib/Target/X86/AsmParser/X86Operand.h
+++ b/contrib/llvm/lib/Target/X86/AsmParser/X86Operand.h
@@ -192,8 +192,10 @@ struct X86Operand : public MCParsedAsmOperand {
bool isImmUnsignedi8() const {
if (!isImm()) return false;
+ // If this isn't a constant expr, just assume it fits and let relaxation
+ // handle it.
const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
- if (!CE) return false;
+ if (!CE) return true;
return isImmUnsignedi8Value(CE->getValue());
}
diff --git a/contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp b/contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp
index 008dead..0871888 100644
--- a/contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp
+++ b/contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp
@@ -96,7 +96,7 @@ void llvm::X86Disassembler::Debug(const char *file, unsigned line,
dbgs() << file << ":" << line << ": " << s;
}
-const char *llvm::X86Disassembler::GetInstrName(unsigned Opcode,
+StringRef llvm::X86Disassembler::GetInstrName(unsigned Opcode,
const void *mii) {
const MCInstrInfo *MII = static_cast<const MCInstrInfo *>(mii);
return MII->getName(Opcode);
@@ -470,10 +470,20 @@ static void translateImmediate(MCInst &mcInst, uint64_t immediate,
case X86::VCMPPSZrmi: NewOpc = X86::VCMPPSZrmi_alt; break;
case X86::VCMPPSZrri: NewOpc = X86::VCMPPSZrri_alt; break;
case X86::VCMPPSZrrib: NewOpc = X86::VCMPPSZrrib_alt; break;
- case X86::VCMPSDZrm: NewOpc = X86::VCMPSDZrmi_alt; break;
- case X86::VCMPSDZrr: NewOpc = X86::VCMPSDZrri_alt; break;
- case X86::VCMPSSZrm: NewOpc = X86::VCMPSSZrmi_alt; break;
- case X86::VCMPSSZrr: NewOpc = X86::VCMPSSZrri_alt; break;
+ case X86::VCMPPDZ128rmi: NewOpc = X86::VCMPPDZ128rmi_alt; break;
+ case X86::VCMPPDZ128rri: NewOpc = X86::VCMPPDZ128rri_alt; break;
+ case X86::VCMPPSZ128rmi: NewOpc = X86::VCMPPSZ128rmi_alt; break;
+ case X86::VCMPPSZ128rri: NewOpc = X86::VCMPPSZ128rri_alt; break;
+ case X86::VCMPPDZ256rmi: NewOpc = X86::VCMPPDZ256rmi_alt; break;
+ case X86::VCMPPDZ256rri: NewOpc = X86::VCMPPDZ256rri_alt; break;
+ case X86::VCMPPSZ256rmi: NewOpc = X86::VCMPPSZ256rmi_alt; break;
+ case X86::VCMPPSZ256rri: NewOpc = X86::VCMPPSZ256rri_alt; break;
+ case X86::VCMPSDZrm_Int: NewOpc = X86::VCMPSDZrmi_alt; break;
+ case X86::VCMPSDZrr_Int: NewOpc = X86::VCMPSDZrri_alt; break;
+ case X86::VCMPSDZrrb_Int: NewOpc = X86::VCMPSDZrrb_alt; break;
+ case X86::VCMPSSZrm_Int: NewOpc = X86::VCMPSSZrmi_alt; break;
+ case X86::VCMPSSZrr_Int: NewOpc = X86::VCMPSSZrri_alt; break;
+ case X86::VCMPSSZrrb_Int: NewOpc = X86::VCMPSSZrrb_alt; break;
}
// Switch opcode to the one that doesn't get special printing.
mcInst.setOpcode(NewOpc);
@@ -1066,8 +1076,8 @@ static MCDisassembler *createX86Disassembler(const Target &T,
extern "C" void LLVMInitializeX86Disassembler() {
// Register the disassembler.
- TargetRegistry::RegisterMCDisassembler(TheX86_32Target,
+ TargetRegistry::RegisterMCDisassembler(getTheX86_32Target(),
createX86Disassembler);
- TargetRegistry::RegisterMCDisassembler(TheX86_64Target,
+ TargetRegistry::RegisterMCDisassembler(getTheX86_64Target(),
createX86Disassembler);
}
diff --git a/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp b/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp
index b0a150a..ab64d6f 100644
--- a/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp
+++ b/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp
@@ -825,7 +825,7 @@ static int getIDWithAttrMask(uint16_t* instructionID,
* @param orig - The instruction that is not 16-bit
* @param equiv - The instruction that is 16-bit
*/
-static bool is16BitEquivalent(const char* orig, const char* equiv) {
+static bool is16BitEquivalent(const char *orig, const char *equiv) {
off_t i;
for (i = 0;; i++) {
@@ -850,7 +850,7 @@ static bool is16BitEquivalent(const char* orig, const char* equiv) {
*
* @param name - The instruction that is not 16-bit
*/
-static bool is64Bit(const char* name) {
+static bool is64Bit(const char *name) {
off_t i;
for (i = 0;; ++i) {
@@ -1044,9 +1044,9 @@ static int getID(struct InternalInstruction* insn, const void *miiArg) {
return 0;
}
- const char *SpecName = GetInstrName(instructionIDWithREXW, miiArg);
+ auto SpecName = GetInstrName(instructionIDWithREXW, miiArg);
// If not a 64-bit instruction. Switch the opcode.
- if (!is64Bit(SpecName)) {
+ if (!is64Bit(SpecName.data())) {
insn->instructionID = instructionIDWithREXW;
insn->spec = specifierForUID(instructionIDWithREXW);
return 0;
@@ -1092,7 +1092,7 @@ static int getID(struct InternalInstruction* insn, const void *miiArg) {
const struct InstructionSpecifier *spec;
uint16_t instructionIDWithOpsize;
- const char *specName, *specWithOpSizeName;
+ llvm::StringRef specName, specWithOpSizeName;
spec = specifierForUID(instructionID);
@@ -1112,7 +1112,7 @@ static int getID(struct InternalInstruction* insn, const void *miiArg) {
specName = GetInstrName(instructionID, miiArg);
specWithOpSizeName = GetInstrName(instructionIDWithOpsize, miiArg);
- if (is16BitEquivalent(specName, specWithOpSizeName) &&
+ if (is16BitEquivalent(specName.data(), specWithOpSizeName.data()) &&
(insn->mode == MODE_16BIT) ^ insn->prefixPresent[0x66]) {
insn->instructionID = instructionIDWithOpsize;
insn->spec = specifierForUID(instructionIDWithOpsize);
diff --git a/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h b/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h
index 24d24a2..b07fd0b 100644
--- a/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h
+++ b/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h
@@ -674,7 +674,7 @@ int decodeInstruction(InternalInstruction *insn,
/// \param s The message to print.
void Debug(const char *file, unsigned line, const char *s);
-const char *GetInstrName(unsigned Opcode, const void *mii);
+StringRef GetInstrName(unsigned Opcode, const void *mii);
} // namespace X86Disassembler
} // namespace llvm
diff --git a/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp b/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp
index 3a5d056..10b7e6f 100644
--- a/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp
+++ b/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp
@@ -291,6 +291,9 @@ void X86ATTInstPrinter::printMemOffset(const MCInst *MI, unsigned Op,
void X86ATTInstPrinter::printU8Imm(const MCInst *MI, unsigned Op,
raw_ostream &O) {
+ if (MI->getOperand(Op).isExpr())
+ return printOperand(MI, Op, O);
+
O << markup("<imm:") << '$' << formatImm(MI->getOperand(Op).getImm() & 0xff)
<< markup(">");
}
diff --git a/contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.cpp b/contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.cpp
index f537956..8594add 100644
--- a/contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.cpp
+++ b/contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.cpp
@@ -255,6 +255,10 @@ static std::string getMaskName(const MCInst *MI, const char *DestName,
CASE_MASKZ_UNPCK(UNPCKLPS, r)
CASE_MASKZ_SHUF(PALIGNR, r)
CASE_MASKZ_SHUF(PALIGNR, m)
+ CASE_MASKZ_SHUF(ALIGNQ, r)
+ CASE_MASKZ_SHUF(ALIGNQ, m)
+ CASE_MASKZ_SHUF(ALIGND, r)
+ CASE_MASKZ_SHUF(ALIGND, m)
CASE_MASKZ_SHUF(SHUFPD, m)
CASE_MASKZ_SHUF(SHUFPD, r)
CASE_MASKZ_SHUF(SHUFPS, m)
@@ -277,6 +281,26 @@ static std::string getMaskName(const MCInst *MI, const char *DestName,
CASE_MASKZ_VSHUF(64X2, r)
CASE_MASKZ_VSHUF(32X4, m)
CASE_MASKZ_VSHUF(32X4, r)
+ CASE_MASKZ_INS_COMMON(BROADCASTF64X2, Z128, rm)
+ CASE_MASKZ_INS_COMMON(BROADCASTI64X2, Z128, rm)
+ CASE_MASKZ_INS_COMMON(BROADCASTF64X2, , rm)
+ CASE_MASKZ_INS_COMMON(BROADCASTI64X2, , rm)
+ CASE_MASKZ_INS_COMMON(BROADCASTF64X4, , rm)
+ CASE_MASKZ_INS_COMMON(BROADCASTI64X4, , rm)
+ CASE_MASKZ_INS_COMMON(BROADCASTF32X4, Z256, rm)
+ CASE_MASKZ_INS_COMMON(BROADCASTI32X4, Z256, rm)
+ CASE_MASKZ_INS_COMMON(BROADCASTF32X4, , rm)
+ CASE_MASKZ_INS_COMMON(BROADCASTI32X4, , rm)
+ CASE_MASKZ_INS_COMMON(BROADCASTF32X8, , rm)
+ CASE_MASKZ_INS_COMMON(BROADCASTI32X8, , rm)
+ CASE_MASKZ_INS_COMMON(BROADCASTF32X2, Z256, r)
+ CASE_MASKZ_INS_COMMON(BROADCASTI32X2, Z256, r)
+ CASE_MASKZ_INS_COMMON(BROADCASTF32X2, Z256, m)
+ CASE_MASKZ_INS_COMMON(BROADCASTI32X2, Z256, m)
+ CASE_MASKZ_INS_COMMON(BROADCASTF32X2, Z, r)
+ CASE_MASKZ_INS_COMMON(BROADCASTI32X2, Z, r)
+ CASE_MASKZ_INS_COMMON(BROADCASTF32X2, Z, m)
+ CASE_MASKZ_INS_COMMON(BROADCASTI32X2, Z, m)
MaskWithZero = true;
MaskRegName = getRegName(MI->getOperand(1).getReg());
break;
@@ -320,6 +344,10 @@ static std::string getMaskName(const MCInst *MI, const char *DestName,
CASE_MASK_UNPCK(UNPCKLPS, r)
CASE_MASK_SHUF(PALIGNR, r)
CASE_MASK_SHUF(PALIGNR, m)
+ CASE_MASK_SHUF(ALIGNQ, r)
+ CASE_MASK_SHUF(ALIGNQ, m)
+ CASE_MASK_SHUF(ALIGND, r)
+ CASE_MASK_SHUF(ALIGND, m)
CASE_MASK_SHUF(SHUFPD, m)
CASE_MASK_SHUF(SHUFPD, r)
CASE_MASK_SHUF(SHUFPS, m)
@@ -342,6 +370,26 @@ static std::string getMaskName(const MCInst *MI, const char *DestName,
CASE_MASK_VSHUF(64X2, r)
CASE_MASK_VSHUF(32X4, m)
CASE_MASK_VSHUF(32X4, r)
+ CASE_MASK_INS_COMMON(BROADCASTF64X2, Z128, rm)
+ CASE_MASK_INS_COMMON(BROADCASTI64X2, Z128, rm)
+ CASE_MASK_INS_COMMON(BROADCASTF64X2, , rm)
+ CASE_MASK_INS_COMMON(BROADCASTI64X2, , rm)
+ CASE_MASK_INS_COMMON(BROADCASTF64X4, , rm)
+ CASE_MASK_INS_COMMON(BROADCASTI64X4, , rm)
+ CASE_MASK_INS_COMMON(BROADCASTF32X4, Z256, rm)
+ CASE_MASK_INS_COMMON(BROADCASTI32X4, Z256, rm)
+ CASE_MASK_INS_COMMON(BROADCASTF32X4, , rm)
+ CASE_MASK_INS_COMMON(BROADCASTI32X4, , rm)
+ CASE_MASK_INS_COMMON(BROADCASTF32X8, , rm)
+ CASE_MASK_INS_COMMON(BROADCASTI32X8, , rm)
+ CASE_MASK_INS_COMMON(BROADCASTF32X2, Z256, r)
+ CASE_MASK_INS_COMMON(BROADCASTI32X2, Z256, r)
+ CASE_MASK_INS_COMMON(BROADCASTF32X2, Z256, m)
+ CASE_MASK_INS_COMMON(BROADCASTI32X2, Z256, m)
+ CASE_MASK_INS_COMMON(BROADCASTF32X2, Z, r)
+ CASE_MASK_INS_COMMON(BROADCASTI32X2, Z, r)
+ CASE_MASK_INS_COMMON(BROADCASTF32X2, Z, m)
+ CASE_MASK_INS_COMMON(BROADCASTI32X2, Z, m)
MaskRegName = getRegName(MI->getOperand(2).getReg());
break;
}
@@ -382,7 +430,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
case X86::VBLENDPDrri:
case X86::VBLENDPDYrri:
Src2Name = getRegName(MI->getOperand(2).getReg());
- // FALL THROUGH.
+ LLVM_FALLTHROUGH;
case X86::BLENDPDrmi:
case X86::VBLENDPDrmi:
case X86::VBLENDPDYrmi:
@@ -398,7 +446,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
case X86::VBLENDPSrri:
case X86::VBLENDPSYrri:
Src2Name = getRegName(MI->getOperand(2).getReg());
- // FALL THROUGH.
+ LLVM_FALLTHROUGH;
case X86::BLENDPSrmi:
case X86::VBLENDPSrmi:
case X86::VBLENDPSYrmi:
@@ -414,7 +462,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
case X86::VPBLENDWrri:
case X86::VPBLENDWYrri:
Src2Name = getRegName(MI->getOperand(2).getReg());
- // FALL THROUGH.
+ LLVM_FALLTHROUGH;
case X86::PBLENDWrmi:
case X86::VPBLENDWrmi:
case X86::VPBLENDWYrmi:
@@ -429,7 +477,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
case X86::VPBLENDDrri:
case X86::VPBLENDDYrri:
Src2Name = getRegName(MI->getOperand(2).getReg());
- // FALL THROUGH.
+ LLVM_FALLTHROUGH;
case X86::VPBLENDDrmi:
case X86::VPBLENDDYrmi:
if (MI->getOperand(NumOperands - 1).isImm())
@@ -442,12 +490,12 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
case X86::INSERTPSrr:
case X86::VINSERTPSrr:
- case X86::VINSERTPSzrr:
+ case X86::VINSERTPSZrr:
Src2Name = getRegName(MI->getOperand(2).getReg());
- // FALL THROUGH.
+ LLVM_FALLTHROUGH;
case X86::INSERTPSrm:
case X86::VINSERTPSrm:
- case X86::VINSERTPSzrm:
+ case X86::VINSERTPSZrm:
DestName = getRegName(MI->getOperand(0).getReg());
Src1Name = getRegName(MI->getOperand(1).getReg());
if (MI->getOperand(NumOperands - 1).isImm())
@@ -507,7 +555,8 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
CASE_MOVDUP(MOVSLDUP, r)
Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
- // FALL THROUGH.
+ LLVM_FALLTHROUGH;
+
CASE_MOVDUP(MOVSLDUP, m)
DestName = getRegName(MI->getOperand(0).getReg());
DecodeMOVSLDUPMask(getRegOperandVectorVT(MI, MVT::f32, 0), ShuffleMask);
@@ -515,7 +564,8 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
CASE_MOVDUP(MOVSHDUP, r)
Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
- // FALL THROUGH.
+ LLVM_FALLTHROUGH;
+
CASE_MOVDUP(MOVSHDUP, m)
DestName = getRegName(MI->getOperand(0).getReg());
DecodeMOVSHDUPMask(getRegOperandVectorVT(MI, MVT::f32, 0), ShuffleMask);
@@ -523,7 +573,8 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
CASE_MOVDUP(MOVDDUP, r)
Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
- // FALL THROUGH.
+ LLVM_FALLTHROUGH;
+
CASE_MOVDUP(MOVDDUP, m)
DestName = getRegName(MI->getOperand(0).getReg());
DecodeMOVDDUPMask(getRegOperandVectorVT(MI, MVT::f64, 0), ShuffleMask);
@@ -566,7 +617,8 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
CASE_SHUF(PALIGNR, rri)
Src1Name = getRegName(MI->getOperand(NumOperands - 2).getReg());
RegForm = true;
- // FALL THROUGH.
+ LLVM_FALLTHROUGH;
+
CASE_SHUF(PALIGNR, rmi)
Src2Name = getRegName(MI->getOperand(NumOperands-(RegForm?3:7)).getReg());
DestName = getRegName(MI->getOperand(0).getReg());
@@ -576,9 +628,46 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
ShuffleMask);
break;
+ CASE_AVX512_INS_COMMON(ALIGNQ, Z, rri)
+ CASE_AVX512_INS_COMMON(ALIGNQ, Z256, rri)
+ CASE_AVX512_INS_COMMON(ALIGNQ, Z128, rri)
+ Src1Name = getRegName(MI->getOperand(NumOperands - 2).getReg());
+ RegForm = true;
+ LLVM_FALLTHROUGH;
+
+ CASE_AVX512_INS_COMMON(ALIGNQ, Z, rmi)
+ CASE_AVX512_INS_COMMON(ALIGNQ, Z256, rmi)
+ CASE_AVX512_INS_COMMON(ALIGNQ, Z128, rmi)
+ Src2Name = getRegName(MI->getOperand(NumOperands-(RegForm?3:7)).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ if (MI->getOperand(NumOperands - 1).isImm())
+ DecodeVALIGNMask(getRegOperandVectorVT(MI, MVT::i64, 0),
+ MI->getOperand(NumOperands - 1).getImm(),
+ ShuffleMask);
+ break;
+
+ CASE_AVX512_INS_COMMON(ALIGND, Z, rri)
+ CASE_AVX512_INS_COMMON(ALIGND, Z256, rri)
+ CASE_AVX512_INS_COMMON(ALIGND, Z128, rri)
+ Src1Name = getRegName(MI->getOperand(NumOperands - 2).getReg());
+ RegForm = true;
+ LLVM_FALLTHROUGH;
+
+ CASE_AVX512_INS_COMMON(ALIGND, Z, rmi)
+ CASE_AVX512_INS_COMMON(ALIGND, Z256, rmi)
+ CASE_AVX512_INS_COMMON(ALIGND, Z128, rmi)
+ Src2Name = getRegName(MI->getOperand(NumOperands-(RegForm?3:7)).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ if (MI->getOperand(NumOperands - 1).isImm())
+ DecodeVALIGNMask(getRegOperandVectorVT(MI, MVT::i32, 0),
+ MI->getOperand(NumOperands - 1).getImm(),
+ ShuffleMask);
+ break;
+
CASE_SHUF(PSHUFD, ri)
Src1Name = getRegName(MI->getOperand(NumOperands - 2).getReg());
- // FALL THROUGH.
+ LLVM_FALLTHROUGH;
+
CASE_SHUF(PSHUFD, mi)
DestName = getRegName(MI->getOperand(0).getReg());
if (MI->getOperand(NumOperands - 1).isImm())
@@ -589,7 +678,8 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
CASE_SHUF(PSHUFHW, ri)
Src1Name = getRegName(MI->getOperand(NumOperands - 2).getReg());
- // FALL THROUGH.
+ LLVM_FALLTHROUGH;
+
CASE_SHUF(PSHUFHW, mi)
DestName = getRegName(MI->getOperand(0).getReg());
if (MI->getOperand(NumOperands - 1).isImm())
@@ -600,7 +690,8 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
CASE_SHUF(PSHUFLW, ri)
Src1Name = getRegName(MI->getOperand(NumOperands - 2).getReg());
- // FALL THROUGH.
+ LLVM_FALLTHROUGH;
+
CASE_SHUF(PSHUFLW, mi)
DestName = getRegName(MI->getOperand(0).getReg());
if (MI->getOperand(NumOperands - 1).isImm())
@@ -611,7 +702,8 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
case X86::MMX_PSHUFWri:
Src1Name = getRegName(MI->getOperand(1).getReg());
- // FALL THROUGH.
+ LLVM_FALLTHROUGH;
+
case X86::MMX_PSHUFWmi:
DestName = getRegName(MI->getOperand(0).getReg());
if (MI->getOperand(NumOperands - 1).isImm())
@@ -622,7 +714,8 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
case X86::PSWAPDrr:
Src1Name = getRegName(MI->getOperand(1).getReg());
- // FALL THROUGH.
+ LLVM_FALLTHROUGH;
+
case X86::PSWAPDrm:
DestName = getRegName(MI->getOperand(0).getReg());
DecodePSWAPMask(MVT::v2i32, ShuffleMask);
@@ -632,7 +725,8 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
case X86::MMX_PUNPCKHBWirr:
Src2Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
RegForm = true;
- // FALL THROUGH.
+ LLVM_FALLTHROUGH;
+
CASE_UNPCK(PUNPCKHBW, m)
case X86::MMX_PUNPCKHBWirm:
Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
@@ -644,7 +738,8 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
case X86::MMX_PUNPCKHWDirr:
Src2Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
RegForm = true;
- // FALL THROUGH.
+ LLVM_FALLTHROUGH;
+
CASE_UNPCK(PUNPCKHWD, m)
case X86::MMX_PUNPCKHWDirm:
Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
@@ -656,7 +751,8 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
case X86::MMX_PUNPCKHDQirr:
Src2Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
RegForm = true;
- // FALL THROUGH.
+ LLVM_FALLTHROUGH;
+
CASE_UNPCK(PUNPCKHDQ, m)
case X86::MMX_PUNPCKHDQirm:
Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
@@ -667,7 +763,8 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
CASE_UNPCK(PUNPCKHQDQ, r)
Src2Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
RegForm = true;
- // FALL THROUGH.
+ LLVM_FALLTHROUGH;
+
CASE_UNPCK(PUNPCKHQDQ, m)
Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
DestName = getRegName(MI->getOperand(0).getReg());
@@ -678,7 +775,8 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
case X86::MMX_PUNPCKLBWirr:
Src2Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
RegForm = true;
- // FALL THROUGH.
+ LLVM_FALLTHROUGH;
+
CASE_UNPCK(PUNPCKLBW, m)
case X86::MMX_PUNPCKLBWirm:
Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
@@ -690,7 +788,8 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
case X86::MMX_PUNPCKLWDirr:
Src2Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
RegForm = true;
- // FALL THROUGH.
+ LLVM_FALLTHROUGH;
+
CASE_UNPCK(PUNPCKLWD, m)
case X86::MMX_PUNPCKLWDirm:
Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
@@ -702,7 +801,8 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
case X86::MMX_PUNPCKLDQirr:
Src2Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
RegForm = true;
- // FALL THROUGH.
+ LLVM_FALLTHROUGH;
+
CASE_UNPCK(PUNPCKLDQ, m)
case X86::MMX_PUNPCKLDQirm:
Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
@@ -713,7 +813,8 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
CASE_UNPCK(PUNPCKLQDQ, r)
Src2Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
RegForm = true;
- // FALL THROUGH.
+ LLVM_FALLTHROUGH;
+
CASE_UNPCK(PUNPCKLQDQ, m)
Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
DestName = getRegName(MI->getOperand(0).getReg());
@@ -723,7 +824,8 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
CASE_SHUF(SHUFPD, rri)
Src2Name = getRegName(MI->getOperand(NumOperands - 2).getReg());
RegForm = true;
- // FALL THROUGH.
+ LLVM_FALLTHROUGH;
+
CASE_SHUF(SHUFPD, rmi)
if (MI->getOperand(NumOperands - 1).isImm())
DecodeSHUFPMask(getRegOperandVectorVT(MI, MVT::f64, 0),
@@ -736,7 +838,8 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
CASE_SHUF(SHUFPS, rri)
Src2Name = getRegName(MI->getOperand(NumOperands - 2).getReg());
RegForm = true;
- // FALL THROUGH.
+ LLVM_FALLTHROUGH;
+
CASE_SHUF(SHUFPS, rmi)
if (MI->getOperand(NumOperands - 1).isImm())
DecodeSHUFPMask(getRegOperandVectorVT(MI, MVT::f32, 0),
@@ -749,7 +852,8 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
CASE_VSHUF(64X2, r)
Src2Name = getRegName(MI->getOperand(NumOperands - 2).getReg());
RegForm = true;
- // FALL THROUGH.
+ LLVM_FALLTHROUGH;
+
CASE_VSHUF(64X2, m)
decodeVSHUF64x2FamilyMask(getRegOperandVectorVT(MI, MVT::i64, 0),
MI->getOperand(NumOperands - 1).getImm(),
@@ -761,7 +865,8 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
CASE_VSHUF(32X4, r)
Src2Name = getRegName(MI->getOperand(NumOperands - 2).getReg());
RegForm = true;
- // FALL THROUGH.
+ LLVM_FALLTHROUGH;
+
CASE_VSHUF(32X4, m)
decodeVSHUF64x2FamilyMask(getRegOperandVectorVT(MI, MVT::i32, 0),
MI->getOperand(NumOperands - 1).getImm(),
@@ -773,7 +878,8 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
CASE_UNPCK(UNPCKLPD, r)
Src2Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
RegForm = true;
- // FALL THROUGH.
+ LLVM_FALLTHROUGH;
+
CASE_UNPCK(UNPCKLPD, m)
DecodeUNPCKLMask(getRegOperandVectorVT(MI, MVT::f64, 0), ShuffleMask);
Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
@@ -783,7 +889,8 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
CASE_UNPCK(UNPCKLPS, r)
Src2Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
RegForm = true;
- // FALL THROUGH.
+ LLVM_FALLTHROUGH;
+
CASE_UNPCK(UNPCKLPS, m)
DecodeUNPCKLMask(getRegOperandVectorVT(MI, MVT::f32, 0), ShuffleMask);
Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
@@ -793,7 +900,8 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
CASE_UNPCK(UNPCKHPD, r)
Src2Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
RegForm = true;
- // FALL THROUGH.
+ LLVM_FALLTHROUGH;
+
CASE_UNPCK(UNPCKHPD, m)
DecodeUNPCKHMask(getRegOperandVectorVT(MI, MVT::f64, 0), ShuffleMask);
Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
@@ -803,7 +911,8 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
CASE_UNPCK(UNPCKHPS, r)
Src2Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
RegForm = true;
- // FALL THROUGH.
+ LLVM_FALLTHROUGH;
+
CASE_UNPCK(UNPCKHPS, m)
DecodeUNPCKHMask(getRegOperandVectorVT(MI, MVT::f32, 0), ShuffleMask);
Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
@@ -812,7 +921,8 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
CASE_VPERMILPI(PERMILPS, r)
Src1Name = getRegName(MI->getOperand(NumOperands - 2).getReg());
- // FALL THROUGH.
+ LLVM_FALLTHROUGH;
+
CASE_VPERMILPI(PERMILPS, m)
if (MI->getOperand(NumOperands - 1).isImm())
DecodePSHUFMask(getRegOperandVectorVT(MI, MVT::f32, 0),
@@ -823,7 +933,8 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
CASE_VPERMILPI(PERMILPD, r)
Src1Name = getRegName(MI->getOperand(NumOperands - 2).getReg());
- // FALL THROUGH.
+ LLVM_FALLTHROUGH;
+
CASE_VPERMILPI(PERMILPD, m)
if (MI->getOperand(NumOperands - 1).isImm())
DecodePSHUFMask(getRegOperandVectorVT(MI, MVT::f64, 0),
@@ -835,7 +946,8 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
case X86::VPERM2F128rr:
case X86::VPERM2I128rr:
Src2Name = getRegName(MI->getOperand(2).getReg());
- // FALL THROUGH.
+ LLVM_FALLTHROUGH;
+
case X86::VPERM2F128rm:
case X86::VPERM2I128rm:
// For instruction comments purpose, assume the 256-bit vector is v4i64.
@@ -849,7 +961,8 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
CASE_VPERM(PERMPD, r)
Src1Name = getRegName(MI->getOperand(NumOperands - 2).getReg());
- // FALL THROUGH.
+ LLVM_FALLTHROUGH;
+
CASE_VPERM(PERMPD, m)
if (MI->getOperand(NumOperands - 1).isImm())
DecodeVPERMMask(getRegOperandVectorVT(MI, MVT::f64, 0),
@@ -860,7 +973,8 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
CASE_VPERM(PERMQ, r)
Src1Name = getRegName(MI->getOperand(NumOperands - 2).getReg());
- // FALL THROUGH.
+ LLVM_FALLTHROUGH;
+
CASE_VPERM(PERMQ, m)
if (MI->getOperand(NumOperands - 1).isImm())
DecodeVPERMMask(getRegOperandVectorVT(MI, MVT::i64, 0),
@@ -874,7 +988,8 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
case X86::VMOVSDZrr:
Src2Name = getRegName(MI->getOperand(2).getReg());
Src1Name = getRegName(MI->getOperand(1).getReg());
- // FALL THROUGH.
+ LLVM_FALLTHROUGH;
+
case X86::MOVSDrm:
case X86::VMOVSDrm:
case X86::VMOVSDZrm:
@@ -887,7 +1002,8 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
case X86::VMOVSSZrr:
Src2Name = getRegName(MI->getOperand(2).getReg());
Src1Name = getRegName(MI->getOperand(1).getReg());
- // FALL THROUGH.
+ LLVM_FALLTHROUGH;
+
case X86::MOVSSrm:
case X86::VMOVSSrm:
case X86::VMOVSSZrm:
@@ -901,15 +1017,11 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
case X86::VMOVZPQILo2PQIrr:
case X86::VMOVZPQILo2PQIZrr:
Src1Name = getRegName(MI->getOperand(1).getReg());
- // FALL THROUGH.
+ LLVM_FALLTHROUGH;
+
case X86::MOVQI2PQIrm:
- case X86::MOVZQI2PQIrm:
- case X86::MOVZPQILo2PQIrm:
case X86::VMOVQI2PQIrm:
case X86::VMOVQI2PQIZrm:
- case X86::VMOVZQI2PQIrm:
- case X86::VMOVZPQILo2PQIrm:
- case X86::VMOVZPQILo2PQIZrm:
DecodeZeroMoveLowMask(MVT::v2i64, ShuffleMask);
DestName = getRegName(MI->getOperand(0).getReg());
break;
@@ -946,15 +1058,59 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
case X86::VBROADCASTF128:
case X86::VBROADCASTI128:
+ CASE_AVX512_INS_COMMON(BROADCASTF64X2, Z128, rm)
+ CASE_AVX512_INS_COMMON(BROADCASTI64X2, Z128, rm)
DecodeSubVectorBroadcast(MVT::v4f64, MVT::v2f64, ShuffleMask);
DestName = getRegName(MI->getOperand(0).getReg());
break;
+ CASE_AVX512_INS_COMMON(BROADCASTF64X2, , rm)
+ CASE_AVX512_INS_COMMON(BROADCASTI64X2, , rm)
+ DecodeSubVectorBroadcast(MVT::v8f64, MVT::v2f64, ShuffleMask);
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
+ CASE_AVX512_INS_COMMON(BROADCASTF64X4, , rm)
+ CASE_AVX512_INS_COMMON(BROADCASTI64X4, , rm)
+ DecodeSubVectorBroadcast(MVT::v8f64, MVT::v4f64, ShuffleMask);
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
+ CASE_AVX512_INS_COMMON(BROADCASTF32X4, Z256, rm)
+ CASE_AVX512_INS_COMMON(BROADCASTI32X4, Z256, rm)
+ DecodeSubVectorBroadcast(MVT::v8f32, MVT::v4f32, ShuffleMask);
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
+ CASE_AVX512_INS_COMMON(BROADCASTF32X4, , rm)
+ CASE_AVX512_INS_COMMON(BROADCASTI32X4, , rm)
+ DecodeSubVectorBroadcast(MVT::v16f32, MVT::v4f32, ShuffleMask);
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
+ CASE_AVX512_INS_COMMON(BROADCASTF32X8, , rm)
+ CASE_AVX512_INS_COMMON(BROADCASTI32X8, , rm)
+ DecodeSubVectorBroadcast(MVT::v16f32, MVT::v8f32, ShuffleMask);
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
+ CASE_AVX512_INS_COMMON(BROADCASTF32X2, Z256, r)
+ CASE_AVX512_INS_COMMON(BROADCASTI32X2, Z256, r)
+ Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ CASE_AVX512_INS_COMMON(BROADCASTF32X2, Z256, m)
+ CASE_AVX512_INS_COMMON(BROADCASTI32X2, Z256, m)
+ DecodeSubVectorBroadcast(MVT::v8f32, MVT::v2f32, ShuffleMask);
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
+ CASE_AVX512_INS_COMMON(BROADCASTF32X2, Z, r)
+ CASE_AVX512_INS_COMMON(BROADCASTI32X2, Z, r)
+ Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ CASE_AVX512_INS_COMMON(BROADCASTF32X2, Z, m)
+ CASE_AVX512_INS_COMMON(BROADCASTI32X2, Z, m)
+ DecodeSubVectorBroadcast(MVT::v16f32, MVT::v2f32, ShuffleMask);
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
CASE_PMOVZX(PMOVZXBW, r)
CASE_PMOVZX(PMOVZXBD, r)
CASE_PMOVZX(PMOVZXBQ, r)
Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
- // FALL THROUGH.
+ LLVM_FALLTHROUGH;
+
CASE_PMOVZX(PMOVZXBW, m)
CASE_PMOVZX(PMOVZXBD, m)
CASE_PMOVZX(PMOVZXBQ, m)
@@ -965,7 +1121,8 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
CASE_PMOVZX(PMOVZXWD, r)
CASE_PMOVZX(PMOVZXWQ, r)
Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
- // FALL THROUGH.
+ LLVM_FALLTHROUGH;
+
CASE_PMOVZX(PMOVZXWD, m)
CASE_PMOVZX(PMOVZXWQ, m)
DecodeZeroExtendMask(MVT::i16, getZeroExtensionResultType(MI), ShuffleMask);
@@ -974,7 +1131,8 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
CASE_PMOVZX(PMOVZXDQ, r)
Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
- // FALL THROUGH.
+ LLVM_FALLTHROUGH;
+
CASE_PMOVZX(PMOVZXDQ, m)
DecodeZeroExtendMask(MVT::i32, getZeroExtensionResultType(MI), ShuffleMask);
DestName = getRegName(MI->getOperand(0).getReg());
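The ALIGND/ALIGNQ and VBROADCASTF/I{32,64}X{2,4,8} cases added above only build a shuffle mask and hand it to the comment printer. As a point of reference for what those masks look like, here is a small standalone sketch (not the LLVM helper itself) of the subvector-broadcast mask that DecodeSubVectorBroadcast produces for the new broadcast cases:

#include <cstdio>
#include <vector>

// Standalone sketch: a subvector broadcast repeats the lane indices of the
// loaded subvector across the wider destination, which is the mask the new
// VBROADCASTF/I{32,64}X{2,4,8} comment cases print.
static std::vector<int> subVectorBroadcastMask(unsigned DstElts,
                                               unsigned SubElts) {
  std::vector<int> Mask;
  for (unsigned I = 0; I != DstElts; ++I)
    Mask.push_back(int(I % SubElts));
  return Mask;
}

int main() {
  // VBROADCASTF64X2 into a 512-bit register: v8f64 built from v2f64.
  for (int M : subVectorBroadcastMask(8, 2))
    std::printf("%d ", M); // prints: 0 1 0 1 0 1 0 1
  std::printf("\n");
  return 0;
}
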
diff --git a/contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.h b/contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.h
index 687581b..c6d0d85 100644
--- a/contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.h
+++ b/contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.h
@@ -16,6 +16,11 @@
#define LLVM_LIB_TARGET_X86_INSTPRINTER_X86INSTCOMMENTS_H
namespace llvm {
+
+ enum AsmComments {
+ AC_EVEX_2_VEX = 0x2 // For instr that was compressed from EVEX to VEX.
+ };
+
class MCInst;
class raw_ostream;
bool EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
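The new AsmComments enum is just a bit that a later pass can set on an instruction it rewrote, and the comment emitter tests it before printing a note. A minimal sketch of that check, with the flag carrier reduced to a plain integer (the exact carrier used in LLVM is an assumption here):

#include <cassert>
#include <cstdint>

// Mirror of the new enum; 0x2 marks an instruction whose EVEX encoding was
// compressed to VEX.
enum AsmComments { AC_EVEX_2_VEX = 0x2 };

// Sketch: a comment emitter would test the bit before printing a note.
static bool wasCompressedFromEvex(uint32_t Flags) {
  return (Flags & AC_EVEX_2_VEX) != 0;
}

int main() {
  assert(wasCompressedFromEvex(AC_EVEX_2_VEX));
  assert(!wasCompressedFromEvex(0));
  return 0;
}
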
diff --git a/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp b/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp
index 879378f..4443edb 100644
--- a/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp
+++ b/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp
@@ -253,5 +253,8 @@ void X86IntelInstPrinter::printMemOffset(const MCInst *MI, unsigned Op,
void X86IntelInstPrinter::printU8Imm(const MCInst *MI, unsigned Op,
raw_ostream &O) {
+ if (MI->getOperand(Op).isExpr())
+ return MI->getOperand(Op).getExpr()->print(O, &MAI);
+
O << formatImm(MI->getOperand(Op).getImm() & 0xff);
}
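The Intel printer change above prints expression operands symbolically and keeps masking plain immediates to 8 bits. A tiny standalone sketch of the numeric path only (the expression path needs MCExpr and is omitted):

#include <cassert>
#include <cstdint>

// Sketch of the immediate path of printU8Imm: only the low 8 bits of the
// operand value are printed.
static uint8_t maskU8Imm(int64_t Imm) { return uint8_t(Imm & 0xff); }

int main() {
  assert(maskU8Imm(0x1ff) == 0xff);
  assert(maskU8Imm(8) == 8);
  return 0;
}
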
diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
index e77a0dc..e83ec9f 100644
--- a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
+++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
@@ -76,12 +76,12 @@ class X86AsmBackend : public MCAsmBackend {
public:
X86AsmBackend(const Target &T, StringRef CPU)
: MCAsmBackend(), CPU(CPU),
- MaxNopLength((CPU == "slm" || CPU == "lakemont") ? 7 : 15) {
+ MaxNopLength((CPU == "slm") ? 7 : 15) {
HasNopl = CPU != "generic" && CPU != "i386" && CPU != "i486" &&
CPU != "i586" && CPU != "pentium" && CPU != "pentium-mmx" &&
CPU != "i686" && CPU != "k6" && CPU != "k6-2" && CPU != "k6-3" &&
CPU != "geode" && CPU != "winchip-c6" && CPU != "winchip2" &&
- CPU != "c3" && CPU != "c3-2";
+ CPU != "c3" && CPU != "c3-2" && CPU != "lakemont";
}
unsigned getNumFixupKinds() const override {
@@ -546,8 +546,12 @@ protected:
// .cfi_def_cfa_register %rbp
//
HasFP = true;
- assert(MRI.getLLVMRegNum(Inst.getRegister(), true) ==
- (Is64Bit ? X86::RBP : X86::EBP) && "Invalid frame pointer!");
+
+  // If the frame pointer is other than ebp/rbp, we do not have a way to
+ // generate a compact unwinding representation, so bail out.
+ if (MRI.getLLVMRegNum(Inst.getRegister(), true) !=
+ (Is64Bit ? X86::RBP : X86::EBP))
+ return 0;
// Reset the counts.
memset(SavedRegs, 0, sizeof(SavedRegs));
@@ -837,7 +841,8 @@ public:
MCAsmBackend *llvm::createX86_32AsmBackend(const Target &T,
const MCRegisterInfo &MRI,
const Triple &TheTriple,
- StringRef CPU) {
+ StringRef CPU,
+ const MCTargetOptions &Options) {
if (TheTriple.isOSBinFormatMachO())
return new DarwinX86_32AsmBackend(T, MRI, CPU);
@@ -855,7 +860,8 @@ MCAsmBackend *llvm::createX86_32AsmBackend(const Target &T,
MCAsmBackend *llvm::createX86_64AsmBackend(const Target &T,
const MCRegisterInfo &MRI,
const Triple &TheTriple,
- StringRef CPU) {
+ StringRef CPU,
+ const MCTargetOptions &Options) {
if (TheTriple.isOSBinFormatMachO()) {
MachO::CPUSubTypeX86 CS =
StringSwitch<MachO::CPUSubTypeX86>(TheTriple.getArchName())
diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
index b419517..aab5525 100644
--- a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
+++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
@@ -234,88 +234,114 @@ namespace X86II {
/// their one register operand added to their opcode.
AddRegFrm = 2,
- /// MRMDestReg - This form is used for instructions that use the Mod/RM byte
- /// to specify a destination, which in this case is a register.
- ///
- MRMDestReg = 3,
-
- /// MRMDestMem - This form is used for instructions that use the Mod/RM byte
- /// to specify a destination, which in this case is memory.
- ///
- MRMDestMem = 4,
-
- /// MRMSrcReg - This form is used for instructions that use the Mod/RM byte
- /// to specify a source, which in this case is a register.
- ///
- MRMSrcReg = 5,
-
- /// MRMSrcMem - This form is used for instructions that use the Mod/RM byte
- /// to specify a source, which in this case is memory.
- ///
- MRMSrcMem = 6,
-
/// RawFrmMemOffs - This form is for instructions that store an absolute
/// memory offset as an immediate with a possible segment override.
- RawFrmMemOffs = 7,
+ RawFrmMemOffs = 3,
/// RawFrmSrc - This form is for instructions that use the source index
/// register SI/ESI/RSI with a possible segment override.
- RawFrmSrc = 8,
+ RawFrmSrc = 4,
/// RawFrmDst - This form is for instructions that use the destination index
/// register DI/EDI/RDI.
- RawFrmDst = 9,
+ RawFrmDst = 5,
/// RawFrmDstSrc - This form is for instructions that use the source index
/// register SI/ESI/RSI with a possible segment override, and also the
/// destination index register DI/EDI/RDI.
- RawFrmDstSrc = 10,
+ RawFrmDstSrc = 6,
/// RawFrmImm8 - This is used for the ENTER instruction, which has two
/// immediates, the first of which is a 16-bit immediate (specified by
/// the imm encoding) and the second is a 8-bit fixed value.
- RawFrmImm8 = 11,
+ RawFrmImm8 = 7,
/// RawFrmImm16 - This is used for CALL FAR instructions, which have two
/// immediates, the first of which is a 16 or 32-bit immediate (specified by
/// the imm encoding) and the second is a 16-bit fixed value. In the AMD
/// manual, this operand is described as pntr16:32 and pntr16:16
- RawFrmImm16 = 12,
-
- /// MRMX[rm] - The forms are used to represent instructions that use a
- /// Mod/RM byte, and don't use the middle field for anything.
- MRMXr = 14, MRMXm = 15,
+ RawFrmImm16 = 8,
/// MRM[0-7][rm] - These forms are used to represent instructions that use
/// a Mod/RM byte, and use the middle field to hold extended opcode
/// information. In the intel manual these are represented as /0, /1, ...
///
- // First, instructions that operate on a register r/m operand...
- MRM0r = 16, MRM1r = 17, MRM2r = 18, MRM3r = 19, // Format /0 /1 /2 /3
- MRM4r = 20, MRM5r = 21, MRM6r = 22, MRM7r = 23, // Format /4 /5 /6 /7
+ /// MRMDestMem - This form is used for instructions that use the Mod/RM byte
+ /// to specify a destination, which in this case is memory.
+ ///
+ MRMDestMem = 32,
+
+ /// MRMSrcMem - This form is used for instructions that use the Mod/RM byte
+ /// to specify a source, which in this case is memory.
+ ///
+ MRMSrcMem = 33,
+
+ /// MRMSrcMem4VOp3 - This form is used for instructions that encode
+ /// operand 3 with VEX.VVVV and load from memory.
+ ///
+ MRMSrcMem4VOp3 = 34,
+
+ /// MRMSrcMemOp4 - This form is used for instructions that use the Mod/RM
+ /// byte to specify the fourth source, which in this case is memory.
+ ///
+ MRMSrcMemOp4 = 35,
+
+ /// MRMXm - This form is used for instructions that use the Mod/RM byte
+ /// to specify a memory source, but doesn't use the middle field.
+ ///
+ MRMXm = 39, // Instruction that uses Mod/RM but not the middle field.
// Next, instructions that operate on a memory r/m operand...
- MRM0m = 24, MRM1m = 25, MRM2m = 26, MRM3m = 27, // Format /0 /1 /2 /3
- MRM4m = 28, MRM5m = 29, MRM6m = 30, MRM7m = 31, // Format /4 /5 /6 /7
-
- //// MRM_XX - A mod/rm byte of exactly 0xXX.
- MRM_C0 = 32, MRM_C1 = 33, MRM_C2 = 34, MRM_C3 = 35,
- MRM_C4 = 36, MRM_C5 = 37, MRM_C6 = 38, MRM_C7 = 39,
- MRM_C8 = 40, MRM_C9 = 41, MRM_CA = 42, MRM_CB = 43,
- MRM_CC = 44, MRM_CD = 45, MRM_CE = 46, MRM_CF = 47,
- MRM_D0 = 48, MRM_D1 = 49, MRM_D2 = 50, MRM_D3 = 51,
- MRM_D4 = 52, MRM_D5 = 53, MRM_D6 = 54, MRM_D7 = 55,
- MRM_D8 = 56, MRM_D9 = 57, MRM_DA = 58, MRM_DB = 59,
- MRM_DC = 60, MRM_DD = 61, MRM_DE = 62, MRM_DF = 63,
- MRM_E0 = 64, MRM_E1 = 65, MRM_E2 = 66, MRM_E3 = 67,
- MRM_E4 = 68, MRM_E5 = 69, MRM_E6 = 70, MRM_E7 = 71,
- MRM_E8 = 72, MRM_E9 = 73, MRM_EA = 74, MRM_EB = 75,
- MRM_EC = 76, MRM_ED = 77, MRM_EE = 78, MRM_EF = 79,
- MRM_F0 = 80, MRM_F1 = 81, MRM_F2 = 82, MRM_F3 = 83,
- MRM_F4 = 84, MRM_F5 = 85, MRM_F6 = 86, MRM_F7 = 87,
- MRM_F8 = 88, MRM_F9 = 89, MRM_FA = 90, MRM_FB = 91,
- MRM_FC = 92, MRM_FD = 93, MRM_FE = 94, MRM_FF = 95,
+ MRM0m = 40, MRM1m = 41, MRM2m = 42, MRM3m = 43, // Format /0 /1 /2 /3
+ MRM4m = 44, MRM5m = 45, MRM6m = 46, MRM7m = 47, // Format /4 /5 /6 /7
+
+ /// MRMDestReg - This form is used for instructions that use the Mod/RM byte
+ /// to specify a destination, which in this case is a register.
+ ///
+ MRMDestReg = 48,
+
+ /// MRMSrcReg - This form is used for instructions that use the Mod/RM byte
+ /// to specify a source, which in this case is a register.
+ ///
+ MRMSrcReg = 49,
+
+ /// MRMSrcReg4VOp3 - This form is used for instructions that encode
+ /// operand 3 with VEX.VVVV and do not load from memory.
+ ///
+ MRMSrcReg4VOp3 = 50,
+
+ /// MRMSrcRegOp4 - This form is used for instructions that use the Mod/RM
+ /// byte to specify the fourth source, which in this case is a register.
+ ///
+ MRMSrcRegOp4 = 51,
+
+ /// MRMXr - This form is used for instructions that use the Mod/RM byte
+ /// to specify a register source, but doesn't use the middle field.
+ ///
+ MRMXr = 55, // Instruction that uses Mod/RM but not the middle field.
+
+ // Instructions that operate on a register r/m operand...
+ MRM0r = 56, MRM1r = 57, MRM2r = 58, MRM3r = 59, // Format /0 /1 /2 /3
+ MRM4r = 60, MRM5r = 61, MRM6r = 62, MRM7r = 63, // Format /4 /5 /6 /7
+
+ /// MRM_XX - A mod/rm byte of exactly 0xXX.
+ MRM_C0 = 64, MRM_C1 = 65, MRM_C2 = 66, MRM_C3 = 67,
+ MRM_C4 = 68, MRM_C5 = 69, MRM_C6 = 70, MRM_C7 = 71,
+ MRM_C8 = 72, MRM_C9 = 73, MRM_CA = 74, MRM_CB = 75,
+ MRM_CC = 76, MRM_CD = 77, MRM_CE = 78, MRM_CF = 79,
+ MRM_D0 = 80, MRM_D1 = 81, MRM_D2 = 82, MRM_D3 = 83,
+ MRM_D4 = 84, MRM_D5 = 85, MRM_D6 = 86, MRM_D7 = 87,
+ MRM_D8 = 88, MRM_D9 = 89, MRM_DA = 90, MRM_DB = 91,
+ MRM_DC = 92, MRM_DD = 93, MRM_DE = 94, MRM_DF = 95,
+ MRM_E0 = 96, MRM_E1 = 97, MRM_E2 = 98, MRM_E3 = 99,
+ MRM_E4 = 100, MRM_E5 = 101, MRM_E6 = 102, MRM_E7 = 103,
+ MRM_E8 = 104, MRM_E9 = 105, MRM_EA = 106, MRM_EB = 107,
+ MRM_EC = 108, MRM_ED = 109, MRM_EE = 110, MRM_EF = 111,
+ MRM_F0 = 112, MRM_F1 = 113, MRM_F2 = 114, MRM_F3 = 115,
+ MRM_F4 = 116, MRM_F5 = 117, MRM_F6 = 118, MRM_F7 = 119,
+ MRM_F8 = 120, MRM_F9 = 121, MRM_FA = 122, MRM_FB = 123,
+ MRM_FC = 124, MRM_FD = 125, MRM_FE = 126, MRM_FF = 127,
FormMask = 127,
@@ -403,12 +429,13 @@ namespace X86II {
ImmMask = 15 << ImmShift,
Imm8 = 1 << ImmShift,
Imm8PCRel = 2 << ImmShift,
- Imm16 = 3 << ImmShift,
- Imm16PCRel = 4 << ImmShift,
- Imm32 = 5 << ImmShift,
- Imm32PCRel = 6 << ImmShift,
- Imm32S = 7 << ImmShift,
- Imm64 = 8 << ImmShift,
+ Imm8Reg = 3 << ImmShift,
+ Imm16 = 4 << ImmShift,
+ Imm16PCRel = 5 << ImmShift,
+ Imm32 = 6 << ImmShift,
+ Imm32PCRel = 7 << ImmShift,
+ Imm32S = 8 << ImmShift,
+ Imm64 = 9 << ImmShift,
//===------------------------------------------------------------------===//
// FP Instruction Classification... Zero is non-fp instruction.
@@ -488,39 +515,15 @@ namespace X86II {
VEX_4VShift = VEX_WShift + 1,
VEX_4V = 1ULL << VEX_4VShift,
- /// VEX_4VOp3 - Similar to VEX_4V, but used on instructions that encode
- /// operand 3 with VEX.vvvv.
- VEX_4VOp3Shift = VEX_4VShift + 1,
- VEX_4VOp3 = 1ULL << VEX_4VOp3Shift,
-
- /// VEX_I8IMM - Specifies that the last register used in a AVX instruction,
- /// must be encoded in the i8 immediate field. This usually happens in
- /// instructions with 4 operands.
- VEX_I8IMMShift = VEX_4VOp3Shift + 1,
- VEX_I8IMM = 1ULL << VEX_I8IMMShift,
-
/// VEX_L - Stands for a bit in the VEX opcode prefix meaning the current
/// instruction uses 256-bit wide registers. This is usually auto detected
/// if a VR256 register is used, but some AVX instructions also have this
/// field marked when using a f256 memory references.
- VEX_LShift = VEX_I8IMMShift + 1,
+ VEX_LShift = VEX_4VShift + 1,
VEX_L = 1ULL << VEX_LShift,
- // VEX_LIG - Specifies that this instruction ignores the L-bit in the VEX
- // prefix. Usually used for scalar instructions. Needed by disassembler.
- VEX_LIGShift = VEX_LShift + 1,
- VEX_LIG = 1ULL << VEX_LIGShift,
-
- // TODO: we should combine VEX_L and VEX_LIG together to form a 2-bit field
- // with following encoding:
- // - 00 V128
- // - 01 V256
- // - 10 V512
- // - 11 LIG (but, in insn encoding, leave VEX.L and EVEX.L in zeros.
- // this will save 1 tsflag bit
-
// EVEX_K - Set if this instruction requires masking
- EVEX_KShift = VEX_LIGShift + 1,
+ EVEX_KShift = VEX_LShift + 1,
EVEX_K = 1ULL << EVEX_KShift,
// EVEX_Z - Set if this instruction has EVEX.Z field set.
@@ -548,13 +551,8 @@ namespace X86II {
Has3DNow0F0FOpcodeShift = CD8_Scale_Shift + 7,
Has3DNow0F0FOpcode = 1ULL << Has3DNow0F0FOpcodeShift,
- /// MemOp4 - Used to indicate swapping of operand 3 and 4 to be encoded in
- /// ModRM or I8IMM. This is used for FMA4 and XOP instructions.
- MemOp4Shift = Has3DNow0F0FOpcodeShift + 1,
- MemOp4 = 1ULL << MemOp4Shift,
-
/// Explicitly specified rounding control
- EVEX_RCShift = MemOp4Shift + 1,
+ EVEX_RCShift = Has3DNow0F0FOpcodeShift + 1,
EVEX_RC = 1ULL << EVEX_RCShift
};
@@ -575,7 +573,8 @@ namespace X86II {
switch (TSFlags & X86II::ImmMask) {
default: llvm_unreachable("Unknown immediate size");
case X86II::Imm8:
- case X86II::Imm8PCRel: return 1;
+ case X86II::Imm8PCRel:
+ case X86II::Imm8Reg: return 1;
case X86II::Imm16:
case X86II::Imm16PCRel: return 2;
case X86II::Imm32:
@@ -595,6 +594,7 @@ namespace X86II {
case X86II::Imm32PCRel:
return true;
case X86II::Imm8:
+ case X86II::Imm8Reg:
case X86II::Imm16:
case X86II::Imm32:
case X86II::Imm32S:
@@ -612,6 +612,7 @@ namespace X86II {
return true;
case X86II::Imm8:
case X86II::Imm8PCRel:
+ case X86II::Imm8Reg:
case X86II::Imm16:
case X86II::Imm16PCRel:
case X86II::Imm32:
@@ -626,26 +627,25 @@ namespace X86II {
/// in this instruction.
/// If this is a two-address instruction, skip one of the register operands.
/// FIXME: This should be handled during MCInst lowering.
- inline int getOperandBias(const MCInstrDesc& Desc)
+ inline unsigned getOperandBias(const MCInstrDesc& Desc)
{
unsigned NumOps = Desc.getNumOperands();
- unsigned CurOp = 0;
if (NumOps > 1 && Desc.getOperandConstraint(1, MCOI::TIED_TO) == 0)
- ++CurOp;
- else if (NumOps > 3 && Desc.getOperandConstraint(2, MCOI::TIED_TO) == 0 &&
- Desc.getOperandConstraint(3, MCOI::TIED_TO) == 1)
+ return 1;
+ if (NumOps > 3 && Desc.getOperandConstraint(2, MCOI::TIED_TO) == 0 &&
+ Desc.getOperandConstraint(3, MCOI::TIED_TO) == 1)
// Special case for AVX-512 GATHER with 2 TIED_TO operands
// Skip the first 2 operands: dst, mask_wb
- CurOp += 2;
- else if (NumOps > 3 && Desc.getOperandConstraint(2, MCOI::TIED_TO) == 0 &&
- Desc.getOperandConstraint(NumOps - 1, MCOI::TIED_TO) == 1)
+ return 2;
+ if (NumOps > 3 && Desc.getOperandConstraint(2, MCOI::TIED_TO) == 0 &&
+ Desc.getOperandConstraint(NumOps - 1, MCOI::TIED_TO) == 1)
// Special case for GATHER with 2 TIED_TO operands
// Skip the first 2 operands: dst, mask_wb
- CurOp += 2;
- else if (NumOps > 2 && Desc.getOperandConstraint(NumOps - 2, MCOI::TIED_TO) == 0)
+ return 2;
+ if (NumOps > 2 && Desc.getOperandConstraint(NumOps - 2, MCOI::TIED_TO) == 0)
// SCATTER
- ++CurOp;
- return CurOp;
+ return 1;
+ return 0;
}
/// getMemoryOperandNo - The function returns the MCInst operand # for the
@@ -658,7 +658,6 @@ namespace X86II {
///
inline int getMemoryOperandNo(uint64_t TSFlags) {
bool HasVEX_4V = TSFlags & X86II::VEX_4V;
- bool HasMemOp4 = TSFlags & X86II::MemOp4;
bool HasEVEX_K = TSFlags & X86II::EVEX_K;
switch (TSFlags & X86II::FormMask) {
@@ -666,8 +665,6 @@ namespace X86II {
case X86II::Pseudo:
case X86II::RawFrm:
case X86II::AddRegFrm:
- case X86II::MRMDestReg:
- case X86II::MRMSrcReg:
case X86II::RawFrmImm8:
case X86II::RawFrmImm16:
case X86II::RawFrmMemOffs:
@@ -680,7 +677,17 @@ namespace X86II {
case X86II::MRMSrcMem:
// Start from 1, skip any registers encoded in VEX_VVVV or I8IMM, or a
// mask register.
- return 1 + HasVEX_4V + HasMemOp4 + HasEVEX_K;
+ return 1 + HasVEX_4V + HasEVEX_K;
+ case X86II::MRMSrcMem4VOp3:
+ // Skip registers encoded in reg.
+ return 1 + HasEVEX_K;
+ case X86II::MRMSrcMemOp4:
+ // Skip registers encoded in reg, VEX_VVVV, and I8IMM.
+ return 3;
+ case X86II::MRMDestReg:
+ case X86II::MRMSrcReg:
+ case X86II::MRMSrcReg4VOp3:
+ case X86II::MRMSrcRegOp4:
case X86II::MRMXr:
case X86II::MRM0r: case X86II::MRM1r:
case X86II::MRM2r: case X86II::MRM3r:
@@ -723,12 +730,9 @@ namespace X86II {
/// isX86_64ExtendedReg - Is the MachineOperand a x86-64 extended (r8 or
/// higher) register? e.g. r8, xmm8, xmm13, etc.
inline bool isX86_64ExtendedReg(unsigned RegNo) {
- if ((RegNo >= X86::XMM8 && RegNo <= X86::XMM15) ||
- (RegNo >= X86::XMM24 && RegNo <= X86::XMM31) ||
- (RegNo >= X86::YMM8 && RegNo <= X86::YMM15) ||
- (RegNo >= X86::YMM24 && RegNo <= X86::YMM31) ||
- (RegNo >= X86::ZMM8 && RegNo <= X86::ZMM15) ||
- (RegNo >= X86::ZMM24 && RegNo <= X86::ZMM31))
+ if ((RegNo >= X86::XMM8 && RegNo <= X86::XMM31) ||
+ (RegNo >= X86::YMM8 && RegNo <= X86::YMM31) ||
+ (RegNo >= X86::ZMM8 && RegNo <= X86::ZMM31))
return true;
switch (RegNo) {
@@ -743,6 +747,8 @@ namespace X86II {
case X86::R12B: case X86::R13B: case X86::R14B: case X86::R15B:
case X86::CR8: case X86::CR9: case X86::CR10: case X86::CR11:
case X86::CR12: case X86::CR13: case X86::CR14: case X86::CR15:
+ case X86::DR8: case X86::DR9: case X86::DR10: case X86::DR11:
+ case X86::DR12: case X86::DR13: case X86::DR14: case X86::DR15:
return true;
}
return false;
@@ -761,6 +767,16 @@ namespace X86II {
return (reg == X86::SPL || reg == X86::BPL ||
reg == X86::SIL || reg == X86::DIL);
}
+
+ /// isKMasked - Is this a masked instruction.
+ inline bool isKMasked(uint64_t TSFlags) {
+ return (TSFlags & X86II::EVEX_K) != 0;
+ }
+
+ /// isKMergedMasked - Is this a merge masked instruction.
+  /// isKMergeMasked - Is this a merge-masked instruction.
+ return isKMasked(TSFlags) && (TSFlags & X86II::EVEX_Z) == 0;
+ }
}
} // end namespace llvm;
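The new isKMasked/isKMergeMasked helpers only look at two TSFlags bits: EVEX.K (the instruction takes a mask register) and EVEX.Z (disabled lanes are zeroed instead of merged). A standalone sketch with placeholder bit positions (the real shifts come from the TSFlags layout above and are not reproduced here):

#include <cassert>
#include <cstdint>

// Placeholder bit positions; the real EVEX_K/EVEX_Z shifts are defined by
// the X86II TSFlags layout and are only stand-ins here.
constexpr uint64_t EVEX_K = 1ULL << 40;
constexpr uint64_t EVEX_Z = 1ULL << 41;

static bool isKMasked(uint64_t TSFlags) { return (TSFlags & EVEX_K) != 0; }

// Merge-masking: masked, but disabled lanes keep their old contents
// (EVEX.Z clear). Zero-masking sets both bits.
static bool isKMergeMasked(uint64_t TSFlags) {
  return isKMasked(TSFlags) && (TSFlags & EVEX_Z) == 0;
}

int main() {
  assert(isKMergeMasked(EVEX_K));           // {k1} merge-masked
  assert(!isKMergeMasked(EVEX_K | EVEX_Z)); // {k1}{z} zero-masked
  assert(!isKMergeMasked(0));               // unmasked
  return 0;
}
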
diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp
index b7c56ce..48a1d8f 100644
--- a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp
+++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp
@@ -31,8 +31,7 @@ static cl::opt<AsmWriterFlavorTy>
AsmWriterFlavor("x86-asm-syntax", cl::init(ATT),
cl::desc("Choose style of code to emit from X86 backend:"),
cl::values(clEnumValN(ATT, "att", "Emit AT&T-style assembly"),
- clEnumValN(Intel, "intel", "Emit Intel-style assembly"),
- clEnumValEnd));
+ clEnumValN(Intel, "intel", "Emit Intel-style assembly")));
static cl::opt<bool>
MarkedJTDataRegions("mark-data-regions", cl::init(true),
diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
index 96c2e81..8045e7c 100644
--- a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
+++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
@@ -81,7 +81,8 @@ public:
MI.getOperand(OpNum).getReg());
}
- bool isX86_64ExtendedReg(const MCInst &MI, unsigned OpNum) const {
+ // Does this register require a bit to be set in REX prefix.
+ bool isREXExtendedReg(const MCInst &MI, unsigned OpNum) const {
return (getX86RegEncoding(MI, OpNum) >> 3) & 1;
}
@@ -602,8 +603,6 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
uint64_t Encoding = TSFlags & X86II::EncodingMask;
bool HasEVEX_K = TSFlags & X86II::EVEX_K;
bool HasVEX_4V = TSFlags & X86II::VEX_4V;
- bool HasVEX_4VOp3 = TSFlags & X86II::VEX_4VOp3;
- bool HasMemOp4 = TSFlags & X86II::MemOp4;
bool HasEVEX_RC = TSFlags & X86II::EVEX_RC;
// VEX_R: opcode extension equivalent to REX.R in
@@ -745,11 +744,10 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
// src1(ModR/M), MemAddr
// src1(ModR/M), src2(VEX_4V), MemAddr
// src1(ModR/M), MemAddr, imm8
- // src1(ModR/M), MemAddr, src2(VEX_I8IMM)
+ // src1(ModR/M), MemAddr, src2(Imm[7:4])
//
// FMA4:
- // dst(ModR/M.reg), src1(VEX_4V), src2(ModR/M), src3(VEX_I8IMM)
- // dst(ModR/M.reg), src1(VEX_4V), src2(VEX_I8IMM), src3(ModR/M),
+ // dst(ModR/M.reg), src1(VEX_4V), src2(ModR/M), src3(Imm[7:4])
unsigned RegEnc = getX86RegEncoding(MI, CurOp++);
VEX_R = ~(RegEnc >> 3) & 1;
EVEX_R2 = ~(RegEnc >> 4) & 1;
@@ -770,13 +768,34 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
if (!HasVEX_4V) // Only needed with VSIB which don't use VVVV.
EVEX_V2 = ~(IndexRegEnc >> 4) & 1;
- if (HasVEX_4VOp3)
- // Instruction format for 4VOp3:
- // src1(ModR/M), MemAddr, src3(VEX_4V)
- // CurOp points to start of the MemoryOperand,
- // it skips TIED_TO operands if exist, then increments past src1.
- // CurOp + X86::AddrNumOperands will point to src3.
- VEX_4V = ~getX86RegEncoding(MI, CurOp + X86::AddrNumOperands) & 0xf;
+ break;
+ }
+ case X86II::MRMSrcMem4VOp3: {
+ // Instruction format for 4VOp3:
+ // src1(ModR/M), MemAddr, src3(VEX_4V)
+ unsigned RegEnc = getX86RegEncoding(MI, CurOp++);
+ VEX_R = ~(RegEnc >> 3) & 1;
+
+ unsigned BaseRegEnc = getX86RegEncoding(MI, MemOperand + X86::AddrBaseReg);
+ VEX_B = ~(BaseRegEnc >> 3) & 1;
+ unsigned IndexRegEnc = getX86RegEncoding(MI, MemOperand+X86::AddrIndexReg);
+ VEX_X = ~(IndexRegEnc >> 3) & 1;
+
+ VEX_4V = ~getX86RegEncoding(MI, CurOp + X86::AddrNumOperands) & 0xf;
+ break;
+ }
+ case X86II::MRMSrcMemOp4: {
+ // dst(ModR/M.reg), src1(VEX_4V), src2(Imm[7:4]), src3(ModR/M),
+ unsigned RegEnc = getX86RegEncoding(MI, CurOp++);
+ VEX_R = ~(RegEnc >> 3) & 1;
+
+ unsigned VRegEnc = getX86RegEncoding(MI, CurOp++);
+ VEX_4V = ~VRegEnc & 0xf;
+
+ unsigned BaseRegEnc = getX86RegEncoding(MI, MemOperand + X86::AddrBaseReg);
+ VEX_B = ~(BaseRegEnc >> 3) & 1;
+ unsigned IndexRegEnc = getX86RegEncoding(MI, MemOperand+X86::AddrIndexReg);
+ VEX_X = ~(IndexRegEnc >> 3) & 1;
break;
}
case X86II::MRM0m: case X86II::MRM1m:
@@ -803,13 +822,12 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
}
case X86II::MRMSrcReg: {
// MRMSrcReg instructions forms:
- // dst(ModR/M), src1(VEX_4V), src2(ModR/M), src3(VEX_I8IMM)
+ // dst(ModR/M), src1(VEX_4V), src2(ModR/M), src3(Imm[7:4])
// dst(ModR/M), src1(ModR/M)
// dst(ModR/M), src1(ModR/M), imm8
//
// FMA4:
- // dst(ModR/M.reg), src1(VEX_4V), src2(ModR/M), src3(VEX_I8IMM)
- // dst(ModR/M.reg), src1(VEX_4V), src2(VEX_I8IMM), src3(ModR/M),
+ // dst(ModR/M.reg), src1(VEX_4V), src2(Imm[7:4]), src3(ModR/M),
unsigned RegEnc = getX86RegEncoding(MI, CurOp++);
VEX_R = ~(RegEnc >> 3) & 1;
EVEX_R2 = ~(RegEnc >> 4) & 1;
@@ -823,14 +841,10 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
EVEX_V2 = ~(VRegEnc >> 4) & 1;
}
- if (HasMemOp4) // Skip second register source (encoded in I8IMM)
- CurOp++;
-
RegEnc = getX86RegEncoding(MI, CurOp++);
VEX_B = ~(RegEnc >> 3) & 1;
VEX_X = ~(RegEnc >> 4) & 1;
- if (HasVEX_4VOp3)
- VEX_4V = ~getX86RegEncoding(MI, CurOp++) & 0xf;
+
if (EVEX_b) {
if (HasEVEX_RC) {
unsigned RcOperand = NumOps-1;
@@ -841,6 +855,34 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
}
break;
}
+ case X86II::MRMSrcReg4VOp3: {
+ // Instruction format for 4VOp3:
+ // src1(ModR/M), src2(ModR/M), src3(VEX_4V)
+ unsigned RegEnc = getX86RegEncoding(MI, CurOp++);
+ VEX_R = ~(RegEnc >> 3) & 1;
+
+ RegEnc = getX86RegEncoding(MI, CurOp++);
+ VEX_B = ~(RegEnc >> 3) & 1;
+
+ VEX_4V = ~getX86RegEncoding(MI, CurOp++) & 0xf;
+ break;
+ }
+ case X86II::MRMSrcRegOp4: {
+ // dst(ModR/M.reg), src1(VEX_4V), src2(Imm[7:4]), src3(ModR/M),
+ unsigned RegEnc = getX86RegEncoding(MI, CurOp++);
+ VEX_R = ~(RegEnc >> 3) & 1;
+
+ unsigned VRegEnc = getX86RegEncoding(MI, CurOp++);
+ VEX_4V = ~VRegEnc & 0xf;
+
+ // Skip second register source (encoded in Imm[7:4])
+ ++CurOp;
+
+ RegEnc = getX86RegEncoding(MI, CurOp++);
+ VEX_B = ~(RegEnc >> 3) & 1;
+ VEX_X = ~(RegEnc >> 4) & 1;
+ break;
+ }
case X86II::MRMDestReg: {
// MRMDestReg instructions forms:
// dst(ModR/M), src(ModR/M)
@@ -976,52 +1018,51 @@ uint8_t X86MCCodeEmitter::DetermineREXPrefix(const MCInst &MI, uint64_t TSFlags,
unsigned Reg = MO.getReg();
if (Reg == X86::AH || Reg == X86::BH || Reg == X86::CH || Reg == X86::DH)
UsesHighByteReg = true;
- if (!X86II::isX86_64NonExtLowByteReg(Reg)) continue;
- // FIXME: The caller of DetermineREXPrefix slaps this prefix onto anything
- // that returns non-zero.
- REX |= 0x40; // REX fixed encoding prefix
- break;
+ if (X86II::isX86_64NonExtLowByteReg(Reg))
+ // FIXME: The caller of DetermineREXPrefix slaps this prefix onto anything
+ // that returns non-zero.
+ REX |= 0x40; // REX fixed encoding prefix
}
switch (TSFlags & X86II::FormMask) {
case X86II::AddRegFrm:
- REX |= isX86_64ExtendedReg(MI, CurOp++) << 0; // REX.B
+ REX |= isREXExtendedReg(MI, CurOp++) << 0; // REX.B
break;
case X86II::MRMSrcReg:
- REX |= isX86_64ExtendedReg(MI, CurOp++) << 2; // REX.R
- REX |= isX86_64ExtendedReg(MI, CurOp++) << 0; // REX.B
+ REX |= isREXExtendedReg(MI, CurOp++) << 2; // REX.R
+ REX |= isREXExtendedReg(MI, CurOp++) << 0; // REX.B
break;
case X86II::MRMSrcMem: {
- REX |= isX86_64ExtendedReg(MI, CurOp++) << 2; // REX.R
- REX |= isX86_64ExtendedReg(MI, MemOperand+X86::AddrBaseReg) << 0; // REX.B
- REX |= isX86_64ExtendedReg(MI, MemOperand+X86::AddrIndexReg) << 1; // REX.X
+ REX |= isREXExtendedReg(MI, CurOp++) << 2; // REX.R
+ REX |= isREXExtendedReg(MI, MemOperand+X86::AddrBaseReg) << 0; // REX.B
+ REX |= isREXExtendedReg(MI, MemOperand+X86::AddrIndexReg) << 1; // REX.X
CurOp += X86::AddrNumOperands;
break;
}
case X86II::MRMDestReg:
- REX |= isX86_64ExtendedReg(MI, CurOp++) << 0; // REX.B
- REX |= isX86_64ExtendedReg(MI, CurOp++) << 2; // REX.R
+ REX |= isREXExtendedReg(MI, CurOp++) << 0; // REX.B
+ REX |= isREXExtendedReg(MI, CurOp++) << 2; // REX.R
break;
case X86II::MRMDestMem:
- REX |= isX86_64ExtendedReg(MI, MemOperand+X86::AddrBaseReg) << 0; // REX.B
- REX |= isX86_64ExtendedReg(MI, MemOperand+X86::AddrIndexReg) << 1; // REX.X
+ REX |= isREXExtendedReg(MI, MemOperand+X86::AddrBaseReg) << 0; // REX.B
+ REX |= isREXExtendedReg(MI, MemOperand+X86::AddrIndexReg) << 1; // REX.X
CurOp += X86::AddrNumOperands;
- REX |= isX86_64ExtendedReg(MI, CurOp++) << 2; // REX.R
+ REX |= isREXExtendedReg(MI, CurOp++) << 2; // REX.R
break;
case X86II::MRMXm:
case X86II::MRM0m: case X86II::MRM1m:
case X86II::MRM2m: case X86II::MRM3m:
case X86II::MRM4m: case X86II::MRM5m:
case X86II::MRM6m: case X86II::MRM7m:
- REX |= isX86_64ExtendedReg(MI, MemOperand+X86::AddrBaseReg) << 0; // REX.B
- REX |= isX86_64ExtendedReg(MI, MemOperand+X86::AddrIndexReg) << 1; // REX.X
+ REX |= isREXExtendedReg(MI, MemOperand+X86::AddrBaseReg) << 0; // REX.B
+ REX |= isREXExtendedReg(MI, MemOperand+X86::AddrIndexReg) << 1; // REX.X
break;
case X86II::MRMXr:
case X86II::MRM0r: case X86II::MRM1r:
case X86II::MRM2r: case X86II::MRM3r:
case X86II::MRM4r: case X86II::MRM5r:
case X86II::MRM6r: case X86II::MRM7r:
- REX |= isX86_64ExtendedReg(MI, CurOp++) << 0; // REX.B
+ REX |= isREXExtendedReg(MI, CurOp++) << 0; // REX.B
break;
}
if (REX && UsesHighByteReg)
@@ -1133,10 +1174,7 @@ encodeInstruction(const MCInst &MI, raw_ostream &OS,
// It uses the VEX.VVVV field?
bool HasVEX_4V = TSFlags & X86II::VEX_4V;
- bool HasVEX_4VOp3 = TSFlags & X86II::VEX_4VOp3;
- bool HasMemOp4 = TSFlags & X86II::MemOp4;
- bool HasVEX_I8IMM = TSFlags & X86II::VEX_I8IMM;
- assert((!HasMemOp4 || HasVEX_I8IMM) && "MemOp4 should imply VEX_I8IMM");
+ bool HasVEX_I8Reg = (TSFlags & X86II::ImmMask) == X86II::Imm8Reg;
// It uses the EVEX.aaa field?
bool HasEVEX_K = TSFlags & X86II::EVEX_K;
@@ -1312,21 +1350,42 @@ encodeInstruction(const MCInst &MI, raw_ostream &OS,
if (HasVEX_4V) // Skip 1st src (which is encoded in VEX_VVVV)
++SrcRegNum;
- if (HasMemOp4) // Capture 2nd src (which is encoded in I8IMM)
- I8RegNum = getX86RegEncoding(MI, SrcRegNum++);
-
EmitRegModRMByte(MI.getOperand(SrcRegNum),
GetX86RegNum(MI.getOperand(CurOp)), CurByte, OS);
CurOp = SrcRegNum + 1;
- if (HasVEX_4VOp3)
- ++CurOp;
- if (!HasMemOp4 && HasVEX_I8IMM)
+ if (HasVEX_I8Reg)
I8RegNum = getX86RegEncoding(MI, CurOp++);
// do not count the rounding control operand
if (HasEVEX_RC)
--NumOps;
break;
}
+ case X86II::MRMSrcReg4VOp3: {
+ EmitByte(BaseOpcode, CurByte, OS);
+ unsigned SrcRegNum = CurOp + 1;
+
+ EmitRegModRMByte(MI.getOperand(SrcRegNum),
+ GetX86RegNum(MI.getOperand(CurOp)), CurByte, OS);
+ CurOp = SrcRegNum + 1;
+ ++CurOp; // Encoded in VEX.VVVV
+ break;
+ }
+ case X86II::MRMSrcRegOp4: {
+ EmitByte(BaseOpcode, CurByte, OS);
+ unsigned SrcRegNum = CurOp + 1;
+
+ // Skip 1st src (which is encoded in VEX_VVVV)
+ ++SrcRegNum;
+
+ // Capture 2nd src (which is encoded in Imm[7:4])
+ assert(HasVEX_I8Reg && "MRMSrcRegOp4 should imply VEX_I8Reg");
+ I8RegNum = getX86RegEncoding(MI, SrcRegNum++);
+
+ EmitRegModRMByte(MI.getOperand(SrcRegNum),
+ GetX86RegNum(MI.getOperand(CurOp)), CurByte, OS);
+ CurOp = SrcRegNum + 1;
+ break;
+ }
case X86II::MRMSrcMem: {
unsigned FirstMemOp = CurOp+1;
@@ -1336,20 +1395,42 @@ encodeInstruction(const MCInst &MI, raw_ostream &OS,
if (HasVEX_4V)
++FirstMemOp; // Skip the register source (which is encoded in VEX_VVVV).
- if (HasMemOp4) // Capture second register source (encoded in I8IMM)
- I8RegNum = getX86RegEncoding(MI, FirstMemOp++);
-
EmitByte(BaseOpcode, CurByte, OS);
emitMemModRMByte(MI, FirstMemOp, GetX86RegNum(MI.getOperand(CurOp)),
TSFlags, Rex, CurByte, OS, Fixups, STI);
CurOp = FirstMemOp + X86::AddrNumOperands;
- if (HasVEX_4VOp3)
- ++CurOp;
- if (!HasMemOp4 && HasVEX_I8IMM)
+ if (HasVEX_I8Reg)
I8RegNum = getX86RegEncoding(MI, CurOp++);
break;
}
+ case X86II::MRMSrcMem4VOp3: {
+ unsigned FirstMemOp = CurOp+1;
+
+ EmitByte(BaseOpcode, CurByte, OS);
+
+ emitMemModRMByte(MI, FirstMemOp, GetX86RegNum(MI.getOperand(CurOp)),
+ TSFlags, Rex, CurByte, OS, Fixups, STI);
+ CurOp = FirstMemOp + X86::AddrNumOperands;
+ ++CurOp; // Encoded in VEX.VVVV.
+ break;
+ }
+ case X86II::MRMSrcMemOp4: {
+ unsigned FirstMemOp = CurOp+1;
+
+ ++FirstMemOp; // Skip the register source (which is encoded in VEX_VVVV).
+
+ // Capture second register source (encoded in Imm[7:4])
+    assert(HasVEX_I8Reg && "MRMSrcMemOp4 should imply VEX_I8Reg");
+ I8RegNum = getX86RegEncoding(MI, FirstMemOp++);
+
+ EmitByte(BaseOpcode, CurByte, OS);
+
+ emitMemModRMByte(MI, FirstMemOp, GetX86RegNum(MI.getOperand(CurOp)),
+ TSFlags, Rex, CurByte, OS, Fixups, STI);
+ CurOp = FirstMemOp + X86::AddrNumOperands;
+ break;
+ }
case X86II::MRMXr:
case X86II::MRM0r: case X86II::MRM1r:
@@ -1410,7 +1491,7 @@ encodeInstruction(const MCInst &MI, raw_ostream &OS,
break;
}
- if (HasVEX_I8IMM) {
+ if (HasVEX_I8Reg) {
// The last source register of a 4 operand instruction in AVX is encoded
// in bits[7:4] of an immediate byte.
assert(I8RegNum < 16 && "Register encoding out of range");
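Several of the rewritten emitter cases capture a register number that later lands in bits [7:4] of an immediate byte (what the removed VEX_I8IMM/MemOp4 machinery did, now keyed off the Imm8Reg immediate kind). A standalone sketch of that final byte, assuming a 4-bit register encoding and an optional low-nibble immediate payload:

#include <cassert>
#include <cstdint>

// Sketch: the extra source register of an FMA4/XOP-style instruction goes
// into bits [7:4] of the trailing immediate byte; any real immediate payload
// occupies the low nibble.
static uint8_t encodeImm8Reg(unsigned RegEnc, uint8_t LowNibble) {
  assert(RegEnc < 16 && "Register encoding out of range");
  assert(LowNibble < 16 && "Immediate operand value out of range");
  return uint8_t((RegEnc << 4) | LowNibble);
}

int main() {
  assert(encodeImm8Reg(13, 0) == 0xD0); // e.g. xmm13 as the Imm[7:4] source
  assert(encodeImm8Reg(3, 0x2) == 0x32);
  return 0;
}
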
diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
index 311a8d6..22cb0fa 100644
--- a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
+++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
@@ -234,7 +234,7 @@ static MCInstrAnalysis *createX86MCInstrAnalysis(const MCInstrInfo *Info) {
// Force static initialization.
extern "C" void LLVMInitializeX86TargetMC() {
- for (Target *T : {&TheX86_32Target, &TheX86_64Target}) {
+ for (Target *T : {&getTheX86_32Target(), &getTheX86_64Target()}) {
// Register the MC asm info.
RegisterMCAsmInfoFn X(*T, createX86MCAsmInfo);
@@ -268,9 +268,9 @@ extern "C" void LLVMInitializeX86TargetMC() {
}
// Register the asm backend.
- TargetRegistry::RegisterMCAsmBackend(TheX86_32Target,
+ TargetRegistry::RegisterMCAsmBackend(getTheX86_32Target(),
createX86_32AsmBackend);
- TargetRegistry::RegisterMCAsmBackend(TheX86_64Target,
+ TargetRegistry::RegisterMCAsmBackend(getTheX86_64Target(),
createX86_64AsmBackend);
}
diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h
index ca4f0d3..f73e734 100644
--- a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h
+++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h
@@ -27,13 +27,15 @@ class MCObjectWriter;
class MCRegisterInfo;
class MCSubtargetInfo;
class MCRelocationInfo;
+class MCTargetOptions;
class Target;
class Triple;
class StringRef;
class raw_ostream;
class raw_pwrite_stream;
-extern Target TheX86_32Target, TheX86_64Target;
+Target &getTheX86_32Target();
+Target &getTheX86_64Target();
/// Flavour of dwarf regnumbers
///
@@ -69,9 +71,11 @@ MCCodeEmitter *createX86MCCodeEmitter(const MCInstrInfo &MCII,
MCContext &Ctx);
MCAsmBackend *createX86_32AsmBackend(const Target &T, const MCRegisterInfo &MRI,
- const Triple &TT, StringRef CPU);
+ const Triple &TT, StringRef CPU,
+ const MCTargetOptions &Options);
MCAsmBackend *createX86_64AsmBackend(const Target &T, const MCRegisterInfo &MRI,
- const Triple &TT, StringRef CPU);
+ const Triple &TT, StringRef CPU,
+ const MCTargetOptions &Options);
/// Construct an X86 Windows COFF machine code streamer which will generate
/// PE/COFF format object files.
diff --git a/contrib/llvm/lib/Target/X86/TargetInfo/X86TargetInfo.cpp b/contrib/llvm/lib/Target/X86/TargetInfo/X86TargetInfo.cpp
index fceb083..d2654fc 100644
--- a/contrib/llvm/lib/Target/X86/TargetInfo/X86TargetInfo.cpp
+++ b/contrib/llvm/lib/Target/X86/TargetInfo/X86TargetInfo.cpp
@@ -11,12 +11,19 @@
#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
-Target llvm::TheX86_32Target, llvm::TheX86_64Target;
+Target &llvm::getTheX86_32Target() {
+ static Target TheX86_32Target;
+ return TheX86_32Target;
+}
+Target &llvm::getTheX86_64Target() {
+ static Target TheX86_64Target;
+ return TheX86_64Target;
+}
extern "C" void LLVMInitializeX86TargetInfo() {
- RegisterTarget<Triple::x86, /*HasJIT=*/true>
- X(TheX86_32Target, "x86", "32-bit X86: Pentium-Pro and above");
+ RegisterTarget<Triple::x86, /*HasJIT=*/true> X(
+ getTheX86_32Target(), "x86", "32-bit X86: Pentium-Pro and above");
- RegisterTarget<Triple::x86_64, /*HasJIT=*/true>
- Y(TheX86_64Target, "x86-64", "64-bit X86: EM64T and AMD64");
+ RegisterTarget<Triple::x86_64, /*HasJIT=*/true> Y(
+ getTheX86_64Target(), "x86-64", "64-bit X86: EM64T and AMD64");
}
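The target objects change from namespace-scope globals to construct-on-first-use accessors, which avoids depending on static initialization order across translation units. A generic sketch of the pattern with a stand-in type (not the LLVM Target class):

// Stand-in for a registry object such as llvm::Target.
struct TargetStub {
  const char *Name = nullptr;
};

// The function-local static is constructed the first time the accessor is
// called, so other static initializers may call it safely.
static TargetStub &getTheExampleTarget() {
  static TargetStub T;
  return T;
}

int main() {
  getTheExampleTarget().Name = "example";
  return getTheExampleTarget().Name ? 0 : 1;
}
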
diff --git a/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.cpp b/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.cpp
index 18f7167..1be5aec 100644
--- a/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.cpp
+++ b/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.cpp
@@ -136,7 +136,7 @@ void DecodePSRLDQMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
void DecodePALIGNRMask(MVT VT, unsigned Imm,
SmallVectorImpl<int> &ShuffleMask) {
unsigned NumElts = VT.getVectorNumElements();
- unsigned Offset = Imm * (VT.getVectorElementType().getSizeInBits() / 8);
+ unsigned Offset = Imm * (VT.getScalarSizeInBits() / 8);
unsigned NumLanes = VT.getSizeInBits() / 128;
unsigned NumLaneElts = NumElts / NumLanes;
@@ -151,6 +151,16 @@ void DecodePALIGNRMask(MVT VT, unsigned Imm,
}
}
+void DecodeVALIGNMask(MVT VT, unsigned Imm,
+ SmallVectorImpl<int> &ShuffleMask) {
+ int NumElts = VT.getVectorNumElements();
+ // Not all bits of the immediate are used so mask it.
+ assert(isPowerOf2_32(NumElts) && "NumElts should be power of 2");
+ Imm = Imm & (NumElts - 1);
+ for (int i = 0; i != NumElts; ++i)
+ ShuffleMask.push_back(i + Imm);
+}
+
/// DecodePSHUFMask - This decodes the shuffle masks for pshufw, pshufd, and vpermilp*.
/// VT indicates the type of the vector allowing it to handle different
/// datatypes and vector widths.
@@ -538,10 +548,11 @@ void DecodeVPERMIL2PMask(MVT VT, unsigned M2Z, ArrayRef<uint64_t> RawMask,
unsigned VecSize = VT.getSizeInBits();
unsigned EltSize = VT.getScalarSizeInBits();
unsigned NumLanes = VecSize / 128;
- unsigned NumEltsPerLane = VT.getVectorNumElements() / NumLanes;
- assert((VecSize == 128 || VecSize == 256) &&
- "Unexpected vector size");
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned NumEltsPerLane = NumElts / NumLanes;
+ assert((VecSize == 128 || VecSize == 256) && "Unexpected vector size");
assert((EltSize == 32 || EltSize == 64) && "Unexpected element size");
+ assert((NumElts == RawMask.size()) && "Unexpected mask size");
for (unsigned i = 0, e = RawMask.size(); i < e; ++i) {
// VPERMIL2 Operation.
@@ -562,14 +573,15 @@ void DecodeVPERMIL2PMask(MVT VT, unsigned M2Z, ArrayRef<uint64_t> RawMask,
continue;
}
- unsigned Index = i & ~(NumEltsPerLane - 1);
+ int Index = i & ~(NumEltsPerLane - 1);
if (EltSize == 64)
Index += (Selector >> 1) & 0x1;
else
Index += Selector & 0x3;
- unsigned SrcOffset = (Selector >> 2) & 1;
- ShuffleMask.push_back((int)(SrcOffset + Index));
+ int Src = (Selector >> 2) & 0x1;
+ Index += Src * NumElts;
+ ShuffleMask.push_back(Index);
}
}
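DecodeVALIGNMask above treats the two sources as one concatenated vector and selects NumElts consecutive lanes starting at the masked immediate. A standalone sketch with a worked value:

#include <cstdio>
#include <vector>

// Standalone sketch of the VALIGN mask: indices Imm .. Imm+NumElts-1 into
// the concatenation of the two sources, with the immediate reduced modulo
// the element count (only the low log2(NumElts) bits are used).
static std::vector<int> valignMask(int NumElts, unsigned Imm) {
  std::vector<int> Mask;
  Imm &= unsigned(NumElts - 1);
  for (int I = 0; I != NumElts; ++I)
    Mask.push_back(int(Imm) + I);
  return Mask;
}

int main() {
  // VALIGNQ on v8i64 with an immediate of 3.
  for (int M : valignMask(8, 3))
    std::printf("%d ", M); // prints: 3 4 5 6 7 8 9 10
  std::printf("\n");
  return 0;
}
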
diff --git a/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.h b/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.h
index dc21c19..17619d0 100644
--- a/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.h
+++ b/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.h
@@ -55,6 +55,8 @@ void DecodePSRLDQMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
void DecodePALIGNRMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
+void DecodeVALIGNMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
+
/// Decodes the shuffle masks for pshufd/pshufw/vpermilpd/vpermilps.
/// VT indicates the type of the vector allowing it to handle different
/// datatypes and vector widths.
diff --git a/contrib/llvm/lib/Target/X86/X86.h b/contrib/llvm/lib/Target/X86/X86.h
index 23d6c71..2cb80a4 100644
--- a/contrib/llvm/lib/Target/X86/X86.h
+++ b/contrib/llvm/lib/Target/X86/X86.h
@@ -87,6 +87,13 @@ FunctionPass *createX86ExpandPseudoPass();
FunctionPass *createX86FixupBWInsts();
void initializeFixupBWInstPassPass(PassRegistry &);
+
+/// This pass replaces the EVEX encoding of AVX-512 instructions with the VEX
+/// encoding when possible in order to reduce code size.
+FunctionPass *createX86EvexToVexInsts();
+
+void initializeEvexToVexInstPassPass(PassRegistry &);
+
} // End llvm namespace
#endif
diff --git a/contrib/llvm/lib/Target/X86/X86.td b/contrib/llvm/lib/Target/X86/X86.td
index 8267a84..83a23d4 100644
--- a/contrib/llvm/lib/Target/X86/X86.td
+++ b/contrib/llvm/lib/Target/X86/X86.td
@@ -99,6 +99,8 @@ def FeatureSlowBTMem : SubtargetFeature<"slow-bt-mem", "IsBTMemSlow", "true",
"Bit testing of memory is slow">;
def FeatureSlowSHLD : SubtargetFeature<"slow-shld", "IsSHLDSlow", "true",
"SHLD instruction is slow">;
+def FeatureSlowPMULLD : SubtargetFeature<"slow-pmulld", "IsPMULLDSlow", "true",
+ "PMULLD instruction is slow">;
// FIXME: This should not apply to CPUs that do not have SSE.
def FeatureSlowUAMem16 : SubtargetFeature<"slow-unaligned-mem-16",
"IsUAMem16Slow", "true",
@@ -141,8 +143,8 @@ def FeatureVLX : SubtargetFeature<"avx512vl", "HasVLX", "true",
"Enable AVX-512 Vector Length eXtensions",
[FeatureAVX512]>;
def FeatureVBMI : SubtargetFeature<"avx512vbmi", "HasVBMI", "true",
- "Enable AVX-512 Vector Bit Manipulation Instructions",
- [FeatureAVX512]>;
+ "Enable AVX-512 Vector Byte Manipulation Instructions",
+ [FeatureBWI]>;
def FeatureIFMA : SubtargetFeature<"avx512ifma", "HasIFMA", "true",
"Enable AVX-512 Integer Fused Multiple-Add",
[FeatureAVX512]>;
@@ -207,9 +209,9 @@ def FeatureLEAForSP : SubtargetFeature<"lea-sp", "UseLeaForSP", "true",
def FeatureSlowDivide32 : SubtargetFeature<"idivl-to-divb",
"HasSlowDivide32", "true",
"Use 8-bit divide for positive values less than 256">;
-def FeatureSlowDivide64 : SubtargetFeature<"idivq-to-divw",
+def FeatureSlowDivide64 : SubtargetFeature<"idivq-to-divl",
"HasSlowDivide64", "true",
- "Use 16-bit divide for positive values less than 65536">;
+ "Use 32-bit divide for positive values less than 2^32">;
def FeaturePadShortFunctions : SubtargetFeature<"pad-short-functions",
"PadShortFunctions", "true",
"Pad short functions">;
@@ -249,6 +251,25 @@ def FeatureSoftFloat
def FeatureFastPartialYMMWrite
: SubtargetFeature<"fast-partial-ymm-write", "HasFastPartialYMMWrite",
"true", "Partial writes to YMM registers are fast">;
+// FeatureFastScalarFSQRT should be enabled if scalar FSQRT has shorter latency
+// than the corresponding NR code. FeatureFastVectorFSQRT should be enabled if
+// vector FSQRT has higher throughput than the corresponding NR code.
+// The idea is that throughput bound code is likely to be vectorized, so for
+// vectorized code we should care about the throughput of SQRT operations.
+// But if the code is scalar that probably means that the code has some kind of
+// dependency and we should care more about reducing the latency.
+def FeatureFastScalarFSQRT
+ : SubtargetFeature<"fast-scalar-fsqrt", "HasFastScalarFSQRT",
+ "true", "Scalar SQRT is fast (disable Newton-Raphson)">;
+def FeatureFastVectorFSQRT
+ : SubtargetFeature<"fast-vector-fsqrt", "HasFastVectorFSQRT",
+ "true", "Vector SQRT is fast (disable Newton-Raphson)">;
+// If lzcnt has equivalent latency/throughput to most simple integer ops, it can
+// be used to replace test/set sequences.
+def FeatureFastLZCNT
+ : SubtargetFeature<
+ "fast-lzcnt", "HasFastLZCNT", "true",
+ "LZCNT instructions are as fast as most simple integer ops">;
//===----------------------------------------------------------------------===//
// X86 processors supported.
@@ -384,6 +405,7 @@ class SilvermontProc<string Name> : ProcessorModel<Name, SLMModel, [
FeatureSlowLEA,
FeatureSlowIncDec,
FeatureSlowBTMem,
+ FeatureSlowPMULLD,
FeatureLAHFSAHF
]>;
def : SilvermontProc<"silvermont">;
@@ -439,10 +461,12 @@ def SNBFeatures : ProcessorFeatures<[], [
FeatureCMPXCHG16B,
FeaturePOPCNT,
FeatureAES,
+ FeatureSlowDivide64,
FeaturePCLMUL,
FeatureXSAVE,
FeatureXSAVEOPT,
- FeatureLAHFSAHF
+ FeatureLAHFSAHF,
+ FeatureFastScalarFSQRT
]>;
class SandyBridgeProc<string Name> : ProcModel<Name, SandyBridgeModel,
@@ -500,7 +524,8 @@ def SKLFeatures : ProcessorFeatures<BDWFeatures.Value, [
FeatureXSAVEC,
FeatureXSAVES,
FeatureSGX,
- FeatureCLFLUSHOPT
+ FeatureCLFLUSHOPT,
+ FeatureFastVectorFSQRT
]>;
// FIXME: define SKL model
@@ -631,6 +656,7 @@ def : ProcessorModel<"btver2", BtVer2Model, [
FeatureF16C,
FeatureMOVBE,
FeatureLZCNT,
+ FeatureFastLZCNT,
FeaturePOPCNT,
FeatureXSAVE,
FeatureXSAVEOPT,
@@ -729,11 +755,48 @@ def : Proc<"bdver4", [
FeatureTBM,
FeatureFMA,
FeatureXSAVEOPT,
+ FeatureSlowSHLD,
FeatureFSGSBase,
FeatureLAHFSAHF,
FeatureMWAITX
]>;
+// TODO: The scheduler model currently falls back to the BTVER2 model;
+// a dedicated znver1 model still has to be put in place.
+// Zen
+def: ProcessorModel<"znver1", BtVer2Model, [
+ FeatureADX,
+ FeatureAES,
+ FeatureAVX2,
+ FeatureBMI,
+ FeatureBMI2,
+ FeatureCLFLUSHOPT,
+ FeatureCMPXCHG16B,
+ FeatureF16C,
+ FeatureFMA,
+ FeatureFSGSBase,
+ FeatureFXSR,
+ FeatureFastLZCNT,
+ FeatureLAHFSAHF,
+ FeatureLZCNT,
+ FeatureMMX,
+ FeatureMOVBE,
+ FeatureMWAITX,
+ FeaturePCLMUL,
+ FeaturePOPCNT,
+ FeaturePRFCHW,
+ FeatureRDRAND,
+ FeatureRDSEED,
+ FeatureSHA,
+ FeatureSMAP,
+ FeatureSSE4A,
+ FeatureSlowSHLD,
+ FeatureX87,
+ FeatureXSAVE,
+ FeatureXSAVEC,
+ FeatureXSAVEOPT,
+ FeatureXSAVES]>;
+
def : Proc<"geode", [FeatureX87, FeatureSlowUAMem16, Feature3DNowA]>;
def : Proc<"winchip-c6", [FeatureX87, FeatureSlowUAMem16, FeatureMMX]>;
diff --git a/contrib/llvm/lib/Target/X86/X86AsmPrinter.cpp b/contrib/llvm/lib/Target/X86/X86AsmPrinter.cpp
index 67e51f1..e1825ca 100644
--- a/contrib/llvm/lib/Target/X86/X86AsmPrinter.cpp
+++ b/contrib/llvm/lib/Target/X86/X86AsmPrinter.cpp
@@ -57,10 +57,10 @@ bool X86AsmPrinter::runOnMachineFunction(MachineFunction &MF) {
SetupMachineFunction(MF);
if (Subtarget->isTargetCOFF()) {
- bool Intrn = MF.getFunction()->hasInternalLinkage();
+ bool Local = MF.getFunction()->hasLocalLinkage();
OutStreamer->BeginCOFFSymbolDef(CurrentFnSym);
- OutStreamer->EmitCOFFSymbolStorageClass(Intrn ? COFF::IMAGE_SYM_CLASS_STATIC
- : COFF::IMAGE_SYM_CLASS_EXTERNAL);
+ OutStreamer->EmitCOFFSymbolStorageClass(
+ Local ? COFF::IMAGE_SYM_CLASS_STATIC : COFF::IMAGE_SYM_CLASS_EXTERNAL);
OutStreamer->EmitCOFFSymbolType(COFF::IMAGE_SYM_DTYPE_FUNCTION
<< COFF::SCT_COMPLEX_TYPE_SHIFT);
OutStreamer->EndCOFFSymbolDef();
@@ -70,7 +70,7 @@ bool X86AsmPrinter::runOnMachineFunction(MachineFunction &MF) {
EmitFunctionBody();
// Emit the XRay table for this function.
- EmitXRayTable();
+ emitXRayTable();
// We didn't modify anything.
return false;
@@ -627,11 +627,11 @@ void X86AsmPrinter::EmitEndOfAsmFile(Module &M) {
raw_string_ostream FlagsOS(Flags);
for (const auto &Function : M)
- TLOFCOFF.emitLinkerFlagsForGlobal(FlagsOS, &Function, *Mang);
+ TLOFCOFF.emitLinkerFlagsForGlobal(FlagsOS, &Function);
for (const auto &Global : M.globals())
- TLOFCOFF.emitLinkerFlagsForGlobal(FlagsOS, &Global, *Mang);
+ TLOFCOFF.emitLinkerFlagsForGlobal(FlagsOS, &Global);
for (const auto &Alias : M.aliases())
- TLOFCOFF.emitLinkerFlagsForGlobal(FlagsOS, &Alias, *Mang);
+ TLOFCOFF.emitLinkerFlagsForGlobal(FlagsOS, &Alias);
FlagsOS.flush();
@@ -656,6 +656,6 @@ void X86AsmPrinter::EmitEndOfAsmFile(Module &M) {
// Force static initialization.
extern "C" void LLVMInitializeX86AsmPrinter() {
- RegisterAsmPrinter<X86AsmPrinter> X(TheX86_32Target);
- RegisterAsmPrinter<X86AsmPrinter> Y(TheX86_64Target);
+ RegisterAsmPrinter<X86AsmPrinter> X(getTheX86_32Target());
+ RegisterAsmPrinter<X86AsmPrinter> Y(getTheX86_64Target());
}
diff --git a/contrib/llvm/lib/Target/X86/X86AsmPrinter.h b/contrib/llvm/lib/Target/X86/X86AsmPrinter.h
index dcb7b5a..6798253 100644
--- a/contrib/llvm/lib/Target/X86/X86AsmPrinter.h
+++ b/contrib/llvm/lib/Target/X86/X86AsmPrinter.h
@@ -71,27 +71,6 @@ class LLVM_LIBRARY_VISIBILITY X86AsmPrinter : public AsmPrinter {
StackMapShadowTracker SMShadowTracker;
- // This describes the kind of sled we're storing in the XRay table.
- enum class SledKind : uint8_t {
- FUNCTION_ENTER = 0,
- FUNCTION_EXIT = 1,
- TAIL_CALL = 2,
- };
-
- // The table will contain these structs that point to the sled, the function
- // containing the sled, and what kind of sled (and whether they should always
- // be instrumented).
- struct XRayFunctionEntry {
- const MCSymbol *Sled;
- const MCSymbol *Function;
- SledKind Kind;
- bool AlwaysInstrument;
- const class Function *Fn;
- };
-
- // All the sleds to be emitted.
- std::vector<XRayFunctionEntry> Sleds;
-
// All instructions emitted by the X86AsmPrinter should use this helper
// method.
//
@@ -117,15 +96,13 @@ class LLVM_LIBRARY_VISIBILITY X86AsmPrinter : public AsmPrinter {
// function.
void EmitXRayTable();
- // Helper function to record a given XRay sled.
- void recordSled(MCSymbol *Sled, const MachineInstr &MI, SledKind Kind);
public:
explicit X86AsmPrinter(TargetMachine &TM,
std::unique_ptr<MCStreamer> Streamer)
: AsmPrinter(TM, std::move(Streamer)), SM(*this), FM(*this) {}
- const char *getPassName() const override {
- return "X86 Assembly / Object Emitter";
+ StringRef getPassName() const override {
+ return "X86 Assembly Printer";
}
const X86Subtarget &getSubtarget() const { return *Subtarget; }
diff --git a/contrib/llvm/lib/Target/X86/X86CallFrameOptimization.cpp b/contrib/llvm/lib/Target/X86/X86CallFrameOptimization.cpp
index 8f6fc40..844c66d 100644
--- a/contrib/llvm/lib/Target/X86/X86CallFrameOptimization.cpp
+++ b/contrib/llvm/lib/Target/X86/X86CallFrameOptimization.cpp
@@ -100,7 +100,7 @@ private:
const X86RegisterInfo &RegInfo,
DenseSet<unsigned int> &UsedRegs);
- const char *getPassName() const override { return "X86 Optimize Call Frame"; }
+ StringRef getPassName() const override { return "X86 Optimize Call Frame"; }
const TargetInstrInfo *TII;
const X86FrameLowering *TFL;
@@ -134,7 +134,7 @@ bool X86CallFrameOptimization::isLegal(MachineFunction &MF) {
// in the compact unwind encoding that Darwin uses. So, bail if there
// is a danger of that being generated.
if (STI->isTargetDarwin() &&
- (!MF.getMMI().getLandingPads().empty() ||
+ (!MF.getLandingPads().empty() ||
(MF.getFunction()->needsUnwindTableEntry() && !TFL->hasFP(MF))))
return false;
@@ -180,7 +180,7 @@ bool X86CallFrameOptimization::isProfitable(MachineFunction &MF,
// This transformation is always a win when we do not expect to have
// a reserved call frame. Under other circumstances, it may be either
// a win or a loss, and requires a heuristic.
- bool CannotReserveFrame = MF.getFrameInfo()->hasVarSizedObjects();
+ bool CannotReserveFrame = MF.getFrameInfo().hasVarSizedObjects();
if (CannotReserveFrame)
return true;
@@ -230,7 +230,7 @@ bool X86CallFrameOptimization::runOnMachineFunction(MachineFunction &MF) {
assert(isPowerOf2_32(SlotSize) && "Expect power of 2 stack slot size");
Log2SlotSize = Log2_32(SlotSize);
- if (!isLegal(MF))
+ if (skipFunction(*MF.getFunction()) || !isLegal(MF))
return false;
unsigned FrameSetupOpcode = TII->getCallFrameSetupOpcode();
@@ -345,10 +345,10 @@ void X86CallFrameOptimization::collectCallInfo(MachineFunction &MF,
return;
}
- // For globals in PIC mode, we can have some LEAs here.
- // Ignore them, they don't bother us.
+ // Skip over DEBUG_VALUE.
+ // For globals in PIC mode, we can have some LEAs here. Skip them as well.
// TODO: Extend this to something that covers more cases.
- while (I->getOpcode() == X86::LEA32r)
+ while (I->getOpcode() == X86::LEA32r || I->isDebugValue())
++I;
unsigned StackPtr = RegInfo.getStackRegister();
diff --git a/contrib/llvm/lib/Target/X86/X86CallLowering.cpp b/contrib/llvm/lib/Target/X86/X86CallLowering.cpp
new file mode 100644
index 0000000..5ae4962
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86CallLowering.cpp
@@ -0,0 +1,46 @@
+//===-- llvm/lib/Target/X86/X86CallLowering.cpp - Call lowering -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file implements the lowering of LLVM calls to machine code calls for
+/// GlobalISel.
+///
+//===----------------------------------------------------------------------===//
+
+#include "X86CallLowering.h"
+#include "X86ISelLowering.h"
+#include "X86InstrInfo.h"
+#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
+
+using namespace llvm;
+
+#ifndef LLVM_BUILD_GLOBAL_ISEL
+#error "This shouldn't be built without GISel"
+#endif
+
+X86CallLowering::X86CallLowering(const X86TargetLowering &TLI)
+ : CallLowering(&TLI) {}
+
+bool X86CallLowering::lowerReturn(MachineIRBuilder &MIRBuilder,
+ const Value *Val, unsigned VReg) const {
+ // TODO: handle functions returning non-void values.
+ if (Val)
+ return false;
+
+ MIRBuilder.buildInstr(X86::RET).addImm(0);
+
+ return true;
+}
+
+bool X86CallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
+ const Function &F,
+ ArrayRef<unsigned> VRegs) const {
+ // TODO: handle functions with one or more arguments.
+ return F.arg_empty();
+}
diff --git a/contrib/llvm/lib/Target/X86/X86CallLowering.h b/contrib/llvm/lib/Target/X86/X86CallLowering.h
new file mode 100644
index 0000000..f2672f0
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86CallLowering.h
@@ -0,0 +1,39 @@
+//===-- llvm/lib/Target/X86/X86CallLowering.h - Call lowering -----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file describes how to lower LLVM calls to machine code calls.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_X86CALLLOWERING
+#define LLVM_LIB_TARGET_X86_X86CALLLOWERING
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/CodeGen/GlobalISel/CallLowering.h"
+
+namespace llvm {
+
+class Function;
+class MachineIRBuilder;
+class X86TargetLowering;
+class Value;
+
+class X86CallLowering : public CallLowering {
+public:
+ X86CallLowering(const X86TargetLowering &TLI);
+
+ bool lowerReturn(MachineIRBuilder &MIRBuiler, const Value *Val,
+ unsigned VReg) const override;
+
+ bool lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F,
+ ArrayRef<unsigned> VRegs) const override;
+};
+} // End of namespace llvm;
+#endif
diff --git a/contrib/llvm/lib/Target/X86/X86CallingConv.cpp b/contrib/llvm/lib/Target/X86/X86CallingConv.cpp
new file mode 100644
index 0000000..c96e76b
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86CallingConv.cpp
@@ -0,0 +1,208 @@
+//=== X86CallingConv.cpp - X86 Custom Calling Convention Impl -*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the implementation of custom routines for the X86
+// Calling Convention that aren't done by tablegen.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/X86MCTargetDesc.h"
+#include "X86Subtarget.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/IR/CallingConv.h"
+
+namespace llvm {
+
+bool CC_X86_32_RegCall_Assign2Regs(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+ CCValAssign::LocInfo &LocInfo,
+ ISD::ArgFlagsTy &ArgFlags, CCState &State) {
+ // List of GPR registers that are available to store values in regcall
+ // calling convention.
+ static const MCPhysReg RegList[] = {X86::EAX, X86::ECX, X86::EDX, X86::EDI,
+ X86::ESI};
+
+ // The vector will hold all the registers available for allocation.
+ SmallVector<unsigned, 5> AvailableRegs;
+
+ // Search for the available registers.
+ for (auto Reg : RegList) {
+ if (!State.isAllocated(Reg))
+ AvailableRegs.push_back(Reg);
+ }
+
+ const size_t RequiredGprsUponSplit = 2;
+ if (AvailableRegs.size() < RequiredGprsUponSplit)
+ return false; // Not enough free registers - continue the search.
+
+ // Allocating the available registers.
+ for (unsigned I = 0; I < RequiredGprsUponSplit; I++) {
+
+ // Mark the register as allocated.
+ unsigned Reg = State.AllocateReg(AvailableRegs[I]);
+
+ // Since we previously made sure that 2 registers are available
+ // we expect that a real register number will be returned.
+ assert(Reg && "Expecting a register will be available");
+
+ // Assign the value to the allocated register
+ State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+ }
+
+ // Successfully allocated registers - stop scanning further rules.
+ return true;
+}
+
+static ArrayRef<MCPhysReg> CC_X86_VectorCallGetSSEs(const MVT &ValVT) {
+ if (ValVT.is512BitVector()) {
+ static const MCPhysReg RegListZMM[] = {X86::ZMM0, X86::ZMM1, X86::ZMM2,
+ X86::ZMM3, X86::ZMM4, X86::ZMM5};
+ return makeArrayRef(std::begin(RegListZMM), std::end(RegListZMM));
+ }
+
+ if (ValVT.is256BitVector()) {
+ static const MCPhysReg RegListYMM[] = {X86::YMM0, X86::YMM1, X86::YMM2,
+ X86::YMM3, X86::YMM4, X86::YMM5};
+ return makeArrayRef(std::begin(RegListYMM), std::end(RegListYMM));
+ }
+
+ static const MCPhysReg RegListXMM[] = {X86::XMM0, X86::XMM1, X86::XMM2,
+ X86::XMM3, X86::XMM4, X86::XMM5};
+ return makeArrayRef(std::begin(RegListXMM), std::end(RegListXMM));
+}
+
+static ArrayRef<MCPhysReg> CC_X86_64_VectorCallGetGPRs() {
+ static const MCPhysReg RegListGPR[] = {X86::RCX, X86::RDX, X86::R8, X86::R9};
+ return makeArrayRef(std::begin(RegListGPR), std::end(RegListGPR));
+}
+
+static bool CC_X86_VectorCallAssignRegister(unsigned &ValNo, MVT &ValVT,
+ MVT &LocVT,
+ CCValAssign::LocInfo &LocInfo,
+ ISD::ArgFlagsTy &ArgFlags,
+ CCState &State) {
+
+ ArrayRef<MCPhysReg> RegList = CC_X86_VectorCallGetSSEs(ValVT);
+ bool Is64bit = static_cast<const X86Subtarget &>(
+ State.getMachineFunction().getSubtarget())
+ .is64Bit();
+
+ for (auto Reg : RegList) {
+ // If the register is not marked as allocated - assign to it.
+ if (!State.isAllocated(Reg)) {
+ unsigned AssigedReg = State.AllocateReg(Reg);
+ assert(AssigedReg == Reg && "Expecting a valid register allocation");
+ State.addLoc(
+ CCValAssign::getReg(ValNo, ValVT, AssigedReg, LocVT, LocInfo));
+ return true;
+ }
+ // If the register is marked as shadow allocated - assign to it.
+ if (Is64bit && State.IsShadowAllocatedReg(Reg)) {
+ State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+ return true;
+ }
+ }
+
+ llvm_unreachable("Clang should ensure that hva marked vectors will have "
+ "an available register.");
+ return false;
+}
+
+bool CC_X86_64_VectorCall(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+ CCValAssign::LocInfo &LocInfo,
+ ISD::ArgFlagsTy &ArgFlags, CCState &State) {
+ // On the second pass, go through the HVAs only.
+ if (ArgFlags.isSecArgPass()) {
+ if (ArgFlags.isHva())
+ return CC_X86_VectorCallAssignRegister(ValNo, ValVT, LocVT, LocInfo,
+ ArgFlags, State);
+ return true;
+ }
+
+ // Process only vector types as defined by vectorcall spec:
+ // "A vector type is either a floating-point type, for example,
+ // a float or double, or an SIMD vector type, for example, __m128 or __m256".
+ if (!(ValVT.isFloatingPoint() ||
+ (ValVT.isVector() && ValVT.getSizeInBits() >= 128))) {
+ // If R9 was already assigned it means that we are after the fourth element
+ // and because this is not an HVA / Vector type, we need to allocate
+ // shadow XMM register.
+ if (State.isAllocated(X86::R9)) {
+ // Assign shadow XMM register.
+ (void)State.AllocateReg(CC_X86_VectorCallGetSSEs(ValVT));
+ }
+
+ return false;
+ }
+
+ if (!ArgFlags.isHva() || ArgFlags.isHvaStart()) {
+ // Assign shadow GPR register.
+ (void)State.AllocateReg(CC_X86_64_VectorCallGetGPRs());
+
+ // Assign XMM register - (shadow for HVA and non-shadow for non HVA).
+ if (unsigned Reg = State.AllocateReg(CC_X86_VectorCallGetSSEs(ValVT))) {
+ // In the Vectorcall calling convention, additional shadow stack space can
+ // be created on top of the basic 32 bytes of win64.
+ // This happens when the fifth or sixth argument is a vector type or HVA.
+ // In that case, 8 bytes of shadow stack are allocated for each such argument.
+ if (Reg == X86::XMM4 || Reg == X86::XMM5)
+ State.AllocateStack(8, 8);
+
+ if (!ArgFlags.isHva()) {
+ State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+ return true; // Allocated a register - Stop the search.
+ }
+ }
+ }
+
+ // If this is an HVA - Stop the search,
+ // otherwise continue the search.
+ return ArgFlags.isHva();
+}
+
+bool CC_X86_32_VectorCall(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+ CCValAssign::LocInfo &LocInfo,
+ ISD::ArgFlagsTy &ArgFlags, CCState &State) {
+ // On the second pass, go through the HVAs only.
+ if (ArgFlags.isSecArgPass()) {
+ if (ArgFlags.isHva())
+ return CC_X86_VectorCallAssignRegister(ValNo, ValVT, LocVT, LocInfo,
+ ArgFlags, State);
+ return true;
+ }
+
+ // Process only vector types as defined by vectorcall spec:
+ // "A vector type is either a floating point type, for example,
+ // a float or double, or an SIMD vector type, for example, __m128 or __m256".
+ if (!(ValVT.isFloatingPoint() ||
+ (ValVT.isVector() && ValVT.getSizeInBits() >= 128))) {
+ return false;
+ }
+
+ if (ArgFlags.isHva())
+ return true; // If this is an HVA - Stop the search.
+
+ // Assign XMM register.
+ if (unsigned Reg = State.AllocateReg(CC_X86_VectorCallGetSSEs(ValVT))) {
+ State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+ return true;
+ }
+
+ // In case we did not find an available XMM register for a vector -
+ // pass it indirectly.
+ // It is similar to CCPassIndirect, with the addition of inreg.
+ if (!ValVT.isFloatingPoint()) {
+ LocVT = MVT::i32;
+ LocInfo = CCValAssign::Indirect;
+ ArgFlags.setInReg();
+ }
+
+ return false; // No register was assigned - Continue the search.
+}
+
+} // End llvm namespace
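
As an illustration of the two-pass HVA handling implemented by CC_X86_64_VectorCall above (not part of the patch; it assumes MSVC/clang's __vectorcall keyword on a Win64 target):

#include <immintrin.h>

// Homogeneous vector aggregate: up to four members of one vector type.
struct HVA4 { __m128 v0, v1, v2, v3; };

// The plain __m128 argument gets an XMM register on the first pass; the HVA
// members receive their XMM registers only on the second pass, after all
// other arguments have been placed (hypothetical example signature).
__m128 __vectorcall sum_first(HVA4 hva, __m128 x, int i) {
  return _mm_add_ps(hva.v0, x);
}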
diff --git a/contrib/llvm/lib/Target/X86/X86CallingConv.h b/contrib/llvm/lib/Target/X86/X86CallingConv.h
index a08160f..c49a683 100644
--- a/contrib/llvm/lib/Target/X86/X86CallingConv.h
+++ b/contrib/llvm/lib/Target/X86/X86CallingConv.h
@@ -21,18 +21,32 @@
namespace llvm {
-inline bool CC_X86_32_VectorCallIndirect(unsigned &ValNo, MVT &ValVT,
- MVT &LocVT,
- CCValAssign::LocInfo &LocInfo,
- ISD::ArgFlagsTy &ArgFlags,
- CCState &State) {
- // Similar to CCPassIndirect, with the addition of inreg.
- LocVT = MVT::i32;
- LocInfo = CCValAssign::Indirect;
- ArgFlags.setInReg();
- return false; // Continue the search, but now for i32.
-}
-
+/// When the regcall calling convention is compiled for a 32 bit arch, special
+/// treatment is required for 64 bit masks.
+/// The value should be assigned to two GPRs.
+/// \return true if registers were allocated and false otherwise.
+bool CC_X86_32_RegCall_Assign2Regs(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+ CCValAssign::LocInfo &LocInfo,
+ ISD::ArgFlagsTy &ArgFlags, CCState &State);
+
+/// The vectorcall calling convention has special handling for vector types
+/// and HVAs on a 64 bit arch.
+/// For HVAs shadow registers might be allocated on the first pass
+/// and actual XMM registers are allocated on the second pass.
+/// For vector types, actual XMM registers are allocated on the first pass.
+/// \return true if registers were allocated and false otherwise.
+bool CC_X86_64_VectorCall(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+ CCValAssign::LocInfo &LocInfo,
+ ISD::ArgFlagsTy &ArgFlags, CCState &State);
+
+/// The vectorcall calling convention has special handling for vector types
+/// and HVAs on a 32 bit arch.
+/// For HVAs actual XMM registers are allocated on the second pass.
+/// For vector types, actual XMM registers are allocated on the first pass.
+/// \return true if registers were allocated and false otherwise.
+bool CC_X86_32_VectorCall(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+ CCValAssign::LocInfo &LocInfo,
+ ISD::ArgFlagsTy &ArgFlags, CCState &State);
inline bool CC_X86_AnyReg_Error(unsigned &, MVT &, MVT &,
CCValAssign::LocInfo &, ISD::ArgFlagsTy &,
diff --git a/contrib/llvm/lib/Target/X86/X86CallingConv.td b/contrib/llvm/lib/Target/X86/X86CallingConv.td
index 4cb62b5..cf7bc98 100644
--- a/contrib/llvm/lib/Target/X86/X86CallingConv.td
+++ b/contrib/llvm/lib/Target/X86/X86CallingConv.td
@@ -18,6 +18,179 @@ class CCIfSubtarget<string F, CCAction A>
"(State.getMachineFunction().getSubtarget()).", F),
A>;
+// Register classes for RegCall
+class RC_X86_RegCall {
+ list<Register> GPR_8 = [];
+ list<Register> GPR_16 = [];
+ list<Register> GPR_32 = [];
+ list<Register> GPR_64 = [];
+ list<Register> FP_CALL = [FP0];
+ list<Register> FP_RET = [FP0, FP1];
+ list<Register> XMM = [];
+ list<Register> YMM = [];
+ list<Register> ZMM = [];
+}
+
+// RegCall register classes for 32 bits
+def RC_X86_32_RegCall : RC_X86_RegCall {
+ let GPR_8 = [AL, CL, DL, DIL, SIL];
+ let GPR_16 = [AX, CX, DX, DI, SI];
+ let GPR_32 = [EAX, ECX, EDX, EDI, ESI];
+ let GPR_64 = [RAX]; ///< Not actually used, but AssignToReg can't handle []
+ ///< \todo Fix AssignToReg to enable empty lists
+ let XMM = [XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7];
+ let YMM = [YMM0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7];
+ let ZMM = [ZMM0, ZMM1, ZMM2, ZMM3, ZMM4, ZMM5, ZMM6, ZMM7];
+}
+
+class RC_X86_64_RegCall : RC_X86_RegCall {
+ let XMM = [XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
+ XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15];
+ let YMM = [YMM0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7,
+ YMM8, YMM9, YMM10, YMM11, YMM12, YMM13, YMM14, YMM15];
+ let ZMM = [ZMM0, ZMM1, ZMM2, ZMM3, ZMM4, ZMM5, ZMM6, ZMM7,
+ ZMM8, ZMM9, ZMM10, ZMM11, ZMM12, ZMM13, ZMM14, ZMM15];
+}
+
+def RC_X86_64_RegCall_Win : RC_X86_64_RegCall {
+ let GPR_8 = [AL, CL, DL, DIL, SIL, R8B, R9B, R10B, R11B, R12B, R14B, R15B];
+ let GPR_16 = [AX, CX, DX, DI, SI, R8W, R9W, R10W, R11W, R12W, R14W, R15W];
+ let GPR_32 = [EAX, ECX, EDX, EDI, ESI, R8D, R9D, R10D, R11D, R12D, R14D, R15D];
+ let GPR_64 = [RAX, RCX, RDX, RDI, RSI, R8, R9, R10, R11, R12, R14, R15];
+}
+
+def RC_X86_64_RegCall_SysV : RC_X86_64_RegCall {
+ let GPR_8 = [AL, CL, DL, DIL, SIL, R8B, R9B, R12B, R13B, R14B, R15B];
+ let GPR_16 = [AX, CX, DX, DI, SI, R8W, R9W, R12W, R13W, R14W, R15W];
+ let GPR_32 = [EAX, ECX, EDX, EDI, ESI, R8D, R9D, R12D, R13D, R14D, R15D];
+ let GPR_64 = [RAX, RCX, RDX, RDI, RSI, R8, R9, R12, R13, R14, R15];
+}
+
+// X86-64 Intel regcall calling convention.
+multiclass X86_RegCall_base<RC_X86_RegCall RC> {
+def CC_#NAME : CallingConv<[
+ // Handles byval parameters.
+ CCIfSubtarget<"is64Bit()", CCIfByVal<CCPassByVal<8, 8>>>,
+ CCIfByVal<CCPassByVal<4, 4>>,
+
+ // Promote i1/i8/i16 arguments to i32.
+ CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
+
+ // Promote v8i1/v16i1/v32i1 arguments to i32.
+ CCIfType<[v8i1, v16i1, v32i1], CCPromoteToType<i32>>,
+
+ // bool, char, int, enum, long, pointer --> GPR
+ CCIfType<[i32], CCAssignToReg<RC.GPR_32>>,
+
+ // long long, __int64 --> GPR
+ CCIfType<[i64], CCAssignToReg<RC.GPR_64>>,
+
+ // __mmask64 (v64i1) --> GPR64 (for x64) or 2 x GPR32 (for IA32)
+ CCIfType<[v64i1], CCPromoteToType<i64>>,
+ CCIfSubtarget<"is64Bit()", CCIfType<[i64],
+ CCAssignToReg<RC.GPR_64>>>,
+ CCIfSubtarget<"is32Bit()", CCIfType<[i64],
+ CCCustom<"CC_X86_32_RegCall_Assign2Regs">>>,
+
+ // float, double, float128 --> XMM
+ // In the case of SSE disabled --> save to stack
+ CCIfType<[f32, f64, f128],
+ CCIfSubtarget<"hasSSE1()", CCAssignToReg<RC.XMM>>>,
+
+ // long double --> FP
+ CCIfType<[f80], CCAssignToReg<RC.FP_CALL>>,
+
+ // __m128, __m128i, __m128d --> XMM
+ // In the case of SSE disabled --> save to stack
+ CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+ CCIfSubtarget<"hasSSE1()", CCAssignToReg<RC.XMM>>>,
+
+ // __m256, __m256i, __m256d --> YMM
+ // In the case of SSE disabled --> save to stack
+ CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
+ CCIfSubtarget<"hasAVX()", CCAssignToReg<RC.YMM>>>,
+
+ // __m512, __m512i, __m512d --> ZMM
+ // In the case of SSE disabled --> save to stack
+ CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64],
+ CCIfSubtarget<"hasAVX512()",CCAssignToReg<RC.ZMM>>>,
+
+ // If no register was found -> assign to stack
+
+ // In 64 bit, assign 64/32 bit values to 8 byte stack
+ CCIfSubtarget<"is64Bit()", CCIfType<[i32, i64, f32, f64],
+ CCAssignToStack<8, 8>>>,
+
+ // In 32 bit, assign 64/32 bit values to 8/4 byte stack
+ CCIfType<[i32, f32], CCAssignToStack<4, 4>>,
+ CCIfType<[i64, f64], CCAssignToStack<8, 4>>,
+
+ // The MMX type gets an 8 byte stack slot, while alignment depends on the target
+ CCIfSubtarget<"is64Bit()", CCIfType<[x86mmx], CCAssignToStack<8, 8>>>,
+ CCIfType<[x86mmx], CCAssignToStack<8, 4>>,
+
+ // f80 and f128 get stack slots whose size and alignment depend
+ // on the subtarget.
+ CCIfType<[f80, f128], CCAssignToStack<0, 0>>,
+
+ // Vectors get 16-byte stack slots that are 16-byte aligned.
+ CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+ CCAssignToStack<16, 16>>,
+
+ // 256-bit vectors get 32-byte stack slots that are 32-byte aligned.
+ CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
+ CCAssignToStack<32, 32>>,
+
+ // 512-bit vectors get 64-byte stack slots that are 64-byte aligned.
+ CCIfType<[v16i32, v8i64, v16f32, v8f64], CCAssignToStack<64, 64>>
+]>;
+
+def RetCC_#NAME : CallingConv<[
+ // Promote i1, v8i1 arguments to i8.
+ CCIfType<[i1, v8i1], CCPromoteToType<i8>>,
+
+ // Promote v16i1 arguments to i16.
+ CCIfType<[v16i1], CCPromoteToType<i16>>,
+
+ // Promote v32i1 arguments to i32.
+ CCIfType<[v32i1], CCPromoteToType<i32>>,
+
+ // bool, char, int, enum, long, pointer --> GPR
+ CCIfType<[i8], CCAssignToReg<RC.GPR_8>>,
+ CCIfType<[i16], CCAssignToReg<RC.GPR_16>>,
+ CCIfType<[i32], CCAssignToReg<RC.GPR_32>>,
+
+ // long long, __int64 --> GPR
+ CCIfType<[i64], CCAssignToReg<RC.GPR_64>>,
+
+ // __mmask64 (v64i1) --> GPR64 (for x64) or 2 x GPR32 (for IA32)
+ CCIfType<[v64i1], CCPromoteToType<i64>>,
+ CCIfSubtarget<"is64Bit()", CCIfType<[i64],
+ CCAssignToReg<RC.GPR_64>>>,
+ CCIfSubtarget<"is32Bit()", CCIfType<[i64],
+ CCCustom<"CC_X86_32_RegCall_Assign2Regs">>>,
+
+ // long double --> FP
+ CCIfType<[f80], CCAssignToReg<RC.FP_RET>>,
+
+ // float, double, float128 --> XMM
+ CCIfType<[f32, f64, f128],
+ CCIfSubtarget<"hasSSE1()", CCAssignToReg<RC.XMM>>>,
+
+ // __m128, __m128i, __m128d --> XMM
+ CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+ CCIfSubtarget<"hasSSE1()", CCAssignToReg<RC.XMM>>>,
+
+ // __m256, __m256i, __m256d --> YMM
+ CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
+ CCIfSubtarget<"hasAVX()", CCAssignToReg<RC.YMM>>>,
+
+ // __m512, __m512i, __m512d --> ZMM
+ CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64],
+ CCIfSubtarget<"hasAVX512()", CCAssignToReg<RC.ZMM>>>
+]>;
+}
+
//===----------------------------------------------------------------------===//
// Return Value Calling Conventions
//===----------------------------------------------------------------------===//
@@ -135,20 +308,12 @@ def RetCC_X86_32_HiPE : CallingConv<[
CCIfType<[i32], CCAssignToReg<[ESI, EBP, EAX, EDX]>>
]>;
-// X86-32 HiPE return-value convention.
+// X86-32 Vectorcall return-value convention.
def RetCC_X86_32_VectorCall : CallingConv<[
- // Vector types are returned in XMM0,XMM1,XMMM2 and XMM3.
- CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+ // Floating point types are returned in XMM0, XMM1, XMM2 and XMM3.
+ CCIfType<[f32, f64, f128],
CCAssignToReg<[XMM0,XMM1,XMM2,XMM3]>>,
- // 256-bit FP vectors
- CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
- CCAssignToReg<[YMM0,YMM1,YMM2,YMM3]>>,
-
- // 512-bit FP vectors
- CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64],
- CCAssignToReg<[ZMM0,ZMM1,ZMM2,ZMM3]>>,
-
// Return integers in the standard way.
CCDelegateTo<RetCC_X86Common>
]>;
@@ -177,6 +342,16 @@ def RetCC_X86_Win64_C : CallingConv<[
CCDelegateTo<RetCC_X86_64_C>
]>;
+// X86-64 vectorcall return-value convention.
+def RetCC_X86_64_Vectorcall : CallingConv<[
+ // Vectorcall calling convention always returns FP values in XMMs.
+ CCIfType<[f32, f64, f128],
+ CCAssignToReg<[XMM0, XMM1, XMM2, XMM3]>>,
+
+ // Otherwise, everything is the same as Windows X86-64 C CC.
+ CCDelegateTo<RetCC_X86_Win64_C>
+]>;
+
// X86-64 HiPE return-value convention.
def RetCC_X86_64_HiPE : CallingConv<[
// Promote all types to i64
@@ -196,6 +371,9 @@ def RetCC_X86_64_WebKit_JS : CallingConv<[
]>;
def RetCC_X86_64_Swift : CallingConv<[
+
+ CCIfSwiftError<CCIfType<[i64], CCAssignToReg<[R12]>>>,
+
// For integers, ECX, R8D can be used as extra return registers.
CCIfType<[i1], CCPromoteToType<i8>>,
CCIfType<[i8] , CCAssignToReg<[AL, DL, CL, R8B]>>,
@@ -234,6 +412,14 @@ def RetCC_X86_64_HHVM: CallingConv<[
RAX, R10, R11, R13, R14, R15]>>
]>;
+
+defm X86_32_RegCall :
+ X86_RegCall_base<RC_X86_32_RegCall>;
+defm X86_Win64_RegCall :
+ X86_RegCall_base<RC_X86_64_RegCall_Win>;
+defm X86_SysV64_RegCall :
+ X86_RegCall_base<RC_X86_64_RegCall_SysV>;
+
// This is the root return-value convention for the X86-32 backend.
def RetCC_X86_32 : CallingConv<[
// If FastCC, use RetCC_X86_32_Fast.
@@ -241,6 +427,7 @@ def RetCC_X86_32 : CallingConv<[
// If HiPE, use RetCC_X86_32_HiPE.
CCIfCC<"CallingConv::HiPE", CCDelegateTo<RetCC_X86_32_HiPE>>,
CCIfCC<"CallingConv::X86_VectorCall", CCDelegateTo<RetCC_X86_32_VectorCall>>,
+ CCIfCC<"CallingConv::X86_RegCall", CCDelegateTo<RetCC_X86_32_RegCall>>,
// Otherwise, use RetCC_X86_32_C.
CCDelegateTo<RetCC_X86_32_C>
@@ -262,9 +449,17 @@ def RetCC_X86_64 : CallingConv<[
CCIfCC<"CallingConv::X86_64_Win64", CCDelegateTo<RetCC_X86_Win64_C>>,
CCIfCC<"CallingConv::X86_64_SysV", CCDelegateTo<RetCC_X86_64_C>>,
+ // Handle Vectorcall CC
+ CCIfCC<"CallingConv::X86_VectorCall", CCDelegateTo<RetCC_X86_64_Vectorcall>>,
+
// Handle HHVM calls.
CCIfCC<"CallingConv::HHVM", CCDelegateTo<RetCC_X86_64_HHVM>>,
+ CCIfCC<"CallingConv::X86_RegCall",
+ CCIfSubtarget<"isTargetWin64()",
+ CCDelegateTo<RetCC_X86_Win64_RegCall>>>,
+ CCIfCC<"CallingConv::X86_RegCall", CCDelegateTo<RetCC_X86_SysV64_RegCall>>,
+
// Mingw64 and native Win64 use Win64 CC
CCIfSubtarget<"isTargetWin64()", CCDelegateTo<RetCC_X86_Win64_C>>,
@@ -436,18 +631,7 @@ def CC_X86_Win64_C : CallingConv<[
]>;
def CC_X86_Win64_VectorCall : CallingConv<[
- // The first 6 floating point and vector types of 128 bits or less use
- // XMM0-XMM5.
- CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
- CCAssignToReg<[XMM0, XMM1, XMM2, XMM3, XMM4, XMM5]>>,
-
- // 256-bit vectors use YMM registers.
- CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
- CCAssignToReg<[YMM0, YMM1, YMM2, YMM3, YMM4, YMM5]>>,
-
- // 512-bit vectors use ZMM registers.
- CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64],
- CCAssignToReg<[ZMM0, ZMM1, ZMM2, ZMM3, ZMM4, ZMM5]>>,
+ CCCustom<"CC_X86_64_VectorCall">,
// Delegate to fastcall to handle integer types.
CCDelegateTo<CC_X86_Win64_C>
@@ -657,25 +841,9 @@ def CC_X86_32_FastCall : CallingConv<[
CCDelegateTo<CC_X86_32_Common>
]>;
-def CC_X86_32_VectorCall : CallingConv<[
- // The first 6 floating point and vector types of 128 bits or less use
- // XMM0-XMM5.
- CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
- CCAssignToReg<[XMM0, XMM1, XMM2, XMM3, XMM4, XMM5]>>,
-
- // 256-bit vectors use YMM registers.
- CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
- CCAssignToReg<[YMM0, YMM1, YMM2, YMM3, YMM4, YMM5]>>,
-
- // 512-bit vectors use ZMM registers.
- CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64],
- CCAssignToReg<[ZMM0, ZMM1, ZMM2, ZMM3, ZMM4, ZMM5]>>,
-
- // Otherwise, pass it indirectly.
- CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64,
- v32i8, v16i16, v8i32, v4i64, v8f32, v4f64,
- v64i8, v32i16, v16i32, v8i64, v16f32, v8f64],
- CCCustom<"CC_X86_32_VectorCallIndirect">>,
+def CC_X86_Win32_VectorCall : CallingConv<[
+ // Pass floating point in XMMs
+ CCCustom<"CC_X86_32_VectorCall">,
// Delegate to fastcall to handle integer types.
CCDelegateTo<CC_X86_32_FastCall>
@@ -809,11 +977,12 @@ def CC_X86_32 : CallingConv<[
CCIfCC<"CallingConv::X86_INTR", CCDelegateTo<CC_X86_32_Intr>>,
CCIfSubtarget<"isTargetMCU()", CCDelegateTo<CC_X86_32_MCU>>,
CCIfCC<"CallingConv::X86_FastCall", CCDelegateTo<CC_X86_32_FastCall>>,
- CCIfCC<"CallingConv::X86_VectorCall", CCDelegateTo<CC_X86_32_VectorCall>>,
+ CCIfCC<"CallingConv::X86_VectorCall", CCDelegateTo<CC_X86_Win32_VectorCall>>,
CCIfCC<"CallingConv::X86_ThisCall", CCDelegateTo<CC_X86_32_ThisCall>>,
CCIfCC<"CallingConv::Fast", CCDelegateTo<CC_X86_32_FastCC>>,
CCIfCC<"CallingConv::GHC", CCDelegateTo<CC_X86_32_GHC>>,
CCIfCC<"CallingConv::HiPE", CCDelegateTo<CC_X86_32_HiPE>>,
+ CCIfCC<"CallingConv::X86_RegCall", CCDelegateTo<CC_X86_32_RegCall>>,
// Otherwise, drop to normal X86-32 CC
CCDelegateTo<CC_X86_32_C>
@@ -830,6 +999,9 @@ def CC_X86_64 : CallingConv<[
CCIfCC<"CallingConv::X86_VectorCall", CCDelegateTo<CC_X86_Win64_VectorCall>>,
CCIfCC<"CallingConv::HHVM", CCDelegateTo<CC_X86_64_HHVM>>,
CCIfCC<"CallingConv::HHVM_C", CCDelegateTo<CC_X86_64_HHVM_C>>,
+ CCIfCC<"CallingConv::X86_RegCall",
+ CCIfSubtarget<"isTargetWin64()", CCDelegateTo<CC_X86_Win64_RegCall>>>,
+ CCIfCC<"CallingConv::X86_RegCall", CCDelegateTo<CC_X86_SysV64_RegCall>>,
CCIfCC<"CallingConv::X86_INTR", CCDelegateTo<CC_X86_64_Intr>>,
// Mingw64 and native Win64 use Win64 CC
@@ -860,7 +1032,9 @@ def CSR_64_SwiftError : CalleeSavedRegs<(sub CSR_64, R12)>;
def CSR_32EHRet : CalleeSavedRegs<(add EAX, EDX, CSR_32)>;
def CSR_64EHRet : CalleeSavedRegs<(add RAX, RDX, CSR_64)>;
-def CSR_Win64 : CalleeSavedRegs<(add RBX, RBP, RDI, RSI, R12, R13, R14, R15,
+def CSR_Win64_NoSSE : CalleeSavedRegs<(add RBX, RBP, RDI, RSI, R12, R13, R14, R15)>;
+
+def CSR_Win64 : CalleeSavedRegs<(add CSR_Win64_NoSSE,
(sequence "XMM%u", 6, 15))>;
// The function used by Darwin to obtain the address of a thread-local variable
@@ -931,3 +1105,17 @@ def CSR_64_Intel_OCL_BI_AVX512 : CalleeSavedRegs<(add RBX, RDI, RSI, R14, R15,
// Only R12 is preserved for PHP calls in HHVM.
def CSR_64_HHVM : CalleeSavedRegs<(add R12)>;
+
+// The register calling convention preserves a few GPRs and XMM8-15
+def CSR_32_RegCall_NoSSE : CalleeSavedRegs<(add ESI, EDI, EBX, EBP, ESP)>;
+def CSR_32_RegCall : CalleeSavedRegs<(add CSR_32_RegCall_NoSSE,
+ (sequence "XMM%u", 4, 7))>;
+def CSR_Win64_RegCall_NoSSE : CalleeSavedRegs<(add RBX, RBP, RSP,
+ (sequence "R%u", 10, 15))>;
+def CSR_Win64_RegCall : CalleeSavedRegs<(add CSR_Win64_RegCall_NoSSE,
+ (sequence "XMM%u", 8, 15))>;
+def CSR_SysV64_RegCall_NoSSE : CalleeSavedRegs<(add RBX, RBP, RSP,
+ (sequence "R%u", 12, 15))>;
+def CSR_SysV64_RegCall : CalleeSavedRegs<(add CSR_SysV64_RegCall_NoSSE,
+ (sequence "XMM%u", 8, 15))>;
+
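
As a usage illustration of the new RC_X86_32_RegCall register classes (not part of the patch; it assumes clang's regcall attribute and an IA-32 target with SSE):

// Per the 32-bit regcall lists above, the two float arguments travel in
// XMM0 and XMM1 and the float result is returned in XMM0; 32-bit integer
// arguments would instead use EAX, ECX, EDX, EDI, ESI (hypothetical example).
__attribute__((regcall)) float scale(float x, float s) {
  return x * s;
}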
diff --git a/contrib/llvm/lib/Target/X86/X86EvexToVex.cpp b/contrib/llvm/lib/Target/X86/X86EvexToVex.cpp
new file mode 100755
index 0000000..bdd1ab5
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86EvexToVex.cpp
@@ -0,0 +1,213 @@
+//===----------------------- X86EvexToVex.cpp ----------------------------===//
+// Compress EVEX instructions to VEX encoding when possible to reduce code size
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===---------------------------------------------------------------------===//
+/// \file
+/// This file defines the pass that goes over all AVX-512 instructions which
+/// are encoded using the EVEX prefix and if possible replaces them by their
+/// corresponding VEX encoding which is usually shorter by 2 bytes.
+/// EVEX instructions may be encoded via the VEX prefix when the AVX-512
+/// instruction has a corresponding AVX/AVX2 opcode, when it does not use
+/// the zmm or mask registers, and when it does not use xmm/ymm registers
+/// with indexes higher than 15.
+/// The pass thus reduces the size of the generated code for AVX-512 instructions.
+///
+//===---------------------------------------------------------------------===//
+
+#include "InstPrinter/X86InstComments.h"
+#include "X86.h"
+#include "X86InstrBuilder.h"
+#include "X86InstrInfo.h"
+#include "X86InstrTablesInfo.h"
+#include "X86MachineFunctionInfo.h"
+#include "X86Subtarget.h"
+#include "X86TargetMachine.h"
+
+using namespace llvm;
+
+#define EVEX2VEX_DESC "Compressing EVEX instrs to VEX encoding when possible"
+#define EVEX2VEX_NAME "x86-evex-to-vex-compress"
+
+#define DEBUG_TYPE EVEX2VEX_NAME
+
+namespace {
+
+class EvexToVexInstPass : public MachineFunctionPass {
+
+ /// X86EvexToVexCompressTable - Evex to Vex encoding opcode map.
+ typedef DenseMap<unsigned, uint16_t> EvexToVexTableType;
+ EvexToVexTableType EvexToVex128Table;
+ EvexToVexTableType EvexToVex256Table;
+
+ /// For EVEX instructions that can be encoded using VEX encoding, replace
+ /// them by the VEX encoding in order to reduce size.
+ bool CompressEvexToVexImpl(MachineInstr &MI) const;
+
+ /// Initialize the hash map tables that map AVX-512 EVEX opcodes to their
+ /// corresponding AVX/AVX2 opcodes.
+ void AddTableEntry(EvexToVexTableType &EvexToVexTable, uint16_t EvexOp,
+ uint16_t VexOp);
+
+public:
+ static char ID;
+
+ StringRef getPassName() const override { return EVEX2VEX_DESC; }
+
+ EvexToVexInstPass() : MachineFunctionPass(ID) {
+ initializeEvexToVexInstPassPass(*PassRegistry::getPassRegistry());
+
+ // Initialize the EVEX to VEX 128 table map.
+ for (X86EvexToVexCompressTableEntry Entry : X86EvexToVex128CompressTable) {
+ AddTableEntry(EvexToVex128Table, Entry.EvexOpcode, Entry.VexOpcode);
+ }
+
+ // Initialize the EVEX to VEX 256 table map.
+ for (X86EvexToVexCompressTableEntry Entry : X86EvexToVex256CompressTable) {
+ AddTableEntry(EvexToVex256Table, Entry.EvexOpcode, Entry.VexOpcode);
+ }
+ }
+
+ /// Loop over all of the basic blocks, replacing EVEX instructions
+ /// by equivalent VEX instructions when possible for reducing code size.
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ // This pass runs after regalloc and doesn't support VReg operands.
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::NoVRegs);
+ }
+
+private:
+ /// Machine instruction info used throughout the class.
+ const X86InstrInfo *TII;
+};
+
+char EvexToVexInstPass::ID = 0;
+}
+
+INITIALIZE_PASS(EvexToVexInstPass, EVEX2VEX_NAME, EVEX2VEX_DESC, false, false)
+
+FunctionPass *llvm::createX86EvexToVexInsts() {
+ return new EvexToVexInstPass();
+}
+
+bool EvexToVexInstPass::runOnMachineFunction(MachineFunction &MF) {
+ TII = MF.getSubtarget<X86Subtarget>().getInstrInfo();
+
+ const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>();
+ if (!ST.hasAVX512())
+ return false;
+
+ bool Changed = false;
+
+ /// Go over all basic blocks in function and replace
+ /// EVEX encoded instrs by VEX encoding when possible.
+ for (MachineBasicBlock &MBB : MF) {
+
+ // Traverse the basic block.
+ for (MachineInstr &MI : MBB)
+ Changed |= CompressEvexToVexImpl(MI);
+ }
+
+ return Changed;
+}
+
+void EvexToVexInstPass::AddTableEntry(EvexToVexTableType &EvexToVexTable,
+ uint16_t EvexOp, uint16_t VexOp) {
+ EvexToVexTable[EvexOp] = VexOp;
+}
+
+// For EVEX instructions that can be encoded using VEX encoding
+// replace them by the VEX encoding in order to reduce size.
+bool EvexToVexInstPass::CompressEvexToVexImpl(MachineInstr &MI) const {
+
+ // VEX format.
+ // # of bytes: 0,2,3 1 1 0,1 0,1,2,4 0,1
+ // [Prefixes] [VEX] OPCODE ModR/M [SIB] [DISP] [IMM]
+ //
+ // EVEX format.
+ // # of bytes: 4 1 1 1 4 / 1 1
+ // [Prefixes] EVEX Opcode ModR/M [SIB] [Disp32] / [Disp8*N] [Immediate]
+
+ const MCInstrDesc &Desc = MI.getDesc();
+
+ // Check for EVEX instructions only.
+ if ((Desc.TSFlags & X86II::EncodingMask) != X86II::EVEX)
+ return false;
+
+ // Check for EVEX instructions with mask or broadcast as in these cases
+ // the EVEX prefix is needed in order to carry this information
+ // thus preventing the transformation to VEX encoding.
+ if (Desc.TSFlags & (X86II::EVEX_K | X86II::EVEX_B))
+ return false;
+
+ // Check for non EVEX_V512 instrs only.
+ // EVEX_V512 instr: bit EVEX_L2 = 1; bit VEX_L = 0.
+ if ((Desc.TSFlags & X86II::EVEX_L2) && !(Desc.TSFlags & X86II::VEX_L))
+ return false;
+
+ // EVEX_V128 instr: bit EVEX_L2 = 0, bit VEX_L = 0.
+ bool IsEVEX_V128 =
+ (!(Desc.TSFlags & X86II::EVEX_L2) && !(Desc.TSFlags & X86II::VEX_L));
+
+ // EVEX_V256 instr: bit EVEX_L2 = 0, bit VEX_L = 1.
+ bool IsEVEX_V256 =
+ (!(Desc.TSFlags & X86II::EVEX_L2) && (Desc.TSFlags & X86II::VEX_L));
+
+ unsigned NewOpc = 0;
+
+ // Check for EVEX_V256 instructions.
+ if (IsEVEX_V256) {
+ // Search for opcode in the EvexToVex256 table.
+ auto It = EvexToVex256Table.find(MI.getOpcode());
+ if (It != EvexToVex256Table.end())
+ NewOpc = It->second;
+ }
+
+ // Check for EVEX_V128 or Scalar instructions.
+ else if (IsEVEX_V128) {
+ // Search for opcode in the EvexToVex128 table.
+ auto It = EvexToVex128Table.find(MI.getOpcode());
+ if (It != EvexToVex128Table.end())
+ NewOpc = It->second;
+ }
+
+ if (!NewOpc)
+ return false;
+
+ auto isHiRegIdx = [](unsigned Reg) {
+ // Check for XMM register with indexes between 16 - 31.
+ if (Reg >= X86::XMM16 && Reg <= X86::XMM31)
+ return true;
+
+ // Check for YMM register with indexes between 16 - 31.
+ if (Reg >= X86::YMM16 && Reg <= X86::YMM31)
+ return true;
+
+ return false;
+ };
+
+ // Check that operands are not ZMM regs or
+ // XMM/YMM regs with hi indexes between 16 - 31.
+ for (const MachineOperand &MO : MI.explicit_operands()) {
+ if (!MO.isReg())
+ continue;
+
+ unsigned Reg = MO.getReg();
+
+ assert (!(Reg >= X86::ZMM0 && Reg <= X86::ZMM31));
+
+ if (isHiRegIdx(Reg))
+ return false;
+ }
+
+ const MCInstrDesc &MCID = TII->get(NewOpc);
+ MI.setDesc(MCID);
+ MI.setAsmPrinterFlag(AC_EVEX_2_VEX);
+ return true;
+}
diff --git a/contrib/llvm/lib/Target/X86/X86ExpandPseudo.cpp b/contrib/llvm/lib/Target/X86/X86ExpandPseudo.cpp
index 093fed7..985acf9 100644
--- a/contrib/llvm/lib/Target/X86/X86ExpandPseudo.cpp
+++ b/contrib/llvm/lib/Target/X86/X86ExpandPseudo.cpp
@@ -51,10 +51,10 @@ public:
MachineFunctionProperties getRequiredProperties() const override {
return MachineFunctionProperties().set(
- MachineFunctionProperties::Property::AllVRegsAllocated);
+ MachineFunctionProperties::Property::NoVRegs);
}
- const char *getPassName() const override {
+ StringRef getPassName() const override {
return "X86 pseudo instruction expansion pass";
}
@@ -94,7 +94,7 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
assert(MaxTCDelta <= 0 && "MaxTCDelta should never be positive");
// Incorporate the retaddr area.
- Offset = StackAdj-MaxTCDelta;
+ Offset = StackAdj - MaxTCDelta;
assert(Offset >= 0 && "Offset should never be negative");
if (Offset) {
@@ -106,14 +106,22 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
// Jump to label or value in register.
bool IsWin64 = STI->isTargetWin64();
if (Opcode == X86::TCRETURNdi || Opcode == X86::TCRETURNdi64) {
- unsigned Op = (Opcode == X86::TCRETURNdi)
- ? X86::TAILJMPd
- : (IsWin64 ? X86::TAILJMPd64_REX : X86::TAILJMPd64);
+ unsigned Op;
+ switch (Opcode) {
+ case X86::TCRETURNdi:
+ Op = X86::TAILJMPd;
+ break;
+ default:
+ // Note: Win64 uses REX prefixes on indirect jumps out of functions, but
+ // not on direct ones.
+ Op = X86::TAILJMPd64;
+ break;
+ }
MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(Op));
- if (JumpTarget.isGlobal())
+ if (JumpTarget.isGlobal()) {
MIB.addGlobalAddress(JumpTarget.getGlobal(), JumpTarget.getOffset(),
JumpTarget.getTargetFlags());
- else {
+ } else {
assert(JumpTarget.isSymbol());
MIB.addExternalSymbol(JumpTarget.getSymbolName(),
JumpTarget.getTargetFlags());
diff --git a/contrib/llvm/lib/Target/X86/X86FastISel.cpp b/contrib/llvm/lib/Target/X86/X86FastISel.cpp
index dfe3c80..c890fdd 100644
--- a/contrib/llvm/lib/Target/X86/X86FastISel.cpp
+++ b/contrib/llvm/lib/Target/X86/X86FastISel.cpp
@@ -170,6 +170,12 @@ private:
const MachineInstrBuilder &addFullAddress(const MachineInstrBuilder &MIB,
X86AddressMode &AM);
+
+ unsigned fastEmitInst_rrrr(unsigned MachineInstOpcode,
+ const TargetRegisterClass *RC, unsigned Op0,
+ bool Op0IsKill, unsigned Op1, bool Op1IsKill,
+ unsigned Op2, bool Op2IsKill, unsigned Op3,
+ bool Op3IsKill);
};
} // end anonymous namespace.
@@ -182,18 +188,18 @@ getX86ConditionCode(CmpInst::Predicate Predicate) {
default: break;
// Floating-point Predicates
case CmpInst::FCMP_UEQ: CC = X86::COND_E; break;
- case CmpInst::FCMP_OLT: NeedSwap = true; // fall-through
+ case CmpInst::FCMP_OLT: NeedSwap = true; LLVM_FALLTHROUGH;
case CmpInst::FCMP_OGT: CC = X86::COND_A; break;
- case CmpInst::FCMP_OLE: NeedSwap = true; // fall-through
+ case CmpInst::FCMP_OLE: NeedSwap = true; LLVM_FALLTHROUGH;
case CmpInst::FCMP_OGE: CC = X86::COND_AE; break;
- case CmpInst::FCMP_UGT: NeedSwap = true; // fall-through
+ case CmpInst::FCMP_UGT: NeedSwap = true; LLVM_FALLTHROUGH;
case CmpInst::FCMP_ULT: CC = X86::COND_B; break;
- case CmpInst::FCMP_UGE: NeedSwap = true; // fall-through
+ case CmpInst::FCMP_UGE: NeedSwap = true; LLVM_FALLTHROUGH;
case CmpInst::FCMP_ULE: CC = X86::COND_BE; break;
case CmpInst::FCMP_ONE: CC = X86::COND_NE; break;
case CmpInst::FCMP_UNO: CC = X86::COND_P; break;
case CmpInst::FCMP_ORD: CC = X86::COND_NP; break;
- case CmpInst::FCMP_OEQ: // fall-through
+ case CmpInst::FCMP_OEQ: LLVM_FALLTHROUGH;
case CmpInst::FCMP_UNE: CC = X86::COND_INVALID; break;
// Integer Predicates
@@ -229,15 +235,15 @@ getX86SSEConditionCode(CmpInst::Predicate Predicate) {
switch (Predicate) {
default: llvm_unreachable("Unexpected predicate");
case CmpInst::FCMP_OEQ: CC = 0; break;
- case CmpInst::FCMP_OGT: NeedSwap = true; // fall-through
+ case CmpInst::FCMP_OGT: NeedSwap = true; LLVM_FALLTHROUGH;
case CmpInst::FCMP_OLT: CC = 1; break;
- case CmpInst::FCMP_OGE: NeedSwap = true; // fall-through
+ case CmpInst::FCMP_OGE: NeedSwap = true; LLVM_FALLTHROUGH;
case CmpInst::FCMP_OLE: CC = 2; break;
case CmpInst::FCMP_UNO: CC = 3; break;
case CmpInst::FCMP_UNE: CC = 4; break;
- case CmpInst::FCMP_ULE: NeedSwap = true; // fall-through
+ case CmpInst::FCMP_ULE: NeedSwap = true; LLVM_FALLTHROUGH;
case CmpInst::FCMP_UGE: CC = 5; break;
- case CmpInst::FCMP_ULT: NeedSwap = true; // fall-through
+ case CmpInst::FCMP_ULT: NeedSwap = true; LLVM_FALLTHROUGH;
case CmpInst::FCMP_UGT: CC = 6; break;
case CmpInst::FCMP_ORD: CC = 7; break;
case CmpInst::FCMP_UEQ:
@@ -351,6 +357,8 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM,
bool HasSSE41 = Subtarget->hasSSE41();
bool HasAVX = Subtarget->hasAVX();
bool HasAVX2 = Subtarget->hasAVX2();
+ bool HasAVX512 = Subtarget->hasAVX512();
+ bool HasVLX = Subtarget->hasVLX();
bool IsNonTemporal = MMO && MMO->isNonTemporal();
// Get opcode and regclass of the output for the given load instruction.
@@ -378,7 +386,7 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM,
break;
case MVT::f32:
if (X86ScalarSSEf32) {
- Opc = HasAVX ? X86::VMOVSSrm : X86::MOVSSrm;
+ Opc = HasAVX512 ? X86::VMOVSSZrm : HasAVX ? X86::VMOVSSrm : X86::MOVSSrm;
RC = &X86::FR32RegClass;
} else {
Opc = X86::LD_Fp32m;
@@ -387,7 +395,7 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM,
break;
case MVT::f64:
if (X86ScalarSSEf64) {
- Opc = HasAVX ? X86::VMOVSDrm : X86::MOVSDrm;
+ Opc = HasAVX512 ? X86::VMOVSDZrm : HasAVX ? X86::VMOVSDrm : X86::MOVSDrm;
RC = &X86::FR64RegClass;
} else {
Opc = X86::LD_Fp64m;
@@ -399,20 +407,26 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM,
return false;
case MVT::v4f32:
if (IsNonTemporal && Alignment >= 16 && HasSSE41)
- Opc = HasAVX ? X86::VMOVNTDQArm : X86::MOVNTDQArm;
+ Opc = HasVLX ? X86::VMOVNTDQAZ128rm :
+ HasAVX ? X86::VMOVNTDQArm : X86::MOVNTDQArm;
else if (Alignment >= 16)
- Opc = HasAVX ? X86::VMOVAPSrm : X86::MOVAPSrm;
+ Opc = HasVLX ? X86::VMOVAPSZ128rm :
+ HasAVX ? X86::VMOVAPSrm : X86::MOVAPSrm;
else
- Opc = HasAVX ? X86::VMOVUPSrm : X86::MOVUPSrm;
+ Opc = HasVLX ? X86::VMOVUPSZ128rm :
+ HasAVX ? X86::VMOVUPSrm : X86::MOVUPSrm;
RC = &X86::VR128RegClass;
break;
case MVT::v2f64:
if (IsNonTemporal && Alignment >= 16 && HasSSE41)
- Opc = HasAVX ? X86::VMOVNTDQArm : X86::MOVNTDQArm;
+ Opc = HasVLX ? X86::VMOVNTDQAZ128rm :
+ HasAVX ? X86::VMOVNTDQArm : X86::MOVNTDQArm;
else if (Alignment >= 16)
- Opc = HasAVX ? X86::VMOVAPDrm : X86::MOVAPDrm;
+ Opc = HasVLX ? X86::VMOVAPDZ128rm :
+ HasAVX ? X86::VMOVAPDrm : X86::MOVAPDrm;
else
- Opc = HasAVX ? X86::VMOVUPDrm : X86::MOVUPDrm;
+ Opc = HasVLX ? X86::VMOVUPDZ128rm :
+ HasAVX ? X86::VMOVUPDrm : X86::MOVUPDrm;
RC = &X86::VR128RegClass;
break;
case MVT::v4i32:
@@ -420,27 +434,34 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM,
case MVT::v8i16:
case MVT::v16i8:
if (IsNonTemporal && Alignment >= 16)
- Opc = HasAVX ? X86::VMOVNTDQArm : X86::MOVNTDQArm;
+ Opc = HasVLX ? X86::VMOVNTDQAZ128rm :
+ HasAVX ? X86::VMOVNTDQArm : X86::MOVNTDQArm;
else if (Alignment >= 16)
- Opc = HasAVX ? X86::VMOVDQArm : X86::MOVDQArm;
+ Opc = HasVLX ? X86::VMOVDQA64Z128rm :
+ HasAVX ? X86::VMOVDQArm : X86::MOVDQArm;
else
- Opc = HasAVX ? X86::VMOVDQUrm : X86::MOVDQUrm;
+ Opc = HasVLX ? X86::VMOVDQU64Z128rm :
+ HasAVX ? X86::VMOVDQUrm : X86::MOVDQUrm;
RC = &X86::VR128RegClass;
break;
case MVT::v8f32:
assert(HasAVX);
if (IsNonTemporal && Alignment >= 32 && HasAVX2)
- Opc = X86::VMOVNTDQAYrm;
+ Opc = HasVLX ? X86::VMOVNTDQAZ256rm : X86::VMOVNTDQAYrm;
+ else if (Alignment >= 32)
+ Opc = HasVLX ? X86::VMOVAPSZ256rm : X86::VMOVAPSYrm;
else
- Opc = (Alignment >= 32) ? X86::VMOVAPSYrm : X86::VMOVUPSYrm;
+ Opc = HasVLX ? X86::VMOVUPSZ256rm : X86::VMOVUPSYrm;
RC = &X86::VR256RegClass;
break;
case MVT::v4f64:
assert(HasAVX);
if (IsNonTemporal && Alignment >= 32 && HasAVX2)
Opc = X86::VMOVNTDQAYrm;
+ else if (Alignment >= 32)
+ Opc = HasVLX ? X86::VMOVAPDZ256rm : X86::VMOVAPDYrm;
else
- Opc = (Alignment >= 32) ? X86::VMOVAPDYrm : X86::VMOVUPDYrm;
+ Opc = HasVLX ? X86::VMOVUPDZ256rm : X86::VMOVUPDYrm;
RC = &X86::VR256RegClass;
break;
case MVT::v8i32:
@@ -450,12 +471,14 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM,
assert(HasAVX);
if (IsNonTemporal && Alignment >= 32 && HasAVX2)
Opc = X86::VMOVNTDQAYrm;
+ else if (Alignment >= 32)
+ Opc = HasVLX ? X86::VMOVDQA64Z256rm : X86::VMOVDQAYrm;
else
- Opc = (Alignment >= 32) ? X86::VMOVDQAYrm : X86::VMOVDQUYrm;
+ Opc = HasVLX ? X86::VMOVDQU64Z256rm : X86::VMOVDQUYrm;
RC = &X86::VR256RegClass;
break;
case MVT::v16f32:
- assert(Subtarget->hasAVX512());
+ assert(HasAVX512);
if (IsNonTemporal && Alignment >= 64)
Opc = X86::VMOVNTDQAZrm;
else
@@ -463,7 +486,7 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM,
RC = &X86::VR512RegClass;
break;
case MVT::v8f64:
- assert(Subtarget->hasAVX512());
+ assert(HasAVX512);
if (IsNonTemporal && Alignment >= 64)
Opc = X86::VMOVNTDQAZrm;
else
@@ -474,7 +497,7 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM,
case MVT::v16i32:
case MVT::v32i16:
case MVT::v64i8:
- assert(Subtarget->hasAVX512());
+ assert(HasAVX512);
// Note: There are a lot more choices based on type with AVX-512, but
// there's really no advantage when the load isn't masked.
if (IsNonTemporal && Alignment >= 64)
@@ -504,6 +527,8 @@ bool X86FastISel::X86FastEmitStore(EVT VT, unsigned ValReg, bool ValIsKill,
bool HasSSE2 = Subtarget->hasSSE2();
bool HasSSE4A = Subtarget->hasSSE4A();
bool HasAVX = Subtarget->hasAVX();
+ bool HasAVX512 = Subtarget->hasAVX512();
+ bool HasVLX = Subtarget->hasVLX();
bool IsNonTemporal = MMO && MMO->isNonTemporal();
// Get opcode and regclass of the output for the given store instruction.
@@ -518,8 +543,8 @@ bool X86FastISel::X86FastEmitStore(EVT VT, unsigned ValReg, bool ValIsKill,
TII.get(X86::AND8ri), AndResult)
.addReg(ValReg, getKillRegState(ValIsKill)).addImm(1);
ValReg = AndResult;
+ LLVM_FALLTHROUGH; // handle i1 as i8.
}
- // FALLTHROUGH, handling i1 as i8.
case MVT::i8: Opc = X86::MOV8mr; break;
case MVT::i16: Opc = X86::MOV16mr; break;
case MVT::i32:
@@ -534,7 +559,8 @@ bool X86FastISel::X86FastEmitStore(EVT VT, unsigned ValReg, bool ValIsKill,
if (IsNonTemporal && HasSSE4A)
Opc = X86::MOVNTSS;
else
- Opc = HasAVX ? X86::VMOVSSmr : X86::MOVSSmr;
+ Opc = HasAVX512 ? X86::VMOVSSZmr :
+ HasAVX ? X86::VMOVSSmr : X86::MOVSSmr;
} else
Opc = X86::ST_Fp32m;
break;
@@ -543,27 +569,34 @@ bool X86FastISel::X86FastEmitStore(EVT VT, unsigned ValReg, bool ValIsKill,
if (IsNonTemporal && HasSSE4A)
Opc = X86::MOVNTSD;
else
- Opc = HasAVX ? X86::VMOVSDmr : X86::MOVSDmr;
+ Opc = HasAVX512 ? X86::VMOVSDZmr :
+ HasAVX ? X86::VMOVSDmr : X86::MOVSDmr;
} else
Opc = X86::ST_Fp64m;
break;
case MVT::v4f32:
if (Aligned) {
if (IsNonTemporal)
- Opc = HasAVX ? X86::VMOVNTPSmr : X86::MOVNTPSmr;
+ Opc = HasVLX ? X86::VMOVNTPSZ128mr :
+ HasAVX ? X86::VMOVNTPSmr : X86::MOVNTPSmr;
else
- Opc = HasAVX ? X86::VMOVAPSmr : X86::MOVAPSmr;
+ Opc = HasVLX ? X86::VMOVAPSZ128mr :
+ HasAVX ? X86::VMOVAPSmr : X86::MOVAPSmr;
} else
- Opc = HasAVX ? X86::VMOVUPSmr : X86::MOVUPSmr;
+ Opc = HasVLX ? X86::VMOVUPSZ128mr :
+ HasAVX ? X86::VMOVUPSmr : X86::MOVUPSmr;
break;
case MVT::v2f64:
if (Aligned) {
if (IsNonTemporal)
- Opc = HasAVX ? X86::VMOVNTPDmr : X86::MOVNTPDmr;
+ Opc = HasVLX ? X86::VMOVNTPDZ128mr :
+ HasAVX ? X86::VMOVNTPDmr : X86::MOVNTPDmr;
else
- Opc = HasAVX ? X86::VMOVAPDmr : X86::MOVAPDmr;
+ Opc = HasVLX ? X86::VMOVAPDZ128mr :
+ HasAVX ? X86::VMOVAPDmr : X86::MOVAPDmr;
} else
- Opc = HasAVX ? X86::VMOVUPDmr : X86::MOVUPDmr;
+ Opc = HasVLX ? X86::VMOVUPDZ128mr :
+ HasAVX ? X86::VMOVUPDmr : X86::MOVUPDmr;
break;
case MVT::v4i32:
case MVT::v2i64:
@@ -571,45 +604,57 @@ bool X86FastISel::X86FastEmitStore(EVT VT, unsigned ValReg, bool ValIsKill,
case MVT::v16i8:
if (Aligned) {
if (IsNonTemporal)
- Opc = HasAVX ? X86::VMOVNTDQmr : X86::MOVNTDQmr;
+ Opc = HasVLX ? X86::VMOVNTDQZ128mr :
+ HasAVX ? X86::VMOVNTDQmr : X86::MOVNTDQmr;
else
- Opc = HasAVX ? X86::VMOVDQAmr : X86::MOVDQAmr;
+ Opc = HasVLX ? X86::VMOVDQA64Z128mr :
+ HasAVX ? X86::VMOVDQAmr : X86::MOVDQAmr;
} else
- Opc = HasAVX ? X86::VMOVDQUmr : X86::MOVDQUmr;
+ Opc = HasVLX ? X86::VMOVDQU64Z128mr :
+ HasAVX ? X86::VMOVDQUmr : X86::MOVDQUmr;
break;
case MVT::v8f32:
assert(HasAVX);
- if (Aligned)
- Opc = IsNonTemporal ? X86::VMOVNTPSYmr : X86::VMOVAPSYmr;
- else
- Opc = X86::VMOVUPSYmr;
+ if (Aligned) {
+ if (IsNonTemporal)
+ Opc = HasVLX ? X86::VMOVNTPSZ256mr : X86::VMOVNTPSYmr;
+ else
+ Opc = HasVLX ? X86::VMOVAPSZ256mr : X86::VMOVAPSYmr;
+ } else
+ Opc = HasVLX ? X86::VMOVUPSZ256mr : X86::VMOVUPSYmr;
break;
case MVT::v4f64:
assert(HasAVX);
if (Aligned) {
- Opc = IsNonTemporal ? X86::VMOVNTPDYmr : X86::VMOVAPDYmr;
+ if (IsNonTemporal)
+ Opc = HasVLX ? X86::VMOVNTPDZ256mr : X86::VMOVNTPDYmr;
+ else
+ Opc = HasVLX ? X86::VMOVAPDZ256mr : X86::VMOVAPDYmr;
} else
- Opc = X86::VMOVUPDYmr;
+ Opc = HasVLX ? X86::VMOVUPDZ256mr : X86::VMOVUPDYmr;
break;
case MVT::v8i32:
case MVT::v4i64:
case MVT::v16i16:
case MVT::v32i8:
assert(HasAVX);
- if (Aligned)
- Opc = IsNonTemporal ? X86::VMOVNTDQYmr : X86::VMOVDQAYmr;
- else
- Opc = X86::VMOVDQUYmr;
+ if (Aligned) {
+ if (IsNonTemporal)
+ Opc = HasVLX ? X86::VMOVNTDQZ256mr : X86::VMOVNTDQYmr;
+ else
+ Opc = HasVLX ? X86::VMOVDQA64Z256mr : X86::VMOVDQAYmr;
+ } else
+ Opc = HasVLX ? X86::VMOVDQU64Z256mr : X86::VMOVDQUYmr;
break;
case MVT::v16f32:
- assert(Subtarget->hasAVX512());
+ assert(HasAVX512);
if (Aligned)
Opc = IsNonTemporal ? X86::VMOVNTPSZmr : X86::VMOVAPSZmr;
else
Opc = X86::VMOVUPSZmr;
break;
case MVT::v8f64:
- assert(Subtarget->hasAVX512());
+ assert(HasAVX512);
if (Aligned) {
Opc = IsNonTemporal ? X86::VMOVNTPDZmr : X86::VMOVAPDZmr;
} else
@@ -619,7 +664,7 @@ bool X86FastISel::X86FastEmitStore(EVT VT, unsigned ValReg, bool ValIsKill,
case MVT::v16i32:
case MVT::v32i16:
case MVT::v64i8:
- assert(Subtarget->hasAVX512());
+ assert(HasAVX512);
// Note: There are a lot more choices based on type with AVX-512, but
// there's really no advantage when the store isn't masked.
if (Aligned)
@@ -659,7 +704,9 @@ bool X86FastISel::X86FastEmitStore(EVT VT, const Value *Val,
bool Signed = true;
switch (VT.getSimpleVT().SimpleTy) {
default: break;
- case MVT::i1: Signed = false; // FALLTHROUGH to handle as i8.
+ case MVT::i1:
+ Signed = false;
+ LLVM_FALLTHROUGH; // Handle as i8.
case MVT::i8: Opc = X86::MOV8mi; break;
case MVT::i16: Opc = X86::MOV16mi; break;
case MVT::i32: Opc = X86::MOV32mi; break;
@@ -895,7 +942,7 @@ redo_gep:
for (User::const_op_iterator i = U->op_begin() + 1, e = U->op_end();
i != e; ++i, ++GTI) {
const Value *Op = *i;
- if (StructType *STy = dyn_cast<StructType>(*GTI)) {
+ if (StructType *STy = GTI.getStructTypeOrNull()) {
const StructLayout *SL = DL.getStructLayout(STy);
Disp += SL->getElementOffset(cast<ConstantInt>(Op)->getZExtValue());
continue;
@@ -1454,11 +1501,11 @@ bool X86FastISel::X86SelectCmp(const Instruction *I) {
}
// FCMP_OEQ and FCMP_UNE cannot be checked with a single instruction.
- static unsigned SETFOpcTable[2][3] = {
+ static const uint16_t SETFOpcTable[2][3] = {
{ X86::SETEr, X86::SETNPr, X86::AND8rr },
{ X86::SETNEr, X86::SETPr, X86::OR8rr }
};
- unsigned *SETFOpc = nullptr;
+ const uint16_t *SETFOpc = nullptr;
switch (Predicate) {
default: break;
case CmpInst::FCMP_OEQ: SETFOpc = &SETFOpcTable[0][0]; break;
@@ -1511,7 +1558,7 @@ bool X86FastISel::X86SelectZExt(const Instruction *I) {
// Handle zero-extension from i1 to i8, which is common.
MVT SrcVT = TLI.getSimpleValueType(DL, I->getOperand(0)->getType());
- if (SrcVT.SimpleTy == MVT::i1) {
+ if (SrcVT == MVT::i1) {
// Set the high bits to zero.
ResultReg = fastEmitZExtFromI1(MVT::i8, ResultReg, /*TODO: Kill=*/false);
SrcVT = MVT::i8;
@@ -1601,7 +1648,8 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) {
switch (Predicate) {
default: break;
case CmpInst::FCMP_OEQ:
- std::swap(TrueMBB, FalseMBB); // fall-through
+ std::swap(TrueMBB, FalseMBB);
+ LLVM_FALLTHROUGH;
case CmpInst::FCMP_UNE:
NeedExtraBranch = true;
Predicate = CmpInst::FCMP_ONE;
@@ -1651,6 +1699,7 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) {
if (TestOpc) {
unsigned OpReg = getRegForValue(TI->getOperand(0));
if (OpReg == 0) return false;
+
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TestOpc))
.addReg(OpReg).addImm(1);
@@ -1688,8 +1737,17 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) {
unsigned OpReg = getRegForValue(BI->getCondition());
if (OpReg == 0) return false;
+ // In case OpReg is a K register, COPY to a GPR
+ if (MRI.getRegClass(OpReg) == &X86::VK1RegClass) {
+ unsigned KOpReg = OpReg;
+ OpReg = createResultReg(&X86::GR8RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::COPY), OpReg)
+ .addReg(KOpReg);
+ }
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TEST8ri))
- .addReg(OpReg).addImm(1);
+ .addReg(OpReg)
+ .addImm(1);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::JNE_1))
.addMBB(TrueMBB);
finishCondBranch(BI->getParent(), TrueMBB, FalseMBB);
@@ -1875,15 +1933,15 @@ bool X86FastISel::X86SelectDivRem(const Instruction *I) {
// Copy the zero into the appropriate sub/super/identical physical
// register. Unfortunately the operations needed are not uniform enough
// to fit neatly into the table above.
- if (VT.SimpleTy == MVT::i16) {
+ if (VT == MVT::i16) {
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(Copy), TypeEntry.HighInReg)
.addReg(Zero32, 0, X86::sub_16bit);
- } else if (VT.SimpleTy == MVT::i32) {
+ } else if (VT == MVT::i32) {
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(Copy), TypeEntry.HighInReg)
.addReg(Zero32);
- } else if (VT.SimpleTy == MVT::i64) {
+ } else if (VT == MVT::i64) {
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::SUBREG_TO_REG), TypeEntry.HighInReg)
.addImm(0).addReg(Zero32).addImm(X86::sub_32bit);
@@ -1953,11 +2011,11 @@ bool X86FastISel::X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I) {
CmpInst::Predicate Predicate = optimizeCmpPredicate(CI);
// FCMP_OEQ and FCMP_UNE cannot be checked with a single instruction.
- static unsigned SETFOpcTable[2][3] = {
+ static const uint16_t SETFOpcTable[2][3] = {
{ X86::SETNPr, X86::SETEr , X86::TEST8rr },
{ X86::SETPr, X86::SETNEr, X86::OR8rr }
};
- unsigned *SETFOpc = nullptr;
+ const uint16_t *SETFOpc = nullptr;
switch (Predicate) {
default: break;
case CmpInst::FCMP_OEQ:
@@ -2023,8 +2081,17 @@ bool X86FastISel::X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I) {
return false;
bool CondIsKill = hasTrivialKill(Cond);
+    // In case CondReg is a K register, COPY to a GPR
+ if (MRI.getRegClass(CondReg) == &X86::VK1RegClass) {
+ unsigned KCondReg = CondReg;
+ CondReg = createResultReg(&X86::GR8RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::COPY), CondReg)
+ .addReg(KCondReg, getKillRegState(CondIsKill));
+ }
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TEST8ri))
- .addReg(CondReg, getKillRegState(CondIsKill)).addImm(1);
+ .addReg(CondReg, getKillRegState(CondIsKill))
+ .addImm(1);
}
const Value *LHS = I->getOperand(1);
@@ -2087,12 +2154,12 @@ bool X86FastISel::X86FastEmitSSESelect(MVT RetVT, const Instruction *I) {
std::swap(CmpLHS, CmpRHS);
// Choose the SSE instruction sequence based on data type (float or double).
- static unsigned OpcTable[2][4] = {
- { X86::CMPSSrr, X86::FsANDPSrr, X86::FsANDNPSrr, X86::FsORPSrr },
- { X86::CMPSDrr, X86::FsANDPDrr, X86::FsANDNPDrr, X86::FsORPDrr }
+ static const uint16_t OpcTable[2][4] = {
+ { X86::CMPSSrr, X86::ANDPSrr, X86::ANDNPSrr, X86::ORPSrr },
+ { X86::CMPSDrr, X86::ANDPDrr, X86::ANDNPDrr, X86::ORPDrr }
};
- unsigned *Opc = nullptr;
+ const uint16_t *Opc = nullptr;
switch (RetVT.SimpleTy) {
default: return false;
case MVT::f32: Opc = &OpcTable[0][0]; break;
@@ -2119,9 +2186,36 @@ bool X86FastISel::X86FastEmitSSESelect(MVT RetVT, const Instruction *I) {
const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT);
unsigned ResultReg;
-
- if (Subtarget->hasAVX()) {
- const TargetRegisterClass *FR32 = &X86::FR32RegClass;
+
+ if (Subtarget->hasAVX512()) {
+ // If we have AVX512 we can use a mask compare and masked movss/sd.
+ const TargetRegisterClass *VR128X = &X86::VR128XRegClass;
+ const TargetRegisterClass *VK1 = &X86::VK1RegClass;
+
+ unsigned CmpOpcode =
+ (RetVT == MVT::f32) ? X86::VCMPSSZrr : X86::VCMPSDZrr;
+ unsigned CmpReg = fastEmitInst_rri(CmpOpcode, VK1, CmpLHSReg, CmpLHSIsKill,
+ CmpRHSReg, CmpRHSIsKill, CC);
+
+ // Need an IMPLICIT_DEF for the input that is used to generate the upper
+    // bits of the result register since it's not based on any of the inputs.
+ unsigned ImplicitDefReg = createResultReg(VR128X);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::IMPLICIT_DEF), ImplicitDefReg);
+
+    // Place RHSReg in the passthru of the masked movss/sd operation and put
+ // LHS in the input. The mask input comes from the compare.
+ unsigned MovOpcode =
+ (RetVT == MVT::f32) ? X86::VMOVSSZrrk : X86::VMOVSDZrrk;
+ unsigned MovReg = fastEmitInst_rrrr(MovOpcode, VR128X, RHSReg, RHSIsKill,
+ CmpReg, true, ImplicitDefReg, true,
+ LHSReg, LHSIsKill);
+
+ ResultReg = createResultReg(RC);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::COPY), ResultReg).addReg(MovReg);
+
+ } else if (Subtarget->hasAVX()) {
const TargetRegisterClass *VR128 = &X86::VR128RegClass;
// If we have AVX, create 1 blendv instead of 3 logic instructions.
@@ -2130,11 +2224,11 @@ bool X86FastISel::X86FastEmitSSESelect(MVT RetVT, const Instruction *I) {
// instructions as the AND/ANDN/OR sequence due to register moves, so
// don't bother.
unsigned CmpOpcode =
- (RetVT.SimpleTy == MVT::f32) ? X86::VCMPSSrr : X86::VCMPSDrr;
+ (RetVT == MVT::f32) ? X86::VCMPSSrr : X86::VCMPSDrr;
unsigned BlendOpcode =
- (RetVT.SimpleTy == MVT::f32) ? X86::VBLENDVPSrr : X86::VBLENDVPDrr;
-
- unsigned CmpReg = fastEmitInst_rri(CmpOpcode, FR32, CmpLHSReg, CmpLHSIsKill,
+ (RetVT == MVT::f32) ? X86::VBLENDVPSrr : X86::VBLENDVPDrr;
+
+ unsigned CmpReg = fastEmitInst_rri(CmpOpcode, RC, CmpLHSReg, CmpLHSIsKill,
CmpRHSReg, CmpRHSIsKill, CC);
unsigned VBlendReg = fastEmitInst_rrr(BlendOpcode, VR128, RHSReg, RHSIsKill,
LHSReg, LHSIsKill, CmpReg, true);
@@ -2142,14 +2236,18 @@ bool X86FastISel::X86FastEmitSSESelect(MVT RetVT, const Instruction *I) {
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::COPY), ResultReg).addReg(VBlendReg);
} else {
+ const TargetRegisterClass *VR128 = &X86::VR128RegClass;
unsigned CmpReg = fastEmitInst_rri(Opc[0], RC, CmpLHSReg, CmpLHSIsKill,
CmpRHSReg, CmpRHSIsKill, CC);
- unsigned AndReg = fastEmitInst_rr(Opc[1], RC, CmpReg, /*IsKill=*/false,
+ unsigned AndReg = fastEmitInst_rr(Opc[1], VR128, CmpReg, /*IsKill=*/false,
LHSReg, LHSIsKill);
- unsigned AndNReg = fastEmitInst_rr(Opc[2], RC, CmpReg, /*IsKill=*/true,
+ unsigned AndNReg = fastEmitInst_rr(Opc[2], VR128, CmpReg, /*IsKill=*/true,
RHSReg, RHSIsKill);
- ResultReg = fastEmitInst_rr(Opc[3], RC, AndNReg, /*IsKill=*/true,
- AndReg, /*IsKill=*/true);
+ unsigned OrReg = fastEmitInst_rr(Opc[3], VR128, AndNReg, /*IsKill=*/true,
+ AndReg, /*IsKill=*/true);
+ ResultReg = createResultReg(RC);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::COPY), ResultReg).addReg(OrReg);
}
updateValueMap(I, ResultReg);
return true;
@@ -2195,8 +2293,18 @@ bool X86FastISel::X86FastEmitPseudoSelect(MVT RetVT, const Instruction *I) {
if (CondReg == 0)
return false;
bool CondIsKill = hasTrivialKill(Cond);
+
+  // In case CondReg is a K register, COPY to a GPR
+ if (MRI.getRegClass(CondReg) == &X86::VK1RegClass) {
+ unsigned KCondReg = CondReg;
+ CondReg = createResultReg(&X86::GR8RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::COPY), CondReg)
+ .addReg(KCondReg, getKillRegState(CondIsKill));
+ }
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TEST8ri))
- .addReg(CondReg, getKillRegState(CondIsKill)).addImm(1);
+ .addReg(CondReg, getKillRegState(CondIsKill))
+ .addImm(1);
}
const Value *LHS = I->getOperand(1);
@@ -2522,8 +2630,8 @@ bool X86FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
// This needs to be set before we call getPtrSizedFrameRegister, otherwise
// we get the wrong frame register.
- MachineFrameInfo *MFI = MF->getFrameInfo();
- MFI->setFrameAddressIsTaken(true);
+ MachineFrameInfo &MFI = MF->getFrameInfo();
+ MFI.setFrameAddressIsTaken(true);
const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
unsigned FrameReg = RegInfo->getPtrSizedFrameRegister(*MF);
@@ -2698,7 +2806,9 @@ bool X86FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
const Function *Callee = II->getCalledFunction();
auto *Ty = cast<StructType>(Callee->getReturnType());
Type *RetTy = Ty->getTypeAtIndex(0U);
- Type *CondTy = Ty->getTypeAtIndex(1);
+ assert(Ty->getTypeAtIndex(1)->isIntegerTy() &&
+ Ty->getTypeAtIndex(1)->getScalarSizeInBits() == 1 &&
+ "Overflow value expected to be an i1");
MVT VT;
if (!isTypeLegal(RetTy, VT))
@@ -2808,7 +2918,8 @@ bool X86FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
if (!ResultReg)
return false;
- unsigned ResultReg2 = FuncInfo.CreateRegs(CondTy);
+ // Assign to a GPR since the overflow return value is lowered to a SETcc.
+ unsigned ResultReg2 = createResultReg(&X86::GR8RegClass);
assert((ResultReg+1) == ResultReg2 && "Nonconsecutive result registers.");
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CondOpc),
ResultReg2);
@@ -2966,7 +3077,7 @@ bool X86FastISel::fastLowerArguments() {
default: llvm_unreachable("Unexpected value type.");
case MVT::i32: SrcReg = GPR32ArgRegs[GPRIdx++]; break;
case MVT::i64: SrcReg = GPR64ArgRegs[GPRIdx++]; break;
- case MVT::f32: // fall-through
+ case MVT::f32: LLVM_FALLTHROUGH;
case MVT::f64: SrcReg = XMMArgRegs[FPRIdx++]; break;
}
unsigned DstReg = FuncInfo.MF->addLiveIn(SrcReg, RC);
@@ -3140,7 +3251,7 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) {
assert(VA.getLocVT().isInteger() && !VA.getLocVT().isVector() &&
"Unexpected extend");
- if (ArgVT.SimpleTy == MVT::i1)
+ if (ArgVT == MVT::i1)
return false;
bool Emitted = X86FastEmitExtend(ISD::SIGN_EXTEND, VA.getLocVT(), ArgReg,
@@ -3154,7 +3265,7 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) {
"Unexpected extend");
// Handle zero-extension from i1 to i8, which is common.
- if (ArgVT.SimpleTy == MVT::i1) {
+ if (ArgVT == MVT::i1) {
// Set the high bits to zero.
ArgReg = fastEmitZExtFromI1(MVT::i8, ArgReg, /*TODO: Kill=*/false);
ArgVT = MVT::i8;
@@ -3456,8 +3567,14 @@ X86FastISel::fastSelectInstruction(const Instruction *I) {
if (!SrcVT.isSimple() || !DstVT.isSimple())
return false;
- if (!SrcVT.is128BitVector() &&
- !(Subtarget->hasAVX() && SrcVT.is256BitVector()))
+ MVT SVT = SrcVT.getSimpleVT();
+ MVT DVT = DstVT.getSimpleVT();
+
+ if (!SVT.is128BitVector() &&
+ !(Subtarget->hasAVX() && SVT.is256BitVector()) &&
+ !(Subtarget->hasAVX512() && SVT.is512BitVector() &&
+ (Subtarget->hasBWI() || (SVT.getScalarSizeInBits() >= 32 &&
+ DVT.getScalarSizeInBits() >= 32))))
return false;
unsigned Reg = getRegForValue(I->getOperand(0));
@@ -3505,7 +3622,7 @@ unsigned X86FastISel::X86MaterializeInt(const ConstantInt *CI, MVT VT) {
unsigned Opc = 0;
switch (VT.SimpleTy) {
default: llvm_unreachable("Unexpected value type");
- case MVT::i1: VT = MVT::i8; // fall-through
+ case MVT::i1: VT = MVT::i8; LLVM_FALLTHROUGH;
case MVT::i8: Opc = X86::MOV8ri; break;
case MVT::i16: Opc = X86::MOV16ri; break;
case MVT::i32: Opc = X86::MOV32ri; break;
@@ -3775,6 +3892,38 @@ bool X86FastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo,
return true;
}
+unsigned X86FastISel::fastEmitInst_rrrr(unsigned MachineInstOpcode,
+ const TargetRegisterClass *RC,
+ unsigned Op0, bool Op0IsKill,
+ unsigned Op1, bool Op1IsKill,
+ unsigned Op2, bool Op2IsKill,
+ unsigned Op3, bool Op3IsKill) {
+ const MCInstrDesc &II = TII.get(MachineInstOpcode);
+
+ unsigned ResultReg = createResultReg(RC);
+ Op0 = constrainOperandRegClass(II, Op0, II.getNumDefs());
+ Op1 = constrainOperandRegClass(II, Op1, II.getNumDefs() + 1);
+  Op2 = constrainOperandRegClass(II, Op2, II.getNumDefs() + 2);
+  Op3 = constrainOperandRegClass(II, Op3, II.getNumDefs() + 3);
+
+ if (II.getNumDefs() >= 1)
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, ResultReg)
+ .addReg(Op0, getKillRegState(Op0IsKill))
+ .addReg(Op1, getKillRegState(Op1IsKill))
+ .addReg(Op2, getKillRegState(Op2IsKill))
+ .addReg(Op3, getKillRegState(Op3IsKill));
+ else {
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II)
+ .addReg(Op0, getKillRegState(Op0IsKill))
+ .addReg(Op1, getKillRegState(Op1IsKill))
+ .addReg(Op2, getKillRegState(Op2IsKill))
+ .addReg(Op3, getKillRegState(Op3IsKill));
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::COPY), ResultReg).addReg(II.ImplicitDefs[0]);
+ }
+ return ResultReg;
+}
+
namespace llvm {
FastISel *X86::createFastISel(FunctionLoweringInfo &funcInfo,
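The X86FastEmitLoad/X86FastEmitStore hunks above extend the opcode selection so that, when AVX-512VL is available, the 128- and 256-bit EVEX forms are preferred over the VEX and legacy SSE encodings. A minimal standalone sketch of that tiering pattern follows; the Opcode and Features names are invented for illustration and are not the LLVM enums:

#include <cstdio>

// Hypothetical opcode names -- stand-ins, not the real X86 instruction enums.
enum Opcode { MOVAPS, VMOVAPS, VMOVAPSZ128, MOVNTPS, VMOVNTPS, VMOVNTPSZ128 };

struct Features { bool HasAVX; bool HasVLX; };

// Pick the most specific encoding available: EVEX (VLX) > VEX (AVX) > legacy SSE.
Opcode selectAlignedStore(const Features &F, bool NonTemporal) {
  if (NonTemporal)
    return F.HasVLX ? VMOVNTPSZ128 : F.HasAVX ? VMOVNTPS : MOVNTPS;
  return F.HasVLX ? VMOVAPSZ128 : F.HasAVX ? VMOVAPS : MOVAPS;
}

int main() {
  Features F{true, false};                           // AVX but no AVX-512VL
  std::printf("%d\n", selectAlignedStore(F, false)); // prints 1 (VMOVAPS)
}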
diff --git a/contrib/llvm/lib/Target/X86/X86FixupBWInsts.cpp b/contrib/llvm/lib/Target/X86/X86FixupBWInsts.cpp
index 90e758d..8bde4bf 100644
--- a/contrib/llvm/lib/Target/X86/X86FixupBWInsts.cpp
+++ b/contrib/llvm/lib/Target/X86/X86FixupBWInsts.cpp
@@ -66,8 +66,6 @@ using namespace llvm;
#define DEBUG_TYPE FIXUPBW_NAME
// Option to allow this optimization pass to have fine-grained control.
-// This is turned off by default so as not to affect a large number of
-// existing lit tests.
static cl::opt<bool>
FixupBWInsts("fixup-byte-word-insts",
cl::desc("Change byte and word instructions to larger sizes"),
@@ -104,9 +102,7 @@ class FixupBWInstPass : public MachineFunctionPass {
public:
static char ID;
- const char *getPassName() const override {
- return FIXUPBW_DESC;
- }
+ StringRef getPassName() const override { return FIXUPBW_DESC; }
FixupBWInstPass() : MachineFunctionPass(ID) {
initializeFixupBWInstPassPass(*PassRegistry::getPassRegistry());
@@ -125,7 +121,7 @@ public:
MachineFunctionProperties getRequiredProperties() const override {
return MachineFunctionProperties().set(
- MachineFunctionProperties::Property::AllVRegsAllocated);
+ MachineFunctionProperties::Property::NoVRegs);
}
private:
@@ -158,7 +154,7 @@ bool FixupBWInstPass::runOnMachineFunction(MachineFunction &MF) {
TII = MF.getSubtarget<X86Subtarget>().getInstrInfo();
OptForSize = MF.getFunction()->optForSize();
MLI = &getAnalysis<MachineLoopInfo>();
- LiveRegs.init(&TII->getRegisterInfo());
+ LiveRegs.init(TII->getRegisterInfo());
DEBUG(dbgs() << "Start X86FixupBWInsts\n";);
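The hunks above rename the post-register-allocation requirement from AllVRegsAllocated to NoVRegs and have getPassName return a StringRef. As a rough, simplified illustration of the required-properties check (not the MachineFunctionProperties API; the names are made up):

#include <bitset>
#include <cassert>

// A stand-in for the "required properties" idea: a pass states which
// invariants must already hold before it runs (here: no virtual registers).
enum Property { NoVRegs, TracksLiveness, NumProperties };

struct FunctionProperties {
  std::bitset<NumProperties> Bits;
  FunctionProperties &set(Property P) { Bits.set(P); return *this; }
  // Every required bit must be present in the current state.
  bool satisfies(const FunctionProperties &Required) const {
    return (Required.Bits & ~Bits).none();
  }
};

int main() {
  FunctionProperties State;     // state of the function after register allocation
  State.set(NoVRegs);
  FunctionProperties Required;  // what a post-RA fixup pass demands
  Required.set(NoVRegs);
  assert(State.satisfies(Required));
}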
diff --git a/contrib/llvm/lib/Target/X86/X86FixupLEAs.cpp b/contrib/llvm/lib/Target/X86/X86FixupLEAs.cpp
index 013ee24..1209591 100644
--- a/contrib/llvm/lib/Target/X86/X86FixupLEAs.cpp
+++ b/contrib/llvm/lib/Target/X86/X86FixupLEAs.cpp
@@ -40,7 +40,7 @@ class FixupLEAPass : public MachineFunctionPass {
/// where appropriate.
bool processBasicBlock(MachineFunction &MF, MachineFunction::iterator MFI);
- const char *getPassName() const override { return "X86 LEA Fixup"; }
+ StringRef getPassName() const override { return "X86 LEA Fixup"; }
/// \brief Given a machine register, look for the instruction
/// which writes it in the current basic block. If found,
@@ -95,7 +95,7 @@ public:
// This pass runs after regalloc and doesn't support VReg operands.
MachineFunctionProperties getRequiredProperties() const override {
return MachineFunctionProperties().set(
- MachineFunctionProperties::Property::AllVRegsAllocated);
+ MachineFunctionProperties::Property::NoVRegs);
}
private:
diff --git a/contrib/llvm/lib/Target/X86/X86FixupSetCC.cpp b/contrib/llvm/lib/Target/X86/X86FixupSetCC.cpp
index fb317da..a86eb99 100644
--- a/contrib/llvm/lib/Target/X86/X86FixupSetCC.cpp
+++ b/contrib/llvm/lib/Target/X86/X86FixupSetCC.cpp
@@ -39,7 +39,7 @@ class X86FixupSetCCPass : public MachineFunctionPass {
public:
X86FixupSetCCPass() : MachineFunctionPass(ID) {}
- const char *getPassName() const override { return "X86 Fixup SetCC"; }
+ StringRef getPassName() const override { return "X86 Fixup SetCC"; }
bool runOnMachineFunction(MachineFunction &MF) override;
@@ -99,7 +99,8 @@ bool X86FixupSetCCPass::isSetCCr(unsigned Opcode) {
MachineInstr *
X86FixupSetCCPass::findFlagsImpDef(MachineBasicBlock *MBB,
MachineBasicBlock::reverse_iterator MI) {
- auto MBBStart = MBB->instr_rend();
+ // FIXME: Should this be instr_rend(), and MI be reverse_instr_iterator?
+ auto MBBStart = MBB->rend();
for (int i = 0; (i < SearchBound) && (MI != MBBStart); ++i, ++MI)
for (auto &Op : MI->implicit_operands())
if ((Op.getReg() == X86::EFLAGS) && (Op.isDef()))
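findFlagsImpDef above walks backwards from a setcc for at most SearchBound instructions, looking for the closest instruction that implicitly defines EFLAGS. A self-contained sketch of the same bounded reverse scan over a plain container (the Instr record and its DefinesFlags field are hypothetical, not the MachineInstr API):

#include <cstddef>
#include <cstdio>
#include <string>
#include <vector>

struct Instr {
  std::string Name;
  bool DefinesFlags; // stand-in for an implicit def of EFLAGS
};

// Scan backwards from Pos, taking at most Bound steps, for the closest flags def.
const Instr *findFlagsDef(const std::vector<Instr> &Block, std::size_t Pos, int Bound) {
  for (int Steps = 0; Steps < Bound; ++Steps) {
    if (Block[Pos].DefinesFlags)
      return &Block[Pos];
    if (Pos == 0)
      break;
    --Pos;
  }
  return nullptr;
}

int main() {
  std::vector<Instr> Block = {{"add", true}, {"mov", false}, {"setcc", false}};
  const Instr *Def = findFlagsDef(Block, 2, 10);
  std::printf("%s\n", Def ? Def->Name.c_str() : "none"); // prints "add"
}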
diff --git a/contrib/llvm/lib/Target/X86/X86FloatingPoint.cpp b/contrib/llvm/lib/Target/X86/X86FloatingPoint.cpp
index 55c1bff..a5489b9 100644
--- a/contrib/llvm/lib/Target/X86/X86FloatingPoint.cpp
+++ b/contrib/llvm/lib/Target/X86/X86FloatingPoint.cpp
@@ -78,10 +78,10 @@ namespace {
MachineFunctionProperties getRequiredProperties() const override {
return MachineFunctionProperties().set(
- MachineFunctionProperties::Property::AllVRegsAllocated);
+ MachineFunctionProperties::Property::NoVRegs);
}
- const char *getPassName() const override { return "X86 FP Stackifier"; }
+ StringRef getPassName() const override { return "X86 FP Stackifier"; }
private:
const TargetInstrInfo *TII; // Machine instruction info.
@@ -206,6 +206,13 @@ namespace {
RegMap[Reg] = StackTop++;
}
+ // popReg - Pop a register from the stack.
+ void popReg() {
+ if (StackTop == 0)
+ report_fatal_error("Cannot pop empty stack!");
+ RegMap[Stack[--StackTop]] = ~0; // Update state
+ }
+
bool isAtTop(unsigned RegNo) const { return getSlot(RegNo) == StackTop-1; }
void moveToTop(unsigned RegNo, MachineBasicBlock::iterator I) {
DebugLoc dl = I == MBB->end() ? DebugLoc() : I->getDebugLoc();
@@ -326,9 +333,28 @@ bool FPS::runOnMachineFunction(MachineFunction &MF) {
// Process the function in depth first order so that we process at least one
// of the predecessors for every reachable block in the function.
- SmallPtrSet<MachineBasicBlock*, 8> Processed;
+ df_iterator_default_set<MachineBasicBlock*> Processed;
MachineBasicBlock *Entry = &MF.front();
+ LiveBundle &Bundle =
+ LiveBundles[Bundles->getBundle(Entry->getNumber(), false)];
+
+  // In the regcall convention, some FP registers may not be passed through
+  // the stack, so they will need to be assigned to the stack first.
+ if ((Entry->getParent()->getFunction()->getCallingConv() ==
+ CallingConv::X86_RegCall) && (Bundle.Mask && !Bundle.FixCount)) {
+ // In the register calling convention, up to one FP argument could be
+ // saved in the first FP register.
+ // If bundle.mask is non-zero and Bundle.FixCount is zero, it means
+ // that the FP registers contain arguments.
+ // The actual value is passed in FP0.
+ // Here we fix the stack and mark FP0 as pre-assigned register.
+ assert((Bundle.Mask & 0xFE) == 0 &&
+ "Only FP0 could be passed as an argument");
+ Bundle.FixCount = 1;
+ Bundle.FixStack[0] = 0;
+ }
+
bool Changed = false;
for (MachineBasicBlock *BB : depth_first_ext(Entry, Processed))
Changed |= processBasicBlock(MF, *BB);
@@ -791,9 +817,8 @@ void FPS::popStackAfter(MachineBasicBlock::iterator &I) {
MachineInstr &MI = *I;
const DebugLoc &dl = MI.getDebugLoc();
ASSERT_SORTED(PopTable);
- if (StackTop == 0)
- report_fatal_error("Cannot pop empty stack!");
- RegMap[Stack[--StackTop]] = ~0; // Update state
+
+ popReg();
// Check to see if there is a popping version of this instruction...
int Opcode = Lookup(PopTable, I->getOpcode());
@@ -929,6 +954,7 @@ void FPS::shuffleStackTop(const unsigned char *FixStack,
void FPS::handleCall(MachineBasicBlock::iterator &I) {
unsigned STReturns = 0;
+ const MachineFunction* MF = I->getParent()->getParent();
for (const auto &MO : I->operands()) {
if (!MO.isReg())
@@ -937,7 +963,10 @@ void FPS::handleCall(MachineBasicBlock::iterator &I) {
unsigned R = MO.getReg() - X86::FP0;
if (R < 8) {
- assert(MO.isDef() && MO.isImplicit());
+ if (MF->getFunction()->getCallingConv() != CallingConv::X86_RegCall) {
+ assert(MO.isDef() && MO.isImplicit());
+ }
+
STReturns |= 1 << R;
}
}
@@ -945,9 +974,15 @@ void FPS::handleCall(MachineBasicBlock::iterator &I) {
unsigned N = countTrailingOnes(STReturns);
// FP registers used for function return must be consecutive starting at
- // FP0.
+ // FP0
assert(STReturns == 0 || (isMask_32(STReturns) && N <= 2));
+ // Reset the FP Stack - It is required because of possible leftovers from
+ // passed arguments. The caller should assume that the FP stack is
+ // returned empty (unless the callee returns values on FP stack).
+ while (StackTop > 0)
+ popReg();
+
for (unsigned I = 0; I < N; ++I)
pushReg(N - I - 1);
}
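handleCall now drains the simulated x87 stack before pushing the registers returned on it, since with the RegCall convention passed arguments can leave live slots behind. A toy model of that bookkeeping, assuming a fixed eight-slot stack (pushReg/popReg/handleCall here are simplified stand-ins, not the FPS pass itself):

#include <cassert>
#include <stdexcept>

struct FPStackModel {
  unsigned Stack[8];      // which FP register lives in each slot
  unsigned StackTop = 0;  // number of live slots

  void pushReg(unsigned Reg) { Stack[StackTop++] = Reg; }
  void popReg() {
    if (StackTop == 0)
      throw std::runtime_error("Cannot pop empty stack!");
    --StackTop;
  }
  // Calls return their FP results in consecutive registers starting at FP0,
  // so drop any leftovers first, then push the N return slots.
  void handleCall(unsigned NumFPReturns) {
    while (StackTop > 0)
      popReg();
    for (unsigned I = 0; I < NumFPReturns; ++I)
      pushReg(NumFPReturns - I - 1);
  }
};

int main() {
  FPStackModel M;
  M.pushReg(0);      // a leftover argument sitting in FP0
  M.handleCall(1);   // one value returned on the FP stack
  assert(M.StackTop == 1 && M.Stack[0] == 0);
}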
diff --git a/contrib/llvm/lib/Target/X86/X86FrameLowering.cpp b/contrib/llvm/lib/Target/X86/X86FrameLowering.cpp
index 03d9256..cd69044 100644
--- a/contrib/llvm/lib/Target/X86/X86FrameLowering.cpp
+++ b/contrib/llvm/lib/Target/X86/X86FrameLowering.cpp
@@ -50,7 +50,7 @@ X86FrameLowering::X86FrameLowering(const X86Subtarget &STI,
}
bool X86FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
- return !MF.getFrameInfo()->hasVarSizedObjects() &&
+ return !MF.getFrameInfo().hasVarSizedObjects() &&
!MF.getInfo<X86MachineFunctionInfo>()->getHasPushSequences();
}
@@ -74,7 +74,7 @@ X86FrameLowering::canSimplifyCallFramePseudos(const MachineFunction &MF) const {
// when there are no stack objects.
bool
X86FrameLowering::needsFrameIndexResolution(const MachineFunction &MF) const {
- return MF.getFrameInfo()->hasStackObjects() ||
+ return MF.getFrameInfo().hasStackObjects() ||
MF.getInfo<X86MachineFunctionInfo>()->getHasPushSequences();
}
@@ -82,17 +82,15 @@ X86FrameLowering::needsFrameIndexResolution(const MachineFunction &MF) const {
/// pointer register. This is true if the function has variable sized allocas
/// or if frame pointer elimination is disabled.
bool X86FrameLowering::hasFP(const MachineFunction &MF) const {
- const MachineFrameInfo *MFI = MF.getFrameInfo();
- const MachineModuleInfo &MMI = MF.getMMI();
-
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
return (MF.getTarget().Options.DisableFramePointerElim(MF) ||
TRI->needsStackRealignment(MF) ||
- MFI->hasVarSizedObjects() ||
- MFI->isFrameAddressTaken() || MFI->hasOpaqueSPAdjustment() ||
+ MFI.hasVarSizedObjects() ||
+ MFI.isFrameAddressTaken() || MFI.hasOpaqueSPAdjustment() ||
MF.getInfo<X86MachineFunctionInfo>()->getForceFramePointer() ||
- MMI.callsUnwindInit() || MMI.hasEHFunclets() || MMI.callsEHReturn() ||
- MFI->hasStackMap() || MFI->hasPatchPoint() ||
- MFI->hasCopyImplyingStackAdjustment());
+ MF.callsUnwindInit() || MF.hasEHFunclets() || MF.callsEHReturn() ||
+ MFI.hasStackMap() || MFI.hasPatchPoint() ||
+ MFI.hasCopyImplyingStackAdjustment());
}
static unsigned getSUBriOpcode(unsigned IsLP64, int64_t Imm) {
@@ -151,13 +149,15 @@ static unsigned findDeadCallerSavedReg(MachineBasicBlock &MBB,
bool Is64Bit) {
const MachineFunction *MF = MBB.getParent();
const Function *F = MF->getFunction();
- if (!F || MF->getMMI().callsEHReturn())
+ if (!F || MF->callsEHReturn())
return 0;
const TargetRegisterClass &AvailableRegs = *TRI->getGPRsForTailCall(*MF);
- unsigned Opc = MBBI->getOpcode();
- switch (Opc) {
+ if (MBBI == MBB.end())
+ return 0;
+
+ switch (MBBI->getOpcode()) {
default: return 0;
case TargetOpcode::PATCHABLE_RET:
case X86::RET:
@@ -373,6 +373,10 @@ int X86FrameLowering::mergeSPUpdates(MachineBasicBlock &MBB,
MachineBasicBlock::iterator PI = doMergeWithPrevious ? std::prev(MBBI) : MBBI;
MachineBasicBlock::iterator NI = doMergeWithPrevious ? nullptr
: std::next(MBBI);
+ PI = skipDebugInstructionsBackward(PI, MBB.begin());
+ if (NI != nullptr)
+ NI = skipDebugInstructionsForward(NI, MBB.end());
+
unsigned Opc = PI->getOpcode();
int Offset = 0;
@@ -416,7 +420,7 @@ void X86FrameLowering::BuildCFI(MachineBasicBlock &MBB,
const DebugLoc &DL,
const MCCFIInstruction &CFIInst) const {
MachineFunction &MF = *MBB.getParent();
- unsigned CFIIndex = MF.getMMI().addFrameInst(CFIInst);
+ unsigned CFIIndex = MF.addFrameInst(CFIInst);
BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
.addCFIIndex(CFIIndex);
}
@@ -425,18 +429,18 @@ void X86FrameLowering::emitCalleeSavedFrameMoves(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
const DebugLoc &DL) const {
MachineFunction &MF = *MBB.getParent();
- MachineFrameInfo *MFI = MF.getFrameInfo();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
MachineModuleInfo &MMI = MF.getMMI();
const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo();
// Add callee saved registers to move list.
- const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
+ const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
if (CSI.empty()) return;
// Calculate offsets.
for (std::vector<CalleeSavedInfo>::const_iterator
I = CSI.begin(), E = CSI.end(); I != E; ++I) {
- int64_t Offset = MFI->getObjectOffset(I->getFrameIdx());
+ int64_t Offset = MFI.getObjectOffset(I->getFrameIdx());
unsigned Reg = I->getReg();
unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true);
@@ -445,20 +449,19 @@ void X86FrameLowering::emitCalleeSavedFrameMoves(
}
}
-MachineInstr *X86FrameLowering::emitStackProbe(MachineFunction &MF,
- MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI,
- const DebugLoc &DL,
- bool InProlog) const {
+void X86FrameLowering::emitStackProbe(MachineFunction &MF,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ const DebugLoc &DL, bool InProlog) const {
const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
if (STI.isTargetWindowsCoreCLR()) {
if (InProlog) {
- return emitStackProbeInlineStub(MF, MBB, MBBI, DL, true);
+ emitStackProbeInlineStub(MF, MBB, MBBI, DL, true);
} else {
- return emitStackProbeInline(MF, MBB, MBBI, DL, false);
+ emitStackProbeInline(MF, MBB, MBBI, DL, false);
}
} else {
- return emitStackProbeCall(MF, MBB, MBBI, DL, InProlog);
+ emitStackProbeCall(MF, MBB, MBBI, DL, InProlog);
}
}
@@ -479,17 +482,19 @@ void X86FrameLowering::inlineStackProbe(MachineFunction &MF,
assert(!ChkStkStub->isBundled() &&
"Not expecting bundled instructions here");
MachineBasicBlock::iterator MBBI = std::next(ChkStkStub->getIterator());
- assert(std::prev(MBBI).operator==(ChkStkStub) &&
- "MBBI expected after __chkstk_stub.");
+ assert(std::prev(MBBI) == ChkStkStub &&
+ "MBBI expected after __chkstk_stub.");
DebugLoc DL = PrologMBB.findDebugLoc(MBBI);
emitStackProbeInline(MF, PrologMBB, MBBI, DL, true);
ChkStkStub->eraseFromParent();
}
}
-MachineInstr *X86FrameLowering::emitStackProbeInline(
- MachineFunction &MF, MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI, const DebugLoc &DL, bool InProlog) const {
+void X86FrameLowering::emitStackProbeInline(MachineFunction &MF,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ const DebugLoc &DL,
+ bool InProlog) const {
const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
assert(STI.is64Bit() && "different expansion needed for 32 bit");
assert(STI.isTargetWindowsCoreCLR() && "custom expansion expects CoreCLR");
@@ -612,7 +617,7 @@ MachineInstr *X86FrameLowering::emitStackProbeInline(
// lowest touched page on the stack, not the point at which the OS
// will cause an overflow exception, so this is just an optimization
// to avoid unnecessarily touching pages that are below the current
- // SP but already commited to the stack by the OS.
+ // SP but already committed to the stack by the OS.
BuildMI(&MBB, DL, TII.get(X86::MOV64rm), LimitReg)
.addReg(0)
.addImm(1)
@@ -699,13 +704,13 @@ MachineInstr *X86FrameLowering::emitStackProbeInline(
}
// Possible TODO: physreg liveness for InProlog case.
-
- return &*ContinueMBBI;
}
-MachineInstr *X86FrameLowering::emitStackProbeCall(
- MachineFunction &MF, MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI, const DebugLoc &DL, bool InProlog) const {
+void X86FrameLowering::emitStackProbeCall(MachineFunction &MF,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ const DebugLoc &DL,
+ bool InProlog) const {
bool IsLargeCodeModel = MF.getTarget().getCodeModel() == CodeModel::Large;
unsigned CallOp;
@@ -763,11 +768,9 @@ MachineInstr *X86FrameLowering::emitStackProbeCall(
for (++ExpansionMBBI; ExpansionMBBI != MBBI; ++ExpansionMBBI)
ExpansionMBBI->setFlag(MachineInstr::FrameSetup);
}
-
- return &*MBBI;
}
-MachineInstr *X86FrameLowering::emitStackProbeInlineStub(
+void X86FrameLowering::emitStackProbeInlineStub(
MachineFunction &MF, MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI, const DebugLoc &DL, bool InProlog) const {
@@ -775,8 +778,6 @@ MachineInstr *X86FrameLowering::emitStackProbeInlineStub(
BuildMI(MBB, MBBI, DL, TII.get(X86::CALLpcrel32))
.addExternalSymbol("__chkstk_stub");
-
- return &*MBBI;
}
static unsigned calculateSetFPREG(uint64_t SPAdjust) {
@@ -793,11 +794,11 @@ static unsigned calculateSetFPREG(uint64_t SPAdjust) {
// have a call out. Otherwise just make sure we have some alignment - we'll
// go with the minimum SlotSize.
uint64_t X86FrameLowering::calculateMaxStackAlign(const MachineFunction &MF) const {
- const MachineFrameInfo *MFI = MF.getFrameInfo();
- uint64_t MaxAlign = MFI->getMaxAlignment(); // Desired stack alignment.
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+ uint64_t MaxAlign = MFI.getMaxAlignment(); // Desired stack alignment.
unsigned StackAlign = getStackAlignment();
if (MF.getFunction()->hasFnAttribute("stackrealign")) {
- if (MFI->hasCalls())
+ if (MFI.hasCalls())
MaxAlign = (StackAlign > MaxAlign) ? StackAlign : MaxAlign;
else if (MaxAlign < SlotSize)
MaxAlign = SlotSize;
@@ -909,18 +910,18 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
assert(&STI == &MF.getSubtarget<X86Subtarget>() &&
"MF used frame lowering for wrong subtarget");
MachineBasicBlock::iterator MBBI = MBB.begin();
- MachineFrameInfo *MFI = MF.getFrameInfo();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
const Function *Fn = MF.getFunction();
MachineModuleInfo &MMI = MF.getMMI();
X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
uint64_t MaxAlign = calculateMaxStackAlign(MF); // Desired stack alignment.
- uint64_t StackSize = MFI->getStackSize(); // Number of bytes to allocate.
+ uint64_t StackSize = MFI.getStackSize(); // Number of bytes to allocate.
bool IsFunclet = MBB.isEHFuncletEntry();
EHPersonality Personality = EHPersonality::Unknown;
if (Fn->hasPersonalityFn())
Personality = classifyEHPersonality(Fn->getPersonalityFn());
bool FnHasClrFunclet =
- MMI.hasEHFunclets() && Personality == EHPersonality::CoreCLR;
+ MF.hasEHFunclets() && Personality == EHPersonality::CoreCLR;
bool IsClrFunclet = IsFunclet && FnHasClrFunclet;
bool HasFP = hasFP(MF);
bool IsWin64CC = STI.isCallingConvWin64(Fn->getCallingConv());
@@ -933,6 +934,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
STI.isTarget64BitILP32()
? getX86SubSuperRegister(FramePtr, 64) : FramePtr;
unsigned BasePtr = TRI->getBaseRegister();
+ bool HasWinCFI = false;
// Debug location must be unknown since the first debug location is used
// to determine the end of the prologue.
@@ -964,16 +966,16 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
// push and pop from the stack.
if (Is64Bit && !Fn->hasFnAttribute(Attribute::NoRedZone) &&
!TRI->needsStackRealignment(MF) &&
- !MFI->hasVarSizedObjects() && // No dynamic alloca.
- !MFI->adjustsStack() && // No calls.
- !IsWin64CC && // Win64 has no Red Zone
- !MFI->hasCopyImplyingStackAdjustment() && // Don't push and pop.
- !MF.shouldSplitStack()) { // Regular stack
+ !MFI.hasVarSizedObjects() && // No dynamic alloca.
+ !MFI.adjustsStack() && // No calls.
+ !IsWin64CC && // Win64 has no Red Zone
+ !MFI.hasCopyImplyingStackAdjustment() && // Don't push and pop.
+ !MF.shouldSplitStack()) { // Regular stack
uint64_t MinSize = X86FI->getCalleeSavedFrameSize();
if (HasFP) MinSize += SlotSize;
X86FI->setUsesRedZone(MinSize > 0 || StackSize > 0);
StackSize = std::max(MinSize, StackSize > 128 ? StackSize - 128 : 0);
- MFI->setStackSize(StackSize);
+ MFI.setStackSize(StackSize);
}
// Insert stack pointer adjustment for later moving of return addr. Only
@@ -1037,9 +1039,9 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
// guaranteed to be the last slot by processFunctionBeforeFrameFinalized.
// Update the frame offset adjustment.
if (!IsFunclet)
- MFI->setOffsetAdjustment(-NumBytes);
+ MFI.setOffsetAdjustment(-NumBytes);
else
- assert(MFI->getOffsetAdjustment() == -(int)NumBytes &&
+ assert(MFI.getOffsetAdjustment() == -(int)NumBytes &&
"should calculate same local variable offset for funclets");
// Save EBP/RBP into the appropriate stack slot.
@@ -1061,6 +1063,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
}
if (NeedsWinCFI) {
+ HasWinCFI = true;
BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_PushReg))
.addImm(FramePtr)
.setMIFlag(MachineInstr::FrameSetup);
@@ -1122,6 +1125,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
}
if (NeedsWinCFI) {
+ HasWinCFI = true;
BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_PushReg)).addImm(Reg).setMIFlag(
MachineInstr::FrameSetup);
}
@@ -1207,10 +1211,12 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
emitSPUpdate(MBB, MBBI, -(int64_t)NumBytes, /*InEpilogue=*/false);
}
- if (NeedsWinCFI && NumBytes)
+ if (NeedsWinCFI && NumBytes) {
+ HasWinCFI = true;
BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_StackAlloc))
.addImm(NumBytes)
.setMIFlag(MachineInstr::FrameSetup);
+ }
int SEHFrameOffset = 0;
unsigned SPOrEstablisher;
@@ -1257,6 +1263,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
// If this is not a funclet, emit the CFI describing our frame pointer.
if (NeedsWinCFI && !IsFunclet) {
+ HasWinCFI = true;
BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_SetFrame))
.addImm(FramePtr)
.addImm(SEHFrameOffset)
@@ -1293,6 +1300,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
int Offset = getFrameIndexReference(MF, FI, IgnoredFrameReg);
Offset += SEHFrameOffset;
+ HasWinCFI = true;
BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_SaveXMM))
.addImm(Reg)
.addImm(Offset)
@@ -1302,7 +1310,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
}
}
- if (NeedsWinCFI)
+ if (NeedsWinCFI && HasWinCFI)
BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_EndPrologue))
.setMIFlag(MachineInstr::FrameSetup);
@@ -1394,13 +1402,16 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
if (Fn->getCallingConv() == CallingConv::X86_INTR)
BuildMI(MBB, MBBI, DL, TII.get(X86::CLD))
.setMIFlag(MachineInstr::FrameSetup);
+
+ // At this point we know if the function has WinCFI or not.
+ MF.setHasWinCFI(HasWinCFI);
}
bool X86FrameLowering::canUseLEAForSPInEpilogue(
const MachineFunction &MF) const {
- // We can't use LEA instructions for adjusting the stack pointer if this is a
- // leaf function in the Win64 ABI. Only ADD instructions may be used to
- // deallocate the stack.
+ // We can't use LEA instructions for adjusting the stack pointer if we don't
+ // have a frame pointer in the Win64 ABI. Only ADD instructions may be used
+ // to deallocate the stack.
// This means that we can use LEA for SP in two situations:
// 1. We *aren't* using the Win64 ABI which means we are free to use LEA.
// 2. We *have* a frame pointer which means we are permitted to use LEA.
@@ -1457,7 +1468,7 @@ X86FrameLowering::getWinEHFuncletFrameSize(const MachineFunction &MF) const {
UsedSize = getPSPSlotOffsetFromSP(MF) + SlotSize;
} else {
// Other funclets just need enough stack for outgoing call arguments.
- UsedSize = MF.getFrameInfo()->getMaxCallFrameSize();
+ UsedSize = MF.getFrameInfo().getMaxCallFrameSize();
}
// RBP is not included in the callee saved register block. After pushing RBP,
// everything is 16 byte aligned. Everything we allocate before an outgoing
@@ -1477,10 +1488,12 @@ static bool isTailCallOpcode(unsigned Opc) {
void X86FrameLowering::emitEpilogue(MachineFunction &MF,
MachineBasicBlock &MBB) const {
- const MachineFrameInfo *MFI = MF.getFrameInfo();
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
- unsigned RetOpcode = MBBI->getOpcode();
+ Optional<unsigned> RetOpcode;
+ if (MBBI != MBB.end())
+ RetOpcode = MBBI->getOpcode();
DebugLoc DL;
if (MBBI != MBB.end())
DL = MBBI->getDebugLoc();
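The epilogue hunk above guards against a block with no terminator by wrapping the return opcode in an Optional and only dereferencing it after a presence check. A minimal equivalent of that pattern with std::optional (firstTerminatorOpcode and the plain vector of opcodes are illustrative assumptions, not the MachineBasicBlock API):

#include <cstdio>
#include <optional>
#include <vector>

// Return the opcode of the first terminator, if the block has one.
std::optional<unsigned> firstTerminatorOpcode(const std::vector<unsigned> &Terminators) {
  if (Terminators.empty())
    return std::nullopt;
  return Terminators.front();
}

int main() {
  std::vector<unsigned> Empty;
  auto RetOpcode = firstTerminatorOpcode(Empty);
  const unsigned CATCHRET = 42; // stand-in opcode value
  if (RetOpcode && *RetOpcode == CATCHRET)
    std::printf("funclet return\n");
  else
    std::printf("no terminator or ordinary return\n");
}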
@@ -1493,16 +1506,16 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
bool IsWin64Prologue = MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
bool NeedsWinCFI =
IsWin64Prologue && MF.getFunction()->needsUnwindTableEntry();
- bool IsFunclet = isFuncletReturnInstr(*MBBI);
+ bool IsFunclet = MBBI == MBB.end() ? false : isFuncletReturnInstr(*MBBI);
MachineBasicBlock *TargetMBB = nullptr;
// Get the number of bytes to allocate from the FrameInfo.
- uint64_t StackSize = MFI->getStackSize();
+ uint64_t StackSize = MFI.getStackSize();
uint64_t MaxAlign = calculateMaxStackAlign(MF);
unsigned CSSize = X86FI->getCalleeSavedFrameSize();
uint64_t NumBytes = 0;
- if (MBBI->getOpcode() == X86::CATCHRET) {
+ if (RetOpcode && *RetOpcode == X86::CATCHRET) {
// SEH shouldn't use catchret.
assert(!isAsynchronousEHPersonality(
classifyEHPersonality(MF.getFunction()->getPersonalityFn())) &&
@@ -1516,7 +1529,7 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::POP64r : X86::POP32r),
MachineFramePtr)
.setMIFlag(MachineInstr::FrameDestroy);
- } else if (MBBI->getOpcode() == X86::CLEANUPRET) {
+ } else if (RetOpcode && *RetOpcode == X86::CLEANUPRET) {
NumBytes = getWinEHFuncletFrameSize(MF);
assert(hasFP(MF) && "EH funclets without FP not yet implemented");
BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::POP64r : X86::POP32r),
@@ -1541,19 +1554,22 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
}
uint64_t SEHStackAllocAmt = NumBytes;
+ MachineBasicBlock::iterator FirstCSPop = MBBI;
// Skip the callee-saved pop instructions.
while (MBBI != MBB.begin()) {
MachineBasicBlock::iterator PI = std::prev(MBBI);
unsigned Opc = PI->getOpcode();
- if ((Opc != X86::POP32r || !PI->getFlag(MachineInstr::FrameDestroy)) &&
- (Opc != X86::POP64r || !PI->getFlag(MachineInstr::FrameDestroy)) &&
- Opc != X86::DBG_VALUE && !PI->isTerminator())
- break;
+ if (Opc != X86::DBG_VALUE && !PI->isTerminator()) {
+ if ((Opc != X86::POP32r || !PI->getFlag(MachineInstr::FrameDestroy)) &&
+ (Opc != X86::POP64r || !PI->getFlag(MachineInstr::FrameDestroy)))
+ break;
+ FirstCSPop = PI;
+ }
--MBBI;
}
- MachineBasicBlock::iterator FirstCSPop = MBBI;
+ MBBI = FirstCSPop;
if (TargetMBB) {
// Fill EAX/RAX with the address of the target block.
@@ -1581,14 +1597,14 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
// If there is an ADD32ri or SUB32ri of ESP immediately before this
// instruction, merge the two instructions.
- if (NumBytes || MFI->hasVarSizedObjects())
+ if (NumBytes || MFI.hasVarSizedObjects())
NumBytes += mergeSPUpdates(MBB, MBBI, true);
// If dynamic alloca is used, then reset esp to point to the last callee-saved
// slot before popping them off! Same applies for the case, when stack was
// realigned. Don't do this if this was a funclet epilogue, since the funclets
// will not do realignment or dynamic stack allocation.
- if ((TRI->needsStackRealignment(MF) || MFI->hasVarSizedObjects()) &&
+ if ((TRI->needsStackRealignment(MF) || MFI.hasVarSizedObjects()) &&
!IsFunclet) {
if (TRI->needsStackRealignment(MF))
MBBI = FirstCSPop;
@@ -1626,10 +1642,10 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
// into the epilogue. To cope with that, we insert an epilogue marker here,
// then replace it with a 'nop' if it ends up immediately after a CALL in the
// final emitted code.
- if (NeedsWinCFI)
+ if (NeedsWinCFI && MF.hasWinCFI())
BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_Epilogue));
- if (!isTailCallOpcode(RetOpcode)) {
+ if (!RetOpcode || !isTailCallOpcode(*RetOpcode)) {
// Add the return addr area delta back since we are not tail calling.
int Offset = -1 * X86FI->getTCReturnAddrDelta();
assert(Offset >= 0 && "TCDelta should never be positive");
@@ -1649,7 +1665,7 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
// (probably?) it should be moved into here.
int X86FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
unsigned &FrameReg) const {
- const MachineFrameInfo *MFI = MF.getFrameInfo();
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
// We can't calculate offset from frame pointer if the stack is realigned,
// so enforce usage of stack/base pointer. The base pointer is used when we
@@ -1665,16 +1681,16 @@ int X86FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
// object.
// We need to factor in additional offsets applied during the prologue to the
// frame, base, and stack pointer depending on which is used.
- int Offset = MFI->getObjectOffset(FI) - getOffsetOfLocalArea();
+ int Offset = MFI.getObjectOffset(FI) - getOffsetOfLocalArea();
const X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
unsigned CSSize = X86FI->getCalleeSavedFrameSize();
- uint64_t StackSize = MFI->getStackSize();
+ uint64_t StackSize = MFI.getStackSize();
bool HasFP = hasFP(MF);
bool IsWin64Prologue = MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
int64_t FPDelta = 0;
if (IsWin64Prologue) {
- assert(!MFI->hasCalls() || (StackSize % 16) == 8);
+ assert(!MFI.hasCalls() || (StackSize % 16) == 8);
// Calculate required stack adjustment.
uint64_t FrameSize = StackSize - SlotSize;
@@ -1692,7 +1708,7 @@ int X86FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
// restricted Win64 prologue.
// Add FPDelta to all offsets below that go through the frame pointer.
FPDelta = FrameSize - SEHFrameOffset;
- assert((!MFI->hasCalls() || (FPDelta % 16) == 0) &&
+ assert((!MFI.hasCalls() || (FPDelta % 16) == 0) &&
"FPDelta isn't aligned per the Win64 ABI!");
}
@@ -1703,7 +1719,7 @@ int X86FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
// Skip the saved EBP.
return Offset + SlotSize + FPDelta;
} else {
- assert((-(Offset + StackSize)) % MFI->getObjectAlignment(FI) == 0);
+ assert((-(Offset + StackSize)) % MFI.getObjectAlignment(FI) == 0);
return Offset + StackSize;
}
} else if (TRI->needsStackRealignment(MF)) {
@@ -1711,7 +1727,7 @@ int X86FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
// Skip the saved EBP.
return Offset + SlotSize + FPDelta;
} else {
- assert((-(Offset + StackSize)) % MFI->getObjectAlignment(FI) == 0);
+ assert((-(Offset + StackSize)) % MFI.getObjectAlignment(FI) == 0);
return Offset + StackSize;
}
// FIXME: Support tail calls
@@ -1736,9 +1752,9 @@ X86FrameLowering::getFrameIndexReferencePreferSP(const MachineFunction &MF,
int FI, unsigned &FrameReg,
bool IgnoreSPUpdates) const {
- const MachineFrameInfo *MFI = MF.getFrameInfo();
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
// Does not include any dynamic realign.
- const uint64_t StackSize = MFI->getStackSize();
+ const uint64_t StackSize = MFI.getStackSize();
// LLVM arranges the stack as follows:
// ...
// ARG2
@@ -1772,7 +1788,7 @@ X86FrameLowering::getFrameIndexReferencePreferSP(const MachineFunction &MF,
// answer we give is relative to the SP after the prologue, and not the
// SP in the middle of the function.
- if (MFI->isFixedObjectIndex(FI) && TRI->needsStackRealignment(MF) &&
+ if (MFI.isFixedObjectIndex(FI) && TRI->needsStackRealignment(MF) &&
!STI.isTargetWin64())
return getFrameIndexReference(MF, FI, FrameReg);
@@ -1804,7 +1820,7 @@ X86FrameLowering::getFrameIndexReferencePreferSP(const MachineFunction &MF,
//
// A is the incoming stack pointer.
// (B - A) is the local area offset (-8 for x86-64) [1]
- // (C - A) is the Offset returned by MFI->getObjectOffset for Obj0 [2]
+ // (C - A) is the Offset returned by MFI.getObjectOffset for Obj0 [2]
//
// |(E - B)| is the StackSize (absolute value, positive). For a
// stack that grown down, this works out to be (B - E). [3]
@@ -1817,7 +1833,7 @@ X86FrameLowering::getFrameIndexReferencePreferSP(const MachineFunction &MF,
//
// Get the Offset from the StackPointer
- int Offset = MFI->getObjectOffset(FI) - getOffsetOfLocalArea();
+ int Offset = MFI.getObjectOffset(FI) - getOffsetOfLocalArea();
return Offset + StackSize;
}
@@ -1825,7 +1841,7 @@ X86FrameLowering::getFrameIndexReferencePreferSP(const MachineFunction &MF,
bool X86FrameLowering::assignCalleeSavedSpillSlots(
MachineFunction &MF, const TargetRegisterInfo *TRI,
std::vector<CalleeSavedInfo> &CSI) const {
- MachineFrameInfo *MFI = MF.getFrameInfo();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
unsigned CalleeSavedFrameSize = 0;
@@ -1834,7 +1850,7 @@ bool X86FrameLowering::assignCalleeSavedSpillSlots(
if (hasFP(MF)) {
// emitPrologue always spills frame register the first thing.
SpillSlotOffset -= SlotSize;
- MFI->CreateFixedSpillStackObject(SlotSize, SpillSlotOffset);
+ MFI.CreateFixedSpillStackObject(SlotSize, SpillSlotOffset);
// Since emitPrologue and emitEpilogue will handle spilling and restoring of
// the frame register, we can delete it from CSI list and not have to worry
@@ -1858,7 +1874,7 @@ bool X86FrameLowering::assignCalleeSavedSpillSlots(
SpillSlotOffset -= SlotSize;
CalleeSavedFrameSize += SlotSize;
- int SlotIndex = MFI->CreateFixedSpillStackObject(SlotSize, SpillSlotOffset);
+ int SlotIndex = MFI.CreateFixedSpillStackObject(SlotSize, SpillSlotOffset);
CSI[i - 1].setFrameIdx(SlotIndex);
}
@@ -1876,9 +1892,9 @@ bool X86FrameLowering::assignCalleeSavedSpillSlots(
// spill into slot
SpillSlotOffset -= RC->getSize();
int SlotIndex =
- MFI->CreateFixedSpillStackObject(RC->getSize(), SpillSlotOffset);
+ MFI.CreateFixedSpillStackObject(RC->getSize(), SpillSlotOffset);
CSI[i - 1].setFrameIdx(SlotIndex);
- MFI->ensureMaxAlignment(RC->getAlignment());
+ MFI.ensureMaxAlignment(RC->getAlignment());
}
return true;
@@ -1957,7 +1973,7 @@ bool X86FrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
if (CSI.empty())
return false;
- if (isFuncletReturnInstr(*MI) && STI.isOSWindows()) {
+ if (MI != MBB.end() && isFuncletReturnInstr(*MI) && STI.isOSWindows()) {
// Don't restore CSRs in 32-bit EH funclets. Matches
// spillCalleeSavedRegisters.
if (STI.is32Bit())
@@ -2005,7 +2021,7 @@ void X86FrameLowering::determineCalleeSaves(MachineFunction &MF,
RegScavenger *RS) const {
TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
- MachineFrameInfo *MFI = MF.getFrameInfo();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
int64_t TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta();
@@ -2020,7 +2036,7 @@ void X86FrameLowering::determineCalleeSaves(MachineFunction &MF,
// ...
// }
// [EBP]
- MFI->CreateFixedObject(-TailCallReturnAddrDelta,
+ MFI.CreateFixedObject(-TailCallReturnAddrDelta,
TailCallReturnAddrDelta - SlotSize, true);
}
@@ -2029,8 +2045,8 @@ void X86FrameLowering::determineCalleeSaves(MachineFunction &MF,
SavedRegs.set(TRI->getBaseRegister());
// Allocate a spill slot for EBP if we have a base pointer and EH funclets.
- if (MF.getMMI().hasEHFunclets()) {
- int FI = MFI->CreateSpillStackObject(SlotSize, SlotSize);
+ if (MF.hasEHFunclets()) {
+ int FI = MFI.CreateSpillStackObject(SlotSize, SlotSize);
X86FI->setHasSEHFramePtrSave(true);
X86FI->setSEHFramePtrSaveIndex(FI);
}
@@ -2091,7 +2107,7 @@ static const uint64_t kSplitStackAvailable = 256;
void X86FrameLowering::adjustForSegmentedStacks(
MachineFunction &MF, MachineBasicBlock &PrologueMBB) const {
- MachineFrameInfo *MFI = MF.getFrameInfo();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
uint64_t StackSize;
unsigned TlsReg, TlsOffset;
DebugLoc DL;
@@ -2114,7 +2130,7 @@ void X86FrameLowering::adjustForSegmentedStacks(
// Eventually StackSize will be calculated by a link-time pass; which will
// also decide whether checking code needs to be injected into this particular
// prologue.
- StackSize = MFI->getStackSize();
+ StackSize = MFI.getStackSize();
// Do not generate a prologue for functions with a stack of size zero
if (StackSize == 0)
@@ -2360,7 +2376,7 @@ static unsigned getHiPELiteral(
/// if( temp0 < SP_LIMIT(P) ) goto IncStack else goto OldStart
void X86FrameLowering::adjustForHiPEPrologue(
MachineFunction &MF, MachineBasicBlock &PrologueMBB) const {
- MachineFrameInfo *MFI = MF.getFrameInfo();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
DebugLoc DL;
// To support shrink-wrapping we would need to insert the new blocks
@@ -2380,7 +2396,7 @@ void X86FrameLowering::adjustForHiPEPrologue(
const unsigned Guaranteed = HipeLeafWords * SlotSize;
unsigned CallerStkArity = MF.getFunction()->arg_size() > CCRegisteredArgs ?
MF.getFunction()->arg_size() - CCRegisteredArgs : 0;
- unsigned MaxStack = MFI->getStackSize() + CallerStkArity*SlotSize + SlotSize;
+ unsigned MaxStack = MFI.getStackSize() + CallerStkArity*SlotSize + SlotSize;
assert(STI.isTargetLinux() &&
"HiPE prologue is only supported on Linux operating systems.");
@@ -2392,7 +2408,7 @@ void X86FrameLowering::adjustForHiPEPrologue(
// b) outgoing on-stack parameter areas, and
// c) the minimum stack space this function needs to make available for the
// functions it calls (a tunable ABI property).
- if (MFI->hasCalls()) {
+ if (MFI.hasCalls()) {
unsigned MoreStackForCalls = 0;
for (auto &MBB : MF) {
@@ -2574,6 +2590,7 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
uint64_t Amount = !reserveCallFrame ? I->getOperand(0).getImm() : 0;
uint64_t InternalAmt = (isDestroy || Amount) ? I->getOperand(1).getImm() : 0;
I = MBB.erase(I);
+ auto InsertPos = skipDebugInstructionsForward(I, MBB.end());
if (!reserveCallFrame) {
// If the stack pointer can be changed after prologue, turn the
@@ -2599,12 +2616,11 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
// GNU_ARGS_SIZE.
// TODO: We don't need to reset this between subsequent functions,
// if it didn't change.
- bool HasDwarfEHHandlers = !WindowsCFI &&
- !MF.getMMI().getLandingPads().empty();
+ bool HasDwarfEHHandlers = !WindowsCFI && !MF.getLandingPads().empty();
if (HasDwarfEHHandlers && !isDestroy &&
MF.getInfo<X86MachineFunctionInfo>()->getHasPushSequences())
- BuildCFI(MBB, I, DL,
+ BuildCFI(MBB, InsertPos, DL,
MCCFIInstruction::createGnuArgsSize(nullptr, Amount));
if (Amount == 0)
@@ -2618,7 +2634,7 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
// If this is a callee-pop calling convention, emit a CFA adjust for
// the amount the callee popped.
if (isDestroy && InternalAmt && DwarfCFI && !hasFP(MF))
- BuildCFI(MBB, I, DL,
+ BuildCFI(MBB, InsertPos, DL,
MCCFIInstruction::createAdjustCfaOffset(nullptr, -InternalAmt));
// Add Amount to SP to destroy a frame, or subtract to setup.
@@ -2629,13 +2645,13 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
// Merge with any previous or following adjustment instruction. Note: the
// instructions merged with here do not have CFI, so their stack
// adjustments do not feed into CfaAdjustment.
- StackAdjustment += mergeSPUpdates(MBB, I, true);
- StackAdjustment += mergeSPUpdates(MBB, I, false);
+ StackAdjustment += mergeSPUpdates(MBB, InsertPos, true);
+ StackAdjustment += mergeSPUpdates(MBB, InsertPos, false);
if (StackAdjustment) {
if (!(Fn->optForMinSize() &&
- adjustStackWithPops(MBB, I, DL, StackAdjustment)))
- BuildStackAdjustment(MBB, I, DL, StackAdjustment,
+ adjustStackWithPops(MBB, InsertPos, DL, StackAdjustment)))
+ BuildStackAdjustment(MBB, InsertPos, DL, StackAdjustment,
/*InEpilogue=*/false);
}
}
@@ -2651,8 +2667,9 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
// TODO: When not using precise CFA, we also need to adjust for the
// InternalAmt here.
if (CfaAdjustment) {
- BuildCFI(MBB, I, DL, MCCFIInstruction::createAdjustCfaOffset(
- nullptr, CfaAdjustment));
+ BuildCFI(MBB, InsertPos, DL,
+ MCCFIInstruction::createAdjustCfaOffset(nullptr,
+ CfaAdjustment));
}
}
@@ -2728,12 +2745,12 @@ MachineBasicBlock::iterator X86FrameLowering::restoreWin32EHStackPointers(
unsigned BasePtr = TRI->getBaseRegister();
WinEHFuncInfo &FuncInfo = *MF.getWinEHFuncInfo();
X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
- MachineFrameInfo *MFI = MF.getFrameInfo();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
// FIXME: Don't set FrameSetup flag in catchret case.
int FI = FuncInfo.EHRegNodeFrameIndex;
- int EHRegSize = MFI->getObjectSize(FI);
+ int EHRegSize = MFI.getObjectSize(FI);
if (RestoreSP) {
// MOV32rm -EHRegSize(%ebp), %esp
@@ -2850,7 +2867,7 @@ struct X86FrameSortingComparator {
// of uses and size of object in order to minimize code size.
void X86FrameLowering::orderFrameObjects(
const MachineFunction &MF, SmallVectorImpl<int> &ObjectsToAllocate) const {
- const MachineFrameInfo *MFI = MF.getFrameInfo();
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
// Don't waste time if there's nothing to do.
if (ObjectsToAllocate.empty())
@@ -2861,16 +2878,16 @@ void X86FrameLowering::orderFrameObjects(
// it easier to index into when we're counting "uses" down below.
// We want to be able to easily/cheaply access an object by simply
// indexing into it, instead of having to search for it every time.
- std::vector<X86FrameSortingObject> SortingObjects(MFI->getObjectIndexEnd());
+ std::vector<X86FrameSortingObject> SortingObjects(MFI.getObjectIndexEnd());
// Walk the objects we care about and mark them as such in our working
// struct.
for (auto &Obj : ObjectsToAllocate) {
SortingObjects[Obj].IsValid = true;
SortingObjects[Obj].ObjectIndex = Obj;
- SortingObjects[Obj].ObjectAlignment = MFI->getObjectAlignment(Obj);
+ SortingObjects[Obj].ObjectAlignment = MFI.getObjectAlignment(Obj);
// Set the size.
- int ObjectSize = MFI->getObjectSize(Obj);
+ int ObjectSize = MFI.getObjectSize(Obj);
if (ObjectSize == 0)
// Variable size. Just use 4.
SortingObjects[Obj].ObjectSize = 4;
@@ -2890,7 +2907,7 @@ void X86FrameLowering::orderFrameObjects(
int Index = MO.getIndex();
// Check to see if it falls within our range, and is tagged
// to require ordering.
- if (Index >= 0 && Index < MFI->getObjectIndexEnd() &&
+ if (Index >= 0 && Index < MFI.getObjectIndexEnd() &&
SortingObjects[Index].IsValid)
SortingObjects[Index].ObjectNumUses++;
}
@@ -2938,7 +2955,7 @@ void X86FrameLowering::processFunctionBeforeFrameFinalized(
// If this function isn't doing Win64-style C++ EH, we don't need to do
// anything.
const Function *Fn = MF.getFunction();
- if (!STI.is64Bit() || !MF.getMMI().hasEHFunclets() ||
+ if (!STI.is64Bit() || !MF.hasEHFunclets() ||
classifyEHPersonality(Fn->getPersonalityFn()) != EHPersonality::MSVC_CXX)
return;
@@ -2947,21 +2964,21 @@ void X86FrameLowering::processFunctionBeforeFrameFinalized(
// object, so that we can allocate a slot immediately following it. If there
// were no fixed objects, use offset -SlotSize, which is immediately after the
// return address. Fixed objects have negative frame indices.
- MachineFrameInfo *MFI = MF.getFrameInfo();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
WinEHFuncInfo &EHInfo = *MF.getWinEHFuncInfo();
int64_t MinFixedObjOffset = -SlotSize;
- for (int I = MFI->getObjectIndexBegin(); I < 0; ++I)
- MinFixedObjOffset = std::min(MinFixedObjOffset, MFI->getObjectOffset(I));
+ for (int I = MFI.getObjectIndexBegin(); I < 0; ++I)
+ MinFixedObjOffset = std::min(MinFixedObjOffset, MFI.getObjectOffset(I));
for (WinEHTryBlockMapEntry &TBME : EHInfo.TryBlockMap) {
for (WinEHHandlerType &H : TBME.HandlerArray) {
int FrameIndex = H.CatchObj.FrameIndex;
if (FrameIndex != INT_MAX) {
// Ensure alignment.
- unsigned Align = MFI->getObjectAlignment(FrameIndex);
+ unsigned Align = MFI.getObjectAlignment(FrameIndex);
MinFixedObjOffset -= std::abs(MinFixedObjOffset) % Align;
- MinFixedObjOffset -= MFI->getObjectSize(FrameIndex);
- MFI->setObjectOffset(FrameIndex, MinFixedObjOffset);
+ MinFixedObjOffset -= MFI.getObjectSize(FrameIndex);
+ MFI.setObjectOffset(FrameIndex, MinFixedObjOffset);
}
}
}
@@ -2970,7 +2987,7 @@ void X86FrameLowering::processFunctionBeforeFrameFinalized(
MinFixedObjOffset -= std::abs(MinFixedObjOffset) % 8;
int64_t UnwindHelpOffset = MinFixedObjOffset - SlotSize;
int UnwindHelpFI =
- MFI->CreateFixedObject(SlotSize, UnwindHelpOffset, /*Immutable=*/false);
+ MFI.CreateFixedObject(SlotSize, UnwindHelpOffset, /*Immutable=*/false);
EHInfo.UnwindHelpFrameIdx = UnwindHelpFI;
// Store -2 into UnwindHelp on function entry. We have to scan forwards past
diff --git a/contrib/llvm/lib/Target/X86/X86FrameLowering.h b/contrib/llvm/lib/Target/X86/X86FrameLowering.h
index 4a01014..e1b04d6 100644
--- a/contrib/llvm/lib/Target/X86/X86FrameLowering.h
+++ b/contrib/llvm/lib/Target/X86/X86FrameLowering.h
@@ -49,11 +49,10 @@ public:
/// Emit target stack probe code. This is required for all
/// large stack allocations on Windows. The caller is required to materialize
- /// the number of bytes to probe in RAX/EAX. Returns instruction just
- /// after the expansion.
- MachineInstr *emitStackProbe(MachineFunction &MF, MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI,
- const DebugLoc &DL, bool InProlog) const;
+ /// the number of bytes to probe in RAX/EAX.
+ void emitStackProbe(MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI, const DebugLoc &DL,
+ bool InProlog) const;
/// Replace a StackProbe inline-stub with the actual probe code inline.
void inlineStackProbe(MachineFunction &MF,
@@ -179,22 +178,19 @@ private:
uint64_t calculateMaxStackAlign(const MachineFunction &MF) const;
/// Emit target stack probe as a call to a helper function
- MachineInstr *emitStackProbeCall(MachineFunction &MF, MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI,
- const DebugLoc &DL, bool InProlog) const;
+ void emitStackProbeCall(MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI, const DebugLoc &DL,
+ bool InProlog) const;
/// Emit target stack probe as an inline sequence.
- MachineInstr *emitStackProbeInline(MachineFunction &MF,
- MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI,
- const DebugLoc &DL, bool InProlog) const;
+ void emitStackProbeInline(MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ const DebugLoc &DL, bool InProlog) const;
/// Emit a stub to later inline the target stack probe.
- MachineInstr *emitStackProbeInlineStub(MachineFunction &MF,
- MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI,
- const DebugLoc &DL,
- bool InProlog) const;
+ void emitStackProbeInlineStub(MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ const DebugLoc &DL, bool InProlog) const;
/// Aligns the stack pointer by ANDing it with -MaxAlign.
void BuildStackAlignAND(MachineBasicBlock &MBB,
diff --git a/contrib/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/contrib/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
index 7d53b3d..8ab4c06 100644
--- a/contrib/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/contrib/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -24,6 +24,7 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
@@ -165,7 +166,7 @@ namespace {
: SelectionDAGISel(tm, OptLevel), OptForSize(false),
OptForMinSize(false) {}
- const char *getPassName() const override {
+ StringRef getPassName() const override {
return "X86 DAG->DAG Instruction Selection";
}
@@ -182,16 +183,6 @@ namespace {
void PreprocessISelDAG() override;
- inline bool immSext8(SDNode *N) const {
- return isInt<8>(cast<ConstantSDNode>(N)->getSExtValue());
- }
-
- // True if the 64-bit immediate fits in a 32-bit sign-extended field.
- inline bool i64immSExt32(SDNode *N) const {
- uint64_t v = cast<ConstantSDNode>(N)->getZExtValue();
- return (int64_t)v == (int32_t)v;
- }
-
// Include the pieces autogenerated from the target description.
#include "X86GenDAGISel.inc"
@@ -228,6 +219,7 @@ namespace {
SDValue &Index, SDValue &Disp,
SDValue &Segment,
SDValue &NodeWithChain);
+ bool selectRelocImm(SDValue N, SDValue &Op);
bool tryFoldLoad(SDNode *P, SDValue N,
SDValue &Base, SDValue &Scale,
@@ -1234,7 +1226,7 @@ bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
case ISD::UMUL_LOHI:
// A mul_lohi where we need the low part can be folded as a plain multiply.
if (N.getResNo() != 0) break;
- // FALL THROUGH
+ LLVM_FALLTHROUGH;
case ISD::MUL:
case X86ISD::MUL_IMM:
// X*[3,5,9] -> X+X*[2,4,8]
@@ -1435,7 +1427,7 @@ bool X86DAGToDAGISel::selectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base,
SDLoc DL(N);
Base = Mgs->getBasePtr();
Index = Mgs->getIndex();
- unsigned ScalarSize = Mgs->getValue().getValueType().getScalarSizeInBits();
+ unsigned ScalarSize = Mgs->getValue().getScalarValueSizeInBits();
Scale = getI8Imm(ScalarSize/8, DL);
// If Base is 0, the whole address is in index and the Scale is 1
@@ -1512,16 +1504,39 @@ bool X86DAGToDAGISel::selectScalarSSELoad(SDNode *Root,
SDValue &Scale, SDValue &Index,
SDValue &Disp, SDValue &Segment,
SDValue &PatternNodeWithChain) {
- if (N.getOpcode() == ISD::SCALAR_TO_VECTOR) {
+ // We can allow a full vector load here since narrowing a load is ok.
+ if (ISD::isNON_EXTLoad(N.getNode())) {
+ PatternNodeWithChain = N;
+ if (IsProfitableToFold(PatternNodeWithChain, N.getNode(), Root) &&
+ IsLegalToFold(PatternNodeWithChain, *N->use_begin(), Root, OptLevel)) {
+ LoadSDNode *LD = cast<LoadSDNode>(PatternNodeWithChain);
+ return selectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp,
+ Segment);
+ }
+ }
+
+ // We can also match the special zero extended load opcode.
+ if (N.getOpcode() == X86ISD::VZEXT_LOAD) {
+ PatternNodeWithChain = N;
+ if (IsProfitableToFold(PatternNodeWithChain, N.getNode(), Root) &&
+ IsLegalToFold(PatternNodeWithChain, *N->use_begin(), Root, OptLevel)) {
+ auto *MI = cast<MemIntrinsicSDNode>(PatternNodeWithChain);
+ return selectAddr(MI, MI->getBasePtr(), Base, Scale, Index, Disp,
+ Segment);
+ }
+ }
+
+ // Need to make sure that the SCALAR_TO_VECTOR and load are both only used
+ // once. Otherwise the load might get duplicated and the chain output of the
+ // duplicate load will not be observed by all dependencies.
+ if (N.getOpcode() == ISD::SCALAR_TO_VECTOR && N.getNode()->hasOneUse()) {
PatternNodeWithChain = N.getOperand(0);
if (ISD::isNON_EXTLoad(PatternNodeWithChain.getNode()) &&
- PatternNodeWithChain.hasOneUse() &&
- IsProfitableToFold(N.getOperand(0), N.getNode(), Root) &&
- IsLegalToFold(N.getOperand(0), N.getNode(), Root, OptLevel)) {
+ IsProfitableToFold(PatternNodeWithChain, N.getNode(), Root) &&
+ IsLegalToFold(PatternNodeWithChain, N.getNode(), Root, OptLevel)) {
LoadSDNode *LD = cast<LoadSDNode>(PatternNodeWithChain);
- if (!selectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp, Segment))
- return false;
- return true;
+ return selectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp,
+ Segment);
}
}
@@ -1530,18 +1545,18 @@ bool X86DAGToDAGISel::selectScalarSSELoad(SDNode *Root,
if (N.getOpcode() == X86ISD::VZEXT_MOVL && N.getNode()->hasOneUse() &&
// Check to see if the top elements are all zeros (or bitcast of zeros).
N.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
- N.getOperand(0).getNode()->hasOneUse() &&
- ISD::isNON_EXTLoad(N.getOperand(0).getOperand(0).getNode()) &&
- N.getOperand(0).getOperand(0).hasOneUse() &&
- IsProfitableToFold(N.getOperand(0), N.getNode(), Root) &&
- IsLegalToFold(N.getOperand(0), N.getNode(), Root, OptLevel)) {
- // Okay, this is a zero extending load. Fold it.
- LoadSDNode *LD = cast<LoadSDNode>(N.getOperand(0).getOperand(0));
- if (!selectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp, Segment))
- return false;
- PatternNodeWithChain = SDValue(LD, 0);
- return true;
+ N.getOperand(0).getNode()->hasOneUse()) {
+ PatternNodeWithChain = N.getOperand(0).getOperand(0);
+ if (ISD::isNON_EXTLoad(PatternNodeWithChain.getNode()) &&
+ IsProfitableToFold(PatternNodeWithChain, N.getNode(), Root) &&
+ IsLegalToFold(PatternNodeWithChain, N.getNode(), Root, OptLevel)) {
+ // Okay, this is a zero extending load. Fold it.
+ LoadSDNode *LD = cast<LoadSDNode>(PatternNodeWithChain);
+ return selectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp,
+ Segment);
+ }
}
+
return false;
}
@@ -1563,16 +1578,21 @@ bool X86DAGToDAGISel::selectMOV64Imm32(SDValue N, SDValue &Imm) {
"Unexpected node type for MOV32ri64");
N = N.getOperand(0);
- if (N->getOpcode() != ISD::TargetConstantPool &&
- N->getOpcode() != ISD::TargetJumpTable &&
- N->getOpcode() != ISD::TargetGlobalAddress &&
- N->getOpcode() != ISD::TargetExternalSymbol &&
- N->getOpcode() != ISD::MCSymbol &&
- N->getOpcode() != ISD::TargetBlockAddress)
+ // At least GNU as does not accept 'movl' for TPOFF relocations.
+ // FIXME: We could use 'movl' when we know we are targeting MC.
+ if (N->getOpcode() == ISD::TargetGlobalTLSAddress)
return false;
Imm = N;
- return TM.getCodeModel() == CodeModel::Small;
+ if (N->getOpcode() != ISD::TargetGlobalAddress)
+ return TM.getCodeModel() == CodeModel::Small;
+
+ Optional<ConstantRange> CR =
+ cast<GlobalAddressSDNode>(N)->getGlobal()->getAbsoluteSymbolRange();
+ if (!CR)
+ return TM.getCodeModel() == CodeModel::Small;
+
+ return CR->getUnsignedMax().ult(1ull << 32);
}
bool X86DAGToDAGISel::selectLEA64_32Addr(SDValue N, SDValue &Base,
@@ -1704,6 +1724,48 @@ bool X86DAGToDAGISel::selectTLSADDRAddr(SDValue N, SDValue &Base,
return true;
}
+bool X86DAGToDAGISel::selectRelocImm(SDValue N, SDValue &Op) {
+ if (auto *CN = dyn_cast<ConstantSDNode>(N)) {
+ Op = CurDAG->getTargetConstant(CN->getAPIntValue(), SDLoc(CN),
+ N.getValueType());
+ return true;
+ }
+
+ // Keep track of the original value type and whether this value was
+ // truncated. If we see a truncation from pointer type to VT that truncates
+ // bits that are known to be zero, we can use a narrow reference.
+ EVT VT = N.getValueType();
+ bool WasTruncated = false;
+ if (N.getOpcode() == ISD::TRUNCATE) {
+ WasTruncated = true;
+ N = N.getOperand(0);
+ }
+
+ if (N.getOpcode() != X86ISD::Wrapper)
+ return false;
+
+ // We can only use non-GlobalValues as immediates if they were not truncated,
+ // as we do not have any range information. If we have a GlobalValue and the
+ // address was not truncated, we can select it as an operand directly.
+ unsigned Opc = N.getOperand(0)->getOpcode();
+ if (Opc != ISD::TargetGlobalAddress || !WasTruncated) {
+ Op = N.getOperand(0);
+ // We can only select the operand directly if we didn't have to look past a
+ // truncate.
+ return !WasTruncated;
+ }
+
+ // Check that the global's range fits into VT.
+ auto *GA = cast<GlobalAddressSDNode>(N.getOperand(0));
+ Optional<ConstantRange> CR = GA->getGlobal()->getAbsoluteSymbolRange();
+ if (!CR || CR->getUnsignedMax().uge(1ull << VT.getSizeInBits()))
+ return false;
+
+ // Okay, we can use a narrow reference.
+ Op = CurDAG->getTargetGlobalAddress(GA->getGlobal(), SDLoc(N), VT,
+ GA->getOffset(), GA->getTargetFlags());
+ return true;
+}
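The same fits-in-range test drives both selectMOV64Imm32 above and the new selectRelocImm: a symbolic value may be selected at a narrower width only when the symbol's absolute range, if known, stays below 2^N; otherwise the code falls back to the code-model check or refuses the fold. A minimal standalone sketch of that decision, with SymbolRange as an illustrative stand-in for llvm::ConstantRange (not LLVM API):

    #include <cstdint>
    #include <optional>

    // Illustrative stand-in for the unsigned maximum of a symbol's absolute range.
    struct SymbolRange { uint64_t UnsignedMax; };

    // With no range information, defer to the caller's fallback (the small
    // code-model check, or outright rejection after a truncate); with range
    // information, narrow only if every possible value fits in Bits bits.
    bool fitsInBits(const std::optional<SymbolRange> &Range, unsigned Bits,
                    bool FallbackAllowed) {
      if (!Range)
        return FallbackAllowed;
      return Bits >= 64 || Range->UnsignedMax < (1ull << Bits);
    }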
bool X86DAGToDAGISel::tryFoldLoad(SDNode *P, SDValue N,
SDValue &Base, SDValue &Scale,
@@ -2700,7 +2762,7 @@ SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID,
case InlineAsm::Constraint_i:
// FIXME: It seems strange that 'i' is needed here since it's supposed to
// be an immediate and not a memory constraint.
- // Fallthrough.
+ LLVM_FALLTHROUGH;
case InlineAsm::Constraint_o: // offsetable ??
case InlineAsm::Constraint_v: // not offsetable ??
case InlineAsm::Constraint_m: // memory
diff --git a/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp b/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp
index f499e56..08fe2ba 100644
--- a/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -17,6 +17,7 @@
#include "X86CallingConv.h"
#include "X86FrameLowering.h"
#include "X86InstrBuilder.h"
+#include "X86IntrinsicsInfo.h"
#include "X86MachineFunctionInfo.h"
#include "X86ShuffleDecodeConstantPool.h"
#include "X86TargetMachine.h"
@@ -53,10 +54,10 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetOptions.h"
-#include "X86IntrinsicsInfo.h"
+#include <algorithm>
#include <bitset>
-#include <numeric>
#include <cctype>
+#include <numeric>
using namespace llvm;
#define DEBUG_TYPE "x86-isel"
@@ -96,15 +97,16 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
- // Bypass expensive divides on Atom when compiling with O2.
+ // Bypass expensive divides and use cheaper ones.
if (TM.getOptLevel() >= CodeGenOpt::Default) {
if (Subtarget.hasSlowDivide32())
addBypassSlowDiv(32, 8);
if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
- addBypassSlowDiv(64, 16);
+ addBypassSlowDiv(64, 32);
}
- if (Subtarget.isTargetKnownWindowsMSVC()) {
+ if (Subtarget.isTargetKnownWindowsMSVC() ||
+ Subtarget.isTargetWindowsItanium()) {
// Setup Windows compiler runtime calls.
setLibcallName(RTLIB::SDIV_I64, "_alldiv");
setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
@@ -286,7 +288,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::UDIV, VT, Expand);
setOperationAction(ISD::SREM, VT, Expand);
setOperationAction(ISD::UREM, VT, Expand);
+ }
+ for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
+ if (VT == MVT::i64 && !Subtarget.is64Bit())
+ continue;
// Add/Sub overflow ops with MVT::Glues are lowered to EFLAGS dependences.
setOperationAction(ISD::ADDC, VT, Custom);
setOperationAction(ISD::ADDE, VT, Custom);
@@ -349,7 +355,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
// Special handling for half-precision floating point conversions.
// If we don't have F16C support, then lower half float conversions
// into library calls.
- if (Subtarget.useSoftFloat() || !Subtarget.hasF16C()) {
+ if (Subtarget.useSoftFloat() ||
+ (!Subtarget.hasF16C() && !Subtarget.hasAVX512())) {
setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
}
@@ -484,8 +491,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
if (!Subtarget.useSoftFloat() && X86ScalarSSEf64) {
// f32 and f64 use SSE.
// Set up the FP register classes.
- addRegisterClass(MVT::f32, &X86::FR32RegClass);
- addRegisterClass(MVT::f64, &X86::FR64RegClass);
+ addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
+ : &X86::FR32RegClass);
+ addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass
+ : &X86::FR64RegClass);
for (auto VT : { MVT::f32, MVT::f64 }) {
// Use ANDPD to simulate FABS.
@@ -514,7 +523,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
} else if (UseX87 && X86ScalarSSEf32) {
// Use SSE for f32, x87 for f64.
// Set up the FP register classes.
- addRegisterClass(MVT::f32, &X86::FR32RegClass);
+ addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
+ : &X86::FR32RegClass);
addRegisterClass(MVT::f64, &X86::RFP64RegClass);
// Use ANDPS to simulate FABS.
@@ -590,14 +600,14 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::UNDEF, MVT::f80, Expand);
setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
{
- APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended);
+ APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended());
addLegalFPImmediate(TmpFlt); // FLD0
TmpFlt.changeSign();
addLegalFPImmediate(TmpFlt); // FLD0/FCHS
bool ignored;
APFloat TmpFlt2(+1.0);
- TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
+ TmpFlt2.convert(APFloat::x87DoubleExtended(), APFloat::rmNearestTiesToEven,
&ignored);
addLegalFPImmediate(TmpFlt2); // FLD1
TmpFlt2.changeSign();
@@ -717,10 +727,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
}
if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
- addRegisterClass(MVT::v4f32, &X86::VR128RegClass);
+ addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass
+ : &X86::VR128RegClass);
setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
setOperationAction(ISD::FABS, MVT::v4f32, Custom);
+ setOperationAction(ISD::FCOPYSIGN, MVT::v4f32, Custom);
setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
setOperationAction(ISD::VSELECT, MVT::v4f32, Custom);
@@ -730,14 +742,19 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
}
if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
- addRegisterClass(MVT::v2f64, &X86::VR128RegClass);
+ addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass
+ : &X86::VR128RegClass);
// FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
// registers cannot be used even for integer operations.
- addRegisterClass(MVT::v16i8, &X86::VR128RegClass);
- addRegisterClass(MVT::v8i16, &X86::VR128RegClass);
- addRegisterClass(MVT::v4i32, &X86::VR128RegClass);
- addRegisterClass(MVT::v2i64, &X86::VR128RegClass);
+ addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass
+ : &X86::VR128RegClass);
+ addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass
+ : &X86::VR128RegClass);
+ addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass
+ : &X86::VR128RegClass);
+ addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
+ : &X86::VR128RegClass);
setOperationAction(ISD::MUL, MVT::v16i8, Custom);
setOperationAction(ISD::MUL, MVT::v4i32, Custom);
@@ -751,6 +768,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::MUL, MVT::v8i16, Legal);
setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
setOperationAction(ISD::FABS, MVT::v2f64, Custom);
+ setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Custom);
setOperationAction(ISD::SMAX, MVT::v8i16, Legal);
setOperationAction(ISD::UMAX, MVT::v16i8, Legal);
@@ -776,7 +794,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::CTTZ, MVT::v16i8, Custom);
setOperationAction(ISD::CTTZ, MVT::v8i16, Custom);
setOperationAction(ISD::CTTZ, MVT::v4i32, Custom);
- // ISD::CTTZ v2i64 - scalarization is faster.
+ setOperationAction(ISD::CTTZ, MVT::v2i64, Custom);
// Custom lower build_vector, vector_shuffle, and extract_vector_elt.
for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
@@ -828,16 +846,17 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::SELECT, MVT::v2i64, Custom);
setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
- setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
+ setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom);
+ setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::v4i8, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom);
- // As there is no 64-bit GPR available, we need build a special custom
- // sequence to convert from v2i32 to v2f32.
- if (!Subtarget.is64Bit())
- setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom);
+
+ // Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion.
+ setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom);
setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom);
@@ -872,8 +891,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::BITREVERSE, MVT::v16i8, Custom);
setOperationAction(ISD::CTLZ, MVT::v16i8, Custom);
setOperationAction(ISD::CTLZ, MVT::v8i16, Custom);
- // ISD::CTLZ v4i32 - scalarization is faster.
- // ISD::CTLZ v2i64 - scalarization is faster.
+ setOperationAction(ISD::CTLZ, MVT::v4i32, Custom);
+ setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
}
if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
@@ -946,12 +965,18 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
if (!Subtarget.useSoftFloat() && Subtarget.hasFp256()) {
bool HasInt256 = Subtarget.hasInt256();
- addRegisterClass(MVT::v32i8, &X86::VR256RegClass);
- addRegisterClass(MVT::v16i16, &X86::VR256RegClass);
- addRegisterClass(MVT::v8i32, &X86::VR256RegClass);
- addRegisterClass(MVT::v8f32, &X86::VR256RegClass);
- addRegisterClass(MVT::v4i64, &X86::VR256RegClass);
- addRegisterClass(MVT::v4f64, &X86::VR256RegClass);
+ addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass
+ : &X86::VR256RegClass);
+ addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass
+ : &X86::VR256RegClass);
+ addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass
+ : &X86::VR256RegClass);
+ addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass
+ : &X86::VR256RegClass);
+ addRegisterClass(MVT::v4i64, Subtarget.hasVLX() ? &X86::VR256XRegClass
+ : &X86::VR256RegClass);
+ addRegisterClass(MVT::v4f64, Subtarget.hasVLX() ? &X86::VR256XRegClass
+ : &X86::VR256RegClass);
for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
setOperationAction(ISD::FFLOOR, VT, Legal);
@@ -961,6 +986,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FNEARBYINT, VT, Legal);
setOperationAction(ISD::FNEG, VT, Custom);
setOperationAction(ISD::FABS, VT, Custom);
+ setOperationAction(ISD::FCOPYSIGN, VT, Custom);
}
// (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
@@ -1011,16 +1037,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
setOperationAction(ISD::CTPOP, VT, Custom);
setOperationAction(ISD::CTTZ, VT, Custom);
- }
-
- // ISD::CTLZ v8i32/v4i64 - scalarization is faster without AVX2
- // as we end up splitting the 256-bit vectors.
- for (auto VT : { MVT::v32i8, MVT::v16i16 })
setOperationAction(ISD::CTLZ, VT, Custom);
-
- if (HasInt256)
- for (auto VT : { MVT::v8i32, MVT::v4i64 })
- setOperationAction(ISD::CTLZ, VT, Custom);
+ }
if (Subtarget.hasAnyFMA()) {
for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
@@ -1171,12 +1189,14 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FNEG, VT, Custom);
setOperationAction(ISD::FABS, VT, Custom);
setOperationAction(ISD::FMA, VT, Legal);
+ setOperationAction(ISD::FCOPYSIGN, VT, Custom);
}
setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal);
setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal);
setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal);
setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal);
+ setOperationAction(ISD::FP_TO_UINT, MVT::v2i32, Custom);
setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal);
setOperationAction(ISD::SINT_TO_FP, MVT::v8i1, Custom);
setOperationAction(ISD::SINT_TO_FP, MVT::v16i1, Custom);
@@ -1216,10 +1236,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
} else {
- setOperationAction(ISD::MLOAD, MVT::v8i32, Custom);
- setOperationAction(ISD::MLOAD, MVT::v8f32, Custom);
- setOperationAction(ISD::MSTORE, MVT::v8i32, Custom);
- setOperationAction(ISD::MSTORE, MVT::v8f32, Custom);
+ for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
+ MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
+ setOperationAction(ISD::MLOAD, VT, Custom);
+ setOperationAction(ISD::MSTORE, VT, Custom);
+ }
}
setOperationAction(ISD::TRUNCATE, MVT::i1, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
@@ -1230,18 +1251,23 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::VSELECT, MVT::v16i1, Expand);
if (Subtarget.hasDQI()) {
setOperationAction(ISD::SINT_TO_FP, MVT::v8i64, Legal);
+ setOperationAction(ISD::SINT_TO_FP, MVT::v4i64, Legal);
+ setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Legal);
setOperationAction(ISD::UINT_TO_FP, MVT::v8i64, Legal);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v4i64, Legal);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Legal);
setOperationAction(ISD::FP_TO_SINT, MVT::v8i64, Legal);
+ setOperationAction(ISD::FP_TO_SINT, MVT::v4i64, Legal);
+ setOperationAction(ISD::FP_TO_SINT, MVT::v2i64, Legal);
setOperationAction(ISD::FP_TO_UINT, MVT::v8i64, Legal);
+ setOperationAction(ISD::FP_TO_UINT, MVT::v4i64, Legal);
+ setOperationAction(ISD::FP_TO_UINT, MVT::v2i64, Legal);
+
if (Subtarget.hasVLX()) {
- setOperationAction(ISD::SINT_TO_FP, MVT::v4i64, Legal);
- setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Legal);
- setOperationAction(ISD::UINT_TO_FP, MVT::v4i64, Legal);
- setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Legal);
- setOperationAction(ISD::FP_TO_SINT, MVT::v4i64, Legal);
- setOperationAction(ISD::FP_TO_SINT, MVT::v2i64, Legal);
- setOperationAction(ISD::FP_TO_UINT, MVT::v4i64, Legal);
- setOperationAction(ISD::FP_TO_UINT, MVT::v2i64, Legal);
+ // Fast v2f32 SINT_TO_FP( v2i32 ) custom conversion.
+ setOperationAction(ISD::SINT_TO_FP, MVT::v2f32, Custom);
+ setOperationAction(ISD::FP_TO_SINT, MVT::v2f32, Custom);
+ setOperationAction(ISD::FP_TO_UINT, MVT::v2f32, Custom);
}
}
if (Subtarget.hasVLX()) {
@@ -1250,11 +1276,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);
setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal);
setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
- setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal);
setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal);
setOperationAction(ISD::ZERO_EXTEND, MVT::v4i32, Custom);
setOperationAction(ISD::ZERO_EXTEND, MVT::v2i64, Custom);
+ setOperationAction(ISD::SIGN_EXTEND, MVT::v4i32, Custom);
+ setOperationAction(ISD::SIGN_EXTEND, MVT::v2i64, Custom);
// FIXME: These commands are available on SSE/AVX2; add relevant patterns.
setLoadExtAction(ISD::EXTLOAD, MVT::v8i32, MVT::v8i8, Legal);
@@ -1281,10 +1308,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::SIGN_EXTEND, MVT::v16i8, Custom);
setOperationAction(ISD::SIGN_EXTEND, MVT::v8i16, Custom);
setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom);
- if (Subtarget.hasDQI()) {
- setOperationAction(ISD::SIGN_EXTEND, MVT::v4i32, Custom);
- setOperationAction(ISD::SIGN_EXTEND, MVT::v2i64, Custom);
- }
+
for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
setOperationAction(ISD::FFLOOR, VT, Legal);
setOperationAction(ISD::FCEIL, VT, Legal);
@@ -1293,6 +1317,13 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FNEARBYINT, VT, Legal);
}
+ setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i64, Custom);
+ setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v16i32, Custom);
+
+ // Without BWI we need to use custom lowering to handle MVT::v64i8 input.
+ setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v64i8, Custom);
+ setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, MVT::v64i8, Custom);
+
setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f64, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i64, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f32, Custom);
@@ -1339,13 +1370,17 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::SRL, VT, Custom);
setOperationAction(ISD::SHL, VT, Custom);
setOperationAction(ISD::SRA, VT, Custom);
- setOperationAction(ISD::AND, VT, Legal);
- setOperationAction(ISD::OR, VT, Legal);
- setOperationAction(ISD::XOR, VT, Legal);
setOperationAction(ISD::CTPOP, VT, Custom);
setOperationAction(ISD::CTTZ, VT, Custom);
}
+ // Need to promote to 64-bit even though we have 32-bit masked instructions
+ // because the IR optimizers rearrange bitcasts around logic ops leaving
+ // too many variations to handle if we don't promote them.
+ setOperationPromotedToType(ISD::AND, MVT::v16i32, MVT::v8i64);
+ setOperationPromotedToType(ISD::OR, MVT::v16i32, MVT::v8i64);
+ setOperationPromotedToType(ISD::XOR, MVT::v16i32, MVT::v8i64);
+
if (Subtarget.hasCDI()) {
setOperationAction(ISD::CTLZ, MVT::v8i64, Legal);
setOperationAction(ISD::CTLZ, MVT::v16i32, Legal);
@@ -1377,12 +1412,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
} // Subtarget.hasCDI()
if (Subtarget.hasDQI()) {
- if (Subtarget.hasVLX()) {
- setOperationAction(ISD::MUL, MVT::v2i64, Legal);
- setOperationAction(ISD::MUL, MVT::v4i64, Legal);
- }
+ // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
+ setOperationAction(ISD::MUL, MVT::v2i64, Legal);
+ setOperationAction(ISD::MUL, MVT::v4i64, Legal);
setOperationAction(ISD::MUL, MVT::v8i64, Legal);
}
+
// Custom lower several nodes.
for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
@@ -1413,6 +1448,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::MSCATTER, VT, Custom);
}
for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32 }) {
+ setOperationPromotedToType(ISD::LOAD, VT, MVT::v8i64);
setOperationPromotedToType(ISD::SELECT, VT, MVT::v8i64);
}
}// has AVX-512
@@ -1447,6 +1483,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i8, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i16, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i8, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i1, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i1, Custom);
setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32i16, Custom);
setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v64i8, Custom);
setOperationAction(ISD::SELECT, MVT::v32i1, Custom);
@@ -1486,10 +1524,13 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::UMIN, MVT::v64i8, Legal);
setOperationAction(ISD::UMIN, MVT::v32i16, Legal);
+ setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v32i16, Custom);
+
setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);
- setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal);
- if (Subtarget.hasVLX())
+ if (Subtarget.hasVLX()) {
+ setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal);
setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
+ }
LegalizeAction Action = Subtarget.hasVLX() ? Legal : Custom;
for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) {
@@ -1532,35 +1573,25 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
addRegisterClass(MVT::v4i1, &X86::VK4RegClass);
addRegisterClass(MVT::v2i1, &X86::VK2RegClass);
- setOperationAction(ISD::ADD, MVT::v2i1, Expand);
- setOperationAction(ISD::ADD, MVT::v4i1, Expand);
- setOperationAction(ISD::SUB, MVT::v2i1, Expand);
- setOperationAction(ISD::SUB, MVT::v4i1, Expand);
- setOperationAction(ISD::MUL, MVT::v2i1, Expand);
- setOperationAction(ISD::MUL, MVT::v4i1, Expand);
-
- setOperationAction(ISD::TRUNCATE, MVT::v2i1, Custom);
- setOperationAction(ISD::TRUNCATE, MVT::v4i1, Custom);
- setOperationAction(ISD::SETCC, MVT::v4i1, Custom);
- setOperationAction(ISD::SETCC, MVT::v2i1, Custom);
- setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i1, Custom);
+ for (auto VT : { MVT::v2i1, MVT::v4i1 }) {
+ setOperationAction(ISD::ADD, VT, Expand);
+ setOperationAction(ISD::SUB, VT, Expand);
+ setOperationAction(ISD::MUL, VT, Expand);
+ setOperationAction(ISD::VSELECT, VT, Expand);
+
+ setOperationAction(ISD::TRUNCATE, VT, Custom);
+ setOperationAction(ISD::SETCC, VT, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
+ setOperationAction(ISD::SELECT, VT, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
+ }
+
setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i1, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i1, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8i1, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4i1, Custom);
- setOperationAction(ISD::SELECT, MVT::v4i1, Custom);
- setOperationAction(ISD::SELECT, MVT::v2i1, Custom);
- setOperationAction(ISD::BUILD_VECTOR, MVT::v4i1, Custom);
- setOperationAction(ISD::BUILD_VECTOR, MVT::v2i1, Custom);
- setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i1, Custom);
- setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i1, Custom);
- setOperationAction(ISD::VSELECT, MVT::v2i1, Expand);
- setOperationAction(ISD::VSELECT, MVT::v4i1, Expand);
-
- for (auto VT : { MVT::v4i32, MVT::v8i32 }) {
- setOperationAction(ISD::AND, VT, Legal);
- setOperationAction(ISD::OR, VT, Legal);
- setOperationAction(ISD::XOR, VT, Legal);
- }
for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
setOperationAction(ISD::SMAX, VT, Legal);
@@ -1629,7 +1660,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
// is. We should promote the value to 64-bits to solve this.
// This is what the CRT headers do - `fmodf` is an inline header
// function casting to f64 and calling `fmod`.
- if (Subtarget.is32Bit() && Subtarget.isTargetKnownWindowsMSVC())
+ if (Subtarget.is32Bit() && (Subtarget.isTargetKnownWindowsMSVC() ||
+ Subtarget.isTargetWindowsItanium()))
for (ISD::NodeType Op :
{ISD::FCEIL, ISD::FCOS, ISD::FEXP, ISD::FFLOOR, ISD::FREM, ISD::FLOG,
ISD::FLOG10, ISD::FPOW, ISD::FSIN})
@@ -1953,9 +1985,11 @@ X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
case MVT::f32: case MVT::f64:
case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
case MVT::v4f32: case MVT::v2f64:
- case MVT::v32i8: case MVT::v8i32: case MVT::v4i64: case MVT::v8f32:
- case MVT::v4f64:
- RRC = &X86::VR128RegClass;
+ case MVT::v32i8: case MVT::v16i16: case MVT::v8i32: case MVT::v4i64:
+ case MVT::v8f32: case MVT::v4f64:
+ case MVT::v64i8: case MVT::v32i16: case MVT::v16i32: case MVT::v8i64:
+ case MVT::v16f32: case MVT::v8f64:
+ RRC = &X86::VR128XRegClass;
break;
}
return std::make_pair(RRC, Cost);
@@ -2019,6 +2053,9 @@ Value *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const {
}
Value *X86TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const {
+ if (Subtarget.getTargetTriple().isOSContiki())
+ return getDefaultSafeStackPointerLocation(IRB, false);
+
if (!Subtarget.isTargetAndroid())
return TargetLowering::getSafeStackPointerLocation(IRB);
@@ -2062,6 +2099,58 @@ const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
return ScratchRegs;
}
+/// Lowers mask values (v*i1) to their local register values
+/// \returns DAG node after lowering to register type
+static SDValue lowerMasksToReg(const SDValue &ValArg, const EVT &ValLoc,
+ const SDLoc &Dl, SelectionDAG &DAG) {
+ EVT ValVT = ValArg.getValueType();
+
+ if ((ValVT == MVT::v8i1 && (ValLoc == MVT::i8 || ValLoc == MVT::i32)) ||
+ (ValVT == MVT::v16i1 && (ValLoc == MVT::i16 || ValLoc == MVT::i32))) {
+ // Two stage lowering might be required
+ // bitcast: v8i1 -> i8 / v16i1 -> i16
+ // anyextend: i8 -> i32 / i16 -> i32
+ EVT TempValLoc = ValVT == MVT::v8i1 ? MVT::i8 : MVT::i16;
+ SDValue ValToCopy = DAG.getBitcast(TempValLoc, ValArg);
+ if (ValLoc == MVT::i32)
+ ValToCopy = DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValToCopy);
+ return ValToCopy;
+ } else if ((ValVT == MVT::v32i1 && ValLoc == MVT::i32) ||
+ (ValVT == MVT::v64i1 && ValLoc == MVT::i64)) {
+ // One stage lowering is required
+ // bitcast: v32i1 -> i32 / v64i1 -> i64
+ return DAG.getBitcast(ValLoc, ValArg);
+ } else
+ return DAG.getNode(ISD::SIGN_EXTEND, Dl, ValLoc, ValArg);
+}
+
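At the value level the v8i1/v16i1 path above is just a pack-then-widen: the mask lanes become the bits of a small integer, which is then extended to the 32-bit register location. A minimal sketch of the v8i1 to i8 to i32 case (plain C++, not LLVM API):

    #include <array>
    #include <cstdint>

    uint32_t lowerV8i1ToI32(const std::array<bool, 8> &Mask) {
      uint8_t Packed = 0;                           // bitcast: v8i1 -> i8
      for (unsigned I = 0; I < 8; ++I)
        Packed |= static_cast<uint8_t>(Mask[I]) << I;
      return Packed;                                // anyextend: i8 -> i32
    }

The DAG's ANY_EXTEND leaves the upper bits unspecified; zero-filling them here is simply one legal outcome.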
+/// Breaks a v64i1 value into two registers and adds the new nodes to the DAG
+static void Passv64i1ArgInRegs(
+ const SDLoc &Dl, SelectionDAG &DAG, SDValue Chain, SDValue &Arg,
+ SmallVector<std::pair<unsigned, SDValue>, 8> &RegsToPass, CCValAssign &VA,
+ CCValAssign &NextVA, const X86Subtarget &Subtarget) {
+ assert((Subtarget.hasBWI() || Subtarget.hasBMI()) &&
+ "Expected AVX512BW or AVX512BMI target!");
+ assert(Subtarget.is32Bit() && "Expecting 32 bit target");
+ assert(Arg.getValueType() == MVT::i64 && "Expecting 64 bit value");
+ assert(VA.isRegLoc() && NextVA.isRegLoc() &&
+ "The value should reside in two registers");
+
+ // Before splitting the value we cast it to i64
+ Arg = DAG.getBitcast(MVT::i64, Arg);
+
+ // Splitting the value into two i32 types
+ SDValue Lo, Hi;
+ Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
+ DAG.getConstant(0, Dl, MVT::i32));
+ Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
+ DAG.getConstant(1, Dl, MVT::i32));
+
+ // Attach the two i32 types into corresponding registers
+ RegsToPass.push_back(std::make_pair(VA.getLocReg(), Lo));
+ RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), Hi));
+}
+
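A scalar picture of the split just built, assuming the usual lo/hi ordering of EXTRACT_ELEMENT 0 and 1: the v64i1 argument, viewed as an i64, is cut into the two i32 halves that are attached to VA's and NextVA's registers.

    #include <cstdint>
    #include <utility>

    std::pair<uint32_t, uint32_t> splitI64(uint64_t MaskAsI64) {
      uint32_t Lo = static_cast<uint32_t>(MaskAsI64);       // element 0 -> VA's register
      uint32_t Hi = static_cast<uint32_t>(MaskAsI64 >> 32); // element 1 -> NextVA's register
      return {Lo, Hi};
    }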
SDValue
X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
bool isVarArg,
@@ -2086,10 +2175,11 @@ X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
MVT::i32));
// Copy the result values into the output registers.
- for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
- CCValAssign &VA = RVLocs[i];
+ for (unsigned I = 0, OutsIndex = 0, E = RVLocs.size(); I != E;
+ ++I, ++OutsIndex) {
+ CCValAssign &VA = RVLocs[I];
assert(VA.isRegLoc() && "Can only return in registers!");
- SDValue ValToCopy = OutVals[i];
+ SDValue ValToCopy = OutVals[OutsIndex];
EVT ValVT = ValToCopy.getValueType();
// Promote values to the appropriate types.
@@ -2099,7 +2189,7 @@ X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
else if (VA.getLocInfo() == CCValAssign::AExt) {
if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1)
- ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
+ ValToCopy = lowerMasksToReg(ValToCopy, VA.getLocVT(), dl, DAG);
else
ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
}
@@ -2152,9 +2242,27 @@ X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
}
}
- Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag);
- Flag = Chain.getValue(1);
- RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
+ SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
+
+ if (VA.needsCustom()) {
+ assert(VA.getValVT() == MVT::v64i1 &&
+ "Currently the only custom case is when we split v64i1 to 2 regs");
+
+ Passv64i1ArgInRegs(dl, DAG, Chain, ValToCopy, RegsToPass, VA, RVLocs[++I],
+ Subtarget);
+
+ assert(2 == RegsToPass.size() &&
+ "Expecting two registers after Pass64BitArgInRegs");
+ } else {
+ RegsToPass.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
+ }
+
+ // Add nodes to the DAG and add the values into the RetOps list
+ for (auto &Reg : RegsToPass) {
+ Chain = DAG.getCopyToReg(Chain, dl, Reg.first, Reg.second, Flag);
+ Flag = Chain.getValue(1);
+ RetOps.push_back(DAG.getRegister(Reg.first, Reg.second.getValueType()));
+ }
}
// Swift calling convention does not require we copy the sret argument
@@ -2282,6 +2390,98 @@ EVT X86TargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
return VT.bitsLT(MinVT) ? MinVT : VT;
}
+/// Reads two 32 bit registers and creates a 64 bit mask value.
+/// \param VA The current 32 bit value that needs to be assigned.
+/// \param NextVA The next 32 bit value that needs to be assigned.
+/// \param Root The parent DAG node.
+/// \param [in,out] InFlag Represents an SDValue in the parent DAG node for
+/// glue purposes. In case the DAG is already using a
+/// physical register instead of a virtual one, we should glue
+/// our new SDValue to the InFlag SDValue.
+/// \return a new SDValue of 64 bit size.
+static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA,
+ SDValue &Root, SelectionDAG &DAG,
+ const SDLoc &Dl, const X86Subtarget &Subtarget,
+ SDValue *InFlag = nullptr) {
+ assert((Subtarget.hasBWI()) && "Expected AVX512BW target!");
+ assert(Subtarget.is32Bit() && "Expecting 32 bit target");
+ assert(VA.getValVT() == MVT::v64i1 &&
+ "Expecting first location of 64 bit width type");
+ assert(NextVA.getValVT() == VA.getValVT() &&
+ "The locations should have the same type");
+ assert(VA.isRegLoc() && NextVA.isRegLoc() &&
+ "The values should reside in two registers");
+
+ SDValue Lo, Hi;
+ unsigned Reg;
+ SDValue ArgValueLo, ArgValueHi;
+
+ MachineFunction &MF = DAG.getMachineFunction();
+ const TargetRegisterClass *RC = &X86::GR32RegClass;
+
+ // Read a 32 bit value from the registers
+ if (nullptr == InFlag) {
+ // When no physical register is present,
+ // create an intermediate virtual register
+ Reg = MF.addLiveIn(VA.getLocReg(), RC);
+ ArgValueLo = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
+ Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
+ ArgValueHi = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
+ } else {
+ // When a physical register is available read the value from it and glue
+ // the reads together.
+ ArgValueLo =
+ DAG.getCopyFromReg(Root, Dl, VA.getLocReg(), MVT::i32, *InFlag);
+ *InFlag = ArgValueLo.getValue(2);
+ ArgValueHi =
+ DAG.getCopyFromReg(Root, Dl, NextVA.getLocReg(), MVT::i32, *InFlag);
+ *InFlag = ArgValueHi.getValue(2);
+ }
+
+ // Convert the i32 type into v32i1 type
+ Lo = DAG.getBitcast(MVT::v32i1, ArgValueLo);
+
+ // Convert the i32 type into v32i1 type
+ Hi = DAG.getBitcast(MVT::v32i1, ArgValueHi);
+
+ // Concatenate the two values together
+ return DAG.getNode(ISD::CONCAT_VECTORS, Dl, MVT::v64i1, Lo, Hi);
+}
+
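getv64i1Argument is the receiving end of that split: the two i32 register halves are bitcast to v32i1 and concatenated, low half first, to rebuild the v64i1 value. At the value level this is the inverse of the splitI64 sketch above:

    #include <cstdint>

    // Lo comes from VA's register, Hi from NextVA's register.
    uint64_t rebuildI64(uint32_t Lo, uint32_t Hi) {
      return static_cast<uint64_t>(Lo) | (static_cast<uint64_t>(Hi) << 32);
    }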
+/// The function will lower a register of various sizes (8/16/32/64)
+/// to a mask value of the expected size (v8i1/v16i1/v32i1/v64i1)
+/// \returns a DAG node containing the operand after lowering to mask type.
+static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT,
+ const EVT &ValLoc, const SDLoc &Dl,
+ SelectionDAG &DAG) {
+ SDValue ValReturned = ValArg;
+
+ if (ValVT == MVT::v64i1) {
+ // On a 32 bit machine, this case is handled by getv64i1Argument.
+ assert(ValLoc == MVT::i64 && "Expecting only i64 locations");
+ // On a 64 bit machine, there is no need to truncate the value; only bitcast it.
+ } else {
+ MVT maskLen;
+ switch (ValVT.getSimpleVT().SimpleTy) {
+ case MVT::v8i1:
+ maskLen = MVT::i8;
+ break;
+ case MVT::v16i1:
+ maskLen = MVT::i16;
+ break;
+ case MVT::v32i1:
+ maskLen = MVT::i32;
+ break;
+ default:
+ llvm_unreachable("Expecting a vector of i1 types");
+ }
+
+ ValReturned = DAG.getNode(ISD::TRUNCATE, Dl, maskLen, ValReturned);
+ }
+
+ return DAG.getBitcast(ValVT, ValReturned);
+}
+
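lowerRegToMasks goes the other way for call results and arguments: the integer register value is truncated to the mask's bit width (except for v64i1, which only needs the bitcast) and the bits are read back as lanes. A minimal value-level sketch for the v16i1 case:

    #include <bitset>
    #include <cstdint>

    std::bitset<16> regToV16i1(uint64_t RegValue) {
      uint16_t Truncated = static_cast<uint16_t>(RegValue); // ISD::TRUNCATE to i16
      return std::bitset<16>(Truncated);                    // bitcast i16 -> v16i1
    }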
/// Lower the result values of a call into the
/// appropriate copies out of appropriate physical registers.
///
@@ -2298,13 +2498,14 @@ SDValue X86TargetLowering::LowerCallResult(
CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
// Copy all of the result registers out of their specified physreg.
- for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
- CCValAssign &VA = RVLocs[i];
+ for (unsigned I = 0, InsIndex = 0, E = RVLocs.size(); I != E;
+ ++I, ++InsIndex) {
+ CCValAssign &VA = RVLocs[I];
EVT CopyVT = VA.getLocVT();
// If this is x86-64, and we disabled SSE, we can't return FP values
if ((CopyVT == MVT::f32 || CopyVT == MVT::f64 || CopyVT == MVT::f128) &&
- ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget.hasSSE1())) {
+ ((Is64Bit || Ins[InsIndex].Flags.isInReg()) && !Subtarget.hasSSE1())) {
report_fatal_error("SSE register return with SSE disabled");
}
@@ -2319,19 +2520,34 @@ SDValue X86TargetLowering::LowerCallResult(
RoundAfterCopy = (CopyVT != VA.getLocVT());
}
- Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
- CopyVT, InFlag).getValue(1);
- SDValue Val = Chain.getValue(0);
+ SDValue Val;
+ if (VA.needsCustom()) {
+ assert(VA.getValVT() == MVT::v64i1 &&
+ "Currently the only custom case is when we split v64i1 to 2 regs");
+ Val =
+ getv64i1Argument(VA, RVLocs[++I], Chain, DAG, dl, Subtarget, &InFlag);
+ } else {
+ Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), CopyVT, InFlag)
+ .getValue(1);
+ Val = Chain.getValue(0);
+ InFlag = Chain.getValue(2);
+ }
if (RoundAfterCopy)
Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
// This truncation won't change the value.
DAG.getIntPtrConstant(1, dl));
- if (VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1)
- Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
+ if (VA.isExtInLoc() && (VA.getValVT().getScalarType() == MVT::i1)) {
+ if (VA.getValVT().isVector() &&
+ ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
+ (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
+ // promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
+ Val = lowerRegToMasks(Val, VA.getValVT(), VA.getLocVT(), dl, DAG);
+ } else
+ Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
+ }
- InFlag = Chain.getValue(2);
InVals.push_back(Val);
}
@@ -2399,7 +2615,8 @@ static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
/// Return true if the calling convention is one that we can guarantee TCO for.
static bool canGuaranteeTCO(CallingConv::ID CC) {
return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
- CC == CallingConv::HiPE || CC == CallingConv::HHVM);
+ CC == CallingConv::X86_RegCall || CC == CallingConv::HiPE ||
+ CC == CallingConv::HHVM);
}
/// Return true if we might ever do TCO for calls with this calling convention.
@@ -2445,7 +2662,7 @@ X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
const SmallVectorImpl<ISD::InputArg> &Ins,
const SDLoc &dl, SelectionDAG &DAG,
const CCValAssign &VA,
- MachineFrameInfo *MFI, unsigned i) const {
+ MachineFrameInfo &MFI, unsigned i) const {
// Create the nodes corresponding to a load from this parameter slot.
ISD::ArgFlagsTy Flags = Ins[i].Flags;
bool AlwaysUseMutable = shouldGuaranteeTCO(
@@ -2454,9 +2671,11 @@ X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
EVT ValVT;
// If value is passed by pointer we have address passed instead of the value
- // itself.
- bool ExtendedInMem = VA.isExtInLoc() &&
- VA.getValVT().getScalarType() == MVT::i1;
+ // itself. No need to extend if the mask value and location share the same
+ // absolute size.
+ bool ExtendedInMem =
+ VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1 &&
+ VA.getValVT().getSizeInBits() != VA.getLocVT().getSizeInBits();
if (VA.getLocInfo() == CCValAssign::Indirect || ExtendedInMem)
ValVT = VA.getLocVT();
@@ -2483,26 +2702,26 @@ X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
if (Flags.isByVal()) {
unsigned Bytes = Flags.getByValSize();
if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
- int FI = MFI->CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable);
+ int FI = MFI.CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable);
// Adjust SP offset of interrupt parameter.
if (CallConv == CallingConv::X86_INTR) {
- MFI->setObjectOffset(FI, Offset);
+ MFI.setObjectOffset(FI, Offset);
}
return DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
} else {
- int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8,
- VA.getLocMemOffset(), isImmutable);
+ int FI = MFI.CreateFixedObject(ValVT.getSizeInBits()/8,
+ VA.getLocMemOffset(), isImmutable);
// Set SExt or ZExt flag.
if (VA.getLocInfo() == CCValAssign::ZExt) {
- MFI->setObjectZExt(FI, true);
+ MFI.setObjectZExt(FI, true);
} else if (VA.getLocInfo() == CCValAssign::SExt) {
- MFI->setObjectSExt(FI, true);
+ MFI.setObjectSExt(FI, true);
}
// Adjust SP offset of interrupt parameter.
if (CallConv == CallingConv::X86_INTR) {
- MFI->setObjectOffset(FI, Offset);
+ MFI.setObjectOffset(FI, Offset);
}
SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
@@ -2562,6 +2781,13 @@ static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
}
+static bool isSortedByValueNo(const SmallVectorImpl<CCValAssign> &ArgLocs) {
+ return std::is_sorted(ArgLocs.begin(), ArgLocs.end(),
+ [](const CCValAssign &A, const CCValAssign &B) -> bool {
+ return A.getValNo() < B.getValNo();
+ });
+}
+
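isSortedByValueNo is a plain std::is_sorted over the value numbers; the argument loop below walks ArgLocs and Ins in lockstep, so the order is checked up front. The same idiom in a self-contained form, with Loc standing in for CCValAssign:

    #include <algorithm>
    #include <vector>

    struct Loc { unsigned ValNo; };

    // Nondecreasing value-number order: location i never refers to an
    // earlier argument than location i-1 does.
    bool sortedByValNo(const std::vector<Loc> &Locs) {
      return std::is_sorted(Locs.begin(), Locs.end(),
                            [](const Loc &A, const Loc &B) {
                              return A.ValNo < B.ValNo;
                            });
    }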
SDValue X86TargetLowering::LowerFormalArguments(
SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
@@ -2576,12 +2802,13 @@ SDValue X86TargetLowering::LowerFormalArguments(
Fn->getName() == "main")
FuncInfo->setForceFramePointer(true);
- MachineFrameInfo *MFI = MF.getFrameInfo();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
bool Is64Bit = Subtarget.is64Bit();
bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
- assert(!(isVarArg && canGuaranteeTCO(CallConv)) &&
- "Var args not supported with calling convention fastcc, ghc or hipe");
+ assert(
+ !(isVarArg && canGuaranteeTCO(CallConv)) &&
+ "Var args not supported with calling conv' regcall, fastcc, ghc or hipe");
if (CallConv == CallingConv::X86_INTR) {
bool isLegal = Ins.size() == 1 ||
@@ -2595,59 +2822,78 @@ SDValue X86TargetLowering::LowerFormalArguments(
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
- // Allocate shadow area for Win64
+ // Allocate shadow area for Win64.
if (IsWin64)
CCInfo.AllocateStack(32, 8);
- CCInfo.AnalyzeFormalArguments(Ins, CC_X86);
+ CCInfo.AnalyzeArguments(Ins, CC_X86);
+
+ // In vectorcall calling convention a second pass is required for the HVA
+ // types.
+ if (CallingConv::X86_VectorCall == CallConv) {
+ CCInfo.AnalyzeArgumentsSecondPass(Ins, CC_X86);
+ }
+
+ // The next loop assumes that the locations are in the same order as the
+ // input arguments.
+ if (!isSortedByValueNo(ArgLocs))
+ llvm_unreachable("Argument Location list must be sorted before lowering");
- unsigned LastVal = ~0U;
SDValue ArgValue;
- for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
- CCValAssign &VA = ArgLocs[i];
- // TODO: If an arg is passed in two places (e.g. reg and stack), skip later
- // places.
- assert(VA.getValNo() != LastVal &&
- "Don't support value assigned to multiple locs yet");
- (void)LastVal;
- LastVal = VA.getValNo();
+ for (unsigned I = 0, InsIndex = 0, E = ArgLocs.size(); I != E;
+ ++I, ++InsIndex) {
+ assert(InsIndex < Ins.size() && "Invalid Ins index");
+ CCValAssign &VA = ArgLocs[I];
if (VA.isRegLoc()) {
EVT RegVT = VA.getLocVT();
- const TargetRegisterClass *RC;
- if (RegVT == MVT::i32)
- RC = &X86::GR32RegClass;
- else if (Is64Bit && RegVT == MVT::i64)
- RC = &X86::GR64RegClass;
- else if (RegVT == MVT::f32)
- RC = &X86::FR32RegClass;
- else if (RegVT == MVT::f64)
- RC = &X86::FR64RegClass;
- else if (RegVT == MVT::f128)
- RC = &X86::FR128RegClass;
- else if (RegVT.is512BitVector())
- RC = &X86::VR512RegClass;
- else if (RegVT.is256BitVector())
- RC = &X86::VR256RegClass;
- else if (RegVT.is128BitVector())
- RC = &X86::VR128RegClass;
- else if (RegVT == MVT::x86mmx)
- RC = &X86::VR64RegClass;
- else if (RegVT == MVT::i1)
- RC = &X86::VK1RegClass;
- else if (RegVT == MVT::v8i1)
- RC = &X86::VK8RegClass;
- else if (RegVT == MVT::v16i1)
- RC = &X86::VK16RegClass;
- else if (RegVT == MVT::v32i1)
- RC = &X86::VK32RegClass;
- else if (RegVT == MVT::v64i1)
- RC = &X86::VK64RegClass;
- else
- llvm_unreachable("Unknown argument type!");
+ if (VA.needsCustom()) {
+ assert(
+ VA.getValVT() == MVT::v64i1 &&
+ "Currently the only custom case is when we split v64i1 to 2 regs");
+
+ // v64i1 values, in the regcall calling convention, that are
+ // compiled for a 32 bit arch, are split up into two registers.
+ ArgValue =
+ getv64i1Argument(VA, ArgLocs[++I], Chain, DAG, dl, Subtarget);
+ } else {
+ const TargetRegisterClass *RC;
+ if (RegVT == MVT::i32)
+ RC = &X86::GR32RegClass;
+ else if (Is64Bit && RegVT == MVT::i64)
+ RC = &X86::GR64RegClass;
+ else if (RegVT == MVT::f32)
+ RC = Subtarget.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass;
+ else if (RegVT == MVT::f64)
+ RC = Subtarget.hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass;
+ else if (RegVT == MVT::f80)
+ RC = &X86::RFP80RegClass;
+ else if (RegVT == MVT::f128)
+ RC = &X86::FR128RegClass;
+ else if (RegVT.is512BitVector())
+ RC = &X86::VR512RegClass;
+ else if (RegVT.is256BitVector())
+ RC = Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass;
+ else if (RegVT.is128BitVector())
+ RC = Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass;
+ else if (RegVT == MVT::x86mmx)
+ RC = &X86::VR64RegClass;
+ else if (RegVT == MVT::i1)
+ RC = &X86::VK1RegClass;
+ else if (RegVT == MVT::v8i1)
+ RC = &X86::VK8RegClass;
+ else if (RegVT == MVT::v16i1)
+ RC = &X86::VK16RegClass;
+ else if (RegVT == MVT::v32i1)
+ RC = &X86::VK32RegClass;
+ else if (RegVT == MVT::v64i1)
+ RC = &X86::VK64RegClass;
+ else
+ llvm_unreachable("Unknown argument type!");
- unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
- ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
+ unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
+ ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
+ }
// If this is an 8 or 16-bit value, it is really passed promoted to 32
// bits. Insert an assert[sz]ext to capture this, then truncate to the
@@ -2665,12 +2911,19 @@ SDValue X86TargetLowering::LowerFormalArguments(
// Handle MMX values passed in XMM regs.
if (RegVT.isVector() && VA.getValVT().getScalarType() != MVT::i1)
ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
- else
+ else if (VA.getValVT().isVector() &&
+ VA.getValVT().getScalarType() == MVT::i1 &&
+ ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
+ (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
+ // Promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
+ ArgValue = lowerRegToMasks(ArgValue, VA.getValVT(), RegVT, dl, DAG);
+ } else
ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
}
} else {
assert(VA.isMemLoc());
- ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, i);
+ ArgValue =
+ LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, InsIndex);
}
// If value is passed via pointer - do a load.
@@ -2681,7 +2934,7 @@ SDValue X86TargetLowering::LowerFormalArguments(
InVals.push_back(ArgValue);
}
- for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+ for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
// Swift calling convention does not require we copy the sret argument
// into %rax/%eax for the return. We don't set SRetReturnReg for Swift.
if (CallConv == CallingConv::Swift)
@@ -2691,14 +2944,14 @@ SDValue X86TargetLowering::LowerFormalArguments(
// sret argument into %rax/%eax (depending on ABI) for the return. Save
// the argument into a virtual register so that we can access it from the
// return points.
- if (Ins[i].Flags.isSRet()) {
+ if (Ins[I].Flags.isSRet()) {
unsigned Reg = FuncInfo->getSRetReturnReg();
if (!Reg) {
MVT PtrTy = getPointerTy(DAG.getDataLayout());
Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
FuncInfo->setSRetReturnReg(Reg);
}
- SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[i]);
+ SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[I]);
Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
break;
}
@@ -2713,11 +2966,10 @@ SDValue X86TargetLowering::LowerFormalArguments(
// If the function takes variable number of arguments, make a frame index for
// the start of the first vararg value... for expansion of llvm.va_start. We
// can skip this if there are no va_start calls.
- if (MFI->hasVAStart() &&
+ if (MFI.hasVAStart() &&
(Is64Bit || (CallConv != CallingConv::X86_FastCall &&
CallConv != CallingConv::X86_ThisCall))) {
- FuncInfo->setVarArgsFrameIndex(
- MFI->CreateFixedObject(1, StackSize, true));
+ FuncInfo->setVarArgsFrameIndex(MFI.CreateFixedObject(1, StackSize, true));
}
// Figure out if XMM registers are in use.
@@ -2727,7 +2979,7 @@ SDValue X86TargetLowering::LowerFormalArguments(
// 64-bit calling conventions support varargs and register parameters, so we
// have to do extra work to spill them in the prologue.
- if (Is64Bit && isVarArg && MFI->hasVAStart()) {
+ if (Is64Bit && isVarArg && MFI.hasVAStart()) {
// Find the first unallocated argument registers.
ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
ArrayRef<MCPhysReg> ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget);
@@ -2760,7 +3012,7 @@ SDValue X86TargetLowering::LowerFormalArguments(
// for the return address.
int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
FuncInfo->setRegSaveFrameIndex(
- MFI->CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
+ MFI.CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
// Fixup to set vararg frame on shadow area (4 x i64).
if (NumIntRegs < 4)
FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
@@ -2770,7 +3022,7 @@ SDValue X86TargetLowering::LowerFormalArguments(
// they may be loaded by dereferencing the result of va_next.
FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
- FuncInfo->setRegSaveFrameIndex(MFI->CreateStackObject(
+ FuncInfo->setRegSaveFrameIndex(MFI.CreateStackObject(
ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false));
}
@@ -2810,7 +3062,7 @@ SDValue X86TargetLowering::LowerFormalArguments(
Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
}
- if (isVarArg && MFI->hasMustTailInVarArgFunc()) {
+ if (isVarArg && MFI.hasMustTailInVarArgFunc()) {
// Find the largest legal vector type.
MVT VecVT = MVT::Other;
// FIXME: Only some x86_32 calling conventions support AVX512.
@@ -2889,7 +3141,7 @@ SDValue X86TargetLowering::LowerFormalArguments(
// same, so the size of funclets' (mostly empty) frames is dictated by
// how far this slot is from the bottom (since they allocate just enough
// space to accommodate holding this slot at the correct offset).
- int PSPSymFI = MFI->CreateStackObject(8, 8, /*isSS=*/false);
+ int PSPSymFI = MFI.CreateStackObject(8, 8, /*isSS=*/false);
EHInfo->PSPSymFrameIdx = PSPSymFI;
}
}
@@ -2938,7 +3190,7 @@ static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
if (!FPDiff) return Chain;
// Calculate the new stack slot for the return address.
int NewReturnAddrFI =
- MF.getFrameInfo()->CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
+ MF.getFrameInfo().CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
false);
SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
@@ -3029,11 +3281,17 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
- // Allocate shadow area for Win64
+ // Allocate shadow area for Win64.
if (IsWin64)
CCInfo.AllocateStack(32, 8);
- CCInfo.AnalyzeCallOperands(Outs, CC_X86);
+ CCInfo.AnalyzeArguments(Outs, CC_X86);
+
+  // In the vectorcall calling convention a second pass is required for the
+  // HVA types.
+ if (CallingConv::X86_VectorCall == CallConv) {
+ CCInfo.AnalyzeArgumentsSecondPass(Outs, CC_X86);
+ }
// Get a count of how many bytes are to be pushed on the stack.
unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
@@ -3088,18 +3346,25 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
SmallVector<SDValue, 8> MemOpChains;
SDValue StackPtr;
+  // The next loop assumes that the locations are in the same order as the
+  // input arguments.
+ if (!isSortedByValueNo(ArgLocs))
+ llvm_unreachable("Argument Location list must be sorted before lowering");
+
// Walk the register/memloc assignments, inserting copies/loads. In the case
// of tail call optimization arguments are handle later.
const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
- for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+ for (unsigned I = 0, OutIndex = 0, E = ArgLocs.size(); I != E;
+ ++I, ++OutIndex) {
+ assert(OutIndex < Outs.size() && "Invalid Out index");
// Skip inalloca arguments, they have already been written.
- ISD::ArgFlagsTy Flags = Outs[i].Flags;
+ ISD::ArgFlagsTy Flags = Outs[OutIndex].Flags;
if (Flags.isInAlloca())
continue;
- CCValAssign &VA = ArgLocs[i];
+ CCValAssign &VA = ArgLocs[I];
EVT RegVT = VA.getLocVT();
- SDValue Arg = OutVals[i];
+ SDValue Arg = OutVals[OutIndex];
bool isByVal = Flags.isByVal();
// Promote the value if needed.
@@ -3115,7 +3380,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
case CCValAssign::AExt:
if (Arg.getValueType().isVector() &&
Arg.getValueType().getVectorElementType() == MVT::i1)
- Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
+ Arg = lowerMasksToReg(Arg, RegVT, dl, DAG);
else if (RegVT.is128BitVector()) {
// Special case: passing MMX values in XMM registers.
Arg = DAG.getBitcast(MVT::i64, Arg);
@@ -3139,7 +3404,13 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
}
}
- if (VA.isRegLoc()) {
+ if (VA.needsCustom()) {
+ assert(VA.getValVT() == MVT::v64i1 &&
+ "Currently the only custom case is when we split v64i1 to 2 regs");
+ // Split v64i1 value into two registers
+ Passv64i1ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++I],
+ Subtarget);
+ } else if (VA.isRegLoc()) {
RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
if (isVarArg && IsWin64) {
// Win64 ABI requires argument XMM reg to be copied to the corresponding
@@ -3239,20 +3510,32 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
SmallVector<SDValue, 8> MemOpChains2;
SDValue FIN;
int FI = 0;
- for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
- CCValAssign &VA = ArgLocs[i];
- if (VA.isRegLoc())
+ for (unsigned I = 0, OutsIndex = 0, E = ArgLocs.size(); I != E;
+ ++I, ++OutsIndex) {
+ CCValAssign &VA = ArgLocs[I];
+
+ if (VA.isRegLoc()) {
+ if (VA.needsCustom()) {
+ assert((CallConv == CallingConv::X86_RegCall) &&
+ "Expecting custome case only in regcall calling convention");
+ // This means that we are in special case where one argument was
+ // passed through two register locations - Skip the next location
+ ++I;
+ }
+
continue;
+ }
+
assert(VA.isMemLoc());
- SDValue Arg = OutVals[i];
- ISD::ArgFlagsTy Flags = Outs[i].Flags;
+ SDValue Arg = OutVals[OutsIndex];
+ ISD::ArgFlagsTy Flags = Outs[OutsIndex].Flags;
// Skip inalloca arguments. They don't require any work.
if (Flags.isInAlloca())
continue;
// Create frame index.
int32_t Offset = VA.getLocMemOffset()+FPDiff;
uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
- FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);
+ FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
if (Flags.isByVal()) {
@@ -3391,7 +3674,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// This isn't right, although it's probably harmless on x86; liveouts
// should be computed from returns not tail calls. Consider a void
// function making a tail call to a function returning int.
- MF.getFrameInfo()->setHasTailCall();
+ MF.getFrameInfo().setHasTailCall();
return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
}
@@ -3493,9 +3776,9 @@ X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
/// same position (relatively) of the caller's incoming argument stack.
static
bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
- MachineFrameInfo *MFI, const MachineRegisterInfo *MRI,
+ MachineFrameInfo &MFI, const MachineRegisterInfo *MRI,
const X86InstrInfo *TII, const CCValAssign &VA) {
- unsigned Bytes = Arg.getValueType().getSizeInBits() / 8;
+ unsigned Bytes = Arg.getValueSizeInBits() / 8;
for (;;) {
// Look through nodes that don't alter the bits of the incoming value.
@@ -3558,22 +3841,22 @@ bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
return false;
assert(FI != INT_MAX);
- if (!MFI->isFixedObjectIndex(FI))
+ if (!MFI.isFixedObjectIndex(FI))
return false;
- if (Offset != MFI->getObjectOffset(FI))
+ if (Offset != MFI.getObjectOffset(FI))
return false;
- if (VA.getLocVT().getSizeInBits() > Arg.getValueType().getSizeInBits()) {
+ if (VA.getLocVT().getSizeInBits() > Arg.getValueSizeInBits()) {
// If the argument location is wider than the argument type, check that any
// extension flags match.
- if (Flags.isZExt() != MFI->isObjectZExt(FI) ||
- Flags.isSExt() != MFI->isObjectSExt(FI)) {
+ if (Flags.isZExt() != MFI.isObjectZExt(FI) ||
+ Flags.isSExt() != MFI.isObjectSExt(FI)) {
return false;
}
}
- return Bytes == MFI->getObjectSize(FI);
+ return Bytes == MFI.getObjectSize(FI);
}
/// Check whether the call is eligible for tail call optimization. Targets
@@ -3700,7 +3983,7 @@ bool X86TargetLowering::IsEligibleForTailCallOptimization(
if (CCInfo.getNextStackOffset()) {
// Check if the arguments are already laid out in the right way as
// the caller's fixed stack objects.
- MachineFrameInfo *MFI = MF.getFrameInfo();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
const MachineRegisterInfo *MRI = &MF.getRegInfo();
const X86InstrInfo *TII = Subtarget.getInstrInfo();
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
@@ -3787,6 +4070,14 @@ static bool MayFoldIntoStore(SDValue Op) {
return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
}
+static bool MayFoldIntoZeroExtend(SDValue Op) {
+ if (Op.hasOneUse()) {
+ unsigned Opcode = Op.getNode()->use_begin()->getOpcode();
+ return (ISD::ZERO_EXTEND == Opcode);
+ }
+ return false;
+}
+
static bool isTargetShuffle(unsigned Opcode) {
switch(Opcode) {
default: return false;
@@ -3821,6 +4112,7 @@ static bool isTargetShuffle(unsigned Opcode) {
case X86ISD::VPPERM:
case X86ISD::VPERMV:
case X86ISD::VPERMV3:
+ case X86ISD::VPERMIV3:
case X86ISD::VZEXT_MOVL:
return true;
}
@@ -3829,41 +4121,18 @@ static bool isTargetShuffle(unsigned Opcode) {
static bool isTargetShuffleVariableMask(unsigned Opcode) {
switch (Opcode) {
default: return false;
+ // Target Shuffles.
case X86ISD::PSHUFB:
case X86ISD::VPERMILPV:
+ case X86ISD::VPERMIL2:
+ case X86ISD::VPPERM:
+ case X86ISD::VPERMV:
+ case X86ISD::VPERMV3:
+ case X86ISD::VPERMIV3:
+ return true;
+ // 'Faux' Target Shuffles.
+ case ISD::AND:
return true;
- }
-}
-
-static SDValue getTargetShuffleNode(unsigned Opc, const SDLoc &dl, MVT VT,
- SDValue V1, unsigned TargetMask,
- SelectionDAG &DAG) {
- switch(Opc) {
- default: llvm_unreachable("Unknown x86 shuffle node");
- case X86ISD::PSHUFD:
- case X86ISD::PSHUFHW:
- case X86ISD::PSHUFLW:
- case X86ISD::VPERMILPI:
- case X86ISD::VPERMI:
- return DAG.getNode(Opc, dl, VT, V1,
- DAG.getConstant(TargetMask, dl, MVT::i8));
- }
-}
-
-static SDValue getTargetShuffleNode(unsigned Opc, const SDLoc &dl, MVT VT,
- SDValue V1, SDValue V2, SelectionDAG &DAG) {
- switch(Opc) {
- default: llvm_unreachable("Unknown x86 shuffle node");
- case X86ISD::MOVLHPS:
- case X86ISD::MOVLHPD:
- case X86ISD::MOVHLPS:
- case X86ISD::MOVLPS:
- case X86ISD::MOVLPD:
- case X86ISD::MOVSS:
- case X86ISD::MOVSD:
- case X86ISD::UNPCKL:
- case X86ISD::UNPCKH:
- return DAG.getNode(Opc, dl, VT, V1, V2);
}
}
@@ -3876,9 +4145,9 @@ SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
if (ReturnAddrIndex == 0) {
// Set up a frame object for the return address.
unsigned SlotSize = RegInfo->getSlotSize();
- ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize,
- -(int64_t)SlotSize,
- false);
+ ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject(SlotSize,
+ -(int64_t)SlotSize,
+ false);
FuncInfo->setRAIndex(ReturnAddrIndex);
}
@@ -3974,7 +4243,7 @@ static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) {
/// Do a one-to-one translation of a ISD::CondCode to the X86-specific
/// condition code, returning the condition code and the LHS/RHS of the
/// comparison to make.
-static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL,
+static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL,
bool isFP, SDValue &LHS, SDValue &RHS,
SelectionDAG &DAG) {
if (!isFP) {
@@ -4175,6 +4444,10 @@ bool X86TargetLowering::isCheapToSpeculateCtlz() const {
return Subtarget.hasLZCNT();
}
+bool X86TargetLowering::isCtlzFast() const {
+ return Subtarget.hasFastLZCNT();
+}
+
bool X86TargetLowering::hasAndNotCompare(SDValue Y) const {
if (!Subtarget.hasBMI())
return false;
@@ -4187,11 +4460,21 @@ bool X86TargetLowering::hasAndNotCompare(SDValue Y) const {
return true;
}
+/// Val is the undef sentinel value or equal to the specified value.
+static bool isUndefOrEqual(int Val, int CmpVal) {
+ return ((Val == SM_SentinelUndef) || (Val == CmpVal));
+}
+
+/// Val is either the undef or zero sentinel value.
+static bool isUndefOrZero(int Val) {
+ return ((Val == SM_SentinelUndef) || (Val == SM_SentinelZero));
+}
+
/// Return true if every element in Mask, beginning
-/// from position Pos and ending in Pos+Size is undef.
+/// from position Pos and ending in Pos+Size is the undef sentinel value.
static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
for (unsigned i = Pos, e = Pos + Size; i != e; ++i)
- if (0 <= Mask[i])
+ if (Mask[i] != SM_SentinelUndef)
return false;
return true;
}
@@ -4199,7 +4482,7 @@ static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
/// Return true if Val is undef or if its value falls within the
/// specified range (L, H].
static bool isUndefOrInRange(int Val, int Low, int Hi) {
- return (Val < 0) || (Val >= Low && Val < Hi);
+ return (Val == SM_SentinelUndef) || (Val >= Low && Val < Hi);
}
/// Return true if every element in Mask is undef or if its value
@@ -4212,14 +4495,19 @@ static bool isUndefOrInRange(ArrayRef<int> Mask,
return true;
}
-/// Val is either less than zero (undef) or equal to the specified value.
-static bool isUndefOrEqual(int Val, int CmpVal) {
- return (Val < 0 || Val == CmpVal);
+/// Return true if Val is undef, zero or if its value falls within the
+/// specified range (L, H].
+static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) {
+ return isUndefOrZero(Val) || (Val >= Low && Val < Hi);
}
-/// Val is either the undef or zero sentinel value.
-static bool isUndefOrZero(int Val) {
- return (Val == SM_SentinelUndef || Val == SM_SentinelZero);
+/// Return true if every element in Mask is undef, zero or if its value
+/// falls within the specified range (L, H].
+static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
+ for (int M : Mask)
+ if (!isUndefOrZeroOrInRange(M, Low, Hi))
+ return false;
+ return true;
}
/// Return true if every element in Mask, beginning
@@ -4244,6 +4532,100 @@ static bool isSequentialOrUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
return true;
}
+/// Return true if every element in Mask, beginning
+/// from position Pos and ending in Pos+Size is undef or is zero.
+static bool isUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
+ unsigned Size) {
+ for (unsigned i = Pos, e = Pos + Size; i != e; ++i)
+ if (!isUndefOrZero(Mask[i]))
+ return false;
+ return true;
+}
+
+/// \brief Helper function to test whether a shuffle mask could be
+/// simplified by widening the elements being shuffled.
+///
+/// Appends the mask for wider elements in WidenedMask if valid. Otherwise
+/// leaves it in an unspecified state.
+///
+/// NOTE: This must handle normal vector shuffle masks and *target* vector
+/// shuffle masks. The latter have the special property of a '-2' representing
+/// a zeroed lane of a vector.
+static bool canWidenShuffleElements(ArrayRef<int> Mask,
+ SmallVectorImpl<int> &WidenedMask) {
+ WidenedMask.assign(Mask.size() / 2, 0);
+ for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
+ // If both elements are undef, its trivial.
+ if (Mask[i] == SM_SentinelUndef && Mask[i + 1] == SM_SentinelUndef) {
+ WidenedMask[i / 2] = SM_SentinelUndef;
+ continue;
+ }
+
+ // Check for an undef mask and a mask value properly aligned to fit with
+ // a pair of values. If we find such a case, use the non-undef mask's value.
+ if (Mask[i] == SM_SentinelUndef && Mask[i + 1] >= 0 &&
+ Mask[i + 1] % 2 == 1) {
+ WidenedMask[i / 2] = Mask[i + 1] / 2;
+ continue;
+ }
+ if (Mask[i + 1] == SM_SentinelUndef && Mask[i] >= 0 && Mask[i] % 2 == 0) {
+ WidenedMask[i / 2] = Mask[i] / 2;
+ continue;
+ }
+
+ // When zeroing, we need to spread the zeroing across both lanes to widen.
+ if (Mask[i] == SM_SentinelZero || Mask[i + 1] == SM_SentinelZero) {
+ if ((Mask[i] == SM_SentinelZero || Mask[i] == SM_SentinelUndef) &&
+ (Mask[i + 1] == SM_SentinelZero || Mask[i + 1] == SM_SentinelUndef)) {
+ WidenedMask[i / 2] = SM_SentinelZero;
+ continue;
+ }
+ return false;
+ }
+
+ // Finally check if the two mask values are adjacent and aligned with
+ // a pair.
+ if (Mask[i] != SM_SentinelUndef && Mask[i] % 2 == 0 &&
+ Mask[i] + 1 == Mask[i + 1]) {
+ WidenedMask[i / 2] = Mask[i] / 2;
+ continue;
+ }
+
+ // Otherwise we can't safely widen the elements used in this shuffle.
+ return false;
+ }
+ assert(WidenedMask.size() == Mask.size() / 2 &&
+ "Incorrect size of mask after widening the elements!");
+
+ return true;
+}
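As an editorial illustration of the widening rules above, here is a minimal C++ sketch; it assumes the file-local helper and the SM_Sentinel* constants are in scope, and the expected result is worked out by hand from the loop above:

  // Pairs of adjacent narrow mask entries collapse into one wide entry;
  // a misaligned pair such as {1, 2} would make the widening fail.
  SmallVector<int, 8> Mask = {0, 1, 4, 5, SM_SentinelUndef, SM_SentinelUndef,
                              SM_SentinelZero, SM_SentinelZero};
  SmallVector<int, 4> WidenedMask;
  bool Ok = canWidenShuffleElements(Mask, WidenedMask);
  // Ok == true, WidenedMask == {0, 2, SM_SentinelUndef, SM_SentinelZero}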
+
+/// Helper function to scale a shuffle or target shuffle mask, replacing each
+/// mask index with the scaled sequential indices for an equivalent narrowed
+/// mask. This is the reverse process to canWidenShuffleElements, but can always
+/// succeed.
+static void scaleShuffleMask(int Scale, ArrayRef<int> Mask,
+ SmallVectorImpl<int> &ScaledMask) {
+ assert(0 < Scale && "Unexpected scaling factor");
+ int NumElts = Mask.size();
+ ScaledMask.assign(NumElts * Scale, -1);
+
+ for (int i = 0; i != NumElts; ++i) {
+ int M = Mask[i];
+
+ // Repeat sentinel values in every mask element.
+ if (M < 0) {
+ for (int s = 0; s != Scale; ++s)
+ ScaledMask[(Scale * i) + s] = M;
+ continue;
+ }
+
+ // Scale mask element and increment across each mask element.
+ for (int s = 0; s != Scale; ++s)
+ ScaledMask[(Scale * i) + s] = (Scale * M) + s;
+ }
+}
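A matching sketch for the narrowing direction, hand-checked against the loop above and assuming the same file-local scope:

  // Scale a v2 mask by 2: sentinel values are repeated, real indices become
  // sequential runs of Scale entries.
  SmallVector<int, 2> Mask = {1, SM_SentinelUndef};
  SmallVector<int, 4> ScaledMask;
  scaleShuffleMask(2, Mask, ScaledMask);
  // ScaledMask == {2, 3, SM_SentinelUndef, SM_SentinelUndef}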
+
/// Return true if the specified EXTRACT_SUBVECTOR operand specifies a vector
/// extract that is suitable for instruction that extract 128 or 256 bit vectors
static bool isVEXTRACTIndex(SDNode *N, unsigned vecWidth) {
@@ -4256,7 +4638,7 @@ static bool isVEXTRACTIndex(SDNode *N, unsigned vecWidth) {
cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
MVT VT = N->getSimpleValueType(0);
- unsigned ElSize = VT.getVectorElementType().getSizeInBits();
+ unsigned ElSize = VT.getScalarSizeInBits();
bool Result = (Index * ElSize) % vecWidth == 0;
return Result;
@@ -4274,7 +4656,7 @@ static bool isVINSERTIndex(SDNode *N, unsigned vecWidth) {
cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
MVT VT = N->getSimpleValueType(0);
- unsigned ElSize = VT.getVectorElementType().getSizeInBits();
+ unsigned ElSize = VT.getScalarSizeInBits();
bool Result = (Index * ElSize) % vecWidth == 0;
return Result;
@@ -4388,6 +4770,46 @@ static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG,
return ConstsNode;
}
+static SDValue getConstVector(ArrayRef<APInt> Bits, SmallBitVector &Undefs,
+ MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
+ assert(Bits.size() == Undefs.size() && "Unequal constant and undef arrays");
+ SmallVector<SDValue, 32> Ops;
+ bool Split = false;
+
+ MVT ConstVecVT = VT;
+ unsigned NumElts = VT.getVectorNumElements();
+ bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
+ if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
+ ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
+ Split = true;
+ }
+
+ MVT EltVT = ConstVecVT.getVectorElementType();
+ for (unsigned i = 0, e = Bits.size(); i != e; ++i) {
+ if (Undefs[i]) {
+ Ops.append(Split ? 2 : 1, DAG.getUNDEF(EltVT));
+ continue;
+ }
+ const APInt &V = Bits[i];
+ assert(V.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes");
+ if (Split) {
+ Ops.push_back(DAG.getConstant(V.trunc(32), dl, EltVT));
+ Ops.push_back(DAG.getConstant(V.lshr(32).trunc(32), dl, EltVT));
+ } else if (EltVT == MVT::f32) {
+ APFloat FV(APFloat::IEEEsingle(), V);
+ Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
+ } else if (EltVT == MVT::f64) {
+ APFloat FV(APFloat::IEEEdouble(), V);
+ Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
+ } else {
+ Ops.push_back(DAG.getConstant(V, dl, EltVT));
+ }
+ }
+
+ SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
+ return DAG.getBitcast(VT, ConstsNode);
+}
+
/// Returns a vector of specified type with all zero elements.
static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
SelectionDAG &DAG, const SDLoc &dl) {
@@ -4416,8 +4838,6 @@ static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
const SDLoc &dl, unsigned vectorWidth) {
- assert((vectorWidth == 128 || vectorWidth == 256) &&
- "Unsupported vector width");
EVT VT = Vec.getValueType();
EVT ElVT = VT.getVectorElementType();
unsigned Factor = VT.getSizeInBits()/vectorWidth;
@@ -4438,8 +4858,8 @@ static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
// If the input is a buildvector just emit a smaller one.
if (Vec.getOpcode() == ISD::BUILD_VECTOR)
- return DAG.getNode(ISD::BUILD_VECTOR,
- dl, ResultVT, makeArrayRef(Vec->op_begin() + IdxVal, ElemsPerChunk));
+ return DAG.getNode(ISD::BUILD_VECTOR, dl, ResultVT,
+ makeArrayRef(Vec->op_begin() + IdxVal, ElemsPerChunk));
SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
@@ -4694,29 +5114,35 @@ static SDValue getOnesVector(EVT VT, const X86Subtarget &Subtarget,
return DAG.getBitcast(VT, Vec);
}
+/// Generate unpacklo/unpackhi shuffle mask.
+static void createUnpackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask, bool Lo,
+ bool Unary) {
+ assert(Mask.empty() && "Expected an empty shuffle mask vector");
+ int NumElts = VT.getVectorNumElements();
+ int NumEltsInLane = 128 / VT.getScalarSizeInBits();
+
+ for (int i = 0; i < NumElts; ++i) {
+ unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane;
+ int Pos = (i % NumEltsInLane) / 2 + LaneStart;
+ Pos += (Unary ? 0 : NumElts * (i % 2));
+ Pos += (Lo ? 0 : NumEltsInLane / 2);
+ Mask.push_back(Pos);
+ }
+}
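For reference, a small sketch of what the mask generator above produces for one concrete type; the values are derived by hand from the loop and assume the helper is visible in this file:

  // unpacklo of two distinct v4i32 inputs interleaves the low halves.
  SmallVector<int, 4> Mask;
  createUnpackShuffleMask(MVT::v4i32, Mask, /*Lo=*/true, /*Unary=*/false);
  // Mask == {0, 4, 1, 5}; with Lo=false it would be {2, 6, 3, 7}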
+
/// Returns a vector_shuffle node for an unpackl operation.
static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
SDValue V1, SDValue V2) {
- assert(VT.is128BitVector() && "Expected a 128-bit vector type");
- unsigned NumElems = VT.getVectorNumElements();
- SmallVector<int, 8> Mask(NumElems);
- for (unsigned i = 0, e = NumElems/2; i != e; ++i) {
- Mask[i * 2] = i;
- Mask[i * 2 + 1] = i + NumElems;
- }
+ SmallVector<int, 8> Mask;
+ createUnpackShuffleMask(VT, Mask, /* Lo = */ true, /* Unary = */ false);
return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
}
/// Returns a vector_shuffle node for an unpackh operation.
static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
SDValue V1, SDValue V2) {
- assert(VT.is128BitVector() && "Expected a 128-bit vector type");
- unsigned NumElems = VT.getVectorNumElements();
- SmallVector<int, 8> Mask(NumElems);
- for (unsigned i = 0, Half = NumElems/2; i != Half; ++i) {
- Mask[i * 2] = i + Half;
- Mask[i * 2 + 1] = i + NumElems + Half;
- }
+ SmallVector<int, 8> Mask;
+ createUnpackShuffleMask(VT, Mask, /* Lo = */ false, /* Unary = */ false);
return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
}
@@ -4745,6 +5171,135 @@ static SDValue peekThroughBitcasts(SDValue V) {
return V;
}
+static SDValue peekThroughOneUseBitcasts(SDValue V) {
+ while (V.getNode() && V.getOpcode() == ISD::BITCAST &&
+ V.getOperand(0).hasOneUse())
+ V = V.getOperand(0);
+ return V;
+}
+
+static const Constant *getTargetConstantFromNode(SDValue Op) {
+ Op = peekThroughBitcasts(Op);
+
+ auto *Load = dyn_cast<LoadSDNode>(Op);
+ if (!Load)
+ return nullptr;
+
+ SDValue Ptr = Load->getBasePtr();
+ if (Ptr->getOpcode() == X86ISD::Wrapper ||
+ Ptr->getOpcode() == X86ISD::WrapperRIP)
+ Ptr = Ptr->getOperand(0);
+
+ auto *CNode = dyn_cast<ConstantPoolSDNode>(Ptr);
+ if (!CNode || CNode->isMachineConstantPoolEntry())
+ return nullptr;
+
+ return dyn_cast<Constant>(CNode->getConstVal());
+}
+
+// Extract raw constant bits from constant pools.
+static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
+ SmallBitVector &UndefElts,
+ SmallVectorImpl<APInt> &EltBits) {
+ assert(UndefElts.empty() && "Expected an empty UndefElts vector");
+ assert(EltBits.empty() && "Expected an empty EltBits vector");
+
+ Op = peekThroughBitcasts(Op);
+
+ EVT VT = Op.getValueType();
+ unsigned SizeInBits = VT.getSizeInBits();
+ assert((SizeInBits % EltSizeInBits) == 0 && "Can't split constant!");
+ unsigned NumElts = SizeInBits / EltSizeInBits;
+
+ // Extract all the undef/constant element data and pack into single bitsets.
+ APInt UndefBits(SizeInBits, 0);
+ APInt MaskBits(SizeInBits, 0);
+
+ // Split the undef/constant single bitset data into the target elements.
+ auto SplitBitData = [&]() {
+ UndefElts = SmallBitVector(NumElts, false);
+ EltBits.resize(NumElts, APInt(EltSizeInBits, 0));
+
+ for (unsigned i = 0; i != NumElts; ++i) {
+ APInt UndefEltBits = UndefBits.lshr(i * EltSizeInBits);
+ UndefEltBits = UndefEltBits.zextOrTrunc(EltSizeInBits);
+
+ // Only treat an element as UNDEF if all bits are UNDEF, otherwise
+ // treat it as zero.
+ if (UndefEltBits.isAllOnesValue()) {
+ UndefElts[i] = true;
+ continue;
+ }
+
+ APInt Bits = MaskBits.lshr(i * EltSizeInBits);
+ Bits = Bits.zextOrTrunc(EltSizeInBits);
+ EltBits[i] = Bits.getZExtValue();
+ }
+ return true;
+ };
+
+ auto ExtractConstantBits = [SizeInBits](const Constant *Cst, APInt &Mask,
+ APInt &Undefs) {
+ if (!Cst)
+ return false;
+ unsigned CstSizeInBits = Cst->getType()->getPrimitiveSizeInBits();
+ if (isa<UndefValue>(Cst)) {
+ Mask = APInt::getNullValue(SizeInBits);
+ Undefs = APInt::getLowBitsSet(SizeInBits, CstSizeInBits);
+ return true;
+ }
+ if (auto *CInt = dyn_cast<ConstantInt>(Cst)) {
+ Mask = CInt->getValue().zextOrTrunc(SizeInBits);
+ Undefs = APInt::getNullValue(SizeInBits);
+ return true;
+ }
+ if (auto *CFP = dyn_cast<ConstantFP>(Cst)) {
+ Mask = CFP->getValueAPF().bitcastToAPInt().zextOrTrunc(SizeInBits);
+ Undefs = APInt::getNullValue(SizeInBits);
+ return true;
+ }
+ return false;
+ };
+
+ // Extract constant bits from constant pool vector.
+ if (auto *Cst = getTargetConstantFromNode(Op)) {
+ Type *CstTy = Cst->getType();
+ if (!CstTy->isVectorTy() || (SizeInBits != CstTy->getPrimitiveSizeInBits()))
+ return false;
+
+ unsigned CstEltSizeInBits = CstTy->getScalarSizeInBits();
+ for (unsigned i = 0, e = CstTy->getVectorNumElements(); i != e; ++i) {
+ APInt Bits, Undefs;
+ if (!ExtractConstantBits(Cst->getAggregateElement(i), Bits, Undefs))
+ return false;
+ MaskBits |= Bits.shl(i * CstEltSizeInBits);
+ UndefBits |= Undefs.shl(i * CstEltSizeInBits);
+ }
+
+ return SplitBitData();
+ }
+
+ // Extract constant bits from a broadcasted constant pool scalar.
+ if (Op.getOpcode() == X86ISD::VBROADCAST &&
+ EltSizeInBits <= Op.getScalarValueSizeInBits()) {
+ if (auto *Broadcast = getTargetConstantFromNode(Op.getOperand(0))) {
+ APInt Bits, Undefs;
+ if (ExtractConstantBits(Broadcast, Bits, Undefs)) {
+ unsigned NumBroadcastBits = Op.getScalarValueSizeInBits();
+ unsigned NumBroadcastElts = SizeInBits / NumBroadcastBits;
+ for (unsigned i = 0; i != NumBroadcastElts; ++i) {
+ MaskBits |= Bits.shl(i * NumBroadcastBits);
+ UndefBits |= Undefs.shl(i * NumBroadcastBits);
+ }
+ return SplitBitData();
+ }
+ }
+ }
+
+ return false;
+}
+
+// TODO: Merge more of this with getTargetConstantBitsFromNode.
static bool getTargetShuffleMaskIndices(SDValue MaskNode,
unsigned MaskEltSizeInBits,
SmallVectorImpl<uint64_t> &RawMask) {
@@ -4752,6 +5307,7 @@ static bool getTargetShuffleMaskIndices(SDValue MaskNode,
MVT VT = MaskNode.getSimpleValueType();
assert(VT.isVector() && "Can't produce a non-vector with a build_vector!");
+ unsigned NumMaskElts = VT.getSizeInBits() / MaskEltSizeInBits;
// Split an APInt element into MaskEltSizeInBits sized pieces and
// insert into the shuffle mask.
@@ -4783,17 +5339,20 @@ static bool getTargetShuffleMaskIndices(SDValue MaskNode,
if (MaskNode.getOpcode() == X86ISD::VZEXT_MOVL &&
MaskNode.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR) {
-
- // TODO: Handle (MaskEltSizeInBits % VT.getScalarSizeInBits()) == 0
- if ((VT.getScalarSizeInBits() % MaskEltSizeInBits) != 0)
- return false;
- unsigned ElementSplit = VT.getScalarSizeInBits() / MaskEltSizeInBits;
-
SDValue MaskOp = MaskNode.getOperand(0).getOperand(0);
if (auto *CN = dyn_cast<ConstantSDNode>(MaskOp)) {
- SplitElementToMask(CN->getAPIntValue());
- RawMask.append((VT.getVectorNumElements() - 1) * ElementSplit, 0);
- return true;
+ if ((MaskEltSizeInBits % VT.getScalarSizeInBits()) == 0) {
+ RawMask.push_back(CN->getZExtValue());
+ RawMask.append(NumMaskElts - 1, 0);
+ return true;
+ }
+
+ if ((VT.getScalarSizeInBits() % MaskEltSizeInBits) == 0) {
+ unsigned ElementSplit = VT.getScalarSizeInBits() / MaskEltSizeInBits;
+ SplitElementToMask(CN->getAPIntValue());
+ RawMask.append((VT.getVectorNumElements() - 1) * ElementSplit, 0);
+ return true;
+ }
}
return false;
}
@@ -4803,8 +5362,8 @@ static bool getTargetShuffleMaskIndices(SDValue MaskNode,
// We can always decode if the buildvector is all zero constants,
// but can't use isBuildVectorAllZeros as it might contain UNDEFs.
- if (llvm::all_of(MaskNode->ops(), X86::isZeroNode)) {
- RawMask.append(VT.getSizeInBits() / MaskEltSizeInBits, 0);
+ if (all_of(MaskNode->ops(), X86::isZeroNode)) {
+ RawMask.append(NumMaskElts, 0);
return true;
}
@@ -4824,25 +5383,6 @@ static bool getTargetShuffleMaskIndices(SDValue MaskNode,
return true;
}
-static const Constant *getTargetShuffleMaskConstant(SDValue MaskNode) {
- MaskNode = peekThroughBitcasts(MaskNode);
-
- auto *MaskLoad = dyn_cast<LoadSDNode>(MaskNode);
- if (!MaskLoad)
- return nullptr;
-
- SDValue Ptr = MaskLoad->getBasePtr();
- if (Ptr->getOpcode() == X86ISD::Wrapper ||
- Ptr->getOpcode() == X86ISD::WrapperRIP)
- Ptr = Ptr->getOperand(0);
-
- auto *MaskCP = dyn_cast<ConstantPoolSDNode>(Ptr);
- if (!MaskCP || MaskCP->isMachineConstantPoolEntry())
- return nullptr;
-
- return dyn_cast<Constant>(MaskCP->getConstVal());
-}
-
/// Calculates the shuffle mask corresponding to the target-specific opcode.
/// If the mask could be calculated, returns it in \p Mask, returns the shuffle
/// operands in \p Ops, and returns true.
@@ -4896,6 +5436,9 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
ImmN = N->getOperand(N->getNumOperands()-1);
DecodePALIGNRMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+ IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
+ Ops.push_back(N->getOperand(1));
+ Ops.push_back(N->getOperand(0));
break;
case X86ISD::VSHLDQ:
assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
@@ -4947,7 +5490,7 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
DecodeVPERMILPMask(VT, RawMask, Mask);
break;
}
- if (auto *C = getTargetShuffleMaskConstant(MaskNode)) {
+ if (auto *C = getTargetConstantFromNode(MaskNode)) {
DecodeVPERMILPMask(C, MaskEltSize, Mask);
break;
}
@@ -4961,7 +5504,7 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
DecodePSHUFBMask(RawMask, Mask);
break;
}
- if (auto *C = getTargetShuffleMaskConstant(MaskNode)) {
+ if (auto *C = getTargetConstantFromNode(MaskNode)) {
DecodePSHUFBMask(C, Mask);
break;
}
@@ -5010,7 +5553,7 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
DecodeVPERMIL2PMask(VT, CtrlImm, RawMask, Mask);
break;
}
- if (auto *C = getTargetShuffleMaskConstant(MaskNode)) {
+ if (auto *C = getTargetConstantFromNode(MaskNode)) {
DecodeVPERMIL2PMask(C, CtrlImm, MaskEltSize, Mask);
break;
}
@@ -5025,7 +5568,7 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
DecodeVPPERMMask(RawMask, Mask);
break;
}
- if (auto *C = getTargetShuffleMaskConstant(MaskNode)) {
+ if (auto *C = getTargetConstantFromNode(MaskNode)) {
DecodeVPPERMMask(C, Mask);
break;
}
@@ -5042,8 +5585,8 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
DecodeVPERMVMask(RawMask, Mask);
break;
}
- if (auto *C = getTargetShuffleMaskConstant(MaskNode)) {
- DecodeVPERMVMask(C, VT, Mask);
+ if (auto *C = getTargetConstantFromNode(MaskNode)) {
+ DecodeVPERMVMask(C, MaskEltSize, Mask);
break;
}
return false;
@@ -5054,8 +5597,22 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
Ops.push_back(N->getOperand(0));
Ops.push_back(N->getOperand(2));
SDValue MaskNode = N->getOperand(1);
- if (auto *C = getTargetShuffleMaskConstant(MaskNode)) {
- DecodeVPERMV3Mask(C, VT, Mask);
+ unsigned MaskEltSize = VT.getScalarSizeInBits();
+ if (auto *C = getTargetConstantFromNode(MaskNode)) {
+ DecodeVPERMV3Mask(C, MaskEltSize, Mask);
+ break;
+ }
+ return false;
+ }
+ case X86ISD::VPERMIV3: {
+ IsUnary = IsFakeUnary = N->getOperand(1) == N->getOperand(2);
+ // Unlike most shuffle nodes, VPERMIV3's mask operand is the first one.
+ Ops.push_back(N->getOperand(1));
+ Ops.push_back(N->getOperand(2));
+ SDValue MaskNode = N->getOperand(0);
+ unsigned MaskEltSize = VT.getScalarSizeInBits();
+ if (auto *C = getTargetConstantFromNode(MaskNode)) {
+ DecodeVPERMV3Mask(C, MaskEltSize, Mask);
break;
}
return false;
@@ -5069,7 +5626,7 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
// Check if we're getting a shuffle mask with zero'd elements.
if (!AllowSentinelZero)
- if (llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; }))
+ if (any_of(Mask, [](int M) { return M == SM_SentinelZero; }))
return false;
// If we have a fake unary shuffle, the shuffle mask is spread across two
@@ -5101,8 +5658,9 @@ static bool setTargetShuffleZeroElements(SDValue N,
bool IsUnary;
if (!isTargetShuffle(N.getOpcode()))
return false;
- if (!getTargetShuffleMask(N.getNode(), N.getSimpleValueType(), true, Ops,
- Mask, IsUnary))
+
+ MVT VT = N.getSimpleValueType();
+ if (!getTargetShuffleMask(N.getNode(), VT, true, Ops, Mask, IsUnary))
return false;
SDValue V1 = Ops[0];
@@ -5164,9 +5722,94 @@ static bool setTargetShuffleZeroElements(SDValue N,
}
}
+ assert(VT.getVectorNumElements() == Mask.size() &&
+ "Different mask size from vector size!");
return true;
}
+// Attempt to decode ops that could be represented as a shuffle mask.
+// The decoded shuffle mask may contain a different number of elements than the
+// destination value type.
+static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
+ SmallVectorImpl<SDValue> &Ops) {
+ Mask.clear();
+ Ops.clear();
+
+ MVT VT = N.getSimpleValueType();
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned NumSizeInBits = VT.getSizeInBits();
+ unsigned NumBitsPerElt = VT.getScalarSizeInBits();
+ assert((NumBitsPerElt % 8) == 0 && (NumSizeInBits % 8) == 0 &&
+ "Expected byte aligned value types");
+
+ unsigned Opcode = N.getOpcode();
+ switch (Opcode) {
+ case ISD::AND: {
+ // Attempt to decode as a per-byte mask.
+ SmallBitVector UndefElts;
+ SmallVector<APInt, 32> EltBits;
+ if (!getTargetConstantBitsFromNode(N.getOperand(1), 8, UndefElts, EltBits))
+ return false;
+ for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {
+ if (UndefElts[i]) {
+ Mask.push_back(SM_SentinelUndef);
+ continue;
+ }
+ uint64_t ByteBits = EltBits[i].getZExtValue();
+ if (ByteBits != 0 && ByteBits != 255)
+ return false;
+ Mask.push_back(ByteBits == 0 ? SM_SentinelZero : i);
+ }
+ Ops.push_back(N.getOperand(0));
+ return true;
+ }
+ case X86ISD::VSHLI:
+ case X86ISD::VSRLI: {
+ uint64_t ShiftVal = N.getConstantOperandVal(1);
+ // Out of range bit shifts are guaranteed to be zero.
+ if (NumBitsPerElt <= ShiftVal) {
+ Mask.append(NumElts, SM_SentinelZero);
+ return true;
+ }
+
+ // We can only decode 'whole byte' bit shifts as shuffles.
+ if ((ShiftVal % 8) != 0)
+ break;
+
+ uint64_t ByteShift = ShiftVal / 8;
+ unsigned NumBytes = NumSizeInBits / 8;
+ unsigned NumBytesPerElt = NumBitsPerElt / 8;
+ Ops.push_back(N.getOperand(0));
+
+ // Clear mask to all zeros and insert the shifted byte indices.
+ Mask.append(NumBytes, SM_SentinelZero);
+
+ if (X86ISD::VSHLI == Opcode) {
+ for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt)
+ for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
+ Mask[i + j] = i + j - ByteShift;
+ } else {
+ for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt)
+ for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
+ Mask[i + j - ByteShift] = i + j;
+ }
+ return true;
+ }
+ case X86ISD::VZEXT: {
+ // TODO - add support for VPMOVZX with smaller input vector types.
+ SDValue Src = N.getOperand(0);
+ MVT SrcVT = Src.getSimpleValueType();
+ if (NumSizeInBits != SrcVT.getSizeInBits())
+ break;
+ DecodeZeroExtendMask(SrcVT.getScalarType(), VT, Mask);
+ Ops.push_back(Src);
+ return true;
+ }
+ }
+
+ return false;
+}
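To make the VSRLI case above concrete, here is a standalone recomputation of the byte mask it emits for a v2i64 logical right shift by 32; a hedged sketch that assumes SM_SentinelZero is in scope and simply mirrors the loop above:

  SmallVector<int, 16> ByteMask(16, SM_SentinelZero);
  unsigned ByteShift = 32 / 8, NumBytesPerElt = 64 / 8;
  for (unsigned i = 0; i != 16; i += NumBytesPerElt)
    for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
      ByteMask[i + j - ByteShift] = i + j;
  // ByteMask == {4,5,6,7,Z,Z,Z,Z, 12,13,14,15,Z,Z,Z,Z}, Z == SM_SentinelZero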
+
/// Calls setTargetShuffleZeroElements to resolve a target shuffle mask's inputs
/// and set the SM_SentinelUndef and SM_SentinelZero values. Then check the
/// remaining input indices in case we now have a unary shuffle and adjust the
@@ -5176,14 +5819,14 @@ static bool resolveTargetShuffleInputs(SDValue Op, SDValue &Op0, SDValue &Op1,
SmallVectorImpl<int> &Mask) {
SmallVector<SDValue, 2> Ops;
if (!setTargetShuffleZeroElements(Op, Mask, Ops))
- return false;
+ if (!getFauxShuffleMask(Op, Mask, Ops))
+ return false;
int NumElts = Mask.size();
- bool Op0InUse = std::any_of(Mask.begin(), Mask.end(), [NumElts](int Idx) {
+ bool Op0InUse = any_of(Mask, [NumElts](int Idx) {
return 0 <= Idx && Idx < NumElts;
});
- bool Op1InUse = std::any_of(Mask.begin(), Mask.end(),
- [NumElts](int Idx) { return NumElts <= Idx; });
+ bool Op1InUse = any_of(Mask, [NumElts](int Idx) { return NumElts <= Idx; });
Op0 = Op0InUse ? Ops[0] : SDValue();
Op1 = Op1InUse ? Ops[1] : SDValue();
@@ -5523,15 +6166,15 @@ static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
unsigned RequiredAlign = VT.getSizeInBits()/8;
SDValue Chain = LD->getChain();
// Make sure the stack object alignment is at least 16 or 32.
- MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+ MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) {
- if (MFI->isFixedObjectIndex(FI)) {
+ if (MFI.isFixedObjectIndex(FI)) {
// Can't change the alignment. FIXME: It's possible to compute
// the exact stack offset and reference FI + adjust offset instead.
// If someone *really* cares about this. That's the way to implement it.
return SDValue();
} else {
- MFI->setObjectAlignment(FI, RequiredAlign);
+ MFI.setObjectAlignment(FI, RequiredAlign);
}
}
@@ -5697,11 +6340,13 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
int LoadSize =
(1 + LastLoadedElt - FirstLoadedElt) * LDBaseVT.getStoreSizeInBits();
- // VZEXT_LOAD - consecutive load/undefs followed by zeros/undefs.
- if (IsConsecutiveLoad && FirstLoadedElt == 0 && LoadSize == 64 &&
+ // VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs.
+ if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
+ (LoadSize == 32 || LoadSize == 64) &&
((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
- MVT VecSVT = VT.isFloatingPoint() ? MVT::f64 : MVT::i64;
- MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / 64);
+ MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSize)
+ : MVT::getIntegerVT(LoadSize);
+ MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSize);
if (TLI.isTypeLegal(VecVT)) {
SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
@@ -5728,31 +6373,53 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
}
}
- // VZEXT_MOVL - consecutive 32-bit load/undefs followed by zeros/undefs.
- if (IsConsecutiveLoad && FirstLoadedElt == 0 && LoadSize == 32 &&
- ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
- MVT VecSVT = VT.isFloatingPoint() ? MVT::f32 : MVT::i32;
- MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / 32);
- if (TLI.isTypeLegal(VecVT)) {
- SDValue V = LastLoadedElt != 0 ? CreateLoad(VecSVT, LDBase)
- : DAG.getBitcast(VecSVT, EltBase);
- V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, V);
- V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, VecVT, V);
- return DAG.getBitcast(VT, V);
- }
+ return SDValue();
+}
+
+static Constant *getConstantVector(MVT VT, APInt SplatValue,
+ unsigned SplatBitSize, LLVMContext &C) {
+ unsigned ScalarSize = VT.getScalarSizeInBits();
+ unsigned NumElm = SplatBitSize / ScalarSize;
+
+ SmallVector<Constant *, 32> ConstantVec;
+ for (unsigned i = 0; i < NumElm; i++) {
+ APInt Val = SplatValue.lshr(ScalarSize * i).trunc(ScalarSize);
+ Constant *Const;
+ if (VT.isFloatingPoint()) {
+ assert((ScalarSize == 32 || ScalarSize == 64) &&
+ "Unsupported floating point scalar size");
+ if (ScalarSize == 32)
+ Const = ConstantFP::get(Type::getFloatTy(C), Val.bitsToFloat());
+ else
+ Const = ConstantFP::get(Type::getDoubleTy(C), Val.bitsToDouble());
+ } else
+ Const = Constant::getIntegerValue(Type::getIntNTy(C, ScalarSize), Val);
+ ConstantVec.push_back(Const);
}
+ return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
+}
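A minimal usage sketch for the getConstantVector helper above; the expected value is hand-derived, and MVT::v8i32 with a 128-bit splat are illustrative choices, not taken from the patch:

  LLVMContext Ctx;
  APInt Splat(128, 0x0000000100000002ULL); // low 64 bits set, upper 64 zero
  Constant *VecC =
      getConstantVector(MVT::v8i32, Splat, /*SplatBitSize=*/128, Ctx);
  // VecC == <i32 2, i32 1, i32 0, i32 0>; the caller loads this from the
  // constant pool and broadcasts it via X86ISD::SUBV_BROADCAST.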
- return SDValue();
+static bool isUseOfShuffle(SDNode *N) {
+ for (auto *U : N->uses()) {
+ if (isTargetShuffle(U->getOpcode()))
+ return true;
+ if (U->getOpcode() == ISD::BITCAST) // Ignore bitcasts
+ return isUseOfShuffle(U);
+ }
+ return false;
}
/// Attempt to use the vbroadcast instruction to generate a splat value for the
/// following cases:
-/// 1. A splat BUILD_VECTOR which uses a single scalar load, or a constant.
+/// 1. A splat BUILD_VECTOR which uses:
+/// a. A single scalar load, or a constant.
+/// b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>).
/// 2. A splat shuffle which uses a scalar_to_vector node which comes from
/// a scalar load, or a constant.
+///
/// The VBROADCAST node is returned when a pattern is found,
/// or SDValue() otherwise.
-static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget &Subtarget,
+static SDValue LowerVectorBroadcast(BuildVectorSDNode *BVOp,
+                                    const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
// VBROADCAST requires AVX.
// TODO: Splats could be generated for non-AVX CPUs using SSE
@@ -5760,81 +6427,103 @@ static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget &Subtarget,
if (!Subtarget.hasAVX())
return SDValue();
- MVT VT = Op.getSimpleValueType();
- SDLoc dl(Op);
+ MVT VT = BVOp->getSimpleValueType(0);
+ SDLoc dl(BVOp);
assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
"Unsupported vector type for broadcast.");
- SDValue Ld;
- bool ConstSplatVal;
-
- switch (Op.getOpcode()) {
- default:
- // Unknown pattern found.
- return SDValue();
-
- case ISD::BUILD_VECTOR: {
- auto *BVOp = cast<BuildVectorSDNode>(Op.getNode());
- BitVector UndefElements;
- SDValue Splat = BVOp->getSplatValue(&UndefElements);
-
- // We need a splat of a single value to use broadcast, and it doesn't
- // make any sense if the value is only in one element of the vector.
- if (!Splat || (VT.getVectorNumElements() - UndefElements.count()) <= 1)
+ BitVector UndefElements;
+ SDValue Ld = BVOp->getSplatValue(&UndefElements);
+
+ // We need a splat of a single value to use broadcast, and it doesn't
+ // make any sense if the value is only in one element of the vector.
+ if (!Ld || (VT.getVectorNumElements() - UndefElements.count()) <= 1) {
+ APInt SplatValue, Undef;
+ unsigned SplatBitSize;
+ bool HasUndef;
+ // Check if this is a repeated constant pattern suitable for broadcasting.
+ if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&
+ SplatBitSize > VT.getScalarSizeInBits() &&
+ SplatBitSize < VT.getSizeInBits()) {
+      // Avoid replacing with a broadcast when the build_vector is used by a
+      // shuffle instruction, to preserve the present custom lowering of
+      // shuffles.
+ if (isUseOfShuffle(BVOp) || BVOp->hasOneUse())
return SDValue();
-
- Ld = Splat;
- ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
- Ld.getOpcode() == ISD::ConstantFP);
-
- // Make sure that all of the users of a non-constant load are from the
- // BUILD_VECTOR node.
- if (!ConstSplatVal && !BVOp->isOnlyUserOf(Ld.getNode()))
- return SDValue();
- break;
- }
-
- case ISD::VECTOR_SHUFFLE: {
- ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
-
- // Shuffles must have a splat mask where the first element is
- // broadcasted.
- if ((!SVOp->isSplat()) || SVOp->getMaskElt(0) != 0)
- return SDValue();
-
- SDValue Sc = Op.getOperand(0);
- if (Sc.getOpcode() != ISD::SCALAR_TO_VECTOR &&
- Sc.getOpcode() != ISD::BUILD_VECTOR) {
-
- if (!Subtarget.hasInt256())
- return SDValue();
-
- // Use the register form of the broadcast instruction available on AVX2.
- if (VT.getSizeInBits() >= 256)
- Sc = extract128BitVector(Sc, 0, DAG, dl);
- return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Sc);
+      // Replace BUILD_VECTOR with a broadcast of the repeated constants.
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ LLVMContext *Ctx = DAG.getContext();
+ MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
+ if (Subtarget.hasAVX()) {
+ if (SplatBitSize <= 64 && Subtarget.hasAVX2() &&
+ !(SplatBitSize == 64 && Subtarget.is32Bit())) {
+ // Splatted value can fit in one INTEGER constant in constant pool.
+ // Load the constant and broadcast it.
+ MVT CVT = MVT::getIntegerVT(SplatBitSize);
+ Type *ScalarTy = Type::getIntNTy(*Ctx, SplatBitSize);
+ Constant *C = Constant::getIntegerValue(ScalarTy, SplatValue);
+ SDValue CP = DAG.getConstantPool(C, PVT);
+ unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
+
+ unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
+ Ld = DAG.getLoad(
+ CVT, dl, DAG.getEntryNode(), CP,
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
+ Alignment);
+ SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl,
+ MVT::getVectorVT(CVT, Repeat), Ld);
+ return DAG.getBitcast(VT, Brdcst);
+ } else if (SplatBitSize == 32 || SplatBitSize == 64) {
+ // Splatted value can fit in one FLOAT constant in constant pool.
+ // Load the constant and broadcast it.
+          // AVX has support for 32 and 64 bit broadcasts for floats only.
+          // There is no 64-bit integer broadcast on a 32-bit subtarget.
+ MVT CVT = MVT::getFloatingPointVT(SplatBitSize);
+ Constant *C = SplatBitSize == 32
+ ? ConstantFP::get(Type::getFloatTy(*Ctx),
+ SplatValue.bitsToFloat())
+ : ConstantFP::get(Type::getDoubleTy(*Ctx),
+ SplatValue.bitsToDouble());
+ SDValue CP = DAG.getConstantPool(C, PVT);
+ unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
+
+ unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
+ Ld = DAG.getLoad(
+ CVT, dl, DAG.getEntryNode(), CP,
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
+ Alignment);
+ SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl,
+ MVT::getVectorVT(CVT, Repeat), Ld);
+ return DAG.getBitcast(VT, Brdcst);
+ } else if (SplatBitSize > 64) {
+ // Load the vector of constants and broadcast it.
+ MVT CVT = VT.getScalarType();
+ Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize,
+ *Ctx);
+ SDValue VCP = DAG.getConstantPool(VecC, PVT);
+ unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits();
+ unsigned Alignment = cast<ConstantPoolSDNode>(VCP)->getAlignment();
+ Ld = DAG.getLoad(
+ MVT::getVectorVT(CVT, NumElm), dl, DAG.getEntryNode(), VCP,
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
+ Alignment);
+ SDValue Brdcst = DAG.getNode(X86ISD::SUBV_BROADCAST, dl, VT, Ld);
+ return DAG.getBitcast(VT, Brdcst);
+ }
}
-
- Ld = Sc.getOperand(0);
- ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
- Ld.getOpcode() == ISD::ConstantFP);
-
- // The scalar_to_vector node and the suspected
- // load node must have exactly one user.
- // Constants may have multiple users.
-
- // AVX-512 has register version of the broadcast
- bool hasRegVer = Subtarget.hasAVX512() && VT.is512BitVector() &&
- Ld.getValueType().getSizeInBits() >= 32;
- if (!ConstSplatVal && ((!Sc.hasOneUse() || !Ld.hasOneUse()) &&
- !hasRegVer))
- return SDValue();
- break;
}
+ return SDValue();
}
- unsigned ScalarSize = Ld.getValueType().getSizeInBits();
+ bool ConstSplatVal =
+ (Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP);
+
+ // Make sure that all of the users of a non-constant load are from the
+ // BUILD_VECTOR node.
+ if (!ConstSplatVal && !BVOp->isOnlyUserOf(Ld.getNode()))
+ return SDValue();
+
+ unsigned ScalarSize = Ld.getValueSizeInBits();
bool IsGE256 = (VT.getSizeInBits() >= 256);
// When optimizing for size, generate up to 5 extra bytes for a broadcast
@@ -6025,8 +6714,7 @@ static SDValue ConvertI1VectorToInteger(SDValue Op, SelectionDAG &DAG) {
Immediate |= cast<ConstantSDNode>(In)->getZExtValue() << idx;
}
SDLoc dl(Op);
- MVT VT =
- MVT::getIntegerVT(std::max((int)Op.getValueType().getSizeInBits(), 8));
+ MVT VT = MVT::getIntegerVT(std::max((int)Op.getValueSizeInBits(), 8));
return DAG.getConstant(Immediate, dl, VT);
}
// Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
@@ -6273,23 +6961,24 @@ static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
}
-/// Try to fold a build_vector that performs an 'addsub' to an X86ISD::ADDSUB
-/// node.
-static SDValue LowerToAddSub(const BuildVectorSDNode *BV,
- const X86Subtarget &Subtarget, SelectionDAG &DAG) {
+/// Returns true iff \p BV builds a vector with a result equivalent to
+/// the result of an ADDSUB operation.
+/// If true is returned then the operands of the ADDSUB = Opnd0 +- Opnd1
+/// operation are written to the parameters \p Opnd0 and \p Opnd1.
+static bool isAddSub(const BuildVectorSDNode *BV,
+ const X86Subtarget &Subtarget, SelectionDAG &DAG,
+ SDValue &Opnd0, SDValue &Opnd1) {
+
MVT VT = BV->getSimpleValueType(0);
if ((!Subtarget.hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) &&
- (!Subtarget.hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)))
- return SDValue();
+ (!Subtarget.hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)) &&
+ (!Subtarget.hasAVX512() || (VT != MVT::v16f32 && VT != MVT::v8f64)))
+ return false;
- SDLoc DL(BV);
unsigned NumElts = VT.getVectorNumElements();
SDValue InVec0 = DAG.getUNDEF(VT);
SDValue InVec1 = DAG.getUNDEF(VT);
- assert((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
- VT == MVT::v2f64) && "build_vector with an invalid type found!");
-
// Odd-numbered elements in the input build vector are obtained from
// adding two integer/float elements.
// Even-numbered elements in the input build vector are obtained from
@@ -6311,7 +7000,7 @@ static SDValue LowerToAddSub(const BuildVectorSDNode *BV,
// Early exit if we found an unexpected opcode.
if (Opcode != ExpectedOpcode)
- return SDValue();
+ return false;
SDValue Op0 = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1);
@@ -6324,11 +7013,11 @@ static SDValue LowerToAddSub(const BuildVectorSDNode *BV,
!isa<ConstantSDNode>(Op0.getOperand(1)) ||
!isa<ConstantSDNode>(Op1.getOperand(1)) ||
Op0.getOperand(1) != Op1.getOperand(1))
- return SDValue();
+ return false;
unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
if (I0 != i)
- return SDValue();
+ return false;
// We found a valid add/sub node. Update the information accordingly.
if (i & 1)
@@ -6340,39 +7029,118 @@ static SDValue LowerToAddSub(const BuildVectorSDNode *BV,
if (InVec0.isUndef()) {
InVec0 = Op0.getOperand(0);
if (InVec0.getSimpleValueType() != VT)
- return SDValue();
+ return false;
}
if (InVec1.isUndef()) {
InVec1 = Op1.getOperand(0);
if (InVec1.getSimpleValueType() != VT)
- return SDValue();
+ return false;
}
// Make sure that operands in input to each add/sub node always
// come from a same pair of vectors.
if (InVec0 != Op0.getOperand(0)) {
if (ExpectedOpcode == ISD::FSUB)
- return SDValue();
+ return false;
// FADD is commutable. Try to commute the operands
// and then test again.
std::swap(Op0, Op1);
if (InVec0 != Op0.getOperand(0))
- return SDValue();
+ return false;
}
if (InVec1 != Op1.getOperand(0))
- return SDValue();
+ return false;
// Update the pair of expected opcodes.
std::swap(ExpectedOpcode, NextExpectedOpcode);
}
// Don't try to fold this build_vector into an ADDSUB if the inputs are undef.
- if (AddFound && SubFound && !InVec0.isUndef() && !InVec1.isUndef())
- return DAG.getNode(X86ISD::ADDSUB, DL, VT, InVec0, InVec1);
+ if (!AddFound || !SubFound || InVec0.isUndef() || InVec1.isUndef())
+ return false;
- return SDValue();
+ Opnd0 = InVec0;
+ Opnd1 = InVec1;
+ return true;
+}
+
+/// Returns true if it is possible to fold a MUL and an idiom that has already
+/// been recognized as ADDSUB(\p Opnd0, \p Opnd1) into FMADDSUB(x, y, \p Opnd1).
+/// If (and only if) true is returned, the operands of FMADDSUB are written to
+/// parameters \p Opnd0, \p Opnd1, \p Opnd2.
+///
+/// Prior to calling this function it should be known that there is some
+/// SDNode that potentially can be replaced with an X86ISD::ADDSUB operation
+/// using \p Opnd0 and \p Opnd1 as operands. Also, this method is called
+/// before replacement of such SDNode with ADDSUB operation. Thus the number
+/// of \p Opnd0 uses is expected to be equal to 2.
+/// For example, this function may be called for the following IR:
+/// %AB = fmul fast <2 x double> %A, %B
+/// %Sub = fsub fast <2 x double> %AB, %C
+/// %Add = fadd fast <2 x double> %AB, %C
+/// %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add,
+/// <2 x i32> <i32 0, i32 3>
+/// There is a def for %Addsub here, which potentially can be replaced by
+/// X86ISD::ADDSUB operation:
+/// %Addsub = X86ISD::ADDSUB %AB, %C
+/// and such ADDSUB can further be replaced with FMADDSUB:
+/// %Addsub = FMADDSUB %A, %B, %C.
+///
+/// The main reason why this method is called before the replacement of the
+/// recognized ADDSUB idiom with an ADDSUB operation is that such a replacement
+/// is sometimes illegal, e.g. 512-bit ADDSUB is not available while 512-bit
+/// FMADDSUB is.
+static bool isFMAddSub(const X86Subtarget &Subtarget, SelectionDAG &DAG,
+ SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2) {
+ if (Opnd0.getOpcode() != ISD::FMUL || Opnd0->use_size() != 2 ||
+ !Subtarget.hasAnyFMA())
+ return false;
+
+ // FIXME: These checks must match the similar ones in
+ // DAGCombiner::visitFADDForFMACombine. It would be good to have one
+ // function that would answer if it is Ok to fuse MUL + ADD to FMADD
+ // or MUL + ADDSUB to FMADDSUB.
+ const TargetOptions &Options = DAG.getTarget().Options;
+ bool AllowFusion =
+ (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath);
+ if (!AllowFusion)
+ return false;
+
+ Opnd2 = Opnd1;
+ Opnd1 = Opnd0.getOperand(1);
+ Opnd0 = Opnd0.getOperand(0);
+
+ return true;
+}
+
+/// Try to fold a build_vector that performs an 'addsub' or 'fmaddsub'
+/// operation into an X86ISD::ADDSUB or X86ISD::FMADDSUB node.
+static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ SDValue Opnd0, Opnd1;
+ if (!isAddSub(BV, Subtarget, DAG, Opnd0, Opnd1))
+ return SDValue();
+
+ MVT VT = BV->getSimpleValueType(0);
+ SDLoc DL(BV);
+
+ // Try to generate X86ISD::FMADDSUB node here.
+ SDValue Opnd2;
+ if (isFMAddSub(Subtarget, DAG, Opnd0, Opnd1, Opnd2))
+ return DAG.getNode(X86ISD::FMADDSUB, DL, VT, Opnd0, Opnd1, Opnd2);
+
+ // Do not generate X86ISD::ADDSUB node for 512-bit types even though
+ // the ADDSUB idiom has been successfully recognized. There are no known
+ // X86 targets with 512-bit ADDSUB instructions!
+ // 512-bit ADDSUB idiom recognition was needed only as part of FMADDSUB idiom
+ // recognition.
+ if (VT.is512BitVector())
+ return SDValue();
+
+ return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
}
/// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
@@ -6510,17 +7278,18 @@ static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
/// NOTE: It's not in our interest to start making a general purpose vectorizer
/// from this, but enough scalar bit operations are created from the later
/// legalization + scalarization stages to need basic support.
-static SDValue lowerBuildVectorToBitOp(SDValue Op, SelectionDAG &DAG) {
+static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op,
+ SelectionDAG &DAG) {
SDLoc DL(Op);
- MVT VT = Op.getSimpleValueType();
+ MVT VT = Op->getSimpleValueType(0);
unsigned NumElems = VT.getVectorNumElements();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
// Check that all elements have the same opcode.
// TODO: Should we allow UNDEFS and if so how many?
- unsigned Opcode = Op.getOperand(0).getOpcode();
+ unsigned Opcode = Op->getOperand(0).getOpcode();
for (unsigned i = 1; i < NumElems; ++i)
- if (Opcode != Op.getOperand(i).getOpcode())
+ if (Opcode != Op->getOperand(i).getOpcode())
return SDValue();
// TODO: We may be able to add support for other Ops (ADD/SUB + shifts).
@@ -6600,13 +7369,13 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
return VectorConstant;
BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
- if (SDValue AddSub = LowerToAddSub(BV, Subtarget, DAG))
+ if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, Subtarget, DAG))
return AddSub;
if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG))
return HorizontalOp;
- if (SDValue Broadcast = LowerVectorBroadcast(Op, Subtarget, DAG))
+ if (SDValue Broadcast = LowerVectorBroadcast(BV, Subtarget, DAG))
return Broadcast;
- if (SDValue BitOp = lowerBuildVectorToBitOp(Op, DAG))
+ if (SDValue BitOp = lowerBuildVectorToBitOp(BV, DAG))
return BitOp;
unsigned EVTBits = ExtVT.getSizeInBits();
@@ -6673,12 +7442,8 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
(ExtVT == MVT::i64 && Subtarget.is64Bit())) {
- if (VT.is512BitVector()) {
- SDValue ZeroVec = getZeroVector(VT, Subtarget, DAG, dl);
- return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, ZeroVec,
- Item, DAG.getIntPtrConstant(0, dl));
- }
- assert((VT.is128BitVector() || VT.is256BitVector()) &&
+ assert((VT.is128BitVector() || VT.is256BitVector() ||
+ VT.is512BitVector()) &&
"Expected an SSE value type!");
Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
// Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
@@ -7088,6 +7853,7 @@ static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
RepeatedMask.assign(LaneSize, -1);
int Size = Mask.size();
for (int i = 0; i < Size; ++i) {
+ assert(Mask[i] == SM_SentinelUndef || Mask[i] >= 0);
if (Mask[i] < 0)
continue;
if ((Mask[i] % Size) / LaneSize != i / LaneSize)
@@ -7122,26 +7888,40 @@ is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask);
}
-static void scaleShuffleMask(int Scale, ArrayRef<int> Mask,
- SmallVectorImpl<int> &ScaledMask) {
- assert(0 < Scale && "Unexpected scaling factor");
- int NumElts = Mask.size();
- ScaledMask.assign(NumElts * Scale, -1);
-
- for (int i = 0; i != NumElts; ++i) {
- int M = Mask[i];
-
- // Repeat sentinel values in every mask element.
- if (M < 0) {
- for (int s = 0; s != Scale; ++s)
- ScaledMask[(Scale * i) + s] = M;
+/// Test whether a target shuffle mask is equivalent within each sub-lane.
+/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
+static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT,
+ ArrayRef<int> Mask,
+ SmallVectorImpl<int> &RepeatedMask) {
+ int LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
+ RepeatedMask.assign(LaneSize, SM_SentinelUndef);
+ int Size = Mask.size();
+ for (int i = 0; i < Size; ++i) {
+ assert(isUndefOrZero(Mask[i]) || (Mask[i] >= 0));
+ if (Mask[i] == SM_SentinelUndef)
+ continue;
+ if (Mask[i] == SM_SentinelZero) {
+ if (!isUndefOrZero(RepeatedMask[i % LaneSize]))
+ return false;
+ RepeatedMask[i % LaneSize] = SM_SentinelZero;
continue;
}
+ if ((Mask[i] % Size) / LaneSize != i / LaneSize)
+ // This entry crosses lanes, so there is no way to model this shuffle.
+ return false;
- // Scale mask element and increment across each mask element.
- for (int s = 0; s != Scale; ++s)
- ScaledMask[(Scale * i) + s] = (Scale * M) + s;
+ // Ok, handle the in-lane shuffles by detecting if and when they repeat.
+ // Adjust second vector indices to start at LaneSize instead of Size.
+ int LocalM =
+ Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + LaneSize;
+ if (RepeatedMask[i % LaneSize] == SM_SentinelUndef)
+ // This is the first non-undef entry in this slot of a 128-bit lane.
+ RepeatedMask[i % LaneSize] = LocalM;
+ else if (RepeatedMask[i % LaneSize] != LocalM)
+ // Found a mismatch with the repeated mask.
+ return false;
}
+ return true;
}
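
As a standalone illustration of the per-lane repetition test (a simplified model that handles only -1 undef, not the SM_SentinelZero case handled above), the check reduces to the following arithmetic on a plain integer mask; the v8i32 example mask is made up.

#include <cstdio>
#include <vector>

// Returns true if every 128-bit lane of Mask performs the same in-lane
// shuffle; RepeatedMask receives that per-lane pattern (-1 = undef).
static bool isRepeatedLaneMask(int EltsPerLane, const std::vector<int> &Mask,
                               std::vector<int> &RepeatedMask) {
  int Size = (int)Mask.size();
  RepeatedMask.assign(EltsPerLane, -1);
  for (int i = 0; i < Size; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue;
    // Reject elements that cross a lane boundary.
    if ((M % Size) / EltsPerLane != i / EltsPerLane)
      return false;
    // Second-vector indices restart at EltsPerLane instead of Size.
    int LocalM = M < Size ? M % EltsPerLane : M % EltsPerLane + EltsPerLane;
    if (RepeatedMask[i % EltsPerLane] < 0)
      RepeatedMask[i % EltsPerLane] = LocalM;
    else if (RepeatedMask[i % EltsPerLane] != LocalM)
      return false;
  }
  return true;
}

int main() {
  // v8i32 mask <1,0,3,2, 5,4,7,6>: both 128-bit lanes swap adjacent pairs.
  std::vector<int> Mask{1, 0, 3, 2, 5, 4, 7, 6}, Rep;
  std::printf("%d\n", isRepeatedLaneMask(4, Mask, Rep)); // prints 1
}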
/// \brief Checks whether a shuffle mask is equivalent to an explicit list of
@@ -7251,7 +8031,7 @@ static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask,
bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
- int VectorSizeInBits = V1.getValueType().getSizeInBits();
+ int VectorSizeInBits = V1.getValueSizeInBits();
int ScalarSizeInBits = VectorSizeInBits / Mask.size();
assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");
@@ -7309,11 +8089,42 @@ static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask,
return Zeroable;
}
-/// Try to lower a shuffle with a single PSHUFB of V1.
-/// This is only possible if V2 is unused (at all, or only for zero elements).
+// The shuffle result has the form:
+//   0*, a[0], 0*, a[1], ..., 0*, a[n]  with n >= 0, where "0*" stands for
+// zero or more zero elements and the a[i] come from the input vector in
+// ascending order. Each element of Zeroable corresponds to one element of
+// Mask, as described in computeZeroableShuffleElements.
+//
+// The function looks for a sub-mask whose non-zero elements are in
+// increasing order. If such a sub-mask exists, the function returns true.
+static bool isNonZeroElementsInOrder(const SmallBitVector Zeroable,
+ ArrayRef<int> Mask, const EVT &VectorType,
+ bool &IsZeroSideLeft) {
+ int NextElement = -1;
+ // Check if the Mask's nonzero elements are in increasing order.
+ for (int i = 0, e = Zeroable.size(); i < e; i++) {
+ // Give up on undef elements; they are not known to be zero.
+ if (Mask[i] == -1)
+ return false;
+ if (Zeroable[i])
+ continue;
+ // Find the lowest non-zero element.
+ if (NextElement == -1) {
+ NextElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0;
+ IsZeroSideLeft = NextElement != 0;
+ }
+ // Exit if the mask's non-zero elements are not in increasing order.
+ if (NextElement != Mask[i])
+ return false;
+ NextElement++;
+ }
+ return true;
+}
+
+/// Try to lower a shuffle with a single PSHUFB of V1 or V2.
static SDValue lowerVectorShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
ArrayRef<int> Mask, SDValue V1,
SDValue V2,
+ const SmallBitVector &Zeroable,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
int Size = Mask.size();
@@ -7325,12 +8136,11 @@ static SDValue lowerVectorShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
(Subtarget.hasAVX2() && VT.is256BitVector()) ||
(Subtarget.hasBWI() && VT.is512BitVector()));
- SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
-
SmallVector<SDValue, 64> PSHUFBMask(NumBytes);
// Sign bit set in i8 mask means zero element.
SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8);
+ SDValue V;
for (int i = 0; i < NumBytes; ++i) {
int M = Mask[i / NumEltBytes];
if (M < 0) {
@@ -7341,9 +8151,13 @@ static SDValue lowerVectorShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
PSHUFBMask[i] = ZeroMask;
continue;
}
- // Only allow V1.
- if (M >= Size)
+
+ // We can only use a single input of V1 or V2.
+ SDValue SrcV = (M >= Size ? V2 : V1);
+ if (V && V != SrcV)
return SDValue();
+ V = SrcV;
+ M %= Size;
// PSHUFB can't cross lanes, ensure this doesn't happen.
if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize))
@@ -7353,33 +8167,66 @@ static SDValue lowerVectorShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
M = M * NumEltBytes + (i % NumEltBytes);
PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8);
}
+ assert(V && "Failed to find a source input");
MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes);
return DAG.getBitcast(
- VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V1),
+ VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V),
DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
}
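
The byte-mask construction can be sketched outside LLVM as follows; this simplified model assumes a single 128-bit input and folds undef elements into zeroed bytes (which the lowering above does not do), so treat it purely as an illustration of the index scaling and the 0x80 zero marker.

#include <cstdio>
#include <vector>

// Expand an element shuffle mask into a PSHUFB byte mask (-1 = undef element,
// -2 = zero element). A byte value of 0x80 selects a zero byte.
static std::vector<unsigned char> buildPSHUFBMask(const std::vector<int> &Mask,
                                                  int EltBytes) {
  std::vector<unsigned char> Bytes;
  for (int M : Mask) {
    for (int b = 0; b < EltBytes; ++b) {
      if (M < 0) // undef or zero element -> zero the byte (conservative)
        Bytes.push_back(0x80);
      else
        Bytes.push_back((unsigned char)(M * EltBytes + b));
    }
  }
  return Bytes;
}

int main() {
  // v4i32 shuffle <2, 0, zz, 1>: element 2, element 0, a zero, element 1.
  std::vector<int> Mask{2, 0, -2, 1};
  for (unsigned char B : buildPSHUFBMask(Mask, 4))
    std::printf("%02x ", B); // 08 09 0a 0b 00 01 02 03 80 80 80 80 04 05 06 07
  std::printf("\n");
}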
+static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
+ const X86Subtarget &Subtarget, SelectionDAG &DAG,
+ const SDLoc &dl);
+
+// Converts a SmallBitVector into an unsigned integer bitmask.
+// The output is the bitwise NOT of Zeroable, i.e. bit i is set
+// exactly when element i is not zeroable.
+static unsigned convertBitVectorToUnsiged(const SmallBitVector &Zeroable) {
+ unsigned convertBit = 0;
+ for (int i = 0, e = Zeroable.size(); i < e; i++)
+ convertBit |= !(Zeroable[i]) << i;
+ return convertBit;
+}
+
+// X86 has a dedicated shuffle that can be lowered to VEXPAND.
+static SDValue lowerVectorShuffleToEXPAND(const SDLoc &DL, MVT VT,
+ const SmallBitVector &Zeroable,
+ ArrayRef<int> Mask, SDValue &V1,
+ SDValue &V2, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ bool IsLeftZeroSide = true;
+ if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(),
+ IsLeftZeroSide))
+ return SDValue();
+ unsigned VEXPANDMask = convertBitVectorToUnsiged(Zeroable);
+ MVT IntegerType =
+ MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
+ SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType);
+ unsigned NumElts = VT.getVectorNumElements();
+ assert((NumElts == 4 || NumElts == 8 || NumElts == 16) &&
+ "Unexpected number of vector elements");
+ SDValue VMask = getMaskNode(MaskNode, MVT::getVectorVT(MVT::i1, NumElts),
+ Subtarget, DAG, DL);
+ SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL);
+ SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1;
+ return DAG.getNode(ISD::VSELECT, DL, VT, VMask,
+ DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector),
+ ZeroVector);
+}
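
A rough model of the zero-masked expand this lowers to, assuming the execution mask is ~Zeroable as computed above; the v4i32 mask <zz, 0, zz, 1> used in the example is made up.

#include <cstdio>
#include <vector>

// Model of a zero-masked expand: walk the result slots; where the mask bit is
// set, consume the next source element in order, otherwise write zero.
static std::vector<int> expandWithZeroMask(const std::vector<int> &Src,
                                           unsigned MaskBits) {
  std::vector<int> R(Src.size(), 0);
  std::size_t Next = 0;
  for (std::size_t i = 0; i < R.size(); ++i)
    if (MaskBits & (1u << i))
      R[i] = Src[Next++];
  return R;
}

int main() {
  // Shuffle mask <zz, 0, zz, 1> gives Zeroable = {1,0,1,0}, so the execution
  // mask is ~Zeroable = 0b1010.
  std::vector<int> Src{10, 20, 30, 40};
  for (int V : expandWithZeroMask(Src, 0b1010u))
    std::printf("%d ", V); // 0 10 0 20
  std::printf("\n");
}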
+
// X86 has dedicated unpack instructions that can handle specific blend
// operations: UNPCKH and UNPCKL.
static SDValue lowerVectorShuffleWithUNPCK(const SDLoc &DL, MVT VT,
ArrayRef<int> Mask, SDValue V1,
SDValue V2, SelectionDAG &DAG) {
- int NumElts = VT.getVectorNumElements();
- int NumEltsInLane = 128 / VT.getScalarSizeInBits();
- SmallVector<int, 8> Unpckl(NumElts);
- SmallVector<int, 8> Unpckh(NumElts);
-
- for (int i = 0; i < NumElts; ++i) {
- unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane;
- int LoPos = (i % NumEltsInLane) / 2 + LaneStart + NumElts * (i % 2);
- int HiPos = LoPos + NumEltsInLane / 2;
- Unpckl[i] = LoPos;
- Unpckh[i] = HiPos;
- }
-
+ SmallVector<int, 8> Unpckl;
+ createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false);
if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
+
+ SmallVector<int, 8> Unpckh;
+ createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, /* Unary = */ false);
if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
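
Based on the UNPCK semantics (not on the helper's actual code), createUnpackShuffleMask is expected to produce per-128-bit-lane interleavings like the ones below; this sketch only illustrates those masks.

#include <cstdio>
#include <vector>

// Build the shuffle mask matched by UNPCKL/UNPCKH for a vector with NumElts
// elements and EltsPerLane elements per 128-bit lane (binary form: second
// operand's indices are offset by NumElts).
static std::vector<int> unpackMask(int NumElts, int EltsPerLane, bool Lo) {
  std::vector<int> Mask;
  for (int Lane = 0; Lane < NumElts; Lane += EltsPerLane)
    for (int i = 0; i < EltsPerLane / 2; ++i) {
      int Pos = Lane + i + (Lo ? 0 : EltsPerLane / 2);
      Mask.push_back(Pos);           // element from V1
      Mask.push_back(Pos + NumElts); // element from V2
    }
  return Mask;
}

int main() {
  for (int M : unpackMask(4, 4, /*Lo=*/true))
    std::printf("%d ", M); // 0 4 1 5  (v4i32 UNPCKL)
  std::printf("\n");
  for (int M : unpackMask(8, 4, /*Lo=*/false))
    std::printf("%d ", M); // 2 10 3 11 6 14 7 15  (v8i32 UNPCKH, per lane)
  std::printf("\n");
}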
@@ -7401,19 +8248,14 @@ static SDValue lowerVectorShuffleWithUNPCK(const SDLoc &DL, MVT VT,
/// one of the inputs being zeroable.
static SDValue lowerVectorShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
SDValue V2, ArrayRef<int> Mask,
+ const SmallBitVector &Zeroable,
SelectionDAG &DAG) {
+ assert(!VT.isFloatingPoint() && "Floating point types are not supported");
MVT EltVT = VT.getVectorElementType();
- int NumEltBits = EltVT.getSizeInBits();
- MVT IntEltVT = MVT::getIntegerVT(NumEltBits);
- SDValue Zero = DAG.getConstant(0, DL, IntEltVT);
- SDValue AllOnes = DAG.getConstant(APInt::getAllOnesValue(NumEltBits), DL,
- IntEltVT);
- if (EltVT.isFloatingPoint()) {
- Zero = DAG.getBitcast(EltVT, Zero);
- AllOnes = DAG.getBitcast(EltVT, AllOnes);
- }
+ SDValue Zero = DAG.getConstant(0, DL, EltVT);
+ SDValue AllOnes =
+ DAG.getConstant(APInt::getAllOnesValue(EltVT.getSizeInBits()), DL, EltVT);
SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
- SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
SDValue V;
for (int i = 0, Size = Mask.size(); i < Size; ++i) {
if (Zeroable[i])
@@ -7431,10 +8273,7 @@ static SDValue lowerVectorShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
return SDValue(); // No non-zeroable elements!
SDValue VMask = DAG.getBuildVector(VT, DL, VMaskOps);
- V = DAG.getNode(VT.isFloatingPoint()
- ? (unsigned) X86ISD::FAND : (unsigned) ISD::AND,
- DL, VT, V, VMask);
- return V;
+ return DAG.getNode(ISD::AND, DL, VT, V, VMask);
}
/// \brief Try to emit a blend instruction for a shuffle using bit math.
@@ -7476,12 +8315,12 @@ static SDValue lowerVectorShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
/// that the shuffle mask is a blend, or convertible into a blend with zero.
static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
SDValue V2, ArrayRef<int> Original,
+ const SmallBitVector &Zeroable,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
SmallVector<int, 8> Mask(Original.begin(), Original.end());
- SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
bool ForceV1Zero = false, ForceV2Zero = false;
// Attempt to generate the binary blend mask. If an input is zero then
@@ -7540,7 +8379,7 @@ static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
case MVT::v4i64:
case MVT::v8i32:
assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
- // FALLTHROUGH
+ LLVM_FALLTHROUGH;
case MVT::v2i64:
case MVT::v4i32:
// If we have AVX2 it is faster to use VPBLENDD when the shuffle fits into
@@ -7556,7 +8395,7 @@ static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
VT, DAG.getNode(X86ISD::BLENDI, DL, BlendVT, V1, V2,
DAG.getConstant(BlendMask, DL, MVT::i8)));
}
- // FALLTHROUGH
+ LLVM_FALLTHROUGH;
case MVT::v8i16: {
// For integer shuffles we need to expand the mask and cast the inputs to
// v8i16s prior to blending.
@@ -7582,15 +8421,16 @@ static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
DAG.getConstant(BlendMask, DL, MVT::i8));
}
+ LLVM_FALLTHROUGH;
}
- // FALLTHROUGH
case MVT::v16i8:
case MVT::v32i8: {
assert((VT.is128BitVector() || Subtarget.hasAVX2()) &&
"256-bit byte-blends require AVX2 support!");
// Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
- if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, DAG))
+ if (SDValue Masked =
+ lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, DAG))
return Masked;
// Scale the blend by the number of bytes per element.
@@ -7704,32 +8544,12 @@ static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(const SDLoc &DL,
return DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
}
-/// \brief Try to lower a vector shuffle as a byte rotation.
-///
-/// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
-/// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
-/// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
-/// try to generically lower a vector shuffle through such an pattern. It
-/// does not check for the profitability of lowering either as PALIGNR or
-/// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
-/// This matches shuffle vectors that look like:
-///
-/// v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
+/// \brief Try to lower a vector shuffle as a rotation.
///
-/// Essentially it concatenates V1 and V2, shifts right by some number of
-/// elements, and takes the low elements as the result. Note that while this is
-/// specified as a *right shift* because x86 is little-endian, it is a *left
-/// rotate* of the vector lanes.
-static SDValue lowerVectorShuffleAsByteRotate(const SDLoc &DL, MVT VT,
- SDValue V1, SDValue V2,
- ArrayRef<int> Mask,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
- assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
-
+/// This is used to support PALIGNR for SSSE3 and VALIGND/Q for AVX512.
+static int matchVectorShuffleAsRotate(SDValue &V1, SDValue &V2,
+ ArrayRef<int> Mask) {
int NumElts = Mask.size();
- int NumLanes = VT.getSizeInBits() / 128;
- int NumLaneElts = NumElts / NumLanes;
// We need to detect various ways of spelling a rotation:
// [11, 12, 13, 14, 15, 0, 1, 2]
@@ -7740,51 +8560,46 @@ static SDValue lowerVectorShuffleAsByteRotate(const SDLoc &DL, MVT VT,
// [-1, 4, 5, 6, -1, -1, -1, -1]
int Rotation = 0;
SDValue Lo, Hi;
- for (int l = 0; l < NumElts; l += NumLaneElts) {
- for (int i = 0; i < NumLaneElts; ++i) {
- if (Mask[l + i] < 0)
- continue;
-
- // Get the mod-Size index and lane correct it.
- int LaneIdx = (Mask[l + i] % NumElts) - l;
- // Make sure it was in this lane.
- if (LaneIdx < 0 || LaneIdx >= NumLaneElts)
- return SDValue();
+ for (int i = 0; i < NumElts; ++i) {
+ int M = Mask[i];
+ assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) &&
+ "Unexpected mask index.");
+ if (M < 0)
+ continue;
- // Determine where a rotated vector would have started.
- int StartIdx = i - LaneIdx;
- if (StartIdx == 0)
- // The identity rotation isn't interesting, stop.
- return SDValue();
+ // Determine where a rotated vector would have started.
+ int StartIdx = i - (M % NumElts);
+ if (StartIdx == 0)
+ // The identity rotation isn't interesting, stop.
+ return -1;
- // If we found the tail of a vector the rotation must be the missing
- // front. If we found the head of a vector, it must be how much of the
- // head.
- int CandidateRotation = StartIdx < 0 ? -StartIdx : NumLaneElts - StartIdx;
+ // If we found the tail of a vector the rotation must be the missing
+ // front. If we found the head of a vector, it must be how much of the
+ // head.
+ int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
- if (Rotation == 0)
- Rotation = CandidateRotation;
- else if (Rotation != CandidateRotation)
- // The rotations don't match, so we can't match this mask.
- return SDValue();
+ if (Rotation == 0)
+ Rotation = CandidateRotation;
+ else if (Rotation != CandidateRotation)
+ // The rotations don't match, so we can't match this mask.
+ return -1;
- // Compute which value this mask is pointing at.
- SDValue MaskV = Mask[l + i] < NumElts ? V1 : V2;
-
- // Compute which of the two target values this index should be assigned
- // to. This reflects whether the high elements are remaining or the low
- // elements are remaining.
- SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
-
- // Either set up this value if we've not encountered it before, or check
- // that it remains consistent.
- if (!TargetV)
- TargetV = MaskV;
- else if (TargetV != MaskV)
- // This may be a rotation, but it pulls from the inputs in some
- // unsupported interleaving.
- return SDValue();
- }
+ // Compute which value this mask is pointing at.
+ SDValue MaskV = M < NumElts ? V1 : V2;
+
+ // Compute which of the two target values this index should be assigned
+ // to. This reflects whether the high elements are remaining or the low
+ // elements are remaining.
+ SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
+
+ // Either set up this value if we've not encountered it before, or check
+ // that it remains consistent.
+ if (!TargetV)
+ TargetV = MaskV;
+ else if (TargetV != MaskV)
+ // This may be a rotation, but it pulls from the inputs in some
+ // unsupported interleaving.
+ return -1;
}
// Check that we successfully analyzed the mask, and normalize the results.
@@ -7795,23 +8610,75 @@ static SDValue lowerVectorShuffleAsByteRotate(const SDLoc &DL, MVT VT,
else if (!Hi)
Hi = Lo;
+ V1 = Lo;
+ V2 = Hi;
+
+ return Rotation;
+}
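
To make the rotation detection concrete, the same StartIdx/CandidateRotation arithmetic can be run standalone on a plain integer mask (undef = -1); the v8i16 example from the PALIGNR comment below yields a rotation of 3 elements, i.e. 6 bytes. This sketch deliberately omits tracking which input supplies the low and high halves.

#include <cstdio>
#include <vector>

// Return the element rotation implied by Mask on concat(V1, V2), or -1 if the
// mask is not a rotation.
static int matchRotation(const std::vector<int> &Mask) {
  int NumElts = (int)Mask.size();
  int Rotation = 0;
  for (int i = 0; i < NumElts; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue;
    // Determine where a rotated vector would have started.
    int StartIdx = i - (M % NumElts);
    if (StartIdx == 0)
      return -1; // the identity rotation isn't interesting
    int Candidate = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
    if (Rotation == 0)
      Rotation = Candidate;
    else if (Rotation != Candidate)
      return -1; // the rotations don't match
  }
  return Rotation;
}

int main() {
  std::vector<int> Mask{11, 12, 13, 14, 15, 0, 1, 2}; // the v8i16 example
  int R = matchRotation(Mask);
  std::printf("rotation = %d elements, %d bytes\n", R, R * 2); // 3, 6
}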
+
+/// \brief Try to lower a vector shuffle as a byte rotation.
+///
+/// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
+/// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
+/// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
+/// try to generically lower a vector shuffle through such a pattern. It
+/// does not check for the profitability of lowering either as PALIGNR or
+/// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
+/// This matches shuffle vectors that look like:
+///
+/// v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
+///
+/// Essentially it concatenates V1 and V2, shifts right by some number of
+/// elements, and takes the low elements as the result. Note that while this is
+/// specified as a *right shift* because x86 is little-endian, it is a *left
+/// rotate* of the vector lanes.
+static int matchVectorShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
+ ArrayRef<int> Mask) {
+ // Don't accept any shuffles with zero elements.
+ if (any_of(Mask, [](int M) { return M == SM_SentinelZero; }))
+ return -1;
+
+ // PALIGNR works on 128-bit lanes.
+ SmallVector<int, 16> RepeatedMask;
+ if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask))
+ return -1;
+
+ int Rotation = matchVectorShuffleAsRotate(V1, V2, RepeatedMask);
+ if (Rotation <= 0)
+ return -1;
+
+ // PALIGNR rotates bytes, so we need to scale the
+ // rotation based on how many bytes are in the vector lane.
+ int NumElts = RepeatedMask.size();
+ int Scale = 16 / NumElts;
+ return Rotation * Scale;
+}
+
+static SDValue lowerVectorShuffleAsByteRotate(const SDLoc &DL, MVT VT,
+ SDValue V1, SDValue V2,
+ ArrayRef<int> Mask,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
+
+ SDValue Lo = V1, Hi = V2;
+ int ByteRotation = matchVectorShuffleAsByteRotate(VT, Lo, Hi, Mask);
+ if (ByteRotation <= 0)
+ return SDValue();
+
// Cast the inputs to i8 vector of correct length to match PALIGNR or
// PSLLDQ/PSRLDQ.
- MVT ByteVT = MVT::getVectorVT(MVT::i8, 16 * NumLanes);
+ MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
Lo = DAG.getBitcast(ByteVT, Lo);
Hi = DAG.getBitcast(ByteVT, Hi);
- // The actual rotate instruction rotates bytes, so we need to scale the
- // rotation based on how many bytes are in the vector lane.
- int Scale = 16 / NumLaneElts;
-
// SSSE3 targets can use the palignr instruction.
if (Subtarget.hasSSSE3()) {
assert((!VT.is512BitVector() || Subtarget.hasBWI()) &&
"512-bit PALIGNR requires BWI instructions");
return DAG.getBitcast(
VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi,
- DAG.getConstant(Rotation * Scale, DL, MVT::i8)));
+ DAG.getConstant(ByteRotation, DL, MVT::i8)));
}
assert(VT.is128BitVector() &&
@@ -7822,8 +8689,8 @@ static SDValue lowerVectorShuffleAsByteRotate(const SDLoc &DL, MVT VT,
"SSE2 rotate lowering only needed for v16i8!");
// Default SSE2 implementation
- int LoByteShift = 16 - Rotation * Scale;
- int HiByteShift = Rotation * Scale;
+ int LoByteShift = 16 - ByteRotation;
+ int HiByteShift = ByteRotation;
SDValue LoShift = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo,
DAG.getConstant(LoByteShift, DL, MVT::i8));
@@ -7833,6 +8700,37 @@ static SDValue lowerVectorShuffleAsByteRotate(const SDLoc &DL, MVT VT,
DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift));
}
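
The pre-SSSE3 fallback above can be modeled on plain byte arrays: shift the low input toward higher byte indices by (16 - rotation), shift the high input toward lower indices by the rotation, and OR the two. This is only an illustration of the byte movement, not the DAG construction.

#include <array>
#include <cstdio>

using Bytes16 = std::array<unsigned char, 16>;

static Bytes16 pslldq(const Bytes16 &V, int N) { // toward higher indices
  Bytes16 R{};
  for (int i = 0; i < 16; ++i)
    R[i] = (i >= N) ? V[i - N] : 0;
  return R;
}

static Bytes16 psrldq(const Bytes16 &V, int N) { // toward lower indices
  Bytes16 R{};
  for (int i = 0; i < 16; ++i)
    R[i] = (i + N < 16) ? V[i + N] : 0;
  return R;
}

int main() {
  Bytes16 Lo, Hi;
  for (int i = 0; i < 16; ++i) {
    Lo[i] = (unsigned char)i;        // bytes 0..15
    Hi[i] = (unsigned char)(16 + i); // bytes 16..31
  }
  int ByteRotation = 6; // the v8i16 example: 3 elements * 2 bytes
  Bytes16 L = pslldq(Lo, 16 - ByteRotation);
  Bytes16 H = psrldq(Hi, ByteRotation);
  for (int i = 0; i < 16; ++i)
    std::printf("%d ", L[i] | H[i]); // 22..31 followed by 0..5
  std::printf("\n");
}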
+/// \brief Try to lower a vector shuffle as a dword/qword rotation.
+///
+/// AVX512 has VALIGND/VALIGNQ instructions that will do an arbitrary
+/// rotation of the concatenation of two vectors; this routine will
+/// try to generically lower a vector shuffle through such a pattern.
+///
+/// Essentially it concatenates V1 and V2, shifts right by some number of
+/// elements, and takes the low elements as the result. Note that while this is
+/// specified as a *right shift* because x86 is little-endian, it is a *left
+/// rotate* of the vector lanes.
+static SDValue lowerVectorShuffleAsRotate(const SDLoc &DL, MVT VT,
+ SDValue V1, SDValue V2,
+ ArrayRef<int> Mask,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
+ "Only 32-bit and 64-bit elements are supported!");
+
+ // 128/256-bit vectors are only supported with VLX.
+ assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector()))
+ && "VLX required for 128/256-bit vectors");
+
+ SDValue Lo = V1, Hi = V2;
+ int Rotation = matchVectorShuffleAsRotate(Lo, Hi, Mask);
+ if (Rotation <= 0)
+ return SDValue();
+
+ return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi,
+ DAG.getConstant(Rotation, DL, MVT::i8));
+}
+
/// \brief Try to lower a vector shuffle as a bit shift (shifts in zeros).
///
/// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
@@ -7856,14 +8754,13 @@ static SDValue lowerVectorShuffleAsByteRotate(const SDLoc &DL, MVT VT,
/// [ 5, 6, 7, zz, zz, zz, zz, zz]
/// [ -1, 5, 6, 7, zz, zz, zz, zz]
/// [ 1, 2, -1, -1, -1, -1, zz, zz]
-static SDValue lowerVectorShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
- SDValue V2, ArrayRef<int> Mask,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
- SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
-
+static int matchVectorShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
+ unsigned ScalarSizeInBits,
+ ArrayRef<int> Mask, int MaskOffset,
+ const SmallBitVector &Zeroable,
+ const X86Subtarget &Subtarget) {
int Size = Mask.size();
- assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
+ unsigned SizeInBits = Size * ScalarSizeInBits;
auto CheckZeros = [&](int Shift, int Scale, bool Left) {
for (int i = 0; i < Size; i += Scale)
@@ -7874,37 +8771,30 @@ static SDValue lowerVectorShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
return true;
};
- auto MatchShift = [&](int Shift, int Scale, bool Left, SDValue V) {
+ auto MatchShift = [&](int Shift, int Scale, bool Left) {
for (int i = 0; i != Size; i += Scale) {
unsigned Pos = Left ? i + Shift : i;
unsigned Low = Left ? i : i + Shift;
unsigned Len = Scale - Shift;
- if (!isSequentialOrUndefInRange(Mask, Pos, Len,
- Low + (V == V1 ? 0 : Size)))
- return SDValue();
+ if (!isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset))
+ return -1;
}
- int ShiftEltBits = VT.getScalarSizeInBits() * Scale;
+ int ShiftEltBits = ScalarSizeInBits * Scale;
bool ByteShift = ShiftEltBits > 64;
- unsigned OpCode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
- : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
- int ShiftAmt = Shift * VT.getScalarSizeInBits() / (ByteShift ? 8 : 1);
+ Opcode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
+ : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
+ int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1);
// Normalize the scale for byte shifts to still produce an i64 element
// type.
Scale = ByteShift ? Scale / 2 : Scale;
// We need to round trip through the appropriate type for the shift.
- MVT ShiftSVT = MVT::getIntegerVT(VT.getScalarSizeInBits() * Scale);
- MVT ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8)
- : MVT::getVectorVT(ShiftSVT, Size / Scale);
- assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
- "Illegal integer vector type");
- V = DAG.getBitcast(ShiftVT, V);
-
- V = DAG.getNode(OpCode, DL, ShiftVT, V,
- DAG.getConstant(ShiftAmt, DL, MVT::i8));
- return DAG.getBitcast(VT, V);
+ MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale);
+ ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8)
+ : MVT::getVectorVT(ShiftSVT, Size / Scale);
+ return (int)ShiftAmt;
};
// SSE/AVX supports logical shifts up to 64-bit integers - so we can just
@@ -7913,29 +8803,64 @@ static SDValue lowerVectorShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
// their width within the elements of the larger integer vector. Test each
// multiple to see if we can find a match with the moved element indices
// and that the shifted in elements are all zeroable.
- unsigned MaxWidth = (VT.is512BitVector() && !Subtarget.hasBWI() ? 64 : 128);
- for (int Scale = 2; Scale * VT.getScalarSizeInBits() <= MaxWidth; Scale *= 2)
+ unsigned MaxWidth = ((SizeInBits == 512) && !Subtarget.hasBWI() ? 64 : 128);
+ for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2)
for (int Shift = 1; Shift != Scale; ++Shift)
for (bool Left : {true, false})
- if (CheckZeros(Shift, Scale, Left))
- for (SDValue V : {V1, V2})
- if (SDValue Match = MatchShift(Shift, Scale, Left, V))
- return Match;
+ if (CheckZeros(Shift, Scale, Left)) {
+ int ShiftAmt = MatchShift(Shift, Scale, Left);
+ if (0 < ShiftAmt)
+ return ShiftAmt;
+ }
// no match
- return SDValue();
+ return -1;
+}
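
A minimal standalone model of just the left-shift case (zeros shifted in at the low end); it is not the matcher above, which also handles right shifts, a second input via MaskOffset, and shifts wider than the element size. The v4i32 example mask is made up.

#include <cstdio>
#include <vector>

// Does Mask select V1's elements shifted up by ShiftElts slots, with
// zero (-2) or undef (-1) shifted in at the bottom?
static bool isLeftShiftByElts(const std::vector<int> &Mask, int ShiftElts) {
  int Size = (int)Mask.size();
  for (int i = 0; i < Size; ++i) {
    int M = Mask[i];
    if (i < ShiftElts) {
      if (M != -1 && M != -2) // shifted-in slots must be zero or undef
        return false;
    } else if (M != -1 && M != i - ShiftElts) {
      return false; // remaining slots must be V1[i - ShiftElts] (or undef)
    }
  }
  return true;
}

int main() {
  // v4i32 mask <zz, 0, 1, 2> matches a left shift by one element,
  // i.e. VSHLDQ / PSLLDQ by 4 bytes.
  std::vector<int> Mask{-2, 0, 1, 2};
  std::printf("%d\n", isLeftShiftByElts(Mask, 1)); // prints 1
}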
+
+static SDValue lowerVectorShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
+ SDValue V2, ArrayRef<int> Mask,
+ const SmallBitVector &Zeroable,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ int Size = Mask.size();
+ assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
+
+ MVT ShiftVT;
+ SDValue V = V1;
+ unsigned Opcode;
+
+ // Try to match shuffle against V1 shift.
+ int ShiftAmt = matchVectorShuffleAsShift(
+ ShiftVT, Opcode, VT.getScalarSizeInBits(), Mask, 0, Zeroable, Subtarget);
+
+ // If V1 failed, try to match shuffle against V2 shift.
+ if (ShiftAmt < 0) {
+ ShiftAmt =
+ matchVectorShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
+ Mask, Size, Zeroable, Subtarget);
+ V = V2;
+ }
+
+ if (ShiftAmt < 0)
+ return SDValue();
+
+ assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
+ "Illegal integer vector type");
+ V = DAG.getBitcast(ShiftVT, V);
+ V = DAG.getNode(Opcode, DL, ShiftVT, V,
+ DAG.getConstant(ShiftAmt, DL, MVT::i8));
+ return DAG.getBitcast(VT, V);
}
/// \brief Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
static SDValue lowerVectorShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
SDValue V2, ArrayRef<int> Mask,
+ const SmallBitVector &Zeroable,
SelectionDAG &DAG) {
- SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
- assert(!Zeroable.all() && "Fully zeroable shuffle mask");
-
int Size = Mask.size();
int HalfSize = Size / 2;
assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
+ assert(!Zeroable.all() && "Fully zeroable shuffle mask");
// Upper half must be undefined.
if (!isUndefInRange(Mask, HalfSize, HalfSize))
@@ -8111,8 +9036,10 @@ static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
InputV = ShuffleOffset(InputV);
// For 256-bit vectors, we only need the lower (128-bit) input half.
- if (VT.is256BitVector())
- InputV = extract128BitVector(InputV, 0, DAG, DL);
+ // For 512-bit vectors, we only need the lower input half or quarter.
+ if (VT.getSizeInBits() > 128)
+ InputV = extractSubVector(InputV, 0, DAG, DL,
+ std::max(128, (int)VT.getSizeInBits() / Scale));
InputV = DAG.getNode(X86ISD::VZEXT, DL, ExtVT, InputV);
return DAG.getBitcast(VT, InputV);
@@ -8231,9 +9158,8 @@ static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
/// are both incredibly common and often quite performance sensitive.
static SDValue lowerVectorShuffleAsZeroOrAnyExtend(
const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
- const X86Subtarget &Subtarget, SelectionDAG &DAG) {
- SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
-
+ const SmallBitVector &Zeroable, const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
int Bits = VT.getSizeInBits();
int NumLanes = Bits / 128;
int NumElements = VT.getVectorNumElements();
@@ -8388,14 +9314,14 @@ static bool isShuffleFoldableLoad(SDValue V) {
/// across all subtarget feature sets.
static SDValue lowerVectorShuffleAsElementInsertion(
const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
- const X86Subtarget &Subtarget, SelectionDAG &DAG) {
- SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
+ const SmallBitVector &Zeroable, const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
MVT ExtVT = VT;
MVT EltVT = VT.getVectorElementType();
- int V2Index = std::find_if(Mask.begin(), Mask.end(),
- [&Mask](int M) { return M >= (int)Mask.size(); }) -
- Mask.begin();
+ int V2Index =
+ find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) -
+ Mask.begin();
bool IsV1Zeroable = true;
for (int i = 0, Size = Mask.size(); i < Size; ++i)
if (i != V2Index && !Zeroable[i]) {
@@ -8709,6 +9635,13 @@ static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT,
V = DAG.getBitcast(SrcVT, V);
}
+ // 32-bit targets need to load i64 as an f64 and then bitcast the result.
+ if (!Subtarget.is64Bit() && SrcVT == MVT::i64) {
+ V = DAG.getBitcast(MVT::f64, V);
+ unsigned NumBroadcastElts = BroadcastVT.getVectorNumElements();
+ BroadcastVT = MVT::getVectorVT(MVT::f64, NumBroadcastElts);
+ }
+
return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
}
@@ -8726,71 +9659,93 @@ static bool matchVectorShuffleAsInsertPS(SDValue &V1, SDValue &V2,
assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!");
assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
- unsigned ZMask = 0;
- int V1DstIndex = -1;
- int V2DstIndex = -1;
- bool V1UsedInPlace = false;
- for (int i = 0; i < 4; ++i) {
- // Synthesize a zero mask from the zeroable elements (includes undefs).
- if (Zeroable[i]) {
- ZMask |= 1 << i;
- continue;
- }
+ // Attempt to match INSERTPS with one element from VA or VB being
+ // inserted into VA (or undef). If successful, V1, V2 and InsertPSMask
+ // are updated.
+ auto matchAsInsertPS = [&](SDValue VA, SDValue VB,
+ ArrayRef<int> CandidateMask) {
+ unsigned ZMask = 0;
+ int VADstIndex = -1;
+ int VBDstIndex = -1;
+ bool VAUsedInPlace = false;
+
+ for (int i = 0; i < 4; ++i) {
+ // Synthesize a zero mask from the zeroable elements (includes undefs).
+ if (Zeroable[i]) {
+ ZMask |= 1 << i;
+ continue;
+ }
- // Flag if we use any V1 inputs in place.
- if (i == Mask[i]) {
- V1UsedInPlace = true;
- continue;
+ // Flag if we use any VA inputs in place.
+ if (i == CandidateMask[i]) {
+ VAUsedInPlace = true;
+ continue;
+ }
+
+ // We can only insert a single non-zeroable element.
+ if (VADstIndex >= 0 || VBDstIndex >= 0)
+ return false;
+
+ if (CandidateMask[i] < 4) {
+ // VA input out of place for insertion.
+ VADstIndex = i;
+ } else {
+ // VB input for insertion.
+ VBDstIndex = i;
+ }
}
- // We can only insert a single non-zeroable element.
- if (V1DstIndex >= 0 || V2DstIndex >= 0)
+ // Don't bother if we have no (non-zeroable) element for insertion.
+ if (VADstIndex < 0 && VBDstIndex < 0)
return false;
- if (Mask[i] < 4) {
- // V1 input out of place for insertion.
- V1DstIndex = i;
+ // Determine element insertion src/dst indices. The src index is from the
+ // start of the inserted vector, not the start of the concatenated vector.
+ unsigned VBSrcIndex = 0;
+ if (VADstIndex >= 0) {
+ // If we have a VA input out of place, we use VA as the VB element
+ // insertion and don't use the original VB at all.
+ VBSrcIndex = CandidateMask[VADstIndex];
+ VBDstIndex = VADstIndex;
+ VB = VA;
} else {
- // V2 input for insertion.
- V2DstIndex = i;
+ VBSrcIndex = CandidateMask[VBDstIndex] - 4;
}
- }
- // Don't bother if we have no (non-zeroable) element for insertion.
- if (V1DstIndex < 0 && V2DstIndex < 0)
- return false;
+ // If no VA inputs are used in place, then the result is created only from
+ // the zero mask and the VB insertion - so remove the VA dependency.
+ if (!VAUsedInPlace)
+ VA = DAG.getUNDEF(MVT::v4f32);
- // Determine element insertion src/dst indices. The src index is from the
- // start of the inserted vector, not the start of the concatenated vector.
- unsigned V2SrcIndex = 0;
- if (V1DstIndex >= 0) {
- // If we have a V1 input out of place, we use V1 as the V2 element insertion
- // and don't use the original V2 at all.
- V2SrcIndex = Mask[V1DstIndex];
- V2DstIndex = V1DstIndex;
- V2 = V1;
- } else {
- V2SrcIndex = Mask[V2DstIndex] - 4;
- }
+ // Update V1, V2 and InsertPSMask accordingly.
+ V1 = VA;
+ V2 = VB;
- // If no V1 inputs are used in place, then the result is created only from
- // the zero mask and the V2 insertion - so remove V1 dependency.
- if (!V1UsedInPlace)
- V1 = DAG.getUNDEF(MVT::v4f32);
+ // Insert the V2 element into the desired position.
+ InsertPSMask = VBSrcIndex << 6 | VBDstIndex << 4 | ZMask;
+ assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
+ return true;
+ };
- // Insert the V2 element into the desired position.
- InsertPSMask = V2SrcIndex << 6 | V2DstIndex << 4 | ZMask;
- assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
- return true;
+ if (matchAsInsertPS(V1, V2, Mask))
+ return true;
+
+ // Commute and try again.
+ SmallVector<int, 4> CommutedMask(Mask.begin(), Mask.end());
+ ShuffleVectorSDNode::commuteMask(CommutedMask);
+ if (matchAsInsertPS(V2, V1, CommutedMask))
+ return true;
+
+ return false;
}
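
The INSERTPS immediate assembled above packs the source element index into bits 7:6, the destination slot into bits 5:4, and the zero mask into bits 3:0. A small sketch under those assumptions, with a made-up example mask:

#include <cstdio>

// Assemble an INSERTPS immediate: bits 7:6 = source element of the inserted
// operand, bits 5:4 = destination slot, bits 3:0 = result elements to zero.
static unsigned insertPSImm(unsigned SrcIdx, unsigned DstIdx, unsigned ZMask) {
  return (SrcIdx << 6) | (DstIdx << 4) | (ZMask & 0xF);
}

int main() {
  // Insert element 2 of V2 into slot 1 of V1 and zero slot 3; this is what
  // the matcher would produce for a mask like <0, 6, 2, zz>.
  std::printf("0x%02X\n", insertPSImm(2, 1, 0x8)); // 0x98
}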
static SDValue lowerVectorShuffleAsInsertPS(const SDLoc &DL, SDValue V1,
SDValue V2, ArrayRef<int> Mask,
+ const SmallBitVector &Zeroable,
SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
- SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
// Attempt to match the insertps pattern.
unsigned InsertPSMask;
@@ -8922,6 +9877,7 @@ static SDValue lowerVectorShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT,
/// it is better to avoid lowering through this for integer vectors where
/// possible.
static SDValue lowerV2F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ const SmallBitVector &Zeroable,
SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
@@ -8946,8 +9902,11 @@ static SDValue lowerV2F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
DAG.getConstant(SHUFPDMask, DL, MVT::i8));
}
- return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V1,
- DAG.getConstant(SHUFPDMask, DL, MVT::i8));
+ return DAG.getNode(
+ X86ISD::SHUFP, DL, MVT::v2f64,
+ Mask[0] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
+ Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
+ DAG.getConstant(SHUFPDMask, DL, MVT::i8));
}
assert(Mask[0] >= 0 && Mask[0] < 2 && "Non-canonicalized blend!");
assert(Mask[1] >= 2 && "Non-canonicalized blend!");
@@ -8955,14 +9914,14 @@ static SDValue lowerV2F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// If we have a single input, insert that into V1 if we can do so cheaply.
if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) {
if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
- DL, MVT::v2f64, V1, V2, Mask, Subtarget, DAG))
+ DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG))
return Insertion;
// Try inverting the insertion since for v2 masks it is easy to do and we
// can't reliably sort the mask one way or the other.
int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
- DL, MVT::v2f64, V2, V1, InverseMask, Subtarget, DAG))
+ DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
return Insertion;
}
@@ -8980,7 +9939,7 @@ static SDValue lowerV2F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
if (Subtarget.hasSSE41())
if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
- Subtarget, DAG))
+ Zeroable, Subtarget, DAG))
return Blend;
// Use dedicated unpack instructions for masks that match their pattern.
@@ -9000,6 +9959,7 @@ static SDValue lowerV2F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
/// it falls back to the floating point shuffle operation with appropriate bit
/// casting.
static SDValue lowerV2I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ const SmallBitVector &Zeroable,
SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
@@ -9052,19 +10012,19 @@ static SDValue lowerV2I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// Try to use shift instructions.
if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask,
- Subtarget, DAG))
+ Zeroable, Subtarget, DAG))
return Shift;
// When loading a scalar and then shuffling it into a vector we can often do
// the insertion cheaply.
if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
- DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
+ DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG))
return Insertion;
// Try inverting the insertion since for v2 masks it is easy to do and we
// can't reliably sort the mask one way or the other.
int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
- DL, MVT::v2i64, V2, V1, InverseMask, Subtarget, DAG))
+ DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
return Insertion;
// We have different paths for blend lowering, but they all must use the
@@ -9072,7 +10032,7 @@ static SDValue lowerV2I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
bool IsBlendSupported = Subtarget.hasSSE41();
if (IsBlendSupported)
if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
- Subtarget, DAG))
+ Zeroable, Subtarget, DAG))
return Blend;
// Use dedicated unpack instructions for masks that match their pattern.
@@ -9139,9 +10099,7 @@ static SDValue lowerVectorShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
if (NumV2Elements == 1) {
- int V2Index =
- std::find_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }) -
- Mask.begin();
+ int V2Index = find_if(Mask, [](int M) { return M >= 4; }) - Mask.begin();
// Compute the index adjacent to V2Index and in the same half by toggling
// the low bit.
@@ -9220,6 +10178,7 @@ static SDValue lowerVectorShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
/// domain crossing penalties, as these are sufficient to implement all v4f32
/// shuffles.
static SDValue lowerV4F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ const SmallBitVector &Zeroable,
SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
@@ -9262,17 +10221,18 @@ static SDValue lowerV4F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// when the V2 input is targeting element 0 of the mask -- that is the fast
// case here.
if (NumV2Elements == 1 && Mask[0] >= 4)
- if (SDValue V = lowerVectorShuffleAsElementInsertion(DL, MVT::v4f32, V1, V2,
- Mask, Subtarget, DAG))
+ if (SDValue V = lowerVectorShuffleAsElementInsertion(
+ DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG))
return V;
if (Subtarget.hasSSE41()) {
if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
- Subtarget, DAG))
+ Zeroable, Subtarget, DAG))
return Blend;
// Use INSERTPS if we can complete the shuffle efficiently.
- if (SDValue V = lowerVectorShuffleAsInsertPS(DL, V1, V2, Mask, DAG))
+ if (SDValue V =
+ lowerVectorShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG))
return V;
if (!isSingleSHUFPSMask(Mask))
@@ -9301,6 +10261,7 @@ static SDValue lowerV4F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
/// We try to handle these with integer-domain shuffles where we can, but for
/// blends we use the floating point domain blend instructions.
static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ const SmallBitVector &Zeroable,
SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
@@ -9311,8 +10272,8 @@ static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// Whenever we can lower this as a zext, that instruction is strictly faster
// than any alternative. It also allows us to fold memory operands into the
// shuffle in many cases.
- if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2,
- Mask, Subtarget, DAG))
+ if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
+ DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
return ZExt;
int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
@@ -9341,13 +10302,13 @@ static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// Try to use shift instructions.
if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask,
- Subtarget, DAG))
+ Zeroable, Subtarget, DAG))
return Shift;
// There are special ways we can lower some single-element blends.
if (NumV2Elements == 1)
- if (SDValue V = lowerVectorShuffleAsElementInsertion(DL, MVT::v4i32, V1, V2,
- Mask, Subtarget, DAG))
+ if (SDValue V = lowerVectorShuffleAsElementInsertion(
+ DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
return V;
// We have different paths for blend lowering, but they all must use the
@@ -9355,11 +10316,11 @@ static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
bool IsBlendSupported = Subtarget.hasSSE41();
if (IsBlendSupported)
if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
- Subtarget, DAG))
+ Zeroable, Subtarget, DAG))
return Blend;
- if (SDValue Masked =
- lowerVectorShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask, DAG))
+ if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask,
+ Zeroable, DAG))
return Masked;
// Use dedicated unpack instructions for masks that match their pattern.
@@ -9374,26 +10335,31 @@ static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
return Rotate;
- // If we have direct support for blends, we should lower by decomposing into
- // a permute. That will be faster than the domain cross.
- if (IsBlendSupported)
- return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i32, V1, V2,
- Mask, DAG);
-
- // Try to lower by permuting the inputs into an unpack instruction.
- if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(DL, MVT::v4i32, V1,
- V2, Mask, DAG))
- return Unpack;
+ // Assume that a single SHUFPS is faster than an alternative sequence of
+ // multiple instructions (even if the CPU has a domain penalty).
+ // If some CPU is harmed by the domain switch, we can fix it in a later pass.
+ if (!isSingleSHUFPSMask(Mask)) {
+ // If we have direct support for blends, we should lower by decomposing into
+ // a permute. That will be faster than the domain cross.
+ if (IsBlendSupported)
+ return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i32, V1, V2,
+ Mask, DAG);
+
+ // Try to lower by permuting the inputs into an unpack instruction.
+ if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(
+ DL, MVT::v4i32, V1, V2, Mask, DAG))
+ return Unpack;
+ }
// We implement this with SHUFPS because it can blend from two vectors.
// Because we're going to eventually use SHUFPS, we use SHUFPS even to build
// up the inputs, bypassing domain shift penalties that we would incur if we
// directly used PSHUFD on Nehalem and older. For newer chips, this isn't
// relevant.
- return DAG.getBitcast(
- MVT::v4i32,
- DAG.getVectorShuffle(MVT::v4f32, DL, DAG.getBitcast(MVT::v4f32, V1),
- DAG.getBitcast(MVT::v4f32, V2), Mask));
+ SDValue CastV1 = DAG.getBitcast(MVT::v4f32, V1);
+ SDValue CastV2 = DAG.getBitcast(MVT::v4f32, V2);
+ SDValue ShufPS = DAG.getVectorShuffle(MVT::v4f32, DL, CastV1, CastV2, Mask);
+ return DAG.getBitcast(MVT::v4i32, ShufPS);
}
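
When the mask fits a single SHUFPS, the bitcast path above ends up encoding two result lanes from each operand into an 8-bit immediate (two bits per lane). A simplified standalone sketch of that encoding, ignoring undef lanes and the commuted forms the real check also accepts:

#include <cstdio>
#include <vector>

// For a v4 mask whose low two lanes read from V1 (indices 0-3) and whose high
// two lanes read from V2 (indices 4-7), compute the SHUFPS immediate.
static int singleShufpsImm(const std::vector<int> &Mask) {
  if (Mask.size() != 4)
    return -1;
  for (int M : Mask)
    if (M < 0)
      return -1; // keep the sketch simple: no undef lanes
  if (Mask[0] > 3 || Mask[1] > 3 || Mask[2] < 4 || Mask[3] < 4)
    return -1; // low half must read V1 and high half V2 (simplified)
  return (Mask[0] & 3) | ((Mask[1] & 3) << 2) | ((Mask[2] & 3) << 4) |
         ((Mask[3] & 3) << 6);
}

int main() {
  // <1, 3, 6, 4>: result lanes 0-1 from V1, lanes 2-3 from V2.
  std::printf("0x%02X\n", (unsigned)singleShufpsImm({1, 3, 6, 4})); // 0x2D
}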
/// \brief Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
@@ -9551,18 +10517,15 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
ArrayRef<int> Inputs) {
int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
- bool IsFixIdxInput = std::find(Inputs.begin(), Inputs.end(),
- PinnedIdx ^ 1) != Inputs.end();
+ bool IsFixIdxInput = is_contained(Inputs, PinnedIdx ^ 1);
// Determine whether the free index is in the flipped dword or the
// unflipped dword based on where the pinned index is. We use this bit
// in an xor to conditionally select the adjacent dword.
int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
- bool IsFixFreeIdxInput = std::find(Inputs.begin(), Inputs.end(),
- FixFreeIdx) != Inputs.end();
+ bool IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
if (IsFixIdxInput == IsFixFreeIdxInput)
FixFreeIdx += 1;
- IsFixFreeIdxInput = std::find(Inputs.begin(), Inputs.end(),
- FixFreeIdx) != Inputs.end();
+ IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
assert(IsFixIdxInput != IsFixFreeIdxInput &&
"We need to be changing the number of flipped inputs!");
int PSHUFHalfMask[] = {0, 1, 2, 3};
@@ -9734,9 +10697,8 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
// by inputs being moved and *staying* in that half.
if (IncomingInputs.size() == 1) {
if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
- int InputFixed = std::find(std::begin(SourceHalfMask),
- std::end(SourceHalfMask), -1) -
- std::begin(SourceHalfMask) + SourceOffset;
+ int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) +
+ SourceOffset;
SourceHalfMask[InputFixed - SourceOffset] =
IncomingInputs[0] - SourceOffset;
std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0],
@@ -9868,8 +10830,8 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
/// blend if only one input is used.
static SDValue lowerVectorShuffleAsBlendOfPSHUFBs(
const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
- SelectionDAG &DAG, bool &V1InUse, bool &V2InUse) {
- SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
+ const SmallBitVector &Zeroable, SelectionDAG &DAG, bool &V1InUse,
+ bool &V2InUse) {
SDValue V1Mask[16];
SDValue V2Mask[16];
V1InUse = false;
@@ -9929,6 +10891,7 @@ static SDValue lowerVectorShuffleAsBlendOfPSHUFBs(
/// halves of the inputs separately (making them have relatively few inputs)
/// and then concatenate them.
static SDValue lowerV8I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ const SmallBitVector &Zeroable,
SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
@@ -9939,7 +10902,7 @@ static SDValue lowerV8I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// Whenever we can lower this as a zext, that instruction is strictly faster
// than any alternative.
if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
- DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
+ DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
return ZExt;
int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });
@@ -9952,7 +10915,7 @@ static SDValue lowerV8I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// Try to use shift instructions.
if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask,
- Subtarget, DAG))
+ Zeroable, Subtarget, DAG))
return Shift;
// Use dedicated unpack instructions for masks that match their pattern.
@@ -9978,18 +10941,19 @@ static SDValue lowerV8I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// Try to use shift instructions.
if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask,
- Subtarget, DAG))
+ Zeroable, Subtarget, DAG))
return Shift;
// See if we can use SSE4A Extraction / Insertion.
if (Subtarget.hasSSE4A())
- if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask, DAG))
+ if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask,
+ Zeroable, DAG))
return V;
// There are special ways we can lower some single-element blends.
if (NumV2Inputs == 1)
- if (SDValue V = lowerVectorShuffleAsElementInsertion(DL, MVT::v8i16, V1, V2,
- Mask, Subtarget, DAG))
+ if (SDValue V = lowerVectorShuffleAsElementInsertion(
+ DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
return V;
// We have different paths for blend lowering, but they all must use the
@@ -9997,11 +10961,11 @@ static SDValue lowerV8I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
bool IsBlendSupported = Subtarget.hasSSE41();
if (IsBlendSupported)
if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
- Subtarget, DAG))
+ Zeroable, Subtarget, DAG))
return Blend;
- if (SDValue Masked =
- lowerVectorShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask, DAG))
+ if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask,
+ Zeroable, DAG))
return Masked;
// Use dedicated unpack instructions for masks that match their pattern.
@@ -10027,14 +10991,14 @@ static SDValue lowerV8I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// can both shuffle and set up the inefficient blend.
if (!IsBlendSupported && Subtarget.hasSSSE3()) {
bool V1InUse, V2InUse;
- return lowerVectorShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask, DAG,
- V1InUse, V2InUse);
+ return lowerVectorShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask,
+ Zeroable, DAG, V1InUse, V2InUse);
}
// We can always bit-blend if we have to so the fallback strategy is to
// decompose into single-input permutes and blends.
return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i16, V1, V2,
- Mask, DAG);
+ Mask, DAG);
}
/// \brief Check whether a compaction lowering can be done by dropping even
@@ -10111,6 +11075,7 @@ static int canLowerByDroppingEvenElements(ArrayRef<int> Mask,
/// the existing lowering for v8i16 blends on each half, finally PACK-ing them
/// back together.
static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ const SmallBitVector &Zeroable,
SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
@@ -10120,7 +11085,7 @@ static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// Try to use shift instructions.
if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask,
- Subtarget, DAG))
+ Zeroable, Subtarget, DAG))
return Shift;
// Try to use byte rotation instructions.
@@ -10130,12 +11095,13 @@ static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// Try to use a zext lowering.
if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
- DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
+ DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
return ZExt;
// See if we can use SSE4A Extraction / Insertion.
if (Subtarget.hasSSE4A())
- if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask, DAG))
+ if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask,
+ Zeroable, DAG))
return V;
int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
@@ -10238,8 +11204,8 @@ static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
return V;
}
- if (SDValue Masked =
- lowerVectorShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask, DAG))
+ if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask,
+ Zeroable, DAG))
return Masked;
// Use dedicated unpack instructions for masks that match their pattern.
@@ -10265,15 +11231,15 @@ static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
bool V2InUse = false;
SDValue PSHUFB = lowerVectorShuffleAsBlendOfPSHUFBs(
- DL, MVT::v16i8, V1, V2, Mask, DAG, V1InUse, V2InUse);
+ DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse);
// If both V1 and V2 are in use and we can use a direct blend or an unpack,
// do so. This avoids using them to handle blends-with-zero which is
// important as a single pshufb is significantly faster for that.
if (V1InUse && V2InUse) {
if (Subtarget.hasSSE41())
- if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i8, V1, V2,
- Mask, Subtarget, DAG))
+ if (SDValue Blend = lowerVectorShuffleAsBlend(
+ DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
return Blend;
// We can use an unpack to do the blending rather than an or in some
@@ -10294,8 +11260,8 @@ static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// There are special ways we can lower some single-element blends.
if (NumV2Elements == 1)
- if (SDValue V = lowerVectorShuffleAsElementInsertion(DL, MVT::v16i8, V1, V2,
- Mask, Subtarget, DAG))
+ if (SDValue V = lowerVectorShuffleAsElementInsertion(
+ DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
return V;
if (SDValue BitBlend =
@@ -10349,22 +11315,18 @@ static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// with a pack.
SDValue V = V1;
- int LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
- int HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
+ std::array<int, 8> LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
+ std::array<int, 8> HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
for (int i = 0; i < 16; ++i)
if (Mask[i] >= 0)
(i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];
- SDValue Zero = getZeroVector(MVT::v8i16, Subtarget, DAG, DL);
-
SDValue VLoHalf, VHiHalf;
// Check if any of the odd lanes in the v16i8 are used. If not, we can mask
// them out and avoid using UNPCK{L,H} to extract the elements of V as
// i16s.
- if (std::none_of(std::begin(LoBlendMask), std::end(LoBlendMask),
- [](int M) { return M >= 0 && M % 2 == 1; }) &&
- std::none_of(std::begin(HiBlendMask), std::end(HiBlendMask),
- [](int M) { return M >= 0 && M % 2 == 1; })) {
+ if (none_of(LoBlendMask, [](int M) { return M >= 0 && M % 2 == 1; }) &&
+ none_of(HiBlendMask, [](int M) { return M >= 0 && M % 2 == 1; })) {
// Use a mask to drop the high bytes.
VLoHalf = DAG.getBitcast(MVT::v8i16, V);
VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
@@ -10383,6 +11345,8 @@ static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
} else {
// Otherwise just unpack the low half of V into VLoHalf and the high half into
// VHiHalf so that we can blend them as i16s.
+ SDValue Zero = getZeroVector(MVT::v16i8, Subtarget, DAG, DL);
+
VLoHalf = DAG.getBitcast(
MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
VHiHalf = DAG.getBitcast(
@@ -10401,83 +11365,28 @@ static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
/// dispatches to the lowering routines accordingly.
static SDValue lower128BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
MVT VT, SDValue V1, SDValue V2,
+ const SmallBitVector &Zeroable,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
switch (VT.SimpleTy) {
case MVT::v2i64:
- return lowerV2I64VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
+ return lowerV2I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v2f64:
- return lowerV2F64VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
+ return lowerV2F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v4i32:
- return lowerV4I32VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
+ return lowerV4I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v4f32:
- return lowerV4F32VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
+ return lowerV4F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v8i16:
- return lowerV8I16VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
+ return lowerV8I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v16i8:
- return lowerV16I8VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
+ return lowerV16I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
default:
llvm_unreachable("Unimplemented!");
}
}
-/// \brief Helper function to test whether a shuffle mask could be
-/// simplified by widening the elements being shuffled.
-///
-/// Appends the mask for wider elements in WidenedMask if valid. Otherwise
-/// leaves it in an unspecified state.
-///
-/// NOTE: This must handle normal vector shuffle masks and *target* vector
-/// shuffle masks. The latter have the special property of a '-2' representing
-/// a zero-ed lane of a vector.
-static bool canWidenShuffleElements(ArrayRef<int> Mask,
- SmallVectorImpl<int> &WidenedMask) {
- WidenedMask.assign(Mask.size() / 2, 0);
- for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
- // If both elements are undef, its trivial.
- if (Mask[i] == SM_SentinelUndef && Mask[i + 1] == SM_SentinelUndef) {
- WidenedMask[i/2] = SM_SentinelUndef;
- continue;
- }
-
- // Check for an undef mask and a mask value properly aligned to fit with
- // a pair of values. If we find such a case, use the non-undef mask's value.
- if (Mask[i] == SM_SentinelUndef && Mask[i + 1] >= 0 && Mask[i + 1] % 2 == 1) {
- WidenedMask[i/2] = Mask[i + 1] / 2;
- continue;
- }
- if (Mask[i + 1] == SM_SentinelUndef && Mask[i] >= 0 && Mask[i] % 2 == 0) {
- WidenedMask[i/2] = Mask[i] / 2;
- continue;
- }
-
- // When zeroing, we need to spread the zeroing across both lanes to widen.
- if (Mask[i] == SM_SentinelZero || Mask[i + 1] == SM_SentinelZero) {
- if ((Mask[i] == SM_SentinelZero || Mask[i] == SM_SentinelUndef) &&
- (Mask[i + 1] == SM_SentinelZero || Mask[i + 1] == SM_SentinelUndef)) {
- WidenedMask[i/2] = SM_SentinelZero;
- continue;
- }
- return false;
- }
-
- // Finally check if the two mask values are adjacent and aligned with
- // a pair.
- if (Mask[i] != SM_SentinelUndef && Mask[i] % 2 == 0 && Mask[i] + 1 == Mask[i + 1]) {
- WidenedMask[i/2] = Mask[i] / 2;
- continue;
- }
-
- // Otherwise we can't safely widen the elements used in this shuffle.
- return false;
- }
- assert(WidenedMask.size() == Mask.size() / 2 &&
- "Incorrect size of mask after widening the elements!");
-
- return true;
-}
-
/// \brief Generic routine to split vector shuffle into half-sized shuffles.
///
/// This routine just extracts two subvectors, shuffles them independently, and
@@ -10712,15 +11621,20 @@ static SDValue lowerVectorShuffleAsLanePermuteAndBlend(const SDLoc &DL, MVT VT,
/// \brief Handle lowering 2-lane 128-bit shuffles.
static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
SDValue V2, ArrayRef<int> Mask,
+ const SmallBitVector &Zeroable,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
+ SmallVector<int, 4> WidenedMask;
+ if (!canWidenShuffleElements(Mask, WidenedMask))
+ return SDValue();
+
// TODO: If minimizing size and one of the inputs is a zero vector and the
- // zero vector has only one use, we could use a VPERM2X128 to save the
// instruction bytes needed to explicitly generate the zero vector.
// Blends are faster and handle all the non-lane-crossing cases.
if (SDValue Blend = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask,
- Subtarget, DAG))
+ Zeroable, Subtarget, DAG))
return Blend;
bool IsV1Zero = ISD::isBuildVectorAllZeros(V1.getNode());
@@ -10761,15 +11675,10 @@ static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
// [6] - ignore
// [7] - zero high half of destination
- int MaskLO = Mask[0];
- if (MaskLO == SM_SentinelUndef)
- MaskLO = Mask[1] == SM_SentinelUndef ? 0 : Mask[1];
-
- int MaskHI = Mask[2];
- if (MaskHI == SM_SentinelUndef)
- MaskHI = Mask[3] == SM_SentinelUndef ? 0 : Mask[3];
+ int MaskLO = WidenedMask[0] < 0 ? 0 : WidenedMask[0];
+ int MaskHI = WidenedMask[1] < 0 ? 0 : WidenedMask[1];
- unsigned PermMask = MaskLO / 2 | (MaskHI / 2) << 4;
+ unsigned PermMask = MaskLO | (MaskHI << 4);
// If either input is a zero vector, replace it with an undef input.
// Shuffle mask values < 4 are selecting elements of V1.
@@ -10778,16 +11687,16 @@ static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
// selecting the zero vector and setting the zero mask bit.
if (IsV1Zero) {
V1 = DAG.getUNDEF(VT);
- if (MaskLO < 4)
+ if (MaskLO < 2)
PermMask = (PermMask & 0xf0) | 0x08;
- if (MaskHI < 4)
+ if (MaskHI < 2)
PermMask = (PermMask & 0x0f) | 0x80;
}
if (IsV2Zero) {
V2 = DAG.getUNDEF(VT);
- if (MaskLO >= 4)
+ if (MaskLO >= 2)
PermMask = (PermMask & 0xf0) | 0x08;
- if (MaskHI >= 4)
+ if (MaskHI >= 2)
PermMask = (PermMask & 0x0f) | 0x80;
}
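As a rough standalone illustration of the control-byte encoding used above (plain C++, independent of LLVM; perm2x128Immediate and the zero-input flags are invented for the example), the widened two-lane mask maps to a VPERM2X128-style immediate as follows:

#include <cassert>

// Compute a VPERM2X128-style control byte from a widened 2-lane mask.
// Lane selectors 0-1 pick 128-bit halves of the first source, 2-3 of the
// second; a negative selector means "don't care" and defaults to 0.
static unsigned perm2x128Immediate(int WidenedLo, int WidenedHi,
                                   bool V1IsZero, bool V2IsZero) {
  int MaskLO = WidenedLo < 0 ? 0 : WidenedLo;
  int MaskHI = WidenedHi < 0 ? 0 : WidenedHi;
  unsigned PermMask = MaskLO | (MaskHI << 4);

  // When a selected source is known to be all zeros, use the "zero" encoding
  // instead: bit 3 zeroes the low half of the result, bit 7 the high half.
  if (V1IsZero) {
    if (MaskLO < 2)
      PermMask = (PermMask & 0xf0) | 0x08;
    if (MaskHI < 2)
      PermMask = (PermMask & 0x0f) | 0x80;
  }
  if (V2IsZero) {
    if (MaskLO >= 2)
      PermMask = (PermMask & 0xf0) | 0x08;
    if (MaskHI >= 2)
      PermMask = (PermMask & 0x0f) | 0x80;
  }
  return PermMask;
}

int main() {
  // <A.hi, B.lo> from two 256-bit inputs: widened mask {1, 2} -> 0x21.
  assert(perm2x128Immediate(1, 2, false, false) == 0x21);
  return 0;
}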
@@ -11178,35 +12087,65 @@ static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(
SubLaneMask);
}
-static SDValue lowerVectorShuffleWithSHUFPD(const SDLoc &DL, MVT VT,
- ArrayRef<int> Mask, SDValue V1,
- SDValue V2, SelectionDAG &DAG) {
+static bool matchVectorShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
+ unsigned &ShuffleImm,
+ ArrayRef<int> Mask) {
+ int NumElts = VT.getVectorNumElements();
+ assert(VT.getScalarType() == MVT::f64 &&
+ (NumElts == 2 || NumElts == 4 || NumElts == 8) &&
+ "Unexpected data type for VSHUFPD");
// Mask for V8F64: 0/1, 8/9, 2/3, 10/11, 4/5, ..
// Mask for V4F64: 0/1, 4/5, 2/3, 6/7..
- assert(VT.getScalarSizeInBits() == 64 && "Unexpected data type for VSHUFPD");
- int NumElts = VT.getVectorNumElements();
+ ShuffleImm = 0;
bool ShufpdMask = true;
bool CommutableMask = true;
- unsigned Immediate = 0;
for (int i = 0; i < NumElts; ++i) {
- if (Mask[i] < 0)
+ if (Mask[i] == SM_SentinelUndef)
continue;
+ if (Mask[i] < 0)
+ return false;
int Val = (i & 6) + NumElts * (i & 1);
- int CommutVal = (i & 0xe) + NumElts * ((i & 1)^1);
- if (Mask[i] < Val || Mask[i] > Val + 1)
+ int CommutVal = (i & 0xe) + NumElts * ((i & 1) ^ 1);
+ if (Mask[i] < Val || Mask[i] > Val + 1)
ShufpdMask = false;
- if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1)
+ if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1)
CommutableMask = false;
- Immediate |= (Mask[i] % 2) << i;
+ ShuffleImm |= (Mask[i] % 2) << i;
}
+
if (ShufpdMask)
- return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
- DAG.getConstant(Immediate, DL, MVT::i8));
- if (CommutableMask)
- return DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
- DAG.getConstant(Immediate, DL, MVT::i8));
- return SDValue();
+ return true;
+ if (CommutableMask) {
+ std::swap(V1, V2);
+ return true;
+ }
+
+ return false;
+}
+
+static SDValue lowerVectorShuffleWithSHUFPD(const SDLoc &DL, MVT VT,
+ ArrayRef<int> Mask, SDValue V1,
+ SDValue V2, SelectionDAG &DAG) {
+ unsigned Immediate = 0;
+ if (!matchVectorShuffleWithSHUFPD(VT, V1, V2, Immediate, Mask))
+ return SDValue();
+
+ return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
+ DAG.getConstant(Immediate, DL, MVT::i8));
+}
+
+static SDValue lowerVectorShuffleWithPERMV(const SDLoc &DL, MVT VT,
+ ArrayRef<int> Mask, SDValue V1,
+ SDValue V2, SelectionDAG &DAG) {
+ MVT MaskEltVT = MVT::getIntegerVT(VT.getScalarSizeInBits());
+ MVT MaskVecVT = MVT::getVectorVT(MaskEltVT, VT.getVectorNumElements());
+
+ SDValue MaskNode = getConstVector(Mask, MaskVecVT, DAG, DL, true);
+ if (V2.isUndef())
+ return DAG.getNode(X86ISD::VPERMV, DL, VT, MaskNode, V1);
+
+ return DAG.getNode(X86ISD::VPERMV3, DL, VT, V1, MaskNode, V2);
}
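A minimal standalone sketch of the SHUFPD matching above (plain C++, not LLVM code; matchShufpd and its parameters are invented for illustration) shows how the per-pair bounds checks and the immediate fall out of an f64 shuffle mask:

#include <cstdio>
#include <vector>

// Each even result element must come from the corresponding element pair of
// the first source and each odd element from the second source (or the
// commuted arrangement); the immediate records the low/high pick per element.
static bool matchShufpd(const std::vector<int> &Mask, unsigned &Imm,
                        bool &Swap) {
  int NumElts = (int)Mask.size(); // 2, 4 or 8 f64 elements
  Imm = 0;
  bool Direct = true, Commuted = true;
  for (int i = 0; i < NumElts; ++i) {
    if (Mask[i] < 0)
      continue; // undef lane matches anything
    int Val = (i & 6) + NumElts * (i & 1);
    int CommutVal = (i & 0xe) + NumElts * ((i & 1) ^ 1);
    if (Mask[i] < Val || Mask[i] > Val + 1)
      Direct = false;
    if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1)
      Commuted = false;
    Imm |= (Mask[i] % 2) << i;
  }
  if (!Direct && !Commuted)
    return false;
  Swap = !Direct; // the commuted form needs the two sources swapped
  return true;
}

int main() {
  unsigned Imm;
  bool Swap;
  // v4f64 mask {0, 5, 2, 7} -> SHUFPD with immediate 0b1010 (0xa), no swap.
  if (matchShufpd({0, 5, 2, 7}, Imm, Swap))
    std::printf("imm=0x%x swap=%d\n", Imm, Swap);
  return 0;
}

When only the commuted form matches, the caller swaps the two source operands before emitting the node, mirroring the std::swap in the patch.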
/// \brief Handle lowering of 4-lane 64-bit floating point shuffles.
@@ -11214,6 +12153,7 @@ static SDValue lowerVectorShuffleWithSHUFPD(const SDLoc &DL, MVT VT,
/// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
/// isn't available.
static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ const SmallBitVector &Zeroable,
SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
@@ -11221,11 +12161,9 @@ static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
- SmallVector<int, 4> WidenedMask;
- if (canWidenShuffleElements(Mask, WidenedMask))
- if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4f64, V1, V2, Mask,
- Subtarget, DAG))
- return V;
+ if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4f64, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
+ return V;
if (V2.isUndef()) {
// Check for being able to broadcast a single element.
@@ -11268,7 +12206,7 @@ static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
return V;
if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
- Subtarget, DAG))
+ Zeroable, Subtarget, DAG))
return Blend;
// Check if the blend happens to exactly fit that of SHUFPD.
@@ -11280,7 +12218,7 @@ static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// the results into the target lanes.
if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
- return V;
+ return V;
// Try to simplify this by merging 128-bit lanes to enable a lane-based
// shuffle. However, if we have AVX2 and either inputs are already in place,
@@ -11291,6 +12229,11 @@ static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
return Result;
+ // If we have VLX support, we can use VEXPAND.
+ if (Subtarget.hasVLX())
+ if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v4f64, Zeroable, Mask,
+ V1, V2, DAG, Subtarget))
+ return V;
// If we have AVX2 then we always want to lower with a blend because an v4 we
// can fully permute the elements.
@@ -11307,6 +12250,7 @@ static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
/// This routine is only called when we have AVX2 and thus a reasonable
/// instruction set for v4i64 shuffling.
static SDValue lowerV4I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ const SmallBitVector &Zeroable,
SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
@@ -11315,14 +12259,12 @@ static SDValue lowerV4I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!");
- SmallVector<int, 4> WidenedMask;
- if (canWidenShuffleElements(Mask, WidenedMask))
- if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4i64, V1, V2, Mask,
- Subtarget, DAG))
- return V;
+ if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4i64, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
+ return V;
if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
- Subtarget, DAG))
+ Zeroable, Subtarget, DAG))
return Blend;
// Check for being able to broadcast a single element.
@@ -11352,9 +12294,25 @@ static SDValue lowerV4I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// Try to use shift instructions.
if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask,
- Subtarget, DAG))
+ Zeroable, Subtarget, DAG))
return Shift;
+ // If we have VLX support, we can use VALIGN or VEXPAND.
+ if (Subtarget.hasVLX()) {
+ if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v4i64, V1, V2,
+ Mask, Subtarget, DAG))
+ return Rotate;
+
+ if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v4i64, Zeroable, Mask,
+ V1, V2, DAG, Subtarget))
+ return V;
+ }
+
+ // Try to use PALIGNR.
+ if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v4i64, V1, V2,
+ Mask, Subtarget, DAG))
+ return Rotate;
+
// Use dedicated unpack instructions for masks that match their pattern.
if (SDValue V =
lowerVectorShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG))
@@ -11364,8 +12322,8 @@ static SDValue lowerV4I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// shuffle. However, if we have AVX2 and either inputs are already in place,
// we will be able to shuffle even across lanes the other input in a single
// instruction so skip this pattern.
- if (!(Subtarget.hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
- isShuffleMaskInputInPlace(1, Mask))))
+ if (!isShuffleMaskInputInPlace(0, Mask) &&
+ !isShuffleMaskInputInPlace(1, Mask))
if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
return Result;
@@ -11380,6 +12338,7 @@ static SDValue lowerV4I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
/// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
/// isn't available.
static SDValue lowerV8F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ const SmallBitVector &Zeroable,
SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
@@ -11388,7 +12347,7 @@ static SDValue lowerV8F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
- Subtarget, DAG))
+ Zeroable, Subtarget, DAG))
return Blend;
// Check for being able to broadcast a single element.
@@ -11432,17 +12391,12 @@ static SDValue lowerV8F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// If we have a single input shuffle with different shuffle patterns in the
// two 128-bit lanes use the variable mask to VPERMILPS.
if (V2.isUndef()) {
- SDValue VPermMask[8];
- for (int i = 0; i < 8; ++i)
- VPermMask[i] = Mask[i] < 0 ? DAG.getUNDEF(MVT::i32)
- : DAG.getConstant(Mask[i], DL, MVT::i32);
+ SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask))
- return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1,
- DAG.getBuildVector(MVT::v8i32, DL, VPermMask));
+ return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, VPermMask);
if (Subtarget.hasAVX2())
- return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32,
- DAG.getBuildVector(MVT::v8i32, DL, VPermMask), V1);
+ return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1);
// Otherwise, fall back.
return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v8f32, V1, V2, Mask,
@@ -11454,6 +12408,11 @@ static SDValue lowerV8F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
return Result;
+ // If we have VLX support, we can use VEXPAND.
+ if (Subtarget.hasVLX())
+ if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8f32, Zeroable, Mask,
+ V1, V2, DAG, Subtarget))
+ return V;
// If we have AVX2 then we always want to lower with a blend because at v8 we
// can fully permute the elements.
@@ -11470,6 +12429,7 @@ static SDValue lowerV8F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
/// This routine is only called when we have AVX2 and thus a reasonable
/// instruction set for v8i32 shuffling.
static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ const SmallBitVector &Zeroable,
SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
@@ -11481,12 +12441,12 @@ static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// Whenever we can lower this as a zext, that instruction is strictly faster
// than any alternative. It also allows us to fold memory operands into the
// shuffle in many cases.
- if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2,
- Mask, Subtarget, DAG))
+ if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
+ DL, MVT::v8i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
return ZExt;
if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
- Subtarget, DAG))
+ Zeroable, Subtarget, DAG))
return Blend;
// Check for being able to broadcast a single element.
@@ -11498,7 +12458,9 @@ static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// efficient instructions that mirror the shuffles across the two 128-bit
// lanes.
SmallVector<int, 4> RepeatedMask;
- if (is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask)) {
+ bool Is128BitLaneRepeatedShuffle =
+ is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask);
+ if (Is128BitLaneRepeatedShuffle) {
assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
if (V2.isUndef())
return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
@@ -11512,16 +12474,27 @@ static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// Try to use shift instructions.
if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask,
- Subtarget, DAG))
+ Zeroable, Subtarget, DAG))
return Shift;
+ // If we have VLX support, we can use VALIGN or VEXPAND.
+ if (Subtarget.hasVLX()) {
+ if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v8i32, V1, V2,
+ Mask, Subtarget, DAG))
+ return Rotate;
+
+ if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8i32, Zeroable, Mask,
+ V1, V2, DAG, Subtarget))
+ return V;
+ }
+
// Try to use byte rotation instructions.
if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
return Rotate;
// Try to create an in-lane repeating shuffle mask and then shuffle the
- // the results into the target lanes.
+ // results into the target lanes.
if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
return V;
@@ -11529,12 +12502,19 @@ static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// If the shuffle patterns aren't repeated but it is a single input, directly
// generate a cross-lane VPERMD instruction.
if (V2.isUndef()) {
- SDValue VPermMask[8];
- for (int i = 0; i < 8; ++i)
- VPermMask[i] = Mask[i] < 0 ? DAG.getUNDEF(MVT::i32)
- : DAG.getConstant(Mask[i], DL, MVT::i32);
- return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32,
- DAG.getBuildVector(MVT::v8i32, DL, VPermMask), V1);
+ SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
+ return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1);
+ }
+
+ // Assume that a single SHUFPS is faster than an alternative sequence of
+ // multiple instructions (even if the CPU has a domain penalty).
+ // If some CPU is harmed by the domain switch, we can fix it in a later pass.
+ if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
+ SDValue CastV1 = DAG.getBitcast(MVT::v8f32, V1);
+ SDValue CastV2 = DAG.getBitcast(MVT::v8f32, V2);
+ SDValue ShufPS = lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask,
+ CastV1, CastV2, DAG);
+ return DAG.getBitcast(MVT::v8i32, ShufPS);
}
// Try to simplify this by merging 128-bit lanes to enable a lane-based
@@ -11553,6 +12533,7 @@ static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
/// This routine is only called when we have AVX2 and thus a reasonable
/// instruction set for v16i16 shuffling.
static SDValue lowerV16I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ const SmallBitVector &Zeroable,
SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
@@ -11564,8 +12545,8 @@ static SDValue lowerV16I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// Whenever we can lower this as a zext, that instruction is strictly faster
// than any alternative. It also allows us to fold memory operands into the
// shuffle in many cases.
- if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v16i16, V1, V2,
- Mask, Subtarget, DAG))
+ if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
+ DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
return ZExt;
// Check for being able to broadcast a single element.
@@ -11574,7 +12555,7 @@ static SDValue lowerV16I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
return Broadcast;
if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
- Subtarget, DAG))
+ Zeroable, Subtarget, DAG))
return Blend;
// Use dedicated unpack instructions for masks that match their pattern.
@@ -11584,7 +12565,7 @@ static SDValue lowerV16I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// Try to use shift instructions.
if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask,
- Subtarget, DAG))
+ Zeroable, Subtarget, DAG))
return Shift;
// Try to use byte rotation instructions.
@@ -11615,10 +12596,14 @@ static SDValue lowerV16I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
}
}
- if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(DL, MVT::v16i16, Mask, V1,
- V2, Subtarget, DAG))
+ if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
+ DL, MVT::v16i16, Mask, V1, V2, Zeroable, Subtarget, DAG))
return PSHUFB;
+ // AVX512BWVL can lower to VPERMW.
+ if (Subtarget.hasBWI() && Subtarget.hasVLX())
+ return lowerVectorShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, DAG);
+
// Try to simplify this by merging 128-bit lanes to enable a lane-based
// shuffle.
if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
@@ -11634,6 +12619,7 @@ static SDValue lowerV16I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
/// This routine is only called when we have AVX2 and thus a reasonable
/// instruction set for v32i8 shuffling.
static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ const SmallBitVector &Zeroable,
SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
@@ -11645,8 +12631,8 @@ static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// Whenever we can lower this as a zext, that instruction is strictly faster
// than any alternative. It also allows us to fold memory operands into the
// shuffle in many cases.
- if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v32i8, V1, V2,
- Mask, Subtarget, DAG))
+ if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
+ DL, MVT::v32i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
return ZExt;
// Check for being able to broadcast a single element.
@@ -11655,7 +12641,7 @@ static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
return Broadcast;
if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
- Subtarget, DAG))
+ Zeroable, Subtarget, DAG))
return Blend;
// Use dedicated unpack instructions for masks that match their pattern.
@@ -11665,7 +12651,7 @@ static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// Try to use shift instructions.
if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask,
- Subtarget, DAG))
+ Zeroable, Subtarget, DAG))
return Shift;
// Try to use byte rotation instructions.
@@ -11685,8 +12671,8 @@ static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2, Mask,
DAG);
- if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(DL, MVT::v32i8, Mask, V1,
- V2, Subtarget, DAG))
+ if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
+ DL, MVT::v32i8, Mask, V1, V2, Zeroable, Subtarget, DAG))
return PSHUFB;
// Try to simplify this by merging 128-bit lanes to enable a lane-based
@@ -11706,6 +12692,7 @@ static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
/// together based on the available instructions.
static SDValue lower256BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
MVT VT, SDValue V1, SDValue V2,
+ const SmallBitVector &Zeroable,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
// If we have a single input to the zero element, insert that into V1 if we
@@ -11715,7 +12702,7 @@ static SDValue lower256BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
if (NumV2Elements == 1 && Mask[0] >= NumElts)
if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
- DL, VT, V1, V2, Mask, Subtarget, DAG))
+ DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
return Insertion;
// Handle special cases where the lower or upper half is UNDEF.
@@ -11734,7 +12721,8 @@ static SDValue lower256BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
if (ElementBits < 32) {
// No floating point type available, if we can't use the bit operations
// for masking/blending then decompose into 128-bit vectors.
- if (SDValue V = lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, DAG))
+ if (SDValue V =
+ lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, DAG))
return V;
if (SDValue V = lowerVectorShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
return V;
@@ -11750,17 +12738,17 @@ static SDValue lower256BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
switch (VT.SimpleTy) {
case MVT::v4f64:
- return lowerV4F64VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
+ return lowerV4F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v4i64:
- return lowerV4I64VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
+ return lowerV4I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v8f32:
- return lowerV8F32VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
+ return lowerV8F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v8i32:
- return lowerV8I32VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
+ return lowerV8I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v16i16:
- return lowerV16I16VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
+ return lowerV16I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v32i8:
- return lowerV32I8VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
+ return lowerV32I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
default:
llvm_unreachable("Not a valid 256-bit x86 vector type!");
@@ -11782,57 +12770,81 @@ static SDValue lowerV4X128VectorShuffle(const SDLoc &DL, MVT VT,
if (!canWidenShuffleElements(Mask, WidenedMask))
return SDValue();
+ // Check for patterns which can be matched with a single insert of a 256-bit
+ // subvector.
+ bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask,
+ {0, 1, 2, 3, 0, 1, 2, 3});
+ if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask,
+ {0, 1, 2, 3, 8, 9, 10, 11})) {
+ MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4);
+ SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
+ DAG.getIntPtrConstant(0, DL));
+ SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
+ OnlyUsesV1 ? V1 : V2,
+ DAG.getIntPtrConstant(0, DL));
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
+ }
+
+ assert(WidenedMask.size() == 4);
+
+ // See if this is an insertion of the lower 128 bits of V2 into V1.
+ bool IsInsert = true;
+ int V2Index = -1;
+ for (int i = 0; i < 4; ++i) {
+ assert(WidenedMask[i] >= -1);
+ if (WidenedMask[i] < 0)
+ continue;
+
+ // Make sure all V1 subvectors are in place.
+ if (WidenedMask[i] < 4) {
+ if (WidenedMask[i] != i) {
+ IsInsert = false;
+ break;
+ }
+ } else {
+ // Make sure we only have a single V2 index and it's the lowest 128 bits.
+ if (V2Index >= 0 || WidenedMask[i] != 4) {
+ IsInsert = false;
+ break;
+ }
+ V2Index = i;
+ }
+ }
+ if (IsInsert && V2Index >= 0) {
+ MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
+ SDValue Subvec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
+ DAG.getIntPtrConstant(0, DL));
+ return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL);
+ }
+
+ // Try to lower to vshuf64x2/vshuf32x4.
SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
+ unsigned PermMask = 0;
// Ensure elements came from the same Op.
- int MaxOp1Index = VT.getVectorNumElements()/2 - 1;
- for (int i = 0, Size = WidenedMask.size(); i < Size; ++i) {
- if (WidenedMask[i] == SM_SentinelZero)
- return SDValue();
- if (WidenedMask[i] == SM_SentinelUndef)
+ for (int i = 0; i < 4; ++i) {
+ assert(WidenedMask[i] >= -1);
+ if (WidenedMask[i] < 0)
continue;
- SDValue Op = WidenedMask[i] > MaxOp1Index ? V2 : V1;
- unsigned OpIndex = (i < Size/2) ? 0 : 1;
+ SDValue Op = WidenedMask[i] >= 4 ? V2 : V1;
+ unsigned OpIndex = i / 2;
if (Ops[OpIndex].isUndef())
Ops[OpIndex] = Op;
else if (Ops[OpIndex] != Op)
return SDValue();
- }
-
- // Form a 128-bit permutation.
- // Convert the 64-bit shuffle mask selection values into 128-bit selection
- // bits defined by a vshuf64x2 instruction's immediate control byte.
- unsigned PermMask = 0, Imm = 0;
- unsigned ControlBitsNum = WidenedMask.size() / 2;
- for (int i = 0, Size = WidenedMask.size(); i < Size; ++i) {
- // Use first element in place of undef mask.
- Imm = (WidenedMask[i] == SM_SentinelUndef) ? 0 : WidenedMask[i];
- PermMask |= (Imm % WidenedMask.size()) << (i * ControlBitsNum);
+ // Convert the 128-bit shuffle mask selection values into 128-bit selection
+ // bits defined by a vshuf64x2 instruction's immediate control byte.
+ PermMask |= (WidenedMask[i] % 4) << (i * 2);
}
return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
DAG.getConstant(PermMask, DL, MVT::i8));
}
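For reference, here is a standalone sketch of the SHUF128 immediate computation above (plain C++, independent of LLVM; shuf128Immediate is an invented name). It makes explicit that each 256-bit half of the result must draw from a single source and that every widened lane contributes two bits to the control byte:

#include <cstdio>

// WidenedMask holds four 128-bit lane selectors: 0-3 read from the first
// source, 4-7 from the second, negative means undef. On success, Imm is the
// vshuf64x2/vshuf32x4 immediate and UseV2[h] says which source feeds result
// half h (0 = low 256 bits, 1 = high 256 bits).
static bool shuf128Immediate(const int WidenedMask[4], unsigned &Imm,
                             bool UseV2[2]) {
  const int Unset = -1;
  int OpForHalf[2] = {Unset, Unset};
  Imm = 0;
  for (int i = 0; i < 4; ++i) {
    if (WidenedMask[i] < 0)
      continue;                           // undef lane
    int Op = WidenedMask[i] >= 4 ? 1 : 0; // which source this lane reads
    int Half = i / 2;
    if (OpForHalf[Half] == Unset)
      OpForHalf[Half] = Op;
    else if (OpForHalf[Half] != Op)
      return false;                       // this half mixes both sources
    Imm |= (unsigned)(WidenedMask[i] % 4) << (i * 2);
  }
  UseV2[0] = OpForHalf[0] == 1;
  UseV2[1] = OpForHalf[1] == 1;
  return true;
}

int main() {
  // Low half from V1 lanes 0-1, high half from V2 lanes 0-1 -> 0x44.
  int Mask[4] = {0, 1, 4, 5};
  unsigned Imm;
  bool UseV2[2];
  if (shuf128Immediate(Mask, Imm, UseV2))
    std::printf("imm=0x%x lowFromV2=%d highFromV2=%d\n", Imm, UseV2[0],
                UseV2[1]);
  return 0;
}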
-static SDValue lowerVectorShuffleWithPERMV(const SDLoc &DL, MVT VT,
- ArrayRef<int> Mask, SDValue V1,
- SDValue V2, SelectionDAG &DAG) {
-
- assert(VT.getScalarSizeInBits() >= 16 && "Unexpected data type for PERMV");
-
- MVT MaskEltVT = MVT::getIntegerVT(VT.getScalarSizeInBits());
- MVT MaskVecVT = MVT::getVectorVT(MaskEltVT, VT.getVectorNumElements());
-
- SDValue MaskNode = getConstVector(Mask, MaskVecVT, DAG, DL, true);
- if (V2.isUndef())
- return DAG.getNode(X86ISD::VPERMV, DL, VT, MaskNode, V1);
-
- return DAG.getNode(X86ISD::VPERMV3, DL, VT, V1, MaskNode, V2);
-}
-
/// \brief Handle lowering of 8-lane 64-bit floating point shuffles.
static SDValue lowerV8F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ const SmallBitVector &Zeroable,
SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
@@ -11875,11 +12887,16 @@ static SDValue lowerV8F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
lowerVectorShuffleWithSHUFPD(DL, MVT::v8f64, Mask, V1, V2, DAG))
return Op;
+ if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8f64, Zeroable, Mask, V1,
+ V2, DAG, Subtarget))
+ return V;
+
return lowerVectorShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, DAG);
}
/// \brief Handle lowering of 16-lane 32-bit floating point shuffles.
static SDValue lowerV16F32VectorShuffle(SDLoc DL, ArrayRef<int> Mask,
+ const SmallBitVector &Zeroable,
SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
@@ -11911,12 +12928,17 @@ static SDValue lowerV16F32VectorShuffle(SDLoc DL, ArrayRef<int> Mask,
// Otherwise, fall back to a SHUFPS sequence.
return lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
}
+ // If we have AVX512F support, we can use VEXPAND.
+ if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v16f32, Zeroable, Mask,
+ V1, V2, DAG, Subtarget))
+ return V;
return lowerVectorShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, DAG);
}
/// \brief Handle lowering of 8-lane 64-bit integer shuffles.
static SDValue lowerV8I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ const SmallBitVector &Zeroable,
SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
@@ -11951,18 +12973,33 @@ static SDValue lowerV8I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// Try to use shift instructions.
if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask,
- Subtarget, DAG))
+ Zeroable, Subtarget, DAG))
return Shift;
+ // Try to use VALIGN.
+ if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v8i64, V1, V2,
+ Mask, Subtarget, DAG))
+ return Rotate;
+
+ // Try to use PALIGNR.
+ if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i64, V1, V2,
+ Mask, Subtarget, DAG))
+ return Rotate;
+
if (SDValue Unpck =
lowerVectorShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG))
return Unpck;
+ // If we have AVX512F support, we can use VEXPAND.
+ if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8i64, Zeroable, Mask, V1,
+ V2, DAG, Subtarget))
+ return V;
return lowerVectorShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, DAG);
}
/// \brief Handle lowering of 16-lane 32-bit integer shuffles.
static SDValue lowerV16I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ const SmallBitVector &Zeroable,
SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
@@ -11970,11 +13007,20 @@ static SDValue lowerV16I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
+ // Whenever we can lower this as a zext, that instruction is strictly faster
+ // than any alternative. It also allows us to fold memory operands into the
+ // shuffle in many cases.
+ if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
+ DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
+ return ZExt;
+
// If the shuffle mask is repeated in each 128-bit lane we can use more
// efficient instructions that mirror the shuffles across the four 128-bit
// lanes.
SmallVector<int, 4> RepeatedMask;
- if (is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask)) {
+ bool Is128BitLaneRepeatedShuffle =
+ is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask);
+ if (Is128BitLaneRepeatedShuffle) {
assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
if (V2.isUndef())
return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1,
@@ -11988,20 +13034,40 @@ static SDValue lowerV16I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// Try to use shift instructions.
if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask,
- Subtarget, DAG))
+ Zeroable, Subtarget, DAG))
return Shift;
+ // Try to use VALIGN.
+ if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v16i32, V1, V2,
+ Mask, Subtarget, DAG))
+ return Rotate;
+
// Try to use byte rotation instructions.
if (Subtarget.hasBWI())
if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))
return Rotate;
+ // Assume that a single SHUFPS is faster than using a permv shuffle.
+ // If some CPU is harmed by the domain switch, we can fix it in a later pass.
+ if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
+ SDValue CastV1 = DAG.getBitcast(MVT::v16f32, V1);
+ SDValue CastV2 = DAG.getBitcast(MVT::v16f32, V2);
+ SDValue ShufPS = lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask,
+ CastV1, CastV2, DAG);
+ return DAG.getBitcast(MVT::v16i32, ShufPS);
+ }
+ // If we have AVX512F support, we can use VEXPAND.
+ if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v16i32, Zeroable, Mask,
+ V1, V2, DAG, Subtarget))
+ return V;
+
return lowerVectorShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, DAG);
}
/// \brief Handle lowering of 32-lane 16-bit integer shuffles.
static SDValue lowerV32I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ const SmallBitVector &Zeroable,
SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
@@ -12010,6 +13076,13 @@ static SDValue lowerV32I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
+ // Whenever we can lower this as a zext, that instruction is strictly faster
+ // than any alternative. It also allows us to fold memory operands into the
+ // shuffle in many cases.
+ if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
+ DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
+ return ZExt;
+
// Use dedicated unpack instructions for masks that match their pattern.
if (SDValue V =
lowerVectorShuffleWithUNPCK(DL, MVT::v32i16, Mask, V1, V2, DAG))
@@ -12017,7 +13090,7 @@ static SDValue lowerV32I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// Try to use shift instructions.
if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask,
- Subtarget, DAG))
+ Zeroable, Subtarget, DAG))
return Shift;
// Try to use byte rotation instructions.
@@ -12041,6 +13114,7 @@ static SDValue lowerV32I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
/// \brief Handle lowering of 64-lane 8-bit integer shuffles.
static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ const SmallBitVector &Zeroable,
SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
@@ -12049,6 +13123,13 @@ static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
+ // Whenever we can lower this as a zext, that instruction is strictly faster
+ // than any alternative. It also allows us to fold memory operands into the
+ // shuffle in many cases.
+ if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
+ DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
+ return ZExt;
+
// Use dedicated unpack instructions for masks that match their pattern.
if (SDValue V =
lowerVectorShuffleWithUNPCK(DL, MVT::v64i8, Mask, V1, V2, DAG))
@@ -12056,7 +13137,7 @@ static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// Try to use shift instructions.
if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask,
- Subtarget, DAG))
+ Zeroable, Subtarget, DAG))
return Shift;
// Try to use byte rotation instructions.
@@ -12064,10 +13145,20 @@ static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
return Rotate;
- if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(DL, MVT::v64i8, Mask, V1,
- V2, Subtarget, DAG))
+ if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
+ DL, MVT::v64i8, Mask, V1, V2, Zeroable, Subtarget, DAG))
return PSHUFB;
+ // VBMI can use VPERMV/VPERMV3 byte shuffles.
+ if (Subtarget.hasVBMI())
+ return lowerVectorShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, DAG);
+
+ // Try to create an in-lane repeating shuffle mask and then shuffle the
+ // results into the target lanes.
+ if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
+ DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
+ return V;
+
// FIXME: Implement direct support for this type!
return splitAndLowerVectorShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG);
}
@@ -12079,11 +13170,22 @@ static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
/// together based on the available instructions.
static SDValue lower512BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
MVT VT, SDValue V1, SDValue V2,
+ const SmallBitVector &Zeroable,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(Subtarget.hasAVX512() &&
"Cannot lower 512-bit vectors w/ basic ISA!");
+ // If we have a single input to the zero element, insert that into V1 if we
+ // can do so cheaply.
+ int NumElts = Mask.size();
+ int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
+
+ if (NumV2Elements == 1 && Mask[0] >= NumElts)
+ if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
+ DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
+ return Insertion;
+
// Check for being able to broadcast a single element.
if (SDValue Broadcast =
lowerVectorShuffleAsBroadcast(DL, VT, V1, V2, Mask, Subtarget, DAG))
@@ -12095,17 +13197,17 @@ static SDValue lower512BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// the requisite ISA extensions for that element type are available.
switch (VT.SimpleTy) {
case MVT::v8f64:
- return lowerV8F64VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
+ return lowerV8F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v16f32:
- return lowerV16F32VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
+ return lowerV16F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v8i64:
- return lowerV8I64VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
+ return lowerV8I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v16i32:
- return lowerV16I32VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
+ return lowerV16I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v32i16:
- return lowerV32I16VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
+ return lowerV32I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v64i8:
- return lowerV64I8VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
+ return lowerV64I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
default:
llvm_unreachable("Not a valid 512-bit x86 vector type!");
@@ -12161,9 +13263,81 @@ static SDValue lower1BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
V2 = getOnesVector(ExtVT, Subtarget, DAG, DL);
else
V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);
- return DAG.getNode(ISD::TRUNCATE, DL, VT,
- DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask));
+
+ SDValue Shuffle = DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask);
+ // Since i1 was sign extended, we can use X86ISD::CVT2MASK.
+ int NumElems = VT.getVectorNumElements();
+ if ((Subtarget.hasBWI() && (NumElems >= 32)) ||
+ (Subtarget.hasDQI() && (NumElems < 32)))
+ return DAG.getNode(X86ISD::CVT2MASK, DL, VT, Shuffle);
+
+ return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle);
}
+
+/// Helper function that returns true if the shuffle mask should be
+/// commuted to improve canonicalization.
+static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) {
+ int NumElements = Mask.size();
+
+ int NumV1Elements = 0, NumV2Elements = 0;
+ for (int M : Mask)
+ if (M < 0)
+ continue;
+ else if (M < NumElements)
+ ++NumV1Elements;
+ else
+ ++NumV2Elements;
+
+ // Commute the shuffle as needed such that more elements come from V1 than
+ // V2. This allows us to match the shuffle pattern strictly on how many
+ // elements come from V1 without handling the symmetric cases.
+ if (NumV2Elements > NumV1Elements)
+ return true;
+
+ assert(NumV1Elements > 0 && "No V1 indices");
+
+ if (NumV2Elements == 0)
+ return false;
+
+ // When the number of V1 and V2 elements are the same, try to minimize the
+ // number of uses of V2 in the low half of the vector. When that is tied,
+ // ensure that the sum of indices for V1 is equal to or lower than the sum
+ // of indices for V2. When those are equal, try to ensure that the number of odd
+ // indices for V1 is lower than the number of odd indices for V2.
+ if (NumV1Elements == NumV2Elements) {
+ int LowV1Elements = 0, LowV2Elements = 0;
+ for (int M : Mask.slice(0, NumElements / 2))
+ if (M >= NumElements)
+ ++LowV2Elements;
+ else if (M >= 0)
+ ++LowV1Elements;
+ if (LowV2Elements > LowV1Elements)
+ return true;
+ if (LowV2Elements == LowV1Elements) {
+ int SumV1Indices = 0, SumV2Indices = 0;
+ for (int i = 0, Size = Mask.size(); i < Size; ++i)
+ if (Mask[i] >= NumElements)
+ SumV2Indices += i;
+ else if (Mask[i] >= 0)
+ SumV1Indices += i;
+ if (SumV2Indices < SumV1Indices)
+ return true;
+ if (SumV2Indices == SumV1Indices) {
+ int NumV1OddIndices = 0, NumV2OddIndices = 0;
+ for (int i = 0, Size = Mask.size(); i < Size; ++i)
+ if (Mask[i] >= NumElements)
+ NumV2OddIndices += i % 2;
+ else if (Mask[i] >= 0)
+ NumV1OddIndices += i % 2;
+ if (NumV2OddIndices < NumV1OddIndices)
+ return true;
+ }
+ }
+ }
+
+ return false;
+}
+
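A standalone sketch of the commute heuristic above (plain C++, not LLVM code; shouldCommute is an invented name, and an all-undef mask is simply treated as "no commute" instead of asserting) may help when reading the tie-breaking rules:

#include <cstdio>
#include <vector>

// Mask indices 0..N-1 pick from V1, N..2N-1 from V2, negative means undef.
static bool shouldCommute(const std::vector<int> &Mask) {
  int N = (int)Mask.size();
  int NumV1 = 0, NumV2 = 0;
  for (int M : Mask) {
    if (M < 0)
      continue;
    (M < N ? NumV1 : NumV2) += 1;
  }
  // Prefer shuffles where more elements come from V1.
  if (NumV2 > NumV1)
    return true;
  if (NumV2 < NumV1 || NumV2 == 0)
    return false;
  // Tie-breakers, mirroring the code above: commute when V2 is used more in
  // the low half, when V2's index sum is lower, or when V2 uses fewer odd
  // positions.
  int LowV1 = 0, LowV2 = 0;
  for (int i = 0; i < N / 2; ++i) {
    if (Mask[i] >= N)
      ++LowV2;
    else if (Mask[i] >= 0)
      ++LowV1;
  }
  if (LowV2 != LowV1)
    return LowV2 > LowV1;
  int SumV1 = 0, SumV2 = 0;
  for (int i = 0; i < N; ++i) {
    if (Mask[i] >= N)
      SumV2 += i;
    else if (Mask[i] >= 0)
      SumV1 += i;
  }
  if (SumV2 != SumV1)
    return SumV2 < SumV1;
  int OddV1 = 0, OddV2 = 0;
  for (int i = 0; i < N; ++i) {
    if (Mask[i] >= N)
      OddV2 += i % 2;
    else if (Mask[i] >= 0)
      OddV1 += i % 2;
  }
  return OddV2 < OddV1;
}

int main() {
  // v4 mask {4, 5, 6, 3}: three elements come from V2, so commute.
  std::printf("%d\n", shouldCommute({4, 5, 6, 3})); // prints 1
  return 0;
}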
/// \brief Top-level lowering for x86 vector shuffles.
///
/// This handles decomposition, canonicalization, and lowering of all x86
@@ -12209,6 +13383,12 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget,
return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
}
+ // Check for illegal shuffle mask element index values.
+ int MaskUpperLimit = Mask.size() * (V2IsUndef ? 1 : 2); (void)MaskUpperLimit;
+ assert(llvm::all_of(Mask,
+ [&](int M) { return -1 <= M && M < MaskUpperLimit; }) &&
+ "Out of bounds shuffle index");
+
// We actually see shuffles that are entirely re-arrangements of a set of
// zero inputs. This mostly happens while decomposing complex shuffles into
// simple ones. Directly lower these as a buildvector of zeros.
@@ -12237,69 +13417,22 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget,
}
}
- int NumV1Elements = 0, NumUndefElements = 0, NumV2Elements = 0;
- for (int M : Mask)
- if (M < 0)
- ++NumUndefElements;
- else if (M < NumElements)
- ++NumV1Elements;
- else
- ++NumV2Elements;
-
- // Commute the shuffle as needed such that more elements come from V1 than
- // V2. This allows us to match the shuffle pattern strictly on how many
- // elements come from V1 without handling the symmetric cases.
- if (NumV2Elements > NumV1Elements)
+ // Commute the shuffle if it will improve canonicalization.
+ if (canonicalizeShuffleMaskWithCommute(Mask))
return DAG.getCommutedVectorShuffle(*SVOp);
- assert(NumV1Elements > 0 && "No V1 indices");
- assert((NumV2Elements > 0 || V2IsUndef) && "V2 not undef, but not used");
-
- // When the number of V1 and V2 elements are the same, try to minimize the
- // number of uses of V2 in the low half of the vector. When that is tied,
- // ensure that the sum of indices for V1 is equal to or lower than the sum
- // indices for V2. When those are equal, try to ensure that the number of odd
- // indices for V1 is lower than the number of odd indices for V2.
- if (NumV1Elements == NumV2Elements) {
- int LowV1Elements = 0, LowV2Elements = 0;
- for (int M : Mask.slice(0, NumElements / 2))
- if (M >= NumElements)
- ++LowV2Elements;
- else if (M >= 0)
- ++LowV1Elements;
- if (LowV2Elements > LowV1Elements)
- return DAG.getCommutedVectorShuffle(*SVOp);
- if (LowV2Elements == LowV1Elements) {
- int SumV1Indices = 0, SumV2Indices = 0;
- for (int i = 0, Size = Mask.size(); i < Size; ++i)
- if (Mask[i] >= NumElements)
- SumV2Indices += i;
- else if (Mask[i] >= 0)
- SumV1Indices += i;
- if (SumV2Indices < SumV1Indices)
- return DAG.getCommutedVectorShuffle(*SVOp);
- if (SumV2Indices == SumV1Indices) {
- int NumV1OddIndices = 0, NumV2OddIndices = 0;
- for (int i = 0, Size = Mask.size(); i < Size; ++i)
- if (Mask[i] >= NumElements)
- NumV2OddIndices += i % 2;
- else if (Mask[i] >= 0)
- NumV1OddIndices += i % 2;
- if (NumV2OddIndices < NumV1OddIndices)
- return DAG.getCommutedVectorShuffle(*SVOp);
- }
- }
- }
-
// For each vector width, delegate to a specialized lowering routine.
if (VT.is128BitVector())
- return lower128BitVectorShuffle(DL, Mask, VT, V1, V2, Subtarget, DAG);
+ return lower128BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
+ DAG);
if (VT.is256BitVector())
- return lower256BitVectorShuffle(DL, Mask, VT, V1, V2, Subtarget, DAG);
+ return lower256BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
+ DAG);
if (VT.is512BitVector())
- return lower512BitVectorShuffle(DL, Mask, VT, V1, V2, Subtarget, DAG);
+ return lower512BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
+ DAG);
if (Is1BitVector)
return lower1BitVectorShuffle(DL, Mask, VT, V1, V2, Subtarget, DAG);
@@ -12392,21 +13525,6 @@ static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
}
- if (VT.getSizeInBits() == 16) {
- // If Idx is 0, it's cheaper to do a move instead of a pextrw.
- if (isNullConstant(Op.getOperand(1)))
- return DAG.getNode(
- ISD::TRUNCATE, dl, MVT::i16,
- DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
- DAG.getBitcast(MVT::v4i32, Op.getOperand(0)),
- Op.getOperand(1)));
- SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32,
- Op.getOperand(0), Op.getOperand(1));
- SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
- DAG.getValueType(VT));
- return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
- }
-
if (VT == MVT::f32) {
// EXTRACTPS outputs to a GPR32 register which will require a movd to copy
// the result back to FR32 register. It's only worth matching if the
@@ -12432,6 +13550,7 @@ static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
if (isa<ConstantSDNode>(Op.getOperand(1)))
return Op;
}
+
return SDValue();
}
@@ -12460,7 +13579,8 @@ X86TargetLowering::ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const
}
unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
- if (!Subtarget.hasDQI() && (VecVT.getVectorNumElements() <= 8)) {
+ if ((!Subtarget.hasDQI() && (VecVT.getVectorNumElements() == 8)) ||
+ (VecVT.getVectorNumElements() < 8)) {
// Use kshiftlw/rw instruction.
VecVT = MVT::v16i1;
Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT,
@@ -12469,8 +13589,9 @@ X86TargetLowering::ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const
DAG.getIntPtrConstant(0, dl));
}
unsigned MaxSift = VecVT.getVectorNumElements() - 1;
- Vec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, Vec,
- DAG.getConstant(MaxSift - IdxVal, dl, MVT::i8));
+ if (MaxSift - IdxVal)
+ Vec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, Vec,
+ DAG.getConstant(MaxSift - IdxVal, dl, MVT::i8));
Vec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, Vec,
DAG.getConstant(MaxSift, dl, MVT::i8));
return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i1, Vec,
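The shift pair above isolates bit IdxVal of the mask register. A standalone sketch on a plain 16-bit integer (not LLVM code; extractBit16 is an invented name) shows the same idea, including the new guard that skips the left shift when the requested bit is already the top bit:

#include <cassert>
#include <cstdint>

static bool extractBit16(uint16_t Vec, unsigned IdxVal) {
  const unsigned MaxShift = 15; // NumElems - 1 for a 16-bit mask
  // Move the wanted bit to the top (unless it is already there), then shift
  // it down to bit 0.
  if (MaxShift - IdxVal)
    Vec = (uint16_t)(Vec << (MaxShift - IdxVal));
  Vec = (uint16_t)(Vec >> MaxShift);
  return Vec & 1;
}

int main() {
  assert(extractBit16(0x0010, 4) == true);  // bit 4 set
  assert(extractBit16(0xFFEF, 4) == false); // bit 4 clear
  return 0;
}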
@@ -12491,10 +13612,10 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
if (!isa<ConstantSDNode>(Idx)) {
if (VecVT.is512BitVector() ||
(VecVT.is256BitVector() && Subtarget.hasInt256() &&
- VecVT.getVectorElementType().getSizeInBits() == 32)) {
+ VecVT.getScalarSizeInBits() == 32)) {
MVT MaskEltVT =
- MVT::getIntegerVT(VecVT.getVectorElementType().getSizeInBits());
+ MVT::getIntegerVT(VecVT.getScalarSizeInBits());
MVT MaskVT = MVT::getVectorVT(MaskEltVT, VecVT.getSizeInBits() /
MaskEltVT.getSizeInBits());
@@ -12531,26 +13652,31 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
assert(VecVT.is128BitVector() && "Unexpected vector length");
- if (Subtarget.hasSSE41())
- if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
- return Res;
-
MVT VT = Op.getSimpleValueType();
- // TODO: handle v16i8.
+
if (VT.getSizeInBits() == 16) {
- if (IdxVal == 0)
+ // If IdxVal is 0, it's cheaper to do a move instead of a pextrw, unless
+ // we're going to zero extend the register or fold the store (SSE41 only).
+ if (IdxVal == 0 && !MayFoldIntoZeroExtend(Op) &&
+ !(Subtarget.hasSSE41() && MayFoldIntoStore(Op)))
return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
DAG.getBitcast(MVT::v4i32, Vec), Idx));
// Transform it so it matches pextrw, which produces a 32-bit result.
- MVT EltVT = MVT::i32;
- SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EltVT, Vec, Idx);
- SDValue Assert = DAG.getNode(ISD::AssertZext, dl, EltVT, Extract,
+ SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32,
+ Op.getOperand(0), Op.getOperand(1));
+ SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
DAG.getValueType(VT));
return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
}
+ if (Subtarget.hasSSE41())
+ if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
+ return Res;
+
+ // TODO: handle v16i8.
+
if (VT.getSizeInBits() == 32) {
if (IdxVal == 0)
return Op;
@@ -12604,12 +13730,46 @@ X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const {
unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Elt);
- if (IdxVal)
+ unsigned NumElems = VecVT.getVectorNumElements();
+
+ if (Vec.isUndef()) {
+ if (IdxVal)
+ EltInVec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, EltInVec,
+ DAG.getConstant(IdxVal, dl, MVT::i8));
+ return EltInVec;
+ }
+
+ // Insertion of one bit into the first or last position
+ // can be done with two SHIFTs + OR.
+ if (IdxVal == 0) {
+ // EltInVec is already at the correct index and the other bits are 0.
+ // Clear the first bit in the source vector.
+ Vec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, Vec,
+ DAG.getConstant(1, dl, MVT::i8));
+ Vec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, Vec,
+ DAG.getConstant(1, dl, MVT::i8));
+
+ return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec);
+ }
+ if (IdxVal == NumElems - 1) {
+ // Move the bit to the last position inside the vector.
EltInVec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, EltInVec,
DAG.getConstant(IdxVal, dl, MVT::i8));
- if (Vec.isUndef())
- return EltInVec;
- return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec);
+ // Clear the last bit in the source vector.
+ Vec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, Vec,
+ DAG.getConstant(1, dl, MVT::i8));
+ Vec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, Vec,
+ DAG.getConstant(1, dl, MVT::i8));
+
+ return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec);
+ }
+
+ // Use shuffle to insert element.
+ SmallVector<int, 64> MaskVec(NumElems);
+ for (unsigned i = 0; i != NumElems; ++i)
+ MaskVec[i] = (i == IdxVal) ? NumElems : i;
+
+ return DAG.getVectorShuffle(VecVT, dl, Vec, EltInVec, MaskVec);
}
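A standalone sketch of the first/last-position insertion trick above, applied to a plain 16-bit mask (not LLVM code; insertBit16 is an invented name, and the middle-position fallback here is ordinary bit masking rather than the shuffle used in the patch):

#include <cassert>
#include <cstdint>

static uint16_t insertBit16(uint16_t Vec, unsigned Idx, bool Bit,
                            unsigned NumElems = 16) {
  uint16_t EltInVec = (uint16_t)(Bit ? 1 : 0);
  if (Idx == 0) {
    // Clear bit 0 of the source with a right/left shift pair, then OR the
    // new bit in (it is already at position 0).
    Vec = (uint16_t)((uint16_t)(Vec >> 1) << 1);
    return Vec | EltInVec;
  }
  if (Idx == NumElems - 1) {
    // Move the new bit to the top, clear the source's top bit with a
    // left/right shift pair, then OR.
    EltInVec = (uint16_t)(EltInVec << Idx);
    Vec = (uint16_t)((uint16_t)(Vec << 1) >> 1);
    return Vec | EltInVec;
  }
  // Middle positions: plain masking stands in for the shuffle-based path.
  return (uint16_t)((Vec & ~(1u << Idx)) | ((unsigned)Bit << Idx));
}

int main() {
  assert(insertBit16(0xFFFF, 0, false) == 0xFFFE);
  assert(insertBit16(0x0001, 15, true) == 0x8001);
  return 0;
}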
SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
@@ -12764,10 +13924,6 @@ static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
}
- if (OpVT == MVT::v1i64 &&
- Op.getOperand(0).getValueType() == MVT::i64)
- return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op.getOperand(0));
-
SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
assert(OpVT.is128BitVector() && "Expected an SSE type!");
return DAG.getBitcast(
@@ -12779,25 +13935,32 @@ static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
// upper bits of a vector.
static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
+ assert(Subtarget.hasAVX() && "EXTRACT_SUBVECTOR requires AVX");
+
SDLoc dl(Op);
SDValue In = Op.getOperand(0);
SDValue Idx = Op.getOperand(1);
unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
- MVT ResVT = Op.getSimpleValueType();
- MVT InVT = In.getSimpleValueType();
+ MVT ResVT = Op.getSimpleValueType();
- if (Subtarget.hasFp256()) {
- if (ResVT.is128BitVector() &&
- (InVT.is256BitVector() || InVT.is512BitVector()) &&
- isa<ConstantSDNode>(Idx)) {
- return extract128BitVector(In, IdxVal, DAG, dl);
- }
- if (ResVT.is256BitVector() && InVT.is512BitVector() &&
- isa<ConstantSDNode>(Idx)) {
- return extract256BitVector(In, IdxVal, DAG, dl);
- }
- }
- return SDValue();
+ assert((In.getSimpleValueType().is256BitVector() ||
+ In.getSimpleValueType().is512BitVector()) &&
+ "Can only extract from 256-bit or 512-bit vectors");
+
+ if (ResVT.is128BitVector())
+ return extract128BitVector(In, IdxVal, DAG, dl);
+ if (ResVT.is256BitVector())
+ return extract256BitVector(In, IdxVal, DAG, dl);
+
+ llvm_unreachable("Unimplemented!");
+}
+
+static bool areOnlyUsersOf(SDNode *N, ArrayRef<SDValue> ValidUsers) {
+ for (SDNode::use_iterator I = N->use_begin(), E = N->use_end(); I != E; ++I)
+ if (llvm::all_of(ValidUsers,
+ [&I](SDValue V) { return V.getNode() != *I; }))
+ return false;
+ return true;
}
// Lower a node with an INSERT_SUBVECTOR opcode. This may result in a
@@ -12805,58 +13968,97 @@ static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
// the upper bits of a vector.
static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
- if (!Subtarget.hasAVX())
- return SDValue();
+ assert(Subtarget.hasAVX() && "INSERT_SUBVECTOR requires AVX");
SDLoc dl(Op);
SDValue Vec = Op.getOperand(0);
SDValue SubVec = Op.getOperand(1);
SDValue Idx = Op.getOperand(2);
- if (!isa<ConstantSDNode>(Idx))
- return SDValue();
-
unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
MVT OpVT = Op.getSimpleValueType();
MVT SubVecVT = SubVec.getSimpleValueType();
- // Fold two 16-byte subvector loads into one 32-byte load:
- // (insert_subvector (insert_subvector undef, (load addr), 0),
- // (load addr + 16), Elts/2)
+ if (OpVT.getVectorElementType() == MVT::i1)
+ return insert1BitVector(Op, DAG, Subtarget);
+
+ assert((OpVT.is256BitVector() || OpVT.is512BitVector()) &&
+ "Can only insert into 256-bit or 512-bit vectors");
+
+ // Fold two 16-byte or 32-byte subvector loads into one 32-byte or 64-byte
+ // load:
+ // (insert_subvector (insert_subvector undef, (load16 addr), 0),
+ // (load16 addr + 16), Elts/2)
// --> load32 addr
+ // or:
+ // (insert_subvector (insert_subvector undef, (load32 addr), 0),
+ // (load32 addr + 32), Elts/2)
+ // --> load64 addr
+ // or a 16-byte or 32-byte broadcast:
+ // (insert_subvector (insert_subvector undef, (load16 addr), 0),
+ // (load16 addr), Elts/2)
+ // --> X86SubVBroadcast(load16 addr)
+ // or:
+ // (insert_subvector (insert_subvector undef, (load32 addr), 0),
+ // (load32 addr), Elts/2)
+ // --> X86SubVBroadcast(load32 addr)
if ((IdxVal == OpVT.getVectorNumElements() / 2) &&
Vec.getOpcode() == ISD::INSERT_SUBVECTOR &&
- OpVT.is256BitVector() && SubVecVT.is128BitVector()) {
+ OpVT.getSizeInBits() == SubVecVT.getSizeInBits() * 2) {
auto *Idx2 = dyn_cast<ConstantSDNode>(Vec.getOperand(2));
if (Idx2 && Idx2->getZExtValue() == 0) {
+ SDValue SubVec2 = Vec.getOperand(1);
// If needed, look through bitcasts to get to the load.
- SDValue SubVec2 = peekThroughBitcasts(Vec.getOperand(1));
- if (auto *FirstLd = dyn_cast<LoadSDNode>(SubVec2)) {
+ if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(SubVec2))) {
bool Fast;
unsigned Alignment = FirstLd->getAlignment();
unsigned AS = FirstLd->getAddressSpace();
const X86TargetLowering *TLI = Subtarget.getTargetLowering();
if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(),
OpVT, AS, Alignment, &Fast) && Fast) {
- SDValue Ops[] = { SubVec2, SubVec };
+ SDValue Ops[] = {SubVec2, SubVec};
if (SDValue Ld = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG, false))
return Ld;
}
}
+    // If the lower/upper loads are the same and they are the load's only
+    // users, then lower to a VBROADCASTF128/VBROADCASTI128/etc.
+ if (auto *Ld = dyn_cast<LoadSDNode>(peekThroughOneUseBitcasts(SubVec2))) {
+ if (SubVec2 == SubVec && ISD::isNormalLoad(Ld) &&
+ areOnlyUsersOf(SubVec2.getNode(), {Op, Vec})) {
+ return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT, SubVec);
+ }
+ }
+    // If this is a subv_broadcast inserted into both halves, use a larger
+    // subv_broadcast.
+ if (SubVec.getOpcode() == X86ISD::SUBV_BROADCAST && SubVec == SubVec2) {
+ return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT,
+ SubVec.getOperand(0));
+ }
}
}
- if ((OpVT.is256BitVector() || OpVT.is512BitVector()) &&
- SubVecVT.is128BitVector())
+ if (SubVecVT.is128BitVector())
return insert128BitVector(Vec, SubVec, IdxVal, DAG, dl);
- if (OpVT.is512BitVector() && SubVecVT.is256BitVector())
+ if (SubVecVT.is256BitVector())
return insert256BitVector(Vec, SubVec, IdxVal, DAG, dl);
- if (OpVT.getVectorElementType() == MVT::i1)
- return insert1BitVector(Op, DAG, Subtarget);
+ llvm_unreachable("Unimplemented!");
+}
- return SDValue();
+// Returns the appropriate wrapper opcode for a global reference.
+unsigned X86TargetLowering::getGlobalWrapperKind(const GlobalValue *GV) const {
+ // References to absolute symbols are never PC-relative.
+ if (GV && GV->isAbsoluteSymbolRef())
+ return X86ISD::Wrapper;
+
+ CodeModel::Model M = getTargetMachine().getCodeModel();
+ if (Subtarget.isPICStyleRIPRel() &&
+ (M == CodeModel::Small || M == CodeModel::Kernel))
+ return X86ISD::WrapperRIP;
+
+ return X86ISD::Wrapper;
}
// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
@@ -12872,18 +14074,12 @@ X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
// In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
// global base reg.
unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
- unsigned WrapperKind = X86ISD::Wrapper;
- CodeModel::Model M = DAG.getTarget().getCodeModel();
-
- if (Subtarget.isPICStyleRIPRel() &&
- (M == CodeModel::Small || M == CodeModel::Kernel))
- WrapperKind = X86ISD::WrapperRIP;
auto PtrVT = getPointerTy(DAG.getDataLayout());
SDValue Result = DAG.getTargetConstantPool(
CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(), OpFlag);
SDLoc DL(CP);
- Result = DAG.getNode(WrapperKind, DL, PtrVT, Result);
+ Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
// With PIC, the address is actually $g + Offset.
if (OpFlag) {
Result =
@@ -12900,17 +14096,11 @@ SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
// In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
// global base reg.
unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
- unsigned WrapperKind = X86ISD::Wrapper;
- CodeModel::Model M = DAG.getTarget().getCodeModel();
-
- if (Subtarget.isPICStyleRIPRel() &&
- (M == CodeModel::Small || M == CodeModel::Kernel))
- WrapperKind = X86ISD::WrapperRIP;
auto PtrVT = getPointerTy(DAG.getDataLayout());
SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag);
SDLoc DL(JT);
- Result = DAG.getNode(WrapperKind, DL, PtrVT, Result);
+ Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
// With PIC, the address is actually $g + Offset.
if (OpFlag)
@@ -12929,18 +14119,12 @@ X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
// global base reg.
const Module *Mod = DAG.getMachineFunction().getFunction()->getParent();
unsigned char OpFlag = Subtarget.classifyGlobalReference(nullptr, *Mod);
- unsigned WrapperKind = X86ISD::Wrapper;
- CodeModel::Model M = DAG.getTarget().getCodeModel();
-
- if (Subtarget.isPICStyleRIPRel() &&
- (M == CodeModel::Small || M == CodeModel::Kernel))
- WrapperKind = X86ISD::WrapperRIP;
auto PtrVT = getPointerTy(DAG.getDataLayout());
SDValue Result = DAG.getTargetExternalSymbol(Sym, PtrVT, OpFlag);
SDLoc DL(Op);
- Result = DAG.getNode(WrapperKind, DL, PtrVT, Result);
+ Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
// With PIC, the address is actually $g + Offset.
if (isPositionIndependent() && !Subtarget.is64Bit()) {
@@ -12963,18 +14147,12 @@ X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
// Create the TargetBlockAddressAddress node.
unsigned char OpFlags =
Subtarget.classifyBlockAddressReference();
- CodeModel::Model M = DAG.getTarget().getCodeModel();
const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
SDLoc dl(Op);
auto PtrVT = getPointerTy(DAG.getDataLayout());
SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags);
-
- if (Subtarget.isPICStyleRIPRel() &&
- (M == CodeModel::Small || M == CodeModel::Kernel))
- Result = DAG.getNode(X86ISD::WrapperRIP, dl, PtrVT, Result);
- else
- Result = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, Result);
+ Result = DAG.getNode(getGlobalWrapperKind(), dl, PtrVT, Result);
// With PIC, the address is actually $g + Offset.
if (isGlobalRelativeToPICBase(OpFlags)) {
@@ -13003,11 +14181,7 @@ SDValue X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV,
Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, OpFlags);
}
- if (Subtarget.isPICStyleRIPRel() &&
- (M == CodeModel::Small || M == CodeModel::Kernel))
- Result = DAG.getNode(X86ISD::WrapperRIP, dl, PtrVT, Result);
- else
- Result = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, Result);
+ Result = DAG.getNode(getGlobalWrapperKind(GV), dl, PtrVT, Result);
// With PIC, the address is actually $g + Offset.
if (isGlobalRelativeToPICBase(OpFlags)) {
@@ -13041,7 +14215,7 @@ static SDValue
GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
unsigned char OperandFlags, bool LocalDynamic = false) {
- MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+ MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
SDLoc dl(GA);
SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
@@ -13061,8 +14235,8 @@ GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
}
// TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
- MFI->setAdjustsStack(true);
- MFI->setHasCalls(true);
+ MFI.setAdjustsStack(true);
+ MFI.setHasCalls(true);
SDValue Flag = Chain.getValue(1);
return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
@@ -13097,7 +14271,7 @@ static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
SDLoc dl(GA);
// Get the start address of the TLS block for this module.
- X86MachineFunctionInfo* MFI = DAG.getMachineFunction()
+ X86MachineFunctionInfo *MFI = DAG.getMachineFunction()
.getInfo<X86MachineFunctionInfo>();
MFI->incNumLocalDynamicTLSAccesses();
@@ -13251,8 +14425,8 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
Chain.getValue(1), DL);
// TLSCALL will be codegen'ed as call. Inform MFI that function has calls.
- MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
- MFI->setAdjustsStack(true);
+ MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
+ MFI.setAdjustsStack(true);
// And our return value (tls address) is in the standard call return value
// location.
@@ -13395,9 +14569,9 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (SrcVT.isVector()) {
if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
- return DAG.getNode(X86ISD::CVTDQ2PD, dl, VT,
+ return DAG.getNode(X86ISD::CVTSI2P, dl, VT,
DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
- DAG.getUNDEF(SrcVT)));
+ DAG.getUNDEF(SrcVT)));
}
if (SrcVT.getVectorElementType() == MVT::i1) {
if (SrcVT == MVT::v2i1 && TLI.isTypeLegal(SrcVT))
@@ -13433,7 +14607,7 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
unsigned Size = SrcVT.getSizeInBits()/8;
MachineFunction &MF = DAG.getMachineFunction();
auto PtrVT = getPointerTy(MF.getDataLayout());
- int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size, false);
+ int SSFI = MF.getFrameInfo().CreateStackObject(Size, Size, false);
SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
SDValue Chain = DAG.getStore(
DAG.getEntryNode(), dl, ValueToStore, StackSlot,
@@ -13479,8 +14653,8 @@ SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
// shouldn't be necessary except that RFP cannot be live across
// multiple blocks. When stackifier is fixed, they can be uncoupled.
MachineFunction &MF = DAG.getMachineFunction();
- unsigned SSFISize = Op.getValueType().getSizeInBits()/8;
- int SSFI = MF.getFrameInfo()->CreateStackObject(SSFISize, SSFISize, false);
+ unsigned SSFISize = Op.getValueSizeInBits()/8;
+ int SSFI = MF.getFrameInfo().CreateStackObject(SSFISize, SSFISize, false);
auto PtrVT = getPointerTy(MF.getDataLayout());
SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
Tys = DAG.getVTList(MVT::Other);
@@ -13528,10 +14702,10 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op,
SmallVector<Constant*,2> CV1;
CV1.push_back(
- ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble,
+ ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
APInt(64, 0x4330000000000000ULL))));
CV1.push_back(
- ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble,
+ ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
APInt(64, 0x4530000000000000ULL))));
Constant *C1 = ConstantVector::get(CV1);
SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, 16);
@@ -13560,8 +14734,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op,
Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
} else {
SDValue S2F = DAG.getBitcast(MVT::v4i32, Sub);
- SDValue Shuffle = getTargetShuffleNode(X86ISD::PSHUFD, dl, MVT::v4i32,
- S2F, 0x4E, DAG);
+ SDValue Shuffle = DAG.getVectorShuffle(MVT::v4i32, dl, S2F, S2F, {2,3,0,1});
Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64,
DAG.getBitcast(MVT::v2f64, Shuffle), Sub);
}
@@ -13617,6 +14790,41 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op,
return Sub;
}
+static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget, SDLoc &DL) {
+ if (Op.getSimpleValueType() != MVT::v2f64)
+ return SDValue();
+
+ SDValue N0 = Op.getOperand(0);
+ assert(N0.getSimpleValueType() == MVT::v2i32 && "Unexpected input type");
+
+ // Legalize to v4i32 type.
+ N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
+ DAG.getUNDEF(MVT::v2i32));
+
+ if (Subtarget.hasAVX512())
+ return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, N0);
+
+ // Same implementation as VectorLegalizer::ExpandUINT_TO_FLOAT,
+ // but using v2i32 to v2f64 with X86ISD::CVTSI2P.
+ SDValue HalfWord = DAG.getConstant(16, DL, MVT::v4i32);
+ SDValue HalfWordMask = DAG.getConstant(0x0000FFFF, DL, MVT::v4i32);
+
+ // Two to the power of half-word-size.
+ SDValue TWOHW = DAG.getConstantFP(1 << 16, DL, MVT::v2f64);
+
+ // Clear upper part of LO, lower HI.
+ SDValue HI = DAG.getNode(ISD::SRL, DL, MVT::v4i32, N0, HalfWord);
+ SDValue LO = DAG.getNode(ISD::AND, DL, MVT::v4i32, N0, HalfWordMask);
+
+ SDValue fHI = DAG.getNode(X86ISD::CVTSI2P, DL, MVT::v2f64, HI);
+ fHI = DAG.getNode(ISD::FMUL, DL, MVT::v2f64, fHI, TWOHW);
+ SDValue fLO = DAG.getNode(X86ISD::CVTSI2P, DL, MVT::v2f64, LO);
+
+ // Add the two halves.
+ return DAG.getNode(ISD::FADD, DL, MVT::v2f64, fHI, fLO);
+}
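The half-word expansion above can be sanity-checked with scalar arithmetic. The following is a minimal standalone sketch (not LLVM code; uintToDoubleViaHalves is a made-up name): each 16-bit half is non-negative when viewed as a signed integer, so a signed int-to-double conversion (the stand-in for CVTSI2P) is exact, and hi * 2^16 + lo reconstructs the unsigned value.

#include <cassert>
#include <cstdint>

// Convert an unsigned 32-bit value to double using only signed conversions,
// mirroring the SRL/AND/CVTSI2P/FMUL/FADD sequence built above.
static double uintToDoubleViaHalves(uint32_t X) {
  uint32_t Hi = X >> 16;         // SRL by HalfWord
  uint32_t Lo = X & 0x0000FFFFu; // AND with HalfWordMask
  const double TwoHW = 65536.0;  // the TWOHW constant, 2^16
  double FHi = static_cast<double>(static_cast<int32_t>(Hi)) * TwoHW; // CVTSI2P + FMUL
  double FLo = static_cast<double>(static_cast<int32_t>(Lo));         // CVTSI2P
  return FHi + FLo;                                                   // FADD
}

int main() {
  assert(uintToDoubleViaHalves(0u) == 0.0);
  assert(uintToDoubleViaHalves(0x80000000u) == 2147483648.0);
  assert(uintToDoubleViaHalves(0xFFFFFFFFu) == 4294967295.0);
  return 0;
}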
+
static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
// The algorithm is the following:
@@ -13699,7 +14907,7 @@ static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
// Create the vector constant for -(0x1.0p39f + 0x1.0p23f).
SDValue VecCstFAdd = DAG.getConstantFP(
- APFloat(APFloat::IEEEsingle, APInt(32, 0xD3000080)), DL, VecFloatVT);
+ APFloat(APFloat::IEEEsingle(), APInt(32, 0xD3000080)), DL, VecFloatVT);
// float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
@@ -13714,29 +14922,31 @@ static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op,
SelectionDAG &DAG) const {
SDValue N0 = Op.getOperand(0);
- MVT SVT = N0.getSimpleValueType();
+ MVT SrcVT = N0.getSimpleValueType();
SDLoc dl(Op);
- if (SVT.getVectorElementType() == MVT::i1) {
- if (SVT == MVT::v2i1)
+ if (SrcVT.getVectorElementType() == MVT::i1) {
+ if (SrcVT == MVT::v2i1)
return DAG.getNode(ISD::UINT_TO_FP, dl, Op.getValueType(),
DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, N0));
- MVT IntegerVT = MVT::getVectorVT(MVT::i32, SVT.getVectorNumElements());
+ MVT IntegerVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements());
return DAG.getNode(ISD::UINT_TO_FP, dl, Op.getValueType(),
DAG.getNode(ISD::ZERO_EXTEND, dl, IntegerVT, N0));
}
- switch (SVT.SimpleTy) {
+ switch (SrcVT.SimpleTy) {
default:
llvm_unreachable("Custom UINT_TO_FP is not supported!");
case MVT::v4i8:
case MVT::v4i16:
case MVT::v8i8:
case MVT::v8i16: {
- MVT NVT = MVT::getVectorVT(MVT::i32, SVT.getVectorNumElements());
+ MVT NVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements());
return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, N0));
}
+ case MVT::v2i32:
+ return lowerUINT_TO_FP_v2i32(Op, DAG, Subtarget, dl);
case MVT::v4i32:
case MVT::v8i32:
return lowerUINT_TO_FP_vXi32(Op, DAG, Subtarget);
@@ -13754,15 +14964,15 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
SDLoc dl(Op);
auto PtrVT = getPointerTy(DAG.getDataLayout());
- if (Op.getSimpleValueType().isVector())
- return lowerUINT_TO_FP_vec(Op, DAG);
-
// Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
// optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
// the optimization here.
if (DAG.SignBitIsZero(N0))
return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0);
+ if (Op.getSimpleValueType().isVector())
+ return lowerUINT_TO_FP_vec(Op, DAG);
+
MVT SrcVT = N0.getSimpleValueType();
MVT DstVT = Op.getSimpleValueType();
@@ -13903,7 +15113,7 @@ X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
// stack slot.
MachineFunction &MF = DAG.getMachineFunction();
unsigned MemSize = DstTy.getSizeInBits()/8;
- int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
+ int SSFI = MF.getFrameInfo().CreateStackObject(MemSize, MemSize, false);
SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
unsigned Opc;
@@ -13935,15 +15145,15 @@ X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
// For X87 we'd like to use the smallest FP type for this constant, but
// for DAG type consistency we have to match the FP operand type.
- APFloat Thresh(APFloat::IEEEsingle, APInt(32, 0x5f000000));
+ APFloat Thresh(APFloat::IEEEsingle(), APInt(32, 0x5f000000));
LLVM_ATTRIBUTE_UNUSED APFloat::opStatus Status = APFloat::opOK;
bool LosesInfo = false;
if (TheVT == MVT::f64)
// The rounding mode is irrelevant as the conversion should be exact.
- Status = Thresh.convert(APFloat::IEEEdouble, APFloat::rmNearestTiesToEven,
+ Status = Thresh.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven,
&LosesInfo);
else if (TheVT == MVT::f80)
- Status = Thresh.convert(APFloat::x87DoubleExtended,
+ Status = Thresh.convert(APFloat::x87DoubleExtended(),
APFloat::rmNearestTiesToEven, &LosesInfo);
assert(Status == APFloat::opOK && !LosesInfo &&
@@ -13981,7 +15191,7 @@ X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
MachineMemOperand::MOLoad, MemSize, MemSize);
Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, DstTy, MMO);
Chain = Value.getValue(1);
- SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
+ SSFI = MF.getFrameInfo().CreateStackObject(MemSize, MemSize, false);
StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
}
@@ -14084,14 +15294,14 @@ static SDValue LowerZERO_EXTEND_AVX512(SDValue Op,
SDValue In = Op->getOperand(0);
MVT InVT = In.getSimpleValueType();
SDLoc DL(Op);
- unsigned int NumElts = VT.getVectorNumElements();
- if (NumElts != 8 && NumElts != 16 && !Subtarget.hasBWI())
- return SDValue();
+ unsigned NumElts = VT.getVectorNumElements();
- if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1)
+ if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1 &&
+ (NumElts == 8 || NumElts == 16 || Subtarget.hasBWI()))
return DAG.getNode(X86ISD::VZEXT, DL, VT, In);
- assert(InVT.getVectorElementType() == MVT::i1);
+ if (InVT.getVectorElementType() != MVT::i1)
+ return SDValue();
// Extend VT if the target is 256 or 128bit vector and VLX is not supported.
MVT ExtVT = VT;
@@ -14137,6 +15347,85 @@ static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
return SDValue();
}
+/// Helper to recursively truncate vector elements in half with PACKSS.
+/// It makes use of the fact that vector comparison results will be all-zeros
+/// or all-ones, so (vXi8 PACKSS(vYi16, vYi16)) can be used instead of having
+/// to match the types exactly.
+/// AVX2 (Int256) sub-targets require extra shuffling as the PACKSS operates
+/// within each 128-bit lane.
+static SDValue truncateVectorCompareWithPACKSS(EVT DstVT, SDValue In,
+ const SDLoc &DL,
+ SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ // Requires SSE2 but AVX512 has fast truncate.
+ if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
+ return SDValue();
+
+ EVT SrcVT = In.getValueType();
+
+  // No truncation required; we might get here due to recursive calls.
+ if (SrcVT == DstVT)
+ return In;
+
+ // We only support vector truncation to 128bits or greater from a
+ // 256bits or greater source.
+ if ((DstVT.getSizeInBits() % 128) != 0)
+ return SDValue();
+ if ((SrcVT.getSizeInBits() % 256) != 0)
+ return SDValue();
+
+ unsigned NumElems = SrcVT.getVectorNumElements();
+ assert(DstVT.getVectorNumElements() == NumElems && "Illegal truncation");
+ assert(SrcVT.getSizeInBits() > DstVT.getSizeInBits() && "Illegal truncation");
+
+ EVT PackedSVT =
+ EVT::getIntegerVT(*DAG.getContext(), SrcVT.getScalarSizeInBits() / 2);
+
+ // Extract lower/upper subvectors.
+ unsigned NumSubElts = NumElems / 2;
+ unsigned SrcSizeInBits = SrcVT.getSizeInBits();
+ SDValue Lo = extractSubVector(In, 0 * NumSubElts, DAG, DL, SrcSizeInBits / 2);
+ SDValue Hi = extractSubVector(In, 1 * NumSubElts, DAG, DL, SrcSizeInBits / 2);
+
+ // 256bit -> 128bit truncate - PACKSS lower/upper 128-bit subvectors.
+ if (SrcVT.is256BitVector()) {
+ Lo = DAG.getBitcast(MVT::v8i16, Lo);
+ Hi = DAG.getBitcast(MVT::v8i16, Hi);
+ SDValue Res = DAG.getNode(X86ISD::PACKSS, DL, MVT::v16i8, Lo, Hi);
+ return DAG.getBitcast(DstVT, Res);
+ }
+
+ // AVX2: 512bit -> 256bit truncate - PACKSS lower/upper 256-bit subvectors.
+ // AVX2: 512bit -> 128bit truncate - PACKSS(PACKSS, PACKSS).
+ if (SrcVT.is512BitVector() && Subtarget.hasInt256()) {
+ Lo = DAG.getBitcast(MVT::v16i16, Lo);
+ Hi = DAG.getBitcast(MVT::v16i16, Hi);
+ SDValue Res = DAG.getNode(X86ISD::PACKSS, DL, MVT::v32i8, Lo, Hi);
+
+ // 256-bit PACKSS(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)),
+ // so we need to shuffle to get ((LO0,HI0),(LO1,HI1)).
+ Res = DAG.getBitcast(MVT::v4i64, Res);
+ Res = DAG.getVectorShuffle(MVT::v4i64, DL, Res, Res, {0, 2, 1, 3});
+
+ if (DstVT.is256BitVector())
+ return DAG.getBitcast(DstVT, Res);
+
+    // For a 512-bit -> 128-bit truncate, run another PACKSS stage.
+ EVT PackedVT = EVT::getVectorVT(*DAG.getContext(), PackedSVT, NumElems);
+ Res = DAG.getBitcast(PackedVT, Res);
+ return truncateVectorCompareWithPACKSS(DstVT, Res, DL, DAG, Subtarget);
+ }
+
+ // Recursively pack lower/upper subvectors, concat result and pack again.
+ assert(SrcVT.getSizeInBits() >= 512 && "Expected 512-bit vector or greater");
+ EVT PackedVT = EVT::getVectorVT(*DAG.getContext(), PackedSVT, NumElems / 2);
+ Lo = truncateVectorCompareWithPACKSS(PackedVT, Lo, DL, DAG, Subtarget);
+ Hi = truncateVectorCompareWithPACKSS(PackedVT, Hi, DL, DAG, Subtarget);
+
+ PackedVT = EVT::getVectorVT(*DAG.getContext(), PackedSVT, NumElems);
+ SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, PackedVT, Lo, Hi);
+ return truncateVectorCompareWithPACKSS(DstVT, Res, DL, DAG, Subtarget);
+}
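The reason PACKSS is a legal truncation here is that every lane of a vector compare is 0 or -1, and signed saturation maps 0x0000 to 0x00 and 0xFFFF to 0xFF. Below is a small standalone sketch using SSE2 intrinsics (not LLVM code, and it needs an SSE2-capable host to run) that checks exactly that property.

#include <cassert>
#include <cstdint>
#include <emmintrin.h>

int main() {
  alignas(16) int16_t A[8] = {10, 3, 7, 7, -5, 0, 2, 9};
  alignas(16) int16_t B[8] = { 1, 4, 7, 2,  5, 0, 8, 9};
  __m128i VA = _mm_load_si128(reinterpret_cast<const __m128i *>(A));
  __m128i VB = _mm_load_si128(reinterpret_cast<const __m128i *>(B));
  __m128i Cmp = _mm_cmpgt_epi16(VA, VB);      // each lane is 0x0000 or 0xFFFF
  __m128i Packed = _mm_packs_epi16(Cmp, Cmp); // v8i16 -> v16i8 via PACKSS

  alignas(16) int16_t Wide[8];
  alignas(16) int8_t Narrow[16];
  _mm_store_si128(reinterpret_cast<__m128i *>(Wide), Cmp);
  _mm_store_si128(reinterpret_cast<__m128i *>(Narrow), Packed);
  for (int i = 0; i != 8; ++i) // signed saturation == plain truncation here
    assert(Narrow[i] == static_cast<int8_t>(Wide[i]));
  return 0;
}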
+
static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
@@ -14203,6 +15492,22 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
DAG.getNode(X86ISD::VSEXT, DL, MVT::v16i32, In));
return DAG.getNode(X86ISD::VTRUNC, DL, VT, In);
}
+
+ // Truncate with PACKSS if we are truncating a vector comparison result.
+  // TODO: We should be able to support other operations as long as we
+  // are saturating+packing zero/all bits only.
+ auto IsPackableComparison = [](SDValue V) {
+ unsigned Opcode = V.getOpcode();
+ return (Opcode == X86ISD::PCMPGT || Opcode == X86ISD::PCMPEQ ||
+ Opcode == X86ISD::CMPP);
+ };
+
+ if (IsPackableComparison(In) || (In.getOpcode() == ISD::CONCAT_VECTORS &&
+ all_of(In->ops(), IsPackableComparison))) {
+ if (SDValue V = truncateVectorCompareWithPACKSS(VT, In, DL, DAG, Subtarget))
+ return V;
+ }
+
if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
// On AVX2, v4i64 -> v4i32 becomes VPERMD.
if (Subtarget.hasInt256()) {
@@ -14299,30 +15604,31 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
DAG.getIntPtrConstant(0, DL));
}
-SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op,
- SelectionDAG &DAG) const {
- assert(!Op.getSimpleValueType().isVector());
+SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) const {
+ bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT;
- std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
- /*IsSigned=*/ true, /*IsReplace=*/ false);
- SDValue FIST = Vals.first, StackSlot = Vals.second;
- // If FP_TO_INTHelper failed, the node is actually supposed to be Legal.
- if (!FIST.getNode())
- return Op;
+ MVT VT = Op.getSimpleValueType();
- if (StackSlot.getNode())
- // Load the result.
- return DAG.getLoad(Op.getValueType(), SDLoc(Op), FIST, StackSlot,
- MachinePointerInfo());
+ if (VT.isVector()) {
+ assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL!");
+ SDValue Src = Op.getOperand(0);
+ SDLoc dl(Op);
+ if (VT == MVT::v2i64 && Src.getSimpleValueType() == MVT::v2f32) {
+ return DAG.getNode(IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI,
+ dl, VT,
+ DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
+ DAG.getUNDEF(MVT::v2f32)));
+ }
- // The node is the result.
- return FIST;
-}
+ return SDValue();
+ }
+
+ assert(!VT.isVector());
-SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op,
- SelectionDAG &DAG) const {
std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
- /*IsSigned=*/ false, /*IsReplace=*/ false);
+ IsSigned, /*IsReplace=*/ false);
SDValue FIST = Vals.first, StackSlot = Vals.second;
// If FP_TO_INTHelper failed, the node is actually supposed to be Legal.
if (!FIST.getNode())
@@ -14330,8 +15636,7 @@ SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op,
if (StackSlot.getNode())
// Load the result.
- return DAG.getLoad(Op.getValueType(), SDLoc(Op), FIST, StackSlot,
- MachinePointerInfo());
+ return DAG.getLoad(VT, SDLoc(Op), FIST, StackSlot, MachinePointerInfo());
// The node is the result.
return FIST;
@@ -14376,17 +15681,14 @@ static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
MVT LogicVT;
MVT EltVT;
- unsigned NumElts;
if (VT.isVector()) {
LogicVT = VT;
EltVT = VT.getVectorElementType();
- NumElts = VT.getVectorNumElements();
} else if (IsF128) {
// SSE instructions are used for optimized f128 logical operations.
LogicVT = MVT::f128;
EltVT = VT;
- NumElts = 1;
} else {
// There are no scalar bitwise logical SSE/AVX instructions, so we
// generate a 16-byte vector constant and logic op even for the scalar case.
@@ -14394,22 +15696,16 @@ static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
// the logic op, so it can save (~4 bytes) on code size.
LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
EltVT = VT;
- NumElts = (VT == MVT::f64) ? 2 : 4;
}
unsigned EltBits = EltVT.getSizeInBits();
- LLVMContext *Context = DAG.getContext();
// For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
APInt MaskElt =
IsFABS ? APInt::getSignedMaxValue(EltBits) : APInt::getSignBit(EltBits);
- Constant *C = ConstantInt::get(*Context, MaskElt);
- C = ConstantVector::getSplat(NumElts, C);
- const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- SDValue CPIdx = DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
- unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
- SDValue Mask = DAG.getLoad(
- LogicVT, dl, DAG.getEntryNode(), CPIdx,
- MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Alignment);
+ const fltSemantics &Sem =
+ EltVT == MVT::f64 ? APFloat::IEEEdouble() :
+ (IsF128 ? APFloat::IEEEquad() : APFloat::IEEEsingle());
+ SDValue Mask = DAG.getConstantFP(APFloat(Sem, MaskElt), dl, LogicVT);
SDValue Op0 = Op.getOperand(0);
bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
@@ -14429,92 +15725,73 @@ static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
}
static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
- const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- LLVMContext *Context = DAG.getContext();
- SDValue Op0 = Op.getOperand(0);
- SDValue Op1 = Op.getOperand(1);
+ SDValue Mag = Op.getOperand(0);
+ SDValue Sign = Op.getOperand(1);
SDLoc dl(Op);
+
+ // If the sign operand is smaller, extend it first.
MVT VT = Op.getSimpleValueType();
- MVT SrcVT = Op1.getSimpleValueType();
- bool IsF128 = (VT == MVT::f128);
+ if (Sign.getSimpleValueType().bitsLT(VT))
+ Sign = DAG.getNode(ISD::FP_EXTEND, dl, VT, Sign);
- // If second operand is smaller, extend it first.
- if (SrcVT.bitsLT(VT)) {
- Op1 = DAG.getNode(ISD::FP_EXTEND, dl, VT, Op1);
- SrcVT = VT;
- }
// And if it is bigger, shrink it first.
- if (SrcVT.bitsGT(VT)) {
- Op1 = DAG.getNode(ISD::FP_ROUND, dl, VT, Op1, DAG.getIntPtrConstant(1, dl));
- SrcVT = VT;
- }
+ if (Sign.getSimpleValueType().bitsGT(VT))
+ Sign = DAG.getNode(ISD::FP_ROUND, dl, VT, Sign, DAG.getIntPtrConstant(1, dl));
// At this point the operands and the result should have the same
// type, and that won't be f80 since that is not custom lowered.
- assert((VT == MVT::f64 || VT == MVT::f32 || IsF128) &&
+ bool IsF128 = (VT == MVT::f128);
+ assert((VT == MVT::f64 || VT == MVT::f32 || VT == MVT::f128 ||
+ VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
+ VT == MVT::v8f32 || VT == MVT::v8f64 || VT == MVT::v16f32) &&
"Unexpected type in LowerFCOPYSIGN");
+ MVT EltVT = VT.getScalarType();
const fltSemantics &Sem =
- VT == MVT::f64 ? APFloat::IEEEdouble :
- (IsF128 ? APFloat::IEEEquad : APFloat::IEEEsingle);
- const unsigned SizeInBits = VT.getSizeInBits();
+ EltVT == MVT::f64 ? APFloat::IEEEdouble()
+ : (IsF128 ? APFloat::IEEEquad() : APFloat::IEEEsingle());
+
+ // Perform all scalar logic operations as 16-byte vectors because there are no
+ // scalar FP logic instructions in SSE.
+ // TODO: This isn't necessary. If we used scalar types, we might avoid some
+ // unnecessary splats, but we might miss load folding opportunities. Should
+ // this decision be based on OptimizeForSize?
+ bool IsFakeVector = !VT.isVector() && !IsF128;
+ MVT LogicVT = VT;
+ if (IsFakeVector)
+ LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
- SmallVector<Constant *, 4> CV(
- VT == MVT::f64 ? 2 : (IsF128 ? 1 : 4),
- ConstantFP::get(*Context, APFloat(Sem, APInt(SizeInBits, 0))));
+ // The mask constants are automatically splatted for vector types.
+ unsigned EltSizeInBits = VT.getScalarSizeInBits();
+ SDValue SignMask = DAG.getConstantFP(
+ APFloat(Sem, APInt::getSignBit(EltSizeInBits)), dl, LogicVT);
+ SDValue MagMask = DAG.getConstantFP(
+ APFloat(Sem, ~APInt::getSignBit(EltSizeInBits)), dl, LogicVT);
// First, clear all bits but the sign bit from the second operand (sign).
- CV[0] = ConstantFP::get(*Context,
- APFloat(Sem, APInt::getHighBitsSet(SizeInBits, 1)));
- Constant *C = ConstantVector::get(CV);
- auto PtrVT = TLI.getPointerTy(DAG.getDataLayout());
- SDValue CPIdx = DAG.getConstantPool(C, PtrVT, 16);
-
- // Perform all logic operations as 16-byte vectors because there are no
- // scalar FP logic instructions in SSE. This allows load folding of the
- // constants into the logic instructions.
- MVT LogicVT = (VT == MVT::f64) ? MVT::v2f64 : (IsF128 ? MVT::f128 : MVT::v4f32);
- SDValue Mask1 =
- DAG.getLoad(LogicVT, dl, DAG.getEntryNode(), CPIdx,
- MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
- /* Alignment = */ 16);
- if (!IsF128)
- Op1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Op1);
- SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Op1, Mask1);
+ if (IsFakeVector)
+ Sign = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Sign);
+ SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Sign, SignMask);
// Next, clear the sign bit from the first operand (magnitude).
- // If it's a constant, we can clear it here.
- if (ConstantFPSDNode *Op0CN = dyn_cast<ConstantFPSDNode>(Op0)) {
+ // TODO: If we had general constant folding for FP logic ops, this check
+ // wouldn't be necessary.
+ SDValue MagBits;
+ if (ConstantFPSDNode *Op0CN = dyn_cast<ConstantFPSDNode>(Mag)) {
APFloat APF = Op0CN->getValueAPF();
- // If the magnitude is a positive zero, the sign bit alone is enough.
- if (APF.isPosZero())
- return IsF128 ? SignBit :
- DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcVT, SignBit,
- DAG.getIntPtrConstant(0, dl));
APF.clearSign();
- CV[0] = ConstantFP::get(*Context, APF);
+ MagBits = DAG.getConstantFP(APF, dl, LogicVT);
} else {
- CV[0] = ConstantFP::get(
- *Context,
- APFloat(Sem, APInt::getLowBitsSet(SizeInBits, SizeInBits - 1)));
- }
- C = ConstantVector::get(CV);
- CPIdx = DAG.getConstantPool(C, PtrVT, 16);
- SDValue Val =
- DAG.getLoad(LogicVT, dl, DAG.getEntryNode(), CPIdx,
- MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
- /* Alignment = */ 16);
- // If the magnitude operand wasn't a constant, we need to AND out the sign.
- if (!isa<ConstantFPSDNode>(Op0)) {
- if (!IsF128)
- Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Op0);
- Val = DAG.getNode(X86ISD::FAND, dl, LogicVT, Op0, Val);
+ // If the magnitude operand wasn't a constant, we need to AND out the sign.
+ if (IsFakeVector)
+ Mag = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Mag);
+ MagBits = DAG.getNode(X86ISD::FAND, dl, LogicVT, Mag, MagMask);
}
+
// OR the magnitude value with the sign bit.
- Val = DAG.getNode(X86ISD::FOR, dl, LogicVT, Val, SignBit);
- return IsF128 ? Val :
- DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcVT, Val,
- DAG.getIntPtrConstant(0, dl));
+ SDValue Or = DAG.getNode(X86ISD::FOR, dl, LogicVT, MagBits, SignBit);
+ return !IsFakeVector ? Or : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Or,
+ DAG.getIntPtrConstant(0, dl));
}
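All three of FABS, FNEG and FCOPYSIGN above reduce to bit logic on the IEEE-754 encoding using a sign-bit mask and its complement. Below is a minimal standalone sketch (not LLVM code; the helper names are made up) of the same masks applied to a scalar double via memcpy bit-casts, which is what the FAND/FXOR/FOR nodes compute lane by lane.

#include <cassert>
#include <cstdint>
#include <cstring>

static uint64_t bitsOf(double D) { uint64_t U; std::memcpy(&U, &D, 8); return U; }
static double doubleOf(uint64_t U) { double D; std::memcpy(&D, &U, 8); return D; }

static const uint64_t SignMask = 1ULL << 63; // the splatted SignMask constant
static const uint64_t MagMask = ~SignMask;   // the splatted MagMask constant

static double fabsViaMask(double X) { return doubleOf(bitsOf(X) & MagMask); }  // FAND
static double fnegViaMask(double X) { return doubleOf(bitsOf(X) ^ SignMask); } // FXOR
static double copysignViaMask(double Mag, double Sign) {
  uint64_t SignBit = bitsOf(Sign) & SignMask; // keep only the sign of Sign
  uint64_t MagBits = bitsOf(Mag) & MagMask;   // clear the sign of Mag
  return doubleOf(MagBits | SignBit);         // FOR
}

int main() {
  assert(fabsViaMask(-3.5) == 3.5);
  assert(fnegViaMask(2.0) == -2.0);
  assert(copysignViaMask(4.25, -0.0) == -4.25);
  assert(copysignViaMask(-4.25, 1.0) == 4.25);
  return 0;
}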
static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
@@ -14741,6 +16018,12 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
}
}
+ // Sometimes flags can be set either with an AND or with an SRL/SHL
+  // instruction. The SRL/SHL variant should be preferred for masks longer
+  // than this number of bits.
+ const int ShiftToAndMaxMaskWidth = 32;
+ const bool ZeroCheck = (X86CC == X86::COND_E || X86CC == X86::COND_NE);
+
// NOTICE: In the code below we use ArithOp to hold the arithmetic operation
// which may be the result of a CAST. We use the variable 'Op', which is the
// non-casted variable when we check for possible users.
@@ -14764,7 +16047,7 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
goto default_case;
if (ConstantSDNode *C =
- dyn_cast<ConstantSDNode>(ArithOp.getNode()->getOperand(1))) {
+ dyn_cast<ConstantSDNode>(ArithOp.getOperand(1))) {
// An add of one will be selected as an INC.
if (C->isOne() && !Subtarget.slowIncDec()) {
Opcode = X86ISD::INC;
@@ -14789,7 +16072,7 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
// If we have a constant logical shift that's only used in a comparison
// against zero turn it into an equivalent AND. This allows turning it into
// a TEST instruction later.
- if ((X86CC == X86::COND_E || X86CC == X86::COND_NE) && Op->hasOneUse() &&
+ if (ZeroCheck && Op->hasOneUse() &&
isa<ConstantSDNode>(Op->getOperand(1)) && !hasNonFlagsUse(Op)) {
EVT VT = Op.getValueType();
unsigned BitWidth = VT.getSizeInBits();
@@ -14799,7 +16082,7 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
APInt Mask = ArithOp.getOpcode() == ISD::SRL
? APInt::getHighBitsSet(BitWidth, BitWidth - ShAmt)
: APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt);
- if (!Mask.isSignedIntN(32)) // Avoid large immediates.
+ if (!Mask.isSignedIntN(ShiftToAndMaxMaskWidth))
break;
Op = DAG.getNode(ISD::AND, dl, VT, Op->getOperand(0),
DAG.getConstant(Mask, dl, VT));
@@ -14808,20 +16091,61 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
case ISD::AND:
// If the primary 'and' result isn't used, don't bother using X86ISD::AND,
- // because a TEST instruction will be better.
+ // because a TEST instruction will be better. However, AND should be
+ // preferred if the instruction can be combined into ANDN.
if (!hasNonFlagsUse(Op)) {
SDValue Op0 = ArithOp->getOperand(0);
SDValue Op1 = ArithOp->getOperand(1);
EVT VT = ArithOp.getValueType();
bool isAndn = isBitwiseNot(Op0) || isBitwiseNot(Op1);
bool isLegalAndnType = VT == MVT::i32 || VT == MVT::i64;
+ bool isProperAndn = isAndn && isLegalAndnType && Subtarget.hasBMI();
+
+ // If we cannot select an ANDN instruction, check if we can replace
+ // AND+IMM64 with a shift before giving up. This is possible for masks
+ // like 0xFF000000 or 0x00FFFFFF and if we care only about the zero flag.
+ if (!isProperAndn) {
+ if (!ZeroCheck)
+ break;
+
+ assert(!isa<ConstantSDNode>(Op0) && "AND node isn't canonicalized");
+ auto *CN = dyn_cast<ConstantSDNode>(Op1);
+ if (!CN)
+ break;
+
+ const APInt &Mask = CN->getAPIntValue();
+ if (Mask.isSignedIntN(ShiftToAndMaxMaskWidth))
+ break; // Prefer TEST instruction.
+
+ unsigned BitWidth = Mask.getBitWidth();
+ unsigned LeadingOnes = Mask.countLeadingOnes();
+ unsigned TrailingZeros = Mask.countTrailingZeros();
+
+ if (LeadingOnes + TrailingZeros == BitWidth) {
+ assert(TrailingZeros < VT.getSizeInBits() &&
+ "Shift amount should be less than the type width");
+ MVT ShTy = getScalarShiftAmountTy(DAG.getDataLayout(), VT);
+ SDValue ShAmt = DAG.getConstant(TrailingZeros, dl, ShTy);
+ Op = DAG.getNode(ISD::SRL, dl, VT, Op0, ShAmt);
+ break;
+ }
+
+ unsigned LeadingZeros = Mask.countLeadingZeros();
+ unsigned TrailingOnes = Mask.countTrailingOnes();
+
+ if (LeadingZeros + TrailingOnes == BitWidth) {
+ assert(LeadingZeros < VT.getSizeInBits() &&
+ "Shift amount should be less than the type width");
+ MVT ShTy = getScalarShiftAmountTy(DAG.getDataLayout(), VT);
+ SDValue ShAmt = DAG.getConstant(LeadingZeros, dl, ShTy);
+ Op = DAG.getNode(ISD::SHL, dl, VT, Op0, ShAmt);
+ break;
+ }
- // But if we can combine this into an ANDN operation, then create an AND
- // now and allow it to be pattern matched into an ANDN.
- if (!Subtarget.hasBMI() || !isAndn || !isLegalAndnType)
break;
+ }
}
- // FALL THROUGH
+ LLVM_FALLTHROUGH;
case ISD::SUB:
case ISD::OR:
case ISD::XOR:
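The mask shapes accepted by the new EmitTest code are exactly "leading ones + trailing zeros" and "leading zeros + trailing ones". The following is a minimal standalone sketch (not LLVM code) of why the rewrite preserves the zero flag, using two example 64-bit masks:

#include <cassert>
#include <cstdint>

static bool zeroAfterAnd(uint64_t X, uint64_t Mask) { return (X & Mask) == 0; }

int main() {
  // LeadingOnes + TrailingZeros == BitWidth, e.g. 0xFFFF000000000000:
  // (X & Mask) == 0  <=>  (X >> TrailingZeros) == 0, i.e. an SRL by 48.
  const uint64_t HighMask = 0xFFFF000000000000ULL;
  for (uint64_t X : {0x0ULL, 0x1234ULL, 0x0008000000000000ULL, ~0ULL})
    assert(zeroAfterAnd(X, HighMask) == ((X >> 48) == 0));

  // LeadingZeros + TrailingOnes == BitWidth, e.g. 0x00000000FFFFFFFF:
  // (X & Mask) == 0  <=>  (X << LeadingZeros) == 0, i.e. an SHL by 32.
  const uint64_t LowMask = 0x00000000FFFFFFFFULL;
  for (uint64_t X : {0x0ULL, 0x1234ULL, 0xFFFF000000000000ULL, ~0ULL})
    assert(zeroAfterAnd(X, LowMask) == ((X << 32) == 0));
  return 0;
}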
@@ -14839,7 +16163,7 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
case ISD::XOR: Opcode = X86ISD::XOR; break;
case ISD::AND: Opcode = X86ISD::AND; break;
case ISD::OR: {
- if (!NeedTruncation && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
+ if (!NeedTruncation && ZeroCheck) {
if (SDValue EFLAGS = LowerVectorAllZeroTest(Op, Subtarget, DAG))
return EFLAGS;
}
@@ -14968,14 +16292,27 @@ SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp,
return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl);
}
+/// Check if replacement of SQRT with RSQRT should be disabled.
+bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const {
+ EVT VT = Op.getValueType();
+
+ // We never want to use both SQRT and RSQRT instructions for the same input.
+ if (DAG.getNodeIfExists(X86ISD::FRSQRT, DAG.getVTList(VT), Op))
+ return false;
+
+ if (VT.isVector())
+ return Subtarget.hasFastVectorFSQRT();
+ return Subtarget.hasFastScalarFSQRT();
+}
+
/// The minimum architected relative accuracy is 2^-12. We need one
/// Newton-Raphson step to have a good float result (24 bits of precision).
-SDValue X86TargetLowering::getRsqrtEstimate(SDValue Op,
- DAGCombinerInfo &DCI,
- unsigned &RefinementSteps,
- bool &UseOneConstNR) const {
+SDValue X86TargetLowering::getSqrtEstimate(SDValue Op,
+ SelectionDAG &DAG, int Enabled,
+ int &RefinementSteps,
+ bool &UseOneConstNR,
+ bool Reciprocal) const {
EVT VT = Op.getValueType();
- const char *RecipOp;
// SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
// TODO: Add support for AVX512 (v16f32).
@@ -14984,30 +16321,24 @@ SDValue X86TargetLowering::getRsqrtEstimate(SDValue Op,
// instructions: convert to single, rsqrtss, convert back to double, refine
// (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
// along with FMA, this could be a throughput win.
- if (VT == MVT::f32 && Subtarget.hasSSE1())
- RecipOp = "sqrtf";
- else if ((VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
- (VT == MVT::v8f32 && Subtarget.hasAVX()))
- RecipOp = "vec-sqrtf";
- else
- return SDValue();
-
- TargetRecip Recips = DCI.DAG.getTarget().Options.Reciprocals;
- if (!Recips.isEnabled(RecipOp))
- return SDValue();
+ if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
+ (VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
+ (VT == MVT::v8f32 && Subtarget.hasAVX())) {
+ if (RefinementSteps == ReciprocalEstimate::Unspecified)
+ RefinementSteps = 1;
- RefinementSteps = Recips.getRefinementSteps(RecipOp);
- UseOneConstNR = false;
- return DCI.DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op);
+ UseOneConstNR = false;
+ return DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op);
+ }
+ return SDValue();
}
/// The minimum architected relative accuracy is 2^-12. We need one
/// Newton-Raphson step to have a good float result (24 bits of precision).
-SDValue X86TargetLowering::getRecipEstimate(SDValue Op,
- DAGCombinerInfo &DCI,
- unsigned &RefinementSteps) const {
+SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,
+ int Enabled,
+ int &RefinementSteps) const {
EVT VT = Op.getValueType();
- const char *RecipOp;
// SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
// TODO: Add support for AVX512 (v16f32).
@@ -15016,20 +16347,22 @@ SDValue X86TargetLowering::getRecipEstimate(SDValue Op,
// 15 instructions: convert to single, rcpss, convert back to double, refine
// (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
// along with FMA, this could be a throughput win.
- if (VT == MVT::f32 && Subtarget.hasSSE1())
- RecipOp = "divf";
- else if ((VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
- (VT == MVT::v8f32 && Subtarget.hasAVX()))
- RecipOp = "vec-divf";
- else
- return SDValue();
- TargetRecip Recips = DCI.DAG.getTarget().Options.Reciprocals;
- if (!Recips.isEnabled(RecipOp))
- return SDValue();
+ if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
+ (VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
+ (VT == MVT::v8f32 && Subtarget.hasAVX())) {
+ // Enable estimate codegen with 1 refinement step for vector division.
+ // Scalar division estimates are disabled because they break too much
+ // real-world code. These defaults are intended to match GCC behavior.
+ if (VT == MVT::f32 && Enabled == ReciprocalEstimate::Unspecified)
+ return SDValue();
+
+ if (RefinementSteps == ReciprocalEstimate::Unspecified)
+ RefinementSteps = 1;
- RefinementSteps = Recips.getRefinementSteps(RecipOp);
- return DCI.DAG.getNode(X86ISD::FRCP, SDLoc(Op), VT, Op);
+ return DAG.getNode(X86ISD::FRCP, SDLoc(Op), VT, Op);
+ }
+ return SDValue();
}
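Both estimate hooks above default to a single refinement step. A minimal standalone sketch (not LLVM code) of the Newton-Raphson iterations that the DAG combiner applies on top of RSQRTSS/RCPSS-style estimates; the starting guesses below are arbitrary stand-ins for the hardware estimates:

#include <cassert>
#include <cmath>

// One Newton-Raphson step for y ~= 1/sqrt(a): y1 = y0 * (1.5 - 0.5 * a * y0 * y0).
static float refineRsqrt(float A, float Est) {
  return Est * (1.5f - 0.5f * A * Est * Est);
}
// One Newton-Raphson step for y ~= 1/a: y1 = y0 * (2 - a * y0).
static float refineRcp(float A, float Est) {
  return Est * (2.0f - A * Est);
}

int main() {
  const float A = 3.0f;
  float R = 0.57f; // crude stand-in for an estimate of 1/sqrt(3)
  R = refineRsqrt(A, R);
  assert(std::fabs(R - 1.0f / std::sqrt(A)) < 1e-3f);

  float C = 0.32f; // crude stand-in for an estimate of 1/3
  C = refineRcp(A, C);
  assert(std::fabs(C - 1.0f / A) < 1e-3f);
  return 0;
}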
/// If we have at least two divisions that use the same divisor, convert to
@@ -15042,9 +16375,46 @@ unsigned X86TargetLowering::combineRepeatedFPDivisors() const {
return 2;
}
+/// Helper for creating a X86ISD::SETCC node.
+static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl,
+ SelectionDAG &DAG) {
+ return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
+ DAG.getConstant(Cond, dl, MVT::i8), EFLAGS);
+}
+
+/// Create a BT (Bit Test) node - Test bit \p BitNo in \p Src and set condition
+/// according to equal/not-equal condition code \p CC.
+static SDValue getBitTestCondition(SDValue Src, SDValue BitNo, ISD::CondCode CC,
+ const SDLoc &dl, SelectionDAG &DAG) {
+ // If Src is i8, promote it to i32 with any_extend. There is no i8 BT
+ // instruction. Since the shift amount is in-range-or-undefined, we know
+ // that doing a bittest on the i32 value is ok. We extend to i32 because
+ // the encoding for the i16 version is larger than the i32 version.
+ // Also promote i16 to i32 for performance / code size reason.
+ if (Src.getValueType() == MVT::i8 || Src.getValueType() == MVT::i16)
+ Src = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Src);
+
+ // See if we can use the 32-bit instruction instead of the 64-bit one for a
+ // shorter encoding. Since the former takes the modulo 32 of BitNo and the
+ // latter takes the modulo 64, this is only valid if the 5th bit of BitNo is
+ // known to be zero.
+ if (Src.getValueType() == MVT::i64 &&
+ DAG.MaskedValueIsZero(BitNo, APInt(BitNo.getValueSizeInBits(), 32)))
+ Src = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src);
+
+ // If the operand types disagree, extend the shift amount to match. Since
+ // BT ignores high bits (like shifts) we can use anyextend.
+ if (Src.getValueType() != BitNo.getValueType())
+ BitNo = DAG.getNode(ISD::ANY_EXTEND, dl, Src.getValueType(), BitNo);
+
+ SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, Src, BitNo);
+ X86::CondCode Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
+  return getSETCC(Cond, BT, dl, DAG);
+}
+
/// Result of 'and' is compared against zero. Change to a BT node if possible.
-SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC,
- const SDLoc &dl, SelectionDAG &DAG) const {
+static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC,
+ const SDLoc &dl, SelectionDAG &DAG) {
SDValue Op0 = And.getOperand(0);
SDValue Op1 = And.getOperand(1);
if (Op0.getOpcode() == ISD::TRUNCATE)
@@ -15087,27 +16457,35 @@ SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC,
}
}
- if (LHS.getNode()) {
- // If LHS is i8, promote it to i32 with any_extend. There is no i8 BT
- // instruction. Since the shift amount is in-range-or-undefined, we know
- // that doing a bittest on the i32 value is ok. We extend to i32 because
- // the encoding for the i16 version is larger than the i32 version.
- // Also promote i16 to i32 for performance / code size reason.
- if (LHS.getValueType() == MVT::i8 ||
- LHS.getValueType() == MVT::i16)
- LHS = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS);
+ if (LHS.getNode())
+ return getBitTestCondition(LHS, RHS, CC, dl, DAG);
- // If the operand types disagree, extend the shift amount to match. Since
- // BT ignores high bits (like shifts) we can use anyextend.
- if (LHS.getValueType() != RHS.getValueType())
- RHS = DAG.getNode(ISD::ANY_EXTEND, dl, LHS.getValueType(), RHS);
+ return SDValue();
+}
- SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, LHS, RHS);
- X86::CondCode Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
- return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
- DAG.getConstant(Cond, dl, MVT::i8), BT);
- }
+// Convert (truncate (srl X, N) to i1) to (bt X, N)
+static SDValue LowerTruncateToBT(SDValue Op, ISD::CondCode CC,
+ const SDLoc &dl, SelectionDAG &DAG) {
+
+ assert(Op.getOpcode() == ISD::TRUNCATE && Op.getValueType() == MVT::i1 &&
+ "Expected TRUNCATE to i1 node");
+ if (Op.getOperand(0).getOpcode() != ISD::SRL)
+ return SDValue();
+
+ SDValue ShiftRight = Op.getOperand(0);
+ return getBitTestCondition(ShiftRight.getOperand(0), ShiftRight.getOperand(1),
+ CC, dl, DAG);
+}
+
+/// Result of 'and' or 'trunc to i1' is compared against zero.
+/// Change to a BT node if possible.
+SDValue X86TargetLowering::LowerToBT(SDValue Op, ISD::CondCode CC,
+ const SDLoc &dl, SelectionDAG &DAG) const {
+ if (Op.getOpcode() == ISD::AND)
+ return LowerAndToBT(Op, CC, dl, DAG);
+ if (Op.getOpcode() == ISD::TRUNCATE && Op.getValueType() == MVT::i1)
+ return LowerTruncateToBT(Op, CC, dl, DAG);
return SDValue();
}
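getBitTestCondition above relies on BT computing bit (BitNo mod width) of Src into the carry flag, which is why both (X & (1 << N)) != 0 and ((X >> N) & 1) != 0 can become BT, and why the 64-bit form can only be shrunk to 32-bit when bit 5 of the index is known zero. A minimal standalone sketch (not LLVM code; bitTest32 is a made-up model of the instruction):

#include <cassert>
#include <cstdint>

// Model of BT r32, r32: the bit index is taken modulo 32 and the selected
// bit becomes the carry flag (returned here as a bool).
static bool bitTest32(uint32_t Src, uint32_t BitNo) {
  return (Src >> (BitNo & 31u)) & 1u;
}

int main() {
  uint32_t X = 0xA2; // bits 1, 5 and 7 set
  // (X & (1 << N)) != 0 lowers to BT X, N followed by SETC.
  assert(bitTest32(X, 5));
  assert(!bitTest32(X, 2));
  // ((X >> N) & 1) != 0 is the same predicate, so it lowers the same way.
  assert((((X >> 7) & 1u) != 0) == bitTest32(X, 7));
  // The index is taken mod 32, which is why a 64-bit bit test may only be
  // shrunk to this form when bit 5 of the index is known to be zero.
  assert(bitTest32(X, 33) == bitTest32(X, 1));
  return 0;
}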
@@ -15132,19 +16510,19 @@ static int translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
case ISD::SETOEQ:
case ISD::SETEQ: SSECC = 0; break;
case ISD::SETOGT:
- case ISD::SETGT: Swap = true; // Fallthrough
+ case ISD::SETGT: Swap = true; LLVM_FALLTHROUGH;
case ISD::SETLT:
case ISD::SETOLT: SSECC = 1; break;
case ISD::SETOGE:
- case ISD::SETGE: Swap = true; // Fallthrough
+ case ISD::SETGE: Swap = true; LLVM_FALLTHROUGH;
case ISD::SETLE:
case ISD::SETOLE: SSECC = 2; break;
case ISD::SETUO: SSECC = 3; break;
case ISD::SETUNE:
case ISD::SETNE: SSECC = 4; break;
- case ISD::SETULE: Swap = true; // Fallthrough
+ case ISD::SETULE: Swap = true; LLVM_FALLTHROUGH;
case ISD::SETUGE: SSECC = 5; break;
- case ISD::SETULT: Swap = true; // Fallthrough
+ case ISD::SETULT: Swap = true; LLVM_FALLTHROUGH;
case ISD::SETUGT: SSECC = 6; break;
case ISD::SETO: SSECC = 7; break;
case ISD::SETUEQ:
@@ -15250,12 +16628,12 @@ static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
case ISD::SETNE: SSECC = 4; break;
case ISD::SETEQ: Opc = X86ISD::PCMPEQM; break;
case ISD::SETUGT: SSECC = 6; Unsigned = true; break;
- case ISD::SETLT: Swap = true; //fall-through
+ case ISD::SETLT: Swap = true; LLVM_FALLTHROUGH;
case ISD::SETGT: Opc = X86ISD::PCMPGTM; break;
case ISD::SETULT: SSECC = 1; Unsigned = true; break;
case ISD::SETUGE: SSECC = 5; Unsigned = true; break; //NLT
case ISD::SETGE: Swap = true; SSECC = 2; break; // LE + swap
- case ISD::SETULE: Unsigned = true; //fall-through
+ case ISD::SETULE: Unsigned = true; LLVM_FALLTHROUGH;
case ISD::SETLE: SSECC = 2; break;
}
@@ -15414,7 +16792,7 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
// In this case use SSE compare
bool UseAVX512Inst =
(OpVT.is512BitVector() ||
- OpVT.getVectorElementType().getSizeInBits() >= 32 ||
+ OpVT.getScalarSizeInBits() >= 32 ||
(Subtarget.hasBWI() && Subtarget.hasVLX()));
if (UseAVX512Inst)
@@ -15638,15 +17016,12 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
// Lower (X & (1 << N)) == 0 to BT(X, N).
// Lower ((X >>u N) & 1) != 0 to BT(X, N).
// Lower ((X >>s N) & 1) != 0 to BT(X, N).
- if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() &&
- isNullConstant(Op1) &&
+ // Lower (trunc (X >> N) to i1) to BT(X, N).
+ if (Op0.hasOneUse() && isNullConstant(Op1) &&
(CC == ISD::SETEQ || CC == ISD::SETNE)) {
if (SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG)) {
- if (VT == MVT::i1) {
- NewSetCC = DAG.getNode(ISD::AssertZext, dl, MVT::i8, NewSetCC,
- DAG.getValueType(MVT::i1));
+ if (VT == MVT::i1)
return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, NewSetCC);
- }
return NewSetCC;
}
}
@@ -15665,14 +17040,9 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
return Op0;
CCode = X86::GetOppositeBranchCondition(CCode);
- SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
- DAG.getConstant(CCode, dl, MVT::i8),
- Op0.getOperand(1));
- if (VT == MVT::i1) {
- SetCC = DAG.getNode(ISD::AssertZext, dl, MVT::i8, SetCC,
- DAG.getValueType(MVT::i1));
+ SDValue SetCC = getSETCC(CCode, Op0.getOperand(1), dl, DAG);
+ if (VT == MVT::i1)
return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC);
- }
return SetCC;
}
}
@@ -15687,20 +17057,16 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
}
}
- bool isFP = Op1.getSimpleValueType().isFloatingPoint();
- unsigned X86CC = TranslateX86CC(CC, dl, isFP, Op0, Op1, DAG);
+ bool IsFP = Op1.getSimpleValueType().isFloatingPoint();
+ X86::CondCode X86CC = TranslateX86CC(CC, dl, IsFP, Op0, Op1, DAG);
if (X86CC == X86::COND_INVALID)
return SDValue();
SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, dl, DAG);
EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG);
- SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
- DAG.getConstant(X86CC, dl, MVT::i8), EFLAGS);
- if (VT == MVT::i1) {
- SetCC = DAG.getNode(ISD::AssertZext, dl, MVT::i8, SetCC,
- DAG.getValueType(MVT::i1));
+ SDValue SetCC = getSETCC(X86CC, EFLAGS, dl, DAG);
+ if (VT == MVT::i1)
return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC);
- }
return SetCC;
}
@@ -15717,34 +17083,23 @@ SDValue X86TargetLowering::LowerSETCCE(SDValue Op, SelectionDAG &DAG) const {
assert(Carry.getOpcode() != ISD::CARRY_FALSE);
SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry);
- SDValue SetCC = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
- DAG.getConstant(CC, DL, MVT::i8), Cmp.getValue(1));
- if (Op.getSimpleValueType() == MVT::i1) {
- SetCC = DAG.getNode(ISD::AssertZext, DL, MVT::i8, SetCC,
- DAG.getValueType(MVT::i1));
+ SDValue SetCC = getSETCC(CC, Cmp.getValue(1), DL, DAG);
+ if (Op.getSimpleValueType() == MVT::i1)
return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
- }
return SetCC;
}
/// Return true if opcode is a X86 logical comparison.
static bool isX86LogicalCmp(SDValue Op) {
- unsigned Opc = Op.getNode()->getOpcode();
+ unsigned Opc = Op.getOpcode();
if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
Opc == X86ISD::SAHF)
return true;
if (Op.getResNo() == 1 &&
- (Opc == X86ISD::ADD ||
- Opc == X86ISD::SUB ||
- Opc == X86ISD::ADC ||
- Opc == X86ISD::SBB ||
- Opc == X86ISD::SMUL ||
- Opc == X86ISD::UMUL ||
- Opc == X86ISD::INC ||
- Opc == X86ISD::DEC ||
- Opc == X86ISD::OR ||
- Opc == X86ISD::XOR ||
- Opc == X86ISD::AND))
+ (Opc == X86ISD::ADD || Opc == X86ISD::SUB || Opc == X86ISD::ADC ||
+ Opc == X86ISD::SBB || Opc == X86ISD::SMUL || Opc == X86ISD::UMUL ||
+ Opc == X86ISD::INC || Opc == X86ISD::DEC || Opc == X86ISD::OR ||
+ Opc == X86ISD::XOR || Opc == X86ISD::AND))
return true;
if (Op.getResNo() == 2 && Opc == X86ISD::UMUL)
@@ -15753,27 +17108,18 @@ static bool isX86LogicalCmp(SDValue Op) {
return false;
}
-/// Returns the "condition" node, that may be wrapped with "truncate".
-/// Like this: (i1 (trunc (i8 X86ISD::SETCC))).
-static SDValue getCondAfterTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
+static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
if (V.getOpcode() != ISD::TRUNCATE)
- return V;
+ return false;
SDValue VOp0 = V.getOperand(0);
- if (VOp0.getOpcode() == ISD::AssertZext &&
- V.getValueSizeInBits() ==
- cast<VTSDNode>(VOp0.getOperand(1))->getVT().getSizeInBits())
- return VOp0.getOperand(0);
-
unsigned InBits = VOp0.getValueSizeInBits();
unsigned Bits = V.getValueSizeInBits();
- if (DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits)))
- return V.getOperand(0);
- return V;
+ return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
}
SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
- bool addTest = true;
+ bool AddTest = true;
SDValue Cond = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1);
SDValue Op2 = Op.getOperand(2);
@@ -15794,9 +17140,10 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
if (SSECC != 8) {
if (Subtarget.hasAVX512()) {
- SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, MVT::i1, CondOp0, CondOp1,
- DAG.getConstant(SSECC, DL, MVT::i8));
- return DAG.getNode(X86ISD::SELECT, DL, VT, Cmp, Op1, Op2);
+ SDValue Cmp = DAG.getNode(X86ISD::FSETCCM, DL, MVT::i1, CondOp0,
+ CondOp1, DAG.getConstant(SSECC, DL, MVT::i8));
+ return DAG.getNode(VT.isVector() ? X86ISD::SELECT : X86ISD::SELECTS,
+ DL, VT, Cmp, Op1, Op2);
}
SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
@@ -15840,6 +17187,11 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
}
}
+ // AVX512 fallback is to lower selects of scalar floats to masked moves.
+ if (Cond.getValueType() == MVT::i1 && (VT == MVT::f64 || VT == MVT::f32) &&
+ Subtarget.hasAVX512())
+ return DAG.getNode(X86ISD::SELECTS, DL, VT, Cond, Op1, Op2);
+
if (VT.isVector() && VT.getVectorElementType() == MVT::i1) {
SDValue Op1Scalar;
if (ISD::isBuildVectorOfConstantSDNodes(Op1.getNode()))
@@ -15875,8 +17227,14 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
}
if (Cond.getOpcode() == ISD::SETCC) {
- if (SDValue NewCond = LowerSETCC(Cond, DAG))
+ if (SDValue NewCond = LowerSETCC(Cond, DAG)) {
Cond = NewCond;
+ // If the condition was updated, it's possible that the operands of the
+ // select were also updated (for example, EmitTest has a RAUW). Refresh
+ // the local references to the select operands in case they got stale.
+ Op1 = Op.getOperand(1);
+ Op2 = Op.getOperand(2);
+ }
}
// (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
@@ -15953,7 +17311,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
Opc == X86ISD::BT) { // FIXME
Cond = Cmp;
- addTest = false;
+ AddTest = false;
}
} else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
@@ -15987,12 +17345,13 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
Cond = X86Op.getValue(1);
CC = DAG.getConstant(X86Cond, DL, MVT::i8);
- addTest = false;
+ AddTest = false;
}
- if (addTest) {
+ if (AddTest) {
// Look past the truncate if the high bits are known zero.
- Cond = getCondAfterTruncWithZeroHighBitsInput(Cond, DAG);
+ if (isTruncWithZeroHighBitsInput(Cond, DAG))
+ Cond = Cond.getOperand(0);
// We know the result of AND is compared against zero. Try to match
// it to BT.
@@ -16000,12 +17359,12 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
if (SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, DL, DAG)) {
CC = NewSetCC.getOperand(0);
Cond = NewSetCC.getOperand(1);
- addTest = false;
+ AddTest = false;
}
}
}
- if (addTest) {
+ if (AddTest) {
CC = DAG.getConstant(X86::COND_NE, DL, MVT::i8);
Cond = EmitTest(Cond, X86::COND_NE, DL, DAG);
}
@@ -16077,34 +17436,44 @@ static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op,
VTElt.getSizeInBits() >= 32))))
return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
- unsigned int NumElts = VT.getVectorNumElements();
-
- if (NumElts != 8 && NumElts != 16 && !Subtarget.hasBWI())
- return SDValue();
+ unsigned NumElts = VT.getVectorNumElements();
- if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1) {
+ if (VT.is512BitVector() && InVTElt != MVT::i1 &&
+ (NumElts == 8 || NumElts == 16 || Subtarget.hasBWI())) {
if (In.getOpcode() == X86ISD::VSEXT || In.getOpcode() == X86ISD::VZEXT)
return DAG.getNode(In.getOpcode(), dl, VT, In.getOperand(0));
return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
}
- assert (InVT.getVectorElementType() == MVT::i1 && "Unexpected vector type");
- MVT ExtVT = NumElts == 8 ? MVT::v8i64 : MVT::v16i32;
- SDValue NegOne =
- DAG.getConstant(APInt::getAllOnesValue(ExtVT.getScalarSizeInBits()), dl,
- ExtVT);
- SDValue Zero =
- DAG.getConstant(APInt::getNullValue(ExtVT.getScalarSizeInBits()), dl, ExtVT);
+ if (InVTElt != MVT::i1)
+ return SDValue();
+
+ MVT ExtVT = VT;
+ if (!VT.is512BitVector() && !Subtarget.hasVLX())
+ ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts);
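+ // e.g. sign-extending v8i1 to v8i32 without VLX: 512/8 = 64, so the work is
+ // done in v8i64 (VPMOVM2Q with DQI, or the vselect of all-ones/zero below)
+ // and the result is truncated back to v8i32 with VTRUNC.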
+
+ SDValue V;
+ if (Subtarget.hasDQI()) {
+ V = DAG.getNode(X86ISD::VSEXT, dl, ExtVT, In);
+ assert(!VT.is512BitVector() && "Unexpected vector type");
+ } else {
+ SDValue NegOne = getOnesVector(ExtVT, Subtarget, DAG, dl);
+ SDValue Zero = getZeroVector(ExtVT, Subtarget, DAG, dl);
+ V = DAG.getNode(ISD::VSELECT, dl, ExtVT, In, NegOne, Zero);
+ if (ExtVT == VT)
+ return V;
+ }
- SDValue V = DAG.getNode(ISD::VSELECT, dl, ExtVT, In, NegOne, Zero);
- if (VT.is512BitVector())
- return V;
return DAG.getNode(X86ISD::VTRUNC, dl, VT, V);
}
-static SDValue LowerSIGN_EXTEND_VECTOR_INREG(SDValue Op,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
+// Lowering for SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG.
+// For sign extend this needs to handle all vector sizes and SSE4.1 and
+// non-SSE4.1 targets. For zero extend this should only handle inputs of
+// MVT::v64i8 when BWI is not supported, but AVX512 is.
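+// For example, (v4i32 (sign_extend_vector_inreg (v16i8 X))) maps to a single
+// PMOVSXBD on SSE4.1 (only the low four bytes are used); pre-SSE4.1 it is
+// emulated below with unpacks and arithmetic shifts.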
+static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
SDValue In = Op->getOperand(0);
MVT VT = Op->getSimpleValueType(0);
MVT InVT = In.getSimpleValueType();
@@ -16119,20 +17488,33 @@ static SDValue LowerSIGN_EXTEND_VECTOR_INREG(SDValue Op,
if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
return SDValue();
if (!(VT.is128BitVector() && Subtarget.hasSSE2()) &&
- !(VT.is256BitVector() && Subtarget.hasInt256()))
+ !(VT.is256BitVector() && Subtarget.hasInt256()) &&
+ !(VT.is512BitVector() && Subtarget.hasAVX512()))
return SDValue();
SDLoc dl(Op);
// For 256-bit vectors, we only need the lower (128-bit) half of the input.
- if (VT.is256BitVector())
- In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl,
- MVT::getVectorVT(InSVT, InVT.getVectorNumElements() / 2),
- In, DAG.getIntPtrConstant(0, dl));
+ // For 512-bit vectors, we need 128-bits or 256-bits.
+ if (VT.getSizeInBits() > 128) {
+ // Input needs to be at least the same number of elements as output, and
+ // at least 128-bits.
+ int InSize = InSVT.getSizeInBits() * VT.getVectorNumElements();
+ In = extractSubVector(In, 0, DAG, dl, std::max(InSize, 128));
+ }
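+ // e.g. for (v16i32 (sign_extend_vector_inreg (v64i8 X))) only the low 16 bytes
+ // matter: InSize = 8 * 16 = 128, so the low 128-bit subvector is extracted
+ // before extending.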
+
+ assert((Op.getOpcode() != ISD::ZERO_EXTEND_VECTOR_INREG ||
+ InVT == MVT::v64i8) && "Zero extend only for v64i8 input!");
// SSE41 targets can use the pmovsx* instructions directly.
+ unsigned ExtOpc = Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG ?
+ X86ISD::VSEXT : X86ISD::VZEXT;
if (Subtarget.hasSSE41())
- return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
+ return DAG.getNode(ExtOpc, dl, VT, In);
+
+ // We should only get here for sign extend.
+ assert(Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG &&
+ "Unexpected opcode!");
// pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI.
SDValue Curr = In;
@@ -16150,7 +17532,7 @@ static SDValue LowerSIGN_EXTEND_VECTOR_INREG(SDValue Op,
SDValue SignExt = Curr;
if (CurrVT != InVT) {
unsigned SignExtShift =
- CurrVT.getVectorElementType().getSizeInBits() - InSVT.getSizeInBits();
+ CurrVT.getScalarSizeInBits() - InSVT.getSizeInBits();
SignExt = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr,
DAG.getConstant(SignExtShift, dl, MVT::i8));
}
@@ -16211,7 +17593,7 @@ static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, Undef, ShufMask2);
MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(),
- VT.getVectorNumElements()/2);
+ VT.getVectorNumElements() / 2);
OpLo = DAG.getNode(X86ISD::VSEXT, dl, HalfVT, OpLo);
OpHi = DAG.getNode(X86ISD::VSEXT, dl, HalfVT, OpHi);
@@ -16643,7 +18025,7 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
case X86::COND_B:
// These can only come from an arithmetic instruction with overflow,
// e.g. SADDO, UADDO.
- Cond = Cond.getNode()->getOperand(1);
+ Cond = Cond.getOperand(1);
addTest = false;
break;
}
@@ -16828,11 +18210,11 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
if (addTest) {
// Look past the truncate if the high bits are known zero.
- Cond = getCondAfterTruncWithZeroHighBitsInput(Cond, DAG);
+ if (isTruncWithZeroHighBitsInput(Cond, DAG))
+ Cond = Cond.getOperand(0);
- // We know the result of AND is compared against zero. Try to match
- // it to BT.
- if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
+ // We know the result is compared against zero. Try to match it to BT.
+ if (Cond.hasOneUse()) {
if (SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG)) {
CC = NewSetCC.getOperand(0);
Cond = NewSetCC.getOperand(1);
@@ -17000,7 +18382,7 @@ SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
assert(Subtarget.is64Bit() &&
"LowerVAARG only handles 64-bit va_arg!");
- assert(Op.getNode()->getNumOperands() == 4);
+ assert(Op.getNumOperands() == 4);
MachineFunction &MF = DAG.getMachineFunction();
if (Subtarget.isCallingConvWin64(MF.getFunction()->getCallingConv()))
@@ -17161,6 +18543,7 @@ static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
/// constant. Takes immediate version of shift as input.
static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
SDValue SrcOp, SDValue ShAmt,
+ const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT SVT = ShAmt.getSimpleValueType();
assert((SVT == MVT::i32 || SVT == MVT::i64) && "Unexpected value type!");
@@ -17178,27 +18561,32 @@ static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
case X86ISD::VSRAI: Opc = X86ISD::VSRA; break;
}
- const X86Subtarget &Subtarget =
- static_cast<const X86Subtarget &>(DAG.getSubtarget());
- if (Subtarget.hasSSE41() && ShAmt.getOpcode() == ISD::ZERO_EXTEND &&
- ShAmt.getOperand(0).getSimpleValueType() == MVT::i16) {
- // Let the shuffle legalizer expand this shift amount node.
- SDValue Op0 = ShAmt.getOperand(0);
- Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(Op0), MVT::v8i16, Op0);
- ShAmt = getShuffleVectorZeroOrUndef(Op0, 0, true, Subtarget, DAG);
+ // Need to build a vector containing the shift amount.
+ // SSE/AVX packed shifts only use the lower 64 bits of the shift count.
+ // +=================+============+=======================================+
+ // | ShAmt is | HasSSE4.1? | Construct ShAmt vector as |
+ // +=================+============+=======================================+
+ // | i64 | Yes, No | Use ShAmt as lowest elt |
+ // | i32 | Yes | zero-extend in-reg |
+ // | (i32 zext(i16)) | Yes | zero-extend in-reg |
+ // | i16/i32         | No         | v4i32 build_vector(ShAmt, 0, ud, ud)  |
+ // +=================+============+=======================================+
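+ // e.g. shifting v4i32 by a variable i32 amount on plain SSE2 takes the last
+ // row: ShAmt becomes (v4i32 build_vector <Amt, 0, undef, undef>) and the
+ // packed shift only reads its low 64 bits.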
+
+ if (SVT == MVT::i64)
+ ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v2i64, ShAmt);
+ else if (Subtarget.hasSSE41() && ShAmt.getOpcode() == ISD::ZERO_EXTEND &&
+ ShAmt.getOperand(0).getSimpleValueType() == MVT::i16) {
+ ShAmt = ShAmt.getOperand(0);
+ ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v8i16, ShAmt);
+ ShAmt = DAG.getNode(X86ISD::VZEXT, SDLoc(ShAmt), MVT::v2i64, ShAmt);
+ } else if (Subtarget.hasSSE41() &&
+ ShAmt.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
+ ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v4i32, ShAmt);
+ ShAmt = DAG.getNode(X86ISD::VZEXT, SDLoc(ShAmt), MVT::v2i64, ShAmt);
} else {
- // Need to build a vector containing shift amount.
- // SSE/AVX packed shifts only use the lower 64-bit of the shift count.
- SmallVector<SDValue, 4> ShOps;
- ShOps.push_back(ShAmt);
- if (SVT == MVT::i32) {
- ShOps.push_back(DAG.getConstant(0, dl, SVT));
- ShOps.push_back(DAG.getUNDEF(SVT));
- }
- ShOps.push_back(DAG.getUNDEF(SVT));
-
- MVT BVT = SVT == MVT::i32 ? MVT::v4i32 : MVT::v2i64;
- ShAmt = DAG.getBuildVector(BVT, dl, ShOps);
+ SmallVector<SDValue, 4> ShOps = {ShAmt, DAG.getConstant(0, dl, SVT),
+ DAG.getUNDEF(SVT), DAG.getUNDEF(SVT)};
+ ShAmt = DAG.getBuildVector(MVT::v4i32, dl, ShOps);
}
// The return type has to be a 128-bit type with the same element
@@ -17290,7 +18678,7 @@ static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
case X86ISD::VTRUNC:
case X86ISD::VTRUNCS:
case X86ISD::VTRUNCUS:
- case ISD::FP_TO_FP16:
+ case X86ISD::CVTPS2PH:
// We can't use ISD::VSELECT here because it is not always "Legal"
// for the destination type. For example vpmovqb require only AVX512
// and vselect that can operate on byte element type require BWI
@@ -17321,7 +18709,8 @@ static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
// The mask should be of type MVT::i1
SDValue IMask = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Mask);
- if (Op.getOpcode() == X86ISD::FSETCC)
+ if (Op.getOpcode() == X86ISD::FSETCCM ||
+ Op.getOpcode() == X86ISD::FSETCCM_RND)
return DAG.getNode(ISD::AND, dl, VT, Op, IMask);
if (Op.getOpcode() == X86ISD::VFPCLASS ||
Op.getOpcode() == X86ISD::VFPCLASSS)
@@ -17329,7 +18718,7 @@ static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
if (PreservedSrc.isUndef())
PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
- return DAG.getNode(X86ISD::SELECT, dl, VT, IMask, Op, PreservedSrc);
+ return DAG.getNode(X86ISD::SELECTS, dl, VT, IMask, Op, PreservedSrc);
}
static int getSEHRegistrationNodeSize(const Function *Fn) {
@@ -17395,6 +18784,15 @@ static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
+ // Helper to detect whether the operand is the CUR_DIRECTION rounding mode.
+ auto isRoundModeCurDirection = [](SDValue Rnd) {
+ if (!isa<ConstantSDNode>(Rnd))
+ return false;
+
+ unsigned Round = cast<ConstantSDNode>(Rnd)->getZExtValue();
+ return Round == X86::STATIC_ROUNDING::CUR_DIRECTION;
+ };
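+ // (CUR_DIRECTION corresponds to the _MM_FROUND_CUR_DIRECTION immediate, i.e. 4,
+ // which the intrinsics pass when no embedded rounding is requested.)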
+
SDLoc dl(Op);
unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
MVT VT = Op.getSimpleValueType();
@@ -17406,9 +18804,6 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget
case INTR_TYPE_2OP:
return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
Op.getOperand(2));
- case INTR_TYPE_2OP_IMM8:
- return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
- DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(2)));
case INTR_TYPE_3OP:
return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
Op.getOperand(2), Op.getOperand(3));
@@ -17420,7 +18815,7 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget
SDValue PassThru = Op.getOperand(2);
SDValue Mask = Op.getOperand(3);
SDValue RoundingMode;
- // We allways add rounding mode to the Node.
+ // We always add rounding mode to the Node.
// If the rounding mode is not specified, we add the
// "current direction" mode.
if (Op.getNumOperands() == 4)
@@ -17428,13 +18823,7 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget
DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
else
RoundingMode = Op.getOperand(4);
- unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
- if (IntrWithRoundingModeOpcode != 0)
- if (cast<ConstantSDNode>(RoundingMode)->getZExtValue() !=
- X86::STATIC_ROUNDING::CUR_DIRECTION)
- return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
- dl, Op.getValueType(), Src, RoundingMode),
- Mask, PassThru, Subtarget, DAG);
+ assert(IntrData->Opc1 == 0 && "Unexpected second opcode!");
return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src,
RoundingMode),
Mask, PassThru, Subtarget, DAG);
@@ -17449,8 +18838,7 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget
unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
if (IntrWithRoundingModeOpcode != 0) {
SDValue Rnd = Op.getOperand(4);
- unsigned Round = cast<ConstantSDNode>(Rnd)->getZExtValue();
- if (Round != X86::STATIC_ROUNDING::CUR_DIRECTION) {
+ if (!isRoundModeCurDirection(Rnd)) {
return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
dl, Op.getValueType(),
Src, Rnd),
@@ -17478,8 +18866,7 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget
// (2) With rounding mode and sae - 7 operands.
if (Op.getNumOperands() == 6) {
SDValue Sae = Op.getOperand(5);
- unsigned Opc = IntrData->Opc1 ? IntrData->Opc1 : IntrData->Opc0;
- return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2,
+ return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
Sae),
Mask, Src0, Subtarget, DAG);
}
@@ -17506,8 +18893,7 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget
unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
if (IntrWithRoundingModeOpcode != 0) {
SDValue Rnd = Op.getOperand(5);
- unsigned Round = cast<ConstantSDNode>(Rnd)->getZExtValue();
- if (Round != X86::STATIC_ROUNDING::CUR_DIRECTION) {
+ if (!isRoundModeCurDirection(Rnd)) {
return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
dl, Op.getValueType(),
Src1, Src2, Rnd),
@@ -17564,12 +18950,11 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget
else
Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
- Src1, Src2, Imm, Rnd),
- Mask, PassThru, Subtarget, DAG);
+ Src1, Src2, Imm, Rnd),
+ Mask, PassThru, Subtarget, DAG);
}
case INTR_TYPE_3OP_IMM8_MASK:
- case INTR_TYPE_3OP_MASK:
- case INSERT_SUBVEC: {
+ case INTR_TYPE_3OP_MASK: {
SDValue Src1 = Op.getOperand(1);
SDValue Src2 = Op.getOperand(2);
SDValue Src3 = Op.getOperand(3);
@@ -17578,13 +18963,6 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget
if (IntrData->Type == INTR_TYPE_3OP_IMM8_MASK)
Src3 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src3);
- else if (IntrData->Type == INSERT_SUBVEC) {
- // imm should be adapted to ISD::INSERT_SUBVECTOR behavior
- assert(isa<ConstantSDNode>(Src3) && "Expected a ConstantSDNode here!");
- unsigned Imm = cast<ConstantSDNode>(Src3)->getZExtValue();
- Imm *= Src2.getSimpleValueType().getVectorNumElements();
- Src3 = DAG.getTargetConstant(Imm, dl, MVT::i32);
- }
// We specify 2 possible opcodes for intrinsics with rounding modes.
// First, we check if the intrinsic may have non-default rounding mode,
@@ -17592,8 +18970,7 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget
unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
if (IntrWithRoundingModeOpcode != 0) {
SDValue Rnd = Op.getOperand(6);
- unsigned Round = cast<ConstantSDNode>(Rnd)->getZExtValue();
- if (Round != X86::STATIC_ROUNDING::CUR_DIRECTION) {
+ if (!isRoundModeCurDirection(Rnd)) {
return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
dl, Op.getValueType(),
Src1, Src2, Src3, Rnd),
@@ -17616,19 +18993,21 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget
}
case VPERM_3OP_MASKZ:
case VPERM_3OP_MASK:{
+ MVT VT = Op.getSimpleValueType();
// Src2 is the PassThru
SDValue Src1 = Op.getOperand(1);
- SDValue Src2 = Op.getOperand(2);
+ // PassThru needs to be the same type as the destination in order
+ // to pattern match correctly.
+ SDValue Src2 = DAG.getBitcast(VT, Op.getOperand(2));
SDValue Src3 = Op.getOperand(3);
SDValue Mask = Op.getOperand(4);
- MVT VT = Op.getSimpleValueType();
SDValue PassThru = SDValue();
// set PassThru element
if (IntrData->Type == VPERM_3OP_MASKZ)
PassThru = getZeroVector(VT, Subtarget, DAG, dl);
else
- PassThru = DAG.getBitcast(VT, Src2);
+ PassThru = Src2;
// Swap Src1 and Src2 in the node creation
return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
@@ -17660,8 +19039,7 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget
unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
if (IntrWithRoundingModeOpcode != 0) {
SDValue Rnd = Op.getOperand(5);
- if (cast<ConstantSDNode>(Rnd)->getZExtValue() !=
- X86::STATIC_ROUNDING::CUR_DIRECTION)
+ if (!isRoundModeCurDirection(Rnd))
return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
dl, Op.getValueType(),
Src1, Src2, Src3, Rnd),
@@ -17713,6 +19091,35 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget
Src1, Src2, Src3, Src4),
Mask, PassThru, Subtarget, DAG);
}
+ case CVTPD2PS:
+ // ISD::FP_ROUND has a second argument that indicates if the truncation
+ // does not change the value. Set it to 0 since it can change.
+ return DAG.getNode(IntrData->Opc0, dl, VT, Op.getOperand(1),
+ DAG.getIntPtrConstant(0, dl));
+ case CVTPD2PS_MASK: {
+ SDValue Src = Op.getOperand(1);
+ SDValue PassThru = Op.getOperand(2);
+ SDValue Mask = Op.getOperand(3);
+ // We add rounding mode to the Node when
+ // - RM Opcode is specified and
+ // - RM is not "current direction".
+ unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
+ if (IntrWithRoundingModeOpcode != 0) {
+ SDValue Rnd = Op.getOperand(4);
+ if (!isRoundModeCurDirection(Rnd)) {
+ return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
+ dl, Op.getValueType(),
+ Src, Rnd),
+ Mask, PassThru, Subtarget, DAG);
+ }
+ }
+ assert(IntrData->Opc0 == ISD::FP_ROUND && "Unexpected opcode!");
+ // ISD::FP_ROUND has a second argument that indicates if the truncation
+ // does not change the value. Set it to 0 since it can change.
+ return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src,
+ DAG.getIntPtrConstant(0, dl)),
+ Mask, PassThru, Subtarget, DAG);
+ }
case FPCLASS: {
// FPclass intrinsics with mask
SDValue Src1 = Op.getOperand(1);
@@ -17738,7 +19145,7 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget
SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::i1, Src1, Imm);
SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask,
DAG.getTargetConstant(0, dl, MVT::i1), Subtarget, DAG);
- return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i8, FPclassMask);
+ return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, FPclassMask);
}
case CMP_MASK:
case CMP_MASK_CC: {
@@ -17765,8 +19172,7 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget
// (IntrData->Opc1 != 0), then we check the rounding mode operand.
if (IntrData->Opc1 != 0) {
SDValue Rnd = Op.getOperand(5);
- if (cast<ConstantSDNode>(Rnd)->getZExtValue() !=
- X86::STATIC_ROUNDING::CUR_DIRECTION)
+ if (!isRoundModeCurDirection(Rnd))
Cmp = DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
Op.getOperand(2), CC, Rnd);
}
@@ -17798,8 +19204,7 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget
SDValue Cmp;
if (IntrData->Opc1 != 0) {
SDValue Rnd = Op.getOperand(5);
- if (cast<ConstantSDNode>(Rnd)->getZExtValue() !=
- X86::STATIC_ROUNDING::CUR_DIRECTION)
+ if (!isRoundModeCurDirection(Rnd))
Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::i1, Src1, Src2, CC, Rnd);
}
//default rounding mode
@@ -17822,39 +19227,29 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget
SDValue SetCC;
switch (CC) {
case ISD::SETEQ: { // (ZF = 1 and PF = 0)
- SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
- DAG.getConstant(X86::COND_E, dl, MVT::i8), Comi);
- SDValue SetNP = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
- DAG.getConstant(X86::COND_NP, dl, MVT::i8),
- Comi);
+ SetCC = getSETCC(X86::COND_E, Comi, dl, DAG);
+ SDValue SetNP = getSETCC(X86::COND_NP, Comi, dl, DAG);
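+ // COMISS/COMISD report an unordered compare as ZF = PF = CF = 1, so SETE alone
+ // would also fire for NaN operands; ANDing with SETNP (PF = 0) filters that out.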
SetCC = DAG.getNode(ISD::AND, dl, MVT::i8, SetCC, SetNP);
break;
}
case ISD::SETNE: { // (ZF = 0 or PF = 1)
- SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
- DAG.getConstant(X86::COND_NE, dl, MVT::i8), Comi);
- SDValue SetP = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
- DAG.getConstant(X86::COND_P, dl, MVT::i8),
- Comi);
+ SetCC = getSETCC(X86::COND_NE, Comi, dl, DAG);
+ SDValue SetP = getSETCC(X86::COND_P, Comi, dl, DAG);
SetCC = DAG.getNode(ISD::OR, dl, MVT::i8, SetCC, SetP);
break;
}
case ISD::SETGT: // (CF = 0 and ZF = 0)
- SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
- DAG.getConstant(X86::COND_A, dl, MVT::i8), Comi);
+ SetCC = getSETCC(X86::COND_A, Comi, dl, DAG);
break;
case ISD::SETLT: { // The condition is opposite to GT. Swap the operands.
- SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
- DAG.getConstant(X86::COND_A, dl, MVT::i8), InvComi);
+ SetCC = getSETCC(X86::COND_A, InvComi, dl, DAG);
break;
}
case ISD::SETGE: // CF = 0
- SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
- DAG.getConstant(X86::COND_AE, dl, MVT::i8), Comi);
+ SetCC = getSETCC(X86::COND_AE, Comi, dl, DAG);
break;
case ISD::SETLE: // The condition is opposite to GE. Swap the operands.
- SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
- DAG.getConstant(X86::COND_AE, dl, MVT::i8), InvComi);
+ SetCC = getSETCC(X86::COND_AE, InvComi, dl, DAG);
break;
default:
llvm_unreachable("Unexpected illegal condition!");
@@ -17868,19 +19263,19 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget
SDValue Sae = Op.getOperand(4);
SDValue FCmp;
- if (cast<ConstantSDNode>(Sae)->getZExtValue() ==
- X86::STATIC_ROUNDING::CUR_DIRECTION)
- FCmp = DAG.getNode(X86ISD::FSETCC, dl, MVT::i1, LHS, RHS,
+ if (isRoundModeCurDirection(Sae))
+ FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::i1, LHS, RHS,
DAG.getConstant(CondVal, dl, MVT::i8));
else
- FCmp = DAG.getNode(X86ISD::FSETCC, dl, MVT::i1, LHS, RHS,
+ FCmp = DAG.getNode(X86ISD::FSETCCM_RND, dl, MVT::i1, LHS, RHS,
DAG.getConstant(CondVal, dl, MVT::i8), Sae);
// AnyExt just uses KMOVW %kreg, %r32; ZeroExt emits "and $1, %reg"
return DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, FCmp);
}
case VSHIFT:
return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
- Op.getOperand(1), Op.getOperand(2), DAG);
+ Op.getOperand(1), Op.getOperand(2), Subtarget,
+ DAG);
case COMPRESS_EXPAND_IN_REG: {
SDValue Mask = Op.getOperand(3);
SDValue DataToCompress = Op.getOperand(1);
@@ -18027,14 +19422,15 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget
case Intrinsic::x86_avx_vtestc_pd_256:
case Intrinsic::x86_avx_vtestnzc_pd_256: {
bool IsTestPacked = false;
- unsigned X86CC;
+ X86::CondCode X86CC;
switch (IntNo) {
default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
case Intrinsic::x86_avx_vtestz_ps:
case Intrinsic::x86_avx_vtestz_pd:
case Intrinsic::x86_avx_vtestz_ps_256:
case Intrinsic::x86_avx_vtestz_pd_256:
- IsTestPacked = true; // Fallthrough
+ IsTestPacked = true;
+ LLVM_FALLTHROUGH;
case Intrinsic::x86_sse41_ptestz:
case Intrinsic::x86_avx_ptestz_256:
// ZF = 1
@@ -18044,7 +19440,8 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget
case Intrinsic::x86_avx_vtestc_pd:
case Intrinsic::x86_avx_vtestc_ps_256:
case Intrinsic::x86_avx_vtestc_pd_256:
- IsTestPacked = true; // Fallthrough
+ IsTestPacked = true;
+ LLVM_FALLTHROUGH;
case Intrinsic::x86_sse41_ptestc:
case Intrinsic::x86_avx_ptestc_256:
// CF = 1
@@ -18054,7 +19451,8 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget
case Intrinsic::x86_avx_vtestnzc_pd:
case Intrinsic::x86_avx_vtestnzc_ps_256:
case Intrinsic::x86_avx_vtestnzc_pd_256:
- IsTestPacked = true; // Fallthrough
+ IsTestPacked = true;
+ LLVM_FALLTHROUGH;
case Intrinsic::x86_sse41_ptestnzc:
case Intrinsic::x86_avx_ptestnzc_256:
// ZF and CF = 0
@@ -18066,18 +19464,17 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget
SDValue RHS = Op.getOperand(2);
unsigned TestOpc = IsTestPacked ? X86ISD::TESTP : X86ISD::PTEST;
SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
- SDValue CC = DAG.getConstant(X86CC, dl, MVT::i8);
- SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test);
+ SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
}
case Intrinsic::x86_avx512_kortestz_w:
case Intrinsic::x86_avx512_kortestc_w: {
- unsigned X86CC = (IntNo == Intrinsic::x86_avx512_kortestz_w)? X86::COND_E: X86::COND_B;
+ X86::CondCode X86CC =
+ (IntNo == Intrinsic::x86_avx512_kortestz_w) ? X86::COND_E : X86::COND_B;
SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2));
- SDValue CC = DAG.getConstant(X86CC, dl, MVT::i8);
SDValue Test = DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
- SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test);
+ SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
}
@@ -18092,7 +19489,7 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget
case Intrinsic::x86_sse42_pcmpistriz128:
case Intrinsic::x86_sse42_pcmpestriz128: {
unsigned Opcode;
- unsigned X86CC;
+ X86::CondCode X86CC;
switch (IntNo) {
default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
case Intrinsic::x86_sse42_pcmpistria128:
@@ -18139,9 +19536,7 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget
SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps);
- SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
- DAG.getConstant(X86CC, dl, MVT::i8),
- SDValue(PCMP.getNode(), 1));
+ SDValue SetCC = getSETCC(X86CC, SDValue(PCMP.getNode(), 1), dl, DAG);
return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
}
@@ -18267,6 +19662,51 @@ static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
return SDValue(Res, 0);
}
+/// Handles the lowering of builtin intrinsics that return the value
+/// of the extended control register.
+static void getExtendedControlRegister(SDNode *N, const SDLoc &DL,
+ SelectionDAG &DAG,
+ const X86Subtarget &Subtarget,
+ SmallVectorImpl<SDValue> &Results) {
+ assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
+ SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
+ SDValue LO, HI;
+
+ // The ECX register is used to select the index of the XCR register to
+ // return.
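+ // For example, _xgetbv(0) reads XCR0, whose bits 1 and 2 report whether the OS
+ // has enabled SSE and AVX state saving.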
+ SDValue Chain =
+ DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX, N->getOperand(2));
+ SDNode *N1 = DAG.getMachineNode(X86::XGETBV, DL, Tys, Chain);
+ Chain = SDValue(N1, 0);
+
+ // Reads the content of XCR and returns it in registers EDX:EAX.
+ if (Subtarget.is64Bit()) {
+ LO = DAG.getCopyFromReg(Chain, DL, X86::RAX, MVT::i64, SDValue(N1, 1));
+ HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
+ LO.getValue(2));
+ } else {
+ LO = DAG.getCopyFromReg(Chain, DL, X86::EAX, MVT::i32, SDValue(N1, 1));
+ HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
+ LO.getValue(2));
+ }
+ Chain = HI.getValue(1);
+
+ if (Subtarget.is64Bit()) {
+ // Merge the two 32-bit values into a 64-bit one.
+ SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
+ DAG.getConstant(32, DL, MVT::i8));
+ Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
+ Results.push_back(Chain);
+ return;
+ }
+
+ // Use a buildpair to merge the two 32-bit values into a 64-bit one.
+ SDValue Ops[] = { LO, HI };
+ SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
+ Results.push_back(Pair);
+ Results.push_back(Chain);
+}
+
/// Handles the lowering of builtin intrinsics that read performance monitor
/// counters (x86_rdpmc).
static void getReadPerformanceCounter(SDNode *N, const SDLoc &DL,
@@ -18413,6 +19853,33 @@ static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG) {
return Chain;
}
+/// Emit Truncating Store with signed or unsigned saturation.
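+/// For example (illustrative), a signed-saturating i32->i8 truncating store
+/// clamps 300 to 127 and -200 to -128 before the bytes are written; the
+/// unsigned form clamps to [0, 255].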
+static SDValue
+EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl, SDValue Val,
+ SDValue Ptr, EVT MemVT, MachineMemOperand *MMO,
+ SelectionDAG &DAG) {
+
+ SDVTList VTs = DAG.getVTList(MVT::Other);
+ SDValue Undef = DAG.getUNDEF(Ptr.getValueType());
+ SDValue Ops[] = { Chain, Val, Ptr, Undef };
+ return SignedSat ?
+ DAG.getTargetMemSDNode<TruncSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO) :
+ DAG.getTargetMemSDNode<TruncUSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO);
+}
+
+/// Emit Masked Truncating Store with signed or unsigned saturation.
+static SDValue
+EmitMaskedTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl,
+ SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT,
+ MachineMemOperand *MMO, SelectionDAG &DAG) {
+
+ SDVTList VTs = DAG.getVTList(MVT::Other);
+ SDValue Ops[] = { Chain, Ptr, Mask, Val };
+ return SignedSat ?
+ DAG.getTargetMemSDNode<MaskedTruncSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO) :
+ DAG.getTargetMemSDNode<MaskedTruncUSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO);
+}
+
static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
@@ -18429,8 +19896,8 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
IntNo == llvm::Intrinsic::x86_flags_write_u64) {
// We need a frame pointer because this will get lowered to a PUSH/POP
// sequence.
- MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
- MFI->setHasCopyImplyingStackAdjustment(true);
+ MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
+ MFI.setHasCopyImplyingStackAdjustment(true);
// Don't do anything here, we will expand these intrinsics out later
// during ExpandISelPseudos in EmitInstrWithCustomInserter.
return SDValue();
@@ -18509,13 +19976,18 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
getReadPerformanceCounter(Op.getNode(), dl, DAG, Subtarget, Results);
return DAG.getMergeValues(Results, dl);
}
+ // Get Extended Control Register.
+ case XGETBV: {
+ SmallVector<SDValue, 2> Results;
+ getExtendedControlRegister(Op.getNode(), dl, DAG, Subtarget, Results);
+ return DAG.getMergeValues(Results, dl);
+ }
// XTEST intrinsics.
case XTEST: {
SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
- SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
- DAG.getConstant(X86::COND_NE, dl, MVT::i8),
- InTrans);
+
+ SDValue SetCC = getSETCC(X86::COND_NE, InTrans, dl, DAG);
SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
Ret, SDValue(InTrans.getNode(), 1));
@@ -18530,9 +20002,7 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
Op.getOperand(4), GenCF.getValue(1));
SDValue Store = DAG.getStore(Op.getOperand(0), dl, Res.getValue(0),
Op.getOperand(5), MachinePointerInfo());
- SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
- DAG.getConstant(X86::COND_B, dl, MVT::i8),
- Res.getValue(1));
+ SDValue SetCC = getSETCC(X86::COND_B, Res.getValue(1), dl, DAG);
SDValue Results[] = { SetCC, Store };
return DAG.getMergeValues(Results, dl);
}
@@ -18550,11 +20020,12 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
return DAG.getStore(Chain, dl, DataToCompress, Addr,
MemIntr->getMemOperand());
- SDValue Compressed =
- getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, DataToCompress),
- Mask, DAG.getUNDEF(VT), Subtarget, DAG);
- return DAG.getStore(Chain, dl, Compressed, Addr,
- MemIntr->getMemOperand());
+ MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
+ SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
+
+ return DAG.getMaskedStore(Chain, dl, DataToCompress, Addr, VMask, VT,
+ MemIntr->getMemOperand(),
+ false /* truncating */, true /* compressing */);
}
case TRUNCATE_TO_MEM_VI8:
case TRUNCATE_TO_MEM_VI16:
@@ -18567,18 +20038,39 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
assert(MemIntr && "Expected MemIntrinsicSDNode!");
- EVT VT = MemIntr->getMemoryVT();
+ EVT MemVT = MemIntr->getMemoryVT();
- if (isAllOnesConstant(Mask)) // return just a truncate store
- return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, VT,
- MemIntr->getMemOperand());
+ uint16_t TruncationOp = IntrData->Opc0;
+ switch (TruncationOp) {
+ case X86ISD::VTRUNC: {
+ if (isAllOnesConstant(Mask)) // return just a truncate store
+ return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, MemVT,
+ MemIntr->getMemOperand());
- MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
- SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
+ MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
+ SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
- return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, VMask, VT,
- MemIntr->getMemOperand(), true);
+ return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, VMask, MemVT,
+ MemIntr->getMemOperand(), true /* truncating */);
+ }
+ case X86ISD::VTRUNCUS:
+ case X86ISD::VTRUNCS: {
+ bool IsSigned = (TruncationOp == X86ISD::VTRUNCS);
+ if (isAllOnesConstant(Mask))
+ return EmitTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr, MemVT,
+ MemIntr->getMemOperand(), DAG);
+
+ MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
+ SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
+
+ return EmitMaskedTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr,
+ VMask, MemVT, MemIntr->getMemOperand(), DAG);
+ }
+ default:
+ llvm_unreachable("Unsupported truncstore intrinsic");
+ }
}
+
case EXPAND_FROM_MEM: {
SDValue Mask = Op.getOperand(4);
SDValue PassThru = Op.getOperand(3);
@@ -18589,24 +20081,24 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
assert(MemIntr && "Expected MemIntrinsicSDNode!");
- SDValue DataToExpand = DAG.getLoad(VT, dl, Chain, Addr,
- MemIntr->getMemOperand());
+ if (isAllOnesConstant(Mask)) // Return a regular (unmasked) vector load.
+ return DAG.getLoad(VT, dl, Chain, Addr, MemIntr->getMemOperand());
+ if (X86::isZeroNode(Mask))
+ return DAG.getUNDEF(VT);
- if (isAllOnesConstant(Mask)) // return just a load
- return DataToExpand;
-
- SDValue Results[] = {
- getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, DataToExpand),
- Mask, PassThru, Subtarget, DAG), Chain};
- return DAG.getMergeValues(Results, dl);
+ MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
+ SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
+ return DAG.getMaskedLoad(VT, dl, Chain, Addr, VMask, PassThru, VT,
+ MemIntr->getMemOperand(), ISD::NON_EXTLOAD,
+ true /* expanding */);
}
}
}
SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
SelectionDAG &DAG) const {
- MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
- MFI->setReturnAddressIsTaken(true);
+ MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
+ MFI.setReturnAddressIsTaken(true);
if (verifyReturnAddressArgumentIsConstant(Op, DAG))
return SDValue();
@@ -18630,14 +20122,20 @@ SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
MachinePointerInfo());
}
+SDValue X86TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
+ SelectionDAG &DAG) const {
+ DAG.getMachineFunction().getFrameInfo().setReturnAddressIsTaken(true);
+ return getReturnAddressFrameIndex(DAG);
+}
+
SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
- MachineFrameInfo *MFI = MF.getFrameInfo();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
EVT VT = Op.getValueType();
- MFI->setFrameAddressIsTaken(true);
+ MFI.setFrameAddressIsTaken(true);
if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
// Depth > 0 makes no sense on targets which use Windows unwind codes. It
@@ -18647,7 +20145,7 @@ SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
if (!FrameAddrIndex) {
// Set up a frame object for the return address.
unsigned SlotSize = RegInfo->getSlotSize();
- FrameAddrIndex = MF.getFrameInfo()->CreateFixedObject(
+ FrameAddrIndex = MF.getFrameInfo().CreateFixedObject(
SlotSize, /*Offset=*/0, /*IsImmutable=*/false);
FuncInfo->setFAIndex(FrameAddrIndex);
}
@@ -18965,7 +20463,7 @@ SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
SDLoc DL(Op);
// Save FP Control Word to stack slot
- int SSFI = MF.getFrameInfo()->CreateStackObject(2, StackAlignment, false);
+ int SSFI = MF.getFrameInfo().CreateStackObject(2, StackAlignment, false);
SDValue StackSlot =
DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout()));
@@ -19083,7 +20581,7 @@ static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL,
SmallVector<SDValue, 64> LUTVec;
for (int i = 0; i < NumBytes; ++i)
LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
- SDValue InRegLUT = DAG.getNode(ISD::BUILD_VECTOR, DL, CurrVT, LUTVec);
+ SDValue InRegLUT = DAG.getBuildVector(CurrVT, DL, LUTVec);
// Begin by bitcasting the input to a byte vector, then split those bytes
// into lo/hi nibbles and use the PSHUFB LUT to perform CTLZ on each of them.
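+ // For example, for the byte 0x1A the high-nibble lookup (0x1 -> 3) already
+ // gives the leading-zero count since the high nibble is non-zero; only when
+ // the high nibble is zero is the low-nibble count added on top of 4.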
@@ -19444,43 +20942,63 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
"Only know how to lower V2I64/V4I64/V8I64 multiply");
+ // 32-bit vector types used for MULDQ/MULUDQ.
+ MVT MulVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
+
+ // MULDQ returns the 64-bit result of the signed multiplication of the lower
+ // 32-bits. We can lower with this if the sign bits stretch that far.
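+ // e.g. operands that were sign-extended from v2i32 have at least 33 sign bits,
+ // so a single PMULDQ of the low halves already yields the exact 64-bit product.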
+ if (Subtarget.hasSSE41() && DAG.ComputeNumSignBits(A) > 32 &&
+ DAG.ComputeNumSignBits(B) > 32) {
+ return DAG.getNode(X86ISD::PMULDQ, dl, VT, DAG.getBitcast(MulVT, A),
+ DAG.getBitcast(MulVT, B));
+ }
+
// Ahi = psrlqi(a, 32);
// Bhi = psrlqi(b, 32);
//
// AloBlo = pmuludq(a, b);
// AloBhi = pmuludq(a, Bhi);
// AhiBlo = pmuludq(Ahi, b);
+ //
+ // Hi = psllqi(AloBhi + AhiBlo, 32);
+ // return AloBlo + Hi;
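+ //
+ // Writing a = 2^32*Ahi + Alo and b = 2^32*Bhi + Blo, this is just
+ // a*b mod 2^64 = Alo*Blo + 2^32*(Alo*Bhi + Ahi*Blo); the 2^64*Ahi*Bhi term
+ // wraps to zero.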
+ APInt LowerBitsMask = APInt::getLowBitsSet(64, 32);
+ bool ALoIsZero = DAG.MaskedValueIsZero(A, LowerBitsMask);
+ bool BLoIsZero = DAG.MaskedValueIsZero(B, LowerBitsMask);
+
+ APInt UpperBitsMask = APInt::getHighBitsSet(64, 32);
+ bool AHiIsZero = DAG.MaskedValueIsZero(A, UpperBitsMask);
+ bool BHiIsZero = DAG.MaskedValueIsZero(B, UpperBitsMask);
- // AloBhi = psllqi(AloBhi, 32);
- // AhiBlo = psllqi(AhiBlo, 32);
- // return AloBlo + AloBhi + AhiBlo;
+ // Bit cast to 32-bit vectors for MULUDQ.
+ SDValue Alo = DAG.getBitcast(MulVT, A);
+ SDValue Blo = DAG.getBitcast(MulVT, B);
- SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
- SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
+ SDValue Zero = getZeroVector(VT, Subtarget, DAG, dl);
- SDValue AhiBlo = Ahi;
- SDValue AloBhi = Bhi;
- // Bit cast to 32-bit vectors for MULUDQ
- MVT MulVT = (VT == MVT::v2i64) ? MVT::v4i32 :
- (VT == MVT::v4i64) ? MVT::v8i32 : MVT::v16i32;
- A = DAG.getBitcast(MulVT, A);
- B = DAG.getBitcast(MulVT, B);
- Ahi = DAG.getBitcast(MulVT, Ahi);
- Bhi = DAG.getBitcast(MulVT, Bhi);
+ // Only multiply lo/hi halves that aren't known to be zero.
+ SDValue AloBlo = Zero;
+ if (!ALoIsZero && !BLoIsZero)
+ AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Alo, Blo);
- SDValue AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B);
- // After shifting right const values the result may be all-zero.
- if (!ISD::isBuildVectorAllZeros(Ahi.getNode())) {
- AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B);
- AhiBlo = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, AhiBlo, 32, DAG);
+ SDValue AloBhi = Zero;
+ if (!ALoIsZero && !BHiIsZero) {
+ SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
+ Bhi = DAG.getBitcast(MulVT, Bhi);
+ AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Alo, Bhi);
}
- if (!ISD::isBuildVectorAllZeros(Bhi.getNode())) {
- AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
- AloBhi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, AloBhi, 32, DAG);
+
+ SDValue AhiBlo = Zero;
+ if (!AHiIsZero && !BLoIsZero) {
+ SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
+ Ahi = DAG.getBitcast(MulVT, Ahi);
+ AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, Blo);
}
- SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi);
- return DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo);
+ SDValue Hi = DAG.getNode(ISD::ADD, dl, VT, AloBhi, AhiBlo);
+ Hi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Hi, 32, DAG);
+
+ return DAG.getNode(ISD::ADD, dl, VT, AloBlo, Hi);
}
static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
@@ -19905,7 +21423,8 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
// Special case in 32-bit mode, where i64 is expanded into high and low parts.
if (!Subtarget.is64Bit() && !Subtarget.hasXOP() &&
- (VT == MVT::v2i64 || (Subtarget.hasInt256() && VT == MVT::v4i64))) {
+ (VT == MVT::v2i64 || (Subtarget.hasInt256() && VT == MVT::v4i64) ||
+ (Subtarget.hasAVX512() && VT == MVT::v8i64))) {
// Peek through any splat that was introduced for i64 shift vectorization.
int SplatIndex = -1;
@@ -20018,7 +21537,7 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
else if (EltVT.bitsLT(MVT::i32))
BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);
- return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, DAG);
+ return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, Subtarget, DAG);
}
}
@@ -20147,7 +21666,7 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
}
// If possible, lower this shift as a sequence of two shifts by
- // constant plus a MOVSS/MOVSD instead of scalarizing it.
+ // constant plus a MOVSS/MOVSD/PBLEND instead of scalarizing it.
// Example:
// (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
//
@@ -20167,7 +21686,7 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
SDValue Amt2 = (VT == MVT::v4i32) ? Amt->getOperand(1) : Amt->getOperand(2);
// See if it is possible to replace this node with a sequence of
- // two shifts followed by a MOVSS/MOVSD
+ // two shifts followed by a MOVSS/MOVSD/PBLEND.
if (VT == MVT::v4i32) {
// Check if it is legal to use a MOVSS.
CanBeSimplified = Amt2 == Amt->getOperand(2) &&
@@ -20199,21 +21718,21 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
if (CanBeSimplified && isa<ConstantSDNode>(Amt1) &&
isa<ConstantSDNode>(Amt2)) {
- // Replace this node with two shifts followed by a MOVSS/MOVSD.
+ // Replace this node with two shifts followed by a MOVSS/MOVSD/PBLEND.
MVT CastVT = MVT::v4i32;
SDValue Splat1 =
- DAG.getConstant(cast<ConstantSDNode>(Amt1)->getAPIntValue(), dl, VT);
+ DAG.getConstant(cast<ConstantSDNode>(Amt1)->getAPIntValue(), dl, VT);
SDValue Shift1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat1);
SDValue Splat2 =
- DAG.getConstant(cast<ConstantSDNode>(Amt2)->getAPIntValue(), dl, VT);
+ DAG.getConstant(cast<ConstantSDNode>(Amt2)->getAPIntValue(), dl, VT);
SDValue Shift2 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat2);
- if (TargetOpcode == X86ISD::MOVSD)
- CastVT = MVT::v2i64;
SDValue BitCast1 = DAG.getBitcast(CastVT, Shift1);
SDValue BitCast2 = DAG.getBitcast(CastVT, Shift2);
- SDValue Result = getTargetShuffleNode(TargetOpcode, dl, CastVT, BitCast2,
- BitCast1, DAG);
- return DAG.getBitcast(VT, Result);
+ if (TargetOpcode == X86ISD::MOVSD)
+ return DAG.getBitcast(VT, DAG.getVectorShuffle(CastVT, dl, BitCast1,
+ BitCast2, {0, 1, 6, 7}));
+ return DAG.getBitcast(VT, DAG.getVectorShuffle(CastVT, dl, BitCast1,
+ BitCast2, {0, 5, 6, 7}));
}
}
@@ -20264,15 +21783,44 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});
}
+ // It's worth extending once and using the vXi16/vXi32 shifts for smaller
+ // types, but without AVX512 the extra overheads to get from vXi8 to vXi32
+ // make the existing SSE solution better.
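+ // e.g. with AVX-512, a variable SRA of v16i8 becomes a sign extend to v16i32,
+ // a single VPSRAVD, and a truncate back to v16i8.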
+ if ((Subtarget.hasInt256() && VT == MVT::v8i16) ||
+ (Subtarget.hasAVX512() && VT == MVT::v16i16) ||
+ (Subtarget.hasAVX512() && VT == MVT::v16i8) ||
+ (Subtarget.hasBWI() && VT == MVT::v32i8)) {
+ MVT EvtSVT = (VT == MVT::v32i8 ? MVT::i16 : MVT::i32);
+ MVT ExtVT = MVT::getVectorVT(EvtSVT, VT.getVectorNumElements());
+ unsigned ExtOpc =
+ Op.getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
+ R = DAG.getNode(ExtOpc, dl, ExtVT, R);
+ Amt = DAG.getNode(ISD::ANY_EXTEND, dl, ExtVT, Amt);
+ return DAG.getNode(ISD::TRUNCATE, dl, VT,
+ DAG.getNode(Op.getOpcode(), dl, ExtVT, R, Amt));
+ }
+
if (VT == MVT::v16i8 ||
- (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP())) {
+ (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) ||
+ (VT == MVT::v64i8 && Subtarget.hasBWI())) {
MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
unsigned ShiftOpcode = Op->getOpcode();
auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
- // On SSE41 targets we make use of the fact that VSELECT lowers
- // to PBLENDVB which selects bytes based just on the sign bit.
- if (Subtarget.hasSSE41()) {
+ if (VT.is512BitVector()) {
+ // On AVX512BW targets we make use of the fact that VSELECT lowers
+ // to a masked blend which selects bytes based just on the sign bit
+ // extracted to a mask.
+ MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
+ V0 = DAG.getBitcast(VT, V0);
+ V1 = DAG.getBitcast(VT, V1);
+ Sel = DAG.getBitcast(VT, Sel);
+ Sel = DAG.getNode(X86ISD::CVT2MASK, dl, MaskVT, Sel);
+ return DAG.getBitcast(SelVT,
+ DAG.getNode(ISD::VSELECT, dl, VT, Sel, V0, V1));
+ } else if (Subtarget.hasSSE41()) {
+ // On SSE41 targets we make use of the fact that VSELECT lowers
+ // to PBLENDVB which selects bytes based just on the sign bit.
V0 = DAG.getBitcast(VT, V0);
V1 = DAG.getBitcast(VT, V1);
Sel = DAG.getBitcast(VT, Sel);
@@ -20372,19 +21920,6 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
}
}
- // It's worth extending once and using the v8i32 shifts for 16-bit types, but
- // the extra overheads to get from v16i8 to v8i32 make the existing SSE
- // solution better.
- if (Subtarget.hasInt256() && VT == MVT::v8i16) {
- MVT ExtVT = MVT::v8i32;
- unsigned ExtOpc =
- Op.getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
- R = DAG.getNode(ExtOpc, dl, ExtVT, R);
- Amt = DAG.getNode(ISD::ANY_EXTEND, dl, ExtVT, Amt);
- return DAG.getNode(ISD::TRUNCATE, dl, VT,
- DAG.getNode(Op.getOpcode(), dl, ExtVT, R, Amt));
- }
-
if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) {
MVT ExtVT = MVT::v8i32;
SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);
@@ -20519,7 +22054,7 @@ static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
SDValue LHS = N->getOperand(0);
SDValue RHS = N->getOperand(1);
unsigned BaseOp = 0;
- unsigned Cond = 0;
+ X86::CondCode Cond;
SDLoc DL(Op);
switch (Op.getOpcode()) {
default: llvm_unreachable("Unknown ovf instruction!");
@@ -20567,16 +22102,11 @@ static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
MVT::i32);
SDValue Sum = DAG.getNode(X86ISD::UMUL, DL, VTs, LHS, RHS);
- SDValue SetCC =
- DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
- DAG.getConstant(X86::COND_O, DL, MVT::i32),
- SDValue(Sum.getNode(), 2));
+ SDValue SetCC = getSETCC(X86::COND_O, SDValue(Sum.getNode(), 2), DL, DAG);
- if (N->getValueType(1) == MVT::i1) {
- SetCC = DAG.getNode(ISD::AssertZext, DL, MVT::i8, SetCC,
- DAG.getValueType(MVT::i1));
+ if (N->getValueType(1) == MVT::i1)
SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
- }
+
return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
}
}
@@ -20585,16 +22115,11 @@ static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32);
SDValue Sum = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
- SDValue SetCC =
- DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
- DAG.getConstant(Cond, DL, MVT::i32),
- SDValue(Sum.getNode(), 1));
+ SDValue SetCC = getSETCC(Cond, SDValue(Sum.getNode(), 1), DL, DAG);
- if (N->getValueType(1) == MVT::i1) {
- SetCC = DAG.getNode(ISD::AssertZext, DL, MVT::i8, SetCC,
- DAG.getValueType(MVT::i1));
+ if (N->getValueType(1) == MVT::i1)
SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
- }
+
return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
}
@@ -20790,9 +22315,7 @@ static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget,
DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
MVT::i32, cpOut.getValue(2));
- SDValue Success = DAG.getNode(X86ISD::SETCC, DL, Op->getValueType(1),
- DAG.getConstant(X86::COND_E, DL, MVT::i8),
- EFLAGS);
+ SDValue Success = getSETCC(X86::COND_E, EFLAGS, DL, DAG);
DAG.ReplaceAllUsesOfValueWith(Op.getValue(0), cpOut);
DAG.ReplaceAllUsesOfValueWith(Op.getValue(1), Success);
@@ -20898,8 +22421,9 @@ static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
// two v2i64 vectors which concatenated are the 4 population counts. We can
// then use PACKUSWB to shrink and concatenate them into a v4i32 again.
SDValue Zeros = getZeroVector(VT, Subtarget, DAG, DL);
- SDValue Low = DAG.getNode(X86ISD::UNPCKL, DL, VT, V, Zeros);
- SDValue High = DAG.getNode(X86ISD::UNPCKH, DL, VT, V, Zeros);
+ SDValue V32 = DAG.getBitcast(VT, V);
+ SDValue Low = DAG.getNode(X86ISD::UNPCKL, DL, VT, V32, Zeros);
+ SDValue High = DAG.getNode(X86ISD::UNPCKH, DL, VT, V32, Zeros);
// Do the horizontal sums into two v2i64s.
Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL);
@@ -21054,6 +22578,8 @@ static SDValue LowerVectorCTPOPBitmath(SDValue Op, const SDLoc &DL,
DAG);
}
+// Please ensure that any codegen change from LowerVectorCTPOP is reflected in
+// updated cost models in X86TTIImpl::getIntrinsicInstrCost.
static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
@@ -21260,8 +22786,7 @@ static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());
RHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), RHS);
return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS,
- RHS, AN->getMemOperand(), AN->getOrdering(),
- AN->getSynchScope());
+ RHS, AN->getMemOperand());
}
assert(Opc == ISD::ATOMIC_LOAD_ADD &&
"Used AtomicRMW ops other than Add should have been expanded!");
@@ -21292,9 +22817,7 @@ static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) {
cast<AtomicSDNode>(Node)->getMemoryVT(),
Node->getOperand(0),
Node->getOperand(1), Node->getOperand(2),
- cast<AtomicSDNode>(Node)->getMemOperand(),
- cast<AtomicSDNode>(Node)->getOrdering(),
- cast<AtomicSDNode>(Node)->getSynchScope());
+ cast<AtomicSDNode>(Node)->getMemOperand());
return Swap.getValue(1);
}
// Other atomic stores have a simple pattern.
@@ -21534,26 +23057,48 @@ static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
SDValue Mask = N->getMask();
SDLoc dl(Op);
+ assert((!N->isExpandingLoad() || Subtarget.hasAVX512()) &&
+ "Expanding masked load is supported on AVX-512 target only!");
+
+ assert((!N->isExpandingLoad() || ScalarVT.getSizeInBits() >= 32) &&
+ "Expanding masked load is supported for 32 and 64-bit types only!");
+
+ // 4x32, 4x64 and 2x64 vectors of non-expanding loads are legal regardless of
+ // VLX. Expanding loads of these types are widened below.
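+ // (e.g. a non-expanding v4i32 masked load can be selected with the AVX2
+ // VPMASKMOVD pattern even without VLX, so it is returned unchanged here.)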
+ if (!N->isExpandingLoad() && VT.getVectorNumElements() <= 4)
+ return Op;
+
assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
"Cannot lower masked load op.");
- assert(((ScalarVT == MVT::i32 || ScalarVT == MVT::f32) ||
+ assert((ScalarVT.getSizeInBits() >= 32 ||
(Subtarget.hasBWI() &&
(ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
"Unsupported masked load op.");
// This operation is legal for targets with VLX, but without
// VLX the vector should be widened to 512 bits
- unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
+ unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits();
MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
- MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
SDValue Src0 = N->getSrc0();
Src0 = ExtendToType(Src0, WideDataVT, DAG);
+
+ // Mask element has to be i1.
+ MVT MaskEltTy = Mask.getSimpleValueType().getScalarType();
+ assert((MaskEltTy == MVT::i1 || VT.getVectorNumElements() <= 4) &&
+ "We handle 4x32, 4x64 and 2x64 vectors only in this casse");
+
+ MVT WideMaskVT = MVT::getVectorVT(MaskEltTy, NumEltsInWideVec);
+
Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
+ if (MaskEltTy != MVT::i1)
+ Mask = DAG.getNode(ISD::TRUNCATE, dl,
+ MVT::getVectorVT(MVT::i1, NumEltsInWideVec), Mask);
SDValue NewLoad = DAG.getMaskedLoad(WideDataVT, dl, N->getChain(),
N->getBasePtr(), Mask, Src0,
N->getMemoryVT(), N->getMemOperand(),
- N->getExtensionType());
+ N->getExtensionType(),
+ N->isExpandingLoad());
SDValue Exract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
NewLoad.getValue(0),
@@ -21571,10 +23116,20 @@ static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
SDValue Mask = N->getMask();
SDLoc dl(Op);
+ assert((!N->isCompressingStore() || Subtarget.hasAVX512()) &&
+ "Expanding masked load is supported on AVX-512 target only!");
+
+ assert((!N->isCompressingStore() || ScalarVT.getSizeInBits() >= 32) &&
+ "Expanding masked load is supported for 32 and 64-bit types only!");
+
+ // 4x32 and 2x64 vectors of non-compressing stores are legal regardless of VLX.
+ if (!N->isCompressingStore() && VT.getVectorNumElements() <= 4)
+ return Op;
+
assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
"Cannot lower masked store op.");
- assert(((ScalarVT == MVT::i32 || ScalarVT == MVT::f32) ||
+ assert((ScalarVT.getSizeInBits() >= 32 ||
(Subtarget.hasBWI() &&
(ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
"Unsupported masked store op.");
@@ -21583,12 +23138,22 @@ static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
// VLX the vector should be widened to 512 bits
unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
- MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
+
+ // Mask element has to be i1.
+ MVT MaskEltTy = Mask.getSimpleValueType().getScalarType();
+ assert((MaskEltTy == MVT::i1 || VT.getVectorNumElements() <= 4) &&
+ "We handle 4x32, 4x64 and 2x64 vectors only in this casse");
+
+ MVT WideMaskVT = MVT::getVectorVT(MaskEltTy, NumEltsInWideVec);
+
DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
+ if (MaskEltTy != MVT::i1)
+ Mask = DAG.getNode(ISD::TRUNCATE, dl,
+ MVT::getVectorVT(MVT::i1, NumEltsInWideVec), Mask);
return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
Mask, N->getMemoryVT(), N->getMemOperand(),
- N->isTruncatingStore());
+ N->isTruncatingStore(), N->isCompressingStore());
}
static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
@@ -21734,10 +23299,11 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG);
case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, Subtarget, DAG);
case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, Subtarget, DAG);
+ case ISD::ZERO_EXTEND_VECTOR_INREG:
case ISD::SIGN_EXTEND_VECTOR_INREG:
- return LowerSIGN_EXTEND_VECTOR_INREG(Op, Subtarget, DAG);
- case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG);
- case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG);
+ return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG);
+ case ISD::FP_TO_SINT:
+ case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, Subtarget, DAG);
case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
case ISD::LOAD: return LowerExtendedLoad(Op, Subtarget, DAG);
case ISD::FABS:
@@ -21756,6 +23322,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::INTRINSIC_VOID:
case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
+ case ISD::ADDROFRETURNADDR: return LowerADDROFRETURNADDR(Op, DAG);
case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
case ISD::FRAME_TO_ARGS_OFFSET:
return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
@@ -21830,7 +23397,7 @@ void X86TargetLowering::LowerOperationWrapper(SDNode *N,
// In some cases (LowerSINT_TO_FP for example) Res has more result values
// than the original node; the chain should be dropped (last value).
for (unsigned I = 0, E = N->getNumValues(); I != E; ++I)
- Results.push_back(Res.getValue(I));
+ Results.push_back(Res.getValue(I));
}
/// Replace a node with an illegal result type with a new node built out of
@@ -21851,9 +23418,9 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
auto InVTSize = InVT.getSizeInBits();
const unsigned RegSize =
(InVTSize > 128) ? ((InVTSize > 256) ? 512 : 256) : 128;
- assert((!Subtarget.hasAVX512() || RegSize < 512) &&
- "512-bit vector requires AVX512");
- assert((!Subtarget.hasAVX2() || RegSize < 256) &&
+ assert((Subtarget.hasBWI() || RegSize < 512) &&
+ "512-bit vector requires AVX512BW");
+ assert((Subtarget.hasAVX2() || RegSize < 256) &&
"256-bit vector requires AVX2");
auto ElemVT = InVT.getVectorElementType();
@@ -21888,13 +23455,6 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS));
return;
}
- case ISD::SIGN_EXTEND_INREG:
- case ISD::ADDC:
- case ISD::ADDE:
- case ISD::SUBC:
- case ISD::SUBE:
- // We don't want to expand or promote these.
- return;
case ISD::SDIV:
case ISD::UDIV:
case ISD::SREM:
@@ -21909,6 +23469,36 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
case ISD::FP_TO_UINT: {
bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;
+ if (N->getValueType(0) == MVT::v2i32) {
+ assert((IsSigned || Subtarget.hasAVX512()) &&
+ "Can only handle signed conversion without AVX512");
+ assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
+ SDValue Src = N->getOperand(0);
+ if (Src.getValueType() == MVT::v2f64) {
+ SDValue Idx = DAG.getIntPtrConstant(0, dl);
+ SDValue Res = DAG.getNode(IsSigned ? X86ISD::CVTTP2SI
+ : X86ISD::CVTTP2UI,
+ dl, MVT::v4i32, Src);
+ Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, Idx);
+ Results.push_back(Res);
+ return;
+ }
+ if (Src.getValueType() == MVT::v2f32) {
+ SDValue Idx = DAG.getIntPtrConstant(0, dl);
+ SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
+ DAG.getUNDEF(MVT::v2f32));
+ Res = DAG.getNode(IsSigned ? ISD::FP_TO_SINT
+ : ISD::FP_TO_UINT, dl, MVT::v4i32, Res);
+ Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, Idx);
+ Results.push_back(Res);
+ return;
+ }
+
+ // The FP_TO_INTHelper below only handles f32/f64/f80 scalar inputs,
+ // so early out here.
+ return;
+ }
+
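The v2f32 branch above widens the source with an undef upper half, converts at v4i32 width, and keeps the low subvector; a minimal scalar sketch of that shape (the helper name is ours, with 0.0 standing in for the undef lanes):

    #include <array>
    #include <cstdint>

    // Convert two floats by doing the work at 4-element width and keeping
    // the low two results, mirroring the CONCAT_VECTORS + FP_TO_SINT +
    // EXTRACT_SUBVECTOR sequence above.
    std::array<int32_t, 2> fptosiV2F32(const std::array<float, 2> &Src) {
      std::array<float, 4> Wide = {Src[0], Src[1], 0.0f, 0.0f};
      std::array<int32_t, 4> WideRes{};
      for (unsigned i = 0; i != 4; ++i)
        WideRes[i] = (int32_t)Wide[i];   // per-lane truncating conversion
      return {WideRes[0], WideRes[1]};   // keep only the low subvector
    }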
std::pair<SDValue,SDValue> Vals =
FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true);
SDValue FIST = Vals.first, StackSlot = Vals.second;
@@ -21923,13 +23513,28 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
}
return;
}
+ case ISD::SINT_TO_FP: {
+ assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL!");
+ SDValue Src = N->getOperand(0);
+ if (N->getValueType(0) != MVT::v2f32 || Src.getValueType() != MVT::v2i64)
+ return;
+ Results.push_back(DAG.getNode(X86ISD::CVTSI2P, dl, MVT::v4f32, Src));
+ return;
+ }
case ISD::UINT_TO_FP: {
assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
- if (N->getOperand(0).getValueType() != MVT::v2i32 ||
- N->getValueType(0) != MVT::v2f32)
+ EVT VT = N->getValueType(0);
+ if (VT != MVT::v2f32)
return;
- SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64,
- N->getOperand(0));
+ SDValue Src = N->getOperand(0);
+ EVT SrcVT = Src.getValueType();
+ if (Subtarget.hasDQI() && Subtarget.hasVLX() && SrcVT == MVT::v2i64) {
+ Results.push_back(DAG.getNode(X86ISD::CVTUI2P, dl, MVT::v4f32, Src));
+ return;
+ }
+ if (SrcVT != MVT::v2i32)
+ return;
+ SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, Src);
SDValue VBias =
DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl, MVT::v2f64);
SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
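The UINT_TO_FP path above uses the standard exponent-bias trick: zero-extend the 32-bit value into the mantissa of a double whose exponent encodes 2^52, then subtract 2^52. A standalone scalar sketch, assuming IEEE-754 doubles (the function name is ours, not part of the patch):

    #include <cstdint>
    #include <cstring>

    double uint32ToDoubleViaBias(uint32_t X) {
      // 0x4330000000000000 is the bit pattern of 2^52; OR-ing X into the low
      // mantissa bits yields exactly 2^52 + X, since X < 2^32 <= 2^52.
      uint64_t Bits = 0x4330000000000000ULL | X;
      double D;
      std::memcpy(&D, &Bits, sizeof(D));
      return D - 4503599627370496.0;     // subtract the 2^52 bias, leaving X
    }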
@@ -21967,6 +23572,9 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
Results);
case Intrinsic::x86_rdpmc:
return getReadPerformanceCounter(N, dl, DAG, Subtarget, Results);
+
+ case Intrinsic::x86_xgetbv:
+ return getExtendedControlRegister(N, dl, DAG, Subtarget, Results);
}
}
case ISD::INTRINSIC_WO_CHAIN: {
@@ -22052,9 +23660,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
MVT::i32, cpOutH.getValue(2));
- SDValue Success =
- DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
- DAG.getConstant(X86::COND_E, dl, MVT::i8), EFLAGS);
+ SDValue Success = getSETCC(X86::COND_E, EFLAGS, dl, DAG);
Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));
Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
@@ -22143,6 +23749,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::SETCC: return "X86ISD::SETCC";
case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY";
case X86ISD::FSETCC: return "X86ISD::FSETCC";
+ case X86ISD::FSETCCM: return "X86ISD::FSETCCM";
+ case X86ISD::FSETCCM_RND: return "X86ISD::FSETCCM_RND";
case X86ISD::CMOV: return "X86ISD::CMOV";
case X86ISD::BRCOND: return "X86ISD::BRCOND";
case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG";
@@ -22215,11 +23823,17 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::VTRUNC: return "X86ISD::VTRUNC";
case X86ISD::VTRUNCS: return "X86ISD::VTRUNCS";
case X86ISD::VTRUNCUS: return "X86ISD::VTRUNCUS";
+ case X86ISD::VTRUNCSTORES: return "X86ISD::VTRUNCSTORES";
+ case X86ISD::VTRUNCSTOREUS: return "X86ISD::VTRUNCSTOREUS";
+ case X86ISD::VMTRUNCSTORES: return "X86ISD::VMTRUNCSTORES";
+ case X86ISD::VMTRUNCSTOREUS: return "X86ISD::VMTRUNCSTOREUS";
case X86ISD::VINSERT: return "X86ISD::VINSERT";
case X86ISD::VFPEXT: return "X86ISD::VFPEXT";
+ case X86ISD::VFPEXT_RND: return "X86ISD::VFPEXT_RND";
+ case X86ISD::VFPEXTS_RND: return "X86ISD::VFPEXTS_RND";
case X86ISD::VFPROUND: return "X86ISD::VFPROUND";
- case X86ISD::CVTDQ2PD: return "X86ISD::CVTDQ2PD";
- case X86ISD::CVTUDQ2PD: return "X86ISD::CVTUDQ2PD";
+ case X86ISD::VFPROUND_RND: return "X86ISD::VFPROUND_RND";
+ case X86ISD::VFPROUNDS_RND: return "X86ISD::VFPROUNDS_RND";
case X86ISD::CVT2MASK: return "X86ISD::CVT2MASK";
case X86ISD::VSHLDQ: return "X86ISD::VSHLDQ";
case X86ISD::VSRLDQ: return "X86ISD::VSRLDQ";
@@ -22332,27 +23946,43 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::FNMSUB_RND: return "X86ISD::FNMSUB_RND";
case X86ISD::FMADDSUB_RND: return "X86ISD::FMADDSUB_RND";
case X86ISD::FMSUBADD_RND: return "X86ISD::FMSUBADD_RND";
+ case X86ISD::FMADDS1_RND: return "X86ISD::FMADDS1_RND";
+ case X86ISD::FNMADDS1_RND: return "X86ISD::FNMADDS1_RND";
+ case X86ISD::FMSUBS1_RND: return "X86ISD::FMSUBS1_RND";
+ case X86ISD::FNMSUBS1_RND: return "X86ISD::FNMSUBS1_RND";
+ case X86ISD::FMADDS3_RND: return "X86ISD::FMADDS3_RND";
+ case X86ISD::FNMADDS3_RND: return "X86ISD::FNMADDS3_RND";
+ case X86ISD::FMSUBS3_RND: return "X86ISD::FMSUBS3_RND";
+ case X86ISD::FNMSUBS3_RND: return "X86ISD::FNMSUBS3_RND";
case X86ISD::VPMADD52H: return "X86ISD::VPMADD52H";
case X86ISD::VPMADD52L: return "X86ISD::VPMADD52L";
case X86ISD::VRNDSCALE: return "X86ISD::VRNDSCALE";
+ case X86ISD::VRNDSCALES: return "X86ISD::VRNDSCALES";
case X86ISD::VREDUCE: return "X86ISD::VREDUCE";
+ case X86ISD::VREDUCES: return "X86ISD::VREDUCES";
case X86ISD::VGETMANT: return "X86ISD::VGETMANT";
+ case X86ISD::VGETMANTS: return "X86ISD::VGETMANTS";
case X86ISD::PCMPESTRI: return "X86ISD::PCMPESTRI";
case X86ISD::PCMPISTRI: return "X86ISD::PCMPISTRI";
case X86ISD::XTEST: return "X86ISD::XTEST";
case X86ISD::COMPRESS: return "X86ISD::COMPRESS";
case X86ISD::EXPAND: return "X86ISD::EXPAND";
case X86ISD::SELECT: return "X86ISD::SELECT";
+ case X86ISD::SELECTS: return "X86ISD::SELECTS";
case X86ISD::ADDSUB: return "X86ISD::ADDSUB";
case X86ISD::RCP28: return "X86ISD::RCP28";
+ case X86ISD::RCP28S: return "X86ISD::RCP28S";
case X86ISD::EXP2: return "X86ISD::EXP2";
case X86ISD::RSQRT28: return "X86ISD::RSQRT28";
+ case X86ISD::RSQRT28S: return "X86ISD::RSQRT28S";
case X86ISD::FADD_RND: return "X86ISD::FADD_RND";
case X86ISD::FSUB_RND: return "X86ISD::FSUB_RND";
case X86ISD::FMUL_RND: return "X86ISD::FMUL_RND";
case X86ISD::FDIV_RND: return "X86ISD::FDIV_RND";
case X86ISD::FSQRT_RND: return "X86ISD::FSQRT_RND";
+ case X86ISD::FSQRTS_RND: return "X86ISD::FSQRTS_RND";
case X86ISD::FGETEXP_RND: return "X86ISD::FGETEXP_RND";
+ case X86ISD::FGETEXPS_RND: return "X86ISD::FGETEXPS_RND";
case X86ISD::SCALEF: return "X86ISD::SCALEF";
case X86ISD::SCALEFS: return "X86ISD::SCALEFS";
case X86ISD::ADDS: return "X86ISD::ADDS";
@@ -22361,13 +23991,27 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::MULHRS: return "X86ISD::MULHRS";
case X86ISD::SINT_TO_FP_RND: return "X86ISD::SINT_TO_FP_RND";
case X86ISD::UINT_TO_FP_RND: return "X86ISD::UINT_TO_FP_RND";
- case X86ISD::FP_TO_SINT_RND: return "X86ISD::FP_TO_SINT_RND";
- case X86ISD::FP_TO_UINT_RND: return "X86ISD::FP_TO_UINT_RND";
+ case X86ISD::CVTTP2SI: return "X86ISD::CVTTP2SI";
+ case X86ISD::CVTTP2UI: return "X86ISD::CVTTP2UI";
+ case X86ISD::CVTTP2SI_RND: return "X86ISD::CVTTP2SI_RND";
+ case X86ISD::CVTTP2UI_RND: return "X86ISD::CVTTP2UI_RND";
+ case X86ISD::CVTTS2SI_RND: return "X86ISD::CVTTS2SI_RND";
+ case X86ISD::CVTTS2UI_RND: return "X86ISD::CVTTS2UI_RND";
+ case X86ISD::CVTSI2P: return "X86ISD::CVTSI2P";
+ case X86ISD::CVTUI2P: return "X86ISD::CVTUI2P";
case X86ISD::VFPCLASS: return "X86ISD::VFPCLASS";
case X86ISD::VFPCLASSS: return "X86ISD::VFPCLASSS";
case X86ISD::MULTISHIFT: return "X86ISD::MULTISHIFT";
- case X86ISD::SCALAR_FP_TO_SINT_RND: return "X86ISD::SCALAR_FP_TO_SINT_RND";
- case X86ISD::SCALAR_FP_TO_UINT_RND: return "X86ISD::SCALAR_FP_TO_UINT_RND";
+ case X86ISD::SCALAR_SINT_TO_FP_RND: return "X86ISD::SCALAR_SINT_TO_FP_RND";
+ case X86ISD::SCALAR_UINT_TO_FP_RND: return "X86ISD::SCALAR_UINT_TO_FP_RND";
+ case X86ISD::CVTPS2PH: return "X86ISD::CVTPS2PH";
+ case X86ISD::CVTPH2PS: return "X86ISD::CVTPH2PS";
+ case X86ISD::CVTP2SI: return "X86ISD::CVTP2SI";
+ case X86ISD::CVTP2UI: return "X86ISD::CVTP2UI";
+ case X86ISD::CVTP2SI_RND: return "X86ISD::CVTP2SI_RND";
+ case X86ISD::CVTP2UI_RND: return "X86ISD::CVTP2UI_RND";
+ case X86ISD::CVTS2SI_RND: return "X86ISD::CVTS2SI_RND";
+ case X86ISD::CVTS2UI_RND: return "X86ISD::CVTS2UI_RND";
}
return nullptr;
}
@@ -24031,11 +25675,10 @@ X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
MachineBasicBlock *BB) const {
DebugLoc DL = MI.getDebugLoc();
MachineFunction *MF = BB->getParent();
- MachineModuleInfo *MMI = &MF->getMMI();
- MachineFrameInfo *MFI = MF->getFrameInfo();
+ MachineFrameInfo &MFI = MF->getFrameInfo();
MachineRegisterInfo *MRI = &MF->getRegInfo();
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
- int FI = MFI->getFunctionContextIndex();
+ int FI = MFI.getFunctionContextIndex();
// Get a mapping of the call site numbers to all of the landing pads they're
// associated with.
@@ -24055,10 +25698,10 @@ X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
break;
}
- if (!MMI->hasCallSiteLandingPad(Sym))
+ if (!MF->hasCallSiteLandingPad(Sym))
continue;
- for (unsigned CSI : MMI->getCallSiteLandingPad(Sym)) {
+ for (unsigned CSI : MF->getCallSiteLandingPad(Sym)) {
CallSiteNumToLPad[CSI].push_back(&MBB);
MaxCSNum = std::max(MaxCSNum, CSI);
}
@@ -24208,173 +25851,18 @@ X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
return BB;
}
-// Replace 213-type (isel default) FMA3 instructions with 231-type for
-// accumulator loops. Writing back to the accumulator allows the coalescer
-// to remove extra copies in the loop.
-// FIXME: Do this on AVX512. We don't support 231 variants yet (PR23937).
-MachineBasicBlock *
-X86TargetLowering::emitFMA3Instr(MachineInstr &MI,
- MachineBasicBlock *MBB) const {
- MachineOperand &AddendOp = MI.getOperand(3);
-
- // Bail out early if the addend isn't a register - we can't switch these.
- if (!AddendOp.isReg())
- return MBB;
-
- MachineFunction &MF = *MBB->getParent();
- MachineRegisterInfo &MRI = MF.getRegInfo();
-
- // Check whether the addend is defined by a PHI:
- assert(MRI.hasOneDef(AddendOp.getReg()) && "Multiple defs in SSA?");
- MachineInstr &AddendDef = *MRI.def_instr_begin(AddendOp.getReg());
- if (!AddendDef.isPHI())
- return MBB;
-
- // Look for the following pattern:
- // loop:
- // %addend = phi [%entry, 0], [%loop, %result]
- // ...
- // %result<tied1> = FMA213 %m2<tied0>, %m1, %addend
-
- // Replace with:
- // loop:
- // %addend = phi [%entry, 0], [%loop, %result]
- // ...
- // %result<tied1> = FMA231 %addend<tied0>, %m1, %m2
-
- for (unsigned i = 1, e = AddendDef.getNumOperands(); i < e; i += 2) {
- assert(AddendDef.getOperand(i).isReg());
- MachineOperand PHISrcOp = AddendDef.getOperand(i);
- MachineInstr &PHISrcInst = *MRI.def_instr_begin(PHISrcOp.getReg());
- if (&PHISrcInst == &MI) {
- // Found a matching instruction.
- unsigned NewFMAOpc = 0;
- switch (MI.getOpcode()) {
- case X86::VFMADDPDr213r:
- NewFMAOpc = X86::VFMADDPDr231r;
- break;
- case X86::VFMADDPSr213r:
- NewFMAOpc = X86::VFMADDPSr231r;
- break;
- case X86::VFMADDSDr213r:
- NewFMAOpc = X86::VFMADDSDr231r;
- break;
- case X86::VFMADDSSr213r:
- NewFMAOpc = X86::VFMADDSSr231r;
- break;
- case X86::VFMSUBPDr213r:
- NewFMAOpc = X86::VFMSUBPDr231r;
- break;
- case X86::VFMSUBPSr213r:
- NewFMAOpc = X86::VFMSUBPSr231r;
- break;
- case X86::VFMSUBSDr213r:
- NewFMAOpc = X86::VFMSUBSDr231r;
- break;
- case X86::VFMSUBSSr213r:
- NewFMAOpc = X86::VFMSUBSSr231r;
- break;
- case X86::VFNMADDPDr213r:
- NewFMAOpc = X86::VFNMADDPDr231r;
- break;
- case X86::VFNMADDPSr213r:
- NewFMAOpc = X86::VFNMADDPSr231r;
- break;
- case X86::VFNMADDSDr213r:
- NewFMAOpc = X86::VFNMADDSDr231r;
- break;
- case X86::VFNMADDSSr213r:
- NewFMAOpc = X86::VFNMADDSSr231r;
- break;
- case X86::VFNMSUBPDr213r:
- NewFMAOpc = X86::VFNMSUBPDr231r;
- break;
- case X86::VFNMSUBPSr213r:
- NewFMAOpc = X86::VFNMSUBPSr231r;
- break;
- case X86::VFNMSUBSDr213r:
- NewFMAOpc = X86::VFNMSUBSDr231r;
- break;
- case X86::VFNMSUBSSr213r:
- NewFMAOpc = X86::VFNMSUBSSr231r;
- break;
- case X86::VFMADDSUBPDr213r:
- NewFMAOpc = X86::VFMADDSUBPDr231r;
- break;
- case X86::VFMADDSUBPSr213r:
- NewFMAOpc = X86::VFMADDSUBPSr231r;
- break;
- case X86::VFMSUBADDPDr213r:
- NewFMAOpc = X86::VFMSUBADDPDr231r;
- break;
- case X86::VFMSUBADDPSr213r:
- NewFMAOpc = X86::VFMSUBADDPSr231r;
- break;
-
- case X86::VFMADDPDr213rY:
- NewFMAOpc = X86::VFMADDPDr231rY;
- break;
- case X86::VFMADDPSr213rY:
- NewFMAOpc = X86::VFMADDPSr231rY;
- break;
- case X86::VFMSUBPDr213rY:
- NewFMAOpc = X86::VFMSUBPDr231rY;
- break;
- case X86::VFMSUBPSr213rY:
- NewFMAOpc = X86::VFMSUBPSr231rY;
- break;
- case X86::VFNMADDPDr213rY:
- NewFMAOpc = X86::VFNMADDPDr231rY;
- break;
- case X86::VFNMADDPSr213rY:
- NewFMAOpc = X86::VFNMADDPSr231rY;
- break;
- case X86::VFNMSUBPDr213rY:
- NewFMAOpc = X86::VFNMSUBPDr231rY;
- break;
- case X86::VFNMSUBPSr213rY:
- NewFMAOpc = X86::VFNMSUBPSr231rY;
- break;
- case X86::VFMADDSUBPDr213rY:
- NewFMAOpc = X86::VFMADDSUBPDr231rY;
- break;
- case X86::VFMADDSUBPSr213rY:
- NewFMAOpc = X86::VFMADDSUBPSr231rY;
- break;
- case X86::VFMSUBADDPDr213rY:
- NewFMAOpc = X86::VFMSUBADDPDr231rY;
- break;
- case X86::VFMSUBADDPSr213rY:
- NewFMAOpc = X86::VFMSUBADDPSr231rY;
- break;
- default:
- llvm_unreachable("Unrecognized FMA variant.");
- }
-
- const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
- MachineInstrBuilder MIB =
- BuildMI(MF, MI.getDebugLoc(), TII.get(NewFMAOpc))
- .addOperand(MI.getOperand(0))
- .addOperand(MI.getOperand(3))
- .addOperand(MI.getOperand(2))
- .addOperand(MI.getOperand(1));
- MBB->insert(MachineBasicBlock::iterator(MI), MIB);
- MI.eraseFromParent();
- }
- }
-
- return MBB;
-}
-
MachineBasicBlock *
X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
MachineBasicBlock *BB) const {
+ MachineFunction *MF = BB->getParent();
+ const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+ DebugLoc DL = MI.getDebugLoc();
+
switch (MI.getOpcode()) {
default: llvm_unreachable("Unexpected instr type to insert");
case X86::TAILJMPd64:
case X86::TAILJMPr64:
case X86::TAILJMPm64:
- case X86::TAILJMPd64_REX:
case X86::TAILJMPr64_REX:
case X86::TAILJMPm64_REX:
llvm_unreachable("TAILJMP64 would not be touched here.");
@@ -24423,8 +25911,6 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
case X86::RDFLAGS32:
case X86::RDFLAGS64: {
- DebugLoc DL = MI.getDebugLoc();
- const TargetInstrInfo *TII = Subtarget.getInstrInfo();
unsigned PushF =
MI.getOpcode() == X86::RDFLAGS32 ? X86::PUSHF32 : X86::PUSHF64;
unsigned Pop = MI.getOpcode() == X86::RDFLAGS32 ? X86::POP32r : X86::POP64r;
@@ -24442,8 +25928,6 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
case X86::WRFLAGS32:
case X86::WRFLAGS64: {
- DebugLoc DL = MI.getDebugLoc();
- const TargetInstrInfo *TII = Subtarget.getInstrInfo();
unsigned Push =
MI.getOpcode() == X86::WRFLAGS32 ? X86::PUSH32r : X86::PUSH64r;
unsigned PopF =
@@ -24468,19 +25952,15 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
case X86::FP80_TO_INT16_IN_MEM:
case X86::FP80_TO_INT32_IN_MEM:
case X86::FP80_TO_INT64_IN_MEM: {
- MachineFunction *F = BB->getParent();
- const TargetInstrInfo *TII = Subtarget.getInstrInfo();
- DebugLoc DL = MI.getDebugLoc();
-
// Change the floating point control register to use "round towards zero"
// mode when truncating to an integer value.
- int CWFrameIdx = F->getFrameInfo()->CreateStackObject(2, 2, false);
+ int CWFrameIdx = MF->getFrameInfo().CreateStackObject(2, 2, false);
addFrameReference(BuildMI(*BB, MI, DL,
TII->get(X86::FNSTCW16m)), CWFrameIdx);
// Load the old value of the high byte of the control word...
unsigned OldCW =
- F->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
+ MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW),
CWFrameIdx);
@@ -24588,39 +26068,57 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
case TargetOpcode::PATCHPOINT:
return emitPatchPoint(MI, BB);
- case X86::VFMADDPDr213r:
- case X86::VFMADDPSr213r:
- case X86::VFMADDSDr213r:
- case X86::VFMADDSSr213r:
- case X86::VFMSUBPDr213r:
- case X86::VFMSUBPSr213r:
- case X86::VFMSUBSDr213r:
- case X86::VFMSUBSSr213r:
- case X86::VFNMADDPDr213r:
- case X86::VFNMADDPSr213r:
- case X86::VFNMADDSDr213r:
- case X86::VFNMADDSSr213r:
- case X86::VFNMSUBPDr213r:
- case X86::VFNMSUBPSr213r:
- case X86::VFNMSUBSDr213r:
- case X86::VFNMSUBSSr213r:
- case X86::VFMADDSUBPDr213r:
- case X86::VFMADDSUBPSr213r:
- case X86::VFMSUBADDPDr213r:
- case X86::VFMSUBADDPSr213r:
- case X86::VFMADDPDr213rY:
- case X86::VFMADDPSr213rY:
- case X86::VFMSUBPDr213rY:
- case X86::VFMSUBPSr213rY:
- case X86::VFNMADDPDr213rY:
- case X86::VFNMADDPSr213rY:
- case X86::VFNMSUBPDr213rY:
- case X86::VFNMSUBPSr213rY:
- case X86::VFMADDSUBPDr213rY:
- case X86::VFMADDSUBPSr213rY:
- case X86::VFMSUBADDPDr213rY:
- case X86::VFMSUBADDPSr213rY:
- return emitFMA3Instr(MI, BB);
+ case X86::LCMPXCHG8B: {
+ const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
+ // In addition to the four E[ABCD] registers implied by its encoding,
+ // CMPXCHG8B requires a memory operand. If the current architecture is
+ // i686 and the current function needs a base pointer (which is ESI on
+ // i686), the register allocator would not be able to allocate registers
+ // for an address of the form X(%reg, %reg, Y) - there would never be
+ // enough unreserved registers during regalloc (without the need for a
+ // base pointer the only option would be X(%edi, %esi, Y)).
+ // We give the register allocator a hand by precomputing the address in
+ // a new vreg using LEA.
+
+ // If it is not i686 or there is no base pointer - nothing to do here.
+ if (!Subtarget.is32Bit() || !TRI->hasBasePointer(*MF))
+ return BB;
+
+ // Even though this code does not necessarily need the base pointer to
+ // be ESI, we check for that. The reason: if this assert fails, some
+ // changes have happened in the compiler's base pointer handling, which
+ // most probably have to be addressed somehow here.
+ assert(TRI->getBaseRegister() == X86::ESI &&
+ "LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a "
+ "base pointer in mind");
+
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ MVT SPTy = getPointerTy(MF->getDataLayout());
+ const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
+ unsigned computedAddrVReg = MRI.createVirtualRegister(AddrRegClass);
+
+ X86AddressMode AM = getAddressFromInstr(&MI, 0);
+ // Regalloc does not need any help when the memory operand of CMPXCHG8B
+ // does not use an index register.
+ if (AM.IndexReg == X86::NoRegister)
+ return BB;
+
+ // After X86TargetLowering::ReplaceNodeResults, CMPXCHG8B is glued to its
+ // four operand definitions that are E[ABCD] registers. We skip them and
+ // then insert the LEA.
+ MachineBasicBlock::iterator MBBI(MI);
+ while (MBBI->definesRegister(X86::EAX) || MBBI->definesRegister(X86::EBX) ||
+ MBBI->definesRegister(X86::ECX) || MBBI->definesRegister(X86::EDX))
+ --MBBI;
+ addFullAddress(
+ BuildMI(*BB, *MBBI, DL, TII->get(X86::LEA32r), computedAddrVReg), AM);
+
+ setDirectAddressInInstr(&MI, 0, computedAddrVReg);
+
+ return BB;
+ }
+ case X86::LCMPXCHG16B:
+ return BB;
case X86::LCMPXCHG8B_SAVE_EBX:
case X86::LCMPXCHG16B_SAVE_RBX: {
unsigned BasePtr =
@@ -24667,7 +26165,7 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
// These nodes' second result is a boolean.
if (Op.getResNo() == 0)
break;
- // Fallthrough
+ LLVM_FALLTHROUGH;
case X86ISD::SETCC:
KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1);
break;
@@ -24676,16 +26174,36 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - NumLoBits);
break;
}
+ case X86ISD::VZEXT: {
+ SDValue N0 = Op.getOperand(0);
+ unsigned NumElts = Op.getValueType().getVectorNumElements();
+ unsigned InNumElts = N0.getValueType().getVectorNumElements();
+ unsigned InBitWidth = N0.getValueType().getScalarSizeInBits();
+
+ KnownZero = KnownOne = APInt(InBitWidth, 0);
+ APInt DemandedElts = APInt::getLowBitsSet(InNumElts, NumElts);
+ DAG.computeKnownBits(N0, KnownZero, KnownOne, DemandedElts, Depth + 1);
+ KnownOne = KnownOne.zext(BitWidth);
+ KnownZero = KnownZero.zext(BitWidth);
+ KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - InBitWidth);
+ break;
+ }
}
}
unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
- SDValue Op,
- const SelectionDAG &,
- unsigned Depth) const {
+ SDValue Op, const SelectionDAG &DAG, unsigned Depth) const {
// SETCC_CARRY sets the dest to ~0 for true or 0 for false.
if (Op.getOpcode() == X86ISD::SETCC_CARRY)
- return Op.getValueType().getScalarSizeInBits();
+ return Op.getScalarValueSizeInBits();
+
+ if (Op.getOpcode() == X86ISD::VSEXT) {
+ EVT VT = Op.getValueType();
+ EVT SrcVT = Op.getOperand(0).getValueType();
+ unsigned Tmp = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
+ Tmp += VT.getScalarSizeInBits() - SrcVT.getScalarSizeInBits();
+ return Tmp;
+ }
// Fallback case.
return 1;
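A quick standalone sanity check of the two rules added above, under the assumption that the scalar semantics carry over per vector lane (the helper below is our own, not an LLVM API): zero-extension pins the widened high bits to zero, and sign-extension adds (DstBits - SrcBits) known sign bits.

    #include <cassert>
    #include <cstdint>

    // Number of leading bits equal to the sign bit, the quantity that
    // ComputeNumSignBits tracks.
    static unsigned numSignBits(uint32_t V, unsigned Width) {
      unsigned SignBit = (V >> (Width - 1)) & 1;
      unsigned N = 0;
      for (unsigned i = 0; i != Width; ++i) {
        if (((V >> (Width - 1 - i)) & 1) != SignBit)
          break;
        ++N;
      }
      return N;
    }

    int main() {
      // VZEXT-style rule: i8 -> i32 zero extension leaves bits [31:8] known zero.
      uint8_t U = 0xAB;
      uint32_t Z = U;
      assert((Z & 0xFFFFFF00u) == 0);

      // VSEXT-style rule: i8 -> i32 sign extension adds 32 - 8 = 24 sign bits.
      int8_t S = -3;                        // 0xFD has 6 sign bits as an i8
      int32_t E = S;
      assert(numSignBits((uint32_t)E, 32) == numSignBits((uint8_t)S, 8) + 24);
      return 0;
    }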
@@ -24706,171 +26224,113 @@ bool X86TargetLowering::isGAPlusOffset(SDNode *N,
return TargetLowering::isGAPlusOffset(N, GA, Offset);
}
-/// Performs shuffle combines for 256-bit vectors.
-/// FIXME: This could be expanded to support 512 bit vectors as well.
-static SDValue combineShuffle256(SDNode *N, SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI,
- const X86Subtarget &Subtarget) {
- SDLoc dl(N);
- ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
- SDValue V1 = SVOp->getOperand(0);
- SDValue V2 = SVOp->getOperand(1);
- MVT VT = SVOp->getSimpleValueType(0);
- unsigned NumElems = VT.getVectorNumElements();
-
- if (V1.getOpcode() == ISD::CONCAT_VECTORS &&
- V2.getOpcode() == ISD::CONCAT_VECTORS) {
- //
- // 0,0,0,...
- // |
- // V UNDEF BUILD_VECTOR UNDEF
- // \ / \ /
- // CONCAT_VECTOR CONCAT_VECTOR
- // \ /
- // \ /
- // RESULT: V + zero extended
- //
- if (V2.getOperand(0).getOpcode() != ISD::BUILD_VECTOR ||
- !V2.getOperand(1).isUndef() || !V1.getOperand(1).isUndef())
- return SDValue();
-
- if (!ISD::isBuildVectorAllZeros(V2.getOperand(0).getNode()))
- return SDValue();
-
- // To match the shuffle mask, the first half of the mask should
- // be exactly the first vector, and all the rest a splat with the
- // first element of the second one.
- for (unsigned i = 0; i != NumElems/2; ++i)
- if (!isUndefOrEqual(SVOp->getMaskElt(i), i) ||
- !isUndefOrEqual(SVOp->getMaskElt(i+NumElems/2), NumElems))
- return SDValue();
-
- // If V1 is coming from a vector load then just fold to a VZEXT_LOAD.
- if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(V1.getOperand(0))) {
- if (Ld->hasNUsesOfValue(1, 0)) {
- SDVTList Tys = DAG.getVTList(MVT::v4i64, MVT::Other);
- SDValue Ops[] = { Ld->getChain(), Ld->getBasePtr() };
- SDValue ResNode =
- DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
- Ld->getMemoryVT(),
- Ld->getPointerInfo(),
- Ld->getAlignment(),
- false/*isVolatile*/, true/*ReadMem*/,
- false/*WriteMem*/);
-
- // Make sure the newly-created LOAD is in the same position as Ld in
- // terms of dependency. We create a TokenFactor for Ld and ResNode,
- // and update uses of Ld's output chain to use the TokenFactor.
- if (Ld->hasAnyUseOfValue(1)) {
- SDValue NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
- SDValue(Ld, 1), SDValue(ResNode.getNode(), 1));
- DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewChain);
- DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(Ld, 1),
- SDValue(ResNode.getNode(), 1));
- }
-
- return DAG.getBitcast(VT, ResNode);
- }
- }
-
- // Emit a zeroed vector and insert the desired subvector on its
- // first half.
- SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
- SDValue InsV = insert128BitVector(Zeros, V1.getOperand(0), 0, DAG, dl);
- return DCI.CombineTo(N, InsV);
- }
-
- return SDValue();
-}
-
// Attempt to match a combined shuffle mask against supported unary shuffle
// instructions.
// TODO: Investigate sharing more of this with shuffle lowering.
-static bool matchUnaryVectorShuffle(MVT SrcVT, ArrayRef<int> Mask,
+static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
+ bool FloatDomain,
const X86Subtarget &Subtarget,
- unsigned &Shuffle, MVT &ShuffleVT) {
- bool FloatDomain = SrcVT.isFloatingPoint() ||
- (!Subtarget.hasAVX2() && SrcVT.is256BitVector());
+ unsigned &Shuffle, MVT &SrcVT, MVT &DstVT) {
+ unsigned NumMaskElts = Mask.size();
+ unsigned MaskEltSize = MaskVT.getScalarSizeInBits();
- // Match a 128-bit integer vector against a VZEXT_MOVL (MOVQ) instruction.
- if (!FloatDomain && SrcVT.is128BitVector() &&
- isTargetShuffleEquivalent(Mask, {0, SM_SentinelZero})) {
+ // Match against a VZEXT_MOVL instruction; SSE1 only supports 32-bit elements (MOVSS).
+ if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2())) &&
+ isUndefOrEqual(Mask[0], 0) &&
+ isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
Shuffle = X86ISD::VZEXT_MOVL;
- ShuffleVT = MVT::v2i64;
+ SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
return true;
}
+ // Match against a VZEXT instruction.
+ // TODO: Add 256/512-bit vector support.
+ if (!FloatDomain && MaskVT.is128BitVector() && Subtarget.hasSSE41()) {
+ unsigned MaxScale = 64 / MaskEltSize;
+ for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) {
+ bool Match = true;
+ unsigned NumDstElts = NumMaskElts / Scale;
+ for (unsigned i = 0; i != NumDstElts && Match; ++i) {
+ Match &= isUndefOrEqual(Mask[i * Scale], (int)i);
+ Match &= isUndefOrZeroInRange(Mask, (i * Scale) + 1, Scale - 1);
+ }
+ if (Match) {
+ SrcVT = MaskVT;
+ DstVT = MVT::getIntegerVT(Scale * MaskEltSize);
+ DstVT = MVT::getVectorVT(DstVT, NumDstElts);
+ Shuffle = X86ISD::VZEXT;
+ return true;
+ }
+ }
+ }
+
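The scale loop above looks for masks of the form {0, z.., 1, z.., 2, ...}, i.e. a zero-extension of the low elements. A self-contained sketch of the same matching logic, using -1/-2 in place of the SM_SentinelUndef/SM_SentinelZero sentinels (the names and sentinels here are ours):

    #include <vector>

    constexpr int kUndef = -1, kZero = -2; // stand-ins for the LLVM sentinels

    // Returns the matched zero-extension scale (2, 4, ...) or 0 if none fits.
    int matchZExtScale(const std::vector<int> &Mask, unsigned EltBits) {
      unsigned NumElts = Mask.size();
      unsigned MaxScale = 64 / EltBits;
      for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) {
        bool Match = true;
        unsigned NumDstElts = NumElts / Scale;
        for (unsigned i = 0; i != NumDstElts && Match; ++i) {
          int M = Mask[i * Scale];
          Match &= (M == kUndef || M == (int)i);   // low lane picks element i
          for (unsigned j = 1; j != Scale && Match; ++j) {
            int Z = Mask[i * Scale + j];
            Match &= (Z == kUndef || Z == kZero);  // upper lanes must be zero
          }
        }
        if (Match)
          return (int)Scale;
      }
      return 0;
    }
    // e.g. matchZExtScale({0, kZero, 1, kZero}, 32) == 2  (v4i32 -> v2i64).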
// Check if we have SSE3 which will let us use MOVDDUP etc. The
// instructions are no slower than UNPCKLPD but have the option to
// fold the input operand into even an unaligned memory load.
- if (SrcVT.is128BitVector() && Subtarget.hasSSE3() && FloatDomain) {
+ if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && FloatDomain) {
if (isTargetShuffleEquivalent(Mask, {0, 0})) {
Shuffle = X86ISD::MOVDDUP;
- ShuffleVT = MVT::v2f64;
+ SrcVT = DstVT = MVT::v2f64;
return true;
}
if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
Shuffle = X86ISD::MOVSLDUP;
- ShuffleVT = MVT::v4f32;
+ SrcVT = DstVT = MVT::v4f32;
return true;
}
if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3})) {
Shuffle = X86ISD::MOVSHDUP;
- ShuffleVT = MVT::v4f32;
+ SrcVT = DstVT = MVT::v4f32;
return true;
}
}
- if (SrcVT.is256BitVector() && FloatDomain) {
+ if (MaskVT.is256BitVector() && FloatDomain) {
assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
Shuffle = X86ISD::MOVDDUP;
- ShuffleVT = MVT::v4f64;
+ SrcVT = DstVT = MVT::v4f64;
return true;
}
if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
Shuffle = X86ISD::MOVSLDUP;
- ShuffleVT = MVT::v8f32;
+ SrcVT = DstVT = MVT::v8f32;
return true;
}
if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3, 5, 5, 7, 7})) {
Shuffle = X86ISD::MOVSHDUP;
- ShuffleVT = MVT::v8f32;
+ SrcVT = DstVT = MVT::v8f32;
return true;
}
}
- if (SrcVT.is512BitVector() && FloatDomain) {
+ if (MaskVT.is512BitVector() && FloatDomain) {
assert(Subtarget.hasAVX512() &&
"AVX512 required for 512-bit vector shuffles");
if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
Shuffle = X86ISD::MOVDDUP;
- ShuffleVT = MVT::v8f64;
+ SrcVT = DstVT = MVT::v8f64;
return true;
}
if (isTargetShuffleEquivalent(
Mask, {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14})) {
Shuffle = X86ISD::MOVSLDUP;
- ShuffleVT = MVT::v16f32;
+ SrcVT = DstVT = MVT::v16f32;
return true;
}
if (isTargetShuffleEquivalent(
Mask, {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15})) {
Shuffle = X86ISD::MOVSHDUP;
- ShuffleVT = MVT::v16f32;
+ SrcVT = DstVT = MVT::v16f32;
return true;
}
}
// Attempt to match against broadcast-from-vector.
if (Subtarget.hasAVX2()) {
- unsigned NumElts = Mask.size();
- SmallVector<int, 64> BroadcastMask(NumElts, 0);
+ SmallVector<int, 64> BroadcastMask(NumMaskElts, 0);
if (isTargetShuffleEquivalent(Mask, BroadcastMask)) {
- unsigned EltSize = SrcVT.getSizeInBits() / NumElts;
- ShuffleVT = FloatDomain ? MVT::getFloatingPointVT(EltSize)
- : MVT::getIntegerVT(EltSize);
- ShuffleVT = MVT::getVectorVT(ShuffleVT, NumElts);
+ SrcVT = DstVT = MaskVT;
Shuffle = X86ISD::VBROADCAST;
return true;
}
@@ -24882,19 +26342,44 @@ static bool matchUnaryVectorShuffle(MVT SrcVT, ArrayRef<int> Mask,
// Attempt to match a combined shuffle mask against supported unary immediate
// permute instructions.
// TODO: Investigate sharing more of this with shuffle lowering.
-static bool matchPermuteVectorShuffle(MVT SrcVT, ArrayRef<int> Mask,
- const X86Subtarget &Subtarget,
- unsigned &Shuffle, MVT &ShuffleVT,
- unsigned &PermuteImm) {
- // Ensure we don't contain any zero elements.
- for (int M : Mask) {
- if (M == SM_SentinelZero)
- return false;
- assert(SM_SentinelUndef <= M && M < (int)Mask.size() &&
- "Expected unary shuffle");
+static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
+ bool FloatDomain,
+ const X86Subtarget &Subtarget,
+ unsigned &Shuffle, MVT &ShuffleVT,
+ unsigned &PermuteImm) {
+ unsigned NumMaskElts = Mask.size();
+
+ bool ContainsZeros = false;
+ SmallBitVector Zeroable(NumMaskElts, false);
+ for (unsigned i = 0; i != NumMaskElts; ++i) {
+ int M = Mask[i];
+ Zeroable[i] = isUndefOrZero(M);
+ ContainsZeros |= (M == SM_SentinelZero);
+ }
+
+ // Attempt to match against byte/bit shifts.
+ // FIXME: Add 512-bit support.
+ if (!FloatDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
+ (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
+ int ShiftAmt = matchVectorShuffleAsShift(ShuffleVT, Shuffle,
+ MaskVT.getScalarSizeInBits(), Mask,
+ 0, Zeroable, Subtarget);
+ if (0 < ShiftAmt) {
+ PermuteImm = (unsigned)ShiftAmt;
+ return true;
+ }
}
- unsigned MaskScalarSizeInBits = SrcVT.getSizeInBits() / Mask.size();
+ // Ensure we don't contain any zero elements.
+ if (ContainsZeros)
+ return false;
+
+ assert(llvm::all_of(Mask, [&](int M) {
+ return SM_SentinelUndef <= M && M < (int)NumMaskElts;
+ }) && "Expected unary shuffle");
+
+ unsigned InputSizeInBits = MaskVT.getSizeInBits();
+ unsigned MaskScalarSizeInBits = InputSizeInBits / Mask.size();
MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits);
// Handle PSHUFLW/PSHUFHW repeated patterns.
@@ -24908,7 +26393,7 @@ static bool matchPermuteVectorShuffle(MVT SrcVT, ArrayRef<int> Mask,
if (isUndefOrInRange(LoMask, 0, 4) &&
isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
Shuffle = X86ISD::PSHUFLW;
- ShuffleVT = MVT::getVectorVT(MVT::i16, SrcVT.getSizeInBits() / 16);
+ ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
PermuteImm = getV4X86ShuffleImm(LoMask);
return true;
}
@@ -24922,7 +26407,7 @@ static bool matchPermuteVectorShuffle(MVT SrcVT, ArrayRef<int> Mask,
OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4);
Shuffle = X86ISD::PSHUFHW;
- ShuffleVT = MVT::getVectorVT(MVT::i16, SrcVT.getSizeInBits() / 16);
+ ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
PermuteImm = getV4X86ShuffleImm(OffsetHiMask);
return true;
}
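For reference, the 8-bit immediate produced here packs one 2-bit source index per destination element, with element 0 in the lowest bits. A small sketch of that encoding (our helper, with undef lanes simply mapped to 0, which may differ from getV4X86ShuffleImm's exact undef handling):

    #include <array>

    unsigned encodeV4ShuffleImm(const std::array<int, 4> &Mask) {
      unsigned Imm = 0;
      for (unsigned i = 0; i != 4; ++i)
        Imm |= (unsigned)(Mask[i] < 0 ? 0 : (Mask[i] & 3)) << (2 * i);
      return Imm;
    }
    // e.g. encodeV4ShuffleImm({1, 0, 3, 2}) == 0xB1 (swap within each pair).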
@@ -24938,24 +26423,23 @@ static bool matchPermuteVectorShuffle(MVT SrcVT, ArrayRef<int> Mask,
// AVX introduced the VPERMILPD/VPERMILPS float permutes, before then we
// had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
- bool FloatDomain = SrcVT.isFloatingPoint();
if (FloatDomain && !Subtarget.hasAVX())
return false;
// Pre-AVX2 we must use float shuffles on 256-bit vectors.
- if (SrcVT.is256BitVector() && !Subtarget.hasAVX2())
+ if (MaskVT.is256BitVector() && !Subtarget.hasAVX2())
FloatDomain = true;
// Check for lane crossing permutes.
if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) {
// PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
- if (Subtarget.hasAVX2() && SrcVT.is256BitVector() && Mask.size() == 4) {
+ if (Subtarget.hasAVX2() && MaskVT.is256BitVector() && Mask.size() == 4) {
Shuffle = X86ISD::VPERMI;
ShuffleVT = (FloatDomain ? MVT::v4f64 : MVT::v4i64);
PermuteImm = getV4X86ShuffleImm(Mask);
return true;
}
- if (Subtarget.hasAVX512() && SrcVT.is512BitVector() && Mask.size() == 8) {
+ if (Subtarget.hasAVX512() && MaskVT.is512BitVector() && Mask.size() == 8) {
SmallVector<int, 4> RepeatedMask;
if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) {
Shuffle = X86ISD::VPERMI;
@@ -24994,7 +26478,7 @@ static bool matchPermuteVectorShuffle(MVT SrcVT, ArrayRef<int> Mask,
Shuffle = (FloatDomain ? X86ISD::VPERMILPI : X86ISD::PSHUFD);
ShuffleVT = (FloatDomain ? MVT::f32 : MVT::i32);
- ShuffleVT = MVT::getVectorVT(ShuffleVT, SrcVT.getSizeInBits() / 32);
+ ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32);
PermuteImm = getV4X86ShuffleImm(WordMask);
return true;
}
@@ -25002,47 +26486,259 @@ static bool matchPermuteVectorShuffle(MVT SrcVT, ArrayRef<int> Mask,
// Attempt to match a combined unary shuffle mask against supported binary
// shuffle instructions.
// TODO: Investigate sharing more of this with shuffle lowering.
-static bool matchBinaryVectorShuffle(MVT SrcVT, ArrayRef<int> Mask,
- unsigned &Shuffle, MVT &ShuffleVT) {
- bool FloatDomain = SrcVT.isFloatingPoint();
+static bool matchBinaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
+ bool FloatDomain, SDValue &V1, SDValue &V2,
+ const X86Subtarget &Subtarget,
+ unsigned &Shuffle, MVT &ShuffleVT,
+ bool IsUnary) {
+ unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
- if (SrcVT.is128BitVector()) {
+ if (MaskVT.is128BitVector()) {
if (isTargetShuffleEquivalent(Mask, {0, 0}) && FloatDomain) {
+ V2 = V1;
Shuffle = X86ISD::MOVLHPS;
ShuffleVT = MVT::v4f32;
return true;
}
if (isTargetShuffleEquivalent(Mask, {1, 1}) && FloatDomain) {
+ V2 = V1;
Shuffle = X86ISD::MOVHLPS;
ShuffleVT = MVT::v4f32;
return true;
}
- if (isTargetShuffleEquivalent(Mask, {0, 0, 1, 1}) && FloatDomain) {
- Shuffle = X86ISD::UNPCKL;
- ShuffleVT = MVT::v4f32;
+ if (isTargetShuffleEquivalent(Mask, {0, 3}) && Subtarget.hasSSE2() &&
+ (FloatDomain || !Subtarget.hasSSE41())) {
+ std::swap(V1, V2);
+ Shuffle = X86ISD::MOVSD;
+ ShuffleVT = MaskVT;
return true;
}
- if (isTargetShuffleEquivalent(Mask, {2, 2, 3, 3}) && FloatDomain) {
- Shuffle = X86ISD::UNPCKH;
- ShuffleVT = MVT::v4f32;
+ if (isTargetShuffleEquivalent(Mask, {4, 1, 2, 3}) &&
+ (FloatDomain || !Subtarget.hasSSE41())) {
+ Shuffle = X86ISD::MOVSS;
+ ShuffleVT = MaskVT;
+ return true;
+ }
+ }
+
+ // Attempt to match against either a unary or binary UNPCKL/UNPCKH shuffle.
+ if ((MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) ||
+ (MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
+ (MaskVT.is256BitVector() && 32 <= EltSizeInBits && Subtarget.hasAVX()) ||
+ (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
+ (MaskVT.is512BitVector() && Subtarget.hasAVX512())) {
+ MVT LegalVT = MaskVT;
+ if (LegalVT.is256BitVector() && !Subtarget.hasAVX2())
+ LegalVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64);
+
+ SmallVector<int, 64> Unpckl, Unpckh;
+ if (IsUnary) {
+ createUnpackShuffleMask(MaskVT, Unpckl, true, true);
+ if (isTargetShuffleEquivalent(Mask, Unpckl)) {
+ V2 = V1;
+ Shuffle = X86ISD::UNPCKL;
+ ShuffleVT = LegalVT;
+ return true;
+ }
+
+ createUnpackShuffleMask(MaskVT, Unpckh, false, true);
+ if (isTargetShuffleEquivalent(Mask, Unpckh)) {
+ V2 = V1;
+ Shuffle = X86ISD::UNPCKH;
+ ShuffleVT = LegalVT;
+ return true;
+ }
+ } else {
+ createUnpackShuffleMask(MaskVT, Unpckl, true, false);
+ if (isTargetShuffleEquivalent(Mask, Unpckl)) {
+ Shuffle = X86ISD::UNPCKL;
+ ShuffleVT = LegalVT;
+ return true;
+ }
+
+ createUnpackShuffleMask(MaskVT, Unpckh, false, false);
+ if (isTargetShuffleEquivalent(Mask, Unpckh)) {
+ Shuffle = X86ISD::UNPCKH;
+ ShuffleVT = LegalVT;
+ return true;
+ }
+
+ ShuffleVectorSDNode::commuteMask(Unpckl);
+ if (isTargetShuffleEquivalent(Mask, Unpckl)) {
+ std::swap(V1, V2);
+ Shuffle = X86ISD::UNPCKL;
+ ShuffleVT = LegalVT;
+ return true;
+ }
+
+ ShuffleVectorSDNode::commuteMask(Unpckh);
+ if (isTargetShuffleEquivalent(Mask, Unpckh)) {
+ std::swap(V1, V2);
+ Shuffle = X86ISD::UNPCKH;
+ ShuffleVT = LegalVT;
+ return true;
+ }
+ }
+ }
+
+ return false;
+}
+
+static bool matchBinaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
+ bool FloatDomain,
+ SDValue &V1, SDValue &V2,
+ SDLoc &DL, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget,
+ unsigned &Shuffle, MVT &ShuffleVT,
+ unsigned &PermuteImm) {
+ unsigned NumMaskElts = Mask.size();
+
+ // Attempt to match against PALIGNR byte rotate.
+ if (!FloatDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) ||
+ (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
+ int ByteRotation = matchVectorShuffleAsByteRotate(MaskVT, V1, V2, Mask);
+ if (0 < ByteRotation) {
+ Shuffle = X86ISD::PALIGNR;
+ ShuffleVT = MVT::getVectorVT(MVT::i8, MaskVT.getSizeInBits() / 8);
+ PermuteImm = ByteRotation;
+ return true;
+ }
+ }
+
+ // Attempt to combine to X86ISD::BLENDI.
+ if (NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) ||
+ (Subtarget.hasAVX() && MaskVT.is256BitVector()))) {
+ // Determine a type compatible with X86ISD::BLENDI.
+ // TODO - add 16i16 support (requires lane duplication).
+ MVT BlendVT = MaskVT;
+ if (Subtarget.hasAVX2()) {
+ if (BlendVT == MVT::v4i64)
+ BlendVT = MVT::v8i32;
+ else if (BlendVT == MVT::v2i64)
+ BlendVT = MVT::v4i32;
+ } else {
+ if (BlendVT == MVT::v2i64 || BlendVT == MVT::v4i32)
+ BlendVT = MVT::v8i16;
+ else if (BlendVT == MVT::v4i64)
+ BlendVT = MVT::v4f64;
+ else if (BlendVT == MVT::v8i32)
+ BlendVT = MVT::v8f32;
+ }
+
+ unsigned BlendSize = BlendVT.getVectorNumElements();
+ unsigned MaskRatio = BlendSize / NumMaskElts;
+
+ // Can we blend with zero?
+ if (isSequentialOrUndefOrZeroInRange(Mask, /*Pos*/ 0, /*Size*/ NumMaskElts,
+ /*Low*/ 0) &&
+ NumMaskElts <= BlendVT.getVectorNumElements()) {
+ PermuteImm = 0;
+ for (unsigned i = 0; i != BlendSize; ++i)
+ if (Mask[i / MaskRatio] < 0)
+ PermuteImm |= 1u << i;
+
+ V2 = getZeroVector(BlendVT, Subtarget, DAG, DL);
+ Shuffle = X86ISD::BLENDI;
+ ShuffleVT = BlendVT;
return true;
}
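The immediate built just above sets one bit per BlendVT lane, replicating each original mask element MaskRatio times when the blend is performed at a wider element count. A compact sketch of that computation (the names are ours):

    #include <vector>

    // Negative mask values stand for undef/zero lanes that should take the
    // zero vector; every original element maps to MaskRatio immediate bits.
    unsigned blendWithZeroImm(const std::vector<int> &Mask, unsigned BlendSize) {
      unsigned MaskRatio = BlendSize / Mask.size();
      unsigned Imm = 0;
      for (unsigned i = 0; i != BlendSize; ++i)
        if (Mask[i / MaskRatio] < 0)
          Imm |= 1u << i;
      return Imm;
    }
    // e.g. Mask = {0, -2} on v2i64 blended as v4i32: Imm == 0b1100.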
- if (isTargetShuffleEquivalent(Mask, {0, 0, 1, 1, 2, 2, 3, 3}) ||
- isTargetShuffleEquivalent(
- Mask, {0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7})) {
- Shuffle = X86ISD::UNPCKL;
- ShuffleVT = Mask.size() == 8 ? MVT::v8i16 : MVT::v16i8;
+
+ // Attempt to match as a binary blend.
+ if (NumMaskElts <= BlendVT.getVectorNumElements()) {
+ bool MatchBlend = true;
+ for (int i = 0; i != (int)NumMaskElts; ++i) {
+ int M = Mask[i];
+ if (M == SM_SentinelUndef)
+ continue;
+ else if (M == SM_SentinelZero)
+ MatchBlend = false;
+ else if ((M != i) && (M != (i + (int)NumMaskElts)))
+ MatchBlend = false;
+ }
+
+ if (MatchBlend) {
+ PermuteImm = 0;
+ for (unsigned i = 0; i != BlendSize; ++i)
+ if ((int)NumMaskElts <= Mask[i / MaskRatio])
+ PermuteImm |= 1u << i;
+
+ Shuffle = X86ISD::BLENDI;
+ ShuffleVT = BlendVT;
+ return true;
+ }
+ }
+ }
+
+ // Attempt to combine to INSERTPS.
+ if (Subtarget.hasSSE41() && MaskVT == MVT::v4f32) {
+ SmallBitVector Zeroable(4, false);
+ for (unsigned i = 0; i != NumMaskElts; ++i)
+ if (Mask[i] < 0)
+ Zeroable[i] = true;
+
+ if (Zeroable.any() &&
+ matchVectorShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
+ Shuffle = X86ISD::INSERTPS;
+ ShuffleVT = MVT::v4f32;
return true;
}
- if (isTargetShuffleEquivalent(Mask, {4, 4, 5, 5, 6, 6, 7, 7}) ||
- isTargetShuffleEquivalent(Mask, {8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13,
- 13, 14, 14, 15, 15})) {
- Shuffle = X86ISD::UNPCKH;
- ShuffleVT = Mask.size() == 8 ? MVT::v8i16 : MVT::v16i8;
+ }
+
+ // Attempt to combine to SHUFPD.
+ if ((MaskVT == MVT::v2f64 && Subtarget.hasSSE2()) ||
+ (MaskVT == MVT::v4f64 && Subtarget.hasAVX()) ||
+ (MaskVT == MVT::v8f64 && Subtarget.hasAVX512())) {
+ if (matchVectorShuffleWithSHUFPD(MaskVT, V1, V2, PermuteImm, Mask)) {
+ Shuffle = X86ISD::SHUFP;
+ ShuffleVT = MaskVT;
return true;
}
}
+ // Attempt to combine to SHUFPS.
+ if ((MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) ||
+ (MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
+ (MaskVT == MVT::v16f32 && Subtarget.hasAVX512())) {
+ SmallVector<int, 4> RepeatedMask;
+ if (isRepeatedTargetShuffleMask(128, MaskVT, Mask, RepeatedMask)) {
+ auto MatchHalf = [&](unsigned Offset, int &S0, int &S1) {
+ int M0 = RepeatedMask[Offset];
+ int M1 = RepeatedMask[Offset + 1];
+
+ if (isUndefInRange(RepeatedMask, Offset, 2)) {
+ return DAG.getUNDEF(MaskVT);
+ } else if (isUndefOrZeroInRange(RepeatedMask, Offset, 2)) {
+ S0 = (SM_SentinelUndef == M0 ? -1 : 0);
+ S1 = (SM_SentinelUndef == M1 ? -1 : 1);
+ return getZeroVector(MaskVT, Subtarget, DAG, DL);
+ } else if (isUndefOrInRange(M0, 0, 4) && isUndefOrInRange(M1, 0, 4)) {
+ S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
+ S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
+ return V1;
+ } else if (isUndefOrInRange(M0, 4, 8) && isUndefOrInRange(M1, 4, 8)) {
+ S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
+ S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
+ return V2;
+ }
+
+ return SDValue();
+ };
+
+ int ShufMask[4] = {-1, -1, -1, -1};
+ SDValue Lo = MatchHalf(0, ShufMask[0], ShufMask[1]);
+ SDValue Hi = MatchHalf(2, ShufMask[2], ShufMask[3]);
+
+ if (Lo && Hi) {
+ V1 = Lo;
+ V2 = Hi;
+ Shuffle = X86ISD::SHUFP;
+ ShuffleVT = MaskVT;
+ PermuteImm = getV4X86ShuffleImm(ShufMask);
+ return true;
+ }
+ }
+ }
+
return false;
}
@@ -25055,33 +26751,44 @@ static bool matchBinaryVectorShuffle(MVT SrcVT, ArrayRef<int> Mask,
/// into either a single instruction if there is a special purpose instruction
/// for this operation, or into a PSHUFB instruction which is a fully general
/// instruction but should only be used to replace chains over a certain depth.
-static bool combineX86ShuffleChain(SDValue Input, SDValue Root,
+static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
ArrayRef<int> BaseMask, int Depth,
bool HasVariableMask, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!");
+ assert((Inputs.size() == 1 || Inputs.size() == 2) &&
+ "Unexpected number of shuffle inputs!");
- // Find the operand that enters the chain. Note that multiple uses are OK
- // here, we're not going to remove the operand we find.
- Input = peekThroughBitcasts(Input);
+ // Find the inputs that enter the chain. Note that multiple uses are OK
+ // here, we're not going to remove the operands we find.
+ bool UnaryShuffle = (Inputs.size() == 1);
+ SDValue V1 = peekThroughBitcasts(Inputs[0]);
+ SDValue V2 = (UnaryShuffle ? V1 : peekThroughBitcasts(Inputs[1]));
- MVT VT = Input.getSimpleValueType();
+ MVT VT1 = V1.getSimpleValueType();
+ MVT VT2 = V2.getSimpleValueType();
MVT RootVT = Root.getSimpleValueType();
- SDLoc DL(Root);
+ assert(VT1.getSizeInBits() == RootVT.getSizeInBits() &&
+ VT2.getSizeInBits() == RootVT.getSizeInBits() &&
+ "Vector size mismatch");
+ SDLoc DL(Root);
SDValue Res;
unsigned NumBaseMaskElts = BaseMask.size();
if (NumBaseMaskElts == 1) {
assert(BaseMask[0] == 0 && "Invalid shuffle index found!");
- DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Input),
+ DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, V1),
/*AddTo*/ true);
return true;
}
unsigned RootSizeInBits = RootVT.getSizeInBits();
+ unsigned NumRootElts = RootVT.getVectorNumElements();
unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;
+ bool FloatDomain = VT1.isFloatingPoint() || VT2.isFloatingPoint() ||
+ (RootVT.is256BitVector() && !Subtarget.hasAVX2());
// Don't combine if we are a AVX512/EVEX target and the mask element size
// is different from the root element size - this would prevent writemasks
@@ -25089,26 +26796,25 @@ static bool combineX86ShuffleChain(SDValue Input, SDValue Root,
// TODO - this currently prevents all lane shuffles from occurring.
// TODO - check for writemasks usage instead of always preventing combining.
// TODO - attempt to narrow Mask back to writemask size.
- if (RootVT.getScalarSizeInBits() != BaseMaskEltSizeInBits &&
- (RootSizeInBits == 512 ||
- (Subtarget.hasVLX() && RootSizeInBits >= 128))) {
+ bool IsEVEXShuffle =
+ RootSizeInBits == 512 || (Subtarget.hasVLX() && RootSizeInBits >= 128);
+ if (IsEVEXShuffle && (RootVT.getScalarSizeInBits() != BaseMaskEltSizeInBits))
return false;
- }
// TODO - handle 128/256-bit lane shuffles of 512-bit vectors.
// Handle 128-bit lane shuffles of 256-bit vectors.
- if (VT.is256BitVector() && NumBaseMaskElts == 2 &&
+ // TODO - this should support binary shuffles.
+ if (UnaryShuffle && RootVT.is256BitVector() && NumBaseMaskElts == 2 &&
!isSequentialOrUndefOrZeroInRange(BaseMask, 0, 2, 0)) {
if (Depth == 1 && Root.getOpcode() == X86ISD::VPERM2X128)
return false; // Nothing to do!
- MVT ShuffleVT = (VT.isFloatingPoint() || !Subtarget.hasAVX2() ? MVT::v4f64
- : MVT::v4i64);
+ MVT ShuffleVT = (FloatDomain ? MVT::v4f64 : MVT::v4i64);
unsigned PermMask = 0;
PermMask |= ((BaseMask[0] < 0 ? 0x8 : (BaseMask[0] & 1)) << 0);
PermMask |= ((BaseMask[1] < 0 ? 0x8 : (BaseMask[1] & 1)) << 4);
- Res = DAG.getBitcast(ShuffleVT, Input);
+ Res = DAG.getBitcast(ShuffleVT, V1);
DCI.AddToWorklist(Res.getNode());
Res = DAG.getNode(X86ISD::VPERM2X128, DL, ShuffleVT, Res,
DAG.getUNDEF(ShuffleVT),
@@ -25134,144 +26840,234 @@ static bool combineX86ShuffleChain(SDValue Input, SDValue Root,
unsigned MaskEltSizeInBits = RootSizeInBits / NumMaskElts;
// Determine the effective mask value type.
- bool FloatDomain =
- (VT.isFloatingPoint() || (VT.is256BitVector() && !Subtarget.hasAVX2())) &&
- (32 <= MaskEltSizeInBits);
+ FloatDomain &= (32 <= MaskEltSizeInBits);
MVT MaskVT = FloatDomain ? MVT::getFloatingPointVT(MaskEltSizeInBits)
: MVT::getIntegerVT(MaskEltSizeInBits);
MaskVT = MVT::getVectorVT(MaskVT, NumMaskElts);
+ // Only allow legal mask types.
+ if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
+ return false;
+
// Attempt to match the mask against known shuffle patterns.
- MVT ShuffleVT;
+ MVT ShuffleSrcVT, ShuffleVT;
unsigned Shuffle, PermuteImm;
- if (matchUnaryVectorShuffle(VT, Mask, Subtarget, Shuffle, ShuffleVT)) {
- if (Depth == 1 && Root.getOpcode() == Shuffle)
- return false; // Nothing to do!
- Res = DAG.getBitcast(ShuffleVT, Input);
- DCI.AddToWorklist(Res.getNode());
- Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
- DCI.AddToWorklist(Res.getNode());
- DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
- /*AddTo*/ true);
- return true;
+ if (UnaryShuffle) {
+ // If we are shuffling a X86ISD::VZEXT_LOAD then we can use the load
+ // directly if we don't shuffle the lower element and we shuffle the upper
+ // (zero) elements within themselves.
+ if (V1.getOpcode() == X86ISD::VZEXT_LOAD &&
+ (V1.getScalarValueSizeInBits() % MaskEltSizeInBits) == 0) {
+ unsigned Scale = V1.getScalarValueSizeInBits() / MaskEltSizeInBits;
+ ArrayRef<int> HiMask(Mask.data() + Scale, NumMaskElts - Scale);
+ if (isSequentialOrUndefInRange(Mask, 0, Scale, 0) &&
+ isUndefOrZeroOrInRange(HiMask, Scale, NumMaskElts)) {
+ DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, V1),
+ /*AddTo*/ true);
+ return true;
+ }
+ }
+
+ if (matchUnaryVectorShuffle(MaskVT, Mask, FloatDomain, Subtarget, Shuffle,
+ ShuffleSrcVT, ShuffleVT)) {
+ if (Depth == 1 && Root.getOpcode() == Shuffle)
+ return false; // Nothing to do!
+ if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
+ return false; // AVX512 Writemask clash.
+ Res = DAG.getBitcast(ShuffleSrcVT, V1);
+ DCI.AddToWorklist(Res.getNode());
+ Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
+ DCI.AddToWorklist(Res.getNode());
+ DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
+ /*AddTo*/ true);
+ return true;
+ }
+
+ if (matchUnaryPermuteVectorShuffle(MaskVT, Mask, FloatDomain, Subtarget,
+ Shuffle, ShuffleVT, PermuteImm)) {
+ if (Depth == 1 && Root.getOpcode() == Shuffle)
+ return false; // Nothing to do!
+ if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
+ return false; // AVX512 Writemask clash.
+ Res = DAG.getBitcast(ShuffleVT, V1);
+ DCI.AddToWorklist(Res.getNode());
+ Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res,
+ DAG.getConstant(PermuteImm, DL, MVT::i8));
+ DCI.AddToWorklist(Res.getNode());
+ DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
+ /*AddTo*/ true);
+ return true;
+ }
}
- if (matchPermuteVectorShuffle(VT, Mask, Subtarget, Shuffle, ShuffleVT,
- PermuteImm)) {
+ if (matchBinaryVectorShuffle(MaskVT, Mask, FloatDomain, V1, V2, Subtarget,
+ Shuffle, ShuffleVT, UnaryShuffle)) {
if (Depth == 1 && Root.getOpcode() == Shuffle)
return false; // Nothing to do!
- Res = DAG.getBitcast(ShuffleVT, Input);
- DCI.AddToWorklist(Res.getNode());
- Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res,
- DAG.getConstant(PermuteImm, DL, MVT::i8));
+ if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
+ return false; // AVX512 Writemask clash.
+ V1 = DAG.getBitcast(ShuffleVT, V1);
+ DCI.AddToWorklist(V1.getNode());
+ V2 = DAG.getBitcast(ShuffleVT, V2);
+ DCI.AddToWorklist(V2.getNode());
+ Res = DAG.getNode(Shuffle, DL, ShuffleVT, V1, V2);
DCI.AddToWorklist(Res.getNode());
DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
/*AddTo*/ true);
return true;
}
- if (matchBinaryVectorShuffle(VT, Mask, Shuffle, ShuffleVT)) {
+ if (matchBinaryPermuteVectorShuffle(MaskVT, Mask, FloatDomain, V1, V2, DL,
+ DAG, Subtarget, Shuffle, ShuffleVT,
+ PermuteImm)) {
if (Depth == 1 && Root.getOpcode() == Shuffle)
return false; // Nothing to do!
- Res = DAG.getBitcast(ShuffleVT, Input);
- DCI.AddToWorklist(Res.getNode());
- Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res, Res);
+ if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
+ return false; // AVX512 Writemask clash.
+ V1 = DAG.getBitcast(ShuffleVT, V1);
+ DCI.AddToWorklist(V1.getNode());
+ V2 = DAG.getBitcast(ShuffleVT, V2);
+ DCI.AddToWorklist(V2.getNode());
+ Res = DAG.getNode(Shuffle, DL, ShuffleVT, V1, V2,
+ DAG.getConstant(PermuteImm, DL, MVT::i8));
DCI.AddToWorklist(Res.getNode());
DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
/*AddTo*/ true);
return true;
}
- // Attempt to blend with zero.
- if (NumMaskElts <= 8 &&
- ((Subtarget.hasSSE41() && VT.is128BitVector()) ||
- (Subtarget.hasAVX() && VT.is256BitVector()))) {
- // Convert VT to a type compatible with X86ISD::BLENDI.
- // TODO - add 16i16 support (requires lane duplication).
- MVT ShuffleVT = MaskVT;
- if (Subtarget.hasAVX2()) {
- if (ShuffleVT == MVT::v4i64)
- ShuffleVT = MVT::v8i32;
- else if (ShuffleVT == MVT::v2i64)
- ShuffleVT = MVT::v4i32;
- } else {
- if (ShuffleVT == MVT::v2i64 || ShuffleVT == MVT::v4i32)
- ShuffleVT = MVT::v8i16;
- else if (ShuffleVT == MVT::v4i64)
- ShuffleVT = MVT::v4f64;
- else if (ShuffleVT == MVT::v8i32)
- ShuffleVT = MVT::v8f32;
- }
-
- if (isSequentialOrUndefOrZeroInRange(Mask, /*Pos*/ 0, /*Size*/ NumMaskElts,
- /*Low*/ 0) &&
- NumMaskElts <= ShuffleVT.getVectorNumElements()) {
- unsigned BlendMask = 0;
- unsigned ShuffleSize = ShuffleVT.getVectorNumElements();
- unsigned MaskRatio = ShuffleSize / NumMaskElts;
-
- if (Depth == 1 && Root.getOpcode() == X86ISD::BLENDI)
- return false;
-
- for (unsigned i = 0; i != ShuffleSize; ++i)
- if (Mask[i / MaskRatio] < 0)
- BlendMask |= 1u << i;
+ // Don't try to re-form single instruction chains under any circumstances now
+ // that we've done encoding canonicalization for them.
+ if (Depth < 2)
+ return false;
- SDValue Zero = getZeroVector(ShuffleVT, Subtarget, DAG, DL);
- Res = DAG.getBitcast(ShuffleVT, Input);
+ bool MaskContainsZeros =
+ any_of(Mask, [](int M) { return M == SM_SentinelZero; });
+
+ if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) {
+ // If we have a single input lane-crossing shuffle then lower to VPERMV.
+ if (UnaryShuffle && (Depth >= 3 || HasVariableMask) && !MaskContainsZeros &&
+ ((Subtarget.hasAVX2() &&
+ (MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
+ (Subtarget.hasAVX512() &&
+ (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
+ MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
+ (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
+ (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
+ (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
+ (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
+ MVT VPermMaskSVT = MVT::getIntegerVT(MaskEltSizeInBits);
+ MVT VPermMaskVT = MVT::getVectorVT(VPermMaskSVT, NumMaskElts);
+ SDValue VPermMask = getConstVector(Mask, VPermMaskVT, DAG, DL, true);
+ DCI.AddToWorklist(VPermMask.getNode());
+ Res = DAG.getBitcast(MaskVT, V1);
DCI.AddToWorklist(Res.getNode());
- Res = DAG.getNode(X86ISD::BLENDI, DL, ShuffleVT, Res, Zero,
- DAG.getConstant(BlendMask, DL, MVT::i8));
+ Res = DAG.getNode(X86ISD::VPERMV, DL, MaskVT, VPermMask, Res);
DCI.AddToWorklist(Res.getNode());
DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
/*AddTo*/ true);
return true;
}
- }
- // Attempt to combine to INSERTPS.
- if (Subtarget.hasSSE41() && NumMaskElts == 4 &&
- (VT == MVT::v2f64 || VT == MVT::v4f32)) {
- SmallBitVector Zeroable(4, false);
- for (unsigned i = 0; i != NumMaskElts; ++i)
- if (Mask[i] < 0)
- Zeroable[i] = true;
+ // Lower a unary+zero lane-crossing shuffle as VPERMV3 with a zero
+ // vector as the second source.
+ if (UnaryShuffle && (Depth >= 3 || HasVariableMask) &&
+ ((Subtarget.hasAVX512() &&
+ (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
+ MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
+ (Subtarget.hasVLX() &&
+ (MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
+ MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
+ (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
+ (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
+ (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
+ (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
+ // Adjust shuffle mask - replace SM_SentinelZero with second source index.
+ for (unsigned i = 0; i != NumMaskElts; ++i)
+ if (Mask[i] == SM_SentinelZero)
+ Mask[i] = NumMaskElts + i;
+
+ MVT VPermMaskSVT = MVT::getIntegerVT(MaskEltSizeInBits);
+ MVT VPermMaskVT = MVT::getVectorVT(VPermMaskSVT, NumMaskElts);
+ SDValue VPermMask = getConstVector(Mask, VPermMaskVT, DAG, DL, true);
+ DCI.AddToWorklist(VPermMask.getNode());
+ Res = DAG.getBitcast(MaskVT, V1);
+ DCI.AddToWorklist(Res.getNode());
+ SDValue Zero = getZeroVector(MaskVT, Subtarget, DAG, DL);
+ DCI.AddToWorklist(Zero.getNode());
+ Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, Res, VPermMask, Zero);
+ DCI.AddToWorklist(Res.getNode());
+ DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
+ /*AddTo*/ true);
+ return true;
+ }
- unsigned InsertPSMask;
- SDValue V1 = Input, V2 = Input;
- if (Zeroable.any() && matchVectorShuffleAsInsertPS(V1, V2, InsertPSMask,
- Zeroable, Mask, DAG)) {
- if (Depth == 1 && Root.getOpcode() == X86ISD::INSERTPS)
- return false; // Nothing to do!
- V1 = DAG.getBitcast(MVT::v4f32, V1);
+ // If we have a dual input lane-crossing shuffle then lower to VPERMV3.
+ if ((Depth >= 3 || HasVariableMask) && !MaskContainsZeros &&
+ ((Subtarget.hasAVX512() &&
+ (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
+ MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
+ (Subtarget.hasVLX() &&
+ (MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
+ MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
+ (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
+ (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
+ (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
+ (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
+ MVT VPermMaskSVT = MVT::getIntegerVT(MaskEltSizeInBits);
+ MVT VPermMaskVT = MVT::getVectorVT(VPermMaskSVT, NumMaskElts);
+ SDValue VPermMask = getConstVector(Mask, VPermMaskVT, DAG, DL, true);
+ DCI.AddToWorklist(VPermMask.getNode());
+ V1 = DAG.getBitcast(MaskVT, V1);
DCI.AddToWorklist(V1.getNode());
- V2 = DAG.getBitcast(MVT::v4f32, V2);
+ V2 = DAG.getBitcast(MaskVT, V2);
DCI.AddToWorklist(V2.getNode());
- Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
- DAG.getConstant(InsertPSMask, DL, MVT::i8));
+ Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, V1, VPermMask, V2);
DCI.AddToWorklist(Res.getNode());
DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
/*AddTo*/ true);
return true;
}
- }
-
- // Don't try to re-form single instruction chains under any circumstances now
- // that we've done encoding canonicalization for them.
- if (Depth < 2)
- return false;
-
- if (is128BitLaneCrossingShuffleMask(MaskVT, Mask))
return false;
+ }
- bool MaskContainsZeros =
- llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; });
+ // See if we can combine a single input shuffle with zeros to a bit-mask,
+ // which is much simpler than any shuffle.
+ if (UnaryShuffle && MaskContainsZeros && (Depth >= 3 || HasVariableMask) &&
+ isSequentialOrUndefOrZeroInRange(Mask, 0, NumMaskElts, 0) &&
+ DAG.getTargetLoweringInfo().isTypeLegal(MaskVT)) {
+ APInt Zero = APInt::getNullValue(MaskEltSizeInBits);
+ APInt AllOnes = APInt::getAllOnesValue(MaskEltSizeInBits);
+ SmallBitVector UndefElts(NumMaskElts, false);
+ SmallVector<APInt, 64> EltBits(NumMaskElts, Zero);
+ for (unsigned i = 0; i != NumMaskElts; ++i) {
+ int M = Mask[i];
+ if (M == SM_SentinelUndef) {
+ UndefElts[i] = true;
+ continue;
+ }
+ if (M == SM_SentinelZero)
+ continue;
+ EltBits[i] = AllOnes;
+ }
+ SDValue BitMask = getConstVector(EltBits, UndefElts, MaskVT, DAG, DL);
+ DCI.AddToWorklist(BitMask.getNode());
+ Res = DAG.getBitcast(MaskVT, V1);
+ DCI.AddToWorklist(Res.getNode());
+ unsigned AndOpcode =
+ FloatDomain ? unsigned(X86ISD::FAND) : unsigned(ISD::AND);
+ Res = DAG.getNode(AndOpcode, DL, MaskVT, Res, BitMask);
+ DCI.AddToWorklist(Res.getNode());
+ DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
+ /*AddTo*/ true);
+ return true;
+ }
  // If we have a single input shuffle with different shuffle patterns in the
  // 128-bit lanes, lower it with a variable-mask VPERMILPS.
  // TODO: Combine other mask types at higher depths.
- if (HasVariableMask && !MaskContainsZeros &&
+ if (UnaryShuffle && HasVariableMask && !MaskContainsZeros &&
((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
(MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) {
SmallVector<SDValue, 16> VPermIdx;
@@ -25283,7 +27079,7 @@ static bool combineX86ShuffleChain(SDValue Input, SDValue Root,
MVT VPermMaskVT = MVT::getVectorVT(MVT::i32, NumMaskElts);
SDValue VPermMask = DAG.getBuildVector(VPermMaskVT, DL, VPermIdx);
DCI.AddToWorklist(VPermMask.getNode());
- Res = DAG.getBitcast(MaskVT, Input);
+ Res = DAG.getBitcast(MaskVT, V1);
DCI.AddToWorklist(Res.getNode());
Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask);
DCI.AddToWorklist(Res.getNode());
@@ -25292,17 +27088,60 @@ static bool combineX86ShuffleChain(SDValue Input, SDValue Root,
return true;
}
+ // With XOP, binary shuffles of 128/256-bit floating point vectors can combine
+ // to VPERMIL2PD/VPERMIL2PS.
+ if ((Depth >= 3 || HasVariableMask) && Subtarget.hasXOP() &&
+ (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v4f32 ||
+ MaskVT == MVT::v8f32)) {
+ // VPERMIL2 Operation.
+ // Bits[3] - Match Bit.
+ // Bits[2:1] - (Per Lane) PD Shuffle Mask.
+ // Bits[2:0] - (Per Lane) PS Shuffle Mask.
+ unsigned NumLanes = MaskVT.getSizeInBits() / 128;
+ unsigned NumEltsPerLane = NumMaskElts / NumLanes;
+ SmallVector<int, 8> VPerm2Idx;
+ MVT MaskIdxSVT = MVT::getIntegerVT(MaskVT.getScalarSizeInBits());
+ MVT MaskIdxVT = MVT::getVectorVT(MaskIdxSVT, NumMaskElts);
+ unsigned M2ZImm = 0;
+ for (int M : Mask) {
+ if (M == SM_SentinelUndef) {
+ VPerm2Idx.push_back(-1);
+ continue;
+ }
+ if (M == SM_SentinelZero) {
+ M2ZImm = 2;
+ VPerm2Idx.push_back(8);
+ continue;
+ }
+ int Index = (M % NumEltsPerLane) + ((M / NumMaskElts) * NumEltsPerLane);
+ Index = (MaskVT.getScalarSizeInBits() == 64 ? Index << 1 : Index);
+ VPerm2Idx.push_back(Index);
+ }
+ V1 = DAG.getBitcast(MaskVT, V1);
+ DCI.AddToWorklist(V1.getNode());
+ V2 = DAG.getBitcast(MaskVT, V2);
+ DCI.AddToWorklist(V2.getNode());
+ SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, MaskIdxVT, DAG, DL, true);
+ DCI.AddToWorklist(VPerm2MaskOp.getNode());
+ Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp,
+ DAG.getConstant(M2ZImm, DL, MVT::i8));
+ DCI.AddToWorklist(Res.getNode());
+ DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
+ /*AddTo*/ true);
+ return true;
+ }
+
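As an illustrative aside: the VPERMIL2 selector encoding described above can be exercised in isolation. The standalone sketch below mirrors the index arithmetic of the loop above; the helper name, the sentinel stand-ins, and the driver values are assumptions made only for this example.

    // Hypothetical standalone model of the VPERMIL2 index math used above.
    #include <cstdio>
    #include <vector>

    static const int kUndef = -1; // stands in for SM_SentinelUndef
    static const int kZero = -2;  // stands in for SM_SentinelZero

    // Maps one element M of a two-input shuffle mask (width NumMaskElts) to a
    // per-lane VPERMIL2 selector. 8 forces a zero lane, and 64-bit elements use
    // even selectors only, matching the "Index << 1" above.
    static int VPermil2Index(int M, int NumMaskElts, int NumEltsPerLane,
                             bool Is64Bit, unsigned &M2ZImm) {
      if (M == kUndef)
        return -1;
      if (M == kZero) {
        M2ZImm = 2; // match-bit immediate: zero the lane on match
        return 8;   // selector value 8 == force zero
      }
      int Index = (M % NumEltsPerLane) + ((M / NumMaskElts) * NumEltsPerLane);
      return Is64Bit ? (Index << 1) : Index;
    }

    int main() {
      unsigned M2ZImm = 0;
      std::vector<int> Mask = {0, 5, kZero, 7}; // v4f32-style two-input mask
      for (int M : Mask)
        std::printf("%d ", VPermil2Index(M, 4, 4, /*Is64Bit=*/false, M2ZImm));
      std::printf("M2ZImm=%u\n", M2ZImm);
      return 0;
    }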
// If we have 3 or more shuffle instructions or a chain involving a variable
// mask, we can replace them with a single PSHUFB instruction profitably.
  // Intel's manuals suggest only using PSHUFB if doing so replaces 5
// instructions, but in practice PSHUFB tends to be *very* fast so we're
// more aggressive.
- if ((Depth >= 3 || HasVariableMask) &&
- ((VT.is128BitVector() && Subtarget.hasSSSE3()) ||
- (VT.is256BitVector() && Subtarget.hasAVX2()) ||
- (VT.is512BitVector() && Subtarget.hasBWI()))) {
+ if (UnaryShuffle && (Depth >= 3 || HasVariableMask) &&
+ ((RootVT.is128BitVector() && Subtarget.hasSSSE3()) ||
+ (RootVT.is256BitVector() && Subtarget.hasAVX2()) ||
+ (RootVT.is512BitVector() && Subtarget.hasBWI()))) {
SmallVector<SDValue, 16> PSHUFBMask;
- int NumBytes = VT.getSizeInBits() / 8;
+ int NumBytes = RootVT.getSizeInBits() / 8;
int Ratio = NumBytes / NumMaskElts;
for (int i = 0; i < NumBytes; ++i) {
int M = Mask[i / Ratio];
@@ -25319,7 +27158,7 @@ static bool combineX86ShuffleChain(SDValue Input, SDValue Root,
PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));
}
MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
- Res = DAG.getBitcast(ByteVT, Input);
+ Res = DAG.getBitcast(ByteVT, V1);
DCI.AddToWorklist(Res.getNode());
SDValue PSHUFBMaskOp = DAG.getBuildVector(ByteVT, DL, PSHUFBMask);
DCI.AddToWorklist(PSHUFBMaskOp.getNode());
@@ -25330,10 +27169,135 @@ static bool combineX86ShuffleChain(SDValue Input, SDValue Root,
return true;
}
+ // With XOP, if we have a 128-bit binary input shuffle we can always combine
+ // to VPPERM. We match the depth requirement of PSHUFB - VPPERM is never
+ // slower than PSHUFB on targets that support both.
+ if ((Depth >= 3 || HasVariableMask) && RootVT.is128BitVector() &&
+ Subtarget.hasXOP()) {
+ // VPPERM Mask Operation
+ // Bits[4:0] - Byte Index (0 - 31)
+ // Bits[7:5] - Permute Operation (0 - Source byte, 4 - ZERO)
+ SmallVector<SDValue, 16> VPPERMMask;
+ int NumBytes = 16;
+ int Ratio = NumBytes / NumMaskElts;
+ for (int i = 0; i < NumBytes; ++i) {
+ int M = Mask[i / Ratio];
+ if (M == SM_SentinelUndef) {
+ VPPERMMask.push_back(DAG.getUNDEF(MVT::i8));
+ continue;
+ }
+ if (M == SM_SentinelZero) {
+ VPPERMMask.push_back(DAG.getConstant(128, DL, MVT::i8));
+ continue;
+ }
+ M = Ratio * M + i % Ratio;
+ VPPERMMask.push_back(DAG.getConstant(M, DL, MVT::i8));
+ }
+ MVT ByteVT = MVT::v16i8;
+ V1 = DAG.getBitcast(ByteVT, V1);
+ DCI.AddToWorklist(V1.getNode());
+ V2 = DAG.getBitcast(ByteVT, V2);
+ DCI.AddToWorklist(V2.getNode());
+ SDValue VPPERMMaskOp = DAG.getBuildVector(ByteVT, DL, VPPERMMask);
+ DCI.AddToWorklist(VPPERMMaskOp.getNode());
+ Res = DAG.getNode(X86ISD::VPPERM, DL, ByteVT, V1, V2, VPPERMMaskOp);
+ DCI.AddToWorklist(Res.getNode());
+ DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
+ /*AddTo*/ true);
+ return true;
+ }
+
// Failed to find any combines.
return false;
}
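As an illustrative aside: both the PSHUFB and the VPPERM paths above rescale an element-level shuffle mask to byte granularity, using 0x80 to request a zeroed byte. Below is a minimal standalone sketch of that rescaling; the sentinel constants and driver values are assumed for illustration.

    // Sketch: scale an element-wide shuffle mask to a byte-wide PSHUFB/VPPERM
    // mask. A byte value of 0x80 asks the instruction to zero that byte.
    #include <cstdio>
    #include <vector>

    static const int kUndef = -1; // SM_SentinelUndef stand-in
    static const int kZero = -2;  // SM_SentinelZero stand-in

    static std::vector<int> ScaleToByteMask(const std::vector<int> &Mask,
                                            int NumBytes) {
      int Ratio = NumBytes / (int)Mask.size(); // bytes per mask element
      std::vector<int> ByteMask(NumBytes);
      for (int i = 0; i < NumBytes; ++i) {
        int M = Mask[i / Ratio];
        if (M == kUndef) { ByteMask[i] = kUndef; continue; }
        if (M == kZero)  { ByteMask[i] = 0x80;   continue; }
        ByteMask[i] = Ratio * M + i % Ratio;     // byte index within the input
      }
      return ByteMask;
    }

    int main() {
      // A v4i32-style mask <1, zero, 3, undef> scaled to 16 bytes.
      for (int B : ScaleToByteMask({1, kZero, 3, kUndef}, 16))
        std::printf("%d ", B);
      std::printf("\n");
      return 0;
    }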
+// Attempt to constant fold all of the constant source ops.
+// Returns true if the entire shuffle is folded to a constant.
+// TODO: Extend this to merge multiple constant Ops and update the mask.
+static bool combineX86ShufflesConstants(const SmallVectorImpl<SDValue> &Ops,
+ ArrayRef<int> Mask, SDValue Root,
+ bool HasVariableMask, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
+ MVT VT = Root.getSimpleValueType();
+
+ unsigned SizeInBits = VT.getSizeInBits();
+ unsigned NumMaskElts = Mask.size();
+ unsigned MaskSizeInBits = SizeInBits / NumMaskElts;
+ unsigned NumOps = Ops.size();
+
+ // Extract constant bits from each source op.
+ bool OneUseConstantOp = false;
+ SmallVector<SmallBitVector, 4> UndefEltsOps(NumOps);
+ SmallVector<SmallVector<APInt, 8>, 4> RawBitsOps(NumOps);
+ for (unsigned i = 0; i != NumOps; ++i) {
+ SDValue SrcOp = Ops[i];
+ OneUseConstantOp |= SrcOp.hasOneUse();
+ if (!getTargetConstantBitsFromNode(SrcOp, MaskSizeInBits, UndefEltsOps[i],
+ RawBitsOps[i]))
+ return false;
+ }
+
+ // Only fold if at least one of the constants is only used once or
+ // the combined shuffle has included a variable mask shuffle, this
+  // the combined shuffle has included a variable mask shuffle; this
+  // is to avoid constant pool bloat.
+ return false;
+
+ // Shuffle the constant bits according to the mask.
+ SmallBitVector UndefElts(NumMaskElts, false);
+ SmallBitVector ZeroElts(NumMaskElts, false);
+ SmallBitVector ConstantElts(NumMaskElts, false);
+ SmallVector<APInt, 8> ConstantBitData(NumMaskElts,
+ APInt::getNullValue(MaskSizeInBits));
+ for (unsigned i = 0; i != NumMaskElts; ++i) {
+ int M = Mask[i];
+ if (M == SM_SentinelUndef) {
+ UndefElts[i] = true;
+ continue;
+ } else if (M == SM_SentinelZero) {
+ ZeroElts[i] = true;
+ continue;
+ }
+ assert(0 <= M && M < (int)(NumMaskElts * NumOps));
+
+ unsigned SrcOpIdx = (unsigned)M / NumMaskElts;
+ unsigned SrcMaskIdx = (unsigned)M % NumMaskElts;
+
+ auto &SrcUndefElts = UndefEltsOps[SrcOpIdx];
+ if (SrcUndefElts[SrcMaskIdx]) {
+ UndefElts[i] = true;
+ continue;
+ }
+
+ auto &SrcEltBits = RawBitsOps[SrcOpIdx];
+ APInt &Bits = SrcEltBits[SrcMaskIdx];
+ if (!Bits) {
+ ZeroElts[i] = true;
+ continue;
+ }
+
+ ConstantElts[i] = true;
+ ConstantBitData[i] = Bits;
+ }
+ assert((UndefElts | ZeroElts | ConstantElts).count() == NumMaskElts);
+
+ // Create the constant data.
+ MVT MaskSVT;
+ if (VT.isFloatingPoint() && (MaskSizeInBits == 32 || MaskSizeInBits == 64))
+ MaskSVT = MVT::getFloatingPointVT(MaskSizeInBits);
+ else
+ MaskSVT = MVT::getIntegerVT(MaskSizeInBits);
+
+ MVT MaskVT = MVT::getVectorVT(MaskSVT, NumMaskElts);
+
+ SDLoc DL(Root);
+ SDValue CstOp = getConstVector(ConstantBitData, UndefElts, MaskVT, DAG, DL);
+ DCI.AddToWorklist(CstOp.getNode());
+ DCI.CombineTo(Root.getNode(), DAG.getBitcast(VT, CstOp));
+ return true;
+}
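As an illustrative aside: the constant-folding helper above is, at heart, an element-wise gather over the per-operand constant bits with undef and zero sentinels folded through. The simplified scalar model below assumes 64-bit elements and hypothetical names; it is a sketch, not the patch's implementation.

    // Sketch: fold a shuffle whose sources are all constant element arrays,
    // mirroring the element-wise loop above (64-bit elements assumed).
    #include <cassert>
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    static const int kUndef = -1; // SM_SentinelUndef stand-in
    static const int kZero = -2;  // SM_SentinelZero stand-in

    struct ConstElt { bool IsUndef; uint64_t Bits; };

    static std::vector<ConstElt>
    FoldConstantShuffle(const std::vector<std::vector<ConstElt>> &Ops,
                        const std::vector<int> &Mask) {
      size_t NumElts = Mask.size();
      std::vector<ConstElt> Out(NumElts, ConstElt{false, 0}); // zero by default
      for (size_t i = 0; i != NumElts; ++i) {
        int M = Mask[i];
        if (M == kUndef) { Out[i].IsUndef = true; continue; }
        if (M == kZero)  continue;                // stays zero
        size_t OpIdx = (size_t)M / NumElts;       // which constant source
        size_t EltIdx = (size_t)M % NumElts;      // which element within it
        assert(OpIdx < Ops.size());
        Out[i] = Ops[OpIdx][EltIdx];              // undef sources stay undef
      }
      return Out;
    }

    int main() {
      std::vector<std::vector<ConstElt>> Ops = {
          {{false, 1}, {false, 2}, {false, 3}, {false, 4}},
          {{false, 5}, {true, 0}, {false, 7}, {false, 8}}};
      for (const ConstElt &E : FoldConstantShuffle(Ops, {0, 5, kZero, 7})) {
        if (E.IsUndef)
          std::printf("u ");
        else
          std::printf("%llu ", (unsigned long long)E.Bits);
      }
      std::printf("\n");
      return 0;
    }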
+
/// \brief Fully generic combining of x86 shuffle instructions.
///
/// This should be the last combine run over the x86 shuffle instructions. Once
@@ -25350,7 +27314,7 @@ static bool combineX86ShuffleChain(SDValue Input, SDValue Root,
/// instructions, and replace them with the slightly more expensive SSSE3
/// PSHUFB instruction if available. We do this as the last combining step
/// to ensure we avoid using PSHUFB if we can implement the shuffle with
-/// a suitable short sequence of other instructions. The PHUFB will either
+/// a suitable short sequence of other instructions. The PSHUFB will either
/// use a register or have to read from memory and so is slightly (but only
/// slightly) more expensive than the other shuffle instructions.
///
@@ -25363,7 +27327,8 @@ static bool combineX86ShuffleChain(SDValue Input, SDValue Root,
/// would simplify under the threshold for PSHUFB formation because of
/// combine-ordering. To fix this, we should do the redundant instruction
/// combining in this recursive walk.
-static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root,
+static bool combineX86ShufflesRecursively(ArrayRef<SDValue> SrcOps,
+ int SrcOpIndex, SDValue Root,
ArrayRef<int> RootMask,
int Depth, bool HasVariableMask,
SelectionDAG &DAG,
@@ -25375,8 +27340,8 @@ static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root,
return false;
// Directly rip through bitcasts to find the underlying operand.
- while (Op.getOpcode() == ISD::BITCAST && Op.getOperand(0).hasOneUse())
- Op = Op.getOperand(0);
+ SDValue Op = SrcOps[SrcOpIndex];
+ Op = peekThroughOneUseBitcasts(Op);
MVT VT = Op.getSimpleValueType();
if (!VT.isVector())
@@ -25393,8 +27358,27 @@ static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root,
if (!resolveTargetShuffleInputs(Op, Input0, Input1, OpMask))
return false;
- assert(VT.getVectorNumElements() == OpMask.size() &&
- "Different mask size from vector size!");
+ // Add the inputs to the Ops list, avoiding duplicates.
+ SmallVector<SDValue, 8> Ops(SrcOps.begin(), SrcOps.end());
+
+ int InputIdx0 = -1, InputIdx1 = -1;
+ for (int i = 0, e = Ops.size(); i < e; ++i) {
+ SDValue BC = peekThroughBitcasts(Ops[i]);
+ if (Input0 && BC == peekThroughBitcasts(Input0))
+ InputIdx0 = i;
+ if (Input1 && BC == peekThroughBitcasts(Input1))
+ InputIdx1 = i;
+ }
+
+ if (Input0 && InputIdx0 < 0) {
+ InputIdx0 = SrcOpIndex;
+ Ops[SrcOpIndex] = Input0;
+ }
+ if (Input1 && InputIdx1 < 0) {
+ InputIdx1 = Ops.size();
+ Ops.push_back(Input1);
+ }
+
assert(((RootMask.size() > OpMask.size() &&
RootMask.size() % OpMask.size() == 0) ||
(OpMask.size() > RootMask.size() &&
@@ -25424,6 +27408,17 @@ static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root,
}
int RootMaskedIdx = RootMask[RootIdx] * RootRatio + i % RootRatio;
+
+ // Just insert the scaled root mask value if it references an input other
+ // than the SrcOp we're currently inserting.
+ if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) ||
+ (((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) {
+ Mask.push_back(RootMaskedIdx);
+ continue;
+ }
+
+ RootMaskedIdx %= MaskWidth;
+
int OpIdx = RootMaskedIdx / OpRatio;
if (OpMask[OpIdx] < 0) {
// The incoming lanes are zero or undef, it doesn't matter which ones we
@@ -25432,17 +27427,27 @@ static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root,
continue;
}
- // Ok, we have non-zero lanes, map them through.
- Mask.push_back(OpMask[OpIdx] * OpRatio +
- RootMaskedIdx % OpRatio);
+ // Ok, we have non-zero lanes, map them through to one of the Op's inputs.
+ int OpMaskedIdx = OpMask[OpIdx] * OpRatio + RootMaskedIdx % OpRatio;
+ OpMaskedIdx %= MaskWidth;
+
+ if (OpMask[OpIdx] < (int)OpMask.size()) {
+ assert(0 <= InputIdx0 && "Unknown target shuffle input");
+ OpMaskedIdx += InputIdx0 * MaskWidth;
+ } else {
+ assert(0 <= InputIdx1 && "Unknown target shuffle input");
+ OpMaskedIdx += InputIdx1 * MaskWidth;
+ }
+
+ Mask.push_back(OpMaskedIdx);
}
// Handle the all undef/zero cases early.
- if (llvm::all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; })) {
+ if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; })) {
DCI.CombineTo(Root.getNode(), DAG.getUNDEF(Root.getValueType()));
return true;
}
- if (llvm::all_of(Mask, [](int Idx) { return Idx < 0; })) {
+ if (all_of(Mask, [](int Idx) { return Idx < 0; })) {
// TODO - should we handle the mixed zero/undef case as well? Just returning
  // a zero mask will lose information on undef elements, possibly reducing
// future combine possibilities.
@@ -25451,30 +27456,40 @@ static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root,
return true;
}
- int MaskSize = Mask.size();
- bool UseInput0 = std::any_of(Mask.begin(), Mask.end(),
- [MaskSize](int Idx) { return 0 <= Idx && Idx < MaskSize; });
- bool UseInput1 = std::any_of(Mask.begin(), Mask.end(),
- [MaskSize](int Idx) { return MaskSize <= Idx; });
-
- // At the moment we can only combine unary shuffle mask cases.
- if (UseInput0 && UseInput1)
- return false;
- else if (UseInput1) {
- std::swap(Input0, Input1);
- ShuffleVectorSDNode::commuteMask(Mask);
+ // Remove unused shuffle source ops.
+ SmallVector<SDValue, 8> UsedOps;
+ for (int i = 0, e = Ops.size(); i < e; ++i) {
+ int lo = UsedOps.size() * MaskWidth;
+ int hi = lo + MaskWidth;
+ if (any_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) {
+ UsedOps.push_back(Ops[i]);
+ continue;
+ }
+ for (int &M : Mask)
+ if (lo <= M)
+ M -= MaskWidth;
}
-
- assert(Input0 && "Shuffle with no inputs detected");
+ assert(!UsedOps.empty() && "Shuffle with no inputs detected");
+ Ops = UsedOps;
HasVariableMask |= isTargetShuffleVariableMask(Op.getOpcode());
- // See if we can recurse into Input0 (if it's a target shuffle).
- if (Op->isOnlyUserOf(Input0.getNode()) &&
- combineX86ShufflesRecursively(Input0, Root, Mask, Depth + 1,
- HasVariableMask, DAG, DCI, Subtarget))
+ // See if we can recurse into each shuffle source op (if it's a target shuffle).
+ for (int i = 0, e = Ops.size(); i < e; ++i)
+ if (Ops[i].getNode()->hasOneUse() || Op->isOnlyUserOf(Ops[i].getNode()))
+ if (combineX86ShufflesRecursively(Ops, i, Root, Mask, Depth + 1,
+ HasVariableMask, DAG, DCI, Subtarget))
+ return true;
+
+ // Attempt to constant fold all of the constant source ops.
+ if (combineX86ShufflesConstants(Ops, Mask, Root, HasVariableMask, DAG, DCI,
+ Subtarget))
return true;
+ // We can only combine unary and binary shuffle mask cases.
+ if (Ops.size() > 2)
+ return false;
+
// Minor canonicalization of the accumulated shuffle mask to make it easier
// to match below. All this does is detect masks with sequential pairs of
// elements, and shrink them to the half-width mask. It does this in a loop
@@ -25485,7 +27500,14 @@ static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root,
Mask = std::move(WidenedMask);
}
- return combineX86ShuffleChain(Input0, Root, Mask, Depth, HasVariableMask, DAG,
+ // Canonicalization of binary shuffle masks to improve pattern matching by
+ // commuting the inputs.
+ if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) {
+ ShuffleVectorSDNode::commuteMask(Mask);
+ std::swap(Ops[0], Ops[1]);
+ }
+
+ return combineX86ShuffleChain(Ops, Root, Mask, Depth, HasVariableMask, DAG,
DCI, Subtarget);
}
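As an illustrative aside: the recursion above ultimately composes the accumulated root mask with the mask of the shuffle it just looked through. Ignoring the width-ratio scaling and the multi-operand index bookkeeping, a same-width, single-operand model of that composition looks like the sketch below (names and values assumed for the example).

    // Sketch: compose two same-width shuffle masks, propagating undef/zero.
    #include <cstdio>
    #include <vector>

    static const int kUndef = -1;
    static const int kZero = -2;

    // Result[i] = the inner OpMask applied first, RootMask applied on top.
    static std::vector<int> ComposeMasks(const std::vector<int> &RootMask,
                                         const std::vector<int> &OpMask) {
      std::vector<int> Out(RootMask.size());
      for (size_t i = 0; i != RootMask.size(); ++i) {
        int R = RootMask[i];
        if (R < 0) { Out[i] = R; continue; } // undef/zero pass straight through
        Out[i] = OpMask[(size_t)R];          // look through the inner shuffle
      }
      return Out;
    }

    int main() {
      // Root selects <2,3,0,1> from the inner shuffle, which itself is
      // <1,kZero,3,kUndef>; the composition is <3,kUndef,1,kZero>.
      for (int M : ComposeMasks({2, 3, 0, 1}, {1, kZero, 3, kUndef}))
        std::printf("%d ", M);
      std::printf("\n");
      return 0;
    }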
@@ -25612,7 +27634,7 @@ combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
Chain.push_back(V);
- // Fallthrough!
+ LLVM_FALLTHROUGH;
case ISD::BITCAST:
V = V.getOperand(0);
continue;
@@ -25742,7 +27764,8 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
MVT VT = N.getSimpleValueType();
SmallVector<int, 4> Mask;
- switch (N.getOpcode()) {
+ unsigned Opcode = N.getOpcode();
+ switch (Opcode) {
case X86ISD::PSHUFD:
case X86ISD::PSHUFLW:
case X86ISD::PSHUFHW:
@@ -25750,6 +27773,17 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
assert(Mask.size() == 4);
break;
case X86ISD::UNPCKL: {
+ auto Op0 = N.getOperand(0);
+ auto Op1 = N.getOperand(1);
+ unsigned Opcode0 = Op0.getOpcode();
+ unsigned Opcode1 = Op1.getOpcode();
+
+ // Combine X86ISD::UNPCKL with 2 X86ISD::FHADD inputs into a single
+ // X86ISD::FHADD. This is generated by UINT_TO_FP v2f64 scalarization.
+ // TODO: Add other horizontal operations as required.
+ if (VT == MVT::v2f64 && Opcode0 == Opcode1 && Opcode0 == X86ISD::FHADD)
+ return DAG.getNode(Opcode0, DL, VT, Op0.getOperand(0), Op1.getOperand(0));
+
// Combine X86ISD::UNPCKL and ISD::VECTOR_SHUFFLE into X86ISD::UNPCKH, in
// which X86ISD::UNPCKL has a ISD::UNDEF operand, and ISD::VECTOR_SHUFFLE
// moves upper half elements into the lower half part. For example:
@@ -25767,9 +27801,7 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
if (!VT.is128BitVector())
return SDValue();
- auto Op0 = N.getOperand(0);
- auto Op1 = N.getOperand(1);
- if (Op0.isUndef() && Op1.getNode()->getOpcode() == ISD::VECTOR_SHUFFLE) {
+ if (Op0.isUndef() && Opcode1 == ISD::VECTOR_SHUFFLE) {
ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op1.getNode())->getMask();
unsigned NumElts = VT.getVectorNumElements();
@@ -25806,44 +27838,31 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V0, NewMask);
}
- // Attempt to merge blend(insertps(x,y),zero).
- if (V0.getOpcode() == X86ISD::INSERTPS ||
- V1.getOpcode() == X86ISD::INSERTPS) {
- assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
-
- // Determine which elements are known to be zero.
- SmallVector<int, 8> TargetMask;
- SmallVector<SDValue, 2> BlendOps;
- if (!setTargetShuffleZeroElements(N, TargetMask, BlendOps))
- return SDValue();
-
- // Helper function to take inner insertps node and attempt to
- // merge the blend with zero into its zero mask.
- auto MergeInsertPSAndBlend = [&](SDValue V, int Offset) {
- if (V.getOpcode() != X86ISD::INSERTPS)
- return SDValue();
- SDValue Op0 = V.getOperand(0);
- SDValue Op1 = V.getOperand(1);
- SDValue Op2 = V.getOperand(2);
- unsigned InsertPSMask = cast<ConstantSDNode>(Op2)->getZExtValue();
-
- // Check each element of the blend node's target mask - must either
- // be zeroable (and update the zero mask) or selects the element from
- // the inner insertps node.
- for (int i = 0; i != 4; ++i)
- if (TargetMask[i] < 0)
- InsertPSMask |= (1u << i);
- else if (TargetMask[i] != (i + Offset))
- return SDValue();
- return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, Op0, Op1,
- DAG.getConstant(InsertPSMask, DL, MVT::i8));
- };
-
- if (SDValue V = MergeInsertPSAndBlend(V0, 0))
- return V;
- if (SDValue V = MergeInsertPSAndBlend(V1, 4))
- return V;
+ return SDValue();
+ }
+ case X86ISD::MOVSD:
+ case X86ISD::MOVSS: {
+ bool isFloat = VT.isFloatingPoint();
+ SDValue V0 = peekThroughBitcasts(N->getOperand(0));
+ SDValue V1 = peekThroughBitcasts(N->getOperand(1));
+ bool isFloat0 = V0.getSimpleValueType().isFloatingPoint();
+ bool isFloat1 = V1.getSimpleValueType().isFloatingPoint();
+ bool isZero0 = ISD::isBuildVectorAllZeros(V0.getNode());
+ bool isZero1 = ISD::isBuildVectorAllZeros(V1.getNode());
+ assert(!(isZero0 && isZero1) && "Zeroable shuffle detected.");
+
+ // We often lower to MOVSD/MOVSS from integer as well as native float
+ // types; remove unnecessary domain-crossing bitcasts if we can to make it
+ // easier to combine shuffles later on. We've already accounted for the
+ // domain switching cost when we decided to lower with it.
+ if ((isFloat != isFloat0 || isZero0) && (isFloat != isFloat1 || isZero1)) {
+ MVT NewVT = isFloat ? (X86ISD::MOVSD == Opcode ? MVT::v2i64 : MVT::v4i32)
+ : (X86ISD::MOVSD == Opcode ? MVT::v2f64 : MVT::v4f32);
+ V0 = DAG.getBitcast(NewVT, V0);
+ V1 = DAG.getBitcast(NewVT, V1);
+ return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, NewVT, V0, V1));
}
+
return SDValue();
}
case X86ISD::INSERTPS: {
@@ -25976,9 +27995,7 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
V.getOpcode() == X86ISD::PSHUFHW) &&
V.getOpcode() != N.getOpcode() &&
V.hasOneUse()) {
- SDValue D = V.getOperand(0);
- while (D.getOpcode() == ISD::BITCAST && D.hasOneUse())
- D = D.getOperand(0);
+ SDValue D = peekThroughOneUseBitcasts(V.getOperand(0));
if (D.getOpcode() == X86ISD::PSHUFD && D.hasOneUse()) {
SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
@@ -26017,31 +28034,32 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
return SDValue();
}
-/// \brief Try to combine a shuffle into a target-specific add-sub node.
+/// Returns true iff the shuffle node \p N can be replaced with an ADDSUB
+/// operation. If true is returned then the operands of the ADDSUB operation
+/// are written to the parameters \p Opnd0 and \p Opnd1.
///
-/// We combine this directly on the abstract vector shuffle nodes so it is
-/// easier to generically match. We also insert dummy vector shuffle nodes for
-/// the operands which explicitly discard the lanes which are unused by this
-/// operation to try to flow through the rest of the combiner the fact that
-/// they're unused.
-static SDValue combineShuffleToAddSub(SDNode *N, const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
- SDLoc DL(N);
+/// We combine shuffles to ADDSUB directly on the abstract vector shuffle nodes
+/// so it is easier to generically match. We also insert dummy vector shuffle
+/// nodes for the operands which explicitly discard the lanes which are unused
+/// by this operation, so that the fact that they are unused can flow through
+/// the rest of the combiner.
+static bool isAddSub(SDNode *N, const X86Subtarget &Subtarget,
+ SDValue &Opnd0, SDValue &Opnd1) {
+
EVT VT = N->getValueType(0);
if ((!Subtarget.hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) &&
- (!Subtarget.hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)))
- return SDValue();
+ (!Subtarget.hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)) &&
+ (!Subtarget.hasAVX512() || (VT != MVT::v16f32 && VT != MVT::v8f64)))
+ return false;
// We only handle target-independent shuffles.
// FIXME: It would be easy and harmless to use the target shuffle mask
// extraction tool to support more.
if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
- return SDValue();
+ return false;
- auto *SVN = cast<ShuffleVectorSDNode>(N);
- SmallVector<int, 8> Mask;
- for (int M : SVN->getMask())
- Mask.push_back(M);
+ ArrayRef<int> OrigMask = cast<ShuffleVectorSDNode>(N)->getMask();
+ SmallVector<int, 16> Mask(OrigMask.begin(), OrigMask.end());
SDValue V1 = N->getOperand(0);
SDValue V2 = N->getOperand(1);
@@ -26052,27 +28070,102 @@ static SDValue combineShuffleToAddSub(SDNode *N, const X86Subtarget &Subtarget,
ShuffleVectorSDNode::commuteMask(Mask);
std::swap(V1, V2);
} else if (V1.getOpcode() != ISD::FSUB || V2.getOpcode() != ISD::FADD)
- return SDValue();
+ return false;
// If there are other uses of these operations we can't fold them.
if (!V1->hasOneUse() || !V2->hasOneUse())
- return SDValue();
+ return false;
// Ensure that both operations have the same operands. Note that we can
// commute the FADD operands.
SDValue LHS = V1->getOperand(0), RHS = V1->getOperand(1);
if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
(V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
- return SDValue();
+ return false;
// We're looking for blends between FADD and FSUB nodes. We insist on these
// nodes being lined up in a specific expected pattern.
if (!(isShuffleEquivalent(V1, V2, Mask, {0, 3}) ||
isShuffleEquivalent(V1, V2, Mask, {0, 5, 2, 7}) ||
- isShuffleEquivalent(V1, V2, Mask, {0, 9, 2, 11, 4, 13, 6, 15})))
+ isShuffleEquivalent(V1, V2, Mask, {0, 9, 2, 11, 4, 13, 6, 15}) ||
+ isShuffleEquivalent(V1, V2, Mask, {0, 17, 2, 19, 4, 21, 6, 23,
+ 8, 25, 10, 27, 12, 29, 14, 31})))
+ return false;
+
+ Opnd0 = LHS;
+ Opnd1 = RHS;
+ return true;
+}
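As an illustrative aside: the masks accepted above ({0,3}, {0,5,2,7}, ...) select the FSUB result in even lanes and the FADD result in odd lanes, which is exactly what ADDSUB computes. A scalar check of that equivalence for the four-lane case (illustrative only, values assumed):

    // Sketch: blend(fsub(a,b), fadd(a,b), <0,5,2,7>) == addsub(a,b), 4 lanes.
    #include <cassert>
    #include <cstdio>

    int main() {
      float A[4] = {1, 2, 3, 4}, B[4] = {10, 20, 30, 40};
      float Sub[4], Add[4], Blend[4], AddSub[4];
      const int Mask[4] = {0, 5, 2, 7}; // indices into the concatenation Sub:Add
      for (int i = 0; i != 4; ++i) {
        Sub[i] = A[i] - B[i];
        Add[i] = A[i] + B[i];
        AddSub[i] = (i & 1) ? A[i] + B[i] : A[i] - B[i]; // ADDSUB semantics
      }
      for (int i = 0; i != 4; ++i)
        Blend[i] = Mask[i] < 4 ? Sub[Mask[i]] : Add[Mask[i] - 4];
      for (int i = 0; i != 4; ++i)
        assert(Blend[i] == AddSub[i]); // exact values, no rounding involved
      std::printf("addsub lanes: %g %g %g %g\n", AddSub[0], AddSub[1],
                  AddSub[2], AddSub[3]);
      return 0;
    }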
+
+/// \brief Try to combine a shuffle into a target-specific add-sub or
+/// mul-add-sub node.
+static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ SDValue Opnd0, Opnd1;
+ if (!isAddSub(N, Subtarget, Opnd0, Opnd1))
return SDValue();
- return DAG.getNode(X86ISD::ADDSUB, DL, VT, LHS, RHS);
+ EVT VT = N->getValueType(0);
+ SDLoc DL(N);
+
+ // Try to generate X86ISD::FMADDSUB node here.
+ SDValue Opnd2;
+ if (isFMAddSub(Subtarget, DAG, Opnd0, Opnd1, Opnd2))
+ return DAG.getNode(X86ISD::FMADDSUB, DL, VT, Opnd0, Opnd1, Opnd2);
+
+ // Do not generate X86ISD::ADDSUB node for 512-bit types even though
+ // the ADDSUB idiom has been successfully recognized. There are no known
+ // X86 targets with 512-bit ADDSUB instructions!
+ if (VT.is512BitVector())
+ return SDValue();
+
+ return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
+}
+
+// We are looking for a shuffle where both sources are concatenations of a
+// vector that is half the output's width with undef. AVX2 has VPERMD/Q, so
+// if we can express this as a single-source shuffle, that's preferable.
+static SDValue combineShuffleOfConcatUndef(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ if (!Subtarget.hasAVX2() || !isa<ShuffleVectorSDNode>(N))
+ return SDValue();
+
+ EVT VT = N->getValueType(0);
+
+ // We only care about shuffles of 128/256-bit vectors of 32/64-bit values.
+ if (!VT.is128BitVector() && !VT.is256BitVector())
+ return SDValue();
+
+ if (VT.getVectorElementType() != MVT::i32 &&
+ VT.getVectorElementType() != MVT::i64 &&
+ VT.getVectorElementType() != MVT::f32 &&
+ VT.getVectorElementType() != MVT::f64)
+ return SDValue();
+
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+
+ // Check that both sources are concats with undef.
+ if (N0.getOpcode() != ISD::CONCAT_VECTORS ||
+ N1.getOpcode() != ISD::CONCAT_VECTORS || N0.getNumOperands() != 2 ||
+ N1.getNumOperands() != 2 || !N0.getOperand(1).isUndef() ||
+ !N1.getOperand(1).isUndef())
+ return SDValue();
+
+ // Construct the new shuffle mask. Elements from the first source retain their
+ // index, but elements from the second source no longer need to skip an undef.
+ SmallVector<int, 8> Mask;
+ int NumElts = VT.getVectorNumElements();
+
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+ for (int Elt : SVOp->getMask())
+ Mask.push_back(Elt < NumElts ? Elt : (Elt - NumElts / 2));
+
+ SDLoc DL(N);
+ SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N0.getOperand(0),
+ N1.getOperand(0));
+ return DAG.getVectorShuffle(VT, DL, Concat, DAG.getUNDEF(VT), Mask);
}
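As an illustrative aside: the mask rewrite above only has to close the gap left by the undef upper half of each concat, so first-source indices are kept and second-source indices drop NumElts/2. A standalone model of that remapping (names and values assumed):

    // Sketch: remap a two-source mask over (t1,undef) and (t2,undef) to a
    // single-source mask over concat(t1,t2), as in the loop above.
    #include <cstdio>
    #include <vector>

    static std::vector<int> RemapConcatUndefMask(const std::vector<int> &Mask,
                                                 int NumElts) {
      std::vector<int> Out;
      for (int Elt : Mask)
        Out.push_back(Elt < NumElts ? Elt : Elt - NumElts / 2);
      return Out;
    }

    int main() {
      // v8i32 output: indices 0-3 live in t1, indices 8-11 (second source) in t2.
      for (int M : RemapConcatUndefMask({0, 1, 8, 9, 2, 3, 10, 11}, 8))
        std::printf("%d ", M); // prints 0 1 4 5 2 3 6 7
      std::printf("\n");
      return 0;
    }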
static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
@@ -26089,14 +28182,9 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
// If we have legalized the vector types, look for blends of FADD and FSUB
// nodes that we can fuse into an ADDSUB node.
if (TLI.isTypeLegal(VT))
- if (SDValue AddSub = combineShuffleToAddSub(N, Subtarget, DAG))
+ if (SDValue AddSub = combineShuffleToAddSubOrFMAddSub(N, Subtarget, DAG))
return AddSub;
- // Combine 256-bit vector shuffles. This is only profitable when in AVX mode
- if (TLI.isTypeLegal(VT) && Subtarget.hasFp256() && VT.is256BitVector() &&
- N->getOpcode() == ISD::VECTOR_SHUFFLE)
- return combineShuffle256(N, DAG, DCI, Subtarget);
-
// During Type Legalization, when promoting illegal vector types,
// the backend might introduce new shuffle dag nodes and bitcasts.
//
@@ -26127,13 +28215,18 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
bool CanFold = false;
switch (Opcode) {
default : break;
- case ISD::ADD :
- case ISD::FADD :
- case ISD::SUB :
- case ISD::FSUB :
- case ISD::MUL :
- case ISD::FMUL :
- CanFold = true;
+ case ISD::ADD:
+ case ISD::SUB:
+ case ISD::MUL:
+ // isOperationLegal lies for integer ops on floating point types.
+ CanFold = VT.isInteger();
+ break;
+ case ISD::FADD:
+ case ISD::FSUB:
+ case ISD::FMUL:
+ // isOperationLegal lies for floating point ops on integer types.
+ CanFold = VT.isFloatingPoint();
+ break;
}
unsigned SVTNumElts = SVT.getVectorNumElements();
@@ -26162,9 +28255,18 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
if (SDValue LD = EltsFromConsecutiveLoads(VT, Elts, dl, DAG, true))
return LD;
+ // For AVX2, we sometimes want to combine
+ // (vector_shuffle <mask> (concat_vectors t1, undef)
+ // (concat_vectors t2, undef))
+ // Into:
+ // (vector_shuffle <mask> (concat_vectors t1, t2), undef)
+  // Since the latter can be efficiently lowered with VPERMD/VPERMQ.
+ if (SDValue ShufConcat = combineShuffleOfConcatUndef(N, DAG, Subtarget))
+ return ShufConcat;
+
if (isTargetShuffle(N->getOpcode())) {
- if (SDValue Shuffle =
- combineTargetShuffle(SDValue(N, 0), DAG, DCI, Subtarget))
+ SDValue Op(N, 0);
+ if (SDValue Shuffle = combineTargetShuffle(Op, DAG, DCI, Subtarget))
return Shuffle;
// Try recursively combining arbitrary sequences of x86 shuffle
@@ -26174,8 +28276,8 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
// a particular chain.
SmallVector<int, 1> NonceMask; // Just a placeholder.
NonceMask.push_back(0);
- if (combineX86ShufflesRecursively(SDValue(N, 0), SDValue(N, 0), NonceMask,
- /*Depth*/ 1, /*HasPSHUFB*/ false, DAG,
+ if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask,
+ /*Depth*/ 1, /*HasVarMask*/ false, DAG,
DCI, Subtarget))
return SDValue(); // This routine will use CombineTo to replace N.
}
@@ -26305,11 +28407,10 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
}
// Convert a bitcasted integer logic operation that has one bitcasted
- // floating-point operand and one constant operand into a floating-point
- // logic operation. This may create a load of the constant, but that is
- // cheaper than materializing the constant in an integer register and
- // transferring it to an SSE register or transferring the SSE operand to
- // integer register and back.
+ // floating-point operand into a floating-point logic operation. This may
+ // create a load of a constant, but that is cheaper than materializing the
+ // constant in an integer register and transferring it to an SSE register or
+ // transferring the SSE operand to integer register and back.
unsigned FPOpcode;
switch (N0.getOpcode()) {
case ISD::AND: FPOpcode = X86ISD::FAND; break;
@@ -26317,25 +28418,238 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
default: return SDValue();
}
- if (((Subtarget.hasSSE1() && VT == MVT::f32) ||
- (Subtarget.hasSSE2() && VT == MVT::f64)) &&
- isa<ConstantSDNode>(N0.getOperand(1)) &&
- N0.getOperand(0).getOpcode() == ISD::BITCAST &&
- N0.getOperand(0).getOperand(0).getValueType() == VT) {
- SDValue N000 = N0.getOperand(0).getOperand(0);
- SDValue FPConst = DAG.getBitcast(VT, N0.getOperand(1));
- return DAG.getNode(FPOpcode, SDLoc(N0), VT, N000, FPConst);
+
+ if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
+ (Subtarget.hasSSE2() && VT == MVT::f64)))
+ return SDValue();
+
+ SDValue LogicOp0 = N0.getOperand(0);
+ SDValue LogicOp1 = N0.getOperand(1);
+ SDLoc DL0(N0);
+
+ // bitcast(logic(bitcast(X), Y)) --> logic'(X, bitcast(Y))
+ if (N0.hasOneUse() && LogicOp0.getOpcode() == ISD::BITCAST &&
+ LogicOp0.hasOneUse() && LogicOp0.getOperand(0).getValueType() == VT &&
+ !isa<ConstantSDNode>(LogicOp0.getOperand(0))) {
+ SDValue CastedOp1 = DAG.getBitcast(VT, LogicOp1);
+ return DAG.getNode(FPOpcode, DL0, VT, LogicOp0.getOperand(0), CastedOp1);
+ }
+ // bitcast(logic(X, bitcast(Y))) --> logic'(bitcast(X), Y)
+ if (N0.hasOneUse() && LogicOp1.getOpcode() == ISD::BITCAST &&
+ LogicOp1.hasOneUse() && LogicOp1.getOperand(0).getValueType() == VT &&
+ !isa<ConstantSDNode>(LogicOp1.getOperand(0))) {
+ SDValue CastedOp0 = DAG.getBitcast(VT, LogicOp0);
+ return DAG.getNode(FPOpcode, DL0, VT, LogicOp1.getOperand(0), CastedOp0);
}
return SDValue();
}
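As an illustrative aside: the rewrite above is sound because AND/OR/XOR operate on raw bits, so the same logic op can run in either the integer or the floating-point domain with the bitcasts moved around it. A quick scalar check of that bit-level identity, using a memcpy-based bitcast stand-in (illustrative only):

    // Sketch: an FAND-style op on f32 bits is the same as i32 AND plus bitcasts.
    #include <cassert>
    #include <cstdint>
    #include <cstring>

    static uint32_t BitsOf(float F) {
      uint32_t U;
      std::memcpy(&U, &F, sizeof(U)); // bitcast f32 -> i32
      return U;
    }

    static float FloatOf(uint32_t U) {
      float F;
      std::memcpy(&F, &U, sizeof(F)); // bitcast i32 -> f32
      return F;
    }

    int main() {
      float X = -12.5f;
      // Clearing the sign bit via i32 ops and bitcasts computes fabs, i.e. the
      // same bits an FAND with the abs-mask constant would produce.
      float Abs = FloatOf(BitsOf(X) & 0x7fffffffu);
      assert(Abs == 12.5f);
      return 0;
    }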
+// Match a binop + shuffle pyramid that represents a horizontal reduction over
+// the elements of a vector.
+// Returns the vector that is being reduced on, or SDValue() if a reduction
+// was not matched.
+static SDValue matchBinOpReduction(SDNode *Extract, ISD::NodeType BinOp) {
+ // The pattern must end in an extract from index 0.
+ if ((Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT) ||
+ !isNullConstant(Extract->getOperand(1)))
+ return SDValue();
+
+ unsigned Stages =
+ Log2_32(Extract->getOperand(0).getValueType().getVectorNumElements());
+
+ SDValue Op = Extract->getOperand(0);
+ // At each stage, we're looking for something that looks like:
+ // %s = shufflevector <8 x i32> %op, <8 x i32> undef,
+ // <8 x i32> <i32 2, i32 3, i32 undef, i32 undef,
+ // i32 undef, i32 undef, i32 undef, i32 undef>
+ // %a = binop <8 x i32> %op, %s
+ // Where the mask changes according to the stage. E.g. for a 3-stage pyramid,
+ // we expect something like:
+ // <4,5,6,7,u,u,u,u>
+ // <2,3,u,u,u,u,u,u>
+ // <1,u,u,u,u,u,u,u>
+ for (unsigned i = 0; i < Stages; ++i) {
+ if (Op.getOpcode() != BinOp)
+ return SDValue();
+
+ ShuffleVectorSDNode *Shuffle =
+ dyn_cast<ShuffleVectorSDNode>(Op.getOperand(0).getNode());
+ if (Shuffle) {
+ Op = Op.getOperand(1);
+ } else {
+ Shuffle = dyn_cast<ShuffleVectorSDNode>(Op.getOperand(1).getNode());
+ Op = Op.getOperand(0);
+ }
+
+ // The first operand of the shuffle should be the same as the other operand
+ // of the add.
+    // of the binop.
+ return SDValue();
+
+ // Verify the shuffle has the expected (at this stage of the pyramid) mask.
+ for (int Index = 0, MaskEnd = 1 << i; Index < MaskEnd; ++Index)
+ if (Shuffle->getMaskElt(Index) != MaskEnd + Index)
+ return SDValue();
+ }
+
+ return Op;
+}
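As an illustrative aside: the per-stage masks described above halve the active prefix each step, e.g. <4,5,6,7,u,u,u,u>, then <2,3,u,...>, then <1,u,...> for eight elements. The sketch below runs that pyramid on a plain array to show it really computes the full reduction (illustrative only, values assumed):

    // Sketch: the shuffle+binop pyramid matched above, modelled on a plain
    // array; each stage adds the upper half of the active prefix onto the
    // lower half, so element 0 ends up holding the full reduction.
    #include <cassert>
    #include <cstdio>
    #include <vector>

    int main() {
      std::vector<int> V = {1, 2, 3, 4, 5, 6, 7, 8};
      unsigned Stages = 3; // log2(number of elements)
      for (unsigned S = Stages; S-- > 0;) {
        int MaskEnd = 1 << S; // stage masks: <4,5,6,7,u..>, <2,3,u..>, <1,u..>
        for (int i = 0; i < MaskEnd; ++i)
          V[i] += V[MaskEnd + i]; // binop(op, shuffle(op, <MaskEnd+i, ...>))
      }
      assert(V[0] == 36); // 1+2+...+8
      std::printf("reduced: %d\n", V[0]);
      return 0;
    }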
+
+// Given a select, detect the following pattern:
+// 1: %2 = zext <N x i8> %0 to <N x i32>
+// 2: %3 = zext <N x i8> %1 to <N x i32>
+// 3: %4 = sub nsw <N x i32> %2, %3
+// 4: %5 = icmp sgt <N x i32> %4, [0 x N] or [-1 x N]
+// 5: %6 = sub nsw <N x i32> zeroinitializer, %4
+// 6: %7 = select <N x i1> %5, <N x i32> %4, <N x i32> %6
+// This is useful as it is the input into a SAD pattern.
+static bool detectZextAbsDiff(const SDValue &Select, SDValue &Op0,
+ SDValue &Op1) {
+ // Check the condition of the select instruction is greater-than.
+ SDValue SetCC = Select->getOperand(0);
+ if (SetCC.getOpcode() != ISD::SETCC)
+ return false;
+ ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
+ if (CC != ISD::SETGT)
+ return false;
+
+ SDValue SelectOp1 = Select->getOperand(1);
+ SDValue SelectOp2 = Select->getOperand(2);
+
+ // The second operand of the select should be the negation of the first
+ // operand, which is implemented as 0 - SelectOp1.
+ if (!(SelectOp2.getOpcode() == ISD::SUB &&
+ ISD::isBuildVectorAllZeros(SelectOp2.getOperand(0).getNode()) &&
+ SelectOp2.getOperand(1) == SelectOp1))
+ return false;
+
+ // The first operand of SetCC is the first operand of the select, which is the
+ // difference between the two input vectors.
+ if (SetCC.getOperand(0) != SelectOp1)
+ return false;
+
+ // The second operand of the comparison can be either -1 or 0.
+ if (!(ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()) ||
+ ISD::isBuildVectorAllOnes(SetCC.getOperand(1).getNode())))
+ return false;
+
+ // The first operand of the select is the difference between the two input
+ // vectors.
+ if (SelectOp1.getOpcode() != ISD::SUB)
+ return false;
+
+ Op0 = SelectOp1.getOperand(0);
+ Op1 = SelectOp1.getOperand(1);
+
+ // Check if the operands of the sub are zero-extended from vectors of i8.
+ if (Op0.getOpcode() != ISD::ZERO_EXTEND ||
+ Op0.getOperand(0).getValueType().getVectorElementType() != MVT::i8 ||
+ Op1.getOpcode() != ISD::ZERO_EXTEND ||
+ Op1.getOperand(0).getValueType().getVectorElementType() != MVT::i8)
+ return false;
+
+ return true;
+}
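As an illustrative aside: the IR pattern documented above is simply |a - b| on zero-extended bytes, since the select keeps the difference when it is non-negative and its negation otherwise. A scalar model (illustrative only, helper name assumed):

    // Sketch: the zext/sub/select absolute-difference idiom matched above.
    #include <cassert>
    #include <cstdint>

    static int32_t ZextAbsDiff(uint8_t A, uint8_t B) {
      int32_t Wide = (int32_t)A - (int32_t)B; // sub of the two zexts
      int32_t Neg = 0 - Wide;                 // 0 - %4
      return Wide > -1 ? Wide : Neg;          // select(icmp sgt %4, -1), %4, %6
    }

    int main() {
      assert(ZextAbsDiff(200, 50) == 150);
      assert(ZextAbsDiff(50, 200) == 150);
      assert(ZextAbsDiff(7, 7) == 0);
      return 0;
    }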
+
+// Given two zexts of <k x i8> to <k x i32>, create a PSADBW of the inputs
+// to these zexts.
+static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0,
+ const SDValue &Zext1, const SDLoc &DL) {
+
+ // Find the appropriate width for the PSADBW.
+ EVT InVT = Zext0.getOperand(0).getValueType();
+ unsigned RegSize = std::max(128u, InVT.getSizeInBits());
+
+ // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we
+ // fill in the missing vector elements with 0.
+ unsigned NumConcat = RegSize / InVT.getSizeInBits();
+ SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, InVT));
+ Ops[0] = Zext0.getOperand(0);
+ MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
+ SDValue SadOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
+ Ops[0] = Zext1.getOperand(0);
+ SDValue SadOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
+
+ // Actually build the SAD
+ MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64);
+ return DAG.getNode(X86ISD::PSADBW, DL, SadVT, SadOp0, SadOp1);
+}
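As an illustrative aside: PSADBW sums the absolute differences of each group of eight bytes into the corresponding 64-bit lane, which is why the helper pads the narrow inputs with zero vectors instead of widening per element. Below is a scalar model of one 128-bit PSADBW (illustrative only, not an intrinsic):

    // Sketch: scalar model of a 128-bit PSADBW: two i64 lanes, each the sum
    // of |a[i]-b[i]| over its group of eight bytes.
    #include <cstdint>
    #include <cstdio>
    #include <cstdlib>

    static void Psadbw128(const uint8_t A[16], const uint8_t B[16],
                          uint64_t Out[2]) {
      for (int Lane = 0; Lane < 2; ++Lane) {
        uint64_t Sum = 0;
        for (int i = 0; i < 8; ++i)
          Sum += (uint64_t)std::abs((int)A[Lane * 8 + i] - (int)B[Lane * 8 + i]);
        Out[Lane] = Sum;
      }
    }

    int main() {
      uint8_t A[16] = {0}, B[16] = {0};
      A[0] = 10; B[0] = 3; // contributes 7 to lane 0
      A[9] = 1;  B[9] = 5; // contributes 4 to lane 1
      uint64_t Out[2];
      Psadbw128(A, B, Out);
      std::printf("%llu %llu\n", (unsigned long long)Out[0],
                  (unsigned long long)Out[1]);
      return 0;
    }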
+
+static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ // PSADBW is only supported on SSE2 and up.
+ if (!Subtarget.hasSSE2())
+ return SDValue();
+
+  // Verify the type we're extracting from is appropriate.
+  // TODO: There's nothing special about i32; any integer type above i16 should
+  // work just as well.
+ EVT VT = Extract->getOperand(0).getValueType();
+ if (!VT.isSimple() || !(VT.getVectorElementType() == MVT::i32))
+ return SDValue();
+
+ unsigned RegSize = 128;
+ if (Subtarget.hasBWI())
+ RegSize = 512;
+ else if (Subtarget.hasAVX2())
+ RegSize = 256;
+
+ // We only handle v16i32 for SSE2 / v32i32 for AVX2 / v64i32 for AVX512.
+ // TODO: We should be able to handle larger vectors by splitting them before
+ // feeding them into several SADs, and then reducing over those.
+ if (VT.getSizeInBits() / 4 > RegSize)
+ return SDValue();
+
+ // Match shuffle + add pyramid.
+ SDValue Root = matchBinOpReduction(Extract, ISD::ADD);
+
+ // If there was a match, we want Root to be a select that is the root of an
+ // abs-diff pattern.
+ if (!Root || (Root.getOpcode() != ISD::VSELECT))
+ return SDValue();
+
+ // Check whether we have an abs-diff pattern feeding into the select.
+ SDValue Zext0, Zext1;
+ if (!detectZextAbsDiff(Root, Zext0, Zext1))
+ return SDValue();
+
+ // Create the SAD instruction
+ SDLoc DL(Extract);
+ SDValue SAD = createPSADBW(DAG, Zext0, Zext1, DL);
+
+ // If the original vector was wider than 8 elements, sum over the results
+ // in the SAD vector.
+ unsigned Stages = Log2_32(VT.getVectorNumElements());
+ MVT SadVT = SAD.getSimpleValueType();
+ if (Stages > 3) {
+ unsigned SadElems = SadVT.getVectorNumElements();
+
+    for (unsigned i = Stages - 3; i > 0; --i) {
+      SmallVector<int, 16> Mask(SadElems, -1);
+      for (unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
+ Mask[j] = MaskEnd + j;
+
+ SDValue Shuffle =
+ DAG.getVectorShuffle(SadVT, DL, SAD, DAG.getUNDEF(SadVT), Mask);
+ SAD = DAG.getNode(ISD::ADD, DL, SadVT, SAD, Shuffle);
+ }
+ }
+
+ // Return the lowest i32.
+ MVT ResVT = MVT::getVectorVT(MVT::i32, SadVT.getSizeInBits() / 32);
+ SAD = DAG.getNode(ISD::BITCAST, DL, ResVT, SAD);
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, SAD,
+ Extract->getOperand(1));
+}
+
/// Detect vector gather/scatter index generation and convert it from being a
/// bunch of shuffles and extracts into a somewhat faster sequence.
/// For i686, the best sequence is apparently storing the value and loading
/// scalars back, while for x64 we should use 64-bit extracts and shifts.
static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI) {
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
if (SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI))
return NewOp;
@@ -26347,7 +28661,7 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
InputVector.getValueType() == MVT::v2i32 &&
isa<ConstantSDNode>(N->getOperand(1)) &&
N->getConstantOperandVal(1) == 0) {
- SDValue MMXSrc = InputVector.getNode()->getOperand(0);
+ SDValue MMXSrc = InputVector.getOperand(0);
// The bitcast source is a direct mmx result.
if (MMXSrc.getValueType() == MVT::x86mmx)
@@ -26366,6 +28680,13 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
uint64_t Res = (InputValue >> ExtractedElt) & 1;
return DAG.getConstant(Res, dl, MVT::i1);
}
+
+ // Check whether this extract is the root of a sum of absolute differences
+ // pattern. This has to be done here because we really want it to happen
+  // pre-legalization.
+ if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget))
+ return SAD;
+
// Only operate on vectors of 4 elements, where the alternative shuffling
// gets to be more expensive.
if (InputVector.getValueType() != MVT::v4i32)
@@ -26467,6 +28788,310 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+/// If a vector select has an operand that is -1 or 0, try to simplify the
+/// select to a bitwise logic operation.
+static SDValue
+combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
+ SDValue Cond = N->getOperand(0);
+ SDValue LHS = N->getOperand(1);
+ SDValue RHS = N->getOperand(2);
+ EVT VT = LHS.getValueType();
+ EVT CondVT = Cond.getValueType();
+ SDLoc DL(N);
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+ if (N->getOpcode() != ISD::VSELECT)
+ return SDValue();
+
+ assert(CondVT.isVector() && "Vector select expects a vector selector!");
+
+ bool FValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
+ // Check if the first operand is all zeros and Cond type is vXi1.
+ // This situation only applies to avx512.
+ if (FValIsAllZeros && Subtarget.hasAVX512() && Cond.hasOneUse() &&
+ CondVT.getVectorElementType() == MVT::i1) {
+    // Invert the cond to not(cond): xor(op, allones) = not(op).
+ SDValue CondNew = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
+ DAG.getConstant(APInt::getAllOnesValue(CondVT.getScalarSizeInBits()),
+ DL, CondVT));
+    // Vselect cond, op1, op2 = Vselect not(cond), op2, op1.
+ return DAG.getNode(ISD::VSELECT, DL, VT, CondNew, RHS, LHS);
+ }
+
+ // To use the condition operand as a bitwise mask, it must have elements that
+  // are the same size as the select elements. I.e., the condition operand must
+ // have already been promoted from the IR select condition type <N x i1>.
+ // Don't check if the types themselves are equal because that excludes
+ // vector floating-point selects.
+ if (CondVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
+ return SDValue();
+
+ bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
+ FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
+
+ // Try to invert the condition if true value is not all 1s and false value is
+ // not all 0s.
+ if (!TValIsAllOnes && !FValIsAllZeros &&
+ // Check if the selector will be produced by CMPP*/PCMP*.
+ Cond.getOpcode() == ISD::SETCC &&
+ // Check if SETCC has already been promoted.
+ TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT) ==
+ CondVT) {
+ bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
+ bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());
+
+ if (TValIsAllZeros || FValIsAllOnes) {
+ SDValue CC = Cond.getOperand(2);
+ ISD::CondCode NewCC =
+ ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
+ Cond.getOperand(0).getValueType().isInteger());
+ Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1),
+ NewCC);
+ std::swap(LHS, RHS);
+ TValIsAllOnes = FValIsAllOnes;
+ FValIsAllZeros = TValIsAllZeros;
+ }
+ }
+
+ // vselect Cond, 111..., 000... -> Cond
+ if (TValIsAllOnes && FValIsAllZeros)
+ return DAG.getBitcast(VT, Cond);
+
+ if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(CondVT))
+ return SDValue();
+
+ // vselect Cond, 111..., X -> or Cond, X
+ if (TValIsAllOnes) {
+ SDValue CastRHS = DAG.getBitcast(CondVT, RHS);
+ SDValue Or = DAG.getNode(ISD::OR, DL, CondVT, Cond, CastRHS);
+ return DAG.getBitcast(VT, Or);
+ }
+
+ // vselect Cond, X, 000... -> and Cond, X
+ if (FValIsAllZeros) {
+ SDValue CastLHS = DAG.getBitcast(CondVT, LHS);
+ SDValue And = DAG.getNode(ISD::AND, DL, CondVT, Cond, CastLHS);
+ return DAG.getBitcast(VT, And);
+ }
+
+ return SDValue();
+}
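As an illustrative aside: the folds above rely on each promoted condition lane being either all ones or all zeros, at which point the select degenerates to bitwise operations. A per-lane scalar check (illustrative only, names assumed):

    // Sketch: per lane, vselect(C, X, 0) == C & X and vselect(C, -1, Y) == C | Y
    // when the condition lane C is all-ones or all-zeros.
    #include <cassert>
    #include <cstdint>

    static uint32_t VSelectLane(uint32_t CondLane, uint32_t T, uint32_t F) {
      return CondLane ? T : F; // CondLane is 0xffffffff or 0x00000000
    }

    int main() {
      for (uint32_t C : {0xffffffffu, 0u}) {
        uint32_t X = 0x12345678u, Y = 0x9abcdef0u;
        assert(VSelectLane(C, X, 0) == (C & X));              // false value == 0
        assert(VSelectLane(C, 0xffffffffu, Y) == (C | Y));    // true value == -1
        assert(VSelectLane(C, X, Y) == ((C & X) | (~C & Y))); // general blend
      }
      return 0;
    }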
+
+static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) {
+ SDValue Cond = N->getOperand(0);
+ SDValue LHS = N->getOperand(1);
+ SDValue RHS = N->getOperand(2);
+ SDLoc DL(N);
+
+ auto *TrueC = dyn_cast<ConstantSDNode>(LHS);
+ auto *FalseC = dyn_cast<ConstantSDNode>(RHS);
+ if (!TrueC || !FalseC)
+ return SDValue();
+
+ // Don't do this for crazy integer types.
+ if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType()))
+ return SDValue();
+
+  // If this is efficiently invertible, canonicalize the TrueC/FalseC values
+ // so that TrueC (the true value) is larger than FalseC.
+ bool NeedsCondInvert = false;
+ if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) &&
+ // Efficiently invertible.
+ (Cond.getOpcode() == ISD::SETCC || // setcc -> invertible.
+ (Cond.getOpcode() == ISD::XOR && // xor(X, C) -> invertible.
+ isa<ConstantSDNode>(Cond.getOperand(1))))) {
+ NeedsCondInvert = true;
+ std::swap(TrueC, FalseC);
+ }
+
+ // Optimize C ? 8 : 0 -> zext(C) << 3. Likewise for any pow2/0.
+ if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
+ if (NeedsCondInvert) // Invert the condition if needed.
+ Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
+ DAG.getConstant(1, DL, Cond.getValueType()));
+
+ // Zero extend the condition if needed.
+ Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond);
+
+ unsigned ShAmt = TrueC->getAPIntValue().logBase2();
+ return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond,
+ DAG.getConstant(ShAmt, DL, MVT::i8));
+ }
+
+  // Optimize Cond ? cst+1 : cst -> zext(setcc(C)) + cst.
+ if (FalseC->getAPIntValue() + 1 == TrueC->getAPIntValue()) {
+ if (NeedsCondInvert) // Invert the condition if needed.
+ Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
+ DAG.getConstant(1, DL, Cond.getValueType()));
+
+ // Zero extend the condition if needed.
+ Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), Cond);
+ return DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
+ SDValue(FalseC, 0));
+ }
+
+ // Optimize cases that will turn into an LEA instruction. This requires
+ // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
+ if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
+ uint64_t Diff = TrueC->getZExtValue() - FalseC->getZExtValue();
+ if (N->getValueType(0) == MVT::i32)
+ Diff = (unsigned)Diff;
+
+ bool isFastMultiplier = false;
+ if (Diff < 10) {
+ switch ((unsigned char)Diff) {
+ default:
+ break;
+ case 1: // result = add base, cond
+ case 2: // result = lea base( , cond*2)
+ case 3: // result = lea base(cond, cond*2)
+ case 4: // result = lea base( , cond*4)
+ case 5: // result = lea base(cond, cond*4)
+ case 8: // result = lea base( , cond*8)
+ case 9: // result = lea base(cond, cond*8)
+ isFastMultiplier = true;
+ break;
+ }
+ }
+
+ if (isFastMultiplier) {
+ APInt Diff = TrueC->getAPIntValue() - FalseC->getAPIntValue();
+ if (NeedsCondInvert) // Invert the condition if needed.
+ Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
+ DAG.getConstant(1, DL, Cond.getValueType()));
+
+ // Zero extend the condition if needed.
+ Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), Cond);
+ // Scale the condition by the difference.
+ if (Diff != 1)
+ Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
+ DAG.getConstant(Diff, DL, Cond.getValueType()));
+
+ // Add the base if non-zero.
+ if (FalseC->getAPIntValue() != 0)
+ Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
+ SDValue(FalseC, 0));
+ return Cond;
+ }
+ }
+
+ return SDValue();
+}
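As an illustrative aside: the three rewrites above can be sanity-checked with ordinary integer arithmetic: a pow2/0 select becomes a shift of the zero-extended condition, a cst+1/cst select becomes an add, and a small constant difference folds into base + cond*scale as an LEA would compute it. A scalar check (illustrative only, values assumed):

    // Sketch: the three select-of-constants folds above, checked on scalars.
    #include <cassert>
    #include <cstdint>

    int main() {
      for (int CondBit = 0; CondBit <= 1; ++CondBit) {
        bool C = CondBit != 0;
        uint64_t Z = C ? 1 : 0; // zext of the condition

        // C ? pow2 : 0  ->  zext(C) << log2(pow2)
        assert((C ? 8u : 0u) == (Z << 3));
        // C ? cst+1 : cst  ->  cst + zext(C)
        assert((C ? 101u : 100u) == 100u + Z);
        // A small difference (here 8) folds into base + cond*scale, LEA-style.
        uint64_t TrueC = 13, FalseC = 5;
        assert((C ? TrueC : FalseC) == FalseC + Z * (TrueC - FalseC));
      }
      return 0;
    }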
+
+// If this is a bitcasted op that can be represented as another type, push the
+// bitcast to the inputs. This allows more opportunities for pattern
+// matching masked instructions. This is called when we know that the operation
+// is used as one of the inputs of a vselect.
+static bool combineBitcastForMaskedOp(SDValue OrigOp, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ // Make sure we have a bitcast.
+ if (OrigOp.getOpcode() != ISD::BITCAST)
+ return false;
+
+ SDValue Op = OrigOp.getOperand(0);
+
+ // If the operation is used by anything other than the bitcast, we shouldn't
+ // do this combine as that would replicate the operation.
+ if (!Op.hasOneUse())
+ return false;
+
+ MVT VT = OrigOp.getSimpleValueType();
+ MVT EltVT = VT.getVectorElementType();
+ SDLoc DL(Op.getNode());
+
+ auto BitcastAndCombineShuffle = [&](unsigned Opcode, SDValue Op0, SDValue Op1,
+ SDValue Op2) {
+ Op0 = DAG.getBitcast(VT, Op0);
+ DCI.AddToWorklist(Op0.getNode());
+ Op1 = DAG.getBitcast(VT, Op1);
+ DCI.AddToWorklist(Op1.getNode());
+ DCI.CombineTo(OrigOp.getNode(),
+ DAG.getNode(Opcode, DL, VT, Op0, Op1, Op2));
+ return true;
+ };
+
+ unsigned Opcode = Op.getOpcode();
+ switch (Opcode) {
+ case X86ISD::PALIGNR:
+ // PALIGNR can be converted to VALIGND/Q for 128-bit vectors.
+ if (!VT.is128BitVector())
+ return false;
+ Opcode = X86ISD::VALIGN;
+ LLVM_FALLTHROUGH;
+ case X86ISD::VALIGN: {
+ if (EltVT != MVT::i32 && EltVT != MVT::i64)
+ return false;
+ uint64_t Imm = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
+ MVT OpEltVT = Op.getSimpleValueType().getVectorElementType();
+ unsigned ShiftAmt = Imm * OpEltVT.getSizeInBits();
+ unsigned EltSize = EltVT.getSizeInBits();
+ // Make sure we can represent the same shift with the new VT.
+ if ((ShiftAmt % EltSize) != 0)
+ return false;
+ Imm = ShiftAmt / EltSize;
+ return BitcastAndCombineShuffle(Opcode, Op.getOperand(0), Op.getOperand(1),
+ DAG.getConstant(Imm, DL, MVT::i8));
+ }
+ case X86ISD::SHUF128: {
+ if (EltVT.getSizeInBits() != 32 && EltVT.getSizeInBits() != 64)
+ return false;
+ // Only change element size, not type.
+ if (VT.isInteger() != Op.getSimpleValueType().isInteger())
+ return false;
+ return BitcastAndCombineShuffle(Opcode, Op.getOperand(0), Op.getOperand(1),
+ Op.getOperand(2));
+ }
+ case ISD::INSERT_SUBVECTOR: {
+ unsigned EltSize = EltVT.getSizeInBits();
+ if (EltSize != 32 && EltSize != 64)
+ return false;
+ MVT OpEltVT = Op.getSimpleValueType().getVectorElementType();
+ // Only change element size, not type.
+ if (VT.isInteger() != OpEltVT.isInteger())
+ return false;
+ uint64_t Imm = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
+ Imm = (Imm * OpEltVT.getSizeInBits()) / EltSize;
+ SDValue Op0 = DAG.getBitcast(VT, Op.getOperand(0));
+ DCI.AddToWorklist(Op0.getNode());
+ // Op1 needs to be bitcasted to a smaller vector with the same element type.
+ SDValue Op1 = Op.getOperand(1);
+ MVT Op1VT = MVT::getVectorVT(EltVT,
+ Op1.getSimpleValueType().getSizeInBits() / EltSize);
+ Op1 = DAG.getBitcast(Op1VT, Op1);
+ DCI.AddToWorklist(Op1.getNode());
+ DCI.CombineTo(OrigOp.getNode(),
+ DAG.getNode(Opcode, DL, VT, Op0, Op1,
+ DAG.getConstant(Imm, DL, MVT::i8)));
+ return true;
+ }
+ case ISD::EXTRACT_SUBVECTOR: {
+ unsigned EltSize = EltVT.getSizeInBits();
+ if (EltSize != 32 && EltSize != 64)
+ return false;
+ MVT OpEltVT = Op.getSimpleValueType().getVectorElementType();
+ // Only change element size, not type.
+ if (VT.isInteger() != OpEltVT.isInteger())
+ return false;
+ uint64_t Imm = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+ Imm = (Imm * OpEltVT.getSizeInBits()) / EltSize;
+ // Op0 needs to be bitcasted to a larger vector with the same element type.
+ SDValue Op0 = Op.getOperand(0);
+ MVT Op0VT = MVT::getVectorVT(EltVT,
+ Op0.getSimpleValueType().getSizeInBits() / EltSize);
+ Op0 = DAG.getBitcast(Op0VT, Op0);
+ DCI.AddToWorklist(Op0.getNode());
+ DCI.CombineTo(OrigOp.getNode(),
+ DAG.getNode(Opcode, DL, VT, Op0,
+ DAG.getConstant(Imm, DL, MVT::i8)));
+ return true;
+ }
+ }
+
+ return false;
+}
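As an illustrative aside: each case above only re-expresses the same bit offset in coarser element units, which is valid exactly when the offset divides evenly by the new element size. A standalone model of the PALIGNR-to-VALIGN immediate rescale (helper name and values assumed):

    // Sketch: rescale a PALIGNR byte-rotate immediate to a VALIGND/Q element
    // immediate, as in the VALIGN case above. Returns false if not representable.
    #include <cassert>

    static bool RescaleAlignImm(unsigned Imm, unsigned OpEltBits,
                                unsigned NewEltBits, unsigned &NewImm) {
      unsigned ShiftBits = Imm * OpEltBits; // shift amount in bits
      if (ShiftBits % NewEltBits != 0)
        return false;                       // not a whole new-element shift
      NewImm = ShiftBits / NewEltBits;
      return true;
    }

    int main() {
      unsigned NewImm = 0;
      // Rotating by 8 bytes equals rotating by 2 dwords or 1 qword.
      assert(RescaleAlignImm(8, 8, 32, NewImm) && NewImm == 2);
      assert(RescaleAlignImm(8, 8, 64, NewImm) && NewImm == 1);
      // A 4-byte rotate cannot be expressed in 64-bit elements.
      assert(!RescaleAlignImm(4, 8, 64, NewImm));
      return 0;
    }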
+
/// Do target-specific dag combines on SELECT and VSELECT nodes.
static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
@@ -26477,6 +29102,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
SDValue LHS = N->getOperand(1);
SDValue RHS = N->getOperand(2);
EVT VT = LHS.getValueType();
+ EVT CondVT = Cond.getValueType();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
// If we have SSE[12] support, try to form min/max nodes. SSE min/max
@@ -26625,117 +29251,24 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
}
- EVT CondVT = Cond.getValueType();
- if (Subtarget.hasAVX512() && VT.isVector() && CondVT.isVector() &&
- CondVT.getVectorElementType() == MVT::i1) {
- // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
- // lowering on KNL. In this case we convert it to
- // v16i8 (select v16i8, v16i8, v16i8) and use AVX instruction.
- // The same situation for all 128 and 256-bit vectors of i8 and i16.
- // Since SKX these selects have a proper lowering.
- EVT OpVT = LHS.getValueType();
- if ((OpVT.is128BitVector() || OpVT.is256BitVector()) &&
- (OpVT.getVectorElementType() == MVT::i8 ||
- OpVT.getVectorElementType() == MVT::i16) &&
- !(Subtarget.hasBWI() && Subtarget.hasVLX())) {
- Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, OpVT, Cond);
- DCI.AddToWorklist(Cond.getNode());
- return DAG.getNode(N->getOpcode(), DL, OpVT, Cond, LHS, RHS);
- }
+ // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
+ // lowering on KNL. In this case we convert it to
+ // v16i8 (select v16i8, v16i8, v16i8) and use AVX instruction.
+ // The same situation for all 128 and 256-bit vectors of i8 and i16.
+ // Since SKX these selects have a proper lowering.
+ if (Subtarget.hasAVX512() && CondVT.isVector() &&
+ CondVT.getVectorElementType() == MVT::i1 &&
+ (VT.is128BitVector() || VT.is256BitVector()) &&
+ (VT.getVectorElementType() == MVT::i8 ||
+ VT.getVectorElementType() == MVT::i16) &&
+ !(Subtarget.hasBWI() && Subtarget.hasVLX())) {
+ Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
+ DCI.AddToWorklist(Cond.getNode());
+ return DAG.getNode(N->getOpcode(), DL, VT, Cond, LHS, RHS);
}
- // If this is a select between two integer constants, try to do some
- // optimizations.
- if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(LHS)) {
- if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(RHS))
- // Don't do this for crazy integer types.
- if (DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType())) {
- // If this is efficiently invertible, canonicalize the LHSC/RHSC values
- // so that TrueC (the true value) is larger than FalseC.
- bool NeedsCondInvert = false;
-
- if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) &&
- // Efficiently invertible.
- (Cond.getOpcode() == ISD::SETCC || // setcc -> invertible.
- (Cond.getOpcode() == ISD::XOR && // xor(X, C) -> invertible.
- isa<ConstantSDNode>(Cond.getOperand(1))))) {
- NeedsCondInvert = true;
- std::swap(TrueC, FalseC);
- }
-
- // Optimize C ? 8 : 0 -> zext(C) << 3. Likewise for any pow2/0.
- if (FalseC->getAPIntValue() == 0 &&
- TrueC->getAPIntValue().isPowerOf2()) {
- if (NeedsCondInvert) // Invert the condition if needed.
- Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
- DAG.getConstant(1, DL, Cond.getValueType()));
-
- // Zero extend the condition if needed.
- Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond);
-
- unsigned ShAmt = TrueC->getAPIntValue().logBase2();
- return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond,
- DAG.getConstant(ShAmt, DL, MVT::i8));
- }
-
- // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst.
- if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
- if (NeedsCondInvert) // Invert the condition if needed.
- Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
- DAG.getConstant(1, DL, Cond.getValueType()));
-
- // Zero extend the condition if needed.
- Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
- FalseC->getValueType(0), Cond);
- return DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
- SDValue(FalseC, 0));
- }
- // Optimize cases that will turn into an LEA instruction. This requires
- // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
- if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
- uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
- if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
-
- bool isFastMultiplier = false;
- if (Diff < 10) {
- switch ((unsigned char)Diff) {
- default: break;
- case 1: // result = add base, cond
- case 2: // result = lea base( , cond*2)
- case 3: // result = lea base(cond, cond*2)
- case 4: // result = lea base( , cond*4)
- case 5: // result = lea base(cond, cond*4)
- case 8: // result = lea base( , cond*8)
- case 9: // result = lea base(cond, cond*8)
- isFastMultiplier = true;
- break;
- }
- }
-
- if (isFastMultiplier) {
- APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
- if (NeedsCondInvert) // Invert the condition if needed.
- Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
- DAG.getConstant(1, DL, Cond.getValueType()));
-
- // Zero extend the condition if needed.
- Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
- Cond);
- // Scale the condition by the difference.
- if (Diff != 1)
- Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
- DAG.getConstant(Diff, DL,
- Cond.getValueType()));
-
- // Add the base if non-zero.
- if (FalseC->getAPIntValue() != 0)
- Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
- SDValue(FalseC, 0));
- return Cond;
- }
- }
- }
- }
+ if (SDValue V = combineSelectOfTwoConstants(N, DAG))
+ return V;
// Canonicalize max and min:
// (x > y) ? x : y -> (x >= y) ? x : y
@@ -26832,53 +29365,8 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
}
}
- // Simplify vector selection if condition value type matches vselect
- // operand type
- if (N->getOpcode() == ISD::VSELECT && CondVT == VT) {
- assert(Cond.getValueType().isVector() &&
- "vector select expects a vector selector!");
-
- bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
- bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
-
- // Try invert the condition if true value is not all 1s and false value
- // is not all 0s.
- if (!TValIsAllOnes && !FValIsAllZeros &&
- // Check if the selector will be produced by CMPP*/PCMP*
- Cond.getOpcode() == ISD::SETCC &&
- // Check if SETCC has already been promoted
- TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT) ==
- CondVT) {
- bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
- bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());
-
- if (TValIsAllZeros || FValIsAllOnes) {
- SDValue CC = Cond.getOperand(2);
- ISD::CondCode NewCC =
- ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
- Cond.getOperand(0).getValueType().isInteger());
- Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1), NewCC);
- std::swap(LHS, RHS);
- TValIsAllOnes = FValIsAllOnes;
- FValIsAllZeros = TValIsAllZeros;
- }
- }
-
- if (TValIsAllOnes || FValIsAllZeros) {
- SDValue Ret;
-
- if (TValIsAllOnes && FValIsAllZeros)
- Ret = Cond;
- else if (TValIsAllOnes)
- Ret =
- DAG.getNode(ISD::OR, DL, CondVT, Cond, DAG.getBitcast(CondVT, RHS));
- else if (FValIsAllZeros)
- Ret = DAG.getNode(ISD::AND, DL, CondVT, Cond,
- DAG.getBitcast(CondVT, LHS));
-
- return DAG.getBitcast(VT, Ret);
- }
- }
+ if (SDValue V = combineVSelectWithAllOnesOrZeros(N, DAG, DCI, Subtarget))
+ return V;
// If this is a *dynamic* select (non-constant condition) and we can match
// this node with one of the variable blend instructions, restructure the
@@ -26887,7 +29375,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() &&
!DCI.isBeforeLegalize() &&
!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
- unsigned BitWidth = Cond.getValueType().getScalarSizeInBits();
+ unsigned BitWidth = Cond.getScalarValueSizeInBits();
// Don't optimize vector selects that map to mask-registers.
if (BitWidth == 1)
@@ -26965,6 +29453,17 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
}
}
+ // Look for vselects with LHS/RHS being bitcasted from an operation that
+ // can be executed on another type. Push the bitcast to the inputs of
+ // the operation. This exposes opportunities for using masking instructions.
+ if (N->getOpcode() == ISD::VSELECT && !DCI.isBeforeLegalizeOps() &&
+ CondVT.getVectorElementType() == MVT::i1) {
+ if (combineBitcastForMaskedOp(LHS, DAG, DCI))
+ return SDValue(N, 0);
+ if (combineBitcastForMaskedOp(RHS, DAG, DCI))
+ return SDValue(N, 0);
+ }
+
return SDValue();
}
@@ -26981,6 +29480,12 @@ static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
(Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
return SDValue();
+ // Can't replace the cmp if it has more uses than the one we're looking at.
+ // FIXME: We would like to be able to handle this, but would need to make sure
+ // all uses were updated.
+ if (!Cmp.hasOneUse())
+ return SDValue();
+
// This only applies to variations of the common case:
// (icmp slt x, 0) -> (icmp sle (add x, 1), 0)
// (icmp sge x, 0) -> (icmp sgt (add x, 1), 0)
@@ -27088,7 +29593,6 @@ static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
// Skip (zext $x), (trunc $x), or (and $x, 1) node.
while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
SetCC.getOpcode() == ISD::TRUNCATE ||
- SetCC.getOpcode() == ISD::AssertZext ||
SetCC.getOpcode() == ISD::AND) {
if (SetCC.getOpcode() == ISD::AND) {
int OpIdx = -1;
@@ -27114,7 +29618,7 @@ static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
break;
assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B &&
"Invalid use of SETCC_CARRY!");
- // FALL THROUGH
+ LLVM_FALLTHROUGH;
case X86ISD::SETCC:
// Set the condition code or opposite one if necessary.
CC = X86::CondCode(SetCC.getConstantOperandVal(0));
@@ -27187,7 +29691,7 @@ static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
case ISD::AND:
case X86ISD::AND:
isAnd = true;
- // fallthru
+ LLVM_FALLTHROUGH;
case ISD::OR:
case X86ISD::OR:
SetCC0 = Cond->getOperand(0);
@@ -27270,8 +29774,7 @@ static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
// This is efficient for any integer data type (including i8/i16) and
// shift amount.
if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
- Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
- DAG.getConstant(CC, DL, MVT::i8), Cond);
+ Cond = getSETCC(CC, Cond, DL, DAG);
// Zero extend the condition if needed.
Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
@@ -27287,8 +29790,7 @@ static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
// Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst. This is efficient
// for any integer data type, including i8/i16.
if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
- Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
- DAG.getConstant(CC, DL, MVT::i8), Cond);
+ Cond = getSETCC(CC, Cond, DL, DAG);
// Zero extend the condition if needed.
Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
@@ -27325,8 +29827,7 @@ static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
if (isFastMultiplier) {
APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
- Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
- DAG.getConstant(CC, DL, MVT::i8), Cond);
+ Cond = getSETCC(CC, Cond, DL, DAG);
// Zero extend the condition if needed.
Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
Cond);
@@ -27525,10 +30026,17 @@ static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) {
/// generate pmullw+pmulhuw for it (MULU16 mode).
static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
- // pmulld is supported since SSE41. It is better to use pmulld
- // instead of pmullw+pmulhw.
+ // Check for legality
// pmullw/pmulhw are not supported by SSE.
- if (Subtarget.hasSSE41() || !Subtarget.hasSSE2())
+ if (!Subtarget.hasSSE2())
+ return SDValue();
+
+ // Check for profitability
+ // pmulld is supported since SSE41. It is better to use pmulld
+ // instead of pmullw+pmulhw, except for subtargets where pmulld is slower than
+ // the expansion.
+ bool OptForMinSize = DAG.getMachineFunction().getFunction()->optForMinSize();
+ if (Subtarget.hasSSE41() && (OptForMinSize || !Subtarget.isPMULLDSlow()))
return SDValue();
ShrinkMode Mode;
@@ -27591,7 +30099,12 @@ static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,
// <4 x i16> undef).
//
// Legalize the operands of mul.
- SmallVector<SDValue, 16> Ops(RegSize / ReducedVT.getSizeInBits(),
+ // FIXME: We may be able to handle non-concatenated vectors by insertion.
+ unsigned ReducedSizeInBits = ReducedVT.getSizeInBits();
+ if ((RegSize % ReducedSizeInBits) != 0)
+ return SDValue();
+
+ SmallVector<SDValue, 16> Ops(RegSize / ReducedSizeInBits,
DAG.getUNDEF(ReducedVT));
Ops[0] = NewN0;
NewN0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops);
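
The MULU16 path mentioned in reduceVMULWidth above rebuilds each 32-bit product from the 16-bit low and high halves that pmullw/pmulhuw produce per lane. A minimal standalone sketch of that reconstruction, not part of the patch; the helper names and values are illustrative only:

  #include <cassert>
  #include <cstdint>

  static uint16_t mullo16(uint16_t a, uint16_t b) {
    return (uint16_t)((uint32_t)a * b);          // low 16 bits, as pmullw would give
  }
  static uint16_t mulhu16(uint16_t a, uint16_t b) {
    return (uint16_t)(((uint32_t)a * b) >> 16);  // high 16 bits, as pmulhuw would give
  }

  int main() {
    uint16_t a = 40000, b = 50000;               // operands known to fit in 16 bits
    uint32_t full = ((uint32_t)mulhu16(a, b) << 16) | mullo16(a, b);
    assert(full == (uint32_t)a * (uint32_t)b);   // 2000000000
    return 0;
  }
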
@@ -27851,7 +30364,7 @@ static SDValue performShiftToAllZeros(SDNode *N, SelectionDAG &DAG,
if (auto *AmtSplat = AmtBV->getConstantSplatNode()) {
const APInt &ShiftAmt = AmtSplat->getAPIntValue();
unsigned MaxAmount =
- VT.getSimpleVT().getVectorElementType().getSizeInBits();
+ VT.getSimpleVT().getScalarSizeInBits();
// SSE2/AVX2 logical shifts always return a vector of 0s
// if the shift amount is bigger than or equal to
@@ -27883,6 +30396,45 @@ static SDValue combineShift(SDNode* N, SelectionDAG &DAG,
return SDValue();
}
+static SDValue combineVectorShift(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
+ assert((X86ISD::VSHLI == N->getOpcode() || X86ISD::VSRLI == N->getOpcode()) &&
+ "Unexpected opcode");
+ EVT VT = N->getValueType(0);
+ unsigned NumBitsPerElt = VT.getScalarSizeInBits();
+
+ // This fails for mask register (vXi1) shifts.
+ if ((NumBitsPerElt % 8) != 0)
+ return SDValue();
+
+ // Out of range logical bit shifts are guaranteed to be zero.
+ APInt ShiftVal = cast<ConstantSDNode>(N->getOperand(1))->getAPIntValue();
+ if (ShiftVal.zextOrTrunc(8).uge(NumBitsPerElt))
+ return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(N));
+
+ // Shift N0 by zero -> N0.
+ if (!ShiftVal)
+ return N->getOperand(0);
+
+ // Shift zero -> zero.
+ if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
+ return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(N));
+
+ // We can decode 'whole byte' logical bit shifts as shuffles.
+ if ((ShiftVal.getZExtValue() % 8) == 0) {
+ SDValue Op(N, 0);
+ SmallVector<int, 1> NonceMask; // Just a placeholder.
+ NonceMask.push_back(0);
+ if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask,
+ /*Depth*/ 1, /*HasVarMask*/ false, DAG,
+ DCI, Subtarget))
+ return SDValue(); // This routine will use CombineTo to replace N.
+ }
+
+ return SDValue();
+}
+
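
The "whole byte" case above hands the shift to the shuffle combiner because a logical shift by a multiple of 8 bits is just a byte permutation with zero fill. A scalar sketch for one 32-bit lane, assuming x86-style little-endian byte order; not part of the patch:

  #include <cassert>
  #include <cstdint>
  #include <cstring>

  int main() {
    uint32_t x = 0xAABBCCDDu;
    uint8_t b[4];
    std::memcpy(b, &x, 4);                        // little-endian lane bytes: DD CC BB AA
    uint8_t shuffled[4] = {b[1], b[2], b[3], 0};  // shuffle mask {1, 2, 3, zero}
    uint32_t y;
    std::memcpy(&y, shuffled, 4);
    assert(y == (x >> 8));                        // 0x00AABBCC
    return 0;
  }
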
/// Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs
/// reference the same FP CMP, and rewrite for CMPEQSS and friends. Likewise for
/// OR -> CMPNEQSS.
@@ -27943,7 +30495,7 @@ static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
// See X86ATTInstPrinter.cpp:printSSECC().
unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
if (Subtarget.hasAVX512()) {
- SDValue FSetCC = DAG.getNode(X86ISD::FSETCC, DL, MVT::i1, CMP00,
+ SDValue FSetCC = DAG.getNode(X86ISD::FSETCCM, DL, MVT::i1, CMP00,
CMP01,
DAG.getConstant(x86cc, DL, MVT::i8));
if (N->getValueType(0) != MVT::i1)
@@ -27995,9 +30547,7 @@ static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) {
SDValue N1 = N->getOperand(1);
SDLoc DL(N);
- if (VT != MVT::v2i64 && VT != MVT::v4i64 &&
- VT != MVT::v8i64 && VT != MVT::v16i32 &&
- VT != MVT::v4i32 && VT != MVT::v8i32) // Legal with VLX
+ if (VT != MVT::v2i64 && VT != MVT::v4i64 && VT != MVT::v8i64)
return SDValue();
// Canonicalize XOR to the left.
@@ -28111,95 +30661,6 @@ static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG,
}
}
-static SDValue combineVectorZext(SDNode *N, SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI,
- const X86Subtarget &Subtarget) {
- SDValue N0 = N->getOperand(0);
- SDValue N1 = N->getOperand(1);
- SDLoc DL(N);
-
- // A vector zext_in_reg may be represented as a shuffle,
- // feeding into a bitcast (this represents anyext) feeding into
- // an and with a mask.
- // We'd like to try to combine that into a shuffle with zero
- // plus a bitcast, removing the and.
- if (N0.getOpcode() != ISD::BITCAST ||
- N0.getOperand(0).getOpcode() != ISD::VECTOR_SHUFFLE)
- return SDValue();
-
- // The other side of the AND should be a splat of 2^C, where C
- // is the number of bits in the source type.
- N1 = peekThroughBitcasts(N1);
- if (N1.getOpcode() != ISD::BUILD_VECTOR)
- return SDValue();
- BuildVectorSDNode *Vector = cast<BuildVectorSDNode>(N1);
-
- ShuffleVectorSDNode *Shuffle = cast<ShuffleVectorSDNode>(N0.getOperand(0));
- EVT SrcType = Shuffle->getValueType(0);
-
- // We expect a single-source shuffle
- if (!Shuffle->getOperand(1)->isUndef())
- return SDValue();
-
- unsigned SrcSize = SrcType.getScalarSizeInBits();
- unsigned NumElems = SrcType.getVectorNumElements();
-
- APInt SplatValue, SplatUndef;
- unsigned SplatBitSize;
- bool HasAnyUndefs;
- if (!Vector->isConstantSplat(SplatValue, SplatUndef,
- SplatBitSize, HasAnyUndefs))
- return SDValue();
-
- unsigned ResSize = N1.getValueType().getScalarSizeInBits();
- // Make sure the splat matches the mask we expect
- if (SplatBitSize > ResSize ||
- (SplatValue + 1).exactLogBase2() != (int)SrcSize)
- return SDValue();
-
- // Make sure the input and output size make sense
- if (SrcSize >= ResSize || ResSize % SrcSize)
- return SDValue();
-
- // We expect a shuffle of the form <0, u, u, u, 1, u, u, u...>
- // The number of u's between each two values depends on the ratio between
- // the source and dest type.
- unsigned ZextRatio = ResSize / SrcSize;
- bool IsZext = true;
- for (unsigned i = 0; i != NumElems; ++i) {
- if (i % ZextRatio) {
- if (Shuffle->getMaskElt(i) > 0) {
- // Expected undef
- IsZext = false;
- break;
- }
- } else {
- if (Shuffle->getMaskElt(i) != (int)(i / ZextRatio)) {
- // Expected element number
- IsZext = false;
- break;
- }
- }
- }
-
- if (!IsZext)
- return SDValue();
-
- // Ok, perform the transformation - replace the shuffle with
- // a shuffle of the form <0, k, k, k, 1, k, k, k> with zero
- // (instead of undef) where the k elements come from the zero vector.
- SmallVector<int, 8> Mask;
- for (unsigned i = 0; i != NumElems; ++i)
- if (i % ZextRatio)
- Mask.push_back(NumElems);
- else
- Mask.push_back(i / ZextRatio);
-
- SDValue NewShuffle = DAG.getVectorShuffle(Shuffle->getValueType(0), DL,
- Shuffle->getOperand(0), DAG.getConstant(0, DL, SrcType), Mask);
- return DAG.getBitcast(N0.getValueType(), NewShuffle);
-}
-
/// If both input operands of a logic op are being cast from floating point
/// types, try to convert this into a floating point logic node to avoid
/// unnecessary moves from SSE to integer registers.
@@ -28255,7 +30716,7 @@ static SDValue combinePCMPAnd1(SDNode *N, SelectionDAG &DAG) {
// masked compare nodes, so they should not make it here.
EVT VT0 = Op0.getValueType();
EVT VT1 = Op1.getValueType();
- unsigned EltBitWidth = VT0.getScalarType().getSizeInBits();
+ unsigned EltBitWidth = VT0.getScalarSizeInBits();
if (VT0 != VT1 || EltBitWidth == 8)
return SDValue();
@@ -28277,9 +30738,6 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
if (DCI.isBeforeLegalizeOps())
return SDValue();
- if (SDValue Zext = combineVectorZext(N, DAG, DCI, Subtarget))
- return Zext;
-
if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
return R;
@@ -28297,6 +30755,17 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
SDValue N1 = N->getOperand(1);
SDLoc DL(N);
+ // Attempt to recursively combine a bitmask AND with shuffles.
+ if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
+ SDValue Op(N, 0);
+ SmallVector<int, 1> NonceMask; // Just a placeholder.
+ NonceMask.push_back(0);
+ if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask,
+ /*Depth*/ 1, /*HasVarMask*/ false, DAG,
+ DCI, Subtarget))
+ return SDValue(); // This routine will use CombineTo to replace N.
+ }
+
// Create BEXTR instructions
// BEXTR is ((X >> imm) & (2**size-1))
if (VT != MVT::i32 && VT != MVT::i64)
@@ -28372,7 +30841,7 @@ static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
// Validate that the Mask operand is a vector sra node.
// FIXME: what to do for bytes, since there is a psignb/pblendvb, but
// there is no psrai.b
- unsigned EltBits = MaskVT.getVectorElementType().getSizeInBits();
+ unsigned EltBits = MaskVT.getScalarSizeInBits();
unsigned SraAmt = ~0;
if (Mask.getOpcode() == ISD::SRA) {
if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Mask.getOperand(1)))
@@ -28450,6 +30919,114 @@ static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
return DAG.getBitcast(VT, Mask);
}
+// Helper function for combineOrCmpEqZeroToCtlzSrl
+// Transforms:
+// seteq(cmp x, 0)
+// into:
+// srl(ctlz x), log2(bitsize(x))
+// Input pattern is checked by caller.
+static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, EVT ExtTy,
+ SelectionDAG &DAG) {
+ SDValue Cmp = Op.getOperand(1);
+ EVT VT = Cmp.getOperand(0).getValueType();
+ unsigned Log2b = Log2_32(VT.getSizeInBits());
+ SDLoc dl(Op);
+ SDValue Clz = DAG.getNode(ISD::CTLZ, dl, VT, Cmp->getOperand(0));
+ // The result of the shift is true or false, and on X86, the 32-bit
+ // encoding of shr and lzcnt is more desirable.
+ SDValue Trunc = DAG.getZExtOrTrunc(Clz, dl, MVT::i32);
+ SDValue Scc = DAG.getNode(ISD::SRL, dl, MVT::i32, Trunc,
+ DAG.getConstant(Log2b, dl, VT));
+ return DAG.getZExtOrTrunc(Scc, dl, ExtTy);
+}
+
+// Try to transform:
+// zext(or(setcc(eq, (cmp x, 0)), setcc(eq, (cmp y, 0))))
+// into:
+// srl(or(ctlz(x), ctlz(y)), log2(bitsize(x)))
+// Will also attempt to match more generic cases, eg:
+// zext(or(or(setcc(eq, cmp 0), setcc(eq, cmp 0)), setcc(eq, cmp 0)))
+// Only applies if the target supports the FastLZCNT feature.
+static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
+ if (DCI.isBeforeLegalize() || !Subtarget.getTargetLowering()->isCtlzFast())
+ return SDValue();
+
+ auto isORCandidate = [](SDValue N) {
+ return (N->getOpcode() == ISD::OR && N->hasOneUse());
+ };
+
+ // Check the zero extend is extending to 32-bit or more. The code generated by
+ // srl(ctlz) for 16-bit or less variants of the pattern would require extra
+ // instructions to clear the upper bits.
+ if (!N->hasOneUse() || !N->getSimpleValueType(0).bitsGE(MVT::i32) ||
+ !isORCandidate(N->getOperand(0)))
+ return SDValue();
+
+ // Check the node matches: setcc(eq, cmp 0)
+ auto isSetCCCandidate = [](SDValue N) {
+ return N->getOpcode() == X86ISD::SETCC && N->hasOneUse() &&
+ X86::CondCode(N->getConstantOperandVal(0)) == X86::COND_E &&
+ N->getOperand(1).getOpcode() == X86ISD::CMP &&
+ N->getOperand(1).getConstantOperandVal(1) == 0 &&
+ N->getOperand(1).getValueType().bitsGE(MVT::i32);
+ };
+
+ SDNode *OR = N->getOperand(0).getNode();
+ SDValue LHS = OR->getOperand(0);
+ SDValue RHS = OR->getOperand(1);
+
+ // Save nodes matching or(or, setcc(eq, cmp 0)).
+ SmallVector<SDNode *, 2> ORNodes;
+ while (((isORCandidate(LHS) && isSetCCCandidate(RHS)) ||
+ (isORCandidate(RHS) && isSetCCCandidate(LHS)))) {
+ ORNodes.push_back(OR);
+ OR = (LHS->getOpcode() == ISD::OR) ? LHS.getNode() : RHS.getNode();
+ LHS = OR->getOperand(0);
+ RHS = OR->getOperand(1);
+ }
+
+ // The last OR node should match or(setcc(eq, cmp 0), setcc(eq, cmp 0)).
+ if (!(isSetCCCandidate(LHS) && isSetCCCandidate(RHS)) ||
+ !isORCandidate(SDValue(OR, 0)))
+ return SDValue();
+
+ // We have an or(setcc(eq, cmp 0), setcc(eq, cmp 0)) pattern; try to lower
+ // it to or(srl(ctlz), srl(ctlz)).
+ // The dag combiner can then fold it into:
+ // srl(or(ctlz, ctlz)).
+ EVT VT = OR->getValueType(0);
+ SDValue NewLHS = lowerX86CmpEqZeroToCtlzSrl(LHS, VT, DAG);
+ SDValue Ret, NewRHS;
+ if (NewLHS && (NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG)))
+ Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, NewLHS, NewRHS);
+
+ if (!Ret)
+ return SDValue();
+
+ // Try to lower nodes matching the or(or, setcc(eq, cmp 0)) pattern.
+ while (ORNodes.size() > 0) {
+ OR = ORNodes.pop_back_val();
+ LHS = OR->getOperand(0);
+ RHS = OR->getOperand(1);
+ // Swap rhs with lhs to match or(setcc(eq, cmp, 0), or).
+ if (RHS->getOpcode() == ISD::OR)
+ std::swap(LHS, RHS);
+ EVT VT = OR->getValueType(0);
+ SDValue NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG);
+ if (!NewRHS)
+ return SDValue();
+ Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, Ret, NewRHS);
+ }
+
+ if (Ret)
+ Ret = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret);
+
+ return Ret;
+}
+
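
The transform above rests on the identity that, for an N-bit unsigned x, (x == 0) equals ctlz(x) >> log2(N), since only zero has all N leading bits clear. A standalone C++20 sketch of the 32-bit case (uses std::countl_zero; illustrative only, not the DAG code):

  #include <bit>
  #include <cassert>
  #include <cstdint>

  int main() {
    for (uint32_t x : {0u, 1u, 0x80000000u, 1234u}) {
      unsigned viaCmp = (x == 0) ? 1u : 0u;
      unsigned viaClz = (unsigned)std::countl_zero(x) >> 5;  // log2(32) == 5
      assert(viaCmp == viaClz);
    }
    return 0;
  }
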
static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
@@ -28505,18 +31082,23 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
unsigned Opc = X86ISD::SHLD;
SDValue Op0 = N0.getOperand(0);
SDValue Op1 = N1.getOperand(0);
- if (ShAmt0.getOpcode() == ISD::SUB) {
+ if (ShAmt0.getOpcode() == ISD::SUB ||
+ ShAmt0.getOpcode() == ISD::XOR) {
Opc = X86ISD::SHRD;
std::swap(Op0, Op1);
std::swap(ShAmt0, ShAmt1);
}
+ // OR( SHL( X, C ), SRL( Y, 32 - C ) ) -> SHLD( X, Y, C )
+ // OR( SRL( X, C ), SHL( Y, 32 - C ) ) -> SHRD( X, Y, C )
+ // OR( SHL( X, C ), SRL( SRL( Y, 1 ), XOR( C, 31 ) ) ) -> SHLD( X, Y, C )
+ // OR( SRL( X, C ), SHL( SHL( Y, 1 ), XOR( C, 31 ) ) ) -> SHRD( X, Y, C )
unsigned Bits = VT.getSizeInBits();
if (ShAmt1.getOpcode() == ISD::SUB) {
SDValue Sum = ShAmt1.getOperand(0);
if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) {
SDValue ShAmt1Op1 = ShAmt1.getOperand(1);
- if (ShAmt1Op1.getNode()->getOpcode() == ISD::TRUNCATE)
+ if (ShAmt1Op1.getOpcode() == ISD::TRUNCATE)
ShAmt1Op1 = ShAmt1Op1.getOperand(0);
if (SumC->getSExtValue() == Bits && ShAmt1Op1 == ShAmt0)
return DAG.getNode(Opc, DL, VT,
@@ -28526,18 +31108,39 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
}
} else if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) {
ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0);
- if (ShAmt0C &&
- ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue() == Bits)
+ if (ShAmt0C && (ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue()) == Bits)
return DAG.getNode(Opc, DL, VT,
N0.getOperand(0), N1.getOperand(0),
DAG.getNode(ISD::TRUNCATE, DL,
MVT::i8, ShAmt0));
+ } else if (ShAmt1.getOpcode() == ISD::XOR) {
+ SDValue Mask = ShAmt1.getOperand(1);
+ if (ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask)) {
+ unsigned InnerShift = (X86ISD::SHLD == Opc ? ISD::SRL : ISD::SHL);
+ SDValue ShAmt1Op0 = ShAmt1.getOperand(0);
+ if (ShAmt1Op0.getOpcode() == ISD::TRUNCATE)
+ ShAmt1Op0 = ShAmt1Op0.getOperand(0);
+ if (MaskC->getSExtValue() == (Bits - 1) && ShAmt1Op0 == ShAmt0) {
+ if (Op1.getOpcode() == InnerShift &&
+ isa<ConstantSDNode>(Op1.getOperand(1)) &&
+ Op1.getConstantOperandVal(1) == 1) {
+ return DAG.getNode(Opc, DL, VT, Op0, Op1.getOperand(0),
+ DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0));
+ }
+ // Test for ADD( Y, Y ) as an equivalent to SHL( Y, 1 ).
+ if (InnerShift == ISD::SHL && Op1.getOpcode() == ISD::ADD &&
+ Op1.getOperand(0) == Op1.getOperand(1)) {
+ return DAG.getNode(Opc, DL, VT, Op0, Op1.getOperand(0),
+ DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0));
+ }
+ }
+ }
}
return SDValue();
}
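
The new XOR form matched above is equivalent to the classic double-shift pattern because, for 0 < C < 32, C ^ 31 == 31 - C, so (Y >> 1) >> (C ^ 31) == Y >> (32 - C). A scalar sketch checking both shapes against a reference SHLD; 32-bit operands assumed, helper name illustrative:

  #include <cassert>
  #include <cstdint>

  static uint32_t shld32(uint32_t x, uint32_t y, unsigned c) {  // reference SHLD
    return (x << c) | (y >> (32 - c));                          // valid for 0 < c < 32
  }

  int main() {
    uint32_t x = 0x12345678u, y = 0x9ABCDEF0u;
    for (unsigned c = 1; c < 32; ++c) {
      uint32_t subForm = (x << c) | (y >> (32 - c));         // OR( SHL, SRL( Y, 32 - C ) )
      uint32_t xorForm = (x << c) | ((y >> 1) >> (c ^ 31));  // OR( SHL, SRL( SRL( Y, 1 ), XOR( C, 31 ) ) )
      assert(subForm == shld32(x, y, c) && xorForm == shld32(x, y, c));
    }
    return 0;
  }
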
-// Generate NEG and CMOV for integer abs.
+/// Generate NEG and CMOV for integer abs.
static SDValue combineIntegerAbs(SDNode *N, SelectionDAG &DAG) {
EVT VT = N->getValueType(0);
@@ -28553,21 +31156,19 @@ static SDValue combineIntegerAbs(SDNode *N, SelectionDAG &DAG) {
// Check pattern of XOR(ADD(X,Y), Y) where Y is SRA(X, size(X)-1)
// and change it to SUB and CMOV.
if (VT.isInteger() && N->getOpcode() == ISD::XOR &&
- N0.getOpcode() == ISD::ADD &&
- N0.getOperand(1) == N1 &&
- N1.getOpcode() == ISD::SRA &&
- N1.getOperand(0) == N0.getOperand(0))
- if (ConstantSDNode *Y1C = dyn_cast<ConstantSDNode>(N1.getOperand(1)))
- if (Y1C->getAPIntValue() == VT.getSizeInBits()-1) {
- // Generate SUB & CMOV.
- SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
- DAG.getConstant(0, DL, VT), N0.getOperand(0));
-
- SDValue Ops[] = { N0.getOperand(0), Neg,
- DAG.getConstant(X86::COND_GE, DL, MVT::i8),
- SDValue(Neg.getNode(), 1) };
- return DAG.getNode(X86ISD::CMOV, DL, DAG.getVTList(VT, MVT::Glue), Ops);
- }
+ N0.getOpcode() == ISD::ADD && N0.getOperand(1) == N1 &&
+ N1.getOpcode() == ISD::SRA && N1.getOperand(0) == N0.getOperand(0)) {
+ auto *Y1C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
+ if (Y1C && Y1C->getAPIntValue() == VT.getSizeInBits() - 1) {
+ // Generate SUB & CMOV.
+ SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
+ DAG.getConstant(0, DL, VT), N0.getOperand(0));
+ SDValue Ops[] = {N0.getOperand(0), Neg,
+ DAG.getConstant(X86::COND_GE, DL, MVT::i8),
+ SDValue(Neg.getNode(), 1)};
+ return DAG.getNode(X86ISD::CMOV, DL, DAG.getVTList(VT, MVT::Glue), Ops);
+ }
+ }
return SDValue();
}
@@ -28671,28 +31272,6 @@ static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
return DAG.getNode(X86ISD::PCMPGT, SDLoc(N), VT, Shift.getOperand(0), Ones);
}
-static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI,
- const X86Subtarget &Subtarget) {
- if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
- return Cmp;
-
- if (DCI.isBeforeLegalizeOps())
- return SDValue();
-
- if (SDValue RV = foldXorTruncShiftIntoCmp(N, DAG))
- return RV;
-
- if (Subtarget.hasCMov())
- if (SDValue RV = combineIntegerAbs(N, DAG))
- return RV;
-
- if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
- return FPLogic;
-
- return SDValue();
-}
-
/// This function detects the AVG pattern between vectors of unsigned i8/i16,
/// which is c = (a + b + 1) / 2, and replace this operation with the efficient
/// X86ISD::AVG instruction.
@@ -28717,7 +31296,7 @@ static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
if (!Subtarget.hasSSE2())
return SDValue();
- if (Subtarget.hasAVX512()) {
+ if (Subtarget.hasBWI()) {
if (VT.getSizeInBits() > 512)
return SDValue();
} else if (Subtarget.hasAVX2()) {
@@ -28999,6 +31578,11 @@ static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
MaskedLoadSDNode *Mld = cast<MaskedLoadSDNode>(N);
+
+ // TODO: Expanding load with constant mask may be optimized as well.
+ if (Mld->isExpandingLoad())
+ return SDValue();
+
if (Mld->getExtensionType() == ISD::NON_EXTLOAD) {
if (SDValue ScalarLoad = reduceMaskedLoadToScalarLoad(Mld, DAG, DCI))
return ScalarLoad;
@@ -29018,8 +31602,8 @@ static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
SDLoc dl(Mld);
assert(LdVT != VT && "Cannot extend to the same type");
- unsigned ToSz = VT.getVectorElementType().getSizeInBits();
- unsigned FromSz = LdVT.getVectorElementType().getSizeInBits();
+ unsigned ToSz = VT.getScalarSizeInBits();
+ unsigned FromSz = LdVT.getScalarSizeInBits();
// From/To sizes and ElemCount must be pow of two.
assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
"Unexpected size for extending masked load");
@@ -29114,6 +31698,10 @@ static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS,
static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);
+
+ if (Mst->isCompressingStore())
+ return SDValue();
+
if (!Mst->isTruncatingStore())
return reduceMaskedStoreToScalarStore(Mst, DAG);
@@ -29124,8 +31712,8 @@ static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
SDLoc dl(Mst);
assert(StVT != VT && "Cannot truncate to the same type");
- unsigned FromSz = VT.getVectorElementType().getSizeInBits();
- unsigned ToSz = StVT.getVectorElementType().getSizeInBits();
+ unsigned FromSz = VT.getScalarSizeInBits();
+ unsigned ToSz = StVT.getScalarSizeInBits();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
@@ -29253,8 +31841,8 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
unsigned NumElems = VT.getVectorNumElements();
assert(StVT != VT && "Cannot truncate to the same type");
- unsigned FromSz = VT.getVectorElementType().getSizeInBits();
- unsigned ToSz = StVT.getVectorElementType().getSizeInBits();
+ unsigned FromSz = VT.getScalarSizeInBits();
+ unsigned ToSz = StVT.getScalarSizeInBits();
// The truncating store is legal in some cases. For example
// vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw
@@ -29596,6 +32184,83 @@ static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+/// Attempt to pre-truncate inputs to arithmetic ops if it will simplify
+/// the codegen.
+/// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) )
+static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget,
+ SDLoc &DL) {
+ assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode");
+ SDValue Src = N->getOperand(0);
+ unsigned Opcode = Src.getOpcode();
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+ EVT VT = N->getValueType(0);
+ EVT SrcVT = Src.getValueType();
+
+ auto IsRepeatedOpOrOneUseConstant = [](SDValue Op0, SDValue Op1) {
+ // TODO: Add extra cases where we can truncate both inputs for the
+ // cost of one (or none).
+ // e.g. TRUNC( BINOP( EXT( X ), EXT( Y ) ) ) --> BINOP( X, Y )
+ if (Op0 == Op1)
+ return true;
+
+ SDValue BC0 = peekThroughOneUseBitcasts(Op0);
+ SDValue BC1 = peekThroughOneUseBitcasts(Op1);
+ return ISD::isBuildVectorOfConstantSDNodes(BC0.getNode()) ||
+ ISD::isBuildVectorOfConstantSDNodes(BC1.getNode());
+ };
+
+ auto TruncateArithmetic = [&](SDValue N0, SDValue N1) {
+ SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, VT, N0);
+ SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
+ return DAG.getNode(Opcode, DL, VT, Trunc0, Trunc1);
+ };
+
+ // Don't combine if the operation has other uses.
+ if (!N->isOnlyUserOf(Src.getNode()))
+ return SDValue();
+
+ // Only support vector truncation for now.
+ // TODO: i64 scalar math would benefit as well.
+ if (!VT.isVector())
+ return SDValue();
+
+ // In most cases it's only worth pre-truncating if we're only facing the cost
+ // of one truncation.
+ // i.e. if one of the inputs will constant fold or the input is repeated.
+ switch (Opcode) {
+ case ISD::AND:
+ case ISD::XOR:
+ case ISD::OR: {
+ SDValue Op0 = Src.getOperand(0);
+ SDValue Op1 = Src.getOperand(1);
+ if (TLI.isOperationLegalOrPromote(Opcode, VT) &&
+ IsRepeatedOpOrOneUseConstant(Op0, Op1))
+ return TruncateArithmetic(Op0, Op1);
+ break;
+ }
+
+ case ISD::MUL:
+ // X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) - it's
+ // better to truncate if we have the chance.
+ if (SrcVT.getScalarType() == MVT::i64 && TLI.isOperationLegal(Opcode, VT) &&
+ !TLI.isOperationLegal(Opcode, SrcVT))
+ return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1));
+ LLVM_FALLTHROUGH;
+ case ISD::ADD: {
+ SDValue Op0 = Src.getOperand(0);
+ SDValue Op1 = Src.getOperand(1);
+ if (TLI.isOperationLegal(Opcode, VT) &&
+ IsRepeatedOpOrOneUseConstant(Op0, Op1))
+ return TruncateArithmetic(Op0, Op1);
+ break;
+ }
+ }
+
+ return SDValue();
+}
+
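
Pre-truncating is sound because truncation to a narrower integer type is reduction modulo 2^N, and add/mul and the bitwise ops all commute with that reduction. A quick scalar check, not part of the patch:

  #include <cassert>
  #include <cstdint>

  int main() {
    uint64_t x = 0x123456789ABCDEF0ull, y = 0x0FEDCBA987654321ull;
    assert((uint32_t)(x * y) == (uint32_t)((uint32_t)x * (uint32_t)y));
    assert((uint32_t)(x + y) == (uint32_t)((uint32_t)x + (uint32_t)y));
    assert((uint32_t)(x ^ y) == ((uint32_t)x ^ (uint32_t)y));
    return 0;
  }
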
/// Truncate a group of v4i32 into v16i8/v8i16 using X86ISD::PACKUS.
static SDValue
combineVectorTruncationWithPACKUS(SDNode *N, SelectionDAG &DAG,
@@ -29653,7 +32318,8 @@ combineVectorTruncationWithPACKUS(SDNode *N, SelectionDAG &DAG,
/// Truncate a group of v4i32 into v8i16 using X86ISD::PACKSS.
static SDValue
-combineVectorTruncationWithPACKSS(SDNode *N, SelectionDAG &DAG,
+combineVectorTruncationWithPACKSS(SDNode *N, const X86Subtarget &Subtarget,
+ SelectionDAG &DAG,
SmallVector<SDValue, 8> &Regs) {
assert(Regs.size() > 0 && Regs[0].getValueType() == MVT::v4i32);
EVT OutVT = N->getValueType(0);
@@ -29662,8 +32328,10 @@ combineVectorTruncationWithPACKSS(SDNode *N, SelectionDAG &DAG,
// Shift left by 16 bits, then arithmetic-shift right by 16 bits.
SDValue ShAmt = DAG.getConstant(16, DL, MVT::i32);
for (auto &Reg : Regs) {
- Reg = getTargetVShiftNode(X86ISD::VSHLI, DL, MVT::v4i32, Reg, ShAmt, DAG);
- Reg = getTargetVShiftNode(X86ISD::VSRAI, DL, MVT::v4i32, Reg, ShAmt, DAG);
+ Reg = getTargetVShiftNode(X86ISD::VSHLI, DL, MVT::v4i32, Reg, ShAmt,
+ Subtarget, DAG);
+ Reg = getTargetVShiftNode(X86ISD::VSRAI, DL, MVT::v4i32, Reg, ShAmt,
+ Subtarget, DAG);
}
for (unsigned i = 0, e = Regs.size() / 2; i < e; i++)
@@ -29681,7 +32349,7 @@ combineVectorTruncationWithPACKSS(SDNode *N, SelectionDAG &DAG,
/// X86ISD::PACKUS/X86ISD::PACKSS operations. We do it here because after type
/// legalization the truncation will be translated into a BUILD_VECTOR with each
/// element that is extracted from a vector and then truncated, and it is
-/// diffcult to do this optimization based on them.
+/// difficult to do this optimization based on them.
static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
EVT OutVT = N->getValueType(0);
@@ -29732,17 +32400,60 @@ static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
if (Subtarget.hasSSE41() || OutSVT == MVT::i8)
return combineVectorTruncationWithPACKUS(N, DAG, SubVec);
else if (InSVT == MVT::i32)
- return combineVectorTruncationWithPACKSS(N, DAG, SubVec);
+ return combineVectorTruncationWithPACKSS(N, Subtarget, DAG, SubVec);
else
return SDValue();
}
+/// This function transforms vector truncation of 'all or none' bit values.
+/// vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32 into X86ISD::PACKSS operations.
+static SDValue combineVectorSignBitsTruncation(SDNode *N, SDLoc &DL,
+ SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ // Requires SSE2 but AVX512 has fast truncate.
+ if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
+ return SDValue();
+
+ if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple())
+ return SDValue();
+
+ SDValue In = N->getOperand(0);
+ if (!In.getValueType().isSimple())
+ return SDValue();
+
+ MVT VT = N->getValueType(0).getSimpleVT();
+ MVT SVT = VT.getScalarType();
+
+ MVT InVT = In.getValueType().getSimpleVT();
+ MVT InSVT = InVT.getScalarType();
+
+ // Use PACKSS if the input is a splatted sign bit.
+ // e.g. Comparison result, sext_in_reg, etc.
+ unsigned NumSignBits = DAG.ComputeNumSignBits(In);
+ if (NumSignBits != InSVT.getSizeInBits())
+ return SDValue();
+
+ // Check we have a truncation suited for PACKSS.
+ if (!VT.is128BitVector() && !VT.is256BitVector())
+ return SDValue();
+ if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32)
+ return SDValue();
+ if (InSVT != MVT::i16 && InSVT != MVT::i32 && InSVT != MVT::i64)
+ return SDValue();
+
+ return truncateVectorCompareWithPACKSS(VT, In, DL, DAG, Subtarget);
+}
+
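
PACKSS saturates to the narrow signed range, so lanes whose bits are a splatted sign bit (i.e. 0 or -1) come through unchanged, which is what makes it a valid truncation here. A one-lane scalar sketch of the i16 -> i8 case; the helper is a hand-written stand-in for the instruction, illustrative only:

  #include <cassert>
  #include <cstdint>

  static int8_t packss_lane(int16_t v) {  // saturate to [-128, 127], like one PACKSSWB lane
    if (v > 127) return 127;
    if (v < -128) return -128;
    return (int8_t)v;
  }

  int main() {
    assert(packss_lane(0) == 0);                // all-zeros lane survives
    assert(packss_lane(-1) == -1);              // all-ones lane survives
    assert((uint8_t)packss_lane(-1) == 0xFF);   // i.e. still all-ones as bits
    return 0;
  }
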
static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
EVT VT = N->getValueType(0);
SDValue Src = N->getOperand(0);
SDLoc DL(N);
+ // Attempt to pre-truncate inputs to arithmetic ops instead.
+ if (SDValue V = combineTruncatedArithmetic(N, DAG, Subtarget, DL))
+ return V;
+
// Try to detect AVG pattern first.
if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL))
return Avg;
@@ -29755,15 +32466,75 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc);
}
+ // Try to truncate extended sign bits with PACKSS.
+ if (SDValue V = combineVectorSignBitsTruncation(N, DL, DAG, Subtarget))
+ return V;
+
return combineVectorTruncation(N, DAG, Subtarget);
}
+/// Returns the negated value if the node \p N flips sign of FP value.
+///
+/// FP-negation node may have different forms: FNEG(x) or FXOR (x, 0x80000000).
+/// AVX512F does not have FXOR, so FNEG is lowered as
+/// (bitcast (xor (bitcast x), (bitcast ConstantFP(0x80000000)))).
+/// In this case we go through all bitcasts.
+static SDValue isFNEG(SDNode *N) {
+ if (N->getOpcode() == ISD::FNEG)
+ return N->getOperand(0);
+
+ SDValue Op = peekThroughBitcasts(SDValue(N, 0));
+ if (Op.getOpcode() != X86ISD::FXOR && Op.getOpcode() != ISD::XOR)
+ return SDValue();
+
+ SDValue Op1 = peekThroughBitcasts(Op.getOperand(1));
+ if (!Op1.getValueType().isFloatingPoint())
+ return SDValue();
+
+ SDValue Op0 = peekThroughBitcasts(Op.getOperand(0));
+
+ unsigned EltBits = Op1.getScalarValueSizeInBits();
+ auto isSignBitValue = [&](const ConstantFP *C) {
+ return C->getValueAPF().bitcastToAPInt() == APInt::getSignBit(EltBits);
+ };
+
+ // There is more than one way to represent the same constant on
+ // the different X86 targets. The type of the node may also depend on size.
+ // - load scalar value and broadcast
+ // - BUILD_VECTOR node
+ // - load from a constant pool.
+ // We check all variants here.
+ if (Op1.getOpcode() == X86ISD::VBROADCAST) {
+ if (auto *C = getTargetConstantFromNode(Op1.getOperand(0)))
+ if (isSignBitValue(cast<ConstantFP>(C)))
+ return Op0;
+
+ } else if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op1)) {
+ if (ConstantFPSDNode *CN = BV->getConstantFPSplatNode())
+ if (isSignBitValue(CN->getConstantFPValue()))
+ return Op0;
+
+ } else if (auto *C = getTargetConstantFromNode(Op1)) {
+ if (C->getType()->isVectorTy()) {
+ if (auto *SplatV = C->getSplatValue())
+ if (isSignBitValue(cast<ConstantFP>(SplatV)))
+ return Op0;
+ } else if (auto *FPConst = dyn_cast<ConstantFP>(C))
+ if (isSignBitValue(FPConst))
+ return Op0;
+ }
+ return SDValue();
+}
+
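
All of the constant forms isFNEG() accepts encode the same operation: xor with the sign-bit mask. A scalar check using C++20 std::bit_cast, illustrative only:

  #include <bit>
  #include <cassert>
  #include <cstdint>

  int main() {
    float x = 3.5f;
    uint32_t bits = std::bit_cast<uint32_t>(x) ^ 0x80000000u;  // flip the sign bit
    assert(std::bit_cast<float>(bits) == -x);
    return 0;
  }
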
/// Do target-specific dag combines on floating point negations.
static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
- EVT VT = N->getValueType(0);
+ EVT OrigVT = N->getValueType(0);
+ SDValue Arg = isFNEG(N);
+ assert(Arg.getNode() && "N is expected to be an FNEG node");
+
+ EVT VT = Arg.getValueType();
EVT SVT = VT.getScalarType();
- SDValue Arg = N->getOperand(0);
SDLoc DL(N);
// Let legalize expand this if it isn't a legal type yet.
@@ -29776,70 +32547,182 @@ static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,
if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
Arg->getFlags()->hasNoSignedZeros() && Subtarget.hasAnyFMA()) {
SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
- return DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
- Arg.getOperand(1), Zero);
+ SDValue NewNode = DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
+ Arg.getOperand(1), Zero);
+ return DAG.getBitcast(OrigVT, NewNode);
}
- // If we're negating a FMA node, then we can adjust the
+ // If we're negating an FMA node, then we can adjust the
// instruction to include the extra negation.
+ unsigned NewOpcode = 0;
if (Arg.hasOneUse()) {
switch (Arg.getOpcode()) {
- case X86ISD::FMADD:
- return DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
- Arg.getOperand(1), Arg.getOperand(2));
- case X86ISD::FMSUB:
- return DAG.getNode(X86ISD::FNMADD, DL, VT, Arg.getOperand(0),
- Arg.getOperand(1), Arg.getOperand(2));
- case X86ISD::FNMADD:
- return DAG.getNode(X86ISD::FMSUB, DL, VT, Arg.getOperand(0),
- Arg.getOperand(1), Arg.getOperand(2));
- case X86ISD::FNMSUB:
- return DAG.getNode(X86ISD::FMADD, DL, VT, Arg.getOperand(0),
- Arg.getOperand(1), Arg.getOperand(2));
- }
- }
+ case X86ISD::FMADD: NewOpcode = X86ISD::FNMSUB; break;
+ case X86ISD::FMSUB: NewOpcode = X86ISD::FNMADD; break;
+ case X86ISD::FNMADD: NewOpcode = X86ISD::FMSUB; break;
+ case X86ISD::FNMSUB: NewOpcode = X86ISD::FMADD; break;
+ case X86ISD::FMADD_RND: NewOpcode = X86ISD::FNMSUB_RND; break;
+ case X86ISD::FMSUB_RND: NewOpcode = X86ISD::FNMADD_RND; break;
+ case X86ISD::FNMADD_RND: NewOpcode = X86ISD::FMSUB_RND; break;
+ case X86ISD::FNMSUB_RND: NewOpcode = X86ISD::FMADD_RND; break;
+ // We can't handle scalar intrinsic node here because it would only
+ // invert one element and not the whole vector. But we could try to handle
+ // a negation of the lower element only.
+ }
+ }
+ if (NewOpcode)
+ return DAG.getBitcast(OrigVT, DAG.getNode(NewOpcode, DL, VT,
+ Arg.getNode()->ops()));
+
return SDValue();
}
static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
- const X86Subtarget &Subtarget) {
- EVT VT = N->getValueType(0);
- if (VT.is512BitVector() && !Subtarget.hasDQI()) {
- // VXORPS, VORPS, VANDPS, VANDNPS are supported only under DQ extention.
- // These logic operations may be executed in the integer domain.
+ const X86Subtarget &Subtarget) {
+ MVT VT = N->getSimpleValueType(0);
+ // If we have integer vector types available, use the integer opcodes.
+ if (VT.isVector() && Subtarget.hasSSE2()) {
SDLoc dl(N);
- MVT IntScalar = MVT::getIntegerVT(VT.getScalarSizeInBits());
- MVT IntVT = MVT::getVectorVT(IntScalar, VT.getVectorNumElements());
+
+ MVT IntVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
SDValue Op0 = DAG.getBitcast(IntVT, N->getOperand(0));
SDValue Op1 = DAG.getBitcast(IntVT, N->getOperand(1));
- unsigned IntOpcode = 0;
+ unsigned IntOpcode;
switch (N->getOpcode()) {
- default: llvm_unreachable("Unexpected FP logic op");
- case X86ISD::FOR: IntOpcode = ISD::OR; break;
- case X86ISD::FXOR: IntOpcode = ISD::XOR; break;
- case X86ISD::FAND: IntOpcode = ISD::AND; break;
- case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break;
+ default: llvm_unreachable("Unexpected FP logic op");
+ case X86ISD::FOR: IntOpcode = ISD::OR; break;
+ case X86ISD::FXOR: IntOpcode = ISD::XOR; break;
+ case X86ISD::FAND: IntOpcode = ISD::AND; break;
+ case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break;
}
SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1);
return DAG.getBitcast(VT, IntOp);
}
return SDValue();
}
+
+static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
+ if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
+ return Cmp;
+
+ if (DCI.isBeforeLegalizeOps())
+ return SDValue();
+
+ if (SDValue RV = foldXorTruncShiftIntoCmp(N, DAG))
+ return RV;
+
+ if (Subtarget.hasCMov())
+ if (SDValue RV = combineIntegerAbs(N, DAG))
+ return RV;
+
+ if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
+ return FPLogic;
+
+ if (isFNEG(N))
+ return combineFneg(N, DAG, Subtarget);
+ return SDValue();
+}
+
+
+static bool isNullFPScalarOrVectorConst(SDValue V) {
+ return isNullFPConstant(V) || ISD::isBuildVectorAllZeros(V.getNode());
+}
+
+/// If a value is a scalar FP zero or a vector FP zero (potentially including
+/// undefined elements), return a zero constant that may be used to fold away
+/// that value. In the case of a vector, the returned constant will not contain
+/// undefined elements even if the input parameter does. This makes it suitable
+/// to be used as a replacement operand with operations (eg, bitwise-and) where
+/// an undef should not propagate.
+static SDValue getNullFPConstForNullVal(SDValue V, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ if (!isNullFPScalarOrVectorConst(V))
+ return SDValue();
+
+ if (V.getValueType().isVector())
+ return getZeroVector(V.getSimpleValueType(), Subtarget, DAG, SDLoc(V));
+
+ return V;
+}
+
+static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ EVT VT = N->getValueType(0);
+ SDLoc DL(N);
+
+ // Vector types are handled in combineANDXORWithAllOnesIntoANDNP().
+ if (!((VT == MVT::f32 && Subtarget.hasSSE1()) ||
+ (VT == MVT::f64 && Subtarget.hasSSE2())))
+ return SDValue();
+
+ auto isAllOnesConstantFP = [](SDValue V) {
+ auto *C = dyn_cast<ConstantFPSDNode>(V);
+ return C && C->getConstantFPValue()->isAllOnesValue();
+ };
+
+ // fand (fxor X, -1), Y --> fandn X, Y
+ if (N0.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N0.getOperand(1)))
+ return DAG.getNode(X86ISD::FANDN, DL, VT, N0.getOperand(0), N1);
+
+ // fand X, (fxor Y, -1) --> fandn Y, X
+ if (N1.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N1.getOperand(1)))
+ return DAG.getNode(X86ISD::FANDN, DL, VT, N1.getOperand(0), N0);
+
+ return SDValue();
+}
+
+/// Do target-specific dag combines on X86ISD::FAND nodes.
+static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ // FAND(0.0, x) -> 0.0
+ if (SDValue V = getNullFPConstForNullVal(N->getOperand(0), DAG, Subtarget))
+ return V;
+
+ // FAND(x, 0.0) -> 0.0
+ if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
+ return V;
+
+ if (SDValue V = combineFAndFNotToFAndn(N, DAG, Subtarget))
+ return V;
+
+ return lowerX86FPLogicOp(N, DAG, Subtarget);
+}
+
+/// Do target-specific dag combines on X86ISD::FANDN nodes.
+static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ // FANDN(0.0, x) -> x
+ if (isNullFPScalarOrVectorConst(N->getOperand(0)))
+ return N->getOperand(1);
+
+ // FANDN(x, 0.0) -> 0.0
+ if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
+ return V;
+
+ return lowerX86FPLogicOp(N, DAG, Subtarget);
+}
+
/// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
static SDValue combineFOr(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
// F[X]OR(0.0, x) -> x
- if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
- if (C->getValueAPF().isPosZero())
- return N->getOperand(1);
+ if (isNullFPScalarOrVectorConst(N->getOperand(0)))
+ return N->getOperand(1);
// F[X]OR(x, 0.0) -> x
- if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
- if (C->getValueAPF().isPosZero())
- return N->getOperand(0);
+ if (isNullFPScalarOrVectorConst(N->getOperand(1)))
+ return N->getOperand(0);
+
+ if (isFNEG(N))
+ if (SDValue NewVal = combineFneg(N, DAG, Subtarget))
+ return NewVal;
return lowerX86FPLogicOp(N, DAG, Subtarget);
}
@@ -29921,38 +32804,6 @@ static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
return DAG.getNode(SelectOpcode, DL, VT, IsOp0Nan, Op1, MinOrMax);
}
-/// Do target-specific dag combines on X86ISD::FAND nodes.
-static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG,
- const X86Subtarget &Subtarget) {
- // FAND(0.0, x) -> 0.0
- if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
- if (C->getValueAPF().isPosZero())
- return N->getOperand(0);
-
- // FAND(x, 0.0) -> 0.0
- if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
- if (C->getValueAPF().isPosZero())
- return N->getOperand(1);
-
- return lowerX86FPLogicOp(N, DAG, Subtarget);
-}
-
-/// Do target-specific dag combines on X86ISD::FANDN nodes
-static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG,
- const X86Subtarget &Subtarget) {
- // FANDN(0.0, x) -> x
- if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
- if (C->getValueAPF().isPosZero())
- return N->getOperand(1);
-
- // FANDN(x, 0.0) -> 0.0
- if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
- if (C->getValueAPF().isPosZero())
- return N->getOperand(1);
-
- return lowerX86FPLogicOp(N, DAG, Subtarget);
-}
-
static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI) {
// BT ignores high bits in the bit index operand.
@@ -29971,17 +32822,6 @@ static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
-static SDValue combineVZextMovl(SDNode *N, SelectionDAG &DAG) {
- SDValue Op = peekThroughBitcasts(N->getOperand(0));
- EVT VT = N->getValueType(0), OpVT = Op.getValueType();
- if (Op.getOpcode() == X86ISD::VZEXT_LOAD &&
- VT.getVectorElementType().getSizeInBits() ==
- OpVT.getVectorElementType().getSizeInBits()) {
- return DAG.getBitcast(VT, Op);
- }
- return SDValue();
-}
-
static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
EVT VT = N->getValueType(0);
@@ -30018,19 +32858,32 @@ static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
}
/// sext(add_nsw(x, C)) --> add(sext(x), C_sext)
-/// Promoting a sign extension ahead of an 'add nsw' exposes opportunities
-/// to combine math ops, use an LEA, or use a complex addressing mode. This can
-/// eliminate extend, add, and shift instructions.
-static SDValue promoteSextBeforeAddNSW(SDNode *Sext, SelectionDAG &DAG,
- const X86Subtarget &Subtarget) {
+/// zext(add_nuw(x, C)) --> add(zext(x), C_zext)
+/// Promoting a sign/zero extension ahead of a no overflow 'add' exposes
+/// opportunities to combine math ops, use an LEA, or use a complex addressing
+/// mode. This can eliminate extend, add, and shift instructions.
+static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ if (Ext->getOpcode() != ISD::SIGN_EXTEND &&
+ Ext->getOpcode() != ISD::ZERO_EXTEND)
+ return SDValue();
+
// TODO: This should be valid for other integer types.
- EVT VT = Sext->getValueType(0);
+ EVT VT = Ext->getValueType(0);
if (VT != MVT::i64)
return SDValue();
- // We need an 'add nsw' feeding into the 'sext'.
- SDValue Add = Sext->getOperand(0);
- if (Add.getOpcode() != ISD::ADD || !Add->getFlags()->hasNoSignedWrap())
+ SDValue Add = Ext->getOperand(0);
+ if (Add.getOpcode() != ISD::ADD)
+ return SDValue();
+
+ bool Sext = Ext->getOpcode() == ISD::SIGN_EXTEND;
+ bool NSW = Add->getFlags()->hasNoSignedWrap();
+ bool NUW = Add->getFlags()->hasNoUnsignedWrap();
+
+ // We need an 'add nsw' feeding into the 'sext' or 'add nuw' feeding
+ // into the 'zext'
+ if ((Sext && !NSW) || (!Sext && !NUW))
return SDValue();
// Having a constant operand to the 'add' ensures that we are not increasing
@@ -30046,7 +32899,7 @@ static SDValue promoteSextBeforeAddNSW(SDNode *Sext, SelectionDAG &DAG,
// of single 'add' instructions, but the cost model for selecting an LEA
// currently has a high threshold.
bool HasLEAPotential = false;
- for (auto *User : Sext->uses()) {
+ for (auto *User : Ext->uses()) {
if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) {
HasLEAPotential = true;
break;
@@ -30055,17 +32908,18 @@ static SDValue promoteSextBeforeAddNSW(SDNode *Sext, SelectionDAG &DAG,
if (!HasLEAPotential)
return SDValue();
- // Everything looks good, so pull the 'sext' ahead of the 'add'.
- int64_t AddConstant = AddOp1->getSExtValue();
+ // Everything looks good, so pull the '{s|z}ext' ahead of the 'add'.
+ int64_t AddConstant = Sext ? AddOp1->getSExtValue() : AddOp1->getZExtValue();
SDValue AddOp0 = Add.getOperand(0);
- SDValue NewSext = DAG.getNode(ISD::SIGN_EXTEND, SDLoc(Sext), VT, AddOp0);
+ SDValue NewExt = DAG.getNode(Ext->getOpcode(), SDLoc(Ext), VT, AddOp0);
SDValue NewConstant = DAG.getConstant(AddConstant, SDLoc(Add), VT);
// The wider add is guaranteed to not wrap because both operands are
// sign-extended.
SDNodeFlags Flags;
- Flags.setNoSignedWrap(true);
- return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewSext, NewConstant, &Flags);
+ Flags.setNoSignedWrap(NSW);
+ Flags.setNoUnsignedWrap(NUW);
+ return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewExt, NewConstant, &Flags);
}
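
The promotion is only legal because of the no-overflow flag: when the narrow add cannot wrap, extending first and adding in the wide type yields the same value. A scalar sketch of the sext/nsw case; values are chosen so the 32-bit add cannot overflow, illustrative only:

  #include <cassert>
  #include <cstdint>

  int main() {
    int32_t x = 100000, c = 42;                     // 'add nsw': no signed overflow here
    int64_t narrowAddThenSext = (int64_t)(int32_t)(x + c);
    int64_t sextThenWideAdd   = (int64_t)x + (int64_t)c;
    assert(narrowAddThenSext == sextThenWideAdd);
    return 0;
  }
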
/// (i8,i32 {s/z}ext ({s/u}divrem (i8 x, i8 y)) ->
@@ -30157,18 +33011,17 @@ static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG,
// ISD::*_EXTEND_VECTOR_INREG which ensures lowering to X86ISD::V*EXT.
// Also use this if we don't have SSE41 to allow the legalizer do its job.
if (!Subtarget.hasSSE41() || VT.is128BitVector() ||
- (VT.is256BitVector() && Subtarget.hasInt256())) {
+ (VT.is256BitVector() && Subtarget.hasInt256()) ||
+ (VT.is512BitVector() && Subtarget.hasAVX512())) {
SDValue ExOp = ExtendVecSize(DL, N0, VT.getSizeInBits());
return Opcode == ISD::SIGN_EXTEND
? DAG.getSignExtendVectorInReg(ExOp, DL, VT)
: DAG.getZeroExtendVectorInReg(ExOp, DL, VT);
}
- // On pre-AVX2 targets, split into 128-bit nodes of
- // ISD::*_EXTEND_VECTOR_INREG.
- if (!Subtarget.hasInt256() && !(VT.getSizeInBits() % 128)) {
- unsigned NumVecs = VT.getSizeInBits() / 128;
- unsigned NumSubElts = 128 / SVT.getSizeInBits();
+ auto SplitAndExtendInReg = [&](unsigned SplitSize) {
+ unsigned NumVecs = VT.getSizeInBits() / SplitSize;
+ unsigned NumSubElts = SplitSize / SVT.getSizeInBits();
EVT SubVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumSubElts);
EVT InSubVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubElts);
@@ -30176,14 +33029,24 @@ static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG,
for (unsigned i = 0, Offset = 0; i != NumVecs; ++i, Offset += NumSubElts) {
SDValue SrcVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InSubVT, N0,
DAG.getIntPtrConstant(Offset, DL));
- SrcVec = ExtendVecSize(DL, SrcVec, 128);
+ SrcVec = ExtendVecSize(DL, SrcVec, SplitSize);
SrcVec = Opcode == ISD::SIGN_EXTEND
? DAG.getSignExtendVectorInReg(SrcVec, DL, SubVT)
: DAG.getZeroExtendVectorInReg(SrcVec, DL, SubVT);
Opnds.push_back(SrcVec);
}
return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Opnds);
- }
+ };
+
+ // On pre-AVX2 targets, split into 128-bit nodes of
+ // ISD::*_EXTEND_VECTOR_INREG.
+ if (!Subtarget.hasInt256() && !(VT.getSizeInBits() % 128))
+ return SplitAndExtendInReg(128);
+
+ // On pre-AVX512 targets, split into 256-bit nodes of
+ // ISD::*_EXTEND_VECTOR_INREG.
+ if (!Subtarget.hasAVX512() && !(VT.getSizeInBits() % 256))
+ return SplitAndExtendInReg(256);
return SDValue();
}
@@ -30216,7 +33079,7 @@ static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget))
return R;
- if (SDValue NewAdd = promoteSextBeforeAddNSW(N, DAG, Subtarget))
+ if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
return NewAdd;
return SDValue();
@@ -30239,26 +33102,58 @@ static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
SDValue B = N->getOperand(1);
SDValue C = N->getOperand(2);
- bool NegA = (A.getOpcode() == ISD::FNEG);
- bool NegB = (B.getOpcode() == ISD::FNEG);
- bool NegC = (C.getOpcode() == ISD::FNEG);
+ auto invertIfNegative = [](SDValue &V) {
+ if (SDValue NegVal = isFNEG(V.getNode())) {
+ V = NegVal;
+ return true;
+ }
+ return false;
+ };
+
+ // Do not convert the passthru input of scalar intrinsics.
+ // FIXME: We could allow negations of the lower element only.
+ bool NegA = N->getOpcode() != X86ISD::FMADDS1_RND && invertIfNegative(A);
+ bool NegB = invertIfNegative(B);
+ bool NegC = N->getOpcode() != X86ISD::FMADDS3_RND && invertIfNegative(C);
// Negative multiplication when NegA xor NegB
bool NegMul = (NegA != NegB);
- if (NegA)
- A = A.getOperand(0);
- if (NegB)
- B = B.getOperand(0);
- if (NegC)
- C = C.getOperand(0);
- unsigned Opcode;
+ unsigned NewOpcode;
if (!NegMul)
- Opcode = (!NegC) ? X86ISD::FMADD : X86ISD::FMSUB;
+ NewOpcode = (!NegC) ? X86ISD::FMADD : X86ISD::FMSUB;
else
- Opcode = (!NegC) ? X86ISD::FNMADD : X86ISD::FNMSUB;
+ NewOpcode = (!NegC) ? X86ISD::FNMADD : X86ISD::FNMSUB;
+
+
+ if (N->getOpcode() == X86ISD::FMADD_RND) {
+ switch (NewOpcode) {
+ case X86ISD::FMADD: NewOpcode = X86ISD::FMADD_RND; break;
+ case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUB_RND; break;
+ case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADD_RND; break;
+ case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUB_RND; break;
+ }
+ } else if (N->getOpcode() == X86ISD::FMADDS1_RND) {
+ switch (NewOpcode) {
+ case X86ISD::FMADD: NewOpcode = X86ISD::FMADDS1_RND; break;
+ case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUBS1_RND; break;
+ case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS1_RND; break;
+ case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS1_RND; break;
+ }
+ } else if (N->getOpcode() == X86ISD::FMADDS3_RND) {
+ switch (NewOpcode) {
+ case X86ISD::FMADD: NewOpcode = X86ISD::FMADDS3_RND; break;
+ case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUBS3_RND; break;
+ case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS3_RND; break;
+ case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS3_RND; break;
+ }
+ } else {
+ assert((N->getOpcode() == X86ISD::FMADD || N->getOpcode() == ISD::FMA) &&
+ "Unexpected opcode!");
+ return DAG.getNode(NewOpcode, dl, VT, A, B, C);
+ }
- return DAG.getNode(Opcode, dl, VT, A, B, C);
+ return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
}
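The opcode selection above follows ordinary FMA sign algebra; a small standalone check of the identities being folded (plain C++; the values are chosen so the products are exact, since in general a fused multiply-add rounds only once):

#include <cassert>
#include <cmath>

int main() {
  double a = 1.5, b = -2.25, c = 0.75;
  // FNMADD: fma(-a, b, c) == -(a*b) + c
  assert(std::fma(-a, b, c) == -(a * b) + c);
  // FMSUB:  fma(a, b, -c) ==  (a*b) - c
  assert(std::fma(a, b, -c) == (a * b) - c);
  // FNMSUB: fma(-a, b, -c) == -(a*b) - c
  assert(std::fma(-a, b, -c) == -(a * b) - c);
  return 0;
}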
static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
@@ -30308,6 +33203,12 @@ static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
if (SDValue DivRem8 = getDivRem8(N, DAG))
return DivRem8;
+ if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
+ return NewAdd;
+
+ if (SDValue R = combineOrCmpEqZeroToCtlzSrl(N, DAG, DCI, Subtarget))
+ return R;
+
return SDValue();
}
@@ -30443,10 +33344,8 @@ static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG,
return MaterializeSETB(DL, EFLAGS, DAG, N->getSimpleValueType(0));
// Try to simplify the EFLAGS and condition code operands.
- if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG)) {
- SDValue Cond = DAG.getConstant(CC, DL, MVT::i8);
- return DAG.getNode(X86ISD::SETCC, DL, N->getVTList(), Cond, Flags);
- }
+ if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG))
+ return getSETCC(CC, Flags, DL, DAG);
return SDValue();
}
@@ -30539,6 +33438,12 @@ static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG,
return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
}
+  // Since UINT_TO_FP is legal (it's marked custom), the DAG combiner won't
+ // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
+ // the optimization here.
+ if (DAG.SignBitIsZero(Op0))
+ return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, Op0);
+
return SDValue();
}
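The combine added above relies on signed and unsigned conversions agreeing whenever the sign bit of the input is known to be zero; a minimal standalone illustration (plain C++, not LLVM code):

#include <cassert>
#include <cstdint>

int main() {
  uint32_t u = 0x7FFFFFFFu; // sign bit (bit 31) is zero
  double fromUnsigned = static_cast<double>(u);
  double fromSigned   = static_cast<double>(static_cast<int32_t>(u));
  // With the sign bit clear, both interpretations denote the same value,
  // so UINT_TO_FP can safely be rewritten as SINT_TO_FP.
  assert(fromUnsigned == fromSigned);
  return 0;
}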
@@ -30555,9 +33460,12 @@ static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
EVT InVT = Op0.getValueType();
EVT InSVT = InVT.getScalarType();
+ // SINT_TO_FP(vXi1) -> SINT_TO_FP(SEXT(vXi1 to vXi32))
// SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32))
// SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32))
- if (InVT.isVector() && (InSVT == MVT::i8 || InSVT == MVT::i16)) {
+ if (InVT.isVector() &&
+ (InSVT == MVT::i8 || InSVT == MVT::i16 ||
+ (InSVT == MVT::i1 && !DAG.getTargetLoweringInfo().isTypeLegal(InVT)))) {
SDLoc dl(N);
EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
InVT.getVectorNumElements());
@@ -30565,6 +33473,23 @@ static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
}
+ // Without AVX512DQ we only support i64 to float scalar conversion. For both
+ // vectors and scalars, see if we know that the upper bits are all the sign
+ // bit, in which case we can truncate the input to i32 and convert from that.
+ if (InVT.getScalarSizeInBits() > 32 && !Subtarget.hasDQI()) {
+ unsigned BitWidth = InVT.getScalarSizeInBits();
+ unsigned NumSignBits = DAG.ComputeNumSignBits(Op0);
+ if (NumSignBits >= (BitWidth - 31)) {
+ EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), 32);
+ if (InVT.isVector())
+ TruncVT = EVT::getVectorVT(*DAG.getContext(), TruncVT,
+ InVT.getVectorNumElements());
+ SDLoc dl(N);
+ SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Op0);
+ return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Trunc);
+ }
+ }
+
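A standalone sketch of why the ComputeNumSignBits check just above permits the truncation (plain C++, not the backend code): an i64 with at least 33 sign bits is exactly representable as an i32, so converting the truncated value yields the same result.

#include <cassert>
#include <cstdint>

int main() {
  int64_t v = -12345; // the top 33+ bits are all copies of the sign bit
  double viaI64 = static_cast<double>(v);
  double viaI32 = static_cast<double>(static_cast<int32_t>(v));
  assert(viaI64 == viaI32);
  return 0;
}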
// Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
// a 32-bit target where SSE doesn't support i64->FP operations.
if (!Subtarget.useSoftFloat() && Op0.getOpcode() == ISD::LOAD) {
@@ -30654,13 +33579,15 @@ static SDValue OptimizeConditionalInDecrement(SDNode *N, SelectionDAG &DAG) {
DAG.getConstant(0, DL, OtherVal.getValueType()), NewCmp);
}
-static SDValue detectSADPattern(SDNode *N, SelectionDAG &DAG,
- const X86Subtarget &Subtarget) {
+static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
SDLoc DL(N);
EVT VT = N->getValueType(0);
SDValue Op0 = N->getOperand(0);
SDValue Op1 = N->getOperand(1);
+ // TODO: There's nothing special about i32, any integer type above i16 should
+ // work just as well.
if (!VT.isVector() || !VT.isSimple() ||
!(VT.getVectorElementType() == MVT::i32))
return SDValue();
@@ -30672,24 +33599,13 @@ static SDValue detectSADPattern(SDNode *N, SelectionDAG &DAG,
RegSize = 256;
// We only handle v16i32 for SSE2 / v32i32 for AVX2 / v64i32 for AVX512.
+ // TODO: We should be able to handle larger vectors by splitting them before
+ // feeding them into several SADs, and then reducing over those.
if (VT.getSizeInBits() / 4 > RegSize)
return SDValue();
- // Detect the following pattern:
- //
- // 1: %2 = zext <N x i8> %0 to <N x i32>
- // 2: %3 = zext <N x i8> %1 to <N x i32>
- // 3: %4 = sub nsw <N x i32> %2, %3
- // 4: %5 = icmp sgt <N x i32> %4, [0 x N] or [-1 x N]
- // 5: %6 = sub nsw <N x i32> zeroinitializer, %4
- // 6: %7 = select <N x i1> %5, <N x i32> %4, <N x i32> %6
- // 7: %8 = add nsw <N x i32> %7, %vec.phi
- //
- // The last instruction must be a reduction add. The instructions 3-6 forms an
- // ABSDIFF pattern.
-
- // The two operands of reduction add are from PHI and a select-op as in line 7
- // above.
+ // We know N is a reduction add, which means one of its operands is a phi.
+ // To match SAD, we need the other operand to be a vector select.
SDValue SelectOp, Phi;
if (Op0.getOpcode() == ISD::VSELECT) {
SelectOp = Op0;
@@ -30700,77 +33616,22 @@ static SDValue detectSADPattern(SDNode *N, SelectionDAG &DAG,
} else
return SDValue();
- // Check the condition of the select instruction is greater-than.
- SDValue SetCC = SelectOp->getOperand(0);
- if (SetCC.getOpcode() != ISD::SETCC)
- return SDValue();
- ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
- if (CC != ISD::SETGT)
- return SDValue();
-
- Op0 = SelectOp->getOperand(1);
- Op1 = SelectOp->getOperand(2);
-
- // The second operand of SelectOp Op1 is the negation of the first operand
- // Op0, which is implemented as 0 - Op0.
- if (!(Op1.getOpcode() == ISD::SUB &&
- ISD::isBuildVectorAllZeros(Op1.getOperand(0).getNode()) &&
- Op1.getOperand(1) == Op0))
- return SDValue();
-
- // The first operand of SetCC is the first operand of SelectOp, which is the
- // difference between two input vectors.
- if (SetCC.getOperand(0) != Op0)
- return SDValue();
-
- // The second operand of > comparison can be either -1 or 0.
- if (!(ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()) ||
- ISD::isBuildVectorAllOnes(SetCC.getOperand(1).getNode())))
- return SDValue();
-
- // The first operand of SelectOp is the difference between two input vectors.
- if (Op0.getOpcode() != ISD::SUB)
- return SDValue();
-
- Op1 = Op0.getOperand(1);
- Op0 = Op0.getOperand(0);
-
- // Check if the operands of the diff are zero-extended from vectors of i8.
- if (Op0.getOpcode() != ISD::ZERO_EXTEND ||
- Op0.getOperand(0).getValueType().getVectorElementType() != MVT::i8 ||
- Op1.getOpcode() != ISD::ZERO_EXTEND ||
- Op1.getOperand(0).getValueType().getVectorElementType() != MVT::i8)
+ // Check whether we have an abs-diff pattern feeding into the select.
+ if(!detectZextAbsDiff(SelectOp, Op0, Op1))
return SDValue();
// SAD pattern detected. Now build a SAD instruction and an addition for
- // reduction. Note that the number of elments of the result of SAD is less
+ // reduction. Note that the number of elements of the result of SAD is less
 // than the number of elements of its input. Therefore, we can only update
 // part of the elements in the reduction vector.
-
- // Legalize the type of the inputs of PSADBW.
- EVT InVT = Op0.getOperand(0).getValueType();
- if (InVT.getSizeInBits() <= 128)
- RegSize = 128;
- else if (InVT.getSizeInBits() <= 256)
- RegSize = 256;
-
- unsigned NumConcat = RegSize / InVT.getSizeInBits();
- SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, InVT));
- Ops[0] = Op0.getOperand(0);
- MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
- Op0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
- Ops[0] = Op1.getOperand(0);
- Op1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
+ SDValue Sad = createPSADBW(DAG, Op0, Op1, DL);
// The output of PSADBW is a vector of i64.
- MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64);
- SDValue Sad = DAG.getNode(X86ISD::PSADBW, DL, SadVT, Op0, Op1);
-
// We need to turn the vector of i64 into a vector of i32.
// If the reduction vector is at least as wide as the psadbw result, just
// bitcast. If it's narrower, truncate - the high i32 of each i64 is zero
// anyway.
- MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
+ MVT ResVT = MVT::getVectorVT(MVT::i32, Sad.getValueSizeInBits() / 32);
if (VT.getSizeInBits() >= ResVT.getSizeInBits())
Sad = DAG.getNode(ISD::BITCAST, DL, ResVT, Sad);
else
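For reference, a scalar model of what one PSADBW lane computes (a sketch, not the backend's definition): each 64-bit lane of the result holds the sum of absolute differences of eight unsigned bytes, which is why the reduction above only needs to update part of the wider accumulator.

#include <cstdint>
#include <cstdlib>

// Sum of absolute differences of one group of eight unsigned bytes,
// i.e. the value PSADBW places in the corresponding 64-bit lane.
uint64_t psadbwLane(const uint8_t A[8], const uint8_t B[8]) {
  uint64_t Sum = 0;
  for (int i = 0; i != 8; ++i)
    Sum += static_cast<uint64_t>(std::abs(int(A[i]) - int(B[i])));
  return Sum;
}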
@@ -30793,7 +33654,7 @@ static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
const SDNodeFlags *Flags = &cast<BinaryWithFlagsSDNode>(N)->Flags;
if (Flags->hasVectorReduction()) {
- if (SDValue Sad = detectSADPattern(N, DAG, Subtarget))
+ if (SDValue Sad = combineLoopSADPattern(N, DAG, Subtarget))
return Sad;
}
EVT VT = N->getValueType(0);
@@ -30832,20 +33693,21 @@ static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
}
}
- // Try to synthesize horizontal adds from adds of shuffles.
+ // Try to synthesize horizontal subs from subs of shuffles.
EVT VT = N->getValueType(0);
if (((Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
(Subtarget.hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
- isHorizontalBinOp(Op0, Op1, true))
+ isHorizontalBinOp(Op0, Op1, false))
return DAG.getNode(X86ISD::HSUB, SDLoc(N), VT, Op0, Op1);
return OptimizeConditionalInDecrement(N, DAG);
}
-static SDValue combineVZext(SDNode *N, SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI,
- const X86Subtarget &Subtarget) {
+static SDValue combineVSZext(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
SDLoc DL(N);
+ unsigned Opcode = N->getOpcode();
MVT VT = N->getSimpleValueType(0);
MVT SVT = VT.getVectorElementType();
SDValue Op = N->getOperand(0);
@@ -30854,25 +33716,28 @@ static SDValue combineVZext(SDNode *N, SelectionDAG &DAG,
unsigned InputBits = OpEltVT.getSizeInBits() * VT.getVectorNumElements();
// Perform any constant folding.
+ // FIXME: Reduce constant pool usage and don't fold when OptSize is enabled.
if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
- SmallVector<SDValue, 4> Vals;
- for (int i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
+ unsigned NumDstElts = VT.getVectorNumElements();
+ SmallBitVector Undefs(NumDstElts, false);
+ SmallVector<APInt, 4> Vals(NumDstElts, APInt(SVT.getSizeInBits(), 0));
+ for (unsigned i = 0; i != NumDstElts; ++i) {
SDValue OpElt = Op.getOperand(i);
if (OpElt.getOpcode() == ISD::UNDEF) {
- Vals.push_back(DAG.getUNDEF(SVT));
+ Undefs[i] = true;
continue;
}
APInt Cst = cast<ConstantSDNode>(OpElt.getNode())->getAPIntValue();
- assert(Cst.getBitWidth() == OpEltVT.getSizeInBits());
- Cst = Cst.zextOrTrunc(SVT.getSizeInBits());
- Vals.push_back(DAG.getConstant(Cst, DL, SVT));
+ Vals[i] = Opcode == X86ISD::VZEXT ? Cst.zextOrTrunc(SVT.getSizeInBits())
+ : Cst.sextOrTrunc(SVT.getSizeInBits());
}
- return DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Vals);
+ return getConstVector(Vals, Undefs, VT, DAG, DL);
}
// (vzext (bitcast (vzext (x)) -> (vzext x)
+ // TODO: (vsext (bitcast (vsext (x)) -> (vsext x)
SDValue V = peekThroughBitcasts(Op);
- if (V != Op && V.getOpcode() == X86ISD::VZEXT) {
+ if (Opcode == X86ISD::VZEXT && V != Op && V.getOpcode() == X86ISD::VZEXT) {
MVT InnerVT = V.getSimpleValueType();
MVT InnerEltVT = InnerVT.getVectorElementType();
@@ -30897,7 +33762,9 @@ static SDValue combineVZext(SDNode *N, SelectionDAG &DAG,
// Check if we can bypass extracting and re-inserting an element of an input
// vector. Essentially:
// (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x)
- if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
+ // TODO: Add X86ISD::VSEXT support
+ if (Opcode == X86ISD::VZEXT &&
+ V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
V.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
V.getOperand(0).getSimpleValueType().getSizeInBits() == InputBits) {
SDValue ExtractedV = V.getOperand(0);
@@ -30976,7 +33843,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
SelectionDAG &DAG = DCI.DAG;
switch (N->getOpcode()) {
default: break;
- case ISD::EXTRACT_VECTOR_ELT: return combineExtractVectorElt(N, DAG, DCI);
+ case ISD::EXTRACT_VECTOR_ELT:
+ return combineExtractVectorElt(N, DAG, DCI, Subtarget);
case ISD::VSELECT:
case ISD::SELECT:
case X86ISD::SHRUNKBLEND: return combineSelect(N, DAG, DCI, Subtarget);
@@ -31002,16 +33870,15 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case ISD::FSUB: return combineFaddFsub(N, DAG, Subtarget);
case ISD::FNEG: return combineFneg(N, DAG, Subtarget);
case ISD::TRUNCATE: return combineTruncate(N, DAG, Subtarget);
+ case X86ISD::FAND: return combineFAnd(N, DAG, Subtarget);
+ case X86ISD::FANDN: return combineFAndn(N, DAG, Subtarget);
case X86ISD::FXOR:
case X86ISD::FOR: return combineFOr(N, DAG, Subtarget);
case X86ISD::FMIN:
case X86ISD::FMAX: return combineFMinFMax(N, DAG);
case ISD::FMINNUM:
case ISD::FMAXNUM: return combineFMinNumFMaxNum(N, DAG, Subtarget);
- case X86ISD::FAND: return combineFAnd(N, DAG, Subtarget);
- case X86ISD::FANDN: return combineFAndn(N, DAG, Subtarget);
case X86ISD::BT: return combineBT(N, DAG, DCI);
- case X86ISD::VZEXT_MOVL: return combineVZextMovl(N, DAG);
case ISD::ANY_EXTEND:
case ISD::ZERO_EXTEND: return combineZext(N, DAG, DCI, Subtarget);
case ISD::SIGN_EXTEND: return combineSext(N, DAG, DCI, Subtarget);
@@ -31019,7 +33886,10 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case ISD::SETCC: return combineSetCC(N, DAG, Subtarget);
case X86ISD::SETCC: return combineX86SetCC(N, DAG, DCI, Subtarget);
case X86ISD::BRCOND: return combineBrCond(N, DAG, DCI, Subtarget);
- case X86ISD::VZEXT: return combineVZext(N, DAG, DCI, Subtarget);
+ case X86ISD::VSHLI:
+ case X86ISD::VSRLI: return combineVectorShift(N, DAG, DCI, Subtarget);
+ case X86ISD::VSEXT:
+ case X86ISD::VZEXT: return combineVSZext(N, DAG, DCI, Subtarget);
case X86ISD::SHUFP: // Handle all target specific shuffles
case X86ISD::INSERTPS:
case X86ISD::PALIGNR:
@@ -31043,11 +33913,17 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case X86ISD::VPERMI:
case X86ISD::VPERMV:
case X86ISD::VPERMV3:
+ case X86ISD::VPERMIV3:
case X86ISD::VPERMIL2:
case X86ISD::VPERMILPI:
case X86ISD::VPERMILPV:
case X86ISD::VPERM2X128:
+ case X86ISD::VZEXT_MOVL:
case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI,Subtarget);
+ case X86ISD::FMADD:
+ case X86ISD::FMADD_RND:
+ case X86ISD::FMADDS1_RND:
+ case X86ISD::FMADDS3_RND:
case ISD::FMA: return combineFMA(N, DAG, Subtarget);
case ISD::MGATHER:
case ISD::MSCATTER: return combineGatherScatter(N, DAG);
@@ -31133,7 +34009,7 @@ bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
case ISD::OR:
case ISD::XOR:
Commute = true;
- // fallthrough
+ LLVM_FALLTHROUGH;
case ISD::SUB: {
SDValue N0 = Op.getOperand(0);
SDValue N1 = Op.getOperand(1);
@@ -31280,9 +34156,11 @@ X86TargetLowering::getConstraintType(StringRef Constraint) const {
case 'u':
case 'y':
case 'x':
+ case 'v':
case 'Y':
case 'l':
return C_RegisterClass;
+ case 'k': // AVX512 masking registers.
case 'a':
case 'b':
case 'c':
@@ -31306,6 +34184,19 @@ X86TargetLowering::getConstraintType(StringRef Constraint) const {
break;
}
}
+ else if (Constraint.size() == 2) {
+ switch (Constraint[0]) {
+ default:
+ break;
+ case 'Y':
+ switch (Constraint[1]) {
+ default:
+ break;
+ case 'k':
+ return C_Register;
+ }
+ }
+ }
return TargetLowering::getConstraintType(Constraint);
}
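A hedged usage sketch of the new constraints from the source level (assumes a compiler with this patch and -mavx512f; the function below is illustrative, not taken from the test suite): 'k' allocates any AVX-512 mask register, 'Yk' is the variant that excludes k0 so the operand can serve as a write mask, and 'v' extends 'x' to the full EVEX register file.

#include <cstdint>

// AND two 16-bit masks in k-registers via the 'k' inline-asm constraint.
uint16_t maskAnd(uint16_t a, uint16_t b) {
  uint16_t r;
  __asm__("kandw %1, %2, %0" : "=k"(r) : "k"(a), "k"(b));
  return r;
}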
@@ -31349,12 +34240,28 @@ TargetLowering::ConstraintWeight
if (type->isX86_MMXTy() && Subtarget.hasMMX())
weight = CW_SpecificReg;
break;
- case 'x':
case 'Y':
+ // Other "Y<x>" (e.g. "Yk") constraints should be implemented below.
+ if (constraint[1] == 'k') {
+ // Support for 'Yk' (similarly to the 'k' variant below).
+ weight = CW_SpecificReg;
+ break;
+ }
+ // Else fall through (handle "Y" constraint).
+ LLVM_FALLTHROUGH;
+ case 'v':
+ if ((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512())
+ weight = CW_Register;
+ LLVM_FALLTHROUGH;
+ case 'x':
if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasFp256()))
weight = CW_Register;
break;
+ case 'k':
+ // Enable conditional vector operations using %k<#> registers.
+ weight = CW_SpecificReg;
+ break;
case 'I':
if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) {
if (C->getZExtValue() <= 31)
@@ -31601,60 +34508,21 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
/// Check if \p RC is a general purpose register class.
/// I.e., GR* or one of their variant.
static bool isGRClass(const TargetRegisterClass &RC) {
- switch (RC.getID()) {
- case X86::GR8RegClassID:
- case X86::GR8_ABCD_LRegClassID:
- case X86::GR8_ABCD_HRegClassID:
- case X86::GR8_NOREXRegClassID:
- case X86::GR16RegClassID:
- case X86::GR16_ABCDRegClassID:
- case X86::GR16_NOREXRegClassID:
- case X86::GR32RegClassID:
- case X86::GR32_ABCDRegClassID:
- case X86::GR32_TCRegClassID:
- case X86::GR32_NOREXRegClassID:
- case X86::GR32_NOAXRegClassID:
- case X86::GR32_NOSPRegClassID:
- case X86::GR32_NOREX_NOSPRegClassID:
- case X86::GR32_ADRegClassID:
- case X86::GR64RegClassID:
- case X86::GR64_ABCDRegClassID:
- case X86::GR64_TCRegClassID:
- case X86::GR64_TCW64RegClassID:
- case X86::GR64_NOREXRegClassID:
- case X86::GR64_NOSPRegClassID:
- case X86::GR64_NOREX_NOSPRegClassID:
- case X86::LOW32_ADDR_ACCESSRegClassID:
- case X86::LOW32_ADDR_ACCESS_RBPRegClassID:
- return true;
- default:
- return false;
- }
+ return RC.hasSuperClassEq(&X86::GR8RegClass) ||
+ RC.hasSuperClassEq(&X86::GR16RegClass) ||
+ RC.hasSuperClassEq(&X86::GR32RegClass) ||
+ RC.hasSuperClassEq(&X86::GR64RegClass) ||
+ RC.hasSuperClassEq(&X86::LOW32_ADDR_ACCESS_RBPRegClass);
}
/// Check if \p RC is a vector register class.
/// I.e., FR* / VR* or one of their variant.
static bool isFRClass(const TargetRegisterClass &RC) {
- switch (RC.getID()) {
- case X86::FR32RegClassID:
- case X86::FR32XRegClassID:
- case X86::FR64RegClassID:
- case X86::FR64XRegClassID:
- case X86::FR128RegClassID:
- case X86::VR64RegClassID:
- case X86::VR128RegClassID:
- case X86::VR128LRegClassID:
- case X86::VR128HRegClassID:
- case X86::VR128XRegClassID:
- case X86::VR256RegClassID:
- case X86::VR256LRegClassID:
- case X86::VR256HRegClassID:
- case X86::VR256XRegClassID:
- case X86::VR512RegClassID:
- return true;
- default:
- return false;
- }
+ return RC.hasSuperClassEq(&X86::FR32XRegClass) ||
+ RC.hasSuperClassEq(&X86::FR64XRegClass) ||
+ RC.hasSuperClassEq(&X86::VR128XRegClass) ||
+ RC.hasSuperClassEq(&X86::VR256XRegClass) ||
+ RC.hasSuperClassEq(&X86::VR512RegClass);
}
std::pair<unsigned, const TargetRegisterClass *>
@@ -31670,6 +34538,24 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
// TODO: Slight differences here in allocation order and leaving
// RIP in the class. Do they matter any more here than they do
// in the normal allocation?
+ case 'k':
+ if (Subtarget.hasAVX512()) {
+ // Only supported in AVX512 or later.
+ switch (VT.SimpleTy) {
+ default: break;
+ case MVT::i32:
+ return std::make_pair(0U, &X86::VK32RegClass);
+ case MVT::i16:
+ return std::make_pair(0U, &X86::VK16RegClass);
+ case MVT::i8:
+ return std::make_pair(0U, &X86::VK8RegClass);
+ case MVT::i1:
+ return std::make_pair(0U, &X86::VK1RegClass);
+ case MVT::i64:
+ return std::make_pair(0U, &X86::VK64RegClass);
+ }
+ }
+ break;
case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
if (Subtarget.is64Bit()) {
if (VT == MVT::i32 || VT == MVT::f32)
@@ -31723,18 +34609,24 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
return std::make_pair(0U, &X86::VR64RegClass);
case 'Y': // SSE_REGS if SSE2 allowed
if (!Subtarget.hasSSE2()) break;
- // FALL THROUGH.
+ LLVM_FALLTHROUGH;
+ case 'v':
case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
if (!Subtarget.hasSSE1()) break;
+ bool VConstraint = (Constraint[0] == 'v');
switch (VT.SimpleTy) {
default: break;
// Scalar SSE types.
case MVT::f32:
case MVT::i32:
+ if (VConstraint && Subtarget.hasAVX512() && Subtarget.hasVLX())
+ return std::make_pair(0U, &X86::FR32XRegClass);
return std::make_pair(0U, &X86::FR32RegClass);
case MVT::f64:
case MVT::i64:
+ if (VConstraint && Subtarget.hasVLX())
+ return std::make_pair(0U, &X86::FR64XRegClass);
return std::make_pair(0U, &X86::FR64RegClass);
// TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
// Vector types.
@@ -31744,6 +34636,8 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
case MVT::v2i64:
case MVT::v4f32:
case MVT::v2f64:
+ if (VConstraint && Subtarget.hasVLX())
+ return std::make_pair(0U, &X86::VR128XRegClass);
return std::make_pair(0U, &X86::VR128RegClass);
// AVX types.
case MVT::v32i8:
@@ -31752,6 +34646,8 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
case MVT::v4i64:
case MVT::v8f32:
case MVT::v4f64:
+ if (VConstraint && Subtarget.hasVLX())
+ return std::make_pair(0U, &X86::VR256XRegClass);
return std::make_pair(0U, &X86::VR256RegClass);
case MVT::v8f64:
case MVT::v16f32:
@@ -31761,6 +34657,29 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
}
break;
}
+ } else if (Constraint.size() == 2 && Constraint[0] == 'Y') {
+ switch (Constraint[1]) {
+ default:
+ break;
+ case 'k':
+      // This register class doesn't allocate k0 for masked vector operations.
+ if (Subtarget.hasAVX512()) { // Only supported in AVX512.
+ switch (VT.SimpleTy) {
+ default: break;
+ case MVT::i32:
+ return std::make_pair(0U, &X86::VK32WMRegClass);
+ case MVT::i16:
+ return std::make_pair(0U, &X86::VK16WMRegClass);
+ case MVT::i8:
+ return std::make_pair(0U, &X86::VK8WMRegClass);
+ case MVT::i1:
+ return std::make_pair(0U, &X86::VK1WMRegClass);
+ case MVT::i64:
+ return std::make_pair(0U, &X86::VK64WMRegClass);
+ }
+ }
+ break;
+ }
}
// Use the default implementation in TargetLowering to convert the register
@@ -31954,3 +34873,7 @@ void X86TargetLowering::insertCopiesSplitCSR(
.addReg(NewVR);
}
}
+
+bool X86TargetLowering::supportSwiftError() const {
+ return Subtarget.is64Bit();
+}
diff --git a/contrib/llvm/lib/Target/X86/X86ISelLowering.h b/contrib/llvm/lib/Target/X86/X86ISelLowering.h
index d826f1e..37f9353 100644
--- a/contrib/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/contrib/llvm/lib/Target/X86/X86ISelLowering.h
@@ -95,7 +95,7 @@ namespace llvm {
SETCC,
/// X86 Select
- SELECT,
+ SELECT, SELECTS,
// Same as SETCC except it's materialized with a sbb and the value is all
// one's or all zero's.
@@ -106,6 +106,10 @@ namespace llvm {
/// 0s or 1s. Generally DTRT for C/C++ with NaNs.
FSETCC,
+ /// X86 FP SETCC, similar to above, but with output as an i1 mask and
+ /// with optional rounding mode.
+ FSETCCM, FSETCCM_RND,
+
/// X86 conditional moves. Operand 0 and operand 1 are the two values
/// to select from. Operand 2 is the condition code, and operand 3 is the
/// flag operand produced by a CMP or TEST instruction. It also writes a
@@ -135,8 +139,9 @@ namespace llvm {
/// at function entry, used for PIC code.
GlobalBaseReg,
- /// A wrapper node for TargetConstantPool,
- /// TargetExternalSymbol, and TargetGlobalAddress.
+ /// A wrapper node for TargetConstantPool, TargetJumpTable,
+ /// TargetExternalSymbol, TargetGlobalAddress, TargetGlobalTLSAddress,
+ /// MCSymbol and TargetBlockAddress.
Wrapper,
/// Special wrapper used under X86-64 PIC mode for RIP
@@ -205,12 +210,12 @@ namespace llvm {
FDIV_RND,
FMAX_RND,
FMIN_RND,
- FSQRT_RND,
+ FSQRT_RND, FSQRTS_RND,
// FP vector get exponent.
- FGETEXP_RND,
+ FGETEXP_RND, FGETEXPS_RND,
// Extract Normalized Mantissas.
- VGETMANT,
+ VGETMANT, VGETMANTS,
// FP Scale.
SCALEF,
SCALEFS,
@@ -251,7 +256,7 @@ namespace llvm {
/// in order to obtain suitable precision.
FRSQRT, FRCP,
FRSQRTS, FRCPS,
-
+
// Thread Local Storage.
TLSADDR,
@@ -293,13 +298,10 @@ namespace llvm {
VTRUNCUS, VTRUNCS,
// Vector FP extend.
- VFPEXT,
+ VFPEXT, VFPEXT_RND, VFPEXTS_RND,
// Vector FP round.
- VFPROUND,
-
- // Vector signed/unsigned integer to double.
- CVTDQ2PD, CVTUDQ2PD,
+ VFPROUND, VFPROUND_RND, VFPROUNDS_RND,
// Convert a vector to mask, set bits base on MSB.
CVT2MASK,
@@ -426,9 +428,9 @@ namespace llvm {
// Range Restriction Calculation For Packed Pairs of Float32/64 values.
VRANGE,
// Reduce - Perform Reduction Transformation on scalar\packed FP.
- VREDUCE,
+ VREDUCE, VREDUCES,
// RndScale - Round FP Values To Include A Given Number Of Fraction Bits.
- VRNDSCALE,
+ VRNDSCALE, VRNDSCALES,
// Tests Types Of a FP Values for packed types.
VFPCLASS,
// Tests Types Of a FP Values for scalar types.
@@ -486,19 +488,33 @@ namespace llvm {
FMADDSUB_RND,
FMSUBADD_RND,
+ // Scalar intrinsic FMA with rounding mode.
+ // Two versions, passthru bits on op1 or op3.
+ FMADDS1_RND, FMADDS3_RND,
+ FNMADDS1_RND, FNMADDS3_RND,
+ FMSUBS1_RND, FMSUBS3_RND,
+ FNMSUBS1_RND, FNMSUBS3_RND,
+
// Compress and expand.
COMPRESS,
EXPAND,
- // Convert Unsigned/Integer to Scalar Floating-Point Value
- // with rounding mode.
- SINT_TO_FP_RND,
- UINT_TO_FP_RND,
+ // Convert Unsigned/Integer to Floating-Point Value with rounding mode.
+ SINT_TO_FP_RND, UINT_TO_FP_RND,
+ SCALAR_SINT_TO_FP_RND, SCALAR_UINT_TO_FP_RND,
// Vector float/double to signed/unsigned integer.
- FP_TO_SINT_RND, FP_TO_UINT_RND,
+ CVTP2SI, CVTP2UI, CVTP2SI_RND, CVTP2UI_RND,
// Scalar float/double to signed/unsigned integer.
- SCALAR_FP_TO_SINT_RND, SCALAR_FP_TO_UINT_RND,
+ CVTS2SI_RND, CVTS2UI_RND,
+
+ // Vector float/double to signed/unsigned integer with truncation.
+ CVTTP2SI, CVTTP2UI, CVTTP2SI_RND, CVTTP2UI_RND,
+ // Scalar float/double to signed/unsigned integer with truncation.
+ CVTTS2SI_RND, CVTTS2UI_RND,
+
+ // Vector signed/unsigned integer to float/double.
+ CVTSI2P, CVTUI2P,
// Save xmm argument registers to the stack, according to %al. An operator
// is needed so that this can be expanded with control flow.
@@ -537,7 +553,10 @@ namespace llvm {
XTEST,
// ERI instructions.
- RSQRT28, RCP28, EXP2,
+ RSQRT28, RSQRT28S, RCP28, RCP28S, EXP2,
+
+ // Conversions between float and half-float.
+ CVTPS2PH, CVTPH2PS,
// Compare and swap.
LCMPXCHG_DAG = ISD::FIRST_TARGET_MEMORY_OPCODE,
@@ -587,7 +606,12 @@ namespace llvm {
/// This instruction grabs the address of the next argument
/// from a va_list. (reads and modifies the va_list in memory)
- VAARG_64
+ VAARG_64,
+
+ // Vector truncating store with unsigned/signed saturation
+ VTRUNCSTOREUS, VTRUNCSTORES,
+ // Vector truncating masked store with unsigned/signed saturation
+ VMTRUNCSTOREUS, VMTRUNCSTORES
// WARNING: Do not add anything in the end unless you want the node to
// have memop! In fact, starting from FIRST_TARGET_MEMORY_OPCODE all
@@ -760,10 +784,28 @@ namespace llvm {
bool isCheapToSpeculateCtlz() const override;
+ bool isCtlzFast() const override;
+
bool hasBitPreservingFPLogic(EVT VT) const override {
return VT == MVT::f32 || VT == MVT::f64 || VT.isVector();
}
+ bool isMultiStoresCheaperThanBitsMerge(EVT LTy, EVT HTy) const override {
+ // If the pair to store is a mixture of float and int values, we will
+ // save two bitwise instructions and one float-to-int instruction and
+ // increase one store instruction. There is potentially a more
+ // significant benefit because it avoids the float->int domain switch
+      // for the input value, so it is more likely a win.
+ if ((LTy.isFloatingPoint() && HTy.isInteger()) ||
+ (LTy.isInteger() && HTy.isFloatingPoint()))
+ return true;
+ // If the pair only contains int values, we will save two bitwise
+ // instructions and increase one store instruction (costing one more
+      // store buffer). Since the benefit is less clear, we leave such pairs
+      // out until we have a test case proving it is a win.
+ return false;
+ }
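An illustrative source pattern for the hook above (a sketch; the struct and function names are made up): storing a float/int pair as two separate 32-bit stores avoids bitcasting the float into the integer domain just to form one merged 64-bit store.

#include <cstdint>

struct Pair {
  float   F; // 4 bytes
  int32_t I; // 4 bytes, adjacent in memory
};

// Without the hook returning true for mixed FP/int pairs, the combiner may
// try to merge the two 32-bit stores into a single 64-bit store, which
// requires moving F out of the floating-point domain first.
void storePair(Pair *P, float F, int32_t I) {
  P->F = F;
  P->I = I;
}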
+
bool hasAndNotCompare(SDValue Y) const override;
/// Return the value type to use for ISD::SETCC.
@@ -995,10 +1037,16 @@ namespace llvm {
bool isIntDivCheap(EVT VT, AttributeSet Attr) const override;
- bool supportSwiftError() const override {
- return true;
- }
+ bool supportSwiftError() const override;
+ unsigned getMaxSupportedInterleaveFactor() const override { return 4; }
+
+ /// \brief Lower interleaved load(s) into target specific
+ /// instructions/intrinsics.
+ bool lowerInterleavedLoad(LoadInst *LI,
+ ArrayRef<ShuffleVectorInst *> Shuffles,
+ ArrayRef<unsigned> Indices,
+ unsigned Factor) const override;
protected:
std::pair<const TargetRegisterClass *, uint8_t>
findRepresentativeClass(const TargetRegisterInfo *TRI,
@@ -1032,7 +1080,7 @@ namespace llvm {
SDValue LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
const SmallVectorImpl<ISD::InputArg> &ArgInfo,
const SDLoc &dl, SelectionDAG &DAG,
- const CCValAssign &VA, MachineFrameInfo *MFI,
+ const CCValAssign &VA, MachineFrameInfo &MFI,
unsigned i) const;
SDValue LowerMemOpCallTo(SDValue Chain, SDValue StackPtr, SDValue Arg,
const SDLoc &dl, SelectionDAG &DAG,
@@ -1073,8 +1121,9 @@ namespace llvm {
SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const;
SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const;
-
SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
+
+ unsigned getGlobalWrapperKind(const GlobalValue *GV = nullptr) const;
SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerGlobalAddress(const GlobalValue *GV, const SDLoc &dl,
@@ -1082,14 +1131,15 @@ namespace llvm {
SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const;
+
SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerUINT_TO_FP_vec(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFP_TO_INT(SDValue Op, const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) const;
SDValue LowerToBT(SDValue And, ISD::CondCode CC, const SDLoc &dl,
SelectionDAG &DAG) const;
SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
@@ -1101,6 +1151,7 @@ namespace llvm {
SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerADDROFRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFRAME_TO_ARGS_OFFSET(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const;
@@ -1219,14 +1270,17 @@ namespace llvm {
/// Convert a comparison if required by the subtarget.
SDValue ConvertCmpIfNecessary(SDValue Cmp, SelectionDAG &DAG) const;
+ /// Check if replacement of SQRT with RSQRT should be disabled.
+ bool isFsqrtCheap(SDValue Operand, SelectionDAG &DAG) const override;
+
/// Use rsqrt* to speed up sqrt calculations.
- SDValue getRsqrtEstimate(SDValue Operand, DAGCombinerInfo &DCI,
- unsigned &RefinementSteps,
- bool &UseOneConstNR) const override;
+ SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled,
+ int &RefinementSteps, bool &UseOneConstNR,
+ bool Reciprocal) const override;
/// Use rcp* to speed up fdiv calculations.
- SDValue getRecipEstimate(SDValue Operand, DAGCombinerInfo &DCI,
- unsigned &RefinementSteps) const override;
+ SDValue getRecipEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled,
+ int &RefinementSteps) const override;
/// Reassociate floating point divisions into multiply by reciprocal.
unsigned combineRepeatedFPDivisors() const override;
@@ -1236,6 +1290,93 @@ namespace llvm {
FastISel *createFastISel(FunctionLoweringInfo &funcInfo,
const TargetLibraryInfo *libInfo);
} // end namespace X86
+
+ // Base class for all X86 non-masked store operations.
+ class X86StoreSDNode : public MemSDNode {
+ public:
+ X86StoreSDNode(unsigned Opcode, unsigned Order, const DebugLoc &dl,
+ SDVTList VTs, EVT MemVT,
+ MachineMemOperand *MMO)
+ :MemSDNode(Opcode, Order, dl, VTs, MemVT, MMO) {}
+ const SDValue &getValue() const { return getOperand(1); }
+ const SDValue &getBasePtr() const { return getOperand(2); }
+
+ static bool classof(const SDNode *N) {
+ return N->getOpcode() == X86ISD::VTRUNCSTORES ||
+ N->getOpcode() == X86ISD::VTRUNCSTOREUS;
+ }
+ };
+
+ // Base class for all X86 masked store operations.
+ // The class has the same order of operands as MaskedStoreSDNode for
+ // convenience.
+ class X86MaskedStoreSDNode : public MemSDNode {
+ public:
+ X86MaskedStoreSDNode(unsigned Opcode, unsigned Order,
+ const DebugLoc &dl, SDVTList VTs, EVT MemVT,
+ MachineMemOperand *MMO)
+ : MemSDNode(Opcode, Order, dl, VTs, MemVT, MMO) {}
+
+ const SDValue &getBasePtr() const { return getOperand(1); }
+ const SDValue &getMask() const { return getOperand(2); }
+ const SDValue &getValue() const { return getOperand(3); }
+
+ static bool classof(const SDNode *N) {
+ return N->getOpcode() == X86ISD::VMTRUNCSTORES ||
+ N->getOpcode() == X86ISD::VMTRUNCSTOREUS;
+ }
+ };
+
+ // X86 Truncating Store with Signed saturation.
+ class TruncSStoreSDNode : public X86StoreSDNode {
+ public:
+ TruncSStoreSDNode(unsigned Order, const DebugLoc &dl,
+ SDVTList VTs, EVT MemVT, MachineMemOperand *MMO)
+ : X86StoreSDNode(X86ISD::VTRUNCSTORES, Order, dl, VTs, MemVT, MMO) {}
+
+ static bool classof(const SDNode *N) {
+ return N->getOpcode() == X86ISD::VTRUNCSTORES;
+ }
+ };
+
+ // X86 Truncating Store with Unsigned saturation.
+ class TruncUSStoreSDNode : public X86StoreSDNode {
+ public:
+ TruncUSStoreSDNode(unsigned Order, const DebugLoc &dl,
+ SDVTList VTs, EVT MemVT, MachineMemOperand *MMO)
+ : X86StoreSDNode(X86ISD::VTRUNCSTOREUS, Order, dl, VTs, MemVT, MMO) {}
+
+ static bool classof(const SDNode *N) {
+ return N->getOpcode() == X86ISD::VTRUNCSTOREUS;
+ }
+ };
+
+ // X86 Truncating Masked Store with Signed saturation.
+ class MaskedTruncSStoreSDNode : public X86MaskedStoreSDNode {
+ public:
+ MaskedTruncSStoreSDNode(unsigned Order,
+ const DebugLoc &dl, SDVTList VTs, EVT MemVT,
+ MachineMemOperand *MMO)
+ : X86MaskedStoreSDNode(X86ISD::VMTRUNCSTORES, Order, dl, VTs, MemVT, MMO) {}
+
+ static bool classof(const SDNode *N) {
+ return N->getOpcode() == X86ISD::VMTRUNCSTORES;
+ }
+ };
+
+ // X86 Truncating Masked Store with Unsigned saturation.
+ class MaskedTruncUSStoreSDNode : public X86MaskedStoreSDNode {
+ public:
+ MaskedTruncUSStoreSDNode(unsigned Order,
+ const DebugLoc &dl, SDVTList VTs, EVT MemVT,
+ MachineMemOperand *MMO)
+ : X86MaskedStoreSDNode(X86ISD::VMTRUNCSTOREUS, Order, dl, VTs, MemVT, MMO) {}
+
+ static bool classof(const SDNode *N) {
+ return N->getOpcode() == X86ISD::VMTRUNCSTOREUS;
+ }
+ };
+
} // end namespace llvm
#endif // LLVM_LIB_TARGET_X86_X86ISELLOWERING_H
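A scalar model of the per-element saturation the truncating-store nodes above perform (a sketch for the i32 -> i16 case only; the real nodes operate on whole vectors and, in the masked variants, only on mask-enabled lanes):

#include <algorithm>
#include <cstdint>

// Signed saturation (VTRUNCSTORES per element): clamp to the signed range
// of the narrower type before storing.
int16_t truncStoreS(int32_t v) {
  return static_cast<int16_t>(std::clamp<int32_t>(v, INT16_MIN, INT16_MAX));
}

// Unsigned saturation (VTRUNCSTOREUS per element): clamp to the unsigned
// range of the narrower type before storing.
uint16_t truncStoreUS(uint32_t v) {
  return static_cast<uint16_t>(std::min<uint32_t>(v, UINT16_MAX));
}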
diff --git a/contrib/llvm/lib/Target/X86/X86InstrAVX512.td b/contrib/llvm/lib/Target/X86/X86InstrAVX512.td
index 803a7e3..230d170 100644
--- a/contrib/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/contrib/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -77,15 +77,15 @@ class X86VectorVTInfo<int numelts, ValueType eltvt, RegisterClass rc,
!if (!eq (TypeVariantName, "i"),
!if (!eq (Size, 128), "v2i64",
!if (!eq (Size, 256), "v4i64",
- VTName)), VTName));
+ !if (!eq (Size, 512), "v8i64",
+ VTName))), VTName));
PatFrag AlignedLdFrag = !cast<PatFrag>("alignedload" #
- !if (!eq (TypeVariantName, "i"),
- !if (!eq (Size, 128), "v2i64",
- !if (!eq (Size, 256), "v4i64",
- !if (!eq (Size, 512),
- !if (!eq (EltSize, 64), "v8i64", "v16i32"),
- VTName))), VTName));
+ !if (!eq (TypeVariantName, "i"),
+ !if (!eq (Size, 128), "v2i64",
+ !if (!eq (Size, 256), "v4i64",
+ !if (!eq (Size, 512), "v8i64",
+ VTName))), VTName));
PatFrag ScalarLdFrag = !cast<PatFrag>("load" # EltVT);
@@ -122,6 +122,10 @@ class X86VectorVTInfo<int numelts, ValueType eltvt, RegisterClass rc,
RegisterClass FRC = !if (!eq (EltTypeName, "f32"), FR32X, FR64X);
+  // A vector type of the same width with element type i64. This is used to
+ // create patterns for logic ops.
+ ValueType i64VT = !cast<ValueType>("v" # !srl(Size, 6) # "i64");
+
// A vector type of the same width with element type i32. This is used to
// create the canonical constant zero node ImmAllZerosV.
ValueType i32VT = !cast<ValueType>("v" # !srl(Size, 5) # "i32");
@@ -194,7 +198,8 @@ multiclass AVX512_maskable_custom<bits<8> O, Format F,
list<dag> ZeroMaskingPattern,
string MaskingConstraint = "",
InstrItinClass itin = NoItinerary,
- bit IsCommutable = 0> {
+ bit IsCommutable = 0,
+ bit IsKCommutable = 0> {
let isCommutable = IsCommutable in
def NAME: AVX512<O, F, Outs, Ins,
OpcodeStr#"\t{"#AttSrcAsm#", $dst|"#
@@ -202,7 +207,7 @@ multiclass AVX512_maskable_custom<bits<8> O, Format F,
Pattern, itin>;
// Prefer over VMOV*rrk Pat<>
- let AddedComplexity = 20 in
+ let AddedComplexity = 20, isCommutable = IsKCommutable in
def NAME#k: AVX512<O, F, Outs, MaskingIns,
OpcodeStr#"\t{"#AttSrcAsm#", $dst {${mask}}|"#
"$dst {${mask}}, "#IntelSrcAsm#"}",
@@ -210,8 +215,11 @@ multiclass AVX512_maskable_custom<bits<8> O, Format F,
EVEX_K {
// In case of the 3src subclass this is overridden with a let.
string Constraints = MaskingConstraint;
- }
- let AddedComplexity = 30 in // Prefer over VMOV*rrkz Pat<>
+ }
+
+  // A zero mask does not add any restrictions to the operand-commuting
+  // transformation, so it is OK to use IsCommutable instead of IsKCommutable.
+ let AddedComplexity = 30, isCommutable = IsCommutable in // Prefer over VMOV*rrkz Pat<>
def NAME#kz: AVX512<O, F, Outs, ZeroMaskingIns,
OpcodeStr#"\t{"#AttSrcAsm#", $dst {${mask}} {z}|"#
"$dst {${mask}} {z}, "#IntelSrcAsm#"}",
@@ -231,14 +239,16 @@ multiclass AVX512_maskable_common<bits<8> O, Format F, X86VectorVTInfo _,
SDNode Select = vselect,
string MaskingConstraint = "",
InstrItinClass itin = NoItinerary,
- bit IsCommutable = 0> :
+ bit IsCommutable = 0,
+ bit IsKCommutable = 0> :
AVX512_maskable_custom<O, F, Outs, Ins, MaskingIns, ZeroMaskingIns, OpcodeStr,
AttSrcAsm, IntelSrcAsm,
[(set _.RC:$dst, RHS)],
[(set _.RC:$dst, MaskingRHS)],
[(set _.RC:$dst,
(Select _.KRCWM:$mask, RHS, _.ImmAllZerosV))],
- MaskingConstraint, NoItinerary, IsCommutable>;
+ MaskingConstraint, NoItinerary, IsCommutable,
+ IsKCommutable>;
// This multiclass generates the unconditional/non-masking, the masking and
// the zero-masking variant of the vector instruction. In the masking case, the
@@ -248,13 +258,14 @@ multiclass AVX512_maskable<bits<8> O, Format F, X86VectorVTInfo _,
string AttSrcAsm, string IntelSrcAsm,
dag RHS,
InstrItinClass itin = NoItinerary,
- bit IsCommutable = 0, SDNode Select = vselect> :
+ bit IsCommutable = 0, bit IsKCommutable = 0,
+ SDNode Select = vselect> :
AVX512_maskable_common<O, F, _, Outs, Ins,
!con((ins _.RC:$src0, _.KRCWM:$mask), Ins),
!con((ins _.KRCWM:$mask), Ins),
OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS,
(Select _.KRCWM:$mask, RHS, _.RC:$src0), Select,
- "$src0 = $dst", itin, IsCommutable>;
+ "$src0 = $dst", itin, IsCommutable, IsKCommutable>;
// This multiclass generates the unconditional/non-masking, the masking and
// the zero-masking variant of the scalar instruction.
@@ -278,41 +289,29 @@ multiclass AVX512_maskable_scalar<bits<8> O, Format F, X86VectorVTInfo _,
multiclass AVX512_maskable_3src<bits<8> O, Format F, X86VectorVTInfo _,
dag Outs, dag NonTiedIns, string OpcodeStr,
string AttSrcAsm, string IntelSrcAsm,
- dag RHS> :
+ dag RHS, bit IsCommutable = 0,
+ bit IsKCommutable = 0> :
AVX512_maskable_common<O, F, _, Outs,
!con((ins _.RC:$src1), NonTiedIns),
!con((ins _.RC:$src1, _.KRCWM:$mask), NonTiedIns),
!con((ins _.RC:$src1, _.KRCWM:$mask), NonTiedIns),
OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS,
- (vselect _.KRCWM:$mask, RHS, _.RC:$src1)>;
-
-// Similar to AVX512_maskable_3rc but in this case the input VT for the tied
-// operand differs from the output VT. This requires a bitconvert on
-// the preserved vector going into the vselect.
-multiclass AVX512_maskable_3src_cast<bits<8> O, Format F, X86VectorVTInfo OutVT,
- X86VectorVTInfo InVT,
- dag Outs, dag NonTiedIns, string OpcodeStr,
- string AttSrcAsm, string IntelSrcAsm,
- dag RHS> :
- AVX512_maskable_common<O, F, OutVT, Outs,
- !con((ins InVT.RC:$src1), NonTiedIns),
- !con((ins InVT.RC:$src1, InVT.KRCWM:$mask), NonTiedIns),
- !con((ins InVT.RC:$src1, InVT.KRCWM:$mask), NonTiedIns),
- OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS,
- (vselect InVT.KRCWM:$mask, RHS,
- (bitconvert InVT.RC:$src1))>;
+ (vselect _.KRCWM:$mask, RHS, _.RC:$src1),
+ vselect, "", NoItinerary, IsCommutable, IsKCommutable>;
multiclass AVX512_maskable_3src_scalar<bits<8> O, Format F, X86VectorVTInfo _,
dag Outs, dag NonTiedIns, string OpcodeStr,
string AttSrcAsm, string IntelSrcAsm,
- dag RHS> :
+ dag RHS, bit IsCommutable = 0,
+ bit IsKCommutable = 0> :
AVX512_maskable_common<O, F, _, Outs,
!con((ins _.RC:$src1), NonTiedIns),
!con((ins _.RC:$src1, _.KRCWM:$mask), NonTiedIns),
!con((ins _.RC:$src1, _.KRCWM:$mask), NonTiedIns),
OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS,
(X86selects _.KRCWM:$mask, RHS, _.RC:$src1),
- X86selects>;
+ X86selects, "", NoItinerary, IsCommutable,
+ IsKCommutable>;
multiclass AVX512_maskable_in_asm<bits<8> O, Format F, X86VectorVTInfo _,
dag Outs, dag Ins,
@@ -334,7 +333,9 @@ multiclass AVX512_maskable_custom_cmp<bits<8> O, Format F,
string OpcodeStr,
string AttSrcAsm, string IntelSrcAsm,
list<dag> Pattern,
- list<dag> MaskingPattern> {
+ list<dag> MaskingPattern,
+ bit IsCommutable = 0> {
+ let isCommutable = IsCommutable in
def NAME: AVX512<O, F, Outs, Ins,
OpcodeStr#"\t{"#AttSrcAsm#", $dst|"#
"$dst, "#IntelSrcAsm#"}",
@@ -351,20 +352,21 @@ multiclass AVX512_maskable_common_cmp<bits<8> O, Format F, X86VectorVTInfo _,
dag Ins, dag MaskingIns,
string OpcodeStr,
string AttSrcAsm, string IntelSrcAsm,
- dag RHS, dag MaskingRHS> :
+ dag RHS, dag MaskingRHS,
+ bit IsCommutable = 0> :
AVX512_maskable_custom_cmp<O, F, Outs, Ins, MaskingIns, OpcodeStr,
AttSrcAsm, IntelSrcAsm,
[(set _.KRC:$dst, RHS)],
- [(set _.KRC:$dst, MaskingRHS)]>;
+ [(set _.KRC:$dst, MaskingRHS)], IsCommutable>;
multiclass AVX512_maskable_cmp<bits<8> O, Format F, X86VectorVTInfo _,
dag Outs, dag Ins, string OpcodeStr,
string AttSrcAsm, string IntelSrcAsm,
- dag RHS> :
+ dag RHS, bit IsCommutable = 0> :
AVX512_maskable_common_cmp<O, F, _, Outs, Ins,
!con((ins _.KRCWM:$mask), Ins),
OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS,
- (and _.KRCWM:$mask, RHS)>;
+ (and _.KRCWM:$mask, RHS), IsCommutable>;
multiclass AVX512_maskable_cmp_alt<bits<8> O, Format F, X86VectorVTInfo _,
dag Outs, dag Ins, string OpcodeStr,
@@ -373,6 +375,27 @@ multiclass AVX512_maskable_cmp_alt<bits<8> O, Format F, X86VectorVTInfo _,
Ins, !con((ins _.KRCWM:$mask),Ins), OpcodeStr,
AttSrcAsm, IntelSrcAsm, [],[]>;
+// This multiclass generates the unconditional/non-masking, the masking and
+// the zero-masking variant of the vector instruction. In the masking case, the
+// preserved vector elements come from a new dummy input operand tied to $dst.
+multiclass AVX512_maskable_logic<bits<8> O, Format F, X86VectorVTInfo _,
+ dag Outs, dag Ins, string OpcodeStr,
+ string AttSrcAsm, string IntelSrcAsm,
+ dag RHS, dag MaskedRHS,
+ InstrItinClass itin = NoItinerary,
+ bit IsCommutable = 0, SDNode Select = vselect> :
+ AVX512_maskable_custom<O, F, Outs, Ins,
+ !con((ins _.RC:$src0, _.KRCWM:$mask), Ins),
+ !con((ins _.KRCWM:$mask), Ins),
+ OpcodeStr, AttSrcAsm, IntelSrcAsm,
+ [(set _.RC:$dst, RHS)],
+ [(set _.RC:$dst,
+ (Select _.KRCWM:$mask, MaskedRHS, _.RC:$src0))],
+ [(set _.RC:$dst,
+ (Select _.KRCWM:$mask, MaskedRHS,
+ _.ImmAllZerosV))],
+ "$src0 = $dst", itin, IsCommutable>;
+
// Bitcasts between 512-bit vector types. Return the original type since
// no instruction is needed for the conversion.
def : Pat<(v8f64 (bitconvert (v8i64 VR512:$src))), (v8f64 VR512:$src)>;
@@ -420,6 +443,22 @@ def AVX512_512_SETALLONES : I<0, Pseudo, (outs VR512:$dst), (ins), "",
[(set VR512:$dst, (v16i32 immAllOnesV))]>;
}
+// Alias instructions that allow VPTERNLOG to be used with a mask to create
+// a mix of all ones and all zeros elements. This is done this way to force
+// the same register to be used as input for all three sources.
+let isPseudo = 1, Predicates = [HasAVX512] in {
+def AVX512_512_SEXT_MASK_32 : I<0, Pseudo, (outs VR512:$dst),
+ (ins VK16WM:$mask), "",
+ [(set VR512:$dst, (vselect (v16i1 VK16WM:$mask),
+ (v16i32 immAllOnesV),
+ (v16i32 immAllZerosV)))]>;
+def AVX512_512_SEXT_MASK_64 : I<0, Pseudo, (outs VR512:$dst),
+ (ins VK8WM:$mask), "",
+ [(set VR512:$dst, (vselect (v8i1 VK8WM:$mask),
+ (bc_v8i64 (v16i32 immAllOnesV)),
+ (bc_v8i64 (v16i32 immAllZerosV))))]>;
+}
+
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
isPseudo = 1, Predicates = [HasVLX], SchedRW = [WriteZero] in {
def AVX512_128_SET0 : I<0, Pseudo, (outs VR128X:$dst), (ins), "",
@@ -428,6 +467,16 @@ def AVX512_256_SET0 : I<0, Pseudo, (outs VR256X:$dst), (ins), "",
[(set VR256X:$dst, (v8i32 immAllZerosV))]>;
}
+// Alias instructions that map fld0 to xorps for sse or vxorps for avx.
+// This is expanded by ExpandPostRAPseudos.
+let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
+ isPseudo = 1, SchedRW = [WriteZero], Predicates = [HasVLX, HasDQI] in {
+ def AVX512_FsFLD0SS : I<0, Pseudo, (outs FR32X:$dst), (ins), "",
+ [(set FR32X:$dst, fp32imm0)]>;
+ def AVX512_FsFLD0SD : I<0, Pseudo, (outs FR64X:$dst), (ins), "",
+ [(set FR64X:$dst, fpimm0)]>;
+}
+
//===----------------------------------------------------------------------===//
// AVX-512 - VECTOR INSERT
//
@@ -548,25 +597,28 @@ defm : vinsert_for_size_lowering<"VINSERTI64x4Z", v32i8x_info, v64i8_info,
vinsert256_insert, INSERT_get_vinsert256_imm, [HasAVX512]>;
// vinsertps - insert f32 to XMM
-def VINSERTPSzrr : AVX512AIi8<0x21, MRMSrcReg, (outs VR128X:$dst),
+let ExeDomain = SSEPackedSingle in {
+def VINSERTPSZrr : AVX512AIi8<0x21, MRMSrcReg, (outs VR128X:$dst),
(ins VR128X:$src1, VR128X:$src2, u8imm:$src3),
"vinsertps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
[(set VR128X:$dst, (X86insertps VR128X:$src1, VR128X:$src2, imm:$src3))]>,
EVEX_4V;
-def VINSERTPSzrm: AVX512AIi8<0x21, MRMSrcMem, (outs VR128X:$dst),
+def VINSERTPSZrm: AVX512AIi8<0x21, MRMSrcMem, (outs VR128X:$dst),
(ins VR128X:$src1, f32mem:$src2, u8imm:$src3),
"vinsertps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
[(set VR128X:$dst, (X86insertps VR128X:$src1,
(v4f32 (scalar_to_vector (loadf32 addr:$src2))),
imm:$src3))]>, EVEX_4V, EVEX_CD8<32, CD8VT1>;
+}
//===----------------------------------------------------------------------===//
// AVX-512 VECTOR EXTRACT
//---
multiclass vextract_for_size<int Opcode,
- X86VectorVTInfo From, X86VectorVTInfo To,
- PatFrag vextract_extract> {
+ X86VectorVTInfo From, X86VectorVTInfo To,
+ PatFrag vextract_extract,
+ SDNodeXForm EXTRACT_get_vextract_imm> {
let hasSideEffects = 0, ExeDomain = To.ExeDomain in {
// use AVX512_maskable_in_asm (AVX512_maskable can't be used due to
@@ -597,32 +649,23 @@ multiclass vextract_for_size<int Opcode,
[]>, EVEX_K, EVEX;
}
- // Intrinsic call with masking.
- def : Pat<(!cast<Intrinsic>("int_x86_avx512_mask_vextract" # To.EltTypeName #
- "x" # To.NumElts # "_" # From.Size)
- From.RC:$src1, (iPTR imm:$idx), To.RC:$src0, To.MRC:$mask),
+ def : Pat<(To.VT (vselect To.KRCWM:$mask,
+ (vextract_extract:$ext (From.VT From.RC:$src1),
+ (iPTR imm)),
+ To.RC:$src0)),
(!cast<Instruction>(NAME # To.EltSize # "x" # To.NumElts #
From.ZSuffix # "rrk")
- To.RC:$src0,
- (COPY_TO_REGCLASS To.MRC:$mask, To.KRCWM),
- From.RC:$src1, imm:$idx)>;
-
- // Intrinsic call with zero-masking.
- def : Pat<(!cast<Intrinsic>("int_x86_avx512_mask_vextract" # To.EltTypeName #
- "x" # To.NumElts # "_" # From.Size)
- From.RC:$src1, (iPTR imm:$idx), To.ImmAllZerosV, To.MRC:$mask),
- (!cast<Instruction>(NAME # To.EltSize # "x" # To.NumElts #
- From.ZSuffix # "rrkz")
- (COPY_TO_REGCLASS To.MRC:$mask, To.KRCWM),
- From.RC:$src1, imm:$idx)>;
+ To.RC:$src0, To.KRCWM:$mask, From.RC:$src1,
+ (EXTRACT_get_vextract_imm To.RC:$ext))>;
- // Intrinsic call without masking.
- def : Pat<(!cast<Intrinsic>("int_x86_avx512_mask_vextract" # To.EltTypeName #
- "x" # To.NumElts # "_" # From.Size)
- From.RC:$src1, (iPTR imm:$idx), To.ImmAllZerosV, (i8 -1)),
+ def : Pat<(To.VT (vselect To.KRCWM:$mask,
+ (vextract_extract:$ext (From.VT From.RC:$src1),
+ (iPTR imm)),
+ To.ImmAllZerosV)),
(!cast<Instruction>(NAME # To.EltSize # "x" # To.NumElts #
- From.ZSuffix # "rr")
- From.RC:$src1, imm:$idx)>;
+ From.ZSuffix # "rrkz")
+ To.KRCWM:$mask, From.RC:$src1,
+ (EXTRACT_get_vextract_imm To.RC:$ext))>;
}
// Codegen pattern for the alternative types
@@ -642,39 +685,45 @@ multiclass vextract_for_size_lowering<string InstrStr, X86VectorVTInfo From,
}
multiclass vextract_for_type<ValueType EltVT32, int Opcode128,
- ValueType EltVT64, int Opcode256> {
+ ValueType EltVT64, int Opcode256> {
defm NAME # "32x4Z" : vextract_for_size<Opcode128,
X86VectorVTInfo<16, EltVT32, VR512>,
X86VectorVTInfo< 4, EltVT32, VR128X>,
- vextract128_extract>,
+ vextract128_extract,
+ EXTRACT_get_vextract128_imm>,
EVEX_V512, EVEX_CD8<32, CD8VT4>;
defm NAME # "64x4Z" : vextract_for_size<Opcode256,
X86VectorVTInfo< 8, EltVT64, VR512>,
X86VectorVTInfo< 4, EltVT64, VR256X>,
- vextract256_extract>,
+ vextract256_extract,
+ EXTRACT_get_vextract256_imm>,
VEX_W, EVEX_V512, EVEX_CD8<64, CD8VT4>;
let Predicates = [HasVLX] in
defm NAME # "32x4Z256" : vextract_for_size<Opcode128,
X86VectorVTInfo< 8, EltVT32, VR256X>,
X86VectorVTInfo< 4, EltVT32, VR128X>,
- vextract128_extract>,
+ vextract128_extract,
+ EXTRACT_get_vextract128_imm>,
EVEX_V256, EVEX_CD8<32, CD8VT4>;
let Predicates = [HasVLX, HasDQI] in
defm NAME # "64x2Z256" : vextract_for_size<Opcode128,
X86VectorVTInfo< 4, EltVT64, VR256X>,
X86VectorVTInfo< 2, EltVT64, VR128X>,
- vextract128_extract>,
+ vextract128_extract,
+ EXTRACT_get_vextract128_imm>,
VEX_W, EVEX_V256, EVEX_CD8<64, CD8VT2>;
let Predicates = [HasDQI] in {
defm NAME # "64x2Z" : vextract_for_size<Opcode128,
X86VectorVTInfo< 8, EltVT64, VR512>,
X86VectorVTInfo< 2, EltVT64, VR128X>,
- vextract128_extract>,
+ vextract128_extract,
+ EXTRACT_get_vextract128_imm>,
VEX_W, EVEX_V512, EVEX_CD8<64, CD8VT2>;
defm NAME # "32x8Z" : vextract_for_size<Opcode256,
X86VectorVTInfo<16, EltVT32, VR512>,
X86VectorVTInfo< 8, EltVT32, VR256X>,
- vextract256_extract>,
+ vextract256_extract,
+ EXTRACT_get_vextract256_imm>,
EVEX_V512, EVEX_CD8<32, CD8VT8>;
}
}
@@ -986,6 +1035,25 @@ multiclass avx512_subvec_broadcast_rm<bits<8> opc, string OpcodeStr,
AVX5128IBase, EVEX;
}
+let Predicates = [HasVLX, HasBWI] in {
+ // loadi16 is tricky to fold, because !isTypeDesirableForOp, justifiably.
+ // This means we'll encounter truncated i32 loads; match that here.
+ def : Pat<(v8i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))),
+ (VPBROADCASTWZ128m addr:$src)>;
+ def : Pat<(v16i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))),
+ (VPBROADCASTWZ256m addr:$src)>;
+ def : Pat<(v8i16 (X86VBroadcast
+ (i16 (trunc (i32 (zextloadi16 addr:$src)))))),
+ (VPBROADCASTWZ128m addr:$src)>;
+ def : Pat<(v16i16 (X86VBroadcast
+ (i16 (trunc (i32 (zextloadi16 addr:$src)))))),
+ (VPBROADCASTWZ256m addr:$src)>;
+}
+
+//===----------------------------------------------------------------------===//
+// AVX-512 BROADCAST SUBVECTORS
+//
+
defm VBROADCASTI32X4 : avx512_subvec_broadcast_rm<0x5a, "vbroadcasti32x4",
v16i32_info, v4i32x_info>,
EVEX_V512, EVEX_CD8<32, CD8VT4>;
@@ -999,6 +1067,79 @@ defm VBROADCASTF64X4 : avx512_subvec_broadcast_rm<0x1b, "vbroadcastf64x4",
v8f64_info, v4f64x_info>, VEX_W,
EVEX_V512, EVEX_CD8<64, CD8VT4>;
+let Predicates = [HasAVX512] in {
+def : Pat<(v32i16 (X86SubVBroadcast (bc_v16i16 (loadv4i64 addr:$src)))),
+ (VBROADCASTI64X4rm addr:$src)>;
+def : Pat<(v64i8 (X86SubVBroadcast (bc_v32i8 (loadv4i64 addr:$src)))),
+ (VBROADCASTI64X4rm addr:$src)>;
+
+// Provide fallback in case the load node that is used in the patterns above
+// is used by additional users, which prevents the pattern selection.
+def : Pat<(v16f32 (X86SubVBroadcast (v8f32 VR256X:$src))),
+ (VINSERTF64x4Zrr (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
+ (v8f32 VR256X:$src), 1)>;
+def : Pat<(v8f64 (X86SubVBroadcast (v4f64 VR256X:$src))),
+ (VINSERTF64x4Zrr (INSERT_SUBREG (v8f64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
+ (v4f64 VR256X:$src), 1)>;
+def : Pat<(v8i64 (X86SubVBroadcast (v4i64 VR256X:$src))),
+ (VINSERTI64x4Zrr (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
+ (v4i64 VR256X:$src), 1)>;
+def : Pat<(v16i32 (X86SubVBroadcast (v8i32 VR256X:$src))),
+ (VINSERTI64x4Zrr (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
+ (v8i32 VR256X:$src), 1)>;
+def : Pat<(v32i16 (X86SubVBroadcast (v16i16 VR256X:$src))),
+ (VINSERTI64x4Zrr (INSERT_SUBREG (v32i16 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
+ (v16i16 VR256X:$src), 1)>;
+def : Pat<(v64i8 (X86SubVBroadcast (v32i8 VR256X:$src))),
+ (VINSERTI64x4Zrr (INSERT_SUBREG (v64i8 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
+ (v32i8 VR256X:$src), 1)>;
+
+def : Pat<(v32i16 (X86SubVBroadcast (bc_v8i16 (loadv2i64 addr:$src)))),
+ (VBROADCASTI32X4rm addr:$src)>;
+def : Pat<(v64i8 (X86SubVBroadcast (bc_v16i8 (loadv2i64 addr:$src)))),
+ (VBROADCASTI32X4rm addr:$src)>;
+
+// Provide fallback in case the load node that is used in the patterns above
+// is used by additional users, which prevents the pattern selection.
+def : Pat<(v8f64 (X86SubVBroadcast (v2f64 VR128X:$src))),
+ (VINSERTF64x4Zrr
+ (VINSERTF32x4Zrr (INSERT_SUBREG (v8f64 (IMPLICIT_DEF)),
+ VR128X:$src, sub_xmm),
+ VR128X:$src, 1),
+ (EXTRACT_SUBREG
+ (v8f64 (VINSERTF32x4Zrr (INSERT_SUBREG (v8f64 (IMPLICIT_DEF)),
+ VR128X:$src, sub_xmm),
+ VR128X:$src, 1)), sub_ymm), 1)>;
+def : Pat<(v8i64 (X86SubVBroadcast (v2i64 VR128X:$src))),
+ (VINSERTI64x4Zrr
+ (VINSERTI32x4Zrr (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)),
+ VR128X:$src, sub_xmm),
+ VR128X:$src, 1),
+ (EXTRACT_SUBREG
+ (v8i64 (VINSERTI32x4Zrr (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)),
+ VR128X:$src, sub_xmm),
+ VR128X:$src, 1)), sub_ymm), 1)>;
+
+def : Pat<(v32i16 (X86SubVBroadcast (v8i16 VR128X:$src))),
+ (VINSERTI64x4Zrr
+ (VINSERTI32x4Zrr (INSERT_SUBREG (v32i16 (IMPLICIT_DEF)),
+ VR128X:$src, sub_xmm),
+ VR128X:$src, 1),
+ (EXTRACT_SUBREG
+ (v32i16 (VINSERTI32x4Zrr (INSERT_SUBREG (v32i16 (IMPLICIT_DEF)),
+ VR128X:$src, sub_xmm),
+ VR128X:$src, 1)), sub_ymm), 1)>;
+def : Pat<(v64i8 (X86SubVBroadcast (v16i8 VR128X:$src))),
+ (VINSERTI64x4Zrr
+ (VINSERTI32x4Zrr (INSERT_SUBREG (v64i8 (IMPLICIT_DEF)),
+ VR128X:$src, sub_xmm),
+ VR128X:$src, 1),
+ (EXTRACT_SUBREG
+ (v64i8 (VINSERTI32x4Zrr (INSERT_SUBREG (v64i8 (IMPLICIT_DEF)),
+ VR128X:$src, sub_xmm),
+ VR128X:$src, 1)), sub_ymm), 1)>;
+}
+
let Predicates = [HasVLX] in {
defm VBROADCASTI32X4Z256 : avx512_subvec_broadcast_rm<0x5a, "vbroadcasti32x4",
v8i32x_info, v4i32x_info>,
@@ -1006,7 +1147,28 @@ defm VBROADCASTI32X4Z256 : avx512_subvec_broadcast_rm<0x5a, "vbroadcasti32x4",
defm VBROADCASTF32X4Z256 : avx512_subvec_broadcast_rm<0x1a, "vbroadcastf32x4",
v8f32x_info, v4f32x_info>,
EVEX_V256, EVEX_CD8<32, CD8VT4>;
+
+def : Pat<(v16i16 (X86SubVBroadcast (bc_v8i16 (loadv2i64 addr:$src)))),
+ (VBROADCASTI32X4Z256rm addr:$src)>;
+def : Pat<(v32i8 (X86SubVBroadcast (bc_v16i8 (loadv2i64 addr:$src)))),
+ (VBROADCASTI32X4Z256rm addr:$src)>;
+
+// Provide fallback in case the load node that is used in the patterns above
+// is used by additional users, which prevents the pattern selection.
+def : Pat<(v8f32 (X86SubVBroadcast (v4f32 VR128X:$src))),
+ (VINSERTF32x4Z256rr (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+ (v4f32 VR128X:$src), 1)>;
+def : Pat<(v8i32 (X86SubVBroadcast (v4i32 VR128X:$src))),
+ (VINSERTI32x4Z256rr (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+ (v4i32 VR128X:$src), 1)>;
+def : Pat<(v16i16 (X86SubVBroadcast (v8i16 VR128X:$src))),
+ (VINSERTI32x4Z256rr (INSERT_SUBREG (v16i16 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+ (v8i16 VR128X:$src), 1)>;
+def : Pat<(v32i8 (X86SubVBroadcast (v16i8 VR128X:$src))),
+ (VINSERTI32x4Z256rr (INSERT_SUBREG (v32i8 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+ (v16i8 VR128X:$src), 1)>;
}
+
let Predicates = [HasVLX, HasDQI] in {
defm VBROADCASTI64X2Z128 : avx512_subvec_broadcast_rm<0x5a, "vbroadcasti64x2",
v4i64x_info, v2i64x_info>, VEX_W,
@@ -1014,7 +1176,73 @@ defm VBROADCASTI64X2Z128 : avx512_subvec_broadcast_rm<0x5a, "vbroadcasti64x2",
defm VBROADCASTF64X2Z128 : avx512_subvec_broadcast_rm<0x1a, "vbroadcastf64x2",
v4f64x_info, v2f64x_info>, VEX_W,
EVEX_V256, EVEX_CD8<64, CD8VT2>;
+
+// Provide fallback in case the load node that is used in the patterns above
+// is used by additional users, which prevents the pattern selection.
+def : Pat<(v4f64 (X86SubVBroadcast (v2f64 VR128X:$src))),
+ (VINSERTF64x2Z256rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+ (v2f64 VR128X:$src), 1)>;
+def : Pat<(v4i64 (X86SubVBroadcast (v2i64 VR128X:$src))),
+ (VINSERTI64x2Z256rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+ (v2i64 VR128X:$src), 1)>;
}
+
+let Predicates = [HasVLX, NoDQI] in {
+def : Pat<(v4f64 (X86SubVBroadcast (loadv2f64 addr:$src))),
+ (VBROADCASTF32X4Z256rm addr:$src)>;
+def : Pat<(v4i64 (X86SubVBroadcast (loadv2i64 addr:$src))),
+ (VBROADCASTI32X4Z256rm addr:$src)>;
+
+// Provide fallback in case the load node that is used in the patterns above
+// is used by additional users, which prevents the pattern selection.
+def : Pat<(v4f64 (X86SubVBroadcast (v2f64 VR128X:$src))),
+ (VINSERTF32x4Z256rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+ (v2f64 VR128X:$src), 1)>;
+def : Pat<(v4i64 (X86SubVBroadcast (v2i64 VR128X:$src))),
+ (VINSERTI32x4Z256rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+ (v2i64 VR128X:$src), 1)>;
+}
+
+let Predicates = [HasAVX512, NoDQI] in {
+def : Pat<(v8f64 (X86SubVBroadcast (loadv2f64 addr:$src))),
+ (VBROADCASTF32X4rm addr:$src)>;
+def : Pat<(v8i64 (X86SubVBroadcast (loadv2i64 addr:$src))),
+ (VBROADCASTI32X4rm addr:$src)>;
+
+def : Pat<(v16f32 (X86SubVBroadcast (v4f32 VR128X:$src))),
+ (VINSERTF64x4Zrr
+ (VINSERTF32x4Zrr (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)),
+ VR128X:$src, sub_xmm),
+ VR128X:$src, 1),
+ (EXTRACT_SUBREG
+ (v16f32 (VINSERTF32x4Zrr (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)),
+ VR128X:$src, sub_xmm),
+ VR128X:$src, 1)), sub_ymm), 1)>;
+def : Pat<(v16i32 (X86SubVBroadcast (v4i32 VR128X:$src))),
+ (VINSERTI64x4Zrr
+ (VINSERTI32x4Zrr (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)),
+ VR128X:$src, sub_xmm),
+ VR128X:$src, 1),
+ (EXTRACT_SUBREG
+ (v16i32 (VINSERTI32x4Zrr (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)),
+ VR128X:$src, sub_xmm),
+ VR128X:$src, 1)), sub_ymm), 1)>;
+
+def : Pat<(v16f32 (X86SubVBroadcast (loadv8f32 addr:$src))),
+ (VBROADCASTF64X4rm addr:$src)>;
+def : Pat<(v16i32 (X86SubVBroadcast (bc_v8i32 (loadv4i64 addr:$src)))),
+ (VBROADCASTI64X4rm addr:$src)>;
+
+// Provide fallback in case the load node that is used in the patterns above
+// is used by additional users, which prevents the pattern selection.
+def : Pat<(v16f32 (X86SubVBroadcast (v8f32 VR256X:$src))),
+ (VINSERTF64x4Zrr (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
+ (v8f32 VR256X:$src), 1)>;
+def : Pat<(v16i32 (X86SubVBroadcast (v8i32 VR256X:$src))),
+ (VINSERTI64x4Zrr (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
+ (v8i32 VR256X:$src), 1)>;
+}
+
let Predicates = [HasDQI] in {
defm VBROADCASTI64X2 : avx512_subvec_broadcast_rm<0x5a, "vbroadcasti64x2",
v8i64_info, v2i64x_info>, VEX_W,
@@ -1028,6 +1256,34 @@ defm VBROADCASTF64X2 : avx512_subvec_broadcast_rm<0x1a, "vbroadcastf64x2",
defm VBROADCASTF32X8 : avx512_subvec_broadcast_rm<0x1b, "vbroadcastf32x8",
v16f32_info, v8f32x_info>,
EVEX_V512, EVEX_CD8<32, CD8VT8>;
+
+// Provide fallback in case the load node that is used in the patterns above
+// is used by additional users, which prevents the pattern selection.
+def : Pat<(v16f32 (X86SubVBroadcast (v8f32 VR256X:$src))),
+ (VINSERTF32x8Zrr (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
+ (v8f32 VR256X:$src), 1)>;
+def : Pat<(v16i32 (X86SubVBroadcast (v8i32 VR256X:$src))),
+ (VINSERTI32x8Zrr (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
+ (v8i32 VR256X:$src), 1)>;
+
+def : Pat<(v16f32 (X86SubVBroadcast (v4f32 VR128X:$src))),
+ (VINSERTF32x8Zrr
+ (VINSERTF32x4Zrr (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)),
+ VR128X:$src, sub_xmm),
+ VR128X:$src, 1),
+ (EXTRACT_SUBREG
+ (v16f32 (VINSERTF32x4Zrr (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)),
+ VR128X:$src, sub_xmm),
+ VR128X:$src, 1)), sub_ymm), 1)>;
+def : Pat<(v16i32 (X86SubVBroadcast (v4i32 VR128X:$src))),
+ (VINSERTI32x8Zrr
+ (VINSERTI32x4Zrr (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)),
+ VR128X:$src, sub_xmm),
+ VR128X:$src, 1),
+ (EXTRACT_SUBREG
+ (v16i32 (VINSERTI32x4Zrr (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)),
+ VR128X:$src, sub_xmm),
+ VR128X:$src, 1)), sub_ymm), 1)>;
}
multiclass avx512_common_broadcast_32x2<bits<8> opc, string OpcodeStr,
@@ -1049,10 +1305,10 @@ multiclass avx512_common_broadcast_i32x2<bits<8> opc, string OpcodeStr,
EVEX_V128;
}
-defm VPBROADCASTI32X2 : avx512_common_broadcast_i32x2<0x59, "vbroadcasti32x2",
- avx512vl_i32_info, avx512vl_i64_info>;
-defm VPBROADCASTF32X2 : avx512_common_broadcast_32x2<0x19, "vbroadcastf32x2",
- avx512vl_f32_info, avx512vl_f64_info>;
+defm VBROADCASTI32X2 : avx512_common_broadcast_i32x2<0x59, "vbroadcasti32x2",
+ avx512vl_i32_info, avx512vl_i64_info>;
+defm VBROADCASTF32X2 : avx512_common_broadcast_32x2<0x19, "vbroadcastf32x2",
+ avx512vl_f32_info, avx512vl_f64_info>;
def : Pat<(v16f32 (X86VBroadcast (v16f32 VR512:$src))),
(VBROADCASTSSZr (EXTRACT_SUBREG (v16f32 VR512:$src), sub_xmm))>;
@@ -1091,112 +1347,105 @@ defm VPBROADCASTMB2Q : avx512_mask_broadcast<0x2A, "vpbroadcastmb2q",
//===----------------------------------------------------------------------===//
// -- VPERMI2 - 3 source operands form --
-multiclass avx512_perm_i<bits<8> opc, string OpcodeStr,
- X86VectorVTInfo _, X86VectorVTInfo IdxVT> {
-let Constraints = "$src1 = $dst" in {
- defm rr: AVX512_maskable_3src_cast<opc, MRMSrcReg, _, IdxVT, (outs _.RC:$dst),
+multiclass avx512_perm_i<bits<8> opc, string OpcodeStr, X86VectorVTInfo _> {
+let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in {
+ // The index operand in the pattern should really be an integer type. However,
+ // if we do that and it happens to come from a bitcast, then it becomes
+ // difficult to find the bitcast needed to convert the index to the
+ // destination type for the passthru since it will be folded with the bitcast
+ // of the index operand.
+ defm rr: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3),
OpcodeStr, "$src3, $src2", "$src2, $src3",
- (_.VT (X86VPermi2X IdxVT.RC:$src1, _.RC:$src2, _.RC:$src3))>, EVEX_4V,
+ (_.VT (X86VPermi2X _.RC:$src1, _.RC:$src2, _.RC:$src3)), 1>, EVEX_4V,
AVX5128IBase;
- defm rm: AVX512_maskable_3src_cast<opc, MRMSrcMem, _, IdxVT, (outs _.RC:$dst),
+ defm rm: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.MemOp:$src3),
OpcodeStr, "$src3, $src2", "$src2, $src3",
- (_.VT (X86VPermi2X IdxVT.RC:$src1, _.RC:$src2,
- (_.VT (bitconvert (_.LdFrag addr:$src3)))))>,
+ (_.VT (X86VPermi2X _.RC:$src1, _.RC:$src2,
+ (_.VT (bitconvert (_.LdFrag addr:$src3))))), 1>,
EVEX_4V, AVX5128IBase;
}
}
multiclass avx512_perm_i_mb<bits<8> opc, string OpcodeStr,
- X86VectorVTInfo _, X86VectorVTInfo IdxVT> {
- let Constraints = "$src1 = $dst" in
- defm rmb: AVX512_maskable_3src_cast<opc, MRMSrcMem, _, IdxVT, (outs _.RC:$dst),
+ X86VectorVTInfo _> {
+ let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in
+ defm rmb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.ScalarMemOp:$src3),
OpcodeStr, !strconcat("${src3}", _.BroadcastStr,", $src2"),
!strconcat("$src2, ${src3}", _.BroadcastStr ),
- (_.VT (X86VPermi2X IdxVT.RC:$src1,
- _.RC:$src2,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3)))))>,
- AVX5128IBase, EVEX_4V, EVEX_B;
+ (_.VT (X86VPermi2X _.RC:$src1,
+ _.RC:$src2,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3))))),
+ 1>, AVX5128IBase, EVEX_4V, EVEX_B;
}
multiclass avx512_perm_i_sizes<bits<8> opc, string OpcodeStr,
- AVX512VLVectorVTInfo VTInfo,
- AVX512VLVectorVTInfo ShuffleMask> {
- defm NAME: avx512_perm_i<opc, OpcodeStr, VTInfo.info512,
- ShuffleMask.info512>,
- avx512_perm_i_mb<opc, OpcodeStr, VTInfo.info512,
- ShuffleMask.info512>, EVEX_V512;
+ AVX512VLVectorVTInfo VTInfo> {
+ defm NAME: avx512_perm_i<opc, OpcodeStr, VTInfo.info512>,
+ avx512_perm_i_mb<opc, OpcodeStr, VTInfo.info512>, EVEX_V512;
let Predicates = [HasVLX] in {
- defm NAME#128: avx512_perm_i<opc, OpcodeStr, VTInfo.info128,
- ShuffleMask.info128>,
- avx512_perm_i_mb<opc, OpcodeStr, VTInfo.info128,
- ShuffleMask.info128>, EVEX_V128;
- defm NAME#256: avx512_perm_i<opc, OpcodeStr, VTInfo.info256,
- ShuffleMask.info256>,
- avx512_perm_i_mb<opc, OpcodeStr, VTInfo.info256,
- ShuffleMask.info256>, EVEX_V256;
+ defm NAME#128: avx512_perm_i<opc, OpcodeStr, VTInfo.info128>,
+ avx512_perm_i_mb<opc, OpcodeStr, VTInfo.info128>, EVEX_V128;
+ defm NAME#256: avx512_perm_i<opc, OpcodeStr, VTInfo.info256>,
+ avx512_perm_i_mb<opc, OpcodeStr, VTInfo.info256>, EVEX_V256;
}
}
multiclass avx512_perm_i_sizes_bw<bits<8> opc, string OpcodeStr,
AVX512VLVectorVTInfo VTInfo,
- AVX512VLVectorVTInfo Idx,
Predicate Prd> {
let Predicates = [Prd] in
- defm NAME: avx512_perm_i<opc, OpcodeStr, VTInfo.info512,
- Idx.info512>, EVEX_V512;
+ defm NAME: avx512_perm_i<opc, OpcodeStr, VTInfo.info512>, EVEX_V512;
let Predicates = [Prd, HasVLX] in {
- defm NAME#128: avx512_perm_i<opc, OpcodeStr, VTInfo.info128,
- Idx.info128>, EVEX_V128;
- defm NAME#256: avx512_perm_i<opc, OpcodeStr, VTInfo.info256,
- Idx.info256>, EVEX_V256;
+ defm NAME#128: avx512_perm_i<opc, OpcodeStr, VTInfo.info128>, EVEX_V128;
+ defm NAME#256: avx512_perm_i<opc, OpcodeStr, VTInfo.info256>, EVEX_V256;
}
}
defm VPERMI2D : avx512_perm_i_sizes<0x76, "vpermi2d",
- avx512vl_i32_info, avx512vl_i32_info>, EVEX_CD8<32, CD8VF>;
+ avx512vl_i32_info>, EVEX_CD8<32, CD8VF>;
defm VPERMI2Q : avx512_perm_i_sizes<0x76, "vpermi2q",
- avx512vl_i64_info, avx512vl_i64_info>, VEX_W, EVEX_CD8<64, CD8VF>;
+ avx512vl_i64_info>, VEX_W, EVEX_CD8<64, CD8VF>;
defm VPERMI2W : avx512_perm_i_sizes_bw<0x75, "vpermi2w",
- avx512vl_i16_info, avx512vl_i16_info, HasBWI>,
+ avx512vl_i16_info, HasBWI>,
VEX_W, EVEX_CD8<16, CD8VF>;
defm VPERMI2B : avx512_perm_i_sizes_bw<0x75, "vpermi2b",
- avx512vl_i8_info, avx512vl_i8_info, HasVBMI>,
+ avx512vl_i8_info, HasVBMI>,
EVEX_CD8<8, CD8VF>;
defm VPERMI2PS : avx512_perm_i_sizes<0x77, "vpermi2ps",
- avx512vl_f32_info, avx512vl_i32_info>, EVEX_CD8<32, CD8VF>;
+ avx512vl_f32_info>, EVEX_CD8<32, CD8VF>;
defm VPERMI2PD : avx512_perm_i_sizes<0x77, "vpermi2pd",
- avx512vl_f64_info, avx512vl_i64_info>, VEX_W, EVEX_CD8<64, CD8VF>;
+ avx512vl_f64_info>, VEX_W, EVEX_CD8<64, CD8VF>;
// VPERMT2
multiclass avx512_perm_t<bits<8> opc, string OpcodeStr,
X86VectorVTInfo _, X86VectorVTInfo IdxVT> {
-let Constraints = "$src1 = $dst" in {
+let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in {
defm rr: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins IdxVT.RC:$src2, _.RC:$src3),
OpcodeStr, "$src3, $src2", "$src2, $src3",
- (_.VT (X86VPermt2 _.RC:$src1, IdxVT.RC:$src2, _.RC:$src3))>, EVEX_4V,
- AVX5128IBase;
+ (_.VT (X86VPermt2 _.RC:$src1, IdxVT.RC:$src2, _.RC:$src3)), 1>,
+ EVEX_4V, AVX5128IBase;
defm rm: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins IdxVT.RC:$src2, _.MemOp:$src3),
OpcodeStr, "$src3, $src2", "$src2, $src3",
(_.VT (X86VPermt2 _.RC:$src1, IdxVT.RC:$src2,
- (bitconvert (_.LdFrag addr:$src3))))>,
+ (bitconvert (_.LdFrag addr:$src3)))), 1>,
EVEX_4V, AVX5128IBase;
}
}
multiclass avx512_perm_t_mb<bits<8> opc, string OpcodeStr,
X86VectorVTInfo _, X86VectorVTInfo IdxVT> {
- let Constraints = "$src1 = $dst" in
+ let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in
defm rmb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins IdxVT.RC:$src2, _.ScalarMemOp:$src3),
OpcodeStr, !strconcat("${src3}", _.BroadcastStr,", $src2"),
!strconcat("$src2, ${src3}", _.BroadcastStr ),
(_.VT (X86VPermt2 _.RC:$src1,
- IdxVT.RC:$src2,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3)))))>,
- AVX5128IBase, EVEX_4V, EVEX_B;
+ IdxVT.RC:$src2,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3))))),
+ 1>, AVX5128IBase, EVEX_4V, EVEX_B;
}
multiclass avx512_perm_t_sizes<bits<8> opc, string OpcodeStr,
@@ -1252,8 +1501,7 @@ defm VPERMT2PD : avx512_perm_t_sizes<0x7F, "vpermt2pd",
// AVX-512 - BLEND using mask
//
multiclass avx512_blendmask<bits<8> opc, string OpcodeStr, X86VectorVTInfo _> {
- let ExeDomain = _.ExeDomain in {
- let hasSideEffects = 0 in
+ let ExeDomain = _.ExeDomain, hasSideEffects = 0 in {
def rr : AVX5128I<opc, MRMSrcReg, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2),
!strconcat(OpcodeStr,
@@ -1263,16 +1511,13 @@ multiclass avx512_blendmask<bits<8> opc, string OpcodeStr, X86VectorVTInfo _> {
(ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2),
!strconcat(OpcodeStr,
"\t{$src2, $src1, ${dst} {${mask}}|${dst} {${mask}}, $src1, $src2}"),
- [(set _.RC:$dst, (vselect _.KRCWM:$mask,
- (_.VT _.RC:$src2),
- (_.VT _.RC:$src1)))]>, EVEX_4V, EVEX_K;
- let hasSideEffects = 0 in
+ []>, EVEX_4V, EVEX_K;
def rrkz : AVX5128I<opc, MRMSrcReg, (outs _.RC:$dst),
(ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2),
!strconcat(OpcodeStr,
"\t{$src2, $src1, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src1, $src2}"),
[]>, EVEX_4V, EVEX_KZ;
- let mayLoad = 1, hasSideEffects = 0 in
+ let mayLoad = 1 in {
def rm : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst),
(ins _.RC:$src1, _.MemOp:$src2),
!strconcat(OpcodeStr,
@@ -1282,38 +1527,32 @@ multiclass avx512_blendmask<bits<8> opc, string OpcodeStr, X86VectorVTInfo _> {
(ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2),
!strconcat(OpcodeStr,
"\t{$src2, $src1, ${dst} {${mask}}|${dst} {${mask}}, $src1, $src2}"),
- [(set _.RC:$dst, (vselect _.KRCWM:$mask,
- (_.VT (bitconvert (_.LdFrag addr:$src2))),
- (_.VT _.RC:$src1)))]>,
- EVEX_4V, EVEX_K, EVEX_CD8<_.EltSize, CD8VF>;
- let mayLoad = 1, hasSideEffects = 0 in
+ []>, EVEX_4V, EVEX_K, EVEX_CD8<_.EltSize, CD8VF>;
def rmkz : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst),
(ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2),
!strconcat(OpcodeStr,
"\t{$src2, $src1, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src1, $src2}"),
[]>, EVEX_4V, EVEX_KZ, EVEX_CD8<_.EltSize, CD8VF>;
}
+ }
}
multiclass avx512_blendmask_rmb<bits<8> opc, string OpcodeStr, X86VectorVTInfo _> {
+ let mayLoad = 1, hasSideEffects = 0 in {
def rmbk : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst),
(ins _.KRCWM:$mask, _.RC:$src1, _.ScalarMemOp:$src2),
!strconcat(OpcodeStr,
"\t{${src2}", _.BroadcastStr, ", $src1, $dst {${mask}}|",
"$dst {${mask}}, $src1, ${src2}", _.BroadcastStr, "}"),
- [(set _.RC:$dst,(vselect _.KRCWM:$mask,
- (X86VBroadcast (_.ScalarLdFrag addr:$src2)),
- (_.VT _.RC:$src1)))]>,
- EVEX_4V, EVEX_K, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>;
+ []>, EVEX_4V, EVEX_K, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>;
- let mayLoad = 1, hasSideEffects = 0 in
def rmb : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst),
(ins _.RC:$src1, _.ScalarMemOp:$src2),
!strconcat(OpcodeStr,
"\t{${src2}", _.BroadcastStr, ", $src1, $dst|",
"$dst, $src1, ${src2}", _.BroadcastStr, "}"),
[]>, EVEX_4V, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>;
-
+ }
}
multiclass blendmask_dq <bits<8> opc, string OpcodeStr,
@@ -1349,21 +1588,6 @@ defm VPBLENDMB : blendmask_bw <0x66, "vpblendmb", avx512vl_i8_info>;
defm VPBLENDMW : blendmask_bw <0x66, "vpblendmw", avx512vl_i16_info>, VEX_W;
-let Predicates = [HasAVX512, NoVLX] in {
-def : Pat<(v8f32 (vselect (v8i1 VK8WM:$mask), (v8f32 VR256X:$src1),
- (v8f32 VR256X:$src2))),
- (EXTRACT_SUBREG
- (v16f32 (VBLENDMPSZrrk (COPY_TO_REGCLASS VK8WM:$mask, VK16WM),
- (v16f32 (SUBREG_TO_REG (i32 0), VR256X:$src2, sub_ymm)),
- (v16f32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)))), sub_ymm)>;
-
-def : Pat<(v8i32 (vselect (v8i1 VK8WM:$mask), (v8i32 VR256X:$src1),
- (v8i32 VR256X:$src2))),
- (EXTRACT_SUBREG
- (v16i32 (VPBLENDMDZrrk (COPY_TO_REGCLASS VK8WM:$mask, VK16WM),
- (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src2, sub_ymm)),
- (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)))), sub_ymm)>;
-}
//===----------------------------------------------------------------------===//
// Compare Instructions
//===----------------------------------------------------------------------===//
@@ -1421,6 +1645,7 @@ multiclass avx512_cmp_scalar<X86VectorVTInfo _, SDNode OpNode, SDNode OpNodeRnd>
}// let isAsmParserOnly = 1, hasSideEffects = 0
let isCodeGenOnly = 1 in {
+ let isCommutable = 1 in
def rr : AVX512Ii8<0xC2, MRMSrcReg,
(outs _.KRC:$dst), (ins _.FRC:$src1, _.FRC:$src2, AVXCC:$cc),
!strconcat("vcmp${cc}", _.Suffix,
@@ -1449,7 +1674,8 @@ let Predicates = [HasAVX512] in {
}
multiclass avx512_icmp_packed<bits<8> opc, string OpcodeStr, SDNode OpNode,
- X86VectorVTInfo _> {
+ X86VectorVTInfo _, bit IsCommutable> {
+ let isCommutable = IsCommutable in
def rr : AVX512BI<opc, MRMSrcReg,
(outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
@@ -1480,8 +1706,8 @@ multiclass avx512_icmp_packed<bits<8> opc, string OpcodeStr, SDNode OpNode,
}
multiclass avx512_icmp_packed_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode,
- X86VectorVTInfo _> :
- avx512_icmp_packed<opc, OpcodeStr, OpNode, _> {
+ X86VectorVTInfo _, bit IsCommutable> :
+ avx512_icmp_packed<opc, OpcodeStr, OpNode, _, IsCommutable> {
def rmb : AVX512BI<opc, MRMSrcMem,
(outs _.KRC:$dst), (ins _.RC:$src1, _.ScalarMemOp:$src2),
!strconcat(OpcodeStr, "\t{${src2}", _.BroadcastStr, ", $src1, $dst",
@@ -1503,48 +1729,49 @@ multiclass avx512_icmp_packed_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode,
}
multiclass avx512_icmp_packed_vl<bits<8> opc, string OpcodeStr, SDNode OpNode,
- AVX512VLVectorVTInfo VTInfo, Predicate prd> {
+ AVX512VLVectorVTInfo VTInfo, Predicate prd,
+ bit IsCommutable = 0> {
let Predicates = [prd] in
- defm Z : avx512_icmp_packed<opc, OpcodeStr, OpNode, VTInfo.info512>,
- EVEX_V512;
+ defm Z : avx512_icmp_packed<opc, OpcodeStr, OpNode, VTInfo.info512,
+ IsCommutable>, EVEX_V512;
let Predicates = [prd, HasVLX] in {
- defm Z256 : avx512_icmp_packed<opc, OpcodeStr, OpNode, VTInfo.info256>,
- EVEX_V256;
- defm Z128 : avx512_icmp_packed<opc, OpcodeStr, OpNode, VTInfo.info128>,
- EVEX_V128;
+ defm Z256 : avx512_icmp_packed<opc, OpcodeStr, OpNode, VTInfo.info256,
+ IsCommutable>, EVEX_V256;
+ defm Z128 : avx512_icmp_packed<opc, OpcodeStr, OpNode, VTInfo.info128,
+ IsCommutable>, EVEX_V128;
}
}
multiclass avx512_icmp_packed_rmb_vl<bits<8> opc, string OpcodeStr,
SDNode OpNode, AVX512VLVectorVTInfo VTInfo,
- Predicate prd> {
+ Predicate prd, bit IsCommutable = 0> {
let Predicates = [prd] in
- defm Z : avx512_icmp_packed_rmb<opc, OpcodeStr, OpNode, VTInfo.info512>,
- EVEX_V512;
+ defm Z : avx512_icmp_packed_rmb<opc, OpcodeStr, OpNode, VTInfo.info512,
+ IsCommutable>, EVEX_V512;
let Predicates = [prd, HasVLX] in {
- defm Z256 : avx512_icmp_packed_rmb<opc, OpcodeStr, OpNode, VTInfo.info256>,
- EVEX_V256;
- defm Z128 : avx512_icmp_packed_rmb<opc, OpcodeStr, OpNode, VTInfo.info128>,
- EVEX_V128;
+ defm Z256 : avx512_icmp_packed_rmb<opc, OpcodeStr, OpNode, VTInfo.info256,
+ IsCommutable>, EVEX_V256;
+ defm Z128 : avx512_icmp_packed_rmb<opc, OpcodeStr, OpNode, VTInfo.info128,
+ IsCommutable>, EVEX_V128;
}
}
defm VPCMPEQB : avx512_icmp_packed_vl<0x74, "vpcmpeqb", X86pcmpeqm,
- avx512vl_i8_info, HasBWI>,
+ avx512vl_i8_info, HasBWI, 1>,
EVEX_CD8<8, CD8VF>;
defm VPCMPEQW : avx512_icmp_packed_vl<0x75, "vpcmpeqw", X86pcmpeqm,
- avx512vl_i16_info, HasBWI>,
+ avx512vl_i16_info, HasBWI, 1>,
EVEX_CD8<16, CD8VF>;
defm VPCMPEQD : avx512_icmp_packed_rmb_vl<0x76, "vpcmpeqd", X86pcmpeqm,
- avx512vl_i32_info, HasAVX512>,
+ avx512vl_i32_info, HasAVX512, 1>,
EVEX_CD8<32, CD8VF>;
defm VPCMPEQQ : avx512_icmp_packed_rmb_vl<0x29, "vpcmpeqq", X86pcmpeqm,
- avx512vl_i64_info, HasAVX512>,
+ avx512vl_i64_info, HasAVX512, 1>,
T8PD, VEX_W, EVEX_CD8<64, CD8VF>;
defm VPCMPGTB : avx512_icmp_packed_vl<0x64, "vpcmpgtb", X86pcmpgtm,
@@ -1563,18 +1790,21 @@ defm VPCMPGTQ : avx512_icmp_packed_rmb_vl<0x37, "vpcmpgtq", X86pcmpgtm,
avx512vl_i64_info, HasAVX512>,
T8PD, VEX_W, EVEX_CD8<64, CD8VF>;
+let Predicates = [HasAVX512, NoVLX] in {
def : Pat<(v8i1 (X86pcmpgtm (v8i32 VR256X:$src1), (v8i32 VR256X:$src2))),
(COPY_TO_REGCLASS (VPCMPGTDZrr
- (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)),
- (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src2, sub_ymm))), VK8)>;
+ (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
+ (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm))), VK8)>;
def : Pat<(v8i1 (X86pcmpeqm (v8i32 VR256X:$src1), (v8i32 VR256X:$src2))),
(COPY_TO_REGCLASS (VPCMPEQDZrr
- (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)),
- (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src2, sub_ymm))), VK8)>;
+ (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
+ (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm))), VK8)>;
+}
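Both the new commutable flag on the equality compares and the widened NoVLX patterns above serve the same user-visible operation: an integer compare that produces a k-register mask. A short sketch using AVX-512F intrinsics (the popcount builtin assumes GCC or Clang; the function is illustrative):

    #include <immintrin.h>

    /* Compare 16 dword lanes for equality and count the matches.
       The compare selects vpcmpeqd with a mask-register destination. */
    static int count_equal_lanes(__m512i a, __m512i b) {
        __mmask16 k = _mm512_cmpeq_epi32_mask(a, b);
        return __builtin_popcount((unsigned)k);
    }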
multiclass avx512_icmp_cc<bits<8> opc, string Suffix, SDNode OpNode,
X86VectorVTInfo _> {
+ let isCommutable = 1 in
def rri : AVX512AIi8<opc, MRMSrcReg,
(outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2, AVX512ICC:$cc),
!strconcat("vpcmp${cc}", Suffix,
@@ -1740,7 +1970,7 @@ multiclass avx512_vcmp_common<X86VectorVTInfo _> {
"$src2, $src1", "$src1, $src2",
(X86cmpm (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
- imm:$cc)>;
+ imm:$cc), 1>;
defm rmi : AVX512_maskable_cmp<0xC2, MRMSrcMem, _,
(outs _.KRC:$dst),(ins _.RC:$src1, _.MemOp:$src2, AVXCC:$cc),
@@ -1824,18 +2054,18 @@ defm VCMPPS : avx512_vcmp<avx512vl_f32_info>,
def : Pat<(v8i1 (X86cmpm (v8f32 VR256X:$src1), (v8f32 VR256X:$src2), imm:$cc)),
(COPY_TO_REGCLASS (VCMPPSZrri
- (v16f32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)),
- (v16f32 (SUBREG_TO_REG (i32 0), VR256X:$src2, sub_ymm)),
+ (v16f32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
+ (v16f32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm)),
imm:$cc), VK8)>;
def : Pat<(v8i1 (X86cmpm (v8i32 VR256X:$src1), (v8i32 VR256X:$src2), imm:$cc)),
(COPY_TO_REGCLASS (VPCMPDZrri
- (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)),
- (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src2, sub_ymm)),
+ (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
+ (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm)),
imm:$cc), VK8)>;
def : Pat<(v8i1 (X86cmpmu (v8i32 VR256X:$src1), (v8i32 VR256X:$src2), imm:$cc)),
(COPY_TO_REGCLASS (VPCMPUDZrri
- (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)),
- (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src2, sub_ymm)),
+ (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
+ (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm)),
imm:$cc), VK8)>;
// ----------------------------------------------------------------
@@ -2011,34 +2241,38 @@ let Predicates = [HasBWI] in {
}
// GR from/to mask register
-let Predicates = [HasDQI] in {
- def : Pat<(v8i1 (bitconvert (i8 GR8:$src))),
- (KMOVBkr (SUBREG_TO_REG (i32 0), GR8:$src, sub_8bit))>;
- def : Pat<(i8 (bitconvert (v8i1 VK8:$src))),
- (EXTRACT_SUBREG (KMOVBrk VK8:$src), sub_8bit)>;
- def : Pat<(i32 (zext (i8 (bitconvert (v8i1 VK8:$src))))),
- (KMOVBrk VK8:$src)>;
- def : Pat<(i32 (anyext (i8 (bitconvert (v8i1 VK8:$src))))),
- (KMOVBrk VK8:$src)>;
-}
-let Predicates = [HasAVX512] in {
- def : Pat<(v16i1 (bitconvert (i16 GR16:$src))),
- (KMOVWkr (SUBREG_TO_REG (i32 0), GR16:$src, sub_16bit))>;
- def : Pat<(i16 (bitconvert (v16i1 VK16:$src))),
- (EXTRACT_SUBREG (KMOVWrk VK16:$src), sub_16bit)>;
- def : Pat<(i32 (zext (i16 (bitconvert (v16i1 VK16:$src))))),
- (KMOVWrk VK16:$src)>;
- def : Pat<(i32 (anyext (i16 (bitconvert (v16i1 VK16:$src))))),
- (KMOVWrk VK16:$src)>;
-}
-let Predicates = [HasBWI] in {
- def : Pat<(v32i1 (bitconvert (i32 GR32:$src))), (KMOVDkr GR32:$src)>;
- def : Pat<(i32 (bitconvert (v32i1 VK32:$src))), (KMOVDrk VK32:$src)>;
-}
-let Predicates = [HasBWI] in {
- def : Pat<(v64i1 (bitconvert (i64 GR64:$src))), (KMOVQkr GR64:$src)>;
- def : Pat<(i64 (bitconvert (v64i1 VK64:$src))), (KMOVQrk VK64:$src)>;
-}
+def : Pat<(v16i1 (bitconvert (i16 GR16:$src))),
+ (COPY_TO_REGCLASS GR16:$src, VK16)>;
+def : Pat<(i16 (bitconvert (v16i1 VK16:$src))),
+ (COPY_TO_REGCLASS VK16:$src, GR16)>;
+
+def : Pat<(v8i1 (bitconvert (i8 GR8:$src))),
+ (COPY_TO_REGCLASS GR8:$src, VK8)>;
+def : Pat<(i8 (bitconvert (v8i1 VK8:$src))),
+ (COPY_TO_REGCLASS VK8:$src, GR8)>;
+
+def : Pat<(i32 (zext (i16 (bitconvert (v16i1 VK16:$src))))),
+ (KMOVWrk VK16:$src)>;
+def : Pat<(i32 (anyext (i16 (bitconvert (v16i1 VK16:$src))))),
+ (i32 (INSERT_SUBREG (IMPLICIT_DEF),
+ (i16 (COPY_TO_REGCLASS VK16:$src, GR16)), sub_16bit))>;
+
+def : Pat<(i32 (zext (i8 (bitconvert (v8i1 VK8:$src))))),
+ (MOVZX32rr8 (COPY_TO_REGCLASS VK8:$src, GR8))>, Requires<[NoDQI]>;
+def : Pat<(i32 (zext (i8 (bitconvert (v8i1 VK8:$src))))),
+ (KMOVBrk VK8:$src)>, Requires<[HasDQI]>;
+def : Pat<(i32 (anyext (i8 (bitconvert (v8i1 VK8:$src))))),
+ (i32 (INSERT_SUBREG (IMPLICIT_DEF),
+ (i8 (COPY_TO_REGCLASS VK8:$src, GR8)), sub_8bit))>;
+
+def : Pat<(v32i1 (bitconvert (i32 GR32:$src))),
+ (COPY_TO_REGCLASS GR32:$src, VK32)>;
+def : Pat<(i32 (bitconvert (v32i1 VK32:$src))),
+ (COPY_TO_REGCLASS VK32:$src, GR32)>;
+def : Pat<(v64i1 (bitconvert (i64 GR64:$src))),
+ (COPY_TO_REGCLASS GR64:$src, VK64)>;
+def : Pat<(i64 (bitconvert (v64i1 VK64:$src))),
+ (COPY_TO_REGCLASS VK64:$src, GR64)>;
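The rewritten bitconverts let a mask value move between k-registers and general-purpose registers through ordinary register-class copies, so the allocator can emit kmovw only where it is actually needed. At source level this is roughly what the AVX-512F mask conversion intrinsics expose (a sketch; the helper name is illustrative):

    #include <immintrin.h>

    /* Turn a compare mask into a plain integer bitset (kmovw %k, %r32). */
    static unsigned mask_bits(__m512i a, __m512i b) {
        __mmask16 k = _mm512_cmpeq_epi32_mask(a, b);
        return (unsigned)_mm512_mask2int(k);
    }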
// Load/store kreg
let Predicates = [HasDQI] in {
@@ -2104,65 +2338,58 @@ let Predicates = [HasBWI] in {
(KMOVQkm addr:$src)>;
}
-def assertzext_i1 : PatFrag<(ops node:$src), (assertzext node:$src), [{
- return cast<VTSDNode>(N->getOperand(1))->getVT() == MVT::i1;
-}]>;
-
let Predicates = [HasAVX512] in {
def : Pat<(i1 (trunc (i64 GR64:$src))),
- (COPY_TO_REGCLASS (i16 (EXTRACT_SUBREG (AND64ri8 $src, (i64 1)),
- sub_16bit)), VK1)>;
-
- def : Pat<(i1 (trunc (i64 (assertzext_i1 GR64:$src)))),
- (COPY_TO_REGCLASS (i16 (EXTRACT_SUBREG $src, sub_16bit)), VK1)>;
+ (COPY_TO_REGCLASS (KMOVWkr (AND32ri8 (EXTRACT_SUBREG $src, sub_32bit),
+ (i32 1))), VK1)>;
def : Pat<(i1 (trunc (i32 GR32:$src))),
- (COPY_TO_REGCLASS (i16 (EXTRACT_SUBREG (AND32ri8 $src, (i32 1)),
- sub_16bit)), VK1)>;
+ (COPY_TO_REGCLASS (KMOVWkr (AND32ri8 $src, (i32 1))), VK1)>;
def : Pat<(i1 (trunc (i32 (assertzext_i1 GR32:$src)))),
- (COPY_TO_REGCLASS (i16 (EXTRACT_SUBREG $src, sub_16bit)), VK1)>;
+ (COPY_TO_REGCLASS GR32:$src, VK1)>;
def : Pat<(i1 (trunc (i8 GR8:$src))),
- (COPY_TO_REGCLASS (i16 (SUBREG_TO_REG (i64 0), (AND8ri $src, (i8 1)),
- sub_8bit)), VK1)>;
-
- def : Pat<(i1 (trunc (i8 (assertzext_i1 GR8:$src)))),
- (COPY_TO_REGCLASS (i16 (SUBREG_TO_REG (i64 0), $src, sub_8bit)), VK1)>;
+ (COPY_TO_REGCLASS
+ (KMOVWkr (AND32ri8 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
+ GR8:$src, sub_8bit), (i32 1))),
+ VK1)>;
def : Pat<(i1 (trunc (i16 GR16:$src))),
- (COPY_TO_REGCLASS (AND16ri GR16:$src, (i16 1)), VK1)>;
-
- def : Pat<(i1 (trunc (i16 (assertzext_i1 GR16:$src)))),
- (COPY_TO_REGCLASS $src, VK1)>;
+ (COPY_TO_REGCLASS
+ (KMOVWkr (AND32ri8 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
+ GR16:$src, sub_16bit), (i32 1))),
+ VK1)>;
def : Pat<(i32 (zext VK1:$src)),
- (i32 (SUBREG_TO_REG (i64 0), (i16 (COPY_TO_REGCLASS $src, GR16)),
- sub_16bit))>;
+ (AND32ri8 (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)), (i32 1))>;
def : Pat<(i32 (anyext VK1:$src)),
- (i32 (SUBREG_TO_REG (i64 0), (i16 (COPY_TO_REGCLASS $src, GR16)),
- sub_16bit))>;
+ (COPY_TO_REGCLASS VK1:$src, GR32)>;
def : Pat<(i8 (zext VK1:$src)),
- (i8 (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS VK1:$src, GR16)), sub_8bit))>;
+ (EXTRACT_SUBREG
+ (AND32ri8 (KMOVWrk
+ (COPY_TO_REGCLASS VK1:$src, VK16)), (i32 1)), sub_8bit)>;
def : Pat<(i8 (anyext VK1:$src)),
- (i8 (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS $src, GR16)), sub_8bit))>;
+ (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS VK1:$src, GR32)), sub_8bit)>;
def : Pat<(i64 (zext VK1:$src)),
- (i64 (SUBREG_TO_REG (i64 0), (i16 (COPY_TO_REGCLASS $src, GR16)),
- sub_16bit))>;
+ (AND64ri8 (SUBREG_TO_REG (i64 0),
+ (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)), sub_32bit), (i64 1))>;
def : Pat<(i64 (anyext VK1:$src)),
- (i64 (SUBREG_TO_REG (i64 0), (i16 (COPY_TO_REGCLASS $src, GR16)),
- sub_16bit))>;
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
+ (i32 (COPY_TO_REGCLASS VK1:$src, GR32)), sub_32bit)>;
def : Pat<(i16 (zext VK1:$src)),
- (COPY_TO_REGCLASS $src, GR16)>;
+ (EXTRACT_SUBREG
+ (AND32ri8 (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)), (i32 1)),
+ sub_16bit)>;
def : Pat<(i16 (anyext VK1:$src)),
- (i16 (COPY_TO_REGCLASS $src, GR16))>;
+ (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS VK1:$src, GR32)), sub_16bit)>;
}
def : Pat<(v16i1 (scalar_to_vector VK1:$src)),
(COPY_TO_REGCLASS VK1:$src, VK16)>;
@@ -2181,34 +2408,12 @@ def : Pat<(store (i1 -1), addr:$dst), (MOV8mi addr:$dst, (i8 1))>;
def : Pat<(store (i1 1), addr:$dst), (MOV8mi addr:$dst, (i8 1))>;
def : Pat<(store (i1 0), addr:$dst), (MOV8mi addr:$dst, (i8 0))>;
-// With AVX-512 only, 8-bit mask is promoted to 16-bit mask.
-let Predicates = [HasAVX512, NoDQI] in {
- // GR from/to 8-bit mask without native support
- def : Pat<(v8i1 (bitconvert (i8 GR8:$src))),
- (COPY_TO_REGCLASS
- (KMOVWkr (SUBREG_TO_REG (i32 0), GR8:$src, sub_8bit)), VK8)>;
- def : Pat<(i8 (bitconvert (v8i1 VK8:$src))),
- (EXTRACT_SUBREG
- (KMOVWrk (COPY_TO_REGCLASS VK8:$src, VK16)),
- sub_8bit)>;
- def : Pat<(i32 (zext (i8 (bitconvert (v8i1 VK8:$src))))),
- (KMOVWrk (COPY_TO_REGCLASS VK8:$src, VK16))>;
- def : Pat<(i32 (anyext (i8 (bitconvert (v8i1 VK8:$src))))),
- (KMOVWrk (COPY_TO_REGCLASS VK8:$src, VK16))>;
-}
-
-let Predicates = [HasAVX512] in {
- def : Pat<(i1 (X86Vextract VK16:$src, (iPTR 0))),
- (COPY_TO_REGCLASS VK16:$src, VK1)>;
- def : Pat<(i1 (X86Vextract VK8:$src, (iPTR 0))),
- (COPY_TO_REGCLASS VK8:$src, VK1)>;
-}
-let Predicates = [HasBWI] in {
- def : Pat<(i1 (X86Vextract VK32:$src, (iPTR 0))),
- (COPY_TO_REGCLASS VK32:$src, VK1)>;
- def : Pat<(i1 (X86Vextract VK64:$src, (iPTR 0))),
- (COPY_TO_REGCLASS VK64:$src, VK1)>;
-}
+def : Pat<(i1 (X86Vextract VK64:$src, (iPTR 0))), (COPY_TO_REGCLASS VK64:$src, VK1)>;
+def : Pat<(i1 (X86Vextract VK32:$src, (iPTR 0))), (COPY_TO_REGCLASS VK32:$src, VK1)>;
+def : Pat<(i1 (X86Vextract VK16:$src, (iPTR 0))), (COPY_TO_REGCLASS VK16:$src, VK1)>;
+def : Pat<(i1 (X86Vextract VK8:$src, (iPTR 0))), (COPY_TO_REGCLASS VK8:$src, VK1)>;
+def : Pat<(i1 (X86Vextract VK4:$src, (iPTR 0))), (COPY_TO_REGCLASS VK4:$src, VK1)>;
+def : Pat<(i1 (X86Vextract VK2:$src, (iPTR 0))), (COPY_TO_REGCLASS VK2:$src, VK1)>;
// Mask unary operation
// - KNOT
@@ -2233,7 +2438,7 @@ multiclass avx512_mask_unop_all<bits<8> opc, string OpcodeStr,
HasBWI>, VEX, PS, VEX_W;
}
-defm KNOT : avx512_mask_unop_all<0x44, "knot", not>;
+defm KNOT : avx512_mask_unop_all<0x44, "knot", vnot>;
multiclass avx512_mask_unop_int<string IntName, string InstName> {
let Predicates = [HasAVX512] in
@@ -2244,27 +2449,15 @@ multiclass avx512_mask_unop_int<string IntName, string InstName> {
}
defm : avx512_mask_unop_int<"knot", "KNOT">;
-let Predicates = [HasDQI] in
-def : Pat<(xor VK8:$src1, (v8i1 immAllOnesV)), (KNOTBrr VK8:$src1)>;
-let Predicates = [HasAVX512] in
-def : Pat<(xor VK16:$src1, (v16i1 immAllOnesV)), (KNOTWrr VK16:$src1)>;
-let Predicates = [HasBWI] in
-def : Pat<(xor VK32:$src1, (v32i1 immAllOnesV)), (KNOTDrr VK32:$src1)>;
-let Predicates = [HasBWI] in
-def : Pat<(xor VK64:$src1, (v64i1 immAllOnesV)), (KNOTQrr VK64:$src1)>;
-
// KNL does not support KMOVB, 8-bit mask is promoted to 16-bit
-let Predicates = [HasAVX512, NoDQI] in {
-def : Pat<(xor VK8:$src1, (v8i1 immAllOnesV)),
- (COPY_TO_REGCLASS (KNOTWrr (COPY_TO_REGCLASS VK8:$src1, VK16)), VK8)>;
-def : Pat<(not VK8:$src),
- (COPY_TO_REGCLASS
- (KNOTWrr (COPY_TO_REGCLASS VK8:$src, VK16)), VK8)>;
-}
-def : Pat<(xor VK4:$src1, (v4i1 immAllOnesV)),
- (COPY_TO_REGCLASS (KNOTWrr (COPY_TO_REGCLASS VK4:$src1, VK16)), VK4)>;
-def : Pat<(xor VK2:$src1, (v2i1 immAllOnesV)),
- (COPY_TO_REGCLASS (KNOTWrr (COPY_TO_REGCLASS VK2:$src1, VK16)), VK2)>;
+let Predicates = [HasAVX512, NoDQI] in
+def : Pat<(vnot VK8:$src),
+ (COPY_TO_REGCLASS (KNOTWrr (COPY_TO_REGCLASS VK8:$src, VK16)), VK8)>;
+
+def : Pat<(vnot VK4:$src),
+ (COPY_TO_REGCLASS (KNOTWrr (COPY_TO_REGCLASS VK4:$src, VK16)), VK4)>;
+def : Pat<(vnot VK2:$src),
+ (COPY_TO_REGCLASS (KNOTWrr (COPY_TO_REGCLASS VK2:$src, VK16)), VK2)>;
// Mask binary operation
// - KAND, KANDN, KOR, KXNOR, KXOR
@@ -2293,13 +2486,16 @@ multiclass avx512_mask_binop_all<bits<8> opc, string OpcodeStr,
def andn : PatFrag<(ops node:$i0, node:$i1), (and (not node:$i0), node:$i1)>;
def xnor : PatFrag<(ops node:$i0, node:$i1), (not (xor node:$i0, node:$i1))>;
+// These nodes use 'vnot' instead of 'not' to support vectors.
+def vandn : PatFrag<(ops node:$i0, node:$i1), (and (vnot node:$i0), node:$i1)>;
+def vxnor : PatFrag<(ops node:$i0, node:$i1), (vnot (xor node:$i0, node:$i1))>;
-defm KAND : avx512_mask_binop_all<0x41, "kand", and, 1>;
-defm KOR : avx512_mask_binop_all<0x45, "kor", or, 1>;
-defm KXNOR : avx512_mask_binop_all<0x46, "kxnor", xnor, 1>;
-defm KXOR : avx512_mask_binop_all<0x47, "kxor", xor, 1>;
-defm KANDN : avx512_mask_binop_all<0x42, "kandn", andn, 0>;
-defm KADD : avx512_mask_binop_all<0x4A, "kadd", add, 1, HasDQI>;
+defm KAND : avx512_mask_binop_all<0x41, "kand", and, 1>;
+defm KOR : avx512_mask_binop_all<0x45, "kor", or, 1>;
+defm KXNOR : avx512_mask_binop_all<0x46, "kxnor", vxnor, 1>;
+defm KXOR : avx512_mask_binop_all<0x47, "kxor", xor, 1>;
+defm KANDN : avx512_mask_binop_all<0x42, "kandn", vandn, 0>;
+defm KADD : avx512_mask_binop_all<0x4A, "kadd", add, 1, HasDQI>;
multiclass avx512_mask_binop_int<string IntName, string InstName> {
let Predicates = [HasAVX512] in
@@ -2316,11 +2512,12 @@ defm : avx512_mask_binop_int<"kor", "KOR">;
defm : avx512_mask_binop_int<"kxnor", "KXNOR">;
defm : avx512_mask_binop_int<"kxor", "KXOR">;
-multiclass avx512_binop_pat<SDPatternOperator OpNode, Instruction Inst> {
+multiclass avx512_binop_pat<SDPatternOperator VOpNode, SDPatternOperator OpNode,
+ Instruction Inst> {
// With AVX512F, 8-bit mask is promoted to 16-bit mask,
// for the DQI set, this type is legal and KxxxB instruction is used
let Predicates = [NoDQI] in
- def : Pat<(OpNode VK8:$src1, VK8:$src2),
+ def : Pat<(VOpNode VK8:$src1, VK8:$src2),
(COPY_TO_REGCLASS
(Inst (COPY_TO_REGCLASS VK8:$src1, VK16),
(COPY_TO_REGCLASS VK8:$src2, VK16)), VK8)>;
@@ -2330,47 +2527,21 @@ multiclass avx512_binop_pat<SDPatternOperator OpNode, Instruction Inst> {
(COPY_TO_REGCLASS (Inst
(COPY_TO_REGCLASS VK1:$src1, VK16),
(COPY_TO_REGCLASS VK1:$src2, VK16)), VK1)>;
- def : Pat<(OpNode VK2:$src1, VK2:$src2),
+ def : Pat<(VOpNode VK2:$src1, VK2:$src2),
(COPY_TO_REGCLASS (Inst
(COPY_TO_REGCLASS VK2:$src1, VK16),
(COPY_TO_REGCLASS VK2:$src2, VK16)), VK1)>;
- def : Pat<(OpNode VK4:$src1, VK4:$src2),
+ def : Pat<(VOpNode VK4:$src1, VK4:$src2),
(COPY_TO_REGCLASS (Inst
(COPY_TO_REGCLASS VK4:$src1, VK16),
(COPY_TO_REGCLASS VK4:$src2, VK16)), VK1)>;
}
-defm : avx512_binop_pat<and, KANDWrr>;
-defm : avx512_binop_pat<andn, KANDNWrr>;
-defm : avx512_binop_pat<or, KORWrr>;
-defm : avx512_binop_pat<xnor, KXNORWrr>;
-defm : avx512_binop_pat<xor, KXORWrr>;
-
-def : Pat<(xor (xor VK16:$src1, VK16:$src2), (v16i1 immAllOnesV)),
- (KXNORWrr VK16:$src1, VK16:$src2)>;
-def : Pat<(xor (xor VK8:$src1, VK8:$src2), (v8i1 immAllOnesV)),
- (KXNORBrr VK8:$src1, VK8:$src2)>, Requires<[HasDQI]>;
-def : Pat<(xor (xor VK32:$src1, VK32:$src2), (v32i1 immAllOnesV)),
- (KXNORDrr VK32:$src1, VK32:$src2)>, Requires<[HasBWI]>;
-def : Pat<(xor (xor VK64:$src1, VK64:$src2), (v64i1 immAllOnesV)),
- (KXNORQrr VK64:$src1, VK64:$src2)>, Requires<[HasBWI]>;
-
-let Predicates = [NoDQI] in
-def : Pat<(xor (xor VK8:$src1, VK8:$src2), (v8i1 immAllOnesV)),
- (COPY_TO_REGCLASS (KXNORWrr (COPY_TO_REGCLASS VK8:$src1, VK16),
- (COPY_TO_REGCLASS VK8:$src2, VK16)), VK8)>;
-
-def : Pat<(xor (xor VK4:$src1, VK4:$src2), (v4i1 immAllOnesV)),
- (COPY_TO_REGCLASS (KXNORWrr (COPY_TO_REGCLASS VK4:$src1, VK16),
- (COPY_TO_REGCLASS VK4:$src2, VK16)), VK4)>;
-
-def : Pat<(xor (xor VK2:$src1, VK2:$src2), (v2i1 immAllOnesV)),
- (COPY_TO_REGCLASS (KXNORWrr (COPY_TO_REGCLASS VK2:$src1, VK16),
- (COPY_TO_REGCLASS VK2:$src2, VK16)), VK2)>;
-
-def : Pat<(xor (xor VK1:$src1, VK1:$src2), (i1 1)),
- (COPY_TO_REGCLASS (KXNORWrr (COPY_TO_REGCLASS VK1:$src1, VK16),
- (COPY_TO_REGCLASS VK1:$src2, VK16)), VK1)>;
+defm : avx512_binop_pat<and, and, KANDWrr>;
+defm : avx512_binop_pat<vandn, andn, KANDNWrr>;
+defm : avx512_binop_pat<or, or, KORWrr>;
+defm : avx512_binop_pat<vxnor, xnor, KXNORWrr>;
+defm : avx512_binop_pat<xor, xor, KXORWrr>;
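The vnot/vandn/vxnor PatFrags keep the narrow mask types (v8i1 and smaller) flowing into the same KANDW/KXNORW family by promoting through VK16. At source level these are the plain k-register logic operations; a brief AVX-512F sketch (illustrative helper, not taken from the patch):

    #include <immintrin.h>

    /* Combine two compare masks with k-register logic (kandw, knotw, korw). */
    static __mmask16 keep_or_flip(__mmask16 eq_ab, __mmask16 eq_cd) {
        __mmask16 both = _mm512_kand(eq_ab, eq_cd);
        return _mm512_kor(both, _mm512_knot(eq_cd));
    }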
// Mask unpacking
multiclass avx512_mask_unpck<string Suffix,RegisterClass KRC, ValueType VT,
@@ -2466,6 +2637,8 @@ defm KSET1 : avx512_mask_setop_w<immAllOnesV>;
// With AVX-512 only, 8-bit mask is promoted to 16-bit mask.
let Predicates = [HasAVX512] in {
def : Pat<(v8i1 immAllZerosV), (COPY_TO_REGCLASS (KSET0W), VK8)>;
+ def : Pat<(v4i1 immAllZerosV), (COPY_TO_REGCLASS (KSET0W), VK4)>;
+ def : Pat<(v2i1 immAllZerosV), (COPY_TO_REGCLASS (KSET0W), VK2)>;
def : Pat<(v8i1 immAllOnesV), (COPY_TO_REGCLASS (KSET1W), VK8)>;
def : Pat<(v4i1 immAllOnesV), (COPY_TO_REGCLASS (KSET1W), VK4)>;
def : Pat<(v2i1 immAllOnesV), (COPY_TO_REGCLASS (KSET1W), VK2)>;
@@ -2519,15 +2692,24 @@ def : Pat<(v16i1 (extract_subvector (v32i1 VK32:$src), (iPTR 16))),
def : Pat<(v32i1 (extract_subvector (v64i1 VK64:$src), (iPTR 32))),
(v32i1 (COPY_TO_REGCLASS (KSHIFTRQri VK64:$src, (i8 32)), VK32))>;
-def : Pat<(v8i1 (X86vshli VK8:$src, (i8 imm:$imm))),
- (v8i1 (COPY_TO_REGCLASS
- (KSHIFTLWri (COPY_TO_REGCLASS VK8:$src, VK16),
- (I8Imm $imm)), VK8))>, Requires<[HasAVX512, NoDQI]>;
-def : Pat<(v4i1 (X86vshli VK4:$src, (i8 imm:$imm))),
- (v4i1 (COPY_TO_REGCLASS
- (KSHIFTLWri (COPY_TO_REGCLASS VK4:$src, VK16),
- (I8Imm $imm)), VK4))>, Requires<[HasAVX512]>;
+// Patterns for kmask shift
+multiclass mask_shift_lowering<RegisterClass RC, ValueType VT> {
+ def : Pat<(VT (X86vshli RC:$src, (i8 imm:$imm))),
+ (VT (COPY_TO_REGCLASS
+ (KSHIFTLWri (COPY_TO_REGCLASS RC:$src, VK16),
+ (I8Imm $imm)),
+ RC))>;
+ def : Pat<(VT (X86vsrli RC:$src, (i8 imm:$imm))),
+ (VT (COPY_TO_REGCLASS
+ (KSHIFTRWri (COPY_TO_REGCLASS RC:$src, VK16),
+ (I8Imm $imm)),
+ RC))>;
+}
+
+defm : mask_shift_lowering<VK8, v8i1>, Requires<[HasAVX512, NoDQI]>;
+defm : mask_shift_lowering<VK4, v4i1>, Requires<[HasAVX512]>;
+defm : mask_shift_lowering<VK2, v2i1>, Requires<[HasAVX512]>;
//===----------------------------------------------------------------------===//
// AVX-512 - Aligned and unaligned load and store
//
@@ -2535,7 +2717,6 @@ def : Pat<(v4i1 (X86vshli VK4:$src, (i8 imm:$imm))),
multiclass avx512_load<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
PatFrag ld_frag, PatFrag mload,
- bit IsReMaterializable = 1,
SDPatternOperator SelectOprr = vselect> {
let hasSideEffects = 0 in {
def rr : AVX512PI<opc, MRMSrcReg, (outs _.RC:$dst), (ins _.RC:$src),
@@ -2545,12 +2726,12 @@ multiclass avx512_load<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
(ins _.KRCWM:$mask, _.RC:$src),
!strconcat(OpcodeStr, "\t{$src, ${dst} {${mask}} {z}|",
"${dst} {${mask}} {z}, $src}"),
- [(set _.RC:$dst, (_.VT (vselect _.KRCWM:$mask,
+ [(set _.RC:$dst, (_.VT (SelectOprr _.KRCWM:$mask,
(_.VT _.RC:$src),
_.ImmAllZerosV)))], _.ExeDomain>,
EVEX, EVEX_KZ;
- let canFoldAsLoad = 1, isReMaterializable = IsReMaterializable,
+ let canFoldAsLoad = 1, isReMaterializable = 1,
SchedRW = [WriteLoad] in
def rm : AVX512PI<opc, MRMSrcMem, (outs _.RC:$dst), (ins _.MemOp:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
@@ -2598,37 +2779,32 @@ multiclass avx512_load<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
multiclass avx512_alignedload_vl<bits<8> opc, string OpcodeStr,
AVX512VLVectorVTInfo _,
- Predicate prd,
- bit IsReMaterializable = 1> {
+ Predicate prd> {
let Predicates = [prd] in
defm Z : avx512_load<opc, OpcodeStr, _.info512, _.info512.AlignedLdFrag,
- masked_load_aligned512, IsReMaterializable>, EVEX_V512;
+ masked_load_aligned512>, EVEX_V512;
let Predicates = [prd, HasVLX] in {
defm Z256 : avx512_load<opc, OpcodeStr, _.info256, _.info256.AlignedLdFrag,
- masked_load_aligned256, IsReMaterializable>, EVEX_V256;
+ masked_load_aligned256>, EVEX_V256;
defm Z128 : avx512_load<opc, OpcodeStr, _.info128, _.info128.AlignedLdFrag,
- masked_load_aligned128, IsReMaterializable>, EVEX_V128;
+ masked_load_aligned128>, EVEX_V128;
}
}
multiclass avx512_load_vl<bits<8> opc, string OpcodeStr,
AVX512VLVectorVTInfo _,
Predicate prd,
- bit IsReMaterializable = 1,
SDPatternOperator SelectOprr = vselect> {
let Predicates = [prd] in
defm Z : avx512_load<opc, OpcodeStr, _.info512, _.info512.LdFrag,
- masked_load_unaligned, IsReMaterializable,
- SelectOprr>, EVEX_V512;
+ masked_load_unaligned, SelectOprr>, EVEX_V512;
let Predicates = [prd, HasVLX] in {
defm Z256 : avx512_load<opc, OpcodeStr, _.info256, _.info256.LdFrag,
- masked_load_unaligned, IsReMaterializable,
- SelectOprr>, EVEX_V256;
+ masked_load_unaligned, SelectOprr>, EVEX_V256;
defm Z128 : avx512_load<opc, OpcodeStr, _.info128, _.info128.LdFrag,
- masked_load_unaligned, IsReMaterializable,
- SelectOprr>, EVEX_V128;
+ masked_load_unaligned, SelectOprr>, EVEX_V128;
}
}
@@ -2704,11 +2880,11 @@ defm VMOVAPD : avx512_alignedload_vl<0x28, "vmovapd", avx512vl_f64_info,
HasAVX512>, PD, VEX_W, EVEX_CD8<64, CD8VF>;
defm VMOVUPS : avx512_load_vl<0x10, "vmovups", avx512vl_f32_info, HasAVX512,
- 1, null_frag>,
+ null_frag>,
avx512_store_vl<0x11, "vmovups", avx512vl_f32_info, HasAVX512>,
PS, EVEX_CD8<32, CD8VF>;
-defm VMOVUPD : avx512_load_vl<0x10, "vmovupd", avx512vl_f64_info, HasAVX512, 0,
+defm VMOVUPD : avx512_load_vl<0x10, "vmovupd", avx512vl_f64_info, HasAVX512,
null_frag>,
avx512_store_vl<0x11, "vmovupd", avx512vl_f64_info, HasAVX512>,
PD, VEX_W, EVEX_CD8<64, CD8VF>;
@@ -2732,15 +2908,41 @@ defm VMOVDQU16 : avx512_load_vl<0x6F, "vmovdqu16", avx512vl_i16_info, HasBWI>,
HasBWI>, XD, VEX_W, EVEX_CD8<16, CD8VF>;
defm VMOVDQU32 : avx512_load_vl<0x6F, "vmovdqu32", avx512vl_i32_info, HasAVX512,
- 1, null_frag>,
+ null_frag>,
avx512_store_vl<0x7F, "vmovdqu32", avx512vl_i32_info,
HasAVX512>, XS, EVEX_CD8<32, CD8VF>;
defm VMOVDQU64 : avx512_load_vl<0x6F, "vmovdqu64", avx512vl_i64_info, HasAVX512,
- 1, null_frag>,
+ null_frag>,
avx512_store_vl<0x7F, "vmovdqu64", avx512vl_i64_info,
HasAVX512>, XS, VEX_W, EVEX_CD8<64, CD8VF>;
+// Special instructions to help with spilling when we don't have VLX. We need
+// to load or store from a ZMM register instead. These are converted in
+// expandPostRAPseudos.
+let isReMaterializable = 1, canFoldAsLoad = 1,
+ isPseudo = 1, SchedRW = [WriteLoad], mayLoad = 1, hasSideEffects = 0 in {
+def VMOVAPSZ128rm_NOVLX : I<0, Pseudo, (outs VR128X:$dst), (ins f128mem:$src),
+ "", []>;
+def VMOVAPSZ256rm_NOVLX : I<0, Pseudo, (outs VR256X:$dst), (ins f256mem:$src),
+ "", []>;
+def VMOVUPSZ128rm_NOVLX : I<0, Pseudo, (outs VR128X:$dst), (ins f128mem:$src),
+ "", []>;
+def VMOVUPSZ256rm_NOVLX : I<0, Pseudo, (outs VR256X:$dst), (ins f256mem:$src),
+ "", []>;
+}
+
+let isPseudo = 1, mayStore = 1, hasSideEffects = 0 in {
+def VMOVAPSZ128mr_NOVLX : I<0, Pseudo, (outs), (ins f128mem:$dst, VR128X:$src),
+ "", []>;
+def VMOVAPSZ256mr_NOVLX : I<0, Pseudo, (outs), (ins f256mem:$dst, VR256X:$src),
+ "", []>;
+def VMOVUPSZ128mr_NOVLX : I<0, Pseudo, (outs), (ins f128mem:$dst, VR128X:$src),
+ "", []>;
+def VMOVUPSZ256mr_NOVLX : I<0, Pseudo, (outs), (ins f256mem:$dst, VR256X:$src),
+ "", []>;
+}
+
def : Pat<(v8i64 (vselect VK8WM:$mask, (bc_v8i64 (v16i32 immAllZerosV)),
(v8i64 VR512:$src))),
(VMOVDQA64Zrrkz (COPY_TO_REGCLASS (KNOTWrr (COPY_TO_REGCLASS VK8:$mask, VK16)),
@@ -2761,6 +2963,52 @@ def : Pat<(v16i32 (vselect (xor VK16:$mask, (v16i1 immAllOnesV)),
(v16i32 VR512:$src))),
(VMOVDQA32Zrrkz VK16WM:$mask, VR512:$src)>;
+// Patterns for handling v8i1 selects of 256-bit vectors when VLX isn't
+// available. Use a 512-bit operation and extract.
+let Predicates = [HasAVX512, NoVLX] in {
+def : Pat<(v8f32 (vselect (v8i1 VK8WM:$mask), (v8f32 VR256X:$src1),
+ (v8f32 VR256X:$src0))),
+ (EXTRACT_SUBREG
+ (v16f32
+ (VMOVAPSZrrk
+ (v16f32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src0, sub_ymm)),
+ (COPY_TO_REGCLASS VK8WM:$mask, VK16WM),
+ (v16f32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)))),
+ sub_ymm)>;
+
+def : Pat<(v8i32 (vselect (v8i1 VK8WM:$mask), (v8i32 VR256X:$src1),
+ (v8i32 VR256X:$src0))),
+ (EXTRACT_SUBREG
+ (v16i32
+ (VMOVDQA32Zrrk
+ (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src0, sub_ymm)),
+ (COPY_TO_REGCLASS VK8WM:$mask, VK16WM),
+ (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)))),
+ sub_ymm)>;
+}
+
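The widened selects above boil down to a masked register-to-register move on the full ZMM width (VMOVAPSZrrk / VMOVDQA32Zrrk), which is also what the masked-move intrinsic produces directly. A minimal sketch, assuming AVX-512F without VL:

    #include <immintrin.h>

    /* Per-lane select: lanes whose mask bit is set come from if_true,
       the rest from if_false (vmovaps %zmm, %zmm {%k}). */
    static __m512 select_ps(__mmask16 k, __m512 if_true, __m512 if_false) {
        return _mm512_mask_mov_ps(if_false, k, if_true);
    }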
+let Predicates = [HasVLX, NoBWI] in {
+ // 128-bit load/store without BWI.
+ def : Pat<(alignedstore (v8i16 VR128X:$src), addr:$dst),
+ (VMOVDQA32Z128mr addr:$dst, VR128X:$src)>;
+ def : Pat<(alignedstore (v16i8 VR128X:$src), addr:$dst),
+ (VMOVDQA32Z128mr addr:$dst, VR128X:$src)>;
+ def : Pat<(store (v8i16 VR128X:$src), addr:$dst),
+ (VMOVDQU32Z128mr addr:$dst, VR128X:$src)>;
+ def : Pat<(store (v16i8 VR128X:$src), addr:$dst),
+ (VMOVDQU32Z128mr addr:$dst, VR128X:$src)>;
+
+ // 256-bit load/store without BWI.
+ def : Pat<(alignedstore256 (v16i16 VR256X:$src), addr:$dst),
+ (VMOVDQA32Z256mr addr:$dst, VR256X:$src)>;
+ def : Pat<(alignedstore256 (v32i8 VR256X:$src), addr:$dst),
+ (VMOVDQA32Z256mr addr:$dst, VR256X:$src)>;
+ def : Pat<(store (v16i16 VR256X:$src), addr:$dst),
+ (VMOVDQU32Z256mr addr:$dst, VR256X:$src)>;
+ def : Pat<(store (v32i8 VR256X:$src), addr:$dst),
+ (VMOVDQU32Z256mr addr:$dst, VR256X:$src)>;
+}
+
let Predicates = [HasVLX] in {
// Special patterns for storing subvector extracts of lower 128-bits of 256.
// Its cheaper to just use VMOVAPS/VMOVUPS instead of VEXTRACTF128mr
@@ -2844,23 +3092,23 @@ let Predicates = [HasVLX] in {
// Special patterns for storing subvector extracts of lower 256-bits of 512.
// Its cheaper to just use VMOVAPS/VMOVUPS instead of VEXTRACTF128mr
- def : Pat<(alignedstore (v4f64 (extract_subvector
- (v8f64 VR512:$src), (iPTR 0))), addr:$dst),
+ def : Pat<(alignedstore256 (v4f64 (extract_subvector
+ (v8f64 VR512:$src), (iPTR 0))), addr:$dst),
(VMOVAPDZ256mr addr:$dst, (v4f64 (EXTRACT_SUBREG VR512:$src,sub_ymm)))>;
def : Pat<(alignedstore (v8f32 (extract_subvector
(v16f32 VR512:$src), (iPTR 0))), addr:$dst),
(VMOVAPSZ256mr addr:$dst, (v8f32 (EXTRACT_SUBREG VR512:$src,sub_ymm)))>;
- def : Pat<(alignedstore (v4i64 (extract_subvector
- (v8i64 VR512:$src), (iPTR 0))), addr:$dst),
+ def : Pat<(alignedstore256 (v4i64 (extract_subvector
+ (v8i64 VR512:$src), (iPTR 0))), addr:$dst),
(VMOVDQA64Z256mr addr:$dst, (v4i64 (EXTRACT_SUBREG VR512:$src,sub_ymm)))>;
- def : Pat<(alignedstore (v8i32 (extract_subvector
- (v16i32 VR512:$src), (iPTR 0))), addr:$dst),
+ def : Pat<(alignedstore256 (v8i32 (extract_subvector
+ (v16i32 VR512:$src), (iPTR 0))), addr:$dst),
(VMOVDQA32Z256mr addr:$dst, (v8i32 (EXTRACT_SUBREG VR512:$src,sub_ymm)))>;
- def : Pat<(alignedstore (v16i16 (extract_subvector
- (v32i16 VR512:$src), (iPTR 0))), addr:$dst),
+ def : Pat<(alignedstore256 (v16i16 (extract_subvector
+ (v32i16 VR512:$src), (iPTR 0))), addr:$dst),
(VMOVDQA32Z256mr addr:$dst, (v16i16 (EXTRACT_SUBREG VR512:$src,sub_ymm)))>;
- def : Pat<(alignedstore (v32i8 (extract_subvector
- (v64i8 VR512:$src), (iPTR 0))), addr:$dst),
+ def : Pat<(alignedstore256 (v32i8 (extract_subvector
+ (v64i8 VR512:$src), (iPTR 0))), addr:$dst),
(VMOVDQA32Z256mr addr:$dst, (v32i8 (EXTRACT_SUBREG VR512:$src,sub_ymm)))>;
def : Pat<(store (v4f64 (extract_subvector
@@ -2886,6 +3134,7 @@ let Predicates = [HasVLX] in {
// Move Int Doubleword to Packed Double Int
//
+let ExeDomain = SSEPackedInt in {
def VMOVDI2PDIZrr : AVX512BI<0x6E, MRMSrcReg, (outs VR128X:$dst), (ins GR32:$src),
"vmovd\t{$src, $dst|$dst, $src}",
[(set VR128X:$dst,
@@ -2921,10 +3170,11 @@ def VMOVSDto64Zmr : AVX512BI<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64X:$
IIC_SSE_MOVDQ>, EVEX, VEX_W, Sched<[WriteStore]>,
EVEX_CD8<64, CD8VT1>;
}
+} // ExeDomain = SSEPackedInt
// Move Int Doubleword to Single Scalar
//
-let isCodeGenOnly = 1 in {
+let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
def VMOVDI2SSZrr : AVX512BI<0x6E, MRMSrcReg, (outs FR32X:$dst), (ins GR32:$src),
"vmovd\t{$src, $dst|$dst, $src}",
[(set FR32X:$dst, (bitconvert GR32:$src))],
@@ -2934,10 +3184,11 @@ def VMOVDI2SSZrm : AVX512BI<0x6E, MRMSrcMem, (outs FR32X:$dst), (ins i32mem:$sr
"vmovd\t{$src, $dst|$dst, $src}",
[(set FR32X:$dst, (bitconvert (loadi32 addr:$src)))],
IIC_SSE_MOVDQ>, EVEX, EVEX_CD8<32, CD8VT1>;
-}
+} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1
// Move doubleword from xmm register to r/m32
//
+let ExeDomain = SSEPackedInt in {
def VMOVPDI2DIZrr : AVX512BI<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128X:$src),
"vmovd\t{$src, $dst|$dst, $src}",
[(set GR32:$dst, (extractelt (v4i32 VR128X:$src),
@@ -2949,9 +3200,11 @@ def VMOVPDI2DIZmr : AVX512BI<0x7E, MRMDestMem, (outs),
[(store (i32 (extractelt (v4i32 VR128X:$src),
(iPTR 0))), addr:$dst)], IIC_SSE_MOVDQ>,
EVEX, EVEX_CD8<32, CD8VT1>;
+} // ExeDomain = SSEPackedInt
// Move quadword from xmm1 register to r/m64
//
+let ExeDomain = SSEPackedInt in {
def VMOVPQIto64Zrr : I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128X:$src),
"vmovq\t{$src, $dst|$dst, $src}",
[(set GR64:$dst, (extractelt (v2i64 VR128X:$src),
@@ -2978,10 +3231,11 @@ def VMOVPQI2QIZrr : AVX512BI<0xD6, MRMDestReg, (outs VR128X:$dst),
(ins VR128X:$src),
"vmovq.s\t{$src, $dst|$dst, $src}",[]>,
EVEX, VEX_W;
+} // ExeDomain = SSEPackedInt
// Move Scalar Single to Double Int
//
-let isCodeGenOnly = 1 in {
+let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
def VMOVSS2DIZrr : AVX512BI<0x7E, MRMDestReg, (outs GR32:$dst),
(ins FR32X:$src),
"vmovd\t{$src, $dst|$dst, $src}",
@@ -2992,54 +3246,71 @@ def VMOVSS2DIZmr : AVX512BI<0x7E, MRMDestMem, (outs),
"vmovd\t{$src, $dst|$dst, $src}",
[(store (i32 (bitconvert FR32X:$src)), addr:$dst)],
IIC_SSE_MOVDQ>, EVEX, EVEX_CD8<32, CD8VT1>;
-}
+} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1
// Move Quadword Int to Packed Quadword Int
//
+let ExeDomain = SSEPackedInt in {
def VMOVQI2PQIZrm : AVX512XSI<0x7E, MRMSrcMem, (outs VR128X:$dst),
(ins i64mem:$src),
"vmovq\t{$src, $dst|$dst, $src}",
[(set VR128X:$dst,
(v2i64 (scalar_to_vector (loadi64 addr:$src))))]>,
EVEX, VEX_W, EVEX_CD8<8, CD8VT8>;
+} // ExeDomain = SSEPackedInt
//===----------------------------------------------------------------------===//
// AVX-512 MOVSS, MOVSD
//===----------------------------------------------------------------------===//
-multiclass avx512_move_scalar <string asm, SDNode OpNode,
+multiclass avx512_move_scalar<string asm, SDNode OpNode,
X86VectorVTInfo _> {
- defm rr_Int : AVX512_maskable_scalar<0x10, MRMSrcReg, _, (outs _.RC:$dst),
- (ins _.RC:$src1, _.RC:$src2),
- asm, "$src2, $src1","$src1, $src2",
- (_.VT (OpNode (_.VT _.RC:$src1),
- (_.VT _.RC:$src2))),
- IIC_SSE_MOV_S_RR>, EVEX_4V;
- let Constraints = "$src1 = $dst" in
- defm rm_Int : AVX512_maskable_3src_scalar<0x10, MRMSrcMem, _,
- (outs _.RC:$dst),
- (ins _.ScalarMemOp:$src),
- asm,"$src","$src",
- (_.VT (OpNode (_.VT _.RC:$src1),
- (_.VT (scalar_to_vector
- (_.ScalarLdFrag addr:$src)))))>, EVEX;
- let isCodeGenOnly = 1 in {
- def rr : AVX512PI<0x10, MRMSrcReg, (outs _.RC:$dst),
- (ins _.RC:$src1, _.FRC:$src2),
- !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set _.RC:$dst, (_.VT (OpNode _.RC:$src1,
- (scalar_to_vector _.FRC:$src2))))],
- _.ExeDomain,IIC_SSE_MOV_S_RR>, EVEX_4V;
- def rm : AVX512PI<0x10, MRMSrcMem, (outs _.FRC:$dst), (ins _.ScalarMemOp:$src),
- !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
- [(set _.FRC:$dst, (_.ScalarLdFrag addr:$src))],
- _.ExeDomain, IIC_SSE_MOV_S_RM>, EVEX;
+ def rr : AVX512PI<0x10, MRMSrcReg, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.FRC:$src2),
+ !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set _.RC:$dst, (_.VT (OpNode _.RC:$src1,
+ (scalar_to_vector _.FRC:$src2))))],
+ _.ExeDomain,IIC_SSE_MOV_S_RR>, EVEX_4V;
+ def rrkz : AVX512PI<0x10, MRMSrcReg, (outs _.RC:$dst),
+ (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2),
+ !strconcat(asm, "\t{$src2, $src1, $dst {${mask}} {z}|",
+ "$dst {${mask}} {z}, $src1, $src2}"),
+ [(set _.RC:$dst, (_.VT (X86selects _.KRCWM:$mask,
+ (_.VT (OpNode _.RC:$src1, _.RC:$src2)),
+ _.ImmAllZerosV)))],
+ _.ExeDomain,IIC_SSE_MOV_S_RR>, EVEX_4V, EVEX_KZ;
+ let Constraints = "$src0 = $dst" in
+ def rrk : AVX512PI<0x10, MRMSrcReg, (outs _.RC:$dst),
+ (ins _.RC:$src0, _.KRCWM:$mask, _.RC:$src1, _.RC:$src2),
+ !strconcat(asm, "\t{$src2, $src1, $dst {${mask}}|",
+ "$dst {${mask}}, $src1, $src2}"),
+ [(set _.RC:$dst, (_.VT (X86selects _.KRCWM:$mask,
+ (_.VT (OpNode _.RC:$src1, _.RC:$src2)),
+ (_.VT _.RC:$src0))))],
+ _.ExeDomain,IIC_SSE_MOV_S_RR>, EVEX_4V, EVEX_K;
+ let canFoldAsLoad = 1, isReMaterializable = 1 in
+ def rm : AVX512PI<0x10, MRMSrcMem, (outs _.FRC:$dst), (ins _.ScalarMemOp:$src),
+ !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
+ [(set _.FRC:$dst, (_.ScalarLdFrag addr:$src))],
+ _.ExeDomain, IIC_SSE_MOV_S_RM>, EVEX;
+ let mayLoad = 1, hasSideEffects = 0 in {
+ let Constraints = "$src0 = $dst" in
+ def rmk : AVX512PI<0x10, MRMSrcMem, (outs _.RC:$dst),
+ (ins _.RC:$src0, _.KRCWM:$mask, _.ScalarMemOp:$src),
+ !strconcat(asm, "\t{$src, $dst {${mask}}|",
+ "$dst {${mask}}, $src}"),
+ [], _.ExeDomain, IIC_SSE_MOV_S_RM>, EVEX, EVEX_K;
+ def rmkz : AVX512PI<0x10, MRMSrcMem, (outs _.RC:$dst),
+ (ins _.KRCWM:$mask, _.ScalarMemOp:$src),
+ !strconcat(asm, "\t{$src, $dst {${mask}} {z}|",
+ "$dst {${mask}} {z}, $src}"),
+ [], _.ExeDomain, IIC_SSE_MOV_S_RM>, EVEX, EVEX_KZ;
}
def mr: AVX512PI<0x11, MRMDestMem, (outs), (ins _.ScalarMemOp:$dst, _.FRC:$src),
!strconcat(asm, "\t{$src, $dst|$dst, $src}"),
[(store _.FRC:$src, addr:$dst)], _.ExeDomain, IIC_SSE_MOV_S_MR>,
EVEX;
- let mayStore = 1 in
+ let mayStore = 1, hasSideEffects = 0 in
def mrk: AVX512PI<0x11, MRMDestMem, (outs),
(ins _.ScalarMemOp:$dst, VK1WM:$mask, _.FRC:$src),
!strconcat(asm, "\t{$src, $dst {${mask}}|$dst {${mask}}, $src}"),
@@ -3052,12 +3323,99 @@ defm VMOVSSZ : avx512_move_scalar<"vmovss", X86Movss, f32x_info>,
defm VMOVSDZ : avx512_move_scalar<"vmovsd", X86Movsd, f64x_info>,
VEX_LIG, XD, VEX_W, EVEX_CD8<64, CD8VT1>;
+
+multiclass avx512_move_scalar_lowering<string InstrStr, SDNode OpNode,
+ PatLeaf ZeroFP, X86VectorVTInfo _> {
+
+def : Pat<(_.VT (OpNode _.RC:$src0,
+ (_.VT (scalar_to_vector
+ (_.EltVT (X86selects (i1 (trunc GR32:$mask)),
+ (_.EltVT _.FRC:$src1),
+ (_.EltVT _.FRC:$src2))))))),
+ (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr#rrk)
+ (COPY_TO_REGCLASS _.FRC:$src2, _.RC),
+ (COPY_TO_REGCLASS GR32:$mask, VK1WM),
+ (_.VT _.RC:$src0),
+ (COPY_TO_REGCLASS _.FRC:$src1, _.RC)),
+ _.RC)>;
+
+def : Pat<(_.VT (OpNode _.RC:$src0,
+ (_.VT (scalar_to_vector
+ (_.EltVT (X86selects (i1 (trunc GR32:$mask)),
+ (_.EltVT _.FRC:$src1),
+ (_.EltVT ZeroFP))))))),
+ (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr#rrkz)
+ (COPY_TO_REGCLASS GR32:$mask, VK1WM),
+ (_.VT _.RC:$src0),
+ (COPY_TO_REGCLASS _.FRC:$src1, _.RC)),
+ _.RC)>;
+
+}
+
+multiclass avx512_store_scalar_lowering<string InstrStr, AVX512VLVectorVTInfo _,
+ dag Mask, RegisterClass MaskRC> {
+
+def : Pat<(masked_store addr:$dst, Mask,
+ (_.info512.VT (insert_subvector undef,
+ (_.info256.VT (insert_subvector undef,
+ (_.info128.VT _.info128.RC:$src),
+ (i64 0))),
+ (i64 0)))),
+ (!cast<Instruction>(InstrStr#mrk) addr:$dst,
+ (i1 (COPY_TO_REGCLASS MaskRC:$mask, VK1WM)),
+ (COPY_TO_REGCLASS _.info128.RC:$src, _.info128.FRC))>;
+
+}
+
+multiclass avx512_load_scalar_lowering<string InstrStr, AVX512VLVectorVTInfo _,
+ dag Mask, RegisterClass MaskRC> {
+
+def : Pat<(_.info128.VT (extract_subvector
+ (_.info512.VT (masked_load addr:$srcAddr, Mask,
+ (_.info512.VT (bitconvert
+ (v16i32 immAllZerosV))))),
+ (i64 0))),
+ (!cast<Instruction>(InstrStr#rmkz)
+ (i1 (COPY_TO_REGCLASS MaskRC:$mask, VK1WM)),
+ addr:$srcAddr)>;
+
+def : Pat<(_.info128.VT (extract_subvector
+ (_.info512.VT (masked_load addr:$srcAddr, Mask,
+ (_.info512.VT (insert_subvector undef,
+ (_.info256.VT (insert_subvector undef,
+ (_.info128.VT (X86vzmovl _.info128.RC:$src)),
+ (i64 0))),
+ (i64 0))))),
+ (i64 0))),
+ (!cast<Instruction>(InstrStr#rmk) _.info128.RC:$src,
+ (i1 (COPY_TO_REGCLASS MaskRC:$mask, VK1WM)),
+ addr:$srcAddr)>;
+
+}
+
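// The three multiclasses above only provide selection patterns:
// avx512_move_scalar_lowering maps scalar selects keyed on the low bit of a
// GR32 mask onto the masked register moves (InstrStr#rrk / InstrStr#rrkz),
// while the store/load variants map masked_store/masked_load of the low
// element onto InstrStr#mrk and InstrStr#rmk/rmkz. The instantiations below
// bind them to the VMOVSSZ/VMOVSDZ instructions.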
+defm : avx512_move_scalar_lowering<"VMOVSSZ", X86Movss, fp32imm0, v4f32x_info>;
+defm : avx512_move_scalar_lowering<"VMOVSDZ", X86Movsd, fp64imm0, v2f64x_info>;
+
+defm : avx512_store_scalar_lowering<"VMOVSSZ", avx512vl_f32_info,
+ (v16i1 (bitconvert (i16 (trunc (and GR32:$mask, (i32 1)))))), GR32>;
+defm : avx512_store_scalar_lowering<"VMOVSSZ", avx512vl_f32_info,
+ (v16i1 (bitconvert (i16 (and GR16:$mask, (i16 1))))), GR16>;
+defm : avx512_store_scalar_lowering<"VMOVSDZ", avx512vl_f64_info,
+ (v8i1 (bitconvert (i8 (and GR8:$mask, (i8 1))))), GR8>;
+
+defm : avx512_load_scalar_lowering<"VMOVSSZ", avx512vl_f32_info,
+ (v16i1 (bitconvert (i16 (trunc (and GR32:$mask, (i32 1)))))), GR32>;
+defm : avx512_load_scalar_lowering<"VMOVSSZ", avx512vl_f32_info,
+ (v16i1 (bitconvert (i16 (and GR16:$mask, (i16 1))))), GR16>;
+defm : avx512_load_scalar_lowering<"VMOVSDZ", avx512vl_f64_info,
+ (v8i1 (bitconvert (i8 (and GR8:$mask, (i8 1))))), GR8>;
+
def : Pat<(f32 (X86selects VK1WM:$mask, (f32 FR32X:$src1), (f32 FR32X:$src2))),
- (COPY_TO_REGCLASS (VMOVSSZrr_Intk (COPY_TO_REGCLASS FR32X:$src2, VR128X),
+ (COPY_TO_REGCLASS (VMOVSSZrrk (COPY_TO_REGCLASS FR32X:$src2, VR128X),
VK1WM:$mask, (v4f32 (IMPLICIT_DEF)),(COPY_TO_REGCLASS FR32X:$src1, VR128X)), FR32X)>;
def : Pat<(f64 (X86selects VK1WM:$mask, (f64 FR64X:$src1), (f64 FR64X:$src2))),
- (COPY_TO_REGCLASS (VMOVSDZrr_Intk (COPY_TO_REGCLASS FR64X:$src2, VR128X),
+ (COPY_TO_REGCLASS (VMOVSDZrrk (COPY_TO_REGCLASS FR64X:$src2, VR128X),
VK1WM:$mask, (v2f64 (IMPLICIT_DEF)), (COPY_TO_REGCLASS FR64X:$src1, VR128X)), FR64X)>;
def : Pat<(int_x86_avx512_mask_store_ss addr:$dst, VR128X:$src, GR8:$mask),
@@ -3088,6 +3446,7 @@ let Predicates = [HasAVX512] in {
(VMOVSSZrr (v4i32 (V_SET0)), (COPY_TO_REGCLASS VR128X:$src, FR32X))>;
def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64X:$src)))),
(VMOVSDZrr (v2f64 (V_SET0)), FR64X:$src)>;
+ }
// Move low f32 and clear high bits.
def : Pat<(v8f32 (X86vzmovl (v8f32 VR256X:$src))),
@@ -3097,8 +3456,15 @@ let Predicates = [HasAVX512] in {
def : Pat<(v8i32 (X86vzmovl (v8i32 VR256X:$src))),
(SUBREG_TO_REG (i32 0),
(VMOVSSZrr (v4i32 (V_SET0)),
- (EXTRACT_SUBREG (v8i32 VR256X:$src), sub_xmm)), sub_xmm)>;
- }
+ (EXTRACT_SUBREG (v8i32 VR256X:$src), sub_xmm)), sub_xmm)>;
+ def : Pat<(v16f32 (X86vzmovl (v16f32 VR512:$src))),
+ (SUBREG_TO_REG (i32 0),
+ (VMOVSSZrr (v4f32 (V_SET0)),
+ (EXTRACT_SUBREG (v16f32 VR512:$src), sub_xmm)), sub_xmm)>;
+ def : Pat<(v16i32 (X86vzmovl (v16i32 VR512:$src))),
+ (SUBREG_TO_REG (i32 0),
+ (VMOVSSZrr (v4i32 (V_SET0)),
+ (EXTRACT_SUBREG (v16i32 VR512:$src), sub_xmm)), sub_xmm)>;
let AddedComplexity = 20 in {
// MOVSSrm zeros the high parts of the register; represent this
@@ -3109,6 +3475,8 @@ let Predicates = [HasAVX512] in {
(COPY_TO_REGCLASS (VMOVSSZrm addr:$src), VR128X)>;
def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))),
(COPY_TO_REGCLASS (VMOVSSZrm addr:$src), VR128X)>;
+ def : Pat<(v4f32 (X86vzload addr:$src)),
+ (COPY_TO_REGCLASS (VMOVSSZrm addr:$src), VR128X)>;
// MOVSDrm zeros the high parts of the register; represent this
// with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0
@@ -3131,6 +3499,8 @@ let Predicates = [HasAVX512] in {
def : Pat<(v8f32 (X86vzmovl (insert_subvector undef,
(v4f32 (scalar_to_vector (loadf32 addr:$src))), (iPTR 0)))),
(SUBREG_TO_REG (i32 0), (VMOVSSZrm addr:$src), sub_xmm)>;
+ def : Pat<(v8f32 (X86vzload addr:$src)),
+ (SUBREG_TO_REG (i32 0), (VMOVSSZrm addr:$src), sub_xmm)>;
def : Pat<(v4f64 (X86vzmovl (insert_subvector undef,
(v2f64 (scalar_to_vector (loadf64 addr:$src))), (iPTR 0)))),
(SUBREG_TO_REG (i32 0), (VMOVSDZrm addr:$src), sub_xmm)>;
@@ -3145,6 +3515,8 @@ let Predicates = [HasAVX512] in {
def : Pat<(v16f32 (X86vzmovl (insert_subvector undef,
(v4f32 (scalar_to_vector (loadf32 addr:$src))), (iPTR 0)))),
(SUBREG_TO_REG (i32 0), (VMOVSSZrm addr:$src), sub_xmm)>;
+ def : Pat<(v16f32 (X86vzload addr:$src)),
+ (SUBREG_TO_REG (i32 0), (VMOVSSZrm addr:$src), sub_xmm)>;
def : Pat<(v8f64 (X86vzmovl (insert_subvector undef,
(v2f64 (scalar_to_vector (loadf64 addr:$src))), (iPTR 0)))),
(SUBREG_TO_REG (i32 0), (VMOVSDZrm addr:$src), sub_xmm)>;
@@ -3168,10 +3540,17 @@ let Predicates = [HasAVX512] in {
(SUBREG_TO_REG (i32 0),
(VMOVSDZrr (v2f64 (V_SET0)),
(EXTRACT_SUBREG (v4f64 VR256X:$src), sub_xmm)), sub_xmm)>;
+ def : Pat<(v8f64 (X86vzmovl (v8f64 VR512:$src))),
+ (SUBREG_TO_REG (i32 0),
+ (VMOVSDZrr (v2f64 (V_SET0)),
+ (EXTRACT_SUBREG (v8f64 VR512:$src), sub_xmm)), sub_xmm)>;
def : Pat<(v4i64 (X86vzmovl (v4i64 VR256X:$src))),
(SUBREG_TO_REG (i32 0), (VMOVSDZrr (v2i64 (V_SET0)),
(EXTRACT_SUBREG (v4i64 VR256X:$src), sub_xmm)), sub_xmm)>;
+ def : Pat<(v8i64 (X86vzmovl (v8i64 VR512:$src))),
+ (SUBREG_TO_REG (i32 0), (VMOVSDZrr (v2i64 (V_SET0)),
+ (EXTRACT_SUBREG (v8i64 VR512:$src), sub_xmm)), sub_xmm)>;
// Extract and store.
def : Pat<(store (f32 (extractelt (v4f32 VR128X:$src), (iPTR 0))),
@@ -3238,15 +3617,6 @@ def VMOVZPQILo2PQIZrr : AVX512XSI<0x7E, MRMSrcReg, (outs VR128X:$dst),
(v2i64 VR128X:$src))))],
IIC_SSE_MOVQ_RR>, EVEX, VEX_W;
-let AddedComplexity = 20 , isCodeGenOnly = 1 in
-def VMOVZPQILo2PQIZrm : AVX512XSI<0x7E, MRMSrcMem, (outs VR128X:$dst),
- (ins i128mem:$src),
- "vmovq\t{$src, $dst|$dst, $src}",
- [(set VR128X:$dst, (v2i64 (X86vzmovl
- (loadv2i64 addr:$src))))],
- IIC_SSE_MOVDQ>, EVEX, VEX_W,
- EVEX_CD8<8, CD8VT8>;
-
let Predicates = [HasAVX512] in {
let AddedComplexity = 15 in {
def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))),
@@ -3258,34 +3628,46 @@ let Predicates = [HasAVX512] in {
def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
(v2i64 (scalar_to_vector GR64:$src)),(iPTR 0)))),
(SUBREG_TO_REG (i64 0), (VMOV64toPQIZrr GR64:$src), sub_xmm)>;
+
+ def : Pat<(v8i64 (X86vzmovl (insert_subvector undef,
+ (v2i64 (scalar_to_vector GR64:$src)),(iPTR 0)))),
+ (SUBREG_TO_REG (i64 0), (VMOV64toPQIZrr GR64:$src), sub_xmm)>;
}
// AVX 128-bit movd/movq instructions write zeros in the high 128-bit part.
let AddedComplexity = 20 in {
def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
(VMOVDI2PDIZrm addr:$src)>;
-
def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv4f32 addr:$src)))),
(VMOVDI2PDIZrm addr:$src)>;
def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))),
(VMOVDI2PDIZrm addr:$src)>;
+ def : Pat<(v4i32 (X86vzload addr:$src)),
+ (VMOVDI2PDIZrm addr:$src)>;
+ def : Pat<(v8i32 (X86vzload addr:$src)),
+ (SUBREG_TO_REG (i32 0), (VMOVDI2PDIZrm addr:$src), sub_xmm)>;
def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))),
- (VMOVZPQILo2PQIZrm addr:$src)>;
+ (VMOVQI2PQIZrm addr:$src)>;
def : Pat<(v2f64 (X86vzmovl (v2f64 VR128X:$src))),
- (VMOVZPQILo2PQIZrr VR128X:$src)>;
+ (VMOVZPQILo2PQIZrr VR128X:$src)>;
def : Pat<(v2i64 (X86vzload addr:$src)),
- (VMOVZPQILo2PQIZrm addr:$src)>;
+ (VMOVQI2PQIZrm addr:$src)>;
def : Pat<(v4i64 (X86vzload addr:$src)),
- (SUBREG_TO_REG (i64 0), (VMOVZPQILo2PQIZrm addr:$src), sub_xmm)>;
+ (SUBREG_TO_REG (i64 0), (VMOVQI2PQIZrm addr:$src), sub_xmm)>;
}
// Use regular 128-bit instructions to match 256-bit scalar_to_vec+zext.
def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
(v4i32 (scalar_to_vector GR32:$src)),(iPTR 0)))),
(SUBREG_TO_REG (i32 0), (VMOVDI2PDIZrr GR32:$src), sub_xmm)>;
+ def : Pat<(v16i32 (X86vzmovl (insert_subvector undef,
+ (v4i32 (scalar_to_vector GR32:$src)),(iPTR 0)))),
+ (SUBREG_TO_REG (i32 0), (VMOVDI2PDIZrr GR32:$src), sub_xmm)>;
// Use regular 128-bit instructions to match 512-bit scalar_to_vec+zext.
+ def : Pat<(v16i32 (X86vzload addr:$src)),
+ (SUBREG_TO_REG (i32 0), (VMOVDI2PDIZrm addr:$src), sub_xmm)>;
def : Pat<(v8i64 (X86vzload addr:$src)),
- (SUBREG_TO_REG (i64 0), (VMOVZPQILo2PQIZrm addr:$src), sub_xmm)>;
+ (SUBREG_TO_REG (i64 0), (VMOVQI2PQIZrm addr:$src), sub_xmm)>;
}
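// Taken together with the removal of VMOVZPQILo2PQIZrm above, the
// zero-extending vmovq load patterns now select the plain VMOVQI2PQIZrm, and
// the new 512-bit (v16i32/v8i64) cases reuse the 128-bit movd/movq through
// SUBREG_TO_REG, relying on the EVEX forms zeroing the upper bits of the
// destination zmm register.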
def : Pat<(v16i32 (X86Vinsert (v16i32 immAllZerosV), GR32:$src2, (iPTR 0))),
@@ -3366,11 +3748,11 @@ let Predicates = [HasAVX512], AddedComplexity = 400 in {
(VMOVNTDQAZrm addr:$src)>;
def : Pat<(v8i64 (alignednontemporalload addr:$src)),
(VMOVNTDQAZrm addr:$src)>;
- def : Pat<(v16i32 (alignednontemporalload addr:$src)),
+ def : Pat<(v16i32 (bitconvert (v8i64 (alignednontemporalload addr:$src)))),
(VMOVNTDQAZrm addr:$src)>;
- def : Pat<(v32i16 (alignednontemporalload addr:$src)),
+ def : Pat<(v32i16 (bitconvert (v8i64 (alignednontemporalload addr:$src)))),
(VMOVNTDQAZrm addr:$src)>;
- def : Pat<(v64i8 (alignednontemporalload addr:$src)),
+ def : Pat<(v64i8 (bitconvert (v8i64 (alignednontemporalload addr:$src)))),
(VMOVNTDQAZrm addr:$src)>;
}
@@ -3388,11 +3770,11 @@ let Predicates = [HasVLX], AddedComplexity = 400 in {
(VMOVNTDQAZ256rm addr:$src)>;
def : Pat<(v4i64 (alignednontemporalload addr:$src)),
(VMOVNTDQAZ256rm addr:$src)>;
- def : Pat<(v8i32 (alignednontemporalload addr:$src)),
+ def : Pat<(v8i32 (bitconvert (v4i64 (alignednontemporalload addr:$src)))),
(VMOVNTDQAZ256rm addr:$src)>;
- def : Pat<(v16i16 (alignednontemporalload addr:$src)),
+ def : Pat<(v16i16 (bitconvert (v4i64 (alignednontemporalload addr:$src)))),
(VMOVNTDQAZ256rm addr:$src)>;
- def : Pat<(v32i8 (alignednontemporalload addr:$src)),
+ def : Pat<(v32i8 (bitconvert (v4i64 (alignednontemporalload addr:$src)))),
(VMOVNTDQAZ256rm addr:$src)>;
def : Pat<(alignednontemporalstore (v4i32 VR128X:$src), addr:$dst),
@@ -3408,11 +3790,11 @@ let Predicates = [HasVLX], AddedComplexity = 400 in {
(VMOVNTDQAZ128rm addr:$src)>;
def : Pat<(v2i64 (alignednontemporalload addr:$src)),
(VMOVNTDQAZ128rm addr:$src)>;
- def : Pat<(v4i32 (alignednontemporalload addr:$src)),
+ def : Pat<(v4i32 (bitconvert (v2i64 (alignednontemporalload addr:$src)))),
(VMOVNTDQAZ128rm addr:$src)>;
- def : Pat<(v8i16 (alignednontemporalload addr:$src)),
+ def : Pat<(v8i16 (bitconvert (v2i64 (alignednontemporalload addr:$src)))),
(VMOVNTDQAZ128rm addr:$src)>;
- def : Pat<(v16i8 (alignednontemporalload addr:$src)),
+ def : Pat<(v16i8 (bitconvert (v2i64 (alignednontemporalload addr:$src)))),
(VMOVNTDQAZ128rm addr:$src)>;
}
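// The integer nontemporal-load patterns above now expect the load wrapped in a
// bitconvert from the i64 vector type rather than a direct load of each type,
// presumably matching how these loads are canonicalized before instruction
// selection.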
@@ -3563,10 +3945,10 @@ multiclass avx512_binop_rm2<bits<8> opc, string OpcodeStr, OpndItins itins,
AVX512BIBase, EVEX_4V;
defm rmb : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst),
- (ins _Src.RC:$src1, _Dst.ScalarMemOp:$src2),
+ (ins _Src.RC:$src1, _Brdct.ScalarMemOp:$src2),
OpcodeStr,
"${src2}"##_Brdct.BroadcastStr##", $src1",
- "$src1, ${src2}"##_Dst.BroadcastStr,
+ "$src1, ${src2}"##_Brdct.BroadcastStr,
(_Dst.VT (OpNode (_Src.VT _Src.RC:$src1), (bitconvert
(_Brdct.VT (X86VBroadcast
(_Brdct.ScalarLdFrag addr:$src2)))))),
@@ -3646,13 +4028,14 @@ multiclass avx512_packs_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode,
multiclass avx512_packs_rm<bits<8> opc, string OpcodeStr,
SDNode OpNode,X86VectorVTInfo _Src,
- X86VectorVTInfo _Dst> {
+ X86VectorVTInfo _Dst, bit IsCommutable = 0> {
defm rr : AVX512_maskable<opc, MRMSrcReg, _Dst, (outs _Dst.RC:$dst),
(ins _Src.RC:$src1, _Src.RC:$src2), OpcodeStr,
"$src2, $src1","$src1, $src2",
(_Dst.VT (OpNode
(_Src.VT _Src.RC:$src1),
- (_Src.VT _Src.RC:$src2)))>,
+ (_Src.VT _Src.RC:$src2))),
+ NoItinerary, IsCommutable>,
EVEX_CD8<_Src.EltSize, CD8VF>, EVEX_4V;
defm rm : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst),
(ins _Src.RC:$src1, _Src.MemOp:$src2), OpcodeStr,
@@ -3695,15 +4078,15 @@ multiclass avx512_packs_all_i16_i8<bits<8> opc, string OpcodeStr,
multiclass avx512_vpmadd<bits<8> opc, string OpcodeStr,
SDNode OpNode, AVX512VLVectorVTInfo _Src,
- AVX512VLVectorVTInfo _Dst> {
+ AVX512VLVectorVTInfo _Dst, bit IsCommutable = 0> {
let Predicates = [HasBWI] in
defm NAME#Z : avx512_packs_rm<opc, OpcodeStr, OpNode, _Src.info512,
- _Dst.info512>, EVEX_V512;
+ _Dst.info512, IsCommutable>, EVEX_V512;
let Predicates = [HasBWI, HasVLX] in {
defm NAME#Z256 : avx512_packs_rm<opc, OpcodeStr, OpNode, _Src.info256,
- _Dst.info256>, EVEX_V256;
+ _Dst.info256, IsCommutable>, EVEX_V256;
defm NAME#Z128 : avx512_packs_rm<opc, OpcodeStr, OpNode, _Src.info128,
- _Dst.info128>, EVEX_V128;
+ _Dst.info128, IsCommutable>, EVEX_V128;
}
}
@@ -3715,7 +4098,7 @@ defm VPACKUSWB : avx512_packs_all_i16_i8 <0x67, "vpackuswb", X86Packus>, AVX512B
defm VPMADDUBSW : avx512_vpmadd<0x04, "vpmaddubsw", X86vpmaddubsw,
avx512vl_i8_info, avx512vl_i16_info>, AVX512BIBase, T8PD;
defm VPMADDWD : avx512_vpmadd<0xF5, "vpmaddwd", X86vpmaddwd,
- avx512vl_i16_info, avx512vl_i32_info>, AVX512BIBase;
+ avx512vl_i16_info, avx512vl_i32_info, 1>, AVX512BIBase;
defm VPMAXSB : avx512_binop_rm_vl_b<0x3C, "vpmaxsb", smax,
SSE_INTALU_ITINS_P, HasBWI, 1>, T8PD;
@@ -3744,17 +4127,119 @@ defm VPMINUW : avx512_binop_rm_vl_w<0x3A, "vpminuw", umin,
SSE_INTALU_ITINS_P, HasBWI, 1>, T8PD;
defm VPMINU : avx512_binop_rm_vl_dq<0x3B, 0x3B, "vpminu", umin,
SSE_INTALU_ITINS_P, HasAVX512, 1>, T8PD;
+
+// PMULLQ: Use the 512-bit version to implement the 128/256-bit forms when VLX is not available.
+let Predicates = [HasDQI, NoVLX] in {
+ def : Pat<(v4i64 (mul (v4i64 VR256X:$src1), (v4i64 VR256X:$src2))),
+ (EXTRACT_SUBREG
+ (VPMULLQZrr
+ (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src1, sub_ymm),
+ (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src2, sub_ymm)),
+ sub_ymm)>;
+
+ def : Pat<(v2i64 (mul (v2i64 VR128X:$src1), (v2i64 VR128X:$src2))),
+ (EXTRACT_SUBREG
+ (VPMULLQZrr
+ (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src1, sub_xmm),
+ (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src2, sub_xmm)),
+ sub_xmm)>;
+}
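// In effect a 128/256-bit vpmullq is widened when VLX is unavailable: the
// operands are placed in the low lanes of a zmm register with INSERT_SUBREG,
// VPMULLQZrr performs the 512-bit multiply, and EXTRACT_SUBREG returns the
// low xmm/ymm part.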
+
//===----------------------------------------------------------------------===//
// AVX-512 Logical Instructions
//===----------------------------------------------------------------------===//
-defm VPAND : avx512_binop_rm_vl_dq<0xDB, 0xDB, "vpand", and,
+multiclass avx512_logic_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86VectorVTInfo _, OpndItins itins,
+ bit IsCommutable = 0> {
+ defm rr : AVX512_maskable_logic<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.RC:$src2), OpcodeStr,
+ "$src2, $src1", "$src1, $src2",
+ (_.i64VT (OpNode (bitconvert (_.VT _.RC:$src1)),
+ (bitconvert (_.VT _.RC:$src2)))),
+ (_.VT (bitconvert (_.i64VT (OpNode _.RC:$src1,
+ _.RC:$src2)))),
+ itins.rr, IsCommutable>,
+ AVX512BIBase, EVEX_4V;
+
+ defm rm : AVX512_maskable_logic<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr,
+ "$src2, $src1", "$src1, $src2",
+ (_.i64VT (OpNode (bitconvert (_.VT _.RC:$src1)),
+ (bitconvert (_.LdFrag addr:$src2)))),
+ (_.VT (bitconvert (_.i64VT (OpNode _.RC:$src1,
+ (bitconvert (_.LdFrag addr:$src2)))))),
+ itins.rm>,
+ AVX512BIBase, EVEX_4V;
+}
+
+multiclass avx512_logic_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86VectorVTInfo _, OpndItins itins,
+ bit IsCommutable = 0> :
+ avx512_logic_rm<opc, OpcodeStr, OpNode, _, itins, IsCommutable> {
+ defm rmb : AVX512_maskable_logic<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr,
+ "${src2}"##_.BroadcastStr##", $src1",
+ "$src1, ${src2}"##_.BroadcastStr,
+ (_.i64VT (OpNode _.RC:$src1,
+ (bitconvert
+ (_.VT (X86VBroadcast
+ (_.ScalarLdFrag addr:$src2)))))),
+ (_.VT (bitconvert (_.i64VT (OpNode _.RC:$src1,
+ (bitconvert
+ (_.VT (X86VBroadcast
+ (_.ScalarLdFrag addr:$src2)))))))),
+ itins.rm>,
+ AVX512BIBase, EVEX_4V, EVEX_B;
+}
+
+multiclass avx512_logic_rmb_vl<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ AVX512VLVectorVTInfo VTInfo, OpndItins itins,
+ Predicate prd, bit IsCommutable = 0> {
+ let Predicates = [prd] in
+ defm Z : avx512_logic_rmb<opc, OpcodeStr, OpNode, VTInfo.info512, itins,
+ IsCommutable>, EVEX_V512;
+
+ let Predicates = [prd, HasVLX] in {
+ defm Z256 : avx512_logic_rmb<opc, OpcodeStr, OpNode, VTInfo.info256, itins,
+ IsCommutable>, EVEX_V256;
+ defm Z128 : avx512_logic_rmb<opc, OpcodeStr, OpNode, VTInfo.info128, itins,
+ IsCommutable>, EVEX_V128;
+ }
+}
+
+multiclass avx512_logic_rm_vl_d<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ OpndItins itins, Predicate prd,
+ bit IsCommutable = 0> {
+ defm NAME : avx512_logic_rmb_vl<opc, OpcodeStr, OpNode, avx512vl_i32_info,
+ itins, prd, IsCommutable>, EVEX_CD8<32, CD8VF>;
+}
+
+multiclass avx512_logic_rm_vl_q<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ OpndItins itins, Predicate prd,
+ bit IsCommutable = 0> {
+ defm NAME : avx512_logic_rmb_vl<opc, OpcodeStr, OpNode, avx512vl_i64_info,
+ itins, prd, IsCommutable>,
+ VEX_W, EVEX_CD8<64, CD8VF>;
+}
+
+multiclass avx512_logic_rm_vl_dq<bits<8> opc_d, bits<8> opc_q, string OpcodeStr,
+ SDNode OpNode, OpndItins itins, Predicate prd,
+ bit IsCommutable = 0> {
+ defm Q : avx512_logic_rm_vl_q<opc_q, OpcodeStr#"q", OpNode, itins, prd,
+ IsCommutable>;
+
+ defm D : avx512_logic_rm_vl_d<opc_d, OpcodeStr#"d", OpNode, itins, prd,
+ IsCommutable>;
+}
+
+defm VPAND : avx512_logic_rm_vl_dq<0xDB, 0xDB, "vpand", and,
SSE_INTALU_ITINS_P, HasAVX512, 1>;
-defm VPOR : avx512_binop_rm_vl_dq<0xEB, 0xEB, "vpor", or,
+defm VPOR : avx512_logic_rm_vl_dq<0xEB, 0xEB, "vpor", or,
SSE_INTALU_ITINS_P, HasAVX512, 1>;
-defm VPXOR : avx512_binop_rm_vl_dq<0xEF, 0xEF, "vpxor", xor,
+defm VPXOR : avx512_logic_rm_vl_dq<0xEF, 0xEF, "vpxor", xor,
SSE_INTALU_ITINS_P, HasAVX512, 1>;
-defm VPANDN : avx512_binop_rm_vl_dq<0xDF, 0xDF, "vpandn", X86andnp,
+defm VPANDN : avx512_logic_rm_vl_dq<0xDF, 0xDF, "vpandn", X86andnp,
SSE_INTALU_ITINS_P, HasAVX512, 0>;
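// Each of the defms above expands to D and Q element-size variants (VPANDD,
// VPANDQ, and so on) at Z128/Z256/Z widths with rr/rm/rmb forms plus the usual
// k/kz masked versions; the floating-point lowering patterns that follow rely
// on exactly those instruction names.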
//===----------------------------------------------------------------------===//
@@ -3763,13 +4248,13 @@ defm VPANDN : avx512_binop_rm_vl_dq<0xDF, 0xDF, "vpandn", X86andnp,
multiclass avx512_fp_scalar<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
SDNode OpNode, SDNode VecNode, OpndItins itins,
bit IsCommutable> {
-
+ let ExeDomain = _.ExeDomain in {
defm rr_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(VecNode (_.VT _.RC:$src1), (_.VT _.RC:$src2),
(i32 FROUND_CURRENT)),
- itins.rr, IsCommutable>;
+ itins.rr>;
defm rm_Int : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr,
@@ -3777,25 +4262,27 @@ multiclass avx512_fp_scalar<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
(VecNode (_.VT _.RC:$src1),
(_.VT (scalar_to_vector (_.ScalarLdFrag addr:$src2))),
(i32 FROUND_CURRENT)),
- itins.rm, IsCommutable>;
- let isCodeGenOnly = 1, isCommutable = IsCommutable,
- Predicates = [HasAVX512] in {
+ itins.rm>;
+ let isCodeGenOnly = 1, Predicates = [HasAVX512] in {
def rr : I< opc, MRMSrcReg, (outs _.FRC:$dst),
(ins _.FRC:$src1, _.FRC:$src2),
OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set _.FRC:$dst, (OpNode _.FRC:$src1, _.FRC:$src2))],
- itins.rr>;
+ itins.rr> {
+ let isCommutable = IsCommutable;
+ }
def rm : I< opc, MRMSrcMem, (outs _.FRC:$dst),
(ins _.FRC:$src1, _.ScalarMemOp:$src2),
OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set _.FRC:$dst, (OpNode _.FRC:$src1,
(_.ScalarLdFrag addr:$src2)))], itins.rm>;
}
+ }
}
multiclass avx512_fp_scalar_round<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
SDNode VecNode, OpndItins itins, bit IsCommutable = 0> {
-
+ let ExeDomain = _.ExeDomain in
defm rrb : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2, AVX512RC:$rc), OpcodeStr,
"$rc, $src2, $src1", "$src1, $src2, $rc",
@@ -3805,7 +4292,7 @@ multiclass avx512_fp_scalar_round<bits<8> opc, string OpcodeStr,X86VectorVTInfo
}
multiclass avx512_fp_scalar_sae<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
SDNode VecNode, OpndItins itins, bit IsCommutable> {
-
+ let ExeDomain = _.ExeDomain in
defm rrb : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr,
"{sae}, $src2, $src1", "$src1, $src2, {sae}",
@@ -3843,9 +4330,9 @@ multiclass avx512_binop_s_sae<bits<8> opc, string OpcodeStr, SDNode OpNode,
XD, VEX_W, EVEX_4V, VEX_LIG, EVEX_CD8<64, CD8VT1>;
}
defm VADD : avx512_binop_s_round<0x58, "vadd", fadd, X86faddRnd, SSE_ALU_ITINS_S, 1>;
-defm VMUL : avx512_binop_s_round<0x59, "vmul", fmul, X86fmulRnd, SSE_ALU_ITINS_S, 1>;
+defm VMUL : avx512_binop_s_round<0x59, "vmul", fmul, X86fmulRnd, SSE_MUL_ITINS_S, 1>;
defm VSUB : avx512_binop_s_round<0x5C, "vsub", fsub, X86fsubRnd, SSE_ALU_ITINS_S, 0>;
-defm VDIV : avx512_binop_s_round<0x5E, "vdiv", fdiv, X86fdivRnd, SSE_ALU_ITINS_S, 0>;
+defm VDIV : avx512_binop_s_round<0x5E, "vdiv", fdiv, X86fdivRnd, SSE_DIV_ITINS_S, 0>;
defm VMIN : avx512_binop_s_sae <0x5D, "vmin", X86fmin, X86fminRnd, SSE_ALU_ITINS_S, 0>;
defm VMAX : avx512_binop_s_sae <0x5F, "vmax", X86fmax, X86fmaxRnd, SSE_ALU_ITINS_S, 0>;
@@ -3853,12 +4340,14 @@ defm VMAX : avx512_binop_s_sae <0x5F, "vmax", X86fmax, X86fmaxRnd, SSE_ALU_ITIN
// X86fminc and X86fmaxc instead of X86fmin and X86fmax
multiclass avx512_comutable_binop_s<bits<8> opc, string OpcodeStr,
X86VectorVTInfo _, SDNode OpNode, OpndItins itins> {
- let isCodeGenOnly = 1, isCommutable =1, Predicates = [HasAVX512] in {
+ let isCodeGenOnly = 1, Predicates = [HasAVX512] in {
def rr : I< opc, MRMSrcReg, (outs _.FRC:$dst),
(ins _.FRC:$src1, _.FRC:$src2),
OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set _.FRC:$dst, (OpNode _.FRC:$src1, _.FRC:$src2))],
- itins.rr>;
+ itins.rr> {
+ let isCommutable = 1;
+ }
def rm : I< opc, MRMSrcMem, (outs _.FRC:$dst),
(ins _.FRC:$src1, _.ScalarMemOp:$src2),
OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
@@ -3882,27 +4371,35 @@ defm VMAXCSDZ : avx512_comutable_binop_s<0x5F, "vmaxsd", f64x_info, X86fmaxc,
SSE_ALU_ITINS_S.d>, XD, VEX_W, EVEX_4V, VEX_LIG,
EVEX_CD8<64, CD8VT1>;
-multiclass avx512_fp_packed<bits<8> opc, string OpcodeStr, SDNode OpNode,
- X86VectorVTInfo _, bit IsCommutable> {
+multiclass avx512_fp_packed<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
+ X86VectorVTInfo _, OpndItins itins,
+ bit IsCommutable> {
+ let ExeDomain = _.ExeDomain, hasSideEffects = 0 in {
defm rr: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr##_.Suffix,
"$src2, $src1", "$src1, $src2",
- (_.VT (OpNode _.RC:$src1, _.RC:$src2))>, EVEX_4V;
- defm rm: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
- (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr##_.Suffix,
- "$src2, $src1", "$src1, $src2",
- (OpNode _.RC:$src1, (_.LdFrag addr:$src2))>, EVEX_4V;
- defm rmb: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
- (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr##_.Suffix,
- "${src2}"##_.BroadcastStr##", $src1",
- "$src1, ${src2}"##_.BroadcastStr,
- (OpNode _.RC:$src1, (_.VT (X86VBroadcast
- (_.ScalarLdFrag addr:$src2))))>,
- EVEX_4V, EVEX_B;
+ (_.VT (OpNode _.RC:$src1, _.RC:$src2)), itins.rr,
+ IsCommutable>, EVEX_4V;
+ let mayLoad = 1 in {
+ defm rm: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr##_.Suffix,
+ "$src2, $src1", "$src1, $src2",
+ (OpNode _.RC:$src1, (_.LdFrag addr:$src2)), itins.rm>,
+ EVEX_4V;
+ defm rmb: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr##_.Suffix,
+ "${src2}"##_.BroadcastStr##", $src1",
+ "$src1, ${src2}"##_.BroadcastStr,
+ (OpNode _.RC:$src1, (_.VT (X86VBroadcast
+ (_.ScalarLdFrag addr:$src2)))),
+ itins.rm>, EVEX_4V, EVEX_B;
+ }
+ }
}
-multiclass avx512_fp_round_packed<bits<8> opc, string OpcodeStr, SDNode OpNodeRnd,
- X86VectorVTInfo _> {
+multiclass avx512_fp_round_packed<bits<8> opc, string OpcodeStr, SDPatternOperator OpNodeRnd,
+ X86VectorVTInfo _> {
+ let ExeDomain = _.ExeDomain in
defm rb: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2, AVX512RC:$rc), OpcodeStr##_.Suffix,
"$rc, $src2, $src1", "$src1, $src2, $rc",
@@ -3911,8 +4408,9 @@ multiclass avx512_fp_round_packed<bits<8> opc, string OpcodeStr, SDNode OpNodeRn
}
-multiclass avx512_fp_sae_packed<bits<8> opc, string OpcodeStr, SDNode OpNodeRnd,
- X86VectorVTInfo _> {
+multiclass avx512_fp_sae_packed<bits<8> opc, string OpcodeStr, SDPatternOperator OpNodeRnd,
+ X86VectorVTInfo _> {
+ let ExeDomain = _.ExeDomain in
defm rb: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr##_.Suffix,
"{sae}, $src2, $src1", "$src1, $src2, {sae}",
@@ -3920,30 +4418,31 @@ multiclass avx512_fp_sae_packed<bits<8> opc, string OpcodeStr, SDNode OpNodeRnd,
EVEX_4V, EVEX_B;
}
-multiclass avx512_fp_binop_p<bits<8> opc, string OpcodeStr, SDNode OpNode,
- Predicate prd, bit IsCommutable = 0> {
+multiclass avx512_fp_binop_p<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
+ Predicate prd, SizeItins itins,
+ bit IsCommutable = 0> {
let Predicates = [prd] in {
defm PSZ : avx512_fp_packed<opc, OpcodeStr, OpNode, v16f32_info,
- IsCommutable>, EVEX_V512, PS,
+ itins.s, IsCommutable>, EVEX_V512, PS,
EVEX_CD8<32, CD8VF>;
defm PDZ : avx512_fp_packed<opc, OpcodeStr, OpNode, v8f64_info,
- IsCommutable>, EVEX_V512, PD, VEX_W,
+ itins.d, IsCommutable>, EVEX_V512, PD, VEX_W,
EVEX_CD8<64, CD8VF>;
}
// Define only if AVX512VL feature is present.
let Predicates = [prd, HasVLX] in {
defm PSZ128 : avx512_fp_packed<opc, OpcodeStr, OpNode, v4f32x_info,
- IsCommutable>, EVEX_V128, PS,
+ itins.s, IsCommutable>, EVEX_V128, PS,
EVEX_CD8<32, CD8VF>;
defm PSZ256 : avx512_fp_packed<opc, OpcodeStr, OpNode, v8f32x_info,
- IsCommutable>, EVEX_V256, PS,
+ itins.s, IsCommutable>, EVEX_V256, PS,
EVEX_CD8<32, CD8VF>;
defm PDZ128 : avx512_fp_packed<opc, OpcodeStr, OpNode, v2f64x_info,
- IsCommutable>, EVEX_V128, PD, VEX_W,
+ itins.d, IsCommutable>, EVEX_V128, PD, VEX_W,
EVEX_CD8<64, CD8VF>;
defm PDZ256 : avx512_fp_packed<opc, OpcodeStr, OpNode, v4f64x_info,
- IsCommutable>, EVEX_V256, PD, VEX_W,
+ itins.d, IsCommutable>, EVEX_V256, PD, VEX_W,
EVEX_CD8<64, CD8VF>;
}
}
@@ -3962,26 +4461,140 @@ multiclass avx512_fp_binop_p_sae<bits<8> opc, string OpcodeStr, SDNode OpNodeRnd
EVEX_V512, PD, VEX_W,EVEX_CD8<64, CD8VF>;
}
-defm VADD : avx512_fp_binop_p<0x58, "vadd", fadd, HasAVX512, 1>,
+defm VADD : avx512_fp_binop_p<0x58, "vadd", fadd, HasAVX512,
+ SSE_ALU_ITINS_P, 1>,
avx512_fp_binop_p_round<0x58, "vadd", X86faddRnd>;
-defm VMUL : avx512_fp_binop_p<0x59, "vmul", fmul, HasAVX512, 1>,
+defm VMUL : avx512_fp_binop_p<0x59, "vmul", fmul, HasAVX512,
+ SSE_MUL_ITINS_P, 1>,
avx512_fp_binop_p_round<0x59, "vmul", X86fmulRnd>;
-defm VSUB : avx512_fp_binop_p<0x5C, "vsub", fsub, HasAVX512>,
+defm VSUB : avx512_fp_binop_p<0x5C, "vsub", fsub, HasAVX512, SSE_ALU_ITINS_P>,
avx512_fp_binop_p_round<0x5C, "vsub", X86fsubRnd>;
-defm VDIV : avx512_fp_binop_p<0x5E, "vdiv", fdiv, HasAVX512>,
+defm VDIV : avx512_fp_binop_p<0x5E, "vdiv", fdiv, HasAVX512, SSE_DIV_ITINS_P>,
avx512_fp_binop_p_round<0x5E, "vdiv", X86fdivRnd>;
-defm VMIN : avx512_fp_binop_p<0x5D, "vmin", X86fmin, HasAVX512, 0>,
+defm VMIN : avx512_fp_binop_p<0x5D, "vmin", X86fmin, HasAVX512,
+ SSE_ALU_ITINS_P, 0>,
avx512_fp_binop_p_sae<0x5D, "vmin", X86fminRnd>;
-defm VMAX : avx512_fp_binop_p<0x5F, "vmax", X86fmax, HasAVX512, 0>,
+defm VMAX : avx512_fp_binop_p<0x5F, "vmax", X86fmax, HasAVX512,
+ SSE_ALU_ITINS_P, 0>,
avx512_fp_binop_p_sae<0x5F, "vmax", X86fmaxRnd>;
let isCodeGenOnly = 1 in {
- defm VMINC : avx512_fp_binop_p<0x5D, "vmin", X86fminc, HasAVX512, 1>;
- defm VMAXC : avx512_fp_binop_p<0x5F, "vmax", X86fmaxc, HasAVX512, 1>;
+ defm VMINC : avx512_fp_binop_p<0x5D, "vmin", X86fminc, HasAVX512,
+ SSE_ALU_ITINS_P, 1>;
+ defm VMAXC : avx512_fp_binop_p<0x5F, "vmax", X86fmaxc, HasAVX512,
+ SSE_ALU_ITINS_P, 1>;
+}
+defm VAND : avx512_fp_binop_p<0x54, "vand", null_frag, HasDQI,
+ SSE_ALU_ITINS_P, 1>;
+defm VANDN : avx512_fp_binop_p<0x55, "vandn", null_frag, HasDQI,
+ SSE_ALU_ITINS_P, 0>;
+defm VOR : avx512_fp_binop_p<0x56, "vor", null_frag, HasDQI,
+ SSE_ALU_ITINS_P, 1>;
+defm VXOR : avx512_fp_binop_p<0x57, "vxor", null_frag, HasDQI,
+ SSE_ALU_ITINS_P, 1>;
+
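// VANDP*/VORP*/VXORP*/VANDNP* are now defined with null_frag, i.e. without
// their own selection patterns; the scalar f32/f64 X86fand/X86for/X86fxor/
// X86fandn cases are matched further down using the 128-bit packed forms, and
// the packed FP cases are presumably covered by the integer logic lowering.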
+// These patterns catch floating-point selects whose selected values are bitcasted integer logic ops.
+multiclass avx512_fp_logical_lowering<string InstrStr, SDNode OpNode,
+ X86VectorVTInfo _, Predicate prd> {
+let Predicates = [prd] in {
+ // Masked register-register logical operations.
+ def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ (bitconvert (_.i64VT (OpNode _.RC:$src1, _.RC:$src2))),
+ _.RC:$src0)),
+ (!cast<Instruction>(InstrStr#rrk) _.RC:$src0, _.KRCWM:$mask,
+ _.RC:$src1, _.RC:$src2)>;
+ def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ (bitconvert (_.i64VT (OpNode _.RC:$src1, _.RC:$src2))),
+ _.ImmAllZerosV)),
+ (!cast<Instruction>(InstrStr#rrkz) _.KRCWM:$mask, _.RC:$src1,
+ _.RC:$src2)>;
+ // Masked register-memory logical operations.
+ def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ (bitconvert (_.i64VT (OpNode _.RC:$src1,
+ (load addr:$src2)))),
+ _.RC:$src0)),
+ (!cast<Instruction>(InstrStr#rmk) _.RC:$src0, _.KRCWM:$mask,
+ _.RC:$src1, addr:$src2)>;
+ def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ (bitconvert (_.i64VT (OpNode _.RC:$src1, (load addr:$src2)))),
+ _.ImmAllZerosV)),
+ (!cast<Instruction>(InstrStr#rmkz) _.KRCWM:$mask, _.RC:$src1,
+ addr:$src2)>;
+ // Register-broadcast logical operations.
+ def : Pat<(_.i64VT (OpNode _.RC:$src1,
+ (bitconvert (_.VT (X86VBroadcast
+ (_.ScalarLdFrag addr:$src2)))))),
+ (!cast<Instruction>(InstrStr#rmb) _.RC:$src1, addr:$src2)>;
+ def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ (bitconvert
+ (_.i64VT (OpNode _.RC:$src1,
+ (bitconvert (_.VT
+ (X86VBroadcast
+ (_.ScalarLdFrag addr:$src2))))))),
+ _.RC:$src0)),
+ (!cast<Instruction>(InstrStr#rmbk) _.RC:$src0, _.KRCWM:$mask,
+ _.RC:$src1, addr:$src2)>;
+ def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ (bitconvert
+ (_.i64VT (OpNode _.RC:$src1,
+ (bitconvert (_.VT
+ (X86VBroadcast
+ (_.ScalarLdFrag addr:$src2))))))),
+ _.ImmAllZerosV)),
+ (!cast<Instruction>(InstrStr#rmbkz) _.KRCWM:$mask,
+ _.RC:$src1, addr:$src2)>;
+}
+}
+
+multiclass avx512_fp_logical_lowering_sizes<string InstrStr, SDNode OpNode> {
+ defm : avx512_fp_logical_lowering<InstrStr#DZ128, OpNode, v4f32x_info, HasVLX>;
+ defm : avx512_fp_logical_lowering<InstrStr#QZ128, OpNode, v2f64x_info, HasVLX>;
+ defm : avx512_fp_logical_lowering<InstrStr#DZ256, OpNode, v8f32x_info, HasVLX>;
+ defm : avx512_fp_logical_lowering<InstrStr#QZ256, OpNode, v4f64x_info, HasVLX>;
+ defm : avx512_fp_logical_lowering<InstrStr#DZ, OpNode, v16f32_info, HasAVX512>;
+ defm : avx512_fp_logical_lowering<InstrStr#QZ, OpNode, v8f64_info, HasAVX512>;
+}
+
+defm : avx512_fp_logical_lowering_sizes<"VPAND", and>;
+defm : avx512_fp_logical_lowering_sizes<"VPOR", or>;
+defm : avx512_fp_logical_lowering_sizes<"VPXOR", xor>;
+defm : avx512_fp_logical_lowering_sizes<"VPANDN", X86andnp>;
+
+let Predicates = [HasVLX,HasDQI] in {
+ // Use packed logical operations for scalar ops.
+ def : Pat<(f64 (X86fand FR64X:$src1, FR64X:$src2)),
+ (COPY_TO_REGCLASS (VANDPDZ128rr
+ (COPY_TO_REGCLASS FR64X:$src1, VR128X),
+ (COPY_TO_REGCLASS FR64X:$src2, VR128X)), FR64X)>;
+ def : Pat<(f64 (X86for FR64X:$src1, FR64X:$src2)),
+ (COPY_TO_REGCLASS (VORPDZ128rr
+ (COPY_TO_REGCLASS FR64X:$src1, VR128X),
+ (COPY_TO_REGCLASS FR64X:$src2, VR128X)), FR64X)>;
+ def : Pat<(f64 (X86fxor FR64X:$src1, FR64X:$src2)),
+ (COPY_TO_REGCLASS (VXORPDZ128rr
+ (COPY_TO_REGCLASS FR64X:$src1, VR128X),
+ (COPY_TO_REGCLASS FR64X:$src2, VR128X)), FR64X)>;
+ def : Pat<(f64 (X86fandn FR64X:$src1, FR64X:$src2)),
+ (COPY_TO_REGCLASS (VANDNPDZ128rr
+ (COPY_TO_REGCLASS FR64X:$src1, VR128X),
+ (COPY_TO_REGCLASS FR64X:$src2, VR128X)), FR64X)>;
+
+ def : Pat<(f32 (X86fand FR32X:$src1, FR32X:$src2)),
+ (COPY_TO_REGCLASS (VANDPSZ128rr
+ (COPY_TO_REGCLASS FR32X:$src1, VR128X),
+ (COPY_TO_REGCLASS FR32X:$src2, VR128X)), FR32X)>;
+ def : Pat<(f32 (X86for FR32X:$src1, FR32X:$src2)),
+ (COPY_TO_REGCLASS (VORPSZ128rr
+ (COPY_TO_REGCLASS FR32X:$src1, VR128X),
+ (COPY_TO_REGCLASS FR32X:$src2, VR128X)), FR32X)>;
+ def : Pat<(f32 (X86fxor FR32X:$src1, FR32X:$src2)),
+ (COPY_TO_REGCLASS (VXORPSZ128rr
+ (COPY_TO_REGCLASS FR32X:$src1, VR128X),
+ (COPY_TO_REGCLASS FR32X:$src2, VR128X)), FR32X)>;
+ def : Pat<(f32 (X86fandn FR32X:$src1, FR32X:$src2)),
+ (COPY_TO_REGCLASS (VANDNPSZ128rr
+ (COPY_TO_REGCLASS FR32X:$src1, VR128X),
+ (COPY_TO_REGCLASS FR32X:$src2, VR128X)), FR32X)>;
}
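// There is no dedicated scalar logic instruction, so the patterns above copy
// the FR32X/FR64X operands into VR128X registers, run the 128-bit packed
// VANDPS/VORPS/VXORPS/VANDNPS (or PD) form, and copy the low element back out.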
-defm VAND : avx512_fp_binop_p<0x54, "vand", X86fand, HasDQI, 1>;
-defm VANDN : avx512_fp_binop_p<0x55, "vandn", X86fandn, HasDQI, 0>;
-defm VOR : avx512_fp_binop_p<0x56, "vor", X86for, HasDQI, 1>;
-defm VXOR : avx512_fp_binop_p<0x57, "vxor", X86fxor, HasDQI, 1>;
multiclass avx512_fp_scalef_p<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86VectorVTInfo _> {
@@ -4157,6 +4770,7 @@ defm VPTESTNM : avx512_vptest_all_forms<0x26, 0x27, "vptestnm", X86testnm>, T8X
//===----------------------------------------------------------------------===//
multiclass avx512_shift_rmi<bits<8> opc, Format ImmFormR, Format ImmFormM,
string OpcodeStr, SDNode OpNode, X86VectorVTInfo _> {
+ let ExeDomain = _.ExeDomain in {
defm ri : AVX512_maskable<opc, ImmFormR, _, (outs _.RC:$dst),
(ins _.RC:$src1, u8imm:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
@@ -4168,10 +4782,12 @@ multiclass avx512_shift_rmi<bits<8> opc, Format ImmFormR, Format ImmFormM,
(_.VT (OpNode (_.VT (bitconvert (_.LdFrag addr:$src1))),
(i8 imm:$src2))),
SSE_INTSHIFT_ITINS_P.rm>;
+ }
}
multiclass avx512_shift_rmbi<bits<8> opc, Format ImmFormM,
string OpcodeStr, SDNode OpNode, X86VectorVTInfo _> {
+ let ExeDomain = _.ExeDomain in
defm mbi : AVX512_maskable<opc, ImmFormM, _, (outs _.RC:$dst),
(ins _.ScalarMemOp:$src1, u8imm:$src2), OpcodeStr,
"$src2, ${src1}"##_.BroadcastStr, "${src1}"##_.BroadcastStr##", $src2",
@@ -4182,6 +4798,7 @@ multiclass avx512_shift_rmbi<bits<8> opc, Format ImmFormM,
multiclass avx512_shift_rrm<bits<8> opc, string OpcodeStr, SDNode OpNode,
ValueType SrcVT, PatFrag bc_frag, X86VectorVTInfo _> {
// src2 is always 128-bit
+ let ExeDomain = _.ExeDomain in {
defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, VR128X:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
@@ -4193,6 +4810,7 @@ multiclass avx512_shift_rrm<bits<8> opc, string OpcodeStr, SDNode OpNode,
(_.VT (OpNode _.RC:$src1, (bc_frag (loadv2i64 addr:$src2)))),
SSE_INTSHIFT_ITINS_P.rm>, AVX512BIBase,
EVEX_4V;
+ }
}
multiclass avx512_shift_sizes<bits<8> opc, string OpcodeStr, SDNode OpNode,
@@ -4286,6 +4904,7 @@ defm VPSRL : avx512_shift_types<0xD2, 0xD3, 0xD1, "vpsrl", X86vsrl>;
//===-------------------------------------------------------------------===//
multiclass avx512_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86VectorVTInfo _> {
+ let ExeDomain = _.ExeDomain in {
defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
@@ -4298,10 +4917,12 @@ multiclass avx512_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode,
(_.VT (bitconvert (_.LdFrag addr:$src2))))),
SSE_INTSHIFT_ITINS_P.rm>, AVX5128IBase, EVEX_4V,
EVEX_CD8<_.EltSize, CD8VF>;
+ }
}
multiclass avx512_var_shift_mb<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86VectorVTInfo _> {
+ let ExeDomain = _.ExeDomain in
defm rmb : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr,
"${src2}"##_.BroadcastStr##", $src1",
@@ -4375,9 +4996,6 @@ defm VPSLLV : avx512_var_shift_types<0x47, "vpsllv", shl>,
defm VPSRAV : avx512_var_shift_types<0x46, "vpsrav", sra>,
avx512_var_shift_w<0x11, "vpsravw", sra>,
avx512_var_shift_w_lowering<avx512vl_i16_info, sra>;
-let isCodeGenOnly = 1 in
- defm VPSRAV_Int : avx512_var_shift_types<0x46, "vpsrav", X86vsrav>,
- avx512_var_shift_w<0x11, "vpsravw", X86vsrav>;
defm VPSRLV : avx512_var_shift_types<0x45, "vpsrlv", srl>,
avx512_var_shift_w<0x10, "vpsrlvw", srl>,
@@ -4385,6 +5003,76 @@ defm VPSRLV : avx512_var_shift_types<0x45, "vpsrlv", srl>,
defm VPRORV : avx512_var_shift_types<0x14, "vprorv", rotr>;
defm VPROLV : avx512_var_shift_types<0x15, "vprolv", rotl>;
+// Special handling for the VPSRAV intrinsics.
+multiclass avx512_var_shift_int_lowering<string InstrStr, X86VectorVTInfo _,
+ list<Predicate> p> {
+ let Predicates = p in {
+ def : Pat<(_.VT (X86vsrav _.RC:$src1, _.RC:$src2)),
+ (!cast<Instruction>(InstrStr#_.ZSuffix#rr) _.RC:$src1,
+ _.RC:$src2)>;
+ def : Pat<(_.VT (X86vsrav _.RC:$src1, (bitconvert (_.LdFrag addr:$src2)))),
+ (!cast<Instruction>(InstrStr#_.ZSuffix##rm)
+ _.RC:$src1, addr:$src2)>;
+ let AddedComplexity = 20 in {
+ def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ (X86vsrav _.RC:$src1, _.RC:$src2), _.RC:$src0)),
+ (!cast<Instruction>(InstrStr#_.ZSuffix#rrk) _.RC:$src0,
+ _.KRC:$mask, _.RC:$src1, _.RC:$src2)>;
+ def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ (X86vsrav _.RC:$src1, (bitconvert (_.LdFrag addr:$src2))),
+ _.RC:$src0)),
+ (!cast<Instruction>(InstrStr#_.ZSuffix##rmk) _.RC:$src0,
+ _.KRC:$mask, _.RC:$src1, addr:$src2)>;
+ }
+ let AddedComplexity = 30 in {
+ def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ (X86vsrav _.RC:$src1, _.RC:$src2), _.ImmAllZerosV)),
+ (!cast<Instruction>(InstrStr#_.ZSuffix#rrkz) _.KRC:$mask,
+ _.RC:$src1, _.RC:$src2)>;
+ def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ (X86vsrav _.RC:$src1, (bitconvert (_.LdFrag addr:$src2))),
+ _.ImmAllZerosV)),
+ (!cast<Instruction>(InstrStr#_.ZSuffix##rmkz) _.KRC:$mask,
+ _.RC:$src1, addr:$src2)>;
+ }
+ }
+}
+
+multiclass avx512_var_shift_int_lowering_mb<string InstrStr, X86VectorVTInfo _,
+ list<Predicate> p> :
+ avx512_var_shift_int_lowering<InstrStr, _, p> {
+ let Predicates = p in {
+ def : Pat<(_.VT (X86vsrav _.RC:$src1,
+ (X86VBroadcast (_.ScalarLdFrag addr:$src2)))),
+ (!cast<Instruction>(InstrStr#_.ZSuffix##rmb)
+ _.RC:$src1, addr:$src2)>;
+ let AddedComplexity = 20 in
+ def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ (X86vsrav _.RC:$src1,
+ (X86VBroadcast (_.ScalarLdFrag addr:$src2))),
+ _.RC:$src0)),
+ (!cast<Instruction>(InstrStr#_.ZSuffix##rmbk) _.RC:$src0,
+ _.KRC:$mask, _.RC:$src1, addr:$src2)>;
+ let AddedComplexity = 30 in
+ def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ (X86vsrav _.RC:$src1,
+ (X86VBroadcast (_.ScalarLdFrag addr:$src2))),
+ _.ImmAllZerosV)),
+ (!cast<Instruction>(InstrStr#_.ZSuffix##rmbkz) _.KRC:$mask,
+ _.RC:$src1, addr:$src2)>;
+ }
+}
+
+defm : avx512_var_shift_int_lowering<"VPSRAVW", v8i16x_info, [HasVLX, HasBWI]>;
+defm : avx512_var_shift_int_lowering<"VPSRAVW", v16i16x_info, [HasVLX, HasBWI]>;
+defm : avx512_var_shift_int_lowering<"VPSRAVW", v32i16_info, [HasBWI]>;
+defm : avx512_var_shift_int_lowering_mb<"VPSRAVD", v4i32x_info, [HasVLX]>;
+defm : avx512_var_shift_int_lowering_mb<"VPSRAVD", v8i32x_info, [HasVLX]>;
+defm : avx512_var_shift_int_lowering_mb<"VPSRAVD", v16i32_info, [HasAVX512]>;
+defm : avx512_var_shift_int_lowering_mb<"VPSRAVQ", v2i64x_info, [HasVLX]>;
+defm : avx512_var_shift_int_lowering_mb<"VPSRAVQ", v4i64x_info, [HasVLX]>;
+defm : avx512_var_shift_int_lowering_mb<"VPSRAVQ", v8i64_info, [HasAVX512]>;
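// These pattern-only multiclasses replace the removed codegen-only VPSRAV_Int
// instructions: the X86vsrav node used by the vpsrav intrinsics is matched
// directly onto the regular VPSRAVW/D/Q forms, including the masked
// (rrk/rrkz, rmk/rmkz) and, for D/Q, broadcast (rmb/rmbk/rmbkz) variants.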
+
//===-------------------------------------------------------------------===//
// 1-src variable permutation VPERMW/D/Q
//===-------------------------------------------------------------------===//
@@ -4501,8 +5189,10 @@ multiclass avx512_permil<string OpcodeStr, bits<8> OpcImm, bits<8> OpcVar,
EVEX, AVX512AIi8Base, EVEX_CD8<_.info128.EltSize, CD8VF>;
}
+let ExeDomain = SSEPackedSingle in
defm VPERMILPS : avx512_permil<"vpermilps", 0x04, 0x0C, avx512vl_f32_info,
avx512vl_i32_info>;
+let ExeDomain = SSEPackedDouble in
defm VPERMILPD : avx512_permil<"vpermilpd", 0x05, 0x0D, avx512vl_f64_info,
avx512vl_i64_info>, VEX_W;
//===----------------------------------------------------------------------===//
@@ -4666,61 +5356,71 @@ let Predicates = [HasAVX512] in {
// FMA - Fused Multiply Operations
//
-let Constraints = "$src1 = $dst" in {
multiclass avx512_fma3p_213_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
- X86VectorVTInfo _> {
+ X86VectorVTInfo _, string Suff> {
+ let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in {
defm r: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3),
OpcodeStr, "$src3, $src2", "$src2, $src3",
- (_.VT (OpNode _.RC:$src1, _.RC:$src2, _.RC:$src3))>,
+ (_.VT (OpNode _.RC:$src2, _.RC:$src1, _.RC:$src3)), 1, 1>,
AVX512FMA3Base;
defm m: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.MemOp:$src3),
OpcodeStr, "$src3, $src2", "$src2, $src3",
- (_.VT (OpNode _.RC:$src1, _.RC:$src2, (_.LdFrag addr:$src3)))>,
+ (_.VT (OpNode _.RC:$src2, _.RC:$src1, (_.LdFrag addr:$src3))), 1, 0>,
AVX512FMA3Base;
defm mb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.ScalarMemOp:$src3),
OpcodeStr, !strconcat("${src3}", _.BroadcastStr,", $src2"),
!strconcat("$src2, ${src3}", _.BroadcastStr ),
- (OpNode _.RC:$src1,
- _.RC:$src2,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3))))>,
+ (OpNode _.RC:$src2,
+ _.RC:$src1,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3)))), 1, 0>,
AVX512FMA3Base, EVEX_B;
+ }
+
+ // Additional pattern for folding broadcast nodes in other orders.
+ def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ (OpNode _.RC:$src1, _.RC:$src2,
+ (X86VBroadcast (_.ScalarLdFrag addr:$src3))),
+ _.RC:$src1)),
+ (!cast<Instruction>(NAME#Suff#_.ZSuffix#mbk) _.RC:$src1,
+ _.KRCWM:$mask, _.RC:$src2, addr:$src3)>;
}
multiclass avx512_fma3_213_round<bits<8> opc, string OpcodeStr, SDNode OpNode,
- X86VectorVTInfo _> {
+ X86VectorVTInfo _, string Suff> {
+ let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in
defm rb: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3, AVX512RC:$rc),
OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc",
- (_.VT ( OpNode _.RC:$src1, _.RC:$src2, _.RC:$src3, (i32 imm:$rc)))>,
+ (_.VT ( OpNode _.RC:$src2, _.RC:$src1, _.RC:$src3, (i32 imm:$rc))), 1, 1>,
AVX512FMA3Base, EVEX_B, EVEX_RC;
}
-} // Constraints = "$src1 = $dst"
multiclass avx512_fma3p_213_common<bits<8> opc, string OpcodeStr, SDNode OpNode,
- SDNode OpNodeRnd, AVX512VLVectorVTInfo _> {
+ SDNode OpNodeRnd, AVX512VLVectorVTInfo _,
+ string Suff> {
let Predicates = [HasAVX512] in {
- defm Z : avx512_fma3p_213_rm<opc, OpcodeStr, OpNode, _.info512>,
- avx512_fma3_213_round<opc, OpcodeStr, OpNodeRnd, _.info512>,
- EVEX_V512, EVEX_CD8<_.info512.EltSize, CD8VF>;
+ defm Z : avx512_fma3p_213_rm<opc, OpcodeStr, OpNode, _.info512, Suff>,
+ avx512_fma3_213_round<opc, OpcodeStr, OpNodeRnd, _.info512,
+ Suff>, EVEX_V512, EVEX_CD8<_.info512.EltSize, CD8VF>;
}
let Predicates = [HasVLX, HasAVX512] in {
- defm Z256 : avx512_fma3p_213_rm<opc, OpcodeStr, OpNode, _.info256>,
+ defm Z256 : avx512_fma3p_213_rm<opc, OpcodeStr, OpNode, _.info256, Suff>,
EVEX_V256, EVEX_CD8<_.info256.EltSize, CD8VF>;
- defm Z128 : avx512_fma3p_213_rm<opc, OpcodeStr, OpNode, _.info128>,
+ defm Z128 : avx512_fma3p_213_rm<opc, OpcodeStr, OpNode, _.info128, Suff>,
EVEX_V128, EVEX_CD8<_.info128.EltSize, CD8VF>;
}
}
multiclass avx512_fma3p_213_f<bits<8> opc, string OpcodeStr, SDNode OpNode,
- SDNode OpNodeRnd > {
+ SDNode OpNodeRnd > {
defm PS : avx512_fma3p_213_common<opc, OpcodeStr#"ps", OpNode, OpNodeRnd,
- avx512vl_f32_info>;
+ avx512vl_f32_info, "PS">;
defm PD : avx512_fma3p_213_common<opc, OpcodeStr#"pd", OpNode, OpNodeRnd,
- avx512vl_f64_info>, VEX_W;
+ avx512vl_f64_info, "PD">, VEX_W;
}
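// The new Suff parameter ("PS"/"PD") lets the standalone broadcast-folding
// patterns rebuild the concrete instruction name via NAME#Suff#_.ZSuffix
// (giving names along the lines of VFMADD213PSZ256mbk) for !cast<Instruction>.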
defm VFMADD213 : avx512_fma3p_213_f<0xA8, "vfmadd213", X86Fmadd, X86FmaddRnd>;
@@ -4731,19 +5431,19 @@ defm VFNMADD213 : avx512_fma3p_213_f<0xAC, "vfnmadd213", X86Fnmadd, X86FnmaddR
defm VFNMSUB213 : avx512_fma3p_213_f<0xAE, "vfnmsub213", X86Fnmsub, X86FnmsubRnd>;
-let Constraints = "$src1 = $dst" in {
multiclass avx512_fma3p_231_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
- X86VectorVTInfo _> {
+ X86VectorVTInfo _, string Suff> {
+ let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in {
defm r: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3),
OpcodeStr, "$src3, $src2", "$src2, $src3",
- (_.VT (OpNode _.RC:$src2, _.RC:$src3, _.RC:$src1))>,
+ (_.VT (OpNode _.RC:$src2, _.RC:$src3, _.RC:$src1)), 1, 1>,
AVX512FMA3Base;
defm m: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.MemOp:$src3),
OpcodeStr, "$src3, $src2", "$src2, $src3",
- (_.VT (OpNode _.RC:$src2, (_.LdFrag addr:$src3), _.RC:$src1))>,
+ (_.VT (OpNode _.RC:$src2, (_.LdFrag addr:$src3), _.RC:$src1)), 1, 0>,
AVX512FMA3Base;
defm mb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
@@ -4752,40 +5452,60 @@ multiclass avx512_fma3p_231_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
"$src2, ${src3}"##_.BroadcastStr,
(_.VT (OpNode _.RC:$src2,
(_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src3))),
- _.RC:$src1))>, AVX512FMA3Base, EVEX_B;
+ _.RC:$src1)), 1, 0>, AVX512FMA3Base, EVEX_B;
+ }
+
+ // Additional patterns for folding broadcast nodes in other orders.
+ def : Pat<(_.VT (OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src3)),
+ _.RC:$src2, _.RC:$src1)),
+ (!cast<Instruction>(NAME#Suff#_.ZSuffix#mb) _.RC:$src1,
+ _.RC:$src2, addr:$src3)>;
+ def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ (OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src3)),
+ _.RC:$src2, _.RC:$src1),
+ _.RC:$src1)),
+ (!cast<Instruction>(NAME#Suff#_.ZSuffix#mbk) _.RC:$src1,
+ _.KRCWM:$mask, _.RC:$src2, addr:$src3)>;
+ def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ (OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src3)),
+ _.RC:$src2, _.RC:$src1),
+ _.ImmAllZerosV)),
+ (!cast<Instruction>(NAME#Suff#_.ZSuffix#mbkz) _.RC:$src1,
+ _.KRCWM:$mask, _.RC:$src2, addr:$src3)>;
}
multiclass avx512_fma3_231_round<bits<8> opc, string OpcodeStr, SDNode OpNode,
- X86VectorVTInfo _> {
+ X86VectorVTInfo _, string Suff> {
+ let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in
defm rb: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3, AVX512RC:$rc),
OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc",
- (_.VT ( OpNode _.RC:$src2, _.RC:$src3, _.RC:$src1, (i32 imm:$rc)))>,
+ (_.VT ( OpNode _.RC:$src2, _.RC:$src3, _.RC:$src1, (i32 imm:$rc))), 1, 1>,
AVX512FMA3Base, EVEX_B, EVEX_RC;
}
-} // Constraints = "$src1 = $dst"
multiclass avx512_fma3p_231_common<bits<8> opc, string OpcodeStr, SDNode OpNode,
- SDNode OpNodeRnd, AVX512VLVectorVTInfo _> {
+ SDNode OpNodeRnd, AVX512VLVectorVTInfo _,
+ string Suff> {
let Predicates = [HasAVX512] in {
- defm Z : avx512_fma3p_231_rm<opc, OpcodeStr, OpNode, _.info512>,
- avx512_fma3_231_round<opc, OpcodeStr, OpNodeRnd, _.info512>,
- EVEX_V512, EVEX_CD8<_.info512.EltSize, CD8VF>;
+ defm Z : avx512_fma3p_231_rm<opc, OpcodeStr, OpNode, _.info512, Suff>,
+ avx512_fma3_231_round<opc, OpcodeStr, OpNodeRnd, _.info512,
+ Suff>, EVEX_V512, EVEX_CD8<_.info512.EltSize, CD8VF>;
}
let Predicates = [HasVLX, HasAVX512] in {
- defm Z256 : avx512_fma3p_231_rm<opc, OpcodeStr, OpNode, _.info256>,
+ defm Z256 : avx512_fma3p_231_rm<opc, OpcodeStr, OpNode, _.info256, Suff>,
EVEX_V256, EVEX_CD8<_.info256.EltSize, CD8VF>;
- defm Z128 : avx512_fma3p_231_rm<opc, OpcodeStr, OpNode, _.info128>,
+ defm Z128 : avx512_fma3p_231_rm<opc, OpcodeStr, OpNode, _.info128, Suff>,
EVEX_V128, EVEX_CD8<_.info128.EltSize, CD8VF>;
}
}
multiclass avx512_fma3p_231_f<bits<8> opc, string OpcodeStr, SDNode OpNode,
- SDNode OpNodeRnd > {
+ SDNode OpNodeRnd > {
defm PS : avx512_fma3p_231_common<opc, OpcodeStr#"ps", OpNode, OpNodeRnd,
- avx512vl_f32_info>;
+ avx512vl_f32_info, "PS">;
defm PD : avx512_fma3p_231_common<opc, OpcodeStr#"pd", OpNode, OpNodeRnd,
- avx512vl_f64_info>, VEX_W;
+ avx512vl_f64_info, "PD">, VEX_W;
}
defm VFMADD231 : avx512_fma3p_231_f<0xB8, "vfmadd231", X86Fmadd, X86FmaddRnd>;
@@ -4795,61 +5515,71 @@ defm VFMSUBADD231 : avx512_fma3p_231_f<0xB7, "vfmsubadd231", X86Fmsubadd, X86Fms
defm VFNMADD231 : avx512_fma3p_231_f<0xBC, "vfnmadd231", X86Fnmadd, X86FnmaddRnd>;
defm VFNMSUB231 : avx512_fma3p_231_f<0xBE, "vfnmsub231", X86Fnmsub, X86FnmsubRnd>;
-let Constraints = "$src1 = $dst" in {
multiclass avx512_fma3p_132_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
- X86VectorVTInfo _> {
+ X86VectorVTInfo _, string Suff> {
+ let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in {
defm r: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
- (ins _.RC:$src3, _.RC:$src2),
- OpcodeStr, "$src2, $src3", "$src3, $src2",
- (_.VT (OpNode _.RC:$src1, _.RC:$src2, _.RC:$src3))>,
+ (ins _.RC:$src2, _.RC:$src3),
+ OpcodeStr, "$src3, $src2", "$src2, $src3",
+ (_.VT (OpNode _.RC:$src1, _.RC:$src3, _.RC:$src2)), 1, 1>,
AVX512FMA3Base;
defm m: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
- (ins _.RC:$src3, _.MemOp:$src2),
- OpcodeStr, "$src2, $src3", "$src3, $src2",
- (_.VT (OpNode _.RC:$src1, (_.LdFrag addr:$src2), _.RC:$src3))>,
+ (ins _.RC:$src2, _.MemOp:$src3),
+ OpcodeStr, "$src3, $src2", "$src2, $src3",
+ (_.VT (OpNode _.RC:$src1, (_.LdFrag addr:$src3), _.RC:$src2)), 1, 0>,
AVX512FMA3Base;
defm mb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
- (ins _.RC:$src3, _.ScalarMemOp:$src2),
- OpcodeStr, "${src2}"##_.BroadcastStr##", $src3",
- "$src3, ${src2}"##_.BroadcastStr,
+ (ins _.RC:$src2, _.ScalarMemOp:$src3),
+ OpcodeStr, "${src3}"##_.BroadcastStr##", $src2",
+ "$src2, ${src3}"##_.BroadcastStr,
(_.VT (OpNode _.RC:$src1,
- (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src2))),
- _.RC:$src3))>, AVX512FMA3Base, EVEX_B;
+ (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src3))),
+ _.RC:$src2)), 1, 0>, AVX512FMA3Base, EVEX_B;
+ }
+
+ // Additional patterns for folding broadcast nodes in other orders.
+ def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ (OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src3)),
+ _.RC:$src1, _.RC:$src2),
+ _.RC:$src1)),
+ (!cast<Instruction>(NAME#Suff#_.ZSuffix#mbk) _.RC:$src1,
+ _.KRCWM:$mask, _.RC:$src2, addr:$src3)>;
}
multiclass avx512_fma3_132_round<bits<8> opc, string OpcodeStr, SDNode OpNode,
- X86VectorVTInfo _> {
+ X86VectorVTInfo _, string Suff> {
+ let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in
defm rb: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
- (ins _.RC:$src3, _.RC:$src2, AVX512RC:$rc),
- OpcodeStr, "$rc, $src2, $src3", "$src3, $src2, $rc",
- (_.VT ( OpNode _.RC:$src1, _.RC:$src2, _.RC:$src3, (i32 imm:$rc)))>,
+ (ins _.RC:$src2, _.RC:$src3, AVX512RC:$rc),
+ OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc",
+ (_.VT ( OpNode _.RC:$src1, _.RC:$src3, _.RC:$src2, (i32 imm:$rc))), 1, 1>,
AVX512FMA3Base, EVEX_B, EVEX_RC;
}
-} // Constraints = "$src1 = $dst"
multiclass avx512_fma3p_132_common<bits<8> opc, string OpcodeStr, SDNode OpNode,
- SDNode OpNodeRnd, AVX512VLVectorVTInfo _> {
+ SDNode OpNodeRnd, AVX512VLVectorVTInfo _,
+ string Suff> {
let Predicates = [HasAVX512] in {
- defm Z : avx512_fma3p_132_rm<opc, OpcodeStr, OpNode, _.info512>,
- avx512_fma3_132_round<opc, OpcodeStr, OpNodeRnd, _.info512>,
- EVEX_V512, EVEX_CD8<_.info512.EltSize, CD8VF>;
+ defm Z : avx512_fma3p_132_rm<opc, OpcodeStr, OpNode, _.info512, Suff>,
+ avx512_fma3_132_round<opc, OpcodeStr, OpNodeRnd, _.info512,
+ Suff>, EVEX_V512, EVEX_CD8<_.info512.EltSize, CD8VF>;
}
let Predicates = [HasVLX, HasAVX512] in {
- defm Z256 : avx512_fma3p_132_rm<opc, OpcodeStr, OpNode, _.info256>,
+ defm Z256 : avx512_fma3p_132_rm<opc, OpcodeStr, OpNode, _.info256, Suff>,
EVEX_V256, EVEX_CD8<_.info256.EltSize, CD8VF>;
- defm Z128 : avx512_fma3p_132_rm<opc, OpcodeStr, OpNode, _.info128>,
+ defm Z128 : avx512_fma3p_132_rm<opc, OpcodeStr, OpNode, _.info128, Suff>,
EVEX_V128, EVEX_CD8<_.info128.EltSize, CD8VF>;
}
}
multiclass avx512_fma3p_132_f<bits<8> opc, string OpcodeStr, SDNode OpNode,
- SDNode OpNodeRnd > {
+ SDNode OpNodeRnd > {
defm PS : avx512_fma3p_132_common<opc, OpcodeStr#"ps", OpNode, OpNodeRnd,
- avx512vl_f32_info>;
+ avx512vl_f32_info, "PS">;
defm PD : avx512_fma3p_132_common<opc, OpcodeStr#"pd", OpNode, OpNodeRnd,
- avx512vl_f64_info>, VEX_W;
+ avx512vl_f64_info, "PD">, VEX_W;
}
defm VFMADD132 : avx512_fma3p_132_f<0x98, "vfmadd132", X86Fmadd, X86FmaddRnd>;
@@ -4866,18 +5596,18 @@ multiclass avx512_fma3s_common<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
dag RHS_r, dag RHS_m > {
defm r_Int: AVX512_maskable_3src_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3), OpcodeStr,
- "$src3, $src2", "$src2, $src3", RHS_VEC_r>, AVX512FMA3Base;
+ "$src3, $src2", "$src2, $src3", RHS_VEC_r, 1, 1>, AVX512FMA3Base;
defm m_Int: AVX512_maskable_3src_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.ScalarMemOp:$src3), OpcodeStr,
- "$src3, $src2", "$src2, $src3", RHS_VEC_m>, AVX512FMA3Base;
+ "$src3, $src2", "$src2, $src3", RHS_VEC_m, 1, 1>, AVX512FMA3Base;
defm rb_Int: AVX512_maskable_3src_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3, AVX512RC:$rc),
- OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc", RHS_VEC_rb>,
+ OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc", RHS_VEC_rb, 1, 1>,
AVX512FMA3Base, EVEX_B, EVEX_RC;
- let isCodeGenOnly = 1 in {
+ let isCodeGenOnly = 1, isCommutable = 1 in {
def r : AVX512FMA3<opc, MRMSrcReg, (outs _.FRC:$dst),
(ins _.FRC:$src1, _.FRC:$src2, _.FRC:$src3),
!strconcat(OpcodeStr,
@@ -4893,38 +5623,40 @@ multiclass avx512_fma3s_common<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
}// Constraints = "$src1 = $dst"
multiclass avx512_fma3s_all<bits<8> opc213, bits<8> opc231, bits<8> opc132,
- string OpcodeStr, SDNode OpNode, SDNode OpNodeRnd, X86VectorVTInfo _ ,
- string SUFF> {
-
- defm NAME#213#SUFF: avx512_fma3s_common<opc213, OpcodeStr#"213"#_.Suffix , _ ,
- (_.VT (OpNodeRnd _.RC:$src2, _.RC:$src1, _.RC:$src3, (i32 FROUND_CURRENT))),
- (_.VT (OpNodeRnd _.RC:$src2, _.RC:$src1,
+ string OpcodeStr, SDNode OpNode, SDNode OpNodeRnds1,
+ SDNode OpNodeRnds3, X86VectorVTInfo _ , string SUFF> {
+
+ defm NAME#213#SUFF#Z: avx512_fma3s_common<opc213, OpcodeStr#"213"#_.Suffix , _ ,
+ // Operands for the intrinsic are in 123 order to preserve passthru
+ // semantics.
+ (_.VT (OpNodeRnds1 _.RC:$src1, _.RC:$src2, _.RC:$src3, (i32 FROUND_CURRENT))),
+ (_.VT (OpNodeRnds1 _.RC:$src1, _.RC:$src2,
(_.VT (scalar_to_vector(_.ScalarLdFrag addr:$src3))), (i32 FROUND_CURRENT))),
- (_.VT ( OpNodeRnd _.RC:$src2, _.RC:$src1, _.RC:$src3,
+ (_.VT (OpNodeRnds1 _.RC:$src1, _.RC:$src2, _.RC:$src3,
(i32 imm:$rc))),
(set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src2, _.FRC:$src1,
_.FRC:$src3))),
(set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src2, _.FRC:$src1,
(_.ScalarLdFrag addr:$src3))))>;
- defm NAME#231#SUFF: avx512_fma3s_common<opc231, OpcodeStr#"231"#_.Suffix , _ ,
- (_.VT (OpNodeRnd _.RC:$src2, _.RC:$src3, _.RC:$src1, (i32 FROUND_CURRENT))),
- (_.VT (OpNodeRnd _.RC:$src2,
+ defm NAME#231#SUFF#Z: avx512_fma3s_common<opc231, OpcodeStr#"231"#_.Suffix , _ ,
+ (_.VT (OpNodeRnds3 _.RC:$src2, _.RC:$src3, _.RC:$src1, (i32 FROUND_CURRENT))),
+ (_.VT (OpNodeRnds3 _.RC:$src2,
(_.VT (scalar_to_vector(_.ScalarLdFrag addr:$src3))),
_.RC:$src1, (i32 FROUND_CURRENT))),
- (_.VT ( OpNodeRnd _.RC:$src2, _.RC:$src3, _.RC:$src1,
+ (_.VT ( OpNodeRnds3 _.RC:$src2, _.RC:$src3, _.RC:$src1,
(i32 imm:$rc))),
(set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src2, _.FRC:$src3,
_.FRC:$src1))),
(set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src2,
(_.ScalarLdFrag addr:$src3), _.FRC:$src1)))>;
- defm NAME#132#SUFF: avx512_fma3s_common<opc132, OpcodeStr#"132"#_.Suffix , _ ,
- (_.VT (OpNodeRnd _.RC:$src1, _.RC:$src3, _.RC:$src2, (i32 FROUND_CURRENT))),
- (_.VT (OpNodeRnd _.RC:$src1,
+ defm NAME#132#SUFF#Z: avx512_fma3s_common<opc132, OpcodeStr#"132"#_.Suffix , _ ,
+ (_.VT (OpNodeRnds1 _.RC:$src1, _.RC:$src3, _.RC:$src2, (i32 FROUND_CURRENT))),
+ (_.VT (OpNodeRnds1 _.RC:$src1,
(_.VT (scalar_to_vector(_.ScalarLdFrag addr:$src3))),
_.RC:$src2, (i32 FROUND_CURRENT))),
- (_.VT ( OpNodeRnd _.RC:$src1, _.RC:$src3, _.RC:$src2,
+ (_.VT (OpNodeRnds1 _.RC:$src1, _.RC:$src3, _.RC:$src2,
(i32 imm:$rc))),
(set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src1, _.FRC:$src3,
_.FRC:$src2))),
@@ -4933,21 +5665,26 @@ multiclass avx512_fma3s_all<bits<8> opc213, bits<8> opc231, bits<8> opc132,
}
multiclass avx512_fma3s<bits<8> opc213, bits<8> opc231, bits<8> opc132,
- string OpcodeStr, SDNode OpNode, SDNode OpNodeRnd>{
+ string OpcodeStr, SDNode OpNode, SDNode OpNodeRnds1,
+ SDNode OpNodeRnds3> {
let Predicates = [HasAVX512] in {
defm NAME : avx512_fma3s_all<opc213, opc231, opc132, OpcodeStr, OpNode,
- OpNodeRnd, f32x_info, "SS">,
- EVEX_CD8<32, CD8VT1>, VEX_LIG;
+ OpNodeRnds1, OpNodeRnds3, f32x_info, "SS">,
+ EVEX_CD8<32, CD8VT1>, VEX_LIG;
defm NAME : avx512_fma3s_all<opc213, opc231, opc132, OpcodeStr, OpNode,
- OpNodeRnd, f64x_info, "SD">,
- EVEX_CD8<64, CD8VT1>, VEX_LIG, VEX_W;
+ OpNodeRnds1, OpNodeRnds3, f64x_info, "SD">,
+ EVEX_CD8<64, CD8VT1>, VEX_LIG, VEX_W;
}
}
-defm VFMADD : avx512_fma3s<0xA9, 0xB9, 0x99, "vfmadd", X86Fmadd, X86FmaddRnd>;
-defm VFMSUB : avx512_fma3s<0xAB, 0xBB, 0x9B, "vfmsub", X86Fmsub, X86FmsubRnd>;
-defm VFNMADD : avx512_fma3s<0xAD, 0xBD, 0x9D, "vfnmadd", X86Fnmadd, X86FnmaddRnd>;
-defm VFNMSUB : avx512_fma3s<0xAF, 0xBF, 0x9F, "vfnmsub", X86Fnmsub, X86FnmsubRnd>;
+defm VFMADD : avx512_fma3s<0xA9, 0xB9, 0x99, "vfmadd", X86Fmadd, X86FmaddRnds1,
+ X86FmaddRnds3>;
+defm VFMSUB : avx512_fma3s<0xAB, 0xBB, 0x9B, "vfmsub", X86Fmsub, X86FmsubRnds1,
+ X86FmsubRnds3>;
+defm VFNMADD : avx512_fma3s<0xAD, 0xBD, 0x9D, "vfnmadd", X86Fnmadd,
+ X86FnmaddRnds1, X86FnmaddRnds3>;
+defm VFNMSUB : avx512_fma3s<0xAF, 0xBF, 0x9F, "vfnmsub", X86Fnmsub,
+ X86FnmsubRnds1, X86FnmsubRnds3>;
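
Editorial note, not part of this patch: a minimal C sketch of the pass-through behaviour the "123 order" comment above refers to, assuming the standard AVX-512 intrinsic _mm_fmadd_round_sd from immintrin.h.

#include <immintrin.h>

/* Scalar FMA with explicit rounding: only element 0 is computed; element 1 of
 * the result is taken from the first operand, which is why the intrinsic
 * patterns keep the operands in 1,2,3 order. */
__m128d fmadd_low(__m128d a, __m128d b, __m128d c) {
  return _mm_fmadd_round_sd(a, b, c, _MM_FROUND_CUR_DIRECTION);
}
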
//===----------------------------------------------------------------------===//
// AVX-512 Packed Multiply of Unsigned 52-bit Integers and Add the Low 52-bit IFMA
@@ -5067,6 +5804,11 @@ defm VCVTSI642SDZ: avx512_vcvtsi_common<0x2A, X86SintToFpRnd, GR64,
v2f64x_info, i64mem, loadi64, "cvtsi2sd{q}">,
XD, VEX_W, EVEX_CD8<64, CD8VT1>;
+def : InstAlias<"vcvtsi2ss\t{$src, $src1, $dst|$dst, $src1, $src}",
+ (VCVTSI2SSZrm FR64X:$dst, FR64X:$src1, i32mem:$src), 0>;
+def : InstAlias<"vcvtsi2sd\t{$src, $src1, $dst|$dst, $src1, $src}",
+ (VCVTSI2SDZrm FR64X:$dst, FR64X:$src1, i32mem:$src), 0>;
+
def : Pat<(f32 (sint_to_fp (loadi32 addr:$src))),
(VCVTSI2SSZrm (f32 (IMPLICIT_DEF)), addr:$src)>;
def : Pat<(f32 (sint_to_fp (loadi64 addr:$src))),
@@ -5098,6 +5840,11 @@ defm VCVTUSI642SDZ : avx512_vcvtsi_common<0x7B, X86UintToFpRnd, GR64,
v2f64x_info, i64mem, loadi64, "cvtusi2sd{q}">,
XD, VEX_W, EVEX_CD8<64, CD8VT1>;
+def : InstAlias<"vcvtusi2ss\t{$src, $src1, $dst|$dst, $src1, $src}",
+ (VCVTUSI2SSZrm FR64X:$dst, FR64X:$src1, i32mem:$src), 0>;
+def : InstAlias<"vcvtusi2sd\t{$src, $src1, $dst|$dst, $src1, $src}",
+ (VCVTUSI2SDZrm FR64X:$dst, FR64X:$src1, i32mem:$src), 0>;
+
def : Pat<(f32 (uint_to_fp (loadi32 addr:$src))),
(VCVTUSI2SSZrm (f32 (IMPLICIT_DEF)), addr:$src)>;
def : Pat<(f32 (uint_to_fp (loadi64 addr:$src))),
@@ -5170,106 +5917,158 @@ defm VCVTSD2USI64Z: avx512_cvt_s_int_round<0x79, f64x_info, i64x_info,
// Therefore, the SSE intrinsics are mapped to the AVX512 instructions.
let Predicates = [HasAVX512] in {
def : Pat<(i32 (int_x86_sse_cvtss2si (v4f32 VR128X:$src))),
- (VCVTSS2SIZrr (COPY_TO_REGCLASS VR128X:$src, FR32X))>;
+ (VCVTSS2SIZrr VR128X:$src)>;
+ def : Pat<(i32 (int_x86_sse_cvtss2si (sse_load_f32 addr:$src))),
+ (VCVTSS2SIZrm addr:$src)>;
def : Pat<(i64 (int_x86_sse_cvtss2si64 (v4f32 VR128X:$src))),
- (VCVTSS2SI64Zrr (COPY_TO_REGCLASS VR128X:$src, FR32X))>;
+ (VCVTSS2SI64Zrr VR128X:$src)>;
+ def : Pat<(i64 (int_x86_sse_cvtss2si64 (sse_load_f32 addr:$src))),
+ (VCVTSS2SI64Zrm addr:$src)>;
def : Pat<(i32 (int_x86_sse2_cvtsd2si (v2f64 VR128X:$src))),
- (VCVTSD2SIZrr (COPY_TO_REGCLASS VR128X:$src, FR64X))>;
+ (VCVTSD2SIZrr VR128X:$src)>;
+ def : Pat<(i32 (int_x86_sse2_cvtsd2si (sse_load_f64 addr:$src))),
+ (VCVTSD2SIZrm addr:$src)>;
def : Pat<(i64 (int_x86_sse2_cvtsd2si64 (v2f64 VR128X:$src))),
- (VCVTSD2SI64Zrr (COPY_TO_REGCLASS VR128X:$src, FR64X))>;
+ (VCVTSD2SI64Zrr VR128X:$src)>;
+ def : Pat<(i64 (int_x86_sse2_cvtsd2si64 (sse_load_f64 addr:$src))),
+ (VCVTSD2SI64Zrm addr:$src)>;
} // HasAVX512
-let isCodeGenOnly = 1 , Predicates = [HasAVX512] in {
- defm Int_VCVTSI2SSZ : sse12_cvt_sint_3addr<0x2A, GR32, VR128X,
- int_x86_sse_cvtsi2ss, i32mem, loadi32, "cvtsi2ss{l}",
- SSE_CVT_Scalar, 0>, XS, EVEX_4V;
- defm Int_VCVTSI2SS64Z : sse12_cvt_sint_3addr<0x2A, GR64, VR128X,
- int_x86_sse_cvtsi642ss, i64mem, loadi64, "cvtsi2ss{q}",
- SSE_CVT_Scalar, 0>, XS, EVEX_4V, VEX_W;
- defm Int_VCVTSI2SDZ : sse12_cvt_sint_3addr<0x2A, GR32, VR128X,
- int_x86_sse2_cvtsi2sd, i32mem, loadi32, "cvtsi2sd{l}",
- SSE_CVT_Scalar, 0>, XD, EVEX_4V;
- defm Int_VCVTSI2SD64Z : sse12_cvt_sint_3addr<0x2A, GR64, VR128X,
- int_x86_sse2_cvtsi642sd, i64mem, loadi64, "cvtsi2sd{q}",
- SSE_CVT_Scalar, 0>, XD, EVEX_4V, VEX_W;
-
- defm Int_VCVTUSI2SDZ : sse12_cvt_sint_3addr<0x7B, GR32, VR128X,
- int_x86_avx512_cvtusi2sd, i32mem, loadi32, "cvtusi2sd{l}",
- SSE_CVT_Scalar, 0>, XD, EVEX_4V;
-} // isCodeGenOnly = 1, Predicates = [HasAVX512]
+let Predicates = [HasAVX512] in {
+ def : Pat<(int_x86_sse_cvtsi2ss VR128X:$src1, GR32:$src2),
+ (VCVTSI2SSZrr_Int VR128X:$src1, GR32:$src2)>;
+ def : Pat<(int_x86_sse_cvtsi2ss VR128X:$src1, (loadi32 addr:$src2)),
+ (VCVTSI2SSZrm_Int VR128X:$src1, addr:$src2)>;
+ def : Pat<(int_x86_sse_cvtsi642ss VR128X:$src1, GR64:$src2),
+ (VCVTSI642SSZrr_Int VR128X:$src1, GR64:$src2)>;
+ def : Pat<(int_x86_sse_cvtsi642ss VR128X:$src1, (loadi64 addr:$src2)),
+ (VCVTSI642SSZrm_Int VR128X:$src1, addr:$src2)>;
+ def : Pat<(int_x86_sse2_cvtsi2sd VR128X:$src1, GR32:$src2),
+ (VCVTSI2SDZrr_Int VR128X:$src1, GR32:$src2)>;
+ def : Pat<(int_x86_sse2_cvtsi2sd VR128X:$src1, (loadi32 addr:$src2)),
+ (VCVTSI2SDZrm_Int VR128X:$src1, addr:$src2)>;
+ def : Pat<(int_x86_sse2_cvtsi642sd VR128X:$src1, GR64:$src2),
+ (VCVTSI642SDZrr_Int VR128X:$src1, GR64:$src2)>;
+ def : Pat<(int_x86_sse2_cvtsi642sd VR128X:$src1, (loadi64 addr:$src2)),
+ (VCVTSI642SDZrm_Int VR128X:$src1, addr:$src2)>;
+ def : Pat<(int_x86_avx512_cvtusi2sd VR128X:$src1, GR32:$src2),
+ (VCVTUSI2SDZrr_Int VR128X:$src1, GR32:$src2)>;
+ def : Pat<(int_x86_avx512_cvtusi2sd VR128X:$src1, (loadi32 addr:$src2)),
+ (VCVTUSI2SDZrm_Int VR128X:$src1, addr:$src2)>;
+} // Predicates = [HasAVX512]
+
+// Patterns used for matching vcvtsi2s{s,d} intrinsic sequences from clang
+// which produce unnecessary vmovs{s,d} instructions
+let Predicates = [HasAVX512] in {
+def : Pat<(v4f32 (X86Movss
+ (v4f32 VR128X:$dst),
+ (v4f32 (scalar_to_vector (f32 (sint_to_fp GR64:$src)))))),
+ (VCVTSI642SSZrr_Int VR128X:$dst, GR64:$src)>;
+
+def : Pat<(v4f32 (X86Movss
+ (v4f32 VR128X:$dst),
+ (v4f32 (scalar_to_vector (f32 (sint_to_fp GR32:$src)))))),
+ (VCVTSI2SSZrr_Int VR128X:$dst, GR32:$src)>;
+
+def : Pat<(v2f64 (X86Movsd
+ (v2f64 VR128X:$dst),
+ (v2f64 (scalar_to_vector (f64 (sint_to_fp GR64:$src)))))),
+ (VCVTSI642SDZrr_Int VR128X:$dst, GR64:$src)>;
+
+def : Pat<(v2f64 (X86Movsd
+ (v2f64 VR128X:$dst),
+ (v2f64 (scalar_to_vector (f64 (sint_to_fp GR32:$src)))))),
+ (VCVTSI2SDZrr_Int VR128X:$dst, GR32:$src)>;
+} // Predicates = [HasAVX512]
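
Editorial note, not part of this patch: a hedged C sketch of the clang-emitted idiom these X86Movss/X86Movsd patterns are meant to fold, using the standard SSE intrinsic _mm_cvtsi32_ss.

#include <xmmintrin.h>

/* The low lane becomes (float)x and the upper lanes are taken from dst; clang
 * expands this to a sint_to_fp blended into dst through a movss-style node,
 * and the patterns above let instruction selection emit a single vcvtsi2ss
 * instead of a cvtsi2ss followed by a vmovss. */
__m128 set_low_from_int(__m128 dst, int x) {
  return _mm_cvtsi32_ss(dst, x);
}
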
// Convert float/double to signed/unsigned int 32/64 with truncation
multiclass avx512_cvt_s_all<bits<8> opc, string asm, X86VectorVTInfo _SrcRC,
X86VectorVTInfo _DstRC, SDNode OpNode,
- SDNode OpNodeRnd>{
+ SDNode OpNodeRnd, string aliasStr>{
let Predicates = [HasAVX512] in {
- def rr : SI<opc, MRMSrcReg, (outs _DstRC.RC:$dst), (ins _SrcRC.FRC:$src),
+ def rr : AVX512<opc, MRMSrcReg, (outs _DstRC.RC:$dst), (ins _SrcRC.FRC:$src),
!strconcat(asm,"\t{$src, $dst|$dst, $src}"),
[(set _DstRC.RC:$dst, (OpNode _SrcRC.FRC:$src))]>, EVEX;
- def rb : SI<opc, MRMSrcReg, (outs _DstRC.RC:$dst), (ins _SrcRC.FRC:$src),
+ let hasSideEffects = 0 in
+ def rb : AVX512<opc, MRMSrcReg, (outs _DstRC.RC:$dst), (ins _SrcRC.FRC:$src),
!strconcat(asm,"\t{{sae}, $src, $dst|$dst, $src, {sae}}"),
[]>, EVEX, EVEX_B;
- def rm : SI<opc, MRMSrcMem, (outs _DstRC.RC:$dst), (ins _SrcRC.ScalarMemOp:$src),
+ def rm : AVX512<opc, MRMSrcMem, (outs _DstRC.RC:$dst), (ins _SrcRC.ScalarMemOp:$src),
!strconcat(asm,"\t{$src, $dst|$dst, $src}"),
[(set _DstRC.RC:$dst, (OpNode (_SrcRC.ScalarLdFrag addr:$src)))]>,
EVEX;
+ def : InstAlias<asm # aliasStr # "\t{$src, $dst|$dst, $src}",
+ (!cast<Instruction>(NAME # "rr") _DstRC.RC:$dst, _SrcRC.FRC:$src), 0>;
+ def : InstAlias<asm # aliasStr # "\t\t{{sae}, $src, $dst|$dst, $src, {sae}}",
+ (!cast<Instruction>(NAME # "rb") _DstRC.RC:$dst, _SrcRC.FRC:$src), 0>;
+ def : InstAlias<asm # aliasStr # "\t{$src, $dst|$dst, $src}",
+ (!cast<Instruction>(NAME # "rm") _DstRC.RC:$dst,
+ _SrcRC.ScalarMemOp:$src), 0>;
+
let isCodeGenOnly = 1 in {
- def rr_Int : SI<opc, MRMSrcReg, (outs _DstRC.RC:$dst), (ins _SrcRC.RC:$src),
- !strconcat(asm,"\t{$src, $dst|$dst, $src}"),
- [(set _DstRC.RC:$dst, (OpNodeRnd (_SrcRC.VT _SrcRC.RC:$src),
- (i32 FROUND_CURRENT)))]>, EVEX, VEX_LIG;
- def rb_Int : SI<opc, MRMSrcReg, (outs _DstRC.RC:$dst), (ins _SrcRC.RC:$src),
- !strconcat(asm,"\t{{sae}, $src, $dst|$dst, $src, {sae}}"),
- [(set _DstRC.RC:$dst, (OpNodeRnd (_SrcRC.VT _SrcRC.RC:$src),
- (i32 FROUND_NO_EXC)))]>,
- EVEX,VEX_LIG , EVEX_B;
- let mayLoad = 1, hasSideEffects = 0 in
- def rm_Int : SI<opc, MRMSrcMem, (outs _DstRC.RC:$dst),
- (ins _SrcRC.MemOp:$src),
- !strconcat(asm,"\t{$src, $dst|$dst, $src}"),
- []>, EVEX, VEX_LIG;
+ def rr_Int : AVX512<opc, MRMSrcReg, (outs _DstRC.RC:$dst), (ins _SrcRC.RC:$src),
+ !strconcat(asm,"\t{$src, $dst|$dst, $src}"),
+ [(set _DstRC.RC:$dst, (OpNodeRnd (_SrcRC.VT _SrcRC.RC:$src),
+ (i32 FROUND_CURRENT)))]>, EVEX, VEX_LIG;
+ def rb_Int : AVX512<opc, MRMSrcReg, (outs _DstRC.RC:$dst), (ins _SrcRC.RC:$src),
+ !strconcat(asm,"\t{{sae}, $src, $dst|$dst, $src, {sae}}"),
+ [(set _DstRC.RC:$dst, (OpNodeRnd (_SrcRC.VT _SrcRC.RC:$src),
+ (i32 FROUND_NO_EXC)))]>,
+ EVEX,VEX_LIG , EVEX_B;
+ let mayLoad = 1, hasSideEffects = 0 in
+ def rm_Int : AVX512<opc, MRMSrcMem, (outs _DstRC.RC:$dst),
+ (ins _SrcRC.MemOp:$src),
+ !strconcat(asm,"\t{$src, $dst|$dst, $src}"),
+ []>, EVEX, VEX_LIG;
} // isCodeGenOnly = 1
} //HasAVX512
}
-defm VCVTTSS2SIZ: avx512_cvt_s_all<0x2C, "cvttss2si", f32x_info, i32x_info,
- fp_to_sint,X86cvtts2IntRnd>,
+defm VCVTTSS2SIZ: avx512_cvt_s_all<0x2C, "vcvttss2si", f32x_info, i32x_info,
+ fp_to_sint, X86cvtts2IntRnd, "{l}">,
XS, EVEX_CD8<32, CD8VT1>;
-defm VCVTTSS2SI64Z: avx512_cvt_s_all<0x2C, "cvttss2si", f32x_info, i64x_info,
- fp_to_sint,X86cvtts2IntRnd>,
+defm VCVTTSS2SI64Z: avx512_cvt_s_all<0x2C, "vcvttss2si", f32x_info, i64x_info,
+ fp_to_sint, X86cvtts2IntRnd, "{q}">,
VEX_W, XS, EVEX_CD8<32, CD8VT1>;
-defm VCVTTSD2SIZ: avx512_cvt_s_all<0x2C, "cvttsd2si", f64x_info, i32x_info,
- fp_to_sint,X86cvtts2IntRnd>,
+defm VCVTTSD2SIZ: avx512_cvt_s_all<0x2C, "vcvttsd2si", f64x_info, i32x_info,
+ fp_to_sint, X86cvtts2IntRnd, "{l}">,
XD, EVEX_CD8<64, CD8VT1>;
-defm VCVTTSD2SI64Z: avx512_cvt_s_all<0x2C, "cvttsd2si", f64x_info, i64x_info,
- fp_to_sint,X86cvtts2IntRnd>,
+defm VCVTTSD2SI64Z: avx512_cvt_s_all<0x2C, "vcvttsd2si", f64x_info, i64x_info,
+ fp_to_sint, X86cvtts2IntRnd, "{q}">,
VEX_W, XD, EVEX_CD8<64, CD8VT1>;
-defm VCVTTSS2USIZ: avx512_cvt_s_all<0x78, "cvttss2usi", f32x_info, i32x_info,
- fp_to_uint,X86cvtts2UIntRnd>,
+defm VCVTTSS2USIZ: avx512_cvt_s_all<0x78, "vcvttss2usi", f32x_info, i32x_info,
+ fp_to_uint, X86cvtts2UIntRnd, "{l}">,
XS, EVEX_CD8<32, CD8VT1>;
-defm VCVTTSS2USI64Z: avx512_cvt_s_all<0x78, "cvttss2usi", f32x_info, i64x_info,
- fp_to_uint,X86cvtts2UIntRnd>,
+defm VCVTTSS2USI64Z: avx512_cvt_s_all<0x78, "vcvttss2usi", f32x_info, i64x_info,
+ fp_to_uint, X86cvtts2UIntRnd, "{q}">,
XS,VEX_W, EVEX_CD8<32, CD8VT1>;
-defm VCVTTSD2USIZ: avx512_cvt_s_all<0x78, "cvttsd2usi", f64x_info, i32x_info,
- fp_to_uint,X86cvtts2UIntRnd>,
+defm VCVTTSD2USIZ: avx512_cvt_s_all<0x78, "vcvttsd2usi", f64x_info, i32x_info,
+ fp_to_uint, X86cvtts2UIntRnd, "{l}">,
XD, EVEX_CD8<64, CD8VT1>;
-defm VCVTTSD2USI64Z: avx512_cvt_s_all<0x78, "cvttsd2usi", f64x_info, i64x_info,
- fp_to_uint,X86cvtts2UIntRnd>,
+defm VCVTTSD2USI64Z: avx512_cvt_s_all<0x78, "vcvttsd2usi", f64x_info, i64x_info,
+ fp_to_uint, X86cvtts2UIntRnd, "{q}">,
XD, VEX_W, EVEX_CD8<64, CD8VT1>;
let Predicates = [HasAVX512] in {
def : Pat<(i32 (int_x86_sse_cvttss2si (v4f32 VR128X:$src))),
- (VCVTTSS2SIZrr_Int (COPY_TO_REGCLASS VR128X:$src, FR32X))>;
+ (VCVTTSS2SIZrr_Int VR128X:$src)>;
+ def : Pat<(i32 (int_x86_sse_cvttss2si (sse_load_f32 addr:$src))),
+ (VCVTTSS2SIZrm_Int addr:$src)>;
def : Pat<(i64 (int_x86_sse_cvttss2si64 (v4f32 VR128X:$src))),
- (VCVTTSS2SI64Zrr_Int (COPY_TO_REGCLASS VR128X:$src, FR32X))>;
+ (VCVTTSS2SI64Zrr_Int VR128X:$src)>;
+ def : Pat<(i64 (int_x86_sse_cvttss2si64 (sse_load_f32 addr:$src))),
+ (VCVTTSS2SI64Zrm_Int addr:$src)>;
def : Pat<(i32 (int_x86_sse2_cvttsd2si (v2f64 VR128X:$src))),
- (VCVTTSD2SIZrr_Int (COPY_TO_REGCLASS VR128X:$src, FR64X))>;
+ (VCVTTSD2SIZrr_Int VR128X:$src)>;
+ def : Pat<(i32 (int_x86_sse2_cvttsd2si (sse_load_f64 addr:$src))),
+ (VCVTTSD2SIZrm_Int addr:$src)>;
def : Pat<(i64 (int_x86_sse2_cvttsd2si64 (v2f64 VR128X:$src))),
- (VCVTTSD2SI64Zrr_Int (COPY_TO_REGCLASS VR128X:$src, FR64X))>;
-
+ (VCVTTSD2SI64Zrr_Int VR128X:$src)>;
+ def : Pat<(i64 (int_x86_sse2_cvttsd2si64 (sse_load_f64 addr:$src))),
+ (VCVTTSD2SI64Zrm_Int addr:$src)>;
} // HasAVX512
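
For context (not part of the patch): a small C example of the SSE truncating-convert intrinsics that the patterns above map, including their memory forms, onto the Z-suffixed AVX-512 instructions.

#include <emmintrin.h>

/* Truncating double -> int32 conversion; with AVX-512 the *_Int patterns
 * above select the VCVTTSD2SIZrr_Int / VCVTTSD2SIZrm_Int forms for the
 * register and load variants of this intrinsic. */
int truncate_low(__m128d v) {
  return _mm_cvttsd_si32(v);
}
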
//===----------------------------------------------------------------------===//
// AVX-512 Convert form float to double and back
@@ -5280,14 +6079,16 @@ multiclass avx512_cvt_fp_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _
(ins _.RC:$src1, _Src.RC:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(_.VT (OpNode (_.VT _.RC:$src1),
- (_Src.VT _Src.RC:$src2)))>,
+ (_Src.VT _Src.RC:$src2),
+ (i32 FROUND_CURRENT)))>,
EVEX_4V, VEX_LIG, Sched<[WriteCvtF2F]>;
defm rm : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _Src.RC:$src1, _Src.ScalarMemOp:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(_.VT (OpNode (_.VT _.RC:$src1),
(_Src.VT (scalar_to_vector
- (_Src.ScalarLdFrag addr:$src2)))))>,
+ (_Src.ScalarLdFrag addr:$src2))),
+ (i32 FROUND_CURRENT)))>,
EVEX_4V, VEX_LIG, Sched<[WriteCvtF2FLd, ReadAfterLd]>;
}
@@ -5314,36 +6115,35 @@ multiclass avx512_cvt_fp_rc_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInf
EVEX_4V, VEX_LIG, Sched<[WriteCvtF2FLd, ReadAfterLd]>,
EVEX_B, EVEX_RC;
}
-multiclass avx512_cvt_fp_scalar_sd2ss<bits<8> opc, string OpcodeStr, SDNode OpNode,
+multiclass avx512_cvt_fp_scalar_sd2ss<bits<8> opc, string OpcodeStr,
SDNode OpNodeRnd, X86VectorVTInfo _src,
X86VectorVTInfo _dst> {
let Predicates = [HasAVX512] in {
- defm Z : avx512_cvt_fp_scalar<opc, OpcodeStr, _dst, _src, OpNode>,
+ defm Z : avx512_cvt_fp_scalar<opc, OpcodeStr, _dst, _src, OpNodeRnd>,
avx512_cvt_fp_rc_scalar<opc, OpcodeStr, _dst, _src,
- OpNodeRnd>, VEX_W, EVEX_CD8<64, CD8VT1>,
- EVEX_V512, XD;
+ OpNodeRnd>, VEX_W, EVEX_CD8<64, CD8VT1>, XD;
}
}
-multiclass avx512_cvt_fp_scalar_ss2sd<bits<8> opc, string OpcodeStr, SDNode OpNode,
+multiclass avx512_cvt_fp_scalar_ss2sd<bits<8> opc, string OpcodeStr,
SDNode OpNodeRnd, X86VectorVTInfo _src,
X86VectorVTInfo _dst> {
let Predicates = [HasAVX512] in {
- defm Z : avx512_cvt_fp_scalar<opc, OpcodeStr, _dst, _src, OpNode>,
+ defm Z : avx512_cvt_fp_scalar<opc, OpcodeStr, _dst, _src, OpNodeRnd>,
avx512_cvt_fp_sae_scalar<opc, OpcodeStr, _dst, _src, OpNodeRnd>,
- EVEX_CD8<32, CD8VT1>, XS, EVEX_V512;
+ EVEX_CD8<32, CD8VT1>, XS;
}
}
-defm VCVTSD2SS : avx512_cvt_fp_scalar_sd2ss<0x5A, "vcvtsd2ss", X86fround,
+defm VCVTSD2SS : avx512_cvt_fp_scalar_sd2ss<0x5A, "vcvtsd2ss",
X86froundRnd, f64x_info, f32x_info>;
-defm VCVTSS2SD : avx512_cvt_fp_scalar_ss2sd<0x5A, "vcvtss2sd", X86fpext,
+defm VCVTSS2SD : avx512_cvt_fp_scalar_ss2sd<0x5A, "vcvtss2sd",
X86fpextRnd,f32x_info, f64x_info >;
-def : Pat<(f64 (fextend FR32X:$src)),
+def : Pat<(f64 (fpextend FR32X:$src)),
(COPY_TO_REGCLASS (VCVTSS2SDZrr (COPY_TO_REGCLASS FR32X:$src, VR128X),
(COPY_TO_REGCLASS FR32X:$src, VR128X)), VR128X)>,
Requires<[HasAVX512]>;
-def : Pat<(f64 (fextend (loadf32 addr:$src))),
+def : Pat<(f64 (fpextend (loadf32 addr:$src))),
(COPY_TO_REGCLASS (VCVTSS2SDZrm (v4f32 (IMPLICIT_DEF)), addr:$src), VR128X)>,
Requires<[HasAVX512]>;
@@ -5356,10 +6156,25 @@ def : Pat<(f64 (extloadf32 addr:$src)),
(COPY_TO_REGCLASS (VMOVSSZrm addr:$src), VR128X)), VR128X)>,
Requires<[HasAVX512, OptForSpeed]>;
-def : Pat<(f32 (fround FR64X:$src)),
+def : Pat<(f32 (fpround FR64X:$src)),
(COPY_TO_REGCLASS (VCVTSD2SSZrr (COPY_TO_REGCLASS FR64X:$src, VR128X),
(COPY_TO_REGCLASS FR64X:$src, VR128X)), VR128X)>,
Requires<[HasAVX512]>;
+
+def : Pat<(v4f32 (X86Movss
+ (v4f32 VR128X:$dst),
+ (v4f32 (scalar_to_vector
+ (f32 (fpround (f64 (extractelt VR128X:$src, (iPTR 0))))))))),
+ (VCVTSD2SSZrr VR128X:$dst, VR128X:$src)>,
+ Requires<[HasAVX512]>;
+
+def : Pat<(v2f64 (X86Movsd
+ (v2f64 VR128X:$dst),
+ (v2f64 (scalar_to_vector
+ (f64 (fpextend (f32 (extractelt VR128X:$src, (iPTR 0))))))))),
+ (VCVTSS2SDZrr VR128X:$dst, VR128X:$src)>,
+ Requires<[HasAVX512]>;
+
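
Editorial sketch, not from the patch and assuming the standard SSE2 intrinsics: the source idiom the two X86Movss/X86Movsd patterns just above are intended to select into a single convert instruction.

#include <emmintrin.h>

/* Low lane is src[0] rounded to float, upper lanes come from dst; the pattern
 * above matches this movss-of-fpround shape and emits one vcvtsd2ss. */
__m128 narrow_low(__m128 dst, __m128d src) {
  return _mm_cvtsd_ss(dst, src);
}

/* The mirror case: widen the low float of src to double into dst's low lane
 * (matched by the vcvtss2sd pattern). */
__m128d widen_low(__m128d dst, __m128 src) {
  return _mm_cvtss_sd(dst, src);
}
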
//===----------------------------------------------------------------------===//
// AVX-512 Vector convert from signed/unsigned integer to float/double
// and from float/double to signed/unsigned integer
@@ -5368,14 +6183,14 @@ def : Pat<(f32 (fround FR64X:$src)),
multiclass avx512_vcvt_fp<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
X86VectorVTInfo _Src, SDNode OpNode,
string Broadcast = _.BroadcastStr,
- string Alias = ""> {
+ string Alias = "", X86MemOperand MemOp = _Src.MemOp> {
defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _Src.RC:$src), OpcodeStr, "$src", "$src",
(_.VT (OpNode (_Src.VT _Src.RC:$src)))>, EVEX;
defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
- (ins _Src.MemOp:$src), OpcodeStr#Alias, "$src", "$src",
+ (ins MemOp:$src), OpcodeStr#Alias, "$src", "$src",
(_.VT (OpNode (_Src.VT
(bitconvert (_Src.LdFrag addr:$src)))))>, EVEX;
@@ -5410,14 +6225,14 @@ multiclass avx512_vcvt_fp_rc<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
// Extend Float to Double
multiclass avx512_cvtps2pd<bits<8> opc, string OpcodeStr> {
let Predicates = [HasAVX512] in {
- defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f64_info, v8f32x_info, fextend>,
+ defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f64_info, v8f32x_info, fpextend>,
avx512_vcvt_fp_sae<opc, OpcodeStr, v8f64_info, v8f32x_info,
X86vfpextRnd>, EVEX_V512;
}
let Predicates = [HasVLX] in {
defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2f64x_info, v4f32x_info,
- X86vfpext, "{1to2}">, EVEX_V128;
- defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f64x_info, v4f32x_info, fextend>,
+ X86vfpext, "{1to2}", "", f64mem>, EVEX_V128;
+ defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f64x_info, v4f32x_info, fpextend>,
EVEX_V256;
}
}
@@ -5425,15 +6240,24 @@ multiclass avx512_cvtps2pd<bits<8> opc, string OpcodeStr> {
// Truncate Double to Float
multiclass avx512_cvtpd2ps<bits<8> opc, string OpcodeStr> {
let Predicates = [HasAVX512] in {
- defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f32x_info, v8f64_info, fround>,
+ defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f32x_info, v8f64_info, fpround>,
avx512_vcvt_fp_rc<opc, OpcodeStr, v8f32x_info, v8f64_info,
X86vfproundRnd>, EVEX_V512;
}
let Predicates = [HasVLX] in {
defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v2f64x_info,
X86vfpround, "{1to2}", "{x}">, EVEX_V128;
- defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v4f64x_info, fround,
+ defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v4f64x_info, fpround,
"{1to4}", "{y}">, EVEX_V256;
+
+ def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}",
+ (!cast<Instruction>(NAME # "Z128rr") VR128X:$dst, VR128X:$src), 0>;
+ def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}",
+ (!cast<Instruction>(NAME # "Z128rm") VR128X:$dst, f128mem:$src), 0>;
+ def : InstAlias<OpcodeStr##"y\t{$src, $dst|$dst, $src}",
+ (!cast<Instruction>(NAME # "Z256rr") VR128X:$dst, VR256X:$src), 0>;
+ def : InstAlias<OpcodeStr##"y\t{$src, $dst|$dst, $src}",
+ (!cast<Instruction>(NAME # "Z256rm") VR128X:$dst, f256mem:$src), 0>;
}
}
@@ -5446,6 +6270,12 @@ def : Pat<(v8f64 (extloadv8f32 addr:$src)),
(VCVTPS2PDZrm addr:$src)>;
let Predicates = [HasVLX] in {
+ let AddedComplexity = 15 in
+ def : Pat<(X86vzmovl (v2f64 (bitconvert
+ (v4f32 (X86vfpround (v2f64 VR128X:$src)))))),
+ (VCVTPD2PSZ128rr VR128X:$src)>;
+ def : Pat<(v2f64 (extloadv2f32 addr:$src)),
+ (VCVTPS2PDZ128rm addr:$src)>;
def : Pat<(v4f64 (extloadv4f32 addr:$src)),
(VCVTPS2PDZ256rm addr:$src)>;
}
@@ -5460,7 +6290,7 @@ multiclass avx512_cvtdq2pd<bits<8> opc, string OpcodeStr, SDNode OpNode,
let Predicates = [HasVLX] in {
defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2f64x_info, v4i32x_info,
- OpNode128, "{1to2}">, EVEX_V128;
+ OpNode128, "{1to2}", "", i64mem>, EVEX_V128;
defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f64x_info, v4i32x_info, OpNode>,
EVEX_V256;
}
@@ -5515,8 +6345,8 @@ multiclass avx512_cvtps2dq<bits<8> opc, string OpcodeStr,
}
// Convert Double to Signed/Unsigned Doubleword with truncation
-multiclass avx512_cvttpd2dq<bits<8> opc, string OpcodeStr,
- SDNode OpNode, SDNode OpNodeRnd> {
+multiclass avx512_cvttpd2dq<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ SDNode OpNode128, SDNode OpNodeRnd> {
let Predicates = [HasAVX512] in {
defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i32x_info, v8f64_info, OpNode>,
avx512_vcvt_fp_sae<opc, OpcodeStr, v8i32x_info, v8f64_info,
@@ -5524,13 +6354,22 @@ multiclass avx512_cvttpd2dq<bits<8> opc, string OpcodeStr,
}
let Predicates = [HasVLX] in {
// we need "x"/"y" suffixes in order to distinguish between 128 and 256
- // memory forms of these instructions in Asm Parcer. They have the same
+ // memory forms of these instructions in Asm Parser. They have the same
// dest type - 'v4i32x_info'. We also specify the broadcast string explicitly
// due to the same reason.
- defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v2f64x_info, OpNode,
- "{1to2}", "{x}">, EVEX_V128;
+ defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v2f64x_info,
+ OpNode128, "{1to2}", "{x}">, EVEX_V128;
defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v4f64x_info, OpNode,
"{1to4}", "{y}">, EVEX_V256;
+
+ def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}",
+ (!cast<Instruction>(NAME # "Z128rr") VR128X:$dst, VR128X:$src), 0>;
+ def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}",
+ (!cast<Instruction>(NAME # "Z128rm") VR128X:$dst, i128mem:$src), 0>;
+ def : InstAlias<OpcodeStr##"y\t{$src, $dst|$dst, $src}",
+ (!cast<Instruction>(NAME # "Z256rr") VR128X:$dst, VR256X:$src), 0>;
+ def : InstAlias<OpcodeStr##"y\t{$src, $dst|$dst, $src}",
+ (!cast<Instruction>(NAME # "Z256rm") VR128X:$dst, i256mem:$src), 0>;
}
}
@@ -5551,6 +6390,15 @@ multiclass avx512_cvtpd2dq<bits<8> opc, string OpcodeStr,
"{1to2}", "{x}">, EVEX_V128;
defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v4f64x_info, OpNode,
"{1to4}", "{y}">, EVEX_V256;
+
+ def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}",
+ (!cast<Instruction>(NAME # "Z128rr") VR128X:$dst, VR128X:$src), 0>;
+ def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}",
+ (!cast<Instruction>(NAME # "Z128rm") VR128X:$dst, f128mem:$src), 0>;
+ def : InstAlias<OpcodeStr##"y\t{$src, $dst|$dst, $src}",
+ (!cast<Instruction>(NAME # "Z256rr") VR128X:$dst, VR256X:$src), 0>;
+ def : InstAlias<OpcodeStr##"y\t{$src, $dst|$dst, $src}",
+ (!cast<Instruction>(NAME # "Z256rm") VR128X:$dst, f256mem:$src), 0>;
}
}
@@ -5614,15 +6462,15 @@ multiclass avx512_cvtps2qq<bits<8> opc, string OpcodeStr,
// Explicitly specified broadcast string, since we take only 2 elements
// from v4f32x_info source
defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2i64x_info, v4f32x_info, OpNode,
- "{1to2}">, EVEX_V128;
+ "{1to2}", "", f64mem>, EVEX_V128;
defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i64x_info, v4f32x_info, OpNode>,
EVEX_V256;
}
}
 // Convert Float to Signed/Unsigned Quadword with truncation
-multiclass avx512_cvttps2qq<bits<8> opc, string OpcodeStr,
- SDNode OpNode, SDNode OpNodeRnd> {
+multiclass avx512_cvttps2qq<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ SDNode OpNode128, SDNode OpNodeRnd> {
let Predicates = [HasDQI] in {
defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i64_info, v8f32x_info, OpNode>,
avx512_vcvt_fp_sae<opc, OpcodeStr, v8i64_info, v8f32x_info,
@@ -5631,16 +6479,16 @@ multiclass avx512_cvttps2qq<bits<8> opc, string OpcodeStr,
let Predicates = [HasDQI, HasVLX] in {
// Explicitly specified broadcast string, since we take only 2 elements
// from v4f32x_info source
- defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2i64x_info, v4f32x_info, OpNode,
- "{1to2}">, EVEX_V128;
+ defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2i64x_info, v4f32x_info, OpNode128,
+ "{1to2}", "", f64mem>, EVEX_V128;
defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i64x_info, v4f32x_info, OpNode>,
EVEX_V256;
}
}
 // Convert Signed/Unsigned Quadword to Float
-multiclass avx512_cvtqq2ps<bits<8> opc, string OpcodeStr,
- SDNode OpNode, SDNode OpNodeRnd> {
+multiclass avx512_cvtqq2ps<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ SDNode OpNode128, SDNode OpNodeRnd> {
let Predicates = [HasDQI] in {
defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f32x_info, v8i64_info, OpNode>,
avx512_vcvt_fp_rc<opc, OpcodeStr, v8f32x_info, v8i64_info,
@@ -5651,37 +6499,46 @@ multiclass avx512_cvtqq2ps<bits<8> opc, string OpcodeStr,
 // memory forms of these instructions in Asm Parser. They have the same
// dest type - 'v4i32x_info'. We also specify the broadcast string explicitly
// due to the same reason.
- defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v2i64x_info, OpNode,
+ defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v2i64x_info, OpNode128,
"{1to2}", "{x}">, EVEX_V128;
defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v4i64x_info, OpNode,
"{1to4}", "{y}">, EVEX_V256;
+
+ def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}",
+ (!cast<Instruction>(NAME # "Z128rr") VR128X:$dst, VR128X:$src), 0>;
+ def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}",
+ (!cast<Instruction>(NAME # "Z128rm") VR128X:$dst, i128mem:$src), 0>;
+ def : InstAlias<OpcodeStr##"y\t{$src, $dst|$dst, $src}",
+ (!cast<Instruction>(NAME # "Z256rr") VR128X:$dst, VR256X:$src), 0>;
+ def : InstAlias<OpcodeStr##"y\t{$src, $dst|$dst, $src}",
+ (!cast<Instruction>(NAME # "Z256rm") VR128X:$dst, i256mem:$src), 0>;
}
}
-defm VCVTDQ2PD : avx512_cvtdq2pd<0xE6, "vcvtdq2pd", sint_to_fp, X86cvtdq2pd>, XS,
- EVEX_CD8<32, CD8VH>;
+defm VCVTDQ2PD : avx512_cvtdq2pd<0xE6, "vcvtdq2pd", sint_to_fp, X86VSintToFP>,
+ XS, EVEX_CD8<32, CD8VH>;
defm VCVTDQ2PS : avx512_cvtdq2ps<0x5B, "vcvtdq2ps", sint_to_fp,
X86VSintToFpRnd>,
PS, EVEX_CD8<32, CD8VF>;
defm VCVTTPS2DQ : avx512_cvttps2dq<0x5B, "vcvttps2dq", fp_to_sint,
- X86VFpToSintRnd>,
+ X86cvttp2siRnd>,
XS, EVEX_CD8<32, CD8VF>;
-defm VCVTTPD2DQ : avx512_cvttpd2dq<0xE6, "vcvttpd2dq", fp_to_sint,
- X86VFpToSintRnd>,
+defm VCVTTPD2DQ : avx512_cvttpd2dq<0xE6, "vcvttpd2dq", fp_to_sint, X86cvttp2si,
+ X86cvttp2siRnd>,
PD, VEX_W, EVEX_CD8<64, CD8VF>;
defm VCVTTPS2UDQ : avx512_cvttps2dq<0x78, "vcvttps2udq", fp_to_uint,
- X86VFpToUintRnd>, PS,
+ X86cvttp2uiRnd>, PS,
EVEX_CD8<32, CD8VF>;
defm VCVTTPD2UDQ : avx512_cvttpd2dq<0x78, "vcvttpd2udq", fp_to_uint,
- X86VFpToUintRnd>, PS, VEX_W,
+ X86cvttp2ui, X86cvttp2uiRnd>, PS, VEX_W,
EVEX_CD8<64, CD8VF>;
-defm VCVTUDQ2PD : avx512_cvtdq2pd<0x7A, "vcvtudq2pd", uint_to_fp, X86cvtudq2pd>,
+defm VCVTUDQ2PD : avx512_cvtdq2pd<0x7A, "vcvtudq2pd", uint_to_fp, X86VUintToFP>,
XS, EVEX_CD8<32, CD8VH>;
defm VCVTUDQ2PS : avx512_cvtdq2ps<0x7A, "vcvtudq2ps", uint_to_fp,
@@ -5717,18 +6574,18 @@ defm VCVTPS2UQQ : avx512_cvtps2qq<0x79, "vcvtps2uqq", X86cvtp2UInt,
X86cvtp2UIntRnd>, PD, EVEX_CD8<32, CD8VH>;
defm VCVTTPD2QQ : avx512_cvttpd2qq<0x7A, "vcvttpd2qq", fp_to_sint,
- X86VFpToSintRnd>, VEX_W,
+ X86cvttp2siRnd>, VEX_W,
PD, EVEX_CD8<64, CD8VF>;
-defm VCVTTPS2QQ : avx512_cvttps2qq<0x7A, "vcvttps2qq", fp_to_sint,
- X86VFpToSintRnd>, PD, EVEX_CD8<32, CD8VH>;
+defm VCVTTPS2QQ : avx512_cvttps2qq<0x7A, "vcvttps2qq", fp_to_sint, X86cvttp2si,
+ X86cvttp2siRnd>, PD, EVEX_CD8<32, CD8VH>;
defm VCVTTPD2UQQ : avx512_cvttpd2qq<0x78, "vcvttpd2uqq", fp_to_uint,
- X86VFpToUintRnd>, VEX_W,
+ X86cvttp2uiRnd>, VEX_W,
PD, EVEX_CD8<64, CD8VF>;
-defm VCVTTPS2UQQ : avx512_cvttps2qq<0x78, "vcvttps2uqq", fp_to_uint,
- X86VFpToUintRnd>, PD, EVEX_CD8<32, CD8VH>;
+defm VCVTTPS2UQQ : avx512_cvttps2qq<0x78, "vcvttps2uqq", fp_to_uint, X86cvttp2ui,
+ X86cvttp2uiRnd>, PD, EVEX_CD8<32, CD8VH>;
defm VCVTQQ2PD : avx512_cvtqq2pd<0xE6, "vcvtqq2pd", sint_to_fp,
X86VSintToFpRnd>, VEX_W, XS, EVEX_CD8<64, CD8VF>;
@@ -5736,45 +6593,151 @@ defm VCVTQQ2PD : avx512_cvtqq2pd<0xE6, "vcvtqq2pd", sint_to_fp,
defm VCVTUQQ2PD : avx512_cvtqq2pd<0x7A, "vcvtuqq2pd", uint_to_fp,
X86VUintToFpRnd>, VEX_W, XS, EVEX_CD8<64, CD8VF>;
-defm VCVTQQ2PS : avx512_cvtqq2ps<0x5B, "vcvtqq2ps", sint_to_fp,
+defm VCVTQQ2PS : avx512_cvtqq2ps<0x5B, "vcvtqq2ps", sint_to_fp, X86VSintToFP,
X86VSintToFpRnd>, VEX_W, PS, EVEX_CD8<64, CD8VF>;
-defm VCVTUQQ2PS : avx512_cvtqq2ps<0x7A, "vcvtuqq2ps", uint_to_fp,
+defm VCVTUQQ2PS : avx512_cvtqq2ps<0x7A, "vcvtuqq2ps", uint_to_fp, X86VUintToFP,
X86VUintToFpRnd>, VEX_W, XD, EVEX_CD8<64, CD8VF>;
let Predicates = [HasAVX512, NoVLX] in {
def : Pat<(v8i32 (fp_to_uint (v8f32 VR256X:$src1))),
(EXTRACT_SUBREG (v16i32 (VCVTTPS2UDQZrr
- (v16f32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)))), sub_ymm)>;
+ (v16f32 (INSERT_SUBREG (IMPLICIT_DEF),
+ VR256X:$src1, sub_ymm)))), sub_ymm)>;
def : Pat<(v4i32 (fp_to_uint (v4f32 VR128X:$src1))),
(EXTRACT_SUBREG (v16i32 (VCVTTPS2UDQZrr
- (v16f32 (SUBREG_TO_REG (i32 0), VR128X:$src1, sub_xmm)))), sub_xmm)>;
+ (v16f32 (INSERT_SUBREG (IMPLICIT_DEF),
+ VR128X:$src1, sub_xmm)))), sub_xmm)>;
def : Pat<(v4i32 (fp_to_uint (v4f64 VR256X:$src1))),
(EXTRACT_SUBREG (v8i32 (VCVTTPD2UDQZrr
- (v8f64 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)))), sub_xmm)>;
+ (v8f64 (INSERT_SUBREG (IMPLICIT_DEF),
+ VR256X:$src1, sub_ymm)))), sub_xmm)>;
+
+def : Pat<(v4i32 (X86cvttp2ui (v2f64 VR128X:$src))),
+ (EXTRACT_SUBREG (v8i32 (VCVTTPD2UDQZrr
+ (v8f64 (INSERT_SUBREG (IMPLICIT_DEF),
+ VR128X:$src, sub_xmm)))), sub_xmm)>;
def : Pat<(v8f32 (uint_to_fp (v8i32 VR256X:$src1))),
(EXTRACT_SUBREG (v16f32 (VCVTUDQ2PSZrr
- (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)))), sub_ymm)>;
+ (v16i32 (INSERT_SUBREG (IMPLICIT_DEF),
+ VR256X:$src1, sub_ymm)))), sub_ymm)>;
def : Pat<(v4f32 (uint_to_fp (v4i32 VR128X:$src1))),
(EXTRACT_SUBREG (v16f32 (VCVTUDQ2PSZrr
- (v16i32 (SUBREG_TO_REG (i32 0), VR128X:$src1, sub_xmm)))), sub_xmm)>;
+ (v16i32 (INSERT_SUBREG (IMPLICIT_DEF),
+ VR128X:$src1, sub_xmm)))), sub_xmm)>;
def : Pat<(v4f64 (uint_to_fp (v4i32 VR128X:$src1))),
(EXTRACT_SUBREG (v8f64 (VCVTUDQ2PDZrr
- (v8i32 (SUBREG_TO_REG (i32 0), VR128X:$src1, sub_xmm)))), sub_ymm)>;
+ (v8i32 (INSERT_SUBREG (IMPLICIT_DEF),
+ VR128X:$src1, sub_xmm)))), sub_ymm)>;
+
+def : Pat<(v2f64 (X86VUintToFP (v4i32 VR128X:$src1))),
+ (EXTRACT_SUBREG (v8f64 (VCVTUDQ2PDZrr
+ (v8i32 (INSERT_SUBREG (IMPLICIT_DEF),
+ VR128X:$src1, sub_xmm)))), sub_xmm)>;
+}
+
+let Predicates = [HasAVX512, HasVLX] in {
+ let AddedComplexity = 15 in {
+ def : Pat<(X86vzmovl (v2i64 (bitconvert
+ (v4i32 (X86cvtp2Int (v2f64 VR128X:$src)))))),
+ (VCVTPD2DQZ128rr VR128X:$src)>;
+ def : Pat<(v4i32 (bitconvert (X86vzmovl (v2i64 (bitconvert
+ (v4i32 (X86cvtp2UInt (v2f64 VR128X:$src)))))))),
+ (VCVTPD2UDQZ128rr VR128X:$src)>;
+ def : Pat<(X86vzmovl (v2i64 (bitconvert
+ (v4i32 (X86cvttp2si (v2f64 VR128X:$src)))))),
+ (VCVTTPD2DQZ128rr VR128X:$src)>;
+ def : Pat<(v4i32 (bitconvert (X86vzmovl (v2i64 (bitconvert
+ (v4i32 (X86cvttp2ui (v2f64 VR128X:$src)))))))),
+ (VCVTTPD2UDQZ128rr VR128X:$src)>;
+ }
}
let Predicates = [HasAVX512] in {
- def : Pat<(v8f32 (fround (loadv8f64 addr:$src))),
+ def : Pat<(v8f32 (fpround (loadv8f64 addr:$src))),
(VCVTPD2PSZrm addr:$src)>;
def : Pat<(v8f64 (extloadv8f32 addr:$src)),
(VCVTPS2PDZrm addr:$src)>;
}
+let Predicates = [HasDQI, HasVLX] in {
+ let AddedComplexity = 15 in {
+ def : Pat<(X86vzmovl (v2f64 (bitconvert
+ (v4f32 (X86VSintToFP (v2i64 VR128X:$src)))))),
+ (VCVTQQ2PSZ128rr VR128X:$src)>;
+ def : Pat<(X86vzmovl (v2f64 (bitconvert
+ (v4f32 (X86VUintToFP (v2i64 VR128X:$src)))))),
+ (VCVTUQQ2PSZ128rr VR128X:$src)>;
+ }
+}
+
+let Predicates = [HasDQI, NoVLX] in {
+def : Pat<(v2i64 (fp_to_sint (v2f64 VR128X:$src1))),
+ (EXTRACT_SUBREG (v8i64 (VCVTTPD2QQZrr
+ (v8f64 (INSERT_SUBREG (IMPLICIT_DEF),
+ VR128X:$src1, sub_xmm)))), sub_xmm)>;
+
+def : Pat<(v4i64 (fp_to_sint (v4f32 VR128X:$src1))),
+ (EXTRACT_SUBREG (v8i64 (VCVTTPS2QQZrr
+ (v8f32 (INSERT_SUBREG (IMPLICIT_DEF),
+ VR128X:$src1, sub_xmm)))), sub_ymm)>;
+
+def : Pat<(v4i64 (fp_to_sint (v4f64 VR256X:$src1))),
+ (EXTRACT_SUBREG (v8i64 (VCVTTPD2QQZrr
+ (v8f64 (INSERT_SUBREG (IMPLICIT_DEF),
+ VR256X:$src1, sub_ymm)))), sub_ymm)>;
+
+def : Pat<(v2i64 (fp_to_uint (v2f64 VR128X:$src1))),
+ (EXTRACT_SUBREG (v8i64 (VCVTTPD2UQQZrr
+ (v8f64 (INSERT_SUBREG (IMPLICIT_DEF),
+ VR128X:$src1, sub_xmm)))), sub_xmm)>;
+
+def : Pat<(v4i64 (fp_to_uint (v4f32 VR128X:$src1))),
+ (EXTRACT_SUBREG (v8i64 (VCVTTPS2UQQZrr
+ (v8f32 (INSERT_SUBREG (IMPLICIT_DEF),
+ VR128X:$src1, sub_xmm)))), sub_ymm)>;
+
+def : Pat<(v4i64 (fp_to_uint (v4f64 VR256X:$src1))),
+ (EXTRACT_SUBREG (v8i64 (VCVTTPD2UQQZrr
+ (v8f64 (INSERT_SUBREG (IMPLICIT_DEF),
+ VR256X:$src1, sub_ymm)))), sub_ymm)>;
+
+def : Pat<(v4f32 (sint_to_fp (v4i64 VR256X:$src1))),
+ (EXTRACT_SUBREG (v8f32 (VCVTQQ2PSZrr
+ (v8i64 (INSERT_SUBREG (IMPLICIT_DEF),
+ VR256X:$src1, sub_ymm)))), sub_xmm)>;
+
+def : Pat<(v2f64 (sint_to_fp (v2i64 VR128X:$src1))),
+ (EXTRACT_SUBREG (v8f64 (VCVTQQ2PDZrr
+ (v8i64 (INSERT_SUBREG (IMPLICIT_DEF),
+ VR128X:$src1, sub_xmm)))), sub_xmm)>;
+
+def : Pat<(v4f64 (sint_to_fp (v4i64 VR256X:$src1))),
+ (EXTRACT_SUBREG (v8f64 (VCVTQQ2PDZrr
+ (v8i64 (INSERT_SUBREG (IMPLICIT_DEF),
+ VR256X:$src1, sub_ymm)))), sub_ymm)>;
+
+def : Pat<(v4f32 (uint_to_fp (v4i64 VR256X:$src1))),
+ (EXTRACT_SUBREG (v8f32 (VCVTUQQ2PSZrr
+ (v8i64 (INSERT_SUBREG (IMPLICIT_DEF),
+ VR256X:$src1, sub_ymm)))), sub_xmm)>;
+
+def : Pat<(v2f64 (uint_to_fp (v2i64 VR128X:$src1))),
+ (EXTRACT_SUBREG (v8f64 (VCVTUQQ2PDZrr
+ (v8i64 (INSERT_SUBREG (IMPLICIT_DEF),
+ VR128X:$src1, sub_xmm)))), sub_xmm)>;
+
+def : Pat<(v4f64 (uint_to_fp (v4i64 VR256X:$src1))),
+ (EXTRACT_SUBREG (v8f64 (VCVTUQQ2PDZrr
+ (v8i64 (INSERT_SUBREG (IMPLICIT_DEF),
+ VR256X:$src1, sub_ymm)))), sub_ymm)>;
+}
+
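
A hedged C illustration, not part of the patch, of where the 128/256-bit v2i64/v4i64 conversions come from; with AVX512DQ but no VLX, the patterns above widen them to the 512-bit vcvtqq2pd/vcvtqq2ps forms and extract the low subregister.

/* Clang/GCC vector extensions; __builtin_convertvector produces the plain
 * sint_to_fp node that the NoVLX patterns above match. */
typedef long long v2i64 __attribute__((vector_size(16)));
typedef double    v2f64 __attribute__((vector_size(16)));

v2f64 i64_to_f64(v2i64 x) {
  return __builtin_convertvector(x, v2f64);
}
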
//===----------------------------------------------------------------------===//
// Half precision conversion instructions
//===----------------------------------------------------------------------===//
@@ -5816,14 +6779,13 @@ multiclass avx512_cvtps2ph<X86VectorVTInfo _dest, X86VectorVTInfo _src,
(ins _src.RC:$src1, i32u8imm:$src2),
"vcvtps2ph", "$src2, $src1", "$src1, $src2",
(X86cvtps2ph (_src.VT _src.RC:$src1),
- (i32 imm:$src2),
- (i32 FROUND_CURRENT)),
- NoItinerary, 0, X86select>, AVX512AIi8Base;
+ (i32 imm:$src2)),
+ NoItinerary, 0, 0, X86select>, AVX512AIi8Base;
def mr : AVX512AIi8<0x1D, MRMDestMem, (outs),
(ins x86memop:$dst, _src.RC:$src1, i32u8imm:$src2),
"vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(store (_dest.VT (X86cvtps2ph (_src.VT _src.RC:$src1),
- (i32 imm:$src2), (i32 FROUND_CURRENT) )),
+ (i32 imm:$src2))),
addr:$dst)]>;
let hasSideEffects = 0, mayStore = 1 in
def mrk : AVX512AIi8<0x1D, MRMDestMem, (outs),
@@ -5832,13 +6794,12 @@ multiclass avx512_cvtps2ph<X86VectorVTInfo _dest, X86VectorVTInfo _src,
[]>, EVEX_K;
}
multiclass avx512_cvtps2ph_sae<X86VectorVTInfo _dest, X86VectorVTInfo _src> {
- defm rb : AVX512_maskable<0x1D, MRMDestReg, _dest ,(outs _dest.RC:$dst),
+ let hasSideEffects = 0 in
+ defm rb : AVX512_maskable_in_asm<0x1D, MRMDestReg, _dest,
+ (outs _dest.RC:$dst),
(ins _src.RC:$src1, i32u8imm:$src2),
"vcvtps2ph", "$src2, {sae}, $src1", "$src1, {sae}, $src2",
- (X86cvtps2ph (_src.VT _src.RC:$src1),
- (i32 imm:$src2),
- (i32 FROUND_NO_EXC)),
- NoItinerary, 0, X86select>, EVEX_B, AVX512AIi8Base;
+ []>, EVEX_B, AVX512AIi8Base;
}
let Predicates = [HasAVX512] in {
defm VCVTPS2PHZ : avx512_cvtps2ph<v16i16x_info, v16f32_info, f256mem>,
@@ -5852,25 +6813,72 @@ let Predicates = [HasAVX512] in {
}
}
+// Patterns for matching conversions from float to half-float and vice versa.
+let Predicates = [HasVLX] in {
+ // Use MXCSR.RC for rounding instead of explicitly specifying the default
+ // rounding mode (Nearest-Even, encoded as 0). Both are equivalent in the
+ // configurations we support (the default). However, falling back to MXCSR is
+ // more consistent with other instructions, which are always controlled by it.
+ // It's encoded as 0b100.
+ def : Pat<(fp_to_f16 FR32X:$src),
+ (i16 (EXTRACT_SUBREG (VMOVPDI2DIZrr (VCVTPS2PHZ128rr
+ (COPY_TO_REGCLASS FR32X:$src, VR128X), 4)), sub_16bit))>;
+
+ def : Pat<(f16_to_fp GR16:$src),
+ (f32 (COPY_TO_REGCLASS (VCVTPH2PSZ128rr
+ (COPY_TO_REGCLASS (MOVSX32rr16 GR16:$src), VR128X)), FR32X)) >;
+
+ def : Pat<(f16_to_fp (i16 (fp_to_f16 FR32X:$src))),
+ (f32 (COPY_TO_REGCLASS (VCVTPH2PSZ128rr
+ (VCVTPS2PHZ128rr (COPY_TO_REGCLASS FR32X:$src, VR128X), 4)), FR32X)) >;
+}
+
+// Patterns for matching float to half-float conversion when AVX512 is supported
+// but F16C isn't. In that case we have to use 512-bit vectors.
+let Predicates = [HasAVX512, NoVLX, NoF16C] in {
+ def : Pat<(fp_to_f16 FR32X:$src),
+ (i16 (EXTRACT_SUBREG
+ (VMOVPDI2DIZrr
+ (v8i16 (EXTRACT_SUBREG
+ (VCVTPS2PHZrr
+ (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)),
+ (v4f32 (COPY_TO_REGCLASS FR32X:$src, VR128X)),
+ sub_xmm), 4), sub_xmm))), sub_16bit))>;
+
+ def : Pat<(f16_to_fp GR16:$src),
+ (f32 (COPY_TO_REGCLASS
+ (v4f32 (EXTRACT_SUBREG
+ (VCVTPH2PSZrr
+ (INSERT_SUBREG (v16i16 (IMPLICIT_DEF)),
+ (v8i16 (COPY_TO_REGCLASS (MOVSX32rr16 GR16:$src), VR128X)),
+ sub_xmm)), sub_xmm)), FR32X))>;
+
+ def : Pat<(f16_to_fp (i16 (fp_to_f16 FR32X:$src))),
+ (f32 (COPY_TO_REGCLASS
+ (v4f32 (EXTRACT_SUBREG
+ (VCVTPH2PSZrr
+ (VCVTPS2PHZrr (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)),
+ (v4f32 (COPY_TO_REGCLASS FR32X:$src, VR128X)),
+ sub_xmm), 4)), sub_xmm)), FR32X))>;
+}
+
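
Not part of the patch: a minimal C sketch of the conversions these half-float patterns cover, assuming clang's storage-only __fp16 type, whose assignments go through the fp_to_f16/f16_to_fp nodes matched above.

/* With VLX this selects the 128-bit vcvtps2ph/vcvtph2ps forms; with AVX-512
 * but neither VLX nor F16C, the second pattern group widens to the 512-bit
 * forms instead. Immediate 4 on vcvtps2ph means "round using MXCSR.RC". */
float half_roundtrip(float x) {
  __fp16 h = (__fp16)x;   /* fp_to_f16 */
  return (float)h;        /* f16_to_fp */
}
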
 // Unordered/Ordered scalar fp compare with SAE and set EFLAGS
-multiclass avx512_ord_cmp_sae<bits<8> opc, X86VectorVTInfo _, SDNode OpNode,
+multiclass avx512_ord_cmp_sae<bits<8> opc, X86VectorVTInfo _,
string OpcodeStr> {
def rb: AVX512<opc, MRMSrcReg, (outs), (ins _.RC:$src1, _.RC:$src2),
!strconcat(OpcodeStr, "\t{{sae}, $src2, $src1|$src1, $src2, {sae}}"),
- [(set EFLAGS, (OpNode (_.VT _.RC:$src1), _.RC:$src2,
- (i32 FROUND_NO_EXC)))],
- IIC_SSE_COMIS_RR>, EVEX, EVEX_B, VEX_LIG, EVEX_V128,
+ [], IIC_SSE_COMIS_RR>, EVEX, EVEX_B, VEX_LIG, EVEX_V128,
Sched<[WriteFAdd]>;
}
let Defs = [EFLAGS], Predicates = [HasAVX512] in {
- defm VUCOMISSZ : avx512_ord_cmp_sae<0x2E, v4f32x_info, X86ucomiSae, "vucomiss">,
+ defm VUCOMISSZ : avx512_ord_cmp_sae<0x2E, v4f32x_info, "vucomiss">,
AVX512PSIi8Base, EVEX_CD8<32, CD8VT1>;
- defm VUCOMISDZ : avx512_ord_cmp_sae<0x2E, v2f64x_info, X86ucomiSae, "vucomisd">,
+ defm VUCOMISDZ : avx512_ord_cmp_sae<0x2E, v2f64x_info, "vucomisd">,
AVX512PDIi8Base, VEX_W, EVEX_CD8<64, CD8VT1>;
- defm VCOMISSZ : avx512_ord_cmp_sae<0x2F, v4f32x_info, X86comiSae, "vcomiss">,
+ defm VCOMISSZ : avx512_ord_cmp_sae<0x2F, v4f32x_info, "vcomiss">,
AVX512PSIi8Base, EVEX_CD8<32, CD8VT1>;
- defm VCOMISDZ : avx512_ord_cmp_sae<0x2F, v2f64x_info, X86comiSae, "vcomisd">,
+ defm VCOMISDZ : avx512_ord_cmp_sae<0x2F, v2f64x_info, "vcomisd">,
AVX512PDIi8Base, VEX_W, EVEX_CD8<64, CD8VT1>;
}
@@ -5890,18 +6898,18 @@ let Defs = [EFLAGS], Predicates = [HasAVX512] in {
VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>;
}
let isCodeGenOnly = 1 in {
- defm Int_VUCOMISSZ : sse12_ord_cmp<0x2E, VR128X, X86ucomi, v4f32, f128mem,
- load, "ucomiss">, PS, EVEX, VEX_LIG,
+ defm Int_VUCOMISSZ : sse12_ord_cmp_int<0x2E, VR128X, X86ucomi, v4f32, ssmem,
+ sse_load_f32, "ucomiss">, PS, EVEX, VEX_LIG,
EVEX_CD8<32, CD8VT1>;
- defm Int_VUCOMISDZ : sse12_ord_cmp<0x2E, VR128X, X86ucomi, v2f64, f128mem,
- load, "ucomisd">, PD, EVEX,
+ defm Int_VUCOMISDZ : sse12_ord_cmp_int<0x2E, VR128X, X86ucomi, v2f64, sdmem,
+ sse_load_f64, "ucomisd">, PD, EVEX,
VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>;
- defm Int_VCOMISSZ : sse12_ord_cmp<0x2F, VR128X, X86comi, v4f32, f128mem,
- load, "comiss">, PS, EVEX, VEX_LIG,
+ defm Int_VCOMISSZ : sse12_ord_cmp_int<0x2F, VR128X, X86comi, v4f32, ssmem,
+ sse_load_f32, "comiss">, PS, EVEX, VEX_LIG,
EVEX_CD8<32, CD8VT1>;
- defm Int_VCOMISDZ : sse12_ord_cmp<0x2F, VR128X, X86comi, v2f64, f128mem,
- load, "comisd">, PD, EVEX,
+ defm Int_VCOMISDZ : sse12_ord_cmp_int<0x2F, VR128X, X86comi, v2f64, sdmem,
+ sse_load_f64, "comisd">, PD, EVEX,
VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>;
}
}
@@ -6275,7 +7283,7 @@ defm VRNDSCALESD : avx512_rndscale_scalar<0x0B, "vrndscalesd", f64x_info>, VEX_W
multiclass avx512_trunc_common<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86VectorVTInfo SrcInfo, X86VectorVTInfo DestInfo,
X86MemOperand x86memop> {
-
+ let ExeDomain = DestInfo.ExeDomain in
defm rr : AVX512_maskable<opc, MRMDestReg, DestInfo, (outs DestInfo.RC:$dst),
(ins SrcInfo.RC:$src1), OpcodeStr ,"$src1", "$src1",
(DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src1)))>,
@@ -6301,7 +7309,8 @@ multiclass avx512_trunc_common<bits<8> opc, string OpcodeStr, SDNode OpNode,
DestInfo.KRCWM:$mask ,
SrcInfo.RC:$src1)>;
- let mayStore = 1, mayLoad = 1, hasSideEffects = 0 in {
+ let mayStore = 1, mayLoad = 1, hasSideEffects = 0,
+ ExeDomain = DestInfo.ExeDomain in {
def mr : AVX512XS8I<opc, MRMDestMem, (outs),
(ins x86memop:$dst, SrcInfo.RC:$src),
OpcodeStr # "\t{$src, $dst|$dst, $src}",
@@ -6328,23 +7337,6 @@ multiclass avx512_trunc_mr_lowering<X86VectorVTInfo SrcInfo,
addr:$dst, SrcInfo.KRCWM:$mask, SrcInfo.RC:$src)>;
}
-multiclass avx512_trunc_sat_mr_lowering<X86VectorVTInfo SrcInfo,
- X86VectorVTInfo DestInfo, string sat > {
-
- def: Pat<(!cast<Intrinsic>("int_x86_avx512_mask_pmov"#sat#"_"#SrcInfo.Suffix#
- DestInfo.Suffix#"_mem_"#SrcInfo.Size)
- addr:$ptr, (SrcInfo.VT SrcInfo.RC:$src), SrcInfo.MRC:$mask),
- (!cast<Instruction>(NAME#SrcInfo.ZSuffix##mrk) addr:$ptr,
- (COPY_TO_REGCLASS SrcInfo.MRC:$mask, SrcInfo.KRCWM),
- (SrcInfo.VT SrcInfo.RC:$src))>;
-
- def: Pat<(!cast<Intrinsic>("int_x86_avx512_mask_pmov"#sat#"_"#SrcInfo.Suffix#
- DestInfo.Suffix#"_mem_"#SrcInfo.Size)
- addr:$ptr, (SrcInfo.VT SrcInfo.RC:$src), -1),
- (!cast<Instruction>(NAME#SrcInfo.ZSuffix##mr) addr:$ptr,
- (SrcInfo.VT SrcInfo.RC:$src))>;
-}
-
multiclass avx512_trunc<bits<8> opc, string OpcodeStr, SDNode OpNode,
AVX512VLVectorVTInfo VTSrcInfo, X86VectorVTInfo DestInfoZ128,
X86VectorVTInfo DestInfoZ256, X86VectorVTInfo DestInfoZ,
@@ -6370,140 +7362,111 @@ multiclass avx512_trunc<bits<8> opc, string OpcodeStr, SDNode OpNode,
truncFrag, mtruncFrag>, EVEX_V512;
}
-multiclass avx512_trunc_sat<bits<8> opc, string OpcodeStr, SDNode OpNode,
- AVX512VLVectorVTInfo VTSrcInfo, X86VectorVTInfo DestInfoZ128,
- X86VectorVTInfo DestInfoZ256, X86VectorVTInfo DestInfoZ,
- X86MemOperand x86memopZ128, X86MemOperand x86memopZ256,
- X86MemOperand x86memopZ, string sat, Predicate prd = HasAVX512>{
-
- let Predicates = [HasVLX, prd] in {
- defm Z128: avx512_trunc_common<opc, OpcodeStr, OpNode, VTSrcInfo.info128,
- DestInfoZ128, x86memopZ128>,
- avx512_trunc_sat_mr_lowering<VTSrcInfo.info128, DestInfoZ128,
- sat>, EVEX_V128;
-
- defm Z256: avx512_trunc_common<opc, OpcodeStr, OpNode, VTSrcInfo.info256,
- DestInfoZ256, x86memopZ256>,
- avx512_trunc_sat_mr_lowering<VTSrcInfo.info256, DestInfoZ256,
- sat>, EVEX_V256;
- }
- let Predicates = [prd] in
- defm Z: avx512_trunc_common<opc, OpcodeStr, OpNode, VTSrcInfo.info512,
- DestInfoZ, x86memopZ>,
- avx512_trunc_sat_mr_lowering<VTSrcInfo.info512, DestInfoZ,
- sat>, EVEX_V512;
-}
-
-multiclass avx512_trunc_qb<bits<8> opc, string OpcodeStr, SDNode OpNode> {
+multiclass avx512_trunc_qb<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ PatFrag StoreNode, PatFrag MaskedStoreNode> {
defm NAME: avx512_trunc<opc, OpcodeStr, OpNode, avx512vl_i64_info,
v16i8x_info, v16i8x_info, v16i8x_info, i16mem, i32mem, i64mem,
- truncstorevi8, masked_truncstorevi8>, EVEX_CD8<8, CD8VO>;
-}
-multiclass avx512_trunc_sat_qb<bits<8> opc, string sat, SDNode OpNode> {
- defm NAME: avx512_trunc_sat<opc, "vpmov"##sat##"qb", OpNode, avx512vl_i64_info,
- v16i8x_info, v16i8x_info, v16i8x_info, i16mem, i32mem, i64mem,
- sat>, EVEX_CD8<8, CD8VO>;
+ StoreNode, MaskedStoreNode>, EVEX_CD8<8, CD8VO>;
}
-multiclass avx512_trunc_qw<bits<8> opc, string OpcodeStr, SDNode OpNode> {
+multiclass avx512_trunc_qw<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ PatFrag StoreNode, PatFrag MaskedStoreNode> {
defm NAME: avx512_trunc<opc, OpcodeStr, OpNode, avx512vl_i64_info,
v8i16x_info, v8i16x_info, v8i16x_info, i32mem, i64mem, i128mem,
- truncstorevi16, masked_truncstorevi16>, EVEX_CD8<16, CD8VQ>;
-}
-multiclass avx512_trunc_sat_qw<bits<8> opc, string sat, SDNode OpNode> {
- defm NAME: avx512_trunc_sat<opc, "vpmov"##sat##"qw", OpNode, avx512vl_i64_info,
- v8i16x_info, v8i16x_info, v8i16x_info, i32mem, i64mem, i128mem,
- sat>, EVEX_CD8<16, CD8VQ>;
+ StoreNode, MaskedStoreNode>, EVEX_CD8<16, CD8VQ>;
}
-multiclass avx512_trunc_qd<bits<8> opc, string OpcodeStr, SDNode OpNode> {
+multiclass avx512_trunc_qd<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ PatFrag StoreNode, PatFrag MaskedStoreNode> {
defm NAME: avx512_trunc<opc, OpcodeStr, OpNode, avx512vl_i64_info,
v4i32x_info, v4i32x_info, v8i32x_info, i64mem, i128mem, i256mem,
- truncstorevi32, masked_truncstorevi32>, EVEX_CD8<32, CD8VH>;
-}
-multiclass avx512_trunc_sat_qd<bits<8> opc, string sat, SDNode OpNode> {
- defm NAME: avx512_trunc_sat<opc, "vpmov"##sat##"qd", OpNode, avx512vl_i64_info,
- v4i32x_info, v4i32x_info, v8i32x_info, i64mem, i128mem, i256mem,
- sat>, EVEX_CD8<32, CD8VH>;
+ StoreNode, MaskedStoreNode>, EVEX_CD8<32, CD8VH>;
}
-multiclass avx512_trunc_db<bits<8> opc, string OpcodeStr, SDNode OpNode> {
+multiclass avx512_trunc_db<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ PatFrag StoreNode, PatFrag MaskedStoreNode> {
defm NAME: avx512_trunc<opc, OpcodeStr, OpNode, avx512vl_i32_info,
v16i8x_info, v16i8x_info, v16i8x_info, i32mem, i64mem, i128mem,
- truncstorevi8, masked_truncstorevi8>, EVEX_CD8<8, CD8VQ>;
-}
-multiclass avx512_trunc_sat_db<bits<8> opc, string sat, SDNode OpNode> {
- defm NAME: avx512_trunc_sat<opc, "vpmov"##sat##"db", OpNode, avx512vl_i32_info,
- v16i8x_info, v16i8x_info, v16i8x_info, i32mem, i64mem, i128mem,
- sat>, EVEX_CD8<8, CD8VQ>;
+ StoreNode, MaskedStoreNode>, EVEX_CD8<8, CD8VQ>;
}
-multiclass avx512_trunc_dw<bits<8> opc, string OpcodeStr, SDNode OpNode> {
+multiclass avx512_trunc_dw<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ PatFrag StoreNode, PatFrag MaskedStoreNode> {
defm NAME: avx512_trunc<opc, OpcodeStr, OpNode, avx512vl_i32_info,
v8i16x_info, v8i16x_info, v16i16x_info, i64mem, i128mem, i256mem,
- truncstorevi16, masked_truncstorevi16>, EVEX_CD8<16, CD8VH>;
-}
-multiclass avx512_trunc_sat_dw<bits<8> opc, string sat, SDNode OpNode> {
- defm NAME: avx512_trunc_sat<opc, "vpmov"##sat##"dw", OpNode, avx512vl_i32_info,
- v8i16x_info, v8i16x_info, v16i16x_info, i64mem, i128mem, i256mem,
- sat>, EVEX_CD8<16, CD8VH>;
+ StoreNode, MaskedStoreNode>, EVEX_CD8<16, CD8VH>;
}
-multiclass avx512_trunc_wb<bits<8> opc, string OpcodeStr, SDNode OpNode> {
+multiclass avx512_trunc_wb<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ PatFrag StoreNode, PatFrag MaskedStoreNode> {
defm NAME: avx512_trunc<opc, OpcodeStr, OpNode, avx512vl_i16_info,
v16i8x_info, v16i8x_info, v32i8x_info, i64mem, i128mem, i256mem,
- truncstorevi8, masked_truncstorevi8,HasBWI>, EVEX_CD8<16, CD8VH>;
-}
-multiclass avx512_trunc_sat_wb<bits<8> opc, string sat, SDNode OpNode> {
- defm NAME: avx512_trunc_sat<opc, "vpmov"##sat##"wb", OpNode, avx512vl_i16_info,
- v16i8x_info, v16i8x_info, v32i8x_info, i64mem, i128mem, i256mem,
- sat, HasBWI>, EVEX_CD8<16, CD8VH>;
-}
-
-defm VPMOVQB : avx512_trunc_qb<0x32, "vpmovqb", X86vtrunc>;
-defm VPMOVSQB : avx512_trunc_sat_qb<0x22, "s", X86vtruncs>;
-defm VPMOVUSQB : avx512_trunc_sat_qb<0x12, "us", X86vtruncus>;
-
-defm VPMOVQW : avx512_trunc_qw<0x34, "vpmovqw", X86vtrunc>;
-defm VPMOVSQW : avx512_trunc_sat_qw<0x24, "s", X86vtruncs>;
-defm VPMOVUSQW : avx512_trunc_sat_qw<0x14, "us", X86vtruncus>;
-
-defm VPMOVQD : avx512_trunc_qd<0x35, "vpmovqd", X86vtrunc>;
-defm VPMOVSQD : avx512_trunc_sat_qd<0x25, "s", X86vtruncs>;
-defm VPMOVUSQD : avx512_trunc_sat_qd<0x15, "us", X86vtruncus>;
-
-defm VPMOVDB : avx512_trunc_db<0x31, "vpmovdb", X86vtrunc>;
-defm VPMOVSDB : avx512_trunc_sat_db<0x21, "s", X86vtruncs>;
-defm VPMOVUSDB : avx512_trunc_sat_db<0x11, "us", X86vtruncus>;
-
-defm VPMOVDW : avx512_trunc_dw<0x33, "vpmovdw", X86vtrunc>;
-defm VPMOVSDW : avx512_trunc_sat_dw<0x23, "s", X86vtruncs>;
-defm VPMOVUSDW : avx512_trunc_sat_dw<0x13, "us", X86vtruncus>;
-
-defm VPMOVWB : avx512_trunc_wb<0x30, "vpmovwb", X86vtrunc>;
-defm VPMOVSWB : avx512_trunc_sat_wb<0x20, "s", X86vtruncs>;
-defm VPMOVUSWB : avx512_trunc_sat_wb<0x10, "us", X86vtruncus>;
+ StoreNode, MaskedStoreNode, HasBWI>, EVEX_CD8<16, CD8VH>;
+}
+
+defm VPMOVQB : avx512_trunc_qb<0x32, "vpmovqb", X86vtrunc,
+ truncstorevi8, masked_truncstorevi8>;
+defm VPMOVSQB : avx512_trunc_qb<0x22, "vpmovsqb", X86vtruncs,
+ truncstore_s_vi8, masked_truncstore_s_vi8>;
+defm VPMOVUSQB : avx512_trunc_qb<0x12, "vpmovusqb", X86vtruncus,
+ truncstore_us_vi8, masked_truncstore_us_vi8>;
+
+defm VPMOVQW : avx512_trunc_qw<0x34, "vpmovqw", X86vtrunc,
+ truncstorevi16, masked_truncstorevi16>;
+defm VPMOVSQW : avx512_trunc_qw<0x24, "vpmovsqw", X86vtruncs,
+ truncstore_s_vi16, masked_truncstore_s_vi16>;
+defm VPMOVUSQW : avx512_trunc_qw<0x14, "vpmovusqw", X86vtruncus,
+ truncstore_us_vi16, masked_truncstore_us_vi16>;
+
+defm VPMOVQD : avx512_trunc_qd<0x35, "vpmovqd", X86vtrunc,
+ truncstorevi32, masked_truncstorevi32>;
+defm VPMOVSQD : avx512_trunc_qd<0x25, "vpmovsqd", X86vtruncs,
+ truncstore_s_vi32, masked_truncstore_s_vi32>;
+defm VPMOVUSQD : avx512_trunc_qd<0x15, "vpmovusqd", X86vtruncus,
+ truncstore_us_vi32, masked_truncstore_us_vi32>;
+
+defm VPMOVDB : avx512_trunc_db<0x31, "vpmovdb", X86vtrunc,
+ truncstorevi8, masked_truncstorevi8>;
+defm VPMOVSDB : avx512_trunc_db<0x21, "vpmovsdb", X86vtruncs,
+ truncstore_s_vi8, masked_truncstore_s_vi8>;
+defm VPMOVUSDB : avx512_trunc_db<0x11, "vpmovusdb", X86vtruncus,
+ truncstore_us_vi8, masked_truncstore_us_vi8>;
+
+defm VPMOVDW : avx512_trunc_dw<0x33, "vpmovdw", X86vtrunc,
+ truncstorevi16, masked_truncstorevi16>;
+defm VPMOVSDW : avx512_trunc_dw<0x23, "vpmovsdw", X86vtruncs,
+ truncstore_s_vi16, masked_truncstore_s_vi16>;
+defm VPMOVUSDW : avx512_trunc_dw<0x13, "vpmovusdw", X86vtruncus,
+ truncstore_us_vi16, masked_truncstore_us_vi16>;
+
+defm VPMOVWB : avx512_trunc_wb<0x30, "vpmovwb", X86vtrunc,
+ truncstorevi8, masked_truncstorevi8>;
+defm VPMOVSWB : avx512_trunc_wb<0x20, "vpmovswb", X86vtruncs,
+ truncstore_s_vi8, masked_truncstore_s_vi8>;
+defm VPMOVUSWB : avx512_trunc_wb<0x10, "vpmovuswb", X86vtruncus,
+ truncstore_us_vi8, masked_truncstore_us_vi8>;
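
Editorial example, not from the patch and hedged: one of the masked saturating truncate-stores now expressed through the truncstore_s/_us PatFrags above, assuming the clang intrinsic lowers to the corresponding masked truncating-store node rather than a per-intrinsic pattern.

#include <immintrin.h>

/* Signed-saturating i32 -> i8 truncation of the four elements of v, stored to
 * p under mask k (vpmovsdb with a memory destination). */
void store_sat_bytes(void *p, __mmask8 k, __m128i v) {
  _mm_mask_cvtsepi32_storeu_epi8(p, k, v);
}
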
let Predicates = [HasAVX512, NoVLX] in {
def: Pat<(v8i16 (X86vtrunc (v8i32 VR256X:$src))),
(v8i16 (EXTRACT_SUBREG
- (v16i16 (VPMOVDWZrr (v16i32 (SUBREG_TO_REG (i32 0),
+ (v16i16 (VPMOVDWZrr (v16i32 (INSERT_SUBREG (IMPLICIT_DEF),
VR256X:$src, sub_ymm)))), sub_xmm))>;
def: Pat<(v4i32 (X86vtrunc (v4i64 VR256X:$src))),
(v4i32 (EXTRACT_SUBREG
- (v8i32 (VPMOVQDZrr (v8i64 (SUBREG_TO_REG (i32 0),
+ (v8i32 (VPMOVQDZrr (v8i64 (INSERT_SUBREG (IMPLICIT_DEF),
VR256X:$src, sub_ymm)))), sub_xmm))>;
}
let Predicates = [HasBWI, NoVLX] in {
def: Pat<(v16i8 (X86vtrunc (v16i16 VR256X:$src))),
- (v16i8 (EXTRACT_SUBREG (VPMOVWBZrr (v32i16 (SUBREG_TO_REG (i32 0),
+ (v16i8 (EXTRACT_SUBREG (VPMOVWBZrr (v32i16 (INSERT_SUBREG (IMPLICIT_DEF),
VR256X:$src, sub_ymm))), sub_xmm))>;
}
multiclass avx512_extend_common<bits<8> opc, string OpcodeStr,
X86VectorVTInfo DestInfo, X86VectorVTInfo SrcInfo,
X86MemOperand x86memop, PatFrag LdFrag, SDPatternOperator OpNode>{
+ let ExeDomain = DestInfo.ExeDomain in {
defm rr : AVX512_maskable<opc, MRMSrcReg, DestInfo, (outs DestInfo.RC:$dst),
(ins SrcInfo.RC:$src), OpcodeStr ,"$src", "$src",
(DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src)))>,
@@ -6513,6 +7476,7 @@ multiclass avx512_extend_common<bits<8> opc, string OpcodeStr,
(ins x86memop:$src), OpcodeStr ,"$src", "$src",
(DestInfo.VT (LdFrag addr:$src))>,
EVEX;
+ }
}
multiclass avx512_extend_BW<bits<8> opc, string OpcodeStr,
@@ -6685,6 +7649,150 @@ let Predicates = [HasAVX512] in {
defm : avx512_ext_lowering<"DQZ", v8i64_info, v8i32x_info, extloadvi32>;
}
+multiclass AVX512_pmovx_patterns<string OpcPrefix, string ExtTy,
+ SDNode ExtOp, PatFrag ExtLoad16> {
+ // 128-bit patterns
+ let Predicates = [HasVLX, HasBWI] in {
+ def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
+ (!cast<I>(OpcPrefix#BWZ128rm) addr:$src)>;
+ def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
+ (!cast<I>(OpcPrefix#BWZ128rm) addr:$src)>;
+ def : Pat<(v8i16 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#BWZ128rm) addr:$src)>;
+ def : Pat<(v8i16 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#BWZ128rm) addr:$src)>;
+ def : Pat<(v8i16 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#BWZ128rm) addr:$src)>;
+ }
+ let Predicates = [HasVLX] in {
+ def : Pat<(v4i32 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
+ (!cast<I>(OpcPrefix#BDZ128rm) addr:$src)>;
+ def : Pat<(v4i32 (ExtOp (v16i8 (vzmovl_v4i32 addr:$src)))),
+ (!cast<I>(OpcPrefix#BDZ128rm) addr:$src)>;
+ def : Pat<(v4i32 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#BDZ128rm) addr:$src)>;
+ def : Pat<(v4i32 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#BDZ128rm) addr:$src)>;
+
+ def : Pat<(v2i64 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (ExtLoad16 addr:$src)))))),
+ (!cast<I>(OpcPrefix#BQZ128rm) addr:$src)>;
+ def : Pat<(v2i64 (ExtOp (v16i8 (vzmovl_v4i32 addr:$src)))),
+ (!cast<I>(OpcPrefix#BQZ128rm) addr:$src)>;
+ def : Pat<(v2i64 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#BQZ128rm) addr:$src)>;
+ def : Pat<(v2i64 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#BQZ128rm) addr:$src)>;
+
+ def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
+ (!cast<I>(OpcPrefix#WDZ128rm) addr:$src)>;
+ def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
+ (!cast<I>(OpcPrefix#WDZ128rm) addr:$src)>;
+ def : Pat<(v4i32 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#WDZ128rm) addr:$src)>;
+ def : Pat<(v4i32 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#WDZ128rm) addr:$src)>;
+ def : Pat<(v4i32 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#WDZ128rm) addr:$src)>;
+
+ def : Pat<(v2i64 (ExtOp (bc_v8i16 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
+ (!cast<I>(OpcPrefix#WQZ128rm) addr:$src)>;
+ def : Pat<(v2i64 (ExtOp (v8i16 (vzmovl_v4i32 addr:$src)))),
+ (!cast<I>(OpcPrefix#WQZ128rm) addr:$src)>;
+ def : Pat<(v2i64 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#WQZ128rm) addr:$src)>;
+ def : Pat<(v2i64 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#WQZ128rm) addr:$src)>;
+
+ def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
+ (!cast<I>(OpcPrefix#DQZ128rm) addr:$src)>;
+ def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
+ (!cast<I>(OpcPrefix#DQZ128rm) addr:$src)>;
+ def : Pat<(v2i64 (ExtOp (v4i32 (vzmovl_v2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#DQZ128rm) addr:$src)>;
+ def : Pat<(v2i64 (ExtOp (v4i32 (vzload_v2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#DQZ128rm) addr:$src)>;
+ def : Pat<(v2i64 (ExtOp (bc_v4i32 (loadv2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#DQZ128rm) addr:$src)>;
+ }
+ // 256-bit patterns
+ let Predicates = [HasVLX, HasBWI] in {
+ def : Pat<(v16i16 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#BWZ256rm) addr:$src)>;
+ def : Pat<(v16i16 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#BWZ256rm) addr:$src)>;
+ def : Pat<(v16i16 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#BWZ256rm) addr:$src)>;
+ }
+ let Predicates = [HasVLX] in {
+ def : Pat<(v8i32 (ExtOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
+ (!cast<I>(OpcPrefix#BDZ256rm) addr:$src)>;
+ def : Pat<(v8i32 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#BDZ256rm) addr:$src)>;
+ def : Pat<(v8i32 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#BDZ256rm) addr:$src)>;
+ def : Pat<(v8i32 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#BDZ256rm) addr:$src)>;
+
+ def : Pat<(v4i64 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
+ (!cast<I>(OpcPrefix#BQZ256rm) addr:$src)>;
+ def : Pat<(v4i64 (ExtOp (v16i8 (vzmovl_v4i32 addr:$src)))),
+ (!cast<I>(OpcPrefix#BQZ256rm) addr:$src)>;
+ def : Pat<(v4i64 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#BQZ256rm) addr:$src)>;
+ def : Pat<(v4i64 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#BQZ256rm) addr:$src)>;
+
+ def : Pat<(v8i32 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#WDZ256rm) addr:$src)>;
+ def : Pat<(v8i32 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#WDZ256rm) addr:$src)>;
+ def : Pat<(v8i32 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#WDZ256rm) addr:$src)>;
+
+ def : Pat<(v4i64 (ExtOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
+ (!cast<I>(OpcPrefix#WQZ256rm) addr:$src)>;
+ def : Pat<(v4i64 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#WQZ256rm) addr:$src)>;
+ def : Pat<(v4i64 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#WQZ256rm) addr:$src)>;
+ def : Pat<(v4i64 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#WQZ256rm) addr:$src)>;
+
+ def : Pat<(v4i64 (ExtOp (bc_v4i32 (loadv2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#DQZ256rm) addr:$src)>;
+ def : Pat<(v4i64 (ExtOp (v4i32 (vzmovl_v2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#DQZ256rm) addr:$src)>;
+ def : Pat<(v4i64 (ExtOp (v4i32 (vzload_v2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#DQZ256rm) addr:$src)>;
+ }
+ // 512-bit patterns
+ let Predicates = [HasBWI] in {
+ def : Pat<(v32i16 (ExtOp (bc_v32i8 (loadv4i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#BWZrm) addr:$src)>;
+ }
+ let Predicates = [HasAVX512] in {
+ def : Pat<(v16i32 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#BDZrm) addr:$src)>;
+
+ def : Pat<(v8i64 (ExtOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
+ (!cast<I>(OpcPrefix#BQZrm) addr:$src)>;
+ def : Pat<(v8i64 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#BQZrm) addr:$src)>;
+
+ def : Pat<(v16i32 (ExtOp (bc_v16i16 (loadv4i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#WDZrm) addr:$src)>;
+
+ def : Pat<(v8i64 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#WQZrm) addr:$src)>;
+
+ def : Pat<(v8i64 (ExtOp (bc_v8i32 (loadv4i64 addr:$src)))),
+ (!cast<I>(OpcPrefix#DQZrm) addr:$src)>;
+ }
+}
+
+defm : AVX512_pmovx_patterns<"VPMOVSX", "s", X86vsext, extloadi32i16>;
+defm : AVX512_pmovx_patterns<"VPMOVZX", "z", X86vzext, loadi16_anyext>;
+
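The extending-load patterns above feed the EVEX-encoded VPMOVSX/VPMOVZX memory forms. As a rough illustration (not part of the patch, and with an illustrative function name), the following C++ snippet is the kind of source that exercises the 128-bit zero-extend-from-memory path; with AVX-512VL enabled it is expected to select vpmovzxbd with a folded load, though exact codegen depends on the compiler.

  #include <immintrin.h>
  #include <cstring>

  // Load four bytes and zero-extend them to four 32-bit lanes.
  // Compile for an AVX-512VL-capable target (e.g. -mavx512vl) to hit the
  // BDZ128rm-style patterns above; otherwise the VEX/SSE forms are used.
  __m128i load_zext_bytes_to_dwords(const void *p) {
    int tmp;
    std::memcpy(&tmp, p, sizeof(tmp));     // unaligned 32-bit load
    __m128i v = _mm_cvtsi32_si128(tmp);    // the four bytes in lane 0
    return _mm_cvtepu8_epi32(v);           // vpmovzxbd
  }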
//===----------------------------------------------------------------------===//
// GATHER - SCATTER Operations
@@ -6859,8 +7967,14 @@ defm VSCATTERPF1QPD: avx512_gather_scatter_prefetch<0xC7, MRM6m, "vscatterpf1qpd
VK8WM, vz512mem>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
// Helper fragments to match sext vXi1 to vXiY.
-def v16i1sextv16i32 : PatLeaf<(v16i32 (X86vsrai VR512:$src, (i8 31)))>;
-def v8i1sextv8i64 : PatLeaf<(v8i64 (X86vsrai VR512:$src, (i8 63)))>;
+def v64i1sextv64i8 : PatLeaf<(v64i8
+ (X86vsext
+ (v64i1 (X86pcmpgtm
+ (bc_v64i8 (v16i32 immAllZerosV)),
+ VR512:$src))))>;
+def v32i1sextv32i16 : PatLeaf<(v32i16 (X86vsrai VR512:$src, (i8 15)))>;
+def v16i1sextv16i32 : PatLeaf<(v16i32 (X86vsrai VR512:$src, (i8 31)))>;
+def v8i1sextv8i64 : PatLeaf<(v8i64 (X86vsrai VR512:$src, (i8 63)))>;
multiclass cvt_by_vec_width<bits<8> opc, X86VectorVTInfo Vec, string OpcodeStr > {
def rr : AVX512XS8I<opc, MRMSrcReg, (outs Vec.RC:$dst), (ins Vec.KRC:$src),
@@ -6941,7 +8055,7 @@ defm VPMOVQ2M : avx512_convert_vector_to_mask<0x39, "vpmovq2m",
// AVX-512 - COMPRESS and EXPAND
//
-multiclass compress_by_vec_width<bits<8> opc, X86VectorVTInfo _,
+multiclass compress_by_vec_width_common<bits<8> opc, X86VectorVTInfo _,
string OpcodeStr> {
defm rr : AVX512_maskable<opc, MRMDestReg, _, (outs _.RC:$dst),
(ins _.RC:$src1), OpcodeStr, "$src1", "$src1",
@@ -6956,19 +8070,28 @@ multiclass compress_by_vec_width<bits<8> opc, X86VectorVTInfo _,
def mrk : AVX5128I<opc, MRMDestMem, (outs),
(ins _.MemOp:$dst, _.KRCWM:$mask, _.RC:$src),
OpcodeStr # "\t{$src, $dst {${mask}}|$dst {${mask}}, $src}",
- [(store (_.VT (vselect _.KRCWM:$mask,
- (_.VT (X86compress _.RC:$src)), _.ImmAllZerosV)),
- addr:$dst)]>,
+ []>,
EVEX_K, EVEX_CD8<_.EltSize, CD8VT1>;
}
+multiclass compress_by_vec_width_lowering<X86VectorVTInfo _ > {
+
+ def : Pat<(X86mCompressingStore addr:$dst, _.KRCWM:$mask,
+ (_.VT _.RC:$src)),
+ (!cast<Instruction>(NAME#_.ZSuffix##mrk)
+ addr:$dst, _.KRCWM:$mask, _.RC:$src)>;
+}
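The lowering multiclass above maps a masked X86mCompressingStore node onto the mrk (memory destination, write-masked) form. A minimal sketch of source code that produces such a node, assuming the AVX-512F compress-store intrinsic is available and using an illustrative function name:

  #include <immintrin.h>

  // Store only the lanes selected by the mask, packed contiguously at dst.
  // With AVX-512F this is expected to become vcompressps with a memory
  // destination and a {k} write-mask, i.e. the mrk form selected above.
  void compress_store(float *dst, __mmask16 mask, __m512 v) {
    _mm512_mask_compressstoreu_ps(dst, mask, v);
  }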
+
multiclass compress_by_elt_width<bits<8> opc, string OpcodeStr,
AVX512VLVectorVTInfo VTInfo> {
- defm Z : compress_by_vec_width<opc, VTInfo.info512, OpcodeStr>, EVEX_V512;
+ defm Z : compress_by_vec_width_common<opc, VTInfo.info512, OpcodeStr>,
+ compress_by_vec_width_lowering<VTInfo.info512>, EVEX_V512;
let Predicates = [HasVLX] in {
- defm Z256 : compress_by_vec_width<opc, VTInfo.info256, OpcodeStr>, EVEX_V256;
- defm Z128 : compress_by_vec_width<opc, VTInfo.info128, OpcodeStr>, EVEX_V128;
+ defm Z256 : compress_by_vec_width_common<opc, VTInfo.info256, OpcodeStr>,
+ compress_by_vec_width_lowering<VTInfo.info256>, EVEX_V256;
+ defm Z128 : compress_by_vec_width_common<opc, VTInfo.info128, OpcodeStr>,
+ compress_by_vec_width_lowering<VTInfo.info128>, EVEX_V128;
}
}
@@ -6995,13 +8118,28 @@ multiclass expand_by_vec_width<bits<8> opc, X86VectorVTInfo _,
AVX5128IBase, EVEX_CD8<_.EltSize, CD8VT1>;
}
+multiclass expand_by_vec_width_lowering<X86VectorVTInfo _ > {
+
+ def : Pat<(_.VT (X86mExpandingLoad addr:$src, _.KRCWM:$mask, undef)),
+ (!cast<Instruction>(NAME#_.ZSuffix##rmkz)
+ _.KRCWM:$mask, addr:$src)>;
+
+ def : Pat<(_.VT (X86mExpandingLoad addr:$src, _.KRCWM:$mask,
+ (_.VT _.RC:$src0))),
+ (!cast<Instruction>(NAME#_.ZSuffix##rmk)
+ _.RC:$src0, _.KRCWM:$mask, addr:$src)>;
+}
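Analogously, the expanding-load lowering maps X86mExpandingLoad onto the rmkz (zeroing) and rmk (merging) forms. A minimal sketch using the AVX-512F expand-load intrinsics; the function names are illustrative:

  #include <immintrin.h>

  // Read contiguous floats from src and expand them into the lanes selected
  // by the mask; unselected lanes are zeroed (rmkz) or taken from passthru (rmk).
  __m512 expand_load_zero(__mmask16 mask, const float *src) {
    return _mm512_maskz_expandloadu_ps(mask, src);          // vexpandps ... {k}{z}
  }
  __m512 expand_load_merge(__m512 passthru, __mmask16 mask, const float *src) {
    return _mm512_mask_expandloadu_ps(passthru, mask, src); // vexpandps ... {k}
  }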
+
multiclass expand_by_elt_width<bits<8> opc, string OpcodeStr,
AVX512VLVectorVTInfo VTInfo> {
- defm Z : expand_by_vec_width<opc, VTInfo.info512, OpcodeStr>, EVEX_V512;
+ defm Z : expand_by_vec_width<opc, VTInfo.info512, OpcodeStr>,
+ expand_by_vec_width_lowering<VTInfo.info512>, EVEX_V512;
let Predicates = [HasVLX] in {
- defm Z256 : expand_by_vec_width<opc, VTInfo.info256, OpcodeStr>, EVEX_V256;
- defm Z128 : expand_by_vec_width<opc, VTInfo.info128, OpcodeStr>, EVEX_V128;
+ defm Z256 : expand_by_vec_width<opc, VTInfo.info256, OpcodeStr>,
+ expand_by_vec_width_lowering<VTInfo.info256>, EVEX_V256;
+ defm Z128 : expand_by_vec_width<opc, VTInfo.info128, OpcodeStr>,
+ expand_by_vec_width_lowering<VTInfo.info128>, EVEX_V128;
}
}
@@ -7019,7 +8157,8 @@ defm VEXPANDPD : expand_by_elt_width <0x88, "vexpandpd", avx512vl_f64_info>,
// op(broadcast(eltVt),imm)
//all instructions created with FROUND_CURRENT
multiclass avx512_unary_fp_packed_imm<bits<8> opc, string OpcodeStr, SDNode OpNode,
- X86VectorVTInfo _>{
+ X86VectorVTInfo _>{
+ let ExeDomain = _.ExeDomain in {
defm rri : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, i32u8imm:$src2),
OpcodeStr##_.Suffix, "$src2, $src1", "$src1, $src2",
@@ -7039,11 +8178,13 @@ multiclass avx512_unary_fp_packed_imm<bits<8> opc, string OpcodeStr, SDNode OpNo
(OpNode (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src1))),
(i32 imm:$src2),
(i32 FROUND_CURRENT))>, EVEX_B;
+ }
}
//handle instruction reg_vec1 = op(reg_vec2,reg_vec3,imm),{sae}
multiclass avx512_unary_fp_sae_packed_imm<bits<8> opc, string OpcodeStr,
SDNode OpNode, X86VectorVTInfo _>{
+ let ExeDomain = _.ExeDomain in
defm rrib : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, i32u8imm:$src2),
OpcodeStr##_.Suffix, "$src2, {sae}, $src1",
@@ -7073,7 +8214,8 @@ multiclass avx512_common_unary_fp_sae_packed_imm<string OpcodeStr,
// op(reg_vec2,broadcast(eltVt),imm)
//all instructions created with FROUND_CURRENT
multiclass avx512_fp_packed_imm<bits<8> opc, string OpcodeStr, SDNode OpNode,
- X86VectorVTInfo _>{
+ X86VectorVTInfo _>{
+ let ExeDomain = _.ExeDomain in {
defm rri : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3),
OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
@@ -7096,13 +8238,14 @@ multiclass avx512_fp_packed_imm<bits<8> opc, string OpcodeStr, SDNode OpNode,
(_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src2))),
(i32 imm:$src3),
(i32 FROUND_CURRENT))>, EVEX_B;
+ }
}
//handle instruction reg_vec1 = op(reg_vec2,reg_vec3,imm)
// op(reg_vec2,mem_vec,imm)
multiclass avx512_3Op_rm_imm8<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86VectorVTInfo DestInfo, X86VectorVTInfo SrcInfo>{
-
+ let ExeDomain = DestInfo.ExeDomain in {
defm rri : AVX512_maskable<opc, MRMSrcReg, DestInfo, (outs DestInfo.RC:$dst),
(ins SrcInfo.RC:$src1, SrcInfo.RC:$src2, u8imm:$src3),
OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
@@ -7116,6 +8259,7 @@ multiclass avx512_3Op_rm_imm8<bits<8> opc, string OpcodeStr, SDNode OpNode,
(SrcInfo.VT (bitconvert
(SrcInfo.LdFrag addr:$src2))),
(i8 imm:$src3)))>;
+ }
}
//handle instruction reg_vec1 = op(reg_vec2,reg_vec3,imm)
@@ -7125,6 +8269,7 @@ multiclass avx512_3Op_imm8<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86VectorVTInfo _>:
avx512_3Op_rm_imm8<opc, OpcodeStr, OpNode, _, _>{
+ let ExeDomain = _.ExeDomain in
defm rmbi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.ScalarMemOp:$src2, u8imm:$src3),
OpcodeStr, "$src3, ${src2}"##_.BroadcastStr##", $src1",
@@ -7138,8 +8283,8 @@ multiclass avx512_3Op_imm8<bits<8> opc, string OpcodeStr, SDNode OpNode,
// op(reg_vec2,mem_scalar,imm)
//all instructions created with FROUND_CURRENT
multiclass avx512_fp_scalar_imm<bits<8> opc, string OpcodeStr, SDNode OpNode,
- X86VectorVTInfo _> {
-
+ X86VectorVTInfo _> {
+ let ExeDomain = _.ExeDomain in {
defm rri : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3),
OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
@@ -7148,25 +8293,20 @@ multiclass avx512_fp_scalar_imm<bits<8> opc, string OpcodeStr, SDNode OpNode,
(i32 imm:$src3),
(i32 FROUND_CURRENT))>;
defm rmi : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
- (ins _.RC:$src1, _.MemOp:$src2, i32u8imm:$src3),
+ (ins _.RC:$src1, _.ScalarMemOp:$src2, i32u8imm:$src3),
OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
(OpNode (_.VT _.RC:$src1),
(_.VT (scalar_to_vector
(_.ScalarLdFrag addr:$src2))),
(i32 imm:$src3),
(i32 FROUND_CURRENT))>;
-
- let isAsmParserOnly = 1, mayLoad = 1, hasSideEffects = 0 in {
- defm rmi_alt :AVX512_maskable_in_asm<opc, MRMSrcMem, _, (outs _.FRC:$dst),
- (ins _.FRC:$src1, _.ScalarMemOp:$src2, u8imm:$src3),
- OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
- []>;
}
}
//handle instruction reg_vec1 = op(reg_vec2,reg_vec3,imm),{sae}
multiclass avx512_fp_sae_packed_imm<bits<8> opc, string OpcodeStr,
SDNode OpNode, X86VectorVTInfo _>{
+ let ExeDomain = _.ExeDomain in
defm rrib : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3),
OpcodeStr, "$src3, {sae}, $src2, $src1",
@@ -7439,14 +8579,64 @@ multiclass avx512_unary_rm_vl_all<bits<8> opc_b, bits<8> opc_w,
defm VPABS : avx512_unary_rm_vl_all<0x1C, 0x1D, 0x1E, 0x1F, "vpabs", X86Abs>;
+def avx512_v16i1sextv16i8 : PatLeaf<(v16i8 (X86pcmpgt (bc_v16i8 (v4i32 immAllZerosV)),
+ VR128X:$src))>;
+def avx512_v8i1sextv8i16 : PatLeaf<(v8i16 (X86vsrai VR128X:$src, (i8 15)))>;
+def avx512_v4i1sextv4i32 : PatLeaf<(v4i32 (X86vsrai VR128X:$src, (i8 31)))>;
+def avx512_v32i1sextv32i8 : PatLeaf<(v32i8 (X86pcmpgt (bc_v32i8 (v8i32 immAllZerosV)),
+ VR256X:$src))>;
+def avx512_v16i1sextv16i16: PatLeaf<(v16i16 (X86vsrai VR256X:$src, (i8 15)))>;
+def avx512_v8i1sextv8i32 : PatLeaf<(v8i32 (X86vsrai VR256X:$src, (i8 31)))>;
+
+let Predicates = [HasBWI, HasVLX] in {
+ def : Pat<(xor
+ (bc_v2i64 (avx512_v16i1sextv16i8)),
+ (bc_v2i64 (add (v16i8 VR128X:$src), (avx512_v16i1sextv16i8)))),
+ (VPABSBZ128rr VR128X:$src)>;
+ def : Pat<(xor
+ (bc_v2i64 (avx512_v8i1sextv8i16)),
+ (bc_v2i64 (add (v8i16 VR128X:$src), (avx512_v8i1sextv8i16)))),
+ (VPABSWZ128rr VR128X:$src)>;
+ def : Pat<(xor
+ (bc_v4i64 (avx512_v32i1sextv32i8)),
+ (bc_v4i64 (add (v32i8 VR256X:$src), (avx512_v32i1sextv32i8)))),
+ (VPABSBZ256rr VR256X:$src)>;
+ def : Pat<(xor
+ (bc_v4i64 (avx512_v16i1sextv16i16)),
+ (bc_v4i64 (add (v16i16 VR256X:$src), (avx512_v16i1sextv16i16)))),
+ (VPABSWZ256rr VR256X:$src)>;
+}
+let Predicates = [HasAVX512, HasVLX] in {
+ def : Pat<(xor
+ (bc_v2i64 (avx512_v4i1sextv4i32)),
+ (bc_v2i64 (add (v4i32 VR128X:$src), (avx512_v4i1sextv4i32)))),
+ (VPABSDZ128rr VR128X:$src)>;
+ def : Pat<(xor
+ (bc_v4i64 (avx512_v8i1sextv8i32)),
+ (bc_v4i64 (add (v8i32 VR256X:$src), (avx512_v8i1sextv8i32)))),
+ (VPABSDZ256rr VR256X:$src)>;
+}
+
+let Predicates = [HasAVX512] in {
def : Pat<(xor
- (bc_v16i32 (v16i1sextv16i32)),
- (bc_v16i32 (add (v16i32 VR512:$src), (v16i1sextv16i32)))),
+ (bc_v8i64 (v16i1sextv16i32)),
+ (bc_v8i64 (add (v16i32 VR512:$src), (v16i1sextv16i32)))),
(VPABSDZrr VR512:$src)>;
def : Pat<(xor
(bc_v8i64 (v8i1sextv8i64)),
(bc_v8i64 (add (v8i64 VR512:$src), (v8i1sextv8i64)))),
(VPABSQZrr VR512:$src)>;
+}
+let Predicates = [HasBWI] in {
+def : Pat<(xor
+ (bc_v8i64 (v64i1sextv64i8)),
+ (bc_v8i64 (add (v64i8 VR512:$src), (v64i1sextv64i8)))),
+ (VPABSBZrr VR512:$src)>;
+def : Pat<(xor
+ (bc_v8i64 (v32i1sextv32i16)),
+ (bc_v8i64 (add (v32i16 VR512:$src), (v32i1sextv32i16)))),
+ (VPABSWZrr VR512:$src)>;
+}
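The patterns above recognize the branch-free sign-mask expansion of integer absolute value ((x + m) ^ m with m the arithmetic-shift sign mask, or the pcmpgt form for byte elements) and fold it back into a single vpabs instruction. For reference, a scalar sketch of the idiom (illustrative only, not part of the patch):

  #include <cstdint>

  // Branch-free absolute value.  When the vectorizer widens this to
  // 128/256/512-bit integer lanes, the resulting add/xor-with-sign-mask DAG
  // matches the patterns above and collapses into vpabsb/vpabsw/vpabsd.
  int32_t abs_idiom(int32_t x) {
    int32_t m = x >> 31;     // all-ones if x is negative, zero otherwise
    return (x + m) ^ m;
  }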
multiclass avx512_ctlz<bits<8> opc, string OpcodeStr, Predicate prd>{
@@ -7503,16 +8693,44 @@ multiclass avx512_movddup<bits<8> opc, string OpcodeStr, SDNode OpNode>{
defm VMOVDDUP : avx512_movddup<0x12, "vmovddup", X86Movddup>;
+let Predicates = [HasVLX] in {
def : Pat<(X86Movddup (loadv2f64 addr:$src)),
- (VMOVDDUPZ128rm addr:$src)>, Requires<[HasAVX512, HasVLX]>;
+ (VMOVDDUPZ128rm addr:$src)>;
def : Pat<(v2f64 (X86VBroadcast (loadf64 addr:$src))),
- (VMOVDDUPZ128rm addr:$src)>, Requires<[HasAVX512, HasVLX]>;
+ (VMOVDDUPZ128rm addr:$src)>;
+def : Pat<(v2f64 (X86VBroadcast f64:$src)),
+ (VMOVDDUPZ128rr (COPY_TO_REGCLASS FR64X:$src, VR128X))>;
+
+def : Pat<(vselect (v2i1 VK2WM:$mask), (X86Movddup (loadv2f64 addr:$src)),
+ (v2f64 VR128X:$src0)),
+ (VMOVDDUPZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>;
+def : Pat<(vselect (v2i1 VK2WM:$mask), (X86Movddup (loadv2f64 addr:$src)),
+ (bitconvert (v4i32 immAllZerosV))),
+ (VMOVDDUPZ128rmkz VK2WM:$mask, addr:$src)>;
+
+def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast f64:$src)),
+ (v2f64 VR128X:$src0)),
+ (VMOVDDUPZ128rrk VR128X:$src0, VK2WM:$mask,
+ (COPY_TO_REGCLASS FR64X:$src, VR128X))>;
+def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast f64:$src)),
+ (bitconvert (v4i32 immAllZerosV))),
+ (VMOVDDUPZ128rrkz VK2WM:$mask, (COPY_TO_REGCLASS FR64X:$src, VR128X))>;
+
+def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast (loadf64 addr:$src))),
+ (v2f64 VR128X:$src0)),
+ (VMOVDDUPZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>;
+def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast (loadf64 addr:$src))),
+ (bitconvert (v4i32 immAllZerosV))),
+ (VMOVDDUPZ128rmkz VK2WM:$mask, addr:$src)>;
+}
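These patterns let a scalar f64 broadcast, with or without a write-mask, select vmovddup directly. A hedged intrinsic-level sketch with illustrative function names; the asm comments show the expected, not guaranteed, selection:

  #include <immintrin.h>

  // Duplicate a double across both lanes of an xmm register.
  __m128d dup_from_mem(const double *p) {
    return _mm_loaddup_pd(p);               // expected: vmovddup (%rdi), %xmm0
  }
  // Masked duplicate, available with AVX-512F + VL.
  __m128d dup_masked(__m128d src, __mmask8 k, __m128d a) {
    return _mm_mask_movedup_pd(src, k, a);  // expected: vmovddup %xmm1, %xmm0 {%k1}
  }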
//===----------------------------------------------------------------------===//
// AVX-512 - Unpack Instructions
//===----------------------------------------------------------------------===//
-defm VUNPCKH : avx512_fp_binop_p<0x15, "vunpckh", X86Unpckh, HasAVX512>;
-defm VUNPCKL : avx512_fp_binop_p<0x14, "vunpckl", X86Unpckl, HasAVX512>;
+defm VUNPCKH : avx512_fp_binop_p<0x15, "vunpckh", X86Unpckh, HasAVX512,
+ SSE_ALU_ITINS_S>;
+defm VUNPCKL : avx512_fp_binop_p<0x14, "vunpckl", X86Unpckl, HasAVX512,
+ SSE_ALU_ITINS_S>;
defm VPUNPCKLBW : avx512_binop_rm_vl_b<0x60, "vpunpcklbw", X86Unpckl,
SSE_INTALU_ITINS_P, HasBWI>;
@@ -7730,22 +8948,22 @@ defm VPSADBW : avx512_psadbw_packed_all<0xf6, X86psadbw, "vpsadbw",
HasBWI>, EVEX_4V;
multiclass avx512_ternlog<bits<8> opc, string OpcodeStr, SDNode OpNode,
- X86VectorVTInfo _>{
- let Constraints = "$src1 = $dst" in {
+ X86VectorVTInfo _>{
+ let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in {
defm rri : AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3, u8imm:$src4),
OpcodeStr, "$src4, $src3, $src2", "$src2, $src3, $src4",
(OpNode (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
(_.VT _.RC:$src3),
- (i8 imm:$src4))>, AVX512AIi8Base, EVEX_4V;
+ (i8 imm:$src4)), 1, 1>, AVX512AIi8Base, EVEX_4V;
defm rmi : AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.MemOp:$src3, u8imm:$src4),
OpcodeStr, "$src4, $src3, $src2", "$src2, $src3, $src4",
(OpNode (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
(_.VT (bitconvert (_.LdFrag addr:$src3))),
- (i8 imm:$src4))>,
+ (i8 imm:$src4)), 1, 0>,
AVX512AIi8Base, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>;
defm rmbi : AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.ScalarMemOp:$src3, u8imm:$src4),
@@ -7754,7 +8972,7 @@ multiclass avx512_ternlog<bits<8> opc, string OpcodeStr, SDNode OpNode,
(OpNode (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
(_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src3))),
- (i8 imm:$src4))>, EVEX_B,
+ (i8 imm:$src4)), 1, 0>, EVEX_B,
AVX512AIi8Base, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>;
}// Constraints = "$src1 = $dst"
}
@@ -7776,8 +8994,8 @@ defm VPTERNLOGQ : avx512_common_ternlog<"vpternlogq", avx512vl_i64_info>, VEX_W;
//===----------------------------------------------------------------------===//
multiclass avx512_fixupimm_packed<bits<8> opc, string OpcodeStr, SDNode OpNode,
- X86VectorVTInfo _>{
- let Constraints = "$src1 = $dst" in {
+ X86VectorVTInfo _>{
+ let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in {
defm rri : AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3, i32u8imm:$src4),
OpcodeStr##_.Suffix, "$src4, $src3, $src2", "$src2, $src3, $src4",
@@ -7807,8 +9025,8 @@ multiclass avx512_fixupimm_packed<bits<8> opc, string OpcodeStr, SDNode OpNode,
}
multiclass avx512_fixupimm_packed_sae<bits<8> opc, string OpcodeStr,
- SDNode OpNode, X86VectorVTInfo _>{
-let Constraints = "$src1 = $dst" in {
+ SDNode OpNode, X86VectorVTInfo _>{
+let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in {
defm rrib : AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3, i32u8imm:$src4),
OpcodeStr##_.Suffix, "$src4, {sae}, $src3, $src2",
@@ -7823,7 +9041,8 @@ let Constraints = "$src1 = $dst" in {
multiclass avx512_fixupimm_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86VectorVTInfo _, X86VectorVTInfo _src3VT> {
- let Constraints = "$src1 = $dst" , Predicates = [HasAVX512] in {
+ let Constraints = "$src1 = $dst" , Predicates = [HasAVX512],
+ ExeDomain = _.ExeDomain in {
defm rri : AVX512_maskable_3src_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3, i32u8imm:$src4),
OpcodeStr##_.Suffix, "$src4, $src3, $src2", "$src2, $src3, $src4",
@@ -7877,3 +9096,135 @@ defm VFIXUPIMMPS : avx512_fixupimm_packed_all<avx512vl_f32_info>,
EVEX_CD8<32, CD8VF>;
defm VFIXUPIMMPD : avx512_fixupimm_packed_all<avx512vl_f64_info>,
EVEX_CD8<64, CD8VF>, VEX_W;
+
+
+
+// Patterns used to select SSE scalar fp arithmetic instructions from
+// either:
+//
+// (1) a scalar fp operation followed by a blend
+//
+// The effect is that the backend no longer emits unnecessary vector
+// insert instructions immediately after SSE scalar fp instructions
+// like addss or mulss.
+//
+// For example, given the following code:
+// __m128 foo(__m128 A, __m128 B) {
+// A[0] += B[0];
+// return A;
+// }
+//
+// Previously we generated:
+// addss %xmm0, %xmm1
+// movss %xmm1, %xmm0
+//
+// We now generate:
+// addss %xmm1, %xmm0
+//
+// (2) a vector packed single/double fp operation followed by a vector insert
+//
+// The effect is that the backend converts the packed fp instruction
+// followed by a vector insert into a single SSE scalar fp instruction.
+//
+// For example, given the following code:
+// __m128 foo(__m128 A, __m128 B) {
+// __m128 C = A + B;
+//     return (__m128) {C[0], A[1], A[2], A[3]};
+// }
+//
+// Previously we generated:
+// addps %xmm0, %xmm1
+// movss %xmm1, %xmm0
+//
+// We now generate:
+// addss %xmm1, %xmm0
+
+// TODO: Some canonicalization in lowering would simplify the number of
+// patterns we have to try to match.
+multiclass AVX512_scalar_math_f32_patterns<SDNode Op, string OpcPrefix> {
+ let Predicates = [HasAVX512] in {
+ // extracted scalar math op with insert via movss
+ def : Pat<(v4f32 (X86Movss (v4f32 VR128X:$dst), (v4f32 (scalar_to_vector
+ (Op (f32 (extractelt (v4f32 VR128X:$dst), (iPTR 0))),
+ FR32X:$src))))),
+ (!cast<I>("V"#OpcPrefix#SSZrr_Int) v4f32:$dst,
+ (COPY_TO_REGCLASS FR32X:$src, VR128X))>;
+
+ // extracted scalar math op with insert via blend
+ def : Pat<(v4f32 (X86Blendi (v4f32 VR128X:$dst), (v4f32 (scalar_to_vector
+ (Op (f32 (extractelt (v4f32 VR128X:$dst), (iPTR 0))),
+ FR32X:$src))), (i8 1))),
+ (!cast<I>("V"#OpcPrefix#SSZrr_Int) v4f32:$dst,
+ (COPY_TO_REGCLASS FR32X:$src, VR128X))>;
+
+ // vector math op with insert via movss
+ def : Pat<(v4f32 (X86Movss (v4f32 VR128X:$dst),
+ (Op (v4f32 VR128X:$dst), (v4f32 VR128X:$src)))),
+ (!cast<I>("V"#OpcPrefix#SSZrr_Int) v4f32:$dst, v4f32:$src)>;
+
+ // vector math op with insert via blend
+ def : Pat<(v4f32 (X86Blendi (v4f32 VR128X:$dst),
+ (Op (v4f32 VR128X:$dst), (v4f32 VR128X:$src)), (i8 1))),
+ (!cast<I>("V"#OpcPrefix#SSZrr_Int) v4f32:$dst, v4f32:$src)>;
+
+ // extracted masked scalar math op with insert via movss
+ def : Pat<(X86Movss (v4f32 VR128X:$src1),
+ (scalar_to_vector
+ (X86selects VK1WM:$mask,
+ (Op (f32 (extractelt (v4f32 VR128X:$src1), (iPTR 0))),
+ FR32X:$src2),
+ FR32X:$src0))),
+ (!cast<I>("V"#OpcPrefix#SSZrr_Intk) (COPY_TO_REGCLASS FR32X:$src0, VR128X),
+ VK1WM:$mask, v4f32:$src1,
+ (COPY_TO_REGCLASS FR32X:$src2, VR128X))>;
+ }
+}
+
+defm : AVX512_scalar_math_f32_patterns<fadd, "ADD">;
+defm : AVX512_scalar_math_f32_patterns<fsub, "SUB">;
+defm : AVX512_scalar_math_f32_patterns<fmul, "MUL">;
+defm : AVX512_scalar_math_f32_patterns<fdiv, "DIV">;
+
+multiclass AVX512_scalar_math_f64_patterns<SDNode Op, string OpcPrefix> {
+ let Predicates = [HasAVX512] in {
+ // extracted scalar math op with insert via movsd
+ def : Pat<(v2f64 (X86Movsd (v2f64 VR128X:$dst), (v2f64 (scalar_to_vector
+ (Op (f64 (extractelt (v2f64 VR128X:$dst), (iPTR 0))),
+ FR64X:$src))))),
+ (!cast<I>("V"#OpcPrefix#SDZrr_Int) v2f64:$dst,
+ (COPY_TO_REGCLASS FR64X:$src, VR128X))>;
+
+ // extracted scalar math op with insert via blend
+ def : Pat<(v2f64 (X86Blendi (v2f64 VR128X:$dst), (v2f64 (scalar_to_vector
+ (Op (f64 (extractelt (v2f64 VR128X:$dst), (iPTR 0))),
+ FR64X:$src))), (i8 1))),
+ (!cast<I>("V"#OpcPrefix#SDZrr_Int) v2f64:$dst,
+ (COPY_TO_REGCLASS FR64X:$src, VR128X))>;
+
+ // vector math op with insert via movsd
+ def : Pat<(v2f64 (X86Movsd (v2f64 VR128X:$dst),
+ (Op (v2f64 VR128X:$dst), (v2f64 VR128X:$src)))),
+ (!cast<I>("V"#OpcPrefix#SDZrr_Int) v2f64:$dst, v2f64:$src)>;
+
+ // vector math op with insert via blend
+ def : Pat<(v2f64 (X86Blendi (v2f64 VR128X:$dst),
+ (Op (v2f64 VR128X:$dst), (v2f64 VR128X:$src)), (i8 1))),
+ (!cast<I>("V"#OpcPrefix#SDZrr_Int) v2f64:$dst, v2f64:$src)>;
+
+    // extracted masked scalar math op with insert via movsd
+ def : Pat<(X86Movsd (v2f64 VR128X:$src1),
+ (scalar_to_vector
+ (X86selects VK1WM:$mask,
+ (Op (f64 (extractelt (v2f64 VR128X:$src1), (iPTR 0))),
+ FR64X:$src2),
+ FR64X:$src0))),
+ (!cast<I>("V"#OpcPrefix#SDZrr_Intk) (COPY_TO_REGCLASS FR64X:$src0, VR128X),
+ VK1WM:$mask, v2f64:$src1,
+ (COPY_TO_REGCLASS FR64X:$src2, VR128X))>;
+ }
+}
+
+defm : AVX512_scalar_math_f64_patterns<fadd, "ADD">;
+defm : AVX512_scalar_math_f64_patterns<fsub, "SUB">;
+defm : AVX512_scalar_math_f64_patterns<fmul, "MUL">;
+defm : AVX512_scalar_math_f64_patterns<fdiv, "DIV">;
diff --git a/contrib/llvm/lib/Target/X86/X86InstrArithmetic.td b/contrib/llvm/lib/Target/X86/X86InstrArithmetic.td
index 1a2e786..bfd21c0 100644
--- a/contrib/llvm/lib/Target/X86/X86InstrArithmetic.td
+++ b/contrib/llvm/lib/Target/X86/X86InstrArithmetic.td
@@ -625,7 +625,7 @@ def Xi32 : X86TypeInfo<i32, "l", GR32, loadi32, i32mem,
Imm32, i32imm, imm32_su, i32i8imm, i32immSExt8_su,
1, OpSize32, 0>;
def Xi64 : X86TypeInfo<i64, "q", GR64, loadi64, i64mem,
- Imm32S, i64i32imm, i64immSExt32, i64i8imm, i64immSExt8,
+ Imm32S, i64i32imm, i64immSExt32_su, i64i8imm, i64immSExt8_su,
1, OpSizeFixed, 1>;
/// ITy - This instruction base class takes the type info for the instruction.
diff --git a/contrib/llvm/lib/Target/X86/X86InstrBuilder.h b/contrib/llvm/lib/Target/X86/X86InstrBuilder.h
index bcea6fa..ba970bc 100644
--- a/contrib/llvm/lib/Target/X86/X86InstrBuilder.h
+++ b/contrib/llvm/lib/Target/X86/X86InstrBuilder.h
@@ -24,9 +24,15 @@
#ifndef LLVM_LIB_TARGET_X86_X86INSTRBUILDER_H
#define LLVM_LIB_TARGET_X86_X86INSTRBUILDER_H
+#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include <cassert>
namespace llvm {
@@ -57,12 +63,11 @@ struct X86AddressMode {
Base.Reg = 0;
}
-
void getFullAddress(SmallVectorImpl<MachineOperand> &MO) {
assert(Scale == 1 || Scale == 2 || Scale == 4 || Scale == 8);
if (BaseType == X86AddressMode::RegBase)
- MO.push_back(MachineOperand::CreateReg(Base.Reg, false, false,
+ MO.push_back(MachineOperand::CreateReg(Base.Reg, false, false, false,
false, false, false, 0, false));
else {
assert(BaseType == X86AddressMode::FrameIndexBase);
@@ -70,44 +75,45 @@ struct X86AddressMode {
}
MO.push_back(MachineOperand::CreateImm(Scale));
- MO.push_back(MachineOperand::CreateReg(IndexReg, false, false,
- false, false, false, 0, false));
+ MO.push_back(MachineOperand::CreateReg(IndexReg, false, false, false, false,
+ false, false, 0, false));
if (GV)
MO.push_back(MachineOperand::CreateGA(GV, Disp, GVOpFlags));
else
MO.push_back(MachineOperand::CreateImm(Disp));
- MO.push_back(MachineOperand::CreateReg(0, false, false,
- false, false, false, 0, false));
+ MO.push_back(MachineOperand::CreateReg(0, false, false, false, false, false,
+ false, 0, false));
}
};
/// Compute the addressing mode from a machine instruction starting with the
/// given operand.
-static inline X86AddressMode getAddressFromInstr(MachineInstr *MI,
+static inline X86AddressMode getAddressFromInstr(const MachineInstr *MI,
unsigned Operand) {
X86AddressMode AM;
- MachineOperand &Op = MI->getOperand(Operand);
- if (Op.isReg()) {
+ const MachineOperand &Op0 = MI->getOperand(Operand);
+ if (Op0.isReg()) {
AM.BaseType = X86AddressMode::RegBase;
- AM.Base.Reg = Op.getReg();
+ AM.Base.Reg = Op0.getReg();
} else {
AM.BaseType = X86AddressMode::FrameIndexBase;
- AM.Base.FrameIndex = Op.getIndex();
- }
- Op = MI->getOperand(Operand + 1);
- if (Op.isImm())
- AM.Scale = Op.getImm();
- Op = MI->getOperand(Operand + 2);
- if (Op.isImm())
- AM.IndexReg = Op.getImm();
- Op = MI->getOperand(Operand + 3);
- if (Op.isGlobal()) {
- AM.GV = Op.getGlobal();
- } else {
- AM.Disp = Op.getImm();
+ AM.Base.FrameIndex = Op0.getIndex();
}
+
+ const MachineOperand &Op1 = MI->getOperand(Operand + 1);
+ AM.Scale = Op1.getImm();
+
+ const MachineOperand &Op2 = MI->getOperand(Operand + 2);
+ AM.IndexReg = Op2.getReg();
+
+ const MachineOperand &Op3 = MI->getOperand(Operand + 3);
+ if (Op3.isGlobal())
+ AM.GV = Op3.getGlobal();
+ else
+ AM.Disp = Op3.getImm();
+
return AM;
}
@@ -122,12 +128,28 @@ addDirectMem(const MachineInstrBuilder &MIB, unsigned Reg) {
return MIB.addReg(Reg).addImm(1).addReg(0).addImm(0).addReg(0);
}
+/// Replace the address used in the instruction with the direct memory
+/// reference.
+static inline void setDirectAddressInInstr(MachineInstr *MI, unsigned Operand,
+ unsigned Reg) {
+  // A direct memory address has the form: Reg, 1 (Scale), NoReg, 0, NoReg.
+ MI->getOperand(Operand).setReg(Reg);
+ MI->getOperand(Operand + 1).setImm(1);
+ MI->getOperand(Operand + 2).setReg(0);
+ MI->getOperand(Operand + 3).setImm(0);
+ MI->getOperand(Operand + 4).setReg(0);
+}
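For context, the builder helpers in this header are normally chained onto BuildMI to append the canonical five-operand x86 memory reference (base, scale, index, displacement, segment). A hypothetical usage sketch follows; the helper name emitLoadFromReg and the surrounding objects are assumptions for illustration, not part of this patch:

  #include "X86InstrBuilder.h"   // addDirectMem
  #include "X86InstrInfo.h"      // X86::MOV32rm, TargetInstrInfo

  // Emit "mov DestReg, dword ptr [AddrReg]" in front of MI.  addDirectMem
  // appends: base = AddrReg, scale = 1, no index, displacement 0, no segment.
  static void emitLoadFromReg(llvm::MachineBasicBlock &MBB, llvm::MachineInstr &MI,
                              const llvm::DebugLoc &DL,
                              const llvm::TargetInstrInfo &TII,
                              unsigned DestReg, unsigned AddrReg) {
    llvm::addDirectMem(
        llvm::BuildMI(MBB, MI, DL, TII.get(llvm::X86::MOV32rm), DestReg), AddrReg);
  }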
static inline const MachineInstrBuilder &
addOffset(const MachineInstrBuilder &MIB, int Offset) {
return MIB.addImm(1).addReg(0).addImm(Offset).addReg(0);
}
+static inline const MachineInstrBuilder &
+addOffset(const MachineInstrBuilder &MIB, const MachineOperand& Offset) {
+ return MIB.addImm(1).addReg(0).addOperand(Offset).addReg(0);
+}
+
/// addRegOffset - This function is used to add a memory reference of the form
/// [Reg + Offset], i.e., one with no scale or index, but with a
/// displacement. An example is: DWORD PTR [EAX + 4].
@@ -177,7 +199,7 @@ static inline const MachineInstrBuilder &
addFrameReference(const MachineInstrBuilder &MIB, int FI, int Offset = 0) {
MachineInstr *MI = MIB;
MachineFunction &MF = *MI->getParent()->getParent();
- MachineFrameInfo &MFI = *MF.getFrameInfo();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
const MCInstrDesc &MCID = MI->getDesc();
auto Flags = MachineMemOperand::MONone;
if (MCID.mayLoad())
@@ -206,6 +228,6 @@ addConstantPoolReference(const MachineInstrBuilder &MIB, unsigned CPI,
.addConstantPoolIndex(CPI, 0, OpFlags).addReg(0);
}
-} // End llvm namespace
+} // end namespace llvm
-#endif
+#endif // LLVM_LIB_TARGET_X86_X86INSTRBUILDER_H
diff --git a/contrib/llvm/lib/Target/X86/X86InstrCompiler.td b/contrib/llvm/lib/Target/X86/X86InstrCompiler.td
index 925f4ef..3c27eb8 100644
--- a/contrib/llvm/lib/Target/X86/X86InstrCompiler.td
+++ b/contrib/llvm/lib/Target/X86/X86InstrCompiler.td
@@ -723,7 +723,7 @@ defm LOCK_DEC : LOCK_ArithUnOp<0xFE, 0xFF, MRM1m, -1, "dec">;
multiclass LCMPXCHG_UnOp<bits<8> Opc, Format Form, string mnemonic,
SDPatternOperator frag, X86MemOperand x86memop,
InstrItinClass itin> {
-let isCodeGenOnly = 1 in {
+let isCodeGenOnly = 1, usesCustomInserter = 1 in {
def NAME : I<Opc, Form, (outs), (ins x86memop:$ptr),
!strconcat(mnemonic, "\t$ptr"),
[(frag addr:$ptr)], itin>, TB, LOCK;
@@ -1025,53 +1025,6 @@ def : Pat<(store (i32 -1), addr:$dst), (OR32mi8 addr:$dst, -1)>;
def : Pat<(store (i64 -1), addr:$dst), (OR64mi8 addr:$dst, -1)>;
}
-// ConstantPool GlobalAddress, ExternalSymbol, and JumpTable
-def : Pat<(i32 (X86Wrapper tconstpool :$dst)), (MOV32ri tconstpool :$dst)>;
-def : Pat<(i32 (X86Wrapper tjumptable :$dst)), (MOV32ri tjumptable :$dst)>;
-def : Pat<(i32 (X86Wrapper tglobaltlsaddr:$dst)),(MOV32ri tglobaltlsaddr:$dst)>;
-def : Pat<(i32 (X86Wrapper tglobaladdr :$dst)), (MOV32ri tglobaladdr :$dst)>;
-def : Pat<(i32 (X86Wrapper texternalsym:$dst)), (MOV32ri texternalsym:$dst)>;
-def : Pat<(i32 (X86Wrapper mcsym:$dst)), (MOV32ri mcsym:$dst)>;
-def : Pat<(i32 (X86Wrapper tblockaddress:$dst)), (MOV32ri tblockaddress:$dst)>;
-
-def : Pat<(add GR32:$src1, (X86Wrapper tconstpool:$src2)),
- (ADD32ri GR32:$src1, tconstpool:$src2)>;
-def : Pat<(add GR32:$src1, (X86Wrapper tjumptable:$src2)),
- (ADD32ri GR32:$src1, tjumptable:$src2)>;
-def : Pat<(add GR32:$src1, (X86Wrapper tglobaladdr :$src2)),
- (ADD32ri GR32:$src1, tglobaladdr:$src2)>;
-def : Pat<(add GR32:$src1, (X86Wrapper texternalsym:$src2)),
- (ADD32ri GR32:$src1, texternalsym:$src2)>;
-def : Pat<(add GR32:$src1, (X86Wrapper mcsym:$src2)),
- (ADD32ri GR32:$src1, mcsym:$src2)>;
-def : Pat<(add GR32:$src1, (X86Wrapper tblockaddress:$src2)),
- (ADD32ri GR32:$src1, tblockaddress:$src2)>;
-
-def : Pat<(store (i32 (X86Wrapper tglobaladdr:$src)), addr:$dst),
- (MOV32mi addr:$dst, tglobaladdr:$src)>;
-def : Pat<(store (i32 (X86Wrapper texternalsym:$src)), addr:$dst),
- (MOV32mi addr:$dst, texternalsym:$src)>;
-def : Pat<(store (i32 (X86Wrapper mcsym:$src)), addr:$dst),
- (MOV32mi addr:$dst, mcsym:$src)>;
-def : Pat<(store (i32 (X86Wrapper tblockaddress:$src)), addr:$dst),
- (MOV32mi addr:$dst, tblockaddress:$src)>;
-
-// ConstantPool GlobalAddress, ExternalSymbol, and JumpTable when not in small
-// code model mode, should use 'movabs'. FIXME: This is really a hack, the
-// 'movabs' predicate should handle this sort of thing.
-def : Pat<(i64 (X86Wrapper tconstpool :$dst)),
- (MOV64ri tconstpool :$dst)>, Requires<[FarData]>;
-def : Pat<(i64 (X86Wrapper tjumptable :$dst)),
- (MOV64ri tjumptable :$dst)>, Requires<[FarData]>;
-def : Pat<(i64 (X86Wrapper tglobaladdr :$dst)),
- (MOV64ri tglobaladdr :$dst)>, Requires<[FarData]>;
-def : Pat<(i64 (X86Wrapper texternalsym:$dst)),
- (MOV64ri texternalsym:$dst)>, Requires<[FarData]>;
-def : Pat<(i64 (X86Wrapper mcsym:$dst)),
- (MOV64ri mcsym:$dst)>, Requires<[FarData]>;
-def : Pat<(i64 (X86Wrapper tblockaddress:$dst)),
- (MOV64ri tblockaddress:$dst)>, Requires<[FarData]>;
-
// In kernel code model, we can get the address of a label
// into a register with 'movq'. FIXME: This is a hack, the 'imm' predicate of
// the MOV64ri32 should accept these.
@@ -1289,15 +1242,13 @@ def : Pat<(i64 (anyext GR32:$src)),
// Any instruction that defines a 32-bit result leaves the high half of the
// register. Truncate can be lowered to EXTRACT_SUBREG. CopyFromReg may
-// be copying from a truncate. And x86's cmov doesn't do anything if the
-// condition is false. But any other 32-bit operation will zero-extend
+// be copying from a truncate. Any other 32-bit operation will zero-extend
// up to 64 bits.
def def32 : PatLeaf<(i32 GR32:$src), [{
return N->getOpcode() != ISD::TRUNCATE &&
N->getOpcode() != TargetOpcode::EXTRACT_SUBREG &&
N->getOpcode() != ISD::CopyFromReg &&
- N->getOpcode() != ISD::AssertSext &&
- N->getOpcode() != X86ISD::CMOV;
+ N->getOpcode() != ISD::AssertSext;
}]>;
// In the case of a 32-bit def that is known to implicitly zero-extend,
@@ -1711,6 +1662,22 @@ defm : MaskedShiftAmountPats<sra, "SAR">;
defm : MaskedShiftAmountPats<rotl, "ROL">;
defm : MaskedShiftAmountPats<rotr, "ROR">;
+// Double shift amount is implicitly masked.
+multiclass MaskedDoubleShiftAmountPats<SDNode frag, string name> {
+ // (shift x (and y, 31)) ==> (shift x, y)
+ def : Pat<(frag GR16:$src1, GR16:$src2, (and CL, immShift32)),
+ (!cast<Instruction>(name # "16rrCL") GR16:$src1, GR16:$src2)>;
+ def : Pat<(frag GR32:$src1, GR32:$src2, (and CL, immShift32)),
+ (!cast<Instruction>(name # "32rrCL") GR32:$src1, GR32:$src2)>;
+
+ // (shift x (and y, 63)) ==> (shift x, y)
+ def : Pat<(frag GR64:$src1, GR64:$src2, (and CL, immShift64)),
+ (!cast<Instruction>(name # "64rrCL") GR64:$src1, GR64:$src2)>;
+}
+
+defm : MaskedDoubleShiftAmountPats<X86shld, "SHLD">;
+defm : MaskedDoubleShiftAmountPats<X86shrd, "SHRD">;
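These patterns drop a redundant (and CL, 31)/(and CL, 63) because shld/shrd already reduce the shift count modulo the operand size. A sketch of the source-level shape, with an illustrative function name; whether shld is actually formed depends on the usual DAG combines, the point is only that the explicit mask adds nothing:

  #include <cstdint>

  // High 32 bits of (hi:lo << n), the classic double-precision shift.
  uint32_t shld_shape(uint32_t hi, uint32_t lo, unsigned n) {
    n &= 31;                                    // redundant on x86: shld masks the count itself
    uint64_t wide = (uint64_t(hi) << 32) | lo;  // hi:lo as one 64-bit value
    return uint32_t((wide << n) >> 32);         // well-defined for n in [0, 31]
  }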
+
// (anyext (setcc_carry)) -> (setcc_carry)
def : Pat<(i16 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
(SETB_C16r)>;
@@ -1719,9 +1686,6 @@ def : Pat<(i32 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
def : Pat<(i32 (anyext (i16 (X86setcc_c X86_COND_B, EFLAGS)))),
(SETB_C32r)>;
-
-
-
//===----------------------------------------------------------------------===//
// EFLAGS-defining Patterns
//===----------------------------------------------------------------------===//
diff --git a/contrib/llvm/lib/Target/X86/X86InstrControl.td b/contrib/llvm/lib/Target/X86/X86InstrControl.td
index bb5f911..2f260c4 100644
--- a/contrib/llvm/lib/Target/X86/X86InstrControl.td
+++ b/contrib/llvm/lib/Target/X86/X86InstrControl.td
@@ -239,7 +239,6 @@ let isCall = 1 in
// Tail call stuff.
-
let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1,
isCodeGenOnly = 1, SchedRW = [WriteJumpLd] in
let Uses = [ESP] in {
@@ -257,6 +256,7 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1,
(ins i32imm_pcrel:$dst),
"jmp\t$dst",
[], IIC_JMP_REL>;
+
def TAILJMPr : I<0xFF, MRM4r, (outs), (ins ptr_rc_tailcall:$dst),
"", [], IIC_JMP_REG>; // FIXME: Remove encoding when JIT is dead.
let mayLoad = 1 in
@@ -296,17 +296,18 @@ let isCall = 1, Uses = [RSP], SchedRW = [WriteJump] in {
let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1,
isCodeGenOnly = 1, Uses = [RSP], usesCustomInserter = 1,
SchedRW = [WriteJump] in {
- def TCRETURNdi64 : PseudoI<(outs),
- (ins i64i32imm_pcrel:$dst, i32imm:$offset),
- []>;
- def TCRETURNri64 : PseudoI<(outs),
- (ins ptr_rc_tailcall:$dst, i32imm:$offset), []>;
+ def TCRETURNdi64 : PseudoI<(outs),
+ (ins i64i32imm_pcrel:$dst, i32imm:$offset),
+ []>;
+ def TCRETURNri64 : PseudoI<(outs),
+ (ins ptr_rc_tailcall:$dst, i32imm:$offset), []>;
let mayLoad = 1 in
- def TCRETURNmi64 : PseudoI<(outs),
- (ins i64mem_TC:$dst, i32imm:$offset), []>;
+ def TCRETURNmi64 : PseudoI<(outs),
+ (ins i64mem_TC:$dst, i32imm:$offset), []>;
def TAILJMPd64 : Ii32PCRel<0xE9, RawFrm, (outs), (ins i64i32imm_pcrel:$dst),
"jmp\t$dst", [], IIC_JMP_REL>;
+
def TAILJMPr64 : I<0xFF, MRM4r, (outs), (ins ptr_rc_tailcall:$dst),
"jmp{q}\t{*}$dst", [], IIC_JMP_MEM>;
@@ -314,11 +315,8 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1,
def TAILJMPm64 : I<0xFF, MRM4m, (outs), (ins i64mem_TC:$dst),
"jmp{q}\t{*}$dst", [], IIC_JMP_MEM>;
- // Win64 wants jumps leaving the function to have a REX_W prefix.
+ // Win64 wants indirect jumps leaving the function to have a REX_W prefix.
let hasREX_WPrefix = 1 in {
- def TAILJMPd64_REX : Ii32PCRel<0xE9, RawFrm, (outs),
- (ins i64i32imm_pcrel:$dst),
- "rex64 jmp\t$dst", [], IIC_JMP_REL>;
def TAILJMPr64_REX : I<0xFF, MRM4r, (outs), (ins ptr_rc_tailcall:$dst),
"rex64 jmp{q}\t{*}$dst", [], IIC_JMP_MEM>;
diff --git a/contrib/llvm/lib/Target/X86/X86InstrFMA.td b/contrib/llvm/lib/Target/X86/X86InstrFMA.td
index fd800cf..4b19f80 100644
--- a/contrib/llvm/lib/Target/X86/X86InstrFMA.td
+++ b/contrib/llvm/lib/Target/X86/X86InstrFMA.td
@@ -39,7 +39,6 @@ multiclass fma3p_rm<bits<8> opc, string OpcodeStr,
PatFrag MemFrag128, PatFrag MemFrag256,
ValueType OpVT128, ValueType OpVT256,
SDPatternOperator Op = null_frag> {
- let usesCustomInserter = 1 in
def r : FMA3<opc, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, VR128:$src3),
!strconcat(OpcodeStr,
@@ -55,8 +54,7 @@ multiclass fma3p_rm<bits<8> opc, string OpcodeStr,
[(set VR128:$dst, (OpVT128 (Op VR128:$src2, VR128:$src1,
(MemFrag128 addr:$src3))))]>;
- let usesCustomInserter = 1 in
- def rY : FMA3<opc, MRMSrcReg, (outs VR256:$dst),
+ def Yr : FMA3<opc, MRMSrcReg, (outs VR256:$dst),
(ins VR256:$src1, VR256:$src2, VR256:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
@@ -64,7 +62,7 @@ multiclass fma3p_rm<bits<8> opc, string OpcodeStr,
VR256:$src3)))]>, VEX_L;
let mayLoad = 1 in
- def mY : FMA3<opc, MRMSrcMem, (outs VR256:$dst),
+ def Ym : FMA3<opc, MRMSrcMem, (outs VR256:$dst),
(ins VR256:$src1, VR256:$src2, f256mem:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
@@ -74,60 +72,61 @@ multiclass fma3p_rm<bits<8> opc, string OpcodeStr,
}
multiclass fma3p_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231,
- string OpcodeStr, string PackTy,
+ string OpcodeStr, string PackTy, string Suff,
PatFrag MemFrag128, PatFrag MemFrag256,
SDNode Op, ValueType OpTy128, ValueType OpTy256> {
- defm r213 : fma3p_rm<opc213,
- !strconcat(OpcodeStr, "213", PackTy),
- MemFrag128, MemFrag256, OpTy128, OpTy256, Op>;
- defm r132 : fma3p_rm<opc132,
- !strconcat(OpcodeStr, "132", PackTy),
- MemFrag128, MemFrag256, OpTy128, OpTy256>;
- defm r231 : fma3p_rm<opc231,
- !strconcat(OpcodeStr, "231", PackTy),
- MemFrag128, MemFrag256, OpTy128, OpTy256>;
+ defm NAME#213#Suff : fma3p_rm<opc213,
+ !strconcat(OpcodeStr, "213", PackTy),
+ MemFrag128, MemFrag256, OpTy128, OpTy256, Op>;
+ defm NAME#132#Suff : fma3p_rm<opc132,
+ !strconcat(OpcodeStr, "132", PackTy),
+ MemFrag128, MemFrag256, OpTy128, OpTy256>;
+ defm NAME#231#Suff : fma3p_rm<opc231,
+ !strconcat(OpcodeStr, "231", PackTy),
+ MemFrag128, MemFrag256, OpTy128, OpTy256>;
}
// Fused Multiply-Add
let ExeDomain = SSEPackedSingle in {
- defm VFMADDPS : fma3p_forms<0x98, 0xA8, 0xB8, "vfmadd", "ps", loadv4f32,
- loadv8f32, X86Fmadd, v4f32, v8f32>;
- defm VFMSUBPS : fma3p_forms<0x9A, 0xAA, 0xBA, "vfmsub", "ps", loadv4f32,
- loadv8f32, X86Fmsub, v4f32, v8f32>;
- defm VFMADDSUBPS : fma3p_forms<0x96, 0xA6, 0xB6, "vfmaddsub", "ps",
- loadv4f32, loadv8f32, X86Fmaddsub,
- v4f32, v8f32>;
- defm VFMSUBADDPS : fma3p_forms<0x97, 0xA7, 0xB7, "vfmsubadd", "ps",
- loadv4f32, loadv8f32, X86Fmsubadd,
- v4f32, v8f32>;
+ defm VFMADD : fma3p_forms<0x98, 0xA8, 0xB8, "vfmadd", "ps", "PS",
+ loadv4f32, loadv8f32, X86Fmadd, v4f32, v8f32>;
+ defm VFMSUB : fma3p_forms<0x9A, 0xAA, 0xBA, "vfmsub", "ps", "PS",
+ loadv4f32, loadv8f32, X86Fmsub, v4f32, v8f32>;
+ defm VFMADDSUB : fma3p_forms<0x96, 0xA6, 0xB6, "vfmaddsub", "ps", "PS",
+ loadv4f32, loadv8f32, X86Fmaddsub,
+ v4f32, v8f32>;
+ defm VFMSUBADD : fma3p_forms<0x97, 0xA7, 0xB7, "vfmsubadd", "ps", "PS",
+ loadv4f32, loadv8f32, X86Fmsubadd,
+ v4f32, v8f32>;
}
let ExeDomain = SSEPackedDouble in {
- defm VFMADDPD : fma3p_forms<0x98, 0xA8, 0xB8, "vfmadd", "pd", loadv2f64,
- loadv4f64, X86Fmadd, v2f64, v4f64>, VEX_W;
- defm VFMSUBPD : fma3p_forms<0x9A, 0xAA, 0xBA, "vfmsub", "pd", loadv2f64,
- loadv4f64, X86Fmsub, v2f64, v4f64>, VEX_W;
- defm VFMADDSUBPD : fma3p_forms<0x96, 0xA6, 0xB6, "vfmaddsub", "pd",
- loadv2f64, loadv4f64, X86Fmaddsub,
- v2f64, v4f64>, VEX_W;
- defm VFMSUBADDPD : fma3p_forms<0x97, 0xA7, 0xB7, "vfmsubadd", "pd",
- loadv2f64, loadv4f64, X86Fmsubadd,
- v2f64, v4f64>, VEX_W;
+ defm VFMADD : fma3p_forms<0x98, 0xA8, 0xB8, "vfmadd", "pd", "PD",
+ loadv2f64, loadv4f64, X86Fmadd, v2f64,
+ v4f64>, VEX_W;
+ defm VFMSUB : fma3p_forms<0x9A, 0xAA, 0xBA, "vfmsub", "pd", "PD",
+ loadv2f64, loadv4f64, X86Fmsub, v2f64,
+ v4f64>, VEX_W;
+ defm VFMADDSUB : fma3p_forms<0x96, 0xA6, 0xB6, "vfmaddsub", "pd", "PD",
+ loadv2f64, loadv4f64, X86Fmaddsub,
+ v2f64, v4f64>, VEX_W;
+ defm VFMSUBADD : fma3p_forms<0x97, 0xA7, 0xB7, "vfmsubadd", "pd", "PD",
+ loadv2f64, loadv4f64, X86Fmsubadd,
+ v2f64, v4f64>, VEX_W;
}
// Fused Negative Multiply-Add
let ExeDomain = SSEPackedSingle in {
- defm VFNMADDPS : fma3p_forms<0x9C, 0xAC, 0xBC, "vfnmadd", "ps", loadv4f32,
- loadv8f32, X86Fnmadd, v4f32, v8f32>;
- defm VFNMSUBPS : fma3p_forms<0x9E, 0xAE, 0xBE, "vfnmsub", "ps", loadv4f32,
- loadv8f32, X86Fnmsub, v4f32, v8f32>;
+ defm VFNMADD : fma3p_forms<0x9C, 0xAC, 0xBC, "vfnmadd", "ps", "PS", loadv4f32,
+ loadv8f32, X86Fnmadd, v4f32, v8f32>;
+ defm VFNMSUB : fma3p_forms<0x9E, 0xAE, 0xBE, "vfnmsub", "ps", "PS", loadv4f32,
+ loadv8f32, X86Fnmsub, v4f32, v8f32>;
}
let ExeDomain = SSEPackedDouble in {
- defm VFNMADDPD : fma3p_forms<0x9C, 0xAC, 0xBC, "vfnmadd", "pd", loadv2f64,
- loadv4f64, X86Fnmadd, v2f64, v4f64>, VEX_W;
- defm VFNMSUBPD : fma3p_forms<0x9E, 0xAE, 0xBE, "vfnmsub", "pd",
- loadv2f64, loadv4f64, X86Fnmsub, v2f64,
- v4f64>, VEX_W;
+ defm VFNMADD : fma3p_forms<0x9C, 0xAC, 0xBC, "vfnmadd", "pd", "PD", loadv2f64,
+ loadv4f64, X86Fnmadd, v2f64, v4f64>, VEX_W;
+ defm VFNMSUB : fma3p_forms<0x9E, 0xAE, 0xBE, "vfnmsub", "pd", "PD", loadv2f64,
+ loadv4f64, X86Fnmsub, v2f64, v4f64>, VEX_W;
}
// All source register operands of FMA opcodes defined in fma3s_rm multiclass
@@ -143,7 +142,6 @@ let Constraints = "$src1 = $dst", isCommutable = 1, hasSideEffects = 0 in
multiclass fma3s_rm<bits<8> opc, string OpcodeStr,
X86MemOperand x86memop, RegisterClass RC,
SDPatternOperator OpNode = null_frag> {
- let usesCustomInserter = 1 in
def r : FMA3<opc, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, RC:$src2, RC:$src3),
!strconcat(OpcodeStr,
@@ -191,13 +189,15 @@ multiclass fma3s_rm_int<bits<8> opc, string OpcodeStr,
}
multiclass fma3s_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231,
- string OpStr, string PackTy,
+ string OpStr, string PackTy, string Suff,
SDNode OpNode, RegisterClass RC,
X86MemOperand x86memop> {
- defm r132 : fma3s_rm<opc132, !strconcat(OpStr, "132", PackTy), x86memop, RC>;
- defm r213 : fma3s_rm<opc213, !strconcat(OpStr, "213", PackTy), x86memop, RC,
- OpNode>;
- defm r231 : fma3s_rm<opc231, !strconcat(OpStr, "231", PackTy), x86memop, RC>;
+ defm NAME#132#Suff : fma3s_rm<opc132, !strconcat(OpStr, "132", PackTy),
+ x86memop, RC>;
+ defm NAME#213#Suff : fma3s_rm<opc213, !strconcat(OpStr, "213", PackTy),
+ x86memop, RC, OpNode>;
+ defm NAME#231#Suff : fma3s_rm<opc231, !strconcat(OpStr, "231", PackTy),
+ x86memop, RC>;
}
// The FMA 213 form is created for lowering of scalar FMA intrinsics
@@ -210,42 +210,45 @@ multiclass fma3s_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231,
// form of FMA*_Int instructions is done using an optimistic assumption that
// such analysis will be implemented eventually.
multiclass fma3s_int_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231,
- string OpStr, string PackTy,
+ string OpStr, string PackTy, string Suff,
RegisterClass RC, Operand memop> {
- defm r132 : fma3s_rm_int<opc132, !strconcat(OpStr, "132", PackTy),
- memop, RC>;
- defm r213 : fma3s_rm_int<opc213, !strconcat(OpStr, "213", PackTy),
- memop, RC>;
- defm r231 : fma3s_rm_int<opc231, !strconcat(OpStr, "231", PackTy),
- memop, RC>;
+ defm NAME#132#Suff : fma3s_rm_int<opc132, !strconcat(OpStr, "132", PackTy),
+ memop, RC>;
+ defm NAME#213#Suff : fma3s_rm_int<opc213, !strconcat(OpStr, "213", PackTy),
+ memop, RC>;
+ defm NAME#231#Suff : fma3s_rm_int<opc231, !strconcat(OpStr, "231", PackTy),
+ memop, RC>;
}
multiclass fma3s<bits<8> opc132, bits<8> opc213, bits<8> opc231,
string OpStr, Intrinsic IntF32, Intrinsic IntF64,
SDNode OpNode> {
let ExeDomain = SSEPackedSingle in
- defm SS : fma3s_forms<opc132, opc213, opc231, OpStr, "ss", OpNode,
- FR32, f32mem>,
- fma3s_int_forms<opc132, opc213, opc231, OpStr, "ss", VR128, ssmem>;
+ defm NAME : fma3s_forms<opc132, opc213, opc231, OpStr, "ss", "SS", OpNode,
+ FR32, f32mem>,
+ fma3s_int_forms<opc132, opc213, opc231, OpStr, "ss", "SS",
+ VR128, ssmem>;
let ExeDomain = SSEPackedDouble in
- defm SD : fma3s_forms<opc132, opc213, opc231, OpStr, "sd", OpNode,
+ defm NAME : fma3s_forms<opc132, opc213, opc231, OpStr, "sd", "SD", OpNode,
FR64, f64mem>,
- fma3s_int_forms<opc132, opc213, opc231, OpStr, "sd", VR128, sdmem>,
- VEX_W;
+ fma3s_int_forms<opc132, opc213, opc231, OpStr, "sd", "SD",
+ VR128, sdmem>, VEX_W;
// These patterns use the 123 ordering, instead of 213, even though
// they match the intrinsic to the 213 version of the instruction.
// This is because src1 is tied to dest, and the scalar intrinsics
// require the pass-through values to come from the first source
// operand, not the second.
- def : Pat<(IntF32 VR128:$src1, VR128:$src2, VR128:$src3),
- (COPY_TO_REGCLASS(!cast<Instruction>(NAME#"SSr213r_Int")
- $src1, $src2, $src3), VR128)>;
+ let Predicates = [HasFMA] in {
+ def : Pat<(IntF32 VR128:$src1, VR128:$src2, VR128:$src3),
+ (COPY_TO_REGCLASS(!cast<Instruction>(NAME#"213SSr_Int")
+ $src1, $src2, $src3), VR128)>;
- def : Pat<(IntF64 VR128:$src1, VR128:$src2, VR128:$src3),
- (COPY_TO_REGCLASS(!cast<Instruction>(NAME#"SDr213r_Int")
- $src1, $src2, $src3), VR128)>;
+ def : Pat<(IntF64 VR128:$src1, VR128:$src2, VR128:$src3),
+ (COPY_TO_REGCLASS(!cast<Instruction>(NAME#"213SDr_Int")
+ $src1, $src2, $src3), VR128)>;
+ }
}
defm VFMADD : fma3s<0x99, 0xA9, 0xB9, "vfmadd", int_x86_fma_vfmadd_ss,
@@ -268,18 +271,18 @@ multiclass fma4s<bits<8> opc, string OpcodeStr, RegisterClass RC,
X86MemOperand x86memop, ValueType OpVT, SDNode OpNode,
PatFrag mem_frag> {
let isCommutable = 1 in
- def rr : FMA4<opc, MRMSrcReg, (outs RC:$dst),
+ def rr : FMA4<opc, MRMSrcRegOp4, (outs RC:$dst),
(ins RC:$src1, RC:$src2, RC:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set RC:$dst,
- (OpVT (OpNode RC:$src1, RC:$src2, RC:$src3)))]>, VEX_W, VEX_LIG, MemOp4;
- def rm : FMA4<opc, MRMSrcMem, (outs RC:$dst),
+ (OpVT (OpNode RC:$src1, RC:$src2, RC:$src3)))]>, VEX_W, VEX_LIG;
+ def rm : FMA4<opc, MRMSrcMemOp4, (outs RC:$dst),
(ins RC:$src1, RC:$src2, x86memop:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set RC:$dst, (OpNode RC:$src1, RC:$src2,
- (mem_frag addr:$src3)))]>, VEX_W, VEX_LIG, MemOp4;
+ (mem_frag addr:$src3)))]>, VEX_W, VEX_LIG;
def mr : FMA4<opc, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, x86memop:$src2, RC:$src3),
!strconcat(OpcodeStr,
@@ -298,19 +301,18 @@ let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
multiclass fma4s_int<bits<8> opc, string OpcodeStr, Operand memop,
ComplexPattern mem_cpat, Intrinsic Int> {
let isCodeGenOnly = 1 in {
- let isCommutable = 1 in
- def rr_Int : FMA4<opc, MRMSrcReg, (outs VR128:$dst),
+ def rr_Int : FMA4<opc, MRMSrcRegOp4, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, VR128:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set VR128:$dst,
- (Int VR128:$src1, VR128:$src2, VR128:$src3))]>, VEX_W, VEX_LIG, MemOp4;
- def rm_Int : FMA4<opc, MRMSrcMem, (outs VR128:$dst),
+ (Int VR128:$src1, VR128:$src2, VR128:$src3))]>, VEX_W, VEX_LIG;
+ def rm_Int : FMA4<opc, MRMSrcMemOp4, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, memop:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set VR128:$dst, (Int VR128:$src1, VR128:$src2,
- mem_cpat:$src3))]>, VEX_W, VEX_LIG, MemOp4;
+ mem_cpat:$src3))]>, VEX_W, VEX_LIG;
def mr_Int : FMA4<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, memop:$src2, VR128:$src3),
!strconcat(OpcodeStr,
@@ -324,19 +326,19 @@ multiclass fma4p<bits<8> opc, string OpcodeStr, SDNode OpNode,
ValueType OpVT128, ValueType OpVT256,
PatFrag ld_frag128, PatFrag ld_frag256> {
let isCommutable = 1 in
- def rr : FMA4<opc, MRMSrcReg, (outs VR128:$dst),
+ def rr : FMA4<opc, MRMSrcRegOp4, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, VR128:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set VR128:$dst,
(OpVT128 (OpNode VR128:$src1, VR128:$src2, VR128:$src3)))]>,
- VEX_W, MemOp4;
- def rm : FMA4<opc, MRMSrcMem, (outs VR128:$dst),
+ VEX_W;
+ def rm : FMA4<opc, MRMSrcMemOp4, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, f128mem:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set VR128:$dst, (OpNode VR128:$src1, VR128:$src2,
- (ld_frag128 addr:$src3)))]>, VEX_W, MemOp4;
+ (ld_frag128 addr:$src3)))]>, VEX_W;
def mr : FMA4<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, f128mem:$src2, VR128:$src3),
!strconcat(OpcodeStr,
@@ -344,20 +346,20 @@ multiclass fma4p<bits<8> opc, string OpcodeStr, SDNode OpNode,
[(set VR128:$dst,
(OpNode VR128:$src1, (ld_frag128 addr:$src2), VR128:$src3))]>;
let isCommutable = 1 in
- def rrY : FMA4<opc, MRMSrcReg, (outs VR256:$dst),
+ def Yrr : FMA4<opc, MRMSrcRegOp4, (outs VR256:$dst),
(ins VR256:$src1, VR256:$src2, VR256:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set VR256:$dst,
(OpVT256 (OpNode VR256:$src1, VR256:$src2, VR256:$src3)))]>,
- VEX_W, MemOp4, VEX_L;
- def rmY : FMA4<opc, MRMSrcMem, (outs VR256:$dst),
+ VEX_W, VEX_L;
+ def Yrm : FMA4<opc, MRMSrcMemOp4, (outs VR256:$dst),
(ins VR256:$src1, VR256:$src2, f256mem:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set VR256:$dst, (OpNode VR256:$src1, VR256:$src2,
- (ld_frag256 addr:$src3)))]>, VEX_W, MemOp4, VEX_L;
- def mrY : FMA4<opc, MRMSrcMem, (outs VR256:$dst),
+ (ld_frag256 addr:$src3)))]>, VEX_W, VEX_L;
+ def Ymr : FMA4<opc, MRMSrcMem, (outs VR256:$dst),
(ins VR256:$src1, f256mem:$src2, VR256:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
@@ -369,7 +371,7 @@ let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in {
(ins VR128:$src1, VR128:$src2, VR128:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), []>;
- def rrY_REV : FMA4<opc, MRMSrcReg, (outs VR256:$dst),
+ def Yrr_REV : FMA4<opc, MRMSrcReg, (outs VR256:$dst),
(ins VR256:$src1, VR256:$src2, VR256:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), []>,
diff --git a/contrib/llvm/lib/Target/X86/X86InstrFMA3Info.cpp b/contrib/llvm/lib/Target/X86/X86InstrFMA3Info.cpp
new file mode 100644
index 0000000..db83497
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrFMA3Info.cpp
@@ -0,0 +1,285 @@
+//===-- X86InstrFMA3Info.cpp - X86 FMA3 Instruction Information -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the implementation of the classes providing information
+// about existing X86 FMA3 opcodes, classifying and grouping them.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86InstrFMA3Info.h"
+#include "X86InstrInfo.h"
+#include "llvm/Support/ManagedStatic.h"
+#include "llvm/Support/Threading.h"
+using namespace llvm;
+
+/// This flag is used in the method llvm::call_once() used below to make the
+/// initialization of the map 'OpcodeToGroup' thread safe.
+LLVM_DEFINE_ONCE_FLAG(InitGroupsOnceFlag);
+
+static ManagedStatic<X86InstrFMA3Info> X86InstrFMA3InfoObj;
+X86InstrFMA3Info *X86InstrFMA3Info::getX86InstrFMA3Info() {
+ return &*X86InstrFMA3InfoObj;
+}
+
+void X86InstrFMA3Info::initRMGroup(const uint16_t *RegOpcodes,
+ const uint16_t *MemOpcodes, unsigned Attr) {
+ // Create a new instance of this class that would hold a group of FMA opcodes.
+ X86InstrFMA3Group *G = new X86InstrFMA3Group(RegOpcodes, MemOpcodes, Attr);
+
+  // Add the references from individual opcodes to the group holding them.
+ assert((!OpcodeToGroup[RegOpcodes[0]] && !OpcodeToGroup[RegOpcodes[1]] &&
+ !OpcodeToGroup[RegOpcodes[2]] && !OpcodeToGroup[MemOpcodes[0]] &&
+ !OpcodeToGroup[MemOpcodes[1]] && !OpcodeToGroup[MemOpcodes[2]]) &&
+ "Duplication or rewrite of elements in OpcodeToGroup.");
+ OpcodeToGroup[RegOpcodes[0]] = G;
+ OpcodeToGroup[RegOpcodes[1]] = G;
+ OpcodeToGroup[RegOpcodes[2]] = G;
+ OpcodeToGroup[MemOpcodes[0]] = G;
+ OpcodeToGroup[MemOpcodes[1]] = G;
+ OpcodeToGroup[MemOpcodes[2]] = G;
+}
+
+void X86InstrFMA3Info::initRGroup(const uint16_t *RegOpcodes, unsigned Attr) {
+ // Create a new instance of this class that would hold a group of FMA opcodes.
+ X86InstrFMA3Group *G = new X86InstrFMA3Group(RegOpcodes, nullptr, Attr);
+
+  // Add the references from individual opcodes to the group holding them.
+ assert((!OpcodeToGroup[RegOpcodes[0]] && !OpcodeToGroup[RegOpcodes[1]] &&
+ !OpcodeToGroup[RegOpcodes[2]]) &&
+ "Duplication or rewrite of elements in OpcodeToGroup.");
+ OpcodeToGroup[RegOpcodes[0]] = G;
+ OpcodeToGroup[RegOpcodes[1]] = G;
+ OpcodeToGroup[RegOpcodes[2]] = G;
+}
+
+void X86InstrFMA3Info::initMGroup(const uint16_t *MemOpcodes, unsigned Attr) {
+ // Create a new instance of this class that would hold a group of FMA opcodes.
+ X86InstrFMA3Group *G = new X86InstrFMA3Group(nullptr, MemOpcodes, Attr);
+
+  // Add the references from individual opcodes to the group holding them.
+ assert((!OpcodeToGroup[MemOpcodes[0]] && !OpcodeToGroup[MemOpcodes[1]] &&
+ !OpcodeToGroup[MemOpcodes[2]]) &&
+ "Duplication or rewrite of elements in OpcodeToGroup.");
+ OpcodeToGroup[MemOpcodes[0]] = G;
+ OpcodeToGroup[MemOpcodes[1]] = G;
+ OpcodeToGroup[MemOpcodes[2]] = G;
+}
+
+#define FMA3RM(R132, R213, R231, M132, M213, M231) \
+ static const uint16_t Reg##R132[3] = {X86::R132, X86::R213, X86::R231}; \
+ static const uint16_t Mem##R132[3] = {X86::M132, X86::M213, X86::M231}; \
+ initRMGroup(Reg##R132, Mem##R132);
+
+#define FMA3RMA(R132, R213, R231, M132, M213, M231, Attrs) \
+ static const uint16_t Reg##R132[3] = {X86::R132, X86::R213, X86::R231}; \
+ static const uint16_t Mem##R132[3] = {X86::M132, X86::M213, X86::M231}; \
+ initRMGroup(Reg##R132, Mem##R132, (Attrs));
+
+#define FMA3R(R132, R213, R231) \
+ static const uint16_t Reg##R132[3] = {X86::R132, X86::R213, X86::R231}; \
+ initRGroup(Reg##R132);
+
+#define FMA3RA(R132, R213, R231, Attrs) \
+ static const uint16_t Reg##R132[3] = {X86::R132, X86::R213, X86::R231}; \
+ initRGroup(Reg##R132, (Attrs));
+
+#define FMA3M(M132, M213, M231) \
+ static const uint16_t Mem##M132[3] = {X86::M132, X86::M213, X86::M231}; \
+ initMGroup(Mem##M132);
+
+#define FMA3MA(M132, M213, M231, Attrs) \
+ static const uint16_t Mem##M132[3] = {X86::M132, X86::M213, X86::M231}; \
+ initMGroup(Mem##M132, (Attrs));
+
+#define FMA3_AVX2_VECTOR_GROUP(Name) \
+ FMA3RM(Name##132PSr, Name##213PSr, Name##231PSr, \
+ Name##132PSm, Name##213PSm, Name##231PSm); \
+ FMA3RM(Name##132PDr, Name##213PDr, Name##231PDr, \
+ Name##132PDm, Name##213PDm, Name##231PDm); \
+ FMA3RM(Name##132PSYr, Name##213PSYr, Name##231PSYr, \
+ Name##132PSYm, Name##213PSYm, Name##231PSYm); \
+ FMA3RM(Name##132PDYr, Name##213PDYr, Name##231PDYr, \
+ Name##132PDYm, Name##213PDYm, Name##231PDYm);
+
+#define FMA3_AVX2_SCALAR_GROUP(Name) \
+ FMA3RM(Name##132SSr, Name##213SSr, Name##231SSr, \
+ Name##132SSm, Name##213SSm, Name##231SSm); \
+ FMA3RM(Name##132SDr, Name##213SDr, Name##231SDr, \
+ Name##132SDm, Name##213SDm, Name##231SDm); \
+ FMA3RMA(Name##132SSr_Int, Name##213SSr_Int, Name##231SSr_Int, \
+ Name##132SSm_Int, Name##213SSm_Int, Name##231SSm_Int, \
+ X86InstrFMA3Group::X86FMA3Intrinsic); \
+ FMA3RMA(Name##132SDr_Int, Name##213SDr_Int, Name##231SDr_Int, \
+ Name##132SDm_Int, Name##213SDm_Int, Name##231SDm_Int, \
+ X86InstrFMA3Group::X86FMA3Intrinsic);
+
+#define FMA3_AVX2_FULL_GROUP(Name) \
+ FMA3_AVX2_VECTOR_GROUP(Name); \
+ FMA3_AVX2_SCALAR_GROUP(Name);
+
+#define FMA3_AVX512_VECTOR_GROUP(Name) \
+ FMA3RM(Name##132PSZ128r, Name##213PSZ128r, Name##231PSZ128r, \
+ Name##132PSZ128m, Name##213PSZ128m, Name##231PSZ128m); \
+ FMA3RM(Name##132PDZ128r, Name##213PDZ128r, Name##231PDZ128r, \
+ Name##132PDZ128m, Name##213PDZ128m, Name##231PDZ128m); \
+ FMA3RM(Name##132PSZ256r, Name##213PSZ256r, Name##231PSZ256r, \
+ Name##132PSZ256m, Name##213PSZ256m, Name##231PSZ256m); \
+ FMA3RM(Name##132PDZ256r, Name##213PDZ256r, Name##231PDZ256r, \
+ Name##132PDZ256m, Name##213PDZ256m, Name##231PDZ256m); \
+ FMA3RM(Name##132PSZr, Name##213PSZr, Name##231PSZr, \
+ Name##132PSZm, Name##213PSZm, Name##231PSZm); \
+ FMA3RM(Name##132PDZr, Name##213PDZr, Name##231PDZr, \
+ Name##132PDZm, Name##213PDZm, Name##231PDZm); \
+ FMA3RMA(Name##132PSZ128rk, Name##213PSZ128rk, Name##231PSZ128rk, \
+ Name##132PSZ128mk, Name##213PSZ128mk, Name##231PSZ128mk, \
+ X86InstrFMA3Group::X86FMA3KMergeMasked); \
+ FMA3RMA(Name##132PDZ128rk, Name##213PDZ128rk, Name##231PDZ128rk, \
+ Name##132PDZ128mk, Name##213PDZ128mk, Name##231PDZ128mk, \
+ X86InstrFMA3Group::X86FMA3KMergeMasked); \
+ FMA3RMA(Name##132PSZ256rk, Name##213PSZ256rk, Name##231PSZ256rk, \
+ Name##132PSZ256mk, Name##213PSZ256mk, Name##231PSZ256mk, \
+ X86InstrFMA3Group::X86FMA3KMergeMasked); \
+ FMA3RMA(Name##132PDZ256rk, Name##213PDZ256rk, Name##231PDZ256rk, \
+ Name##132PDZ256mk, Name##213PDZ256mk, Name##231PDZ256mk, \
+ X86InstrFMA3Group::X86FMA3KMergeMasked); \
+ FMA3RMA(Name##132PSZrk, Name##213PSZrk, Name##231PSZrk, \
+ Name##132PSZmk, Name##213PSZmk, Name##231PSZmk, \
+ X86InstrFMA3Group::X86FMA3KMergeMasked); \
+ FMA3RMA(Name##132PDZrk, Name##213PDZrk, Name##231PDZrk, \
+ Name##132PDZmk, Name##213PDZmk, Name##231PDZmk, \
+ X86InstrFMA3Group::X86FMA3KMergeMasked); \
+ FMA3RMA(Name##132PSZ128rkz, Name##213PSZ128rkz, Name##231PSZ128rkz, \
+ Name##132PSZ128mkz, Name##213PSZ128mkz, Name##231PSZ128mkz, \
+ X86InstrFMA3Group::X86FMA3KZeroMasked); \
+ FMA3RMA(Name##132PDZ128rkz, Name##213PDZ128rkz, Name##231PDZ128rkz, \
+ Name##132PDZ128mkz, Name##213PDZ128mkz, Name##231PDZ128mkz, \
+ X86InstrFMA3Group::X86FMA3KZeroMasked); \
+ FMA3RMA(Name##132PSZ256rkz, Name##213PSZ256rkz, Name##231PSZ256rkz, \
+ Name##132PSZ256mkz, Name##213PSZ256mkz, Name##231PSZ256mkz, \
+ X86InstrFMA3Group::X86FMA3KZeroMasked); \
+ FMA3RMA(Name##132PDZ256rkz, Name##213PDZ256rkz, Name##231PDZ256rkz, \
+ Name##132PDZ256mkz, Name##213PDZ256mkz, Name##231PDZ256mkz, \
+ X86InstrFMA3Group::X86FMA3KZeroMasked); \
+ FMA3RMA(Name##132PSZrkz, Name##213PSZrkz, Name##231PSZrkz, \
+ Name##132PSZmkz, Name##213PSZmkz, Name##231PSZmkz, \
+ X86InstrFMA3Group::X86FMA3KZeroMasked); \
+ FMA3RMA(Name##132PDZrkz, Name##213PDZrkz, Name##231PDZrkz, \
+ Name##132PDZmkz, Name##213PDZmkz, Name##231PDZmkz, \
+ X86InstrFMA3Group::X86FMA3KZeroMasked); \
+ FMA3R(Name##132PSZrb, Name##213PSZrb, Name##231PSZrb); \
+ FMA3R(Name##132PDZrb, Name##213PDZrb, Name##231PDZrb); \
+ FMA3RA(Name##132PSZrbk, Name##213PSZrbk, Name##231PSZrbk, \
+ X86InstrFMA3Group::X86FMA3KMergeMasked); \
+ FMA3RA(Name##132PDZrbk, Name##213PDZrbk, Name##231PDZrbk, \
+ X86InstrFMA3Group::X86FMA3KMergeMasked); \
+ FMA3RA(Name##132PSZrbkz, Name##213PSZrbkz, Name##231PSZrbkz, \
+ X86InstrFMA3Group::X86FMA3KZeroMasked); \
+ FMA3RA(Name##132PDZrbkz, Name##213PDZrbkz, Name##231PDZrbkz, \
+ X86InstrFMA3Group::X86FMA3KZeroMasked); \
+ FMA3M(Name##132PSZ128mb, Name##213PSZ128mb, Name##231PSZ128mb); \
+ FMA3M(Name##132PDZ128mb, Name##213PDZ128mb, Name##231PDZ128mb); \
+ FMA3M(Name##132PSZ256mb, Name##213PSZ256mb, Name##231PSZ256mb); \
+ FMA3M(Name##132PDZ256mb, Name##213PDZ256mb, Name##231PDZ256mb); \
+ FMA3M(Name##132PSZmb, Name##213PSZmb, Name##231PSZmb); \
+ FMA3M(Name##132PDZmb, Name##213PDZmb, Name##231PDZmb); \
+ FMA3MA(Name##132PSZ128mbk, Name##213PSZ128mbk, Name##231PSZ128mbk, \
+ X86InstrFMA3Group::X86FMA3KMergeMasked); \
+ FMA3MA(Name##132PDZ128mbk, Name##213PDZ128mbk, Name##231PDZ128mbk, \
+ X86InstrFMA3Group::X86FMA3KMergeMasked); \
+ FMA3MA(Name##132PSZ256mbk, Name##213PSZ256mbk, Name##231PSZ256mbk, \
+ X86InstrFMA3Group::X86FMA3KMergeMasked); \
+ FMA3MA(Name##132PDZ256mbk, Name##213PDZ256mbk, Name##231PDZ256mbk, \
+ X86InstrFMA3Group::X86FMA3KMergeMasked); \
+ FMA3MA(Name##132PSZmbk, Name##213PSZmbk, Name##231PSZmbk, \
+ X86InstrFMA3Group::X86FMA3KMergeMasked); \
+ FMA3MA(Name##132PDZmbk, Name##213PDZmbk, Name##231PDZmbk, \
+ X86InstrFMA3Group::X86FMA3KMergeMasked); \
+ FMA3MA(Name##132PSZ128mbkz, Name##213PSZ128mbkz, Name##231PSZ128mbkz, \
+ X86InstrFMA3Group::X86FMA3KZeroMasked); \
+ FMA3MA(Name##132PDZ128mbkz, Name##213PDZ128mbkz, Name##231PDZ128mbkz, \
+ X86InstrFMA3Group::X86FMA3KZeroMasked); \
+ FMA3MA(Name##132PSZ256mbkz, Name##213PSZ256mbkz, Name##231PSZ256mbkz, \
+ X86InstrFMA3Group::X86FMA3KZeroMasked); \
+ FMA3MA(Name##132PDZ256mbkz, Name##213PDZ256mbkz, Name##231PDZ256mbkz, \
+ X86InstrFMA3Group::X86FMA3KZeroMasked); \
+ FMA3MA(Name##132PSZmbkz, Name##213PSZmbkz, Name##231PSZmbkz, \
+ X86InstrFMA3Group::X86FMA3KZeroMasked); \
+ FMA3MA(Name##132PDZmbkz, Name##213PDZmbkz, Name##231PDZmbkz, \
+ X86InstrFMA3Group::X86FMA3KZeroMasked);
+
+#define FMA3_AVX512_SCALAR_GROUP(Name) \
+ FMA3RM(Name##132SSZr, Name##213SSZr, Name##231SSZr, \
+ Name##132SSZm, Name##213SSZm, Name##231SSZm); \
+ FMA3RM(Name##132SDZr, Name##213SDZr, Name##231SDZr, \
+ Name##132SDZm, Name##213SDZm, Name##231SDZm); \
+ FMA3RMA(Name##132SSZr_Int, Name##213SSZr_Int, Name##231SSZr_Int, \
+ Name##132SSZm_Int, Name##213SSZm_Int, Name##231SSZm_Int, \
+ X86InstrFMA3Group::X86FMA3Intrinsic); \
+ FMA3RMA(Name##132SDZr_Int, Name##213SDZr_Int, Name##231SDZr_Int, \
+ Name##132SDZm_Int, Name##213SDZm_Int, Name##231SDZm_Int, \
+ X86InstrFMA3Group::X86FMA3Intrinsic); \
+ FMA3RMA(Name##132SSZr_Intk, Name##213SSZr_Intk, Name##231SSZr_Intk, \
+ Name##132SSZm_Intk, Name##213SSZm_Intk, Name##231SSZm_Intk, \
+ X86InstrFMA3Group::X86FMA3Intrinsic | \
+ X86InstrFMA3Group::X86FMA3KMergeMasked); \
+ FMA3RMA(Name##132SDZr_Intk, Name##213SDZr_Intk, Name##231SDZr_Intk, \
+ Name##132SDZm_Intk, Name##213SDZm_Intk, Name##231SDZm_Intk, \
+ X86InstrFMA3Group::X86FMA3Intrinsic | \
+ X86InstrFMA3Group::X86FMA3KMergeMasked); \
+ FMA3RMA(Name##132SSZr_Intkz, Name##213SSZr_Intkz, Name##231SSZr_Intkz, \
+ Name##132SSZm_Intkz, Name##213SSZm_Intkz, Name##231SSZm_Intkz, \
+ X86InstrFMA3Group::X86FMA3Intrinsic | \
+ X86InstrFMA3Group::X86FMA3KZeroMasked); \
+ FMA3RMA(Name##132SDZr_Intkz, Name##213SDZr_Intkz, Name##231SDZr_Intkz, \
+ Name##132SDZm_Intkz, Name##213SDZm_Intkz, Name##231SDZm_Intkz, \
+ X86InstrFMA3Group::X86FMA3Intrinsic | \
+ X86InstrFMA3Group::X86FMA3KZeroMasked); \
+ FMA3RA(Name##132SSZrb_Int, Name##213SSZrb_Int, Name##231SSZrb_Int, \
+ X86InstrFMA3Group::X86FMA3Intrinsic); \
+ FMA3RA(Name##132SDZrb_Int, Name##213SDZrb_Int, Name##231SDZrb_Int, \
+ X86InstrFMA3Group::X86FMA3Intrinsic); \
+ FMA3RA(Name##132SSZrb_Intk, Name##213SSZrb_Intk, Name##231SSZrb_Intk, \
+ X86InstrFMA3Group::X86FMA3Intrinsic | \
+ X86InstrFMA3Group::X86FMA3KMergeMasked); \
+ FMA3RA(Name##132SDZrb_Intk, Name##213SDZrb_Intk, Name##231SDZrb_Intk, \
+ X86InstrFMA3Group::X86FMA3Intrinsic | \
+ X86InstrFMA3Group::X86FMA3KMergeMasked); \
+ FMA3RA(Name##132SSZrb_Intkz, Name##213SSZrb_Intkz, Name##231SSZrb_Intkz, \
+ X86InstrFMA3Group::X86FMA3Intrinsic | \
+ X86InstrFMA3Group::X86FMA3KZeroMasked); \
+ FMA3RA(Name##132SDZrb_Intkz, Name##213SDZrb_Intkz, Name##231SDZrb_Intkz, \
+ X86InstrFMA3Group::X86FMA3Intrinsic | \
+ X86InstrFMA3Group::X86FMA3KZeroMasked);
+
+#define FMA3_AVX512_FULL_GROUP(Name) \
+ FMA3_AVX512_VECTOR_GROUP(Name); \
+ FMA3_AVX512_SCALAR_GROUP(Name);
+
+void X86InstrFMA3Info::initGroupsOnceImpl() {
+ FMA3_AVX2_FULL_GROUP(VFMADD);
+ FMA3_AVX2_FULL_GROUP(VFMSUB);
+ FMA3_AVX2_FULL_GROUP(VFNMADD);
+ FMA3_AVX2_FULL_GROUP(VFNMSUB);
+
+ FMA3_AVX2_VECTOR_GROUP(VFMADDSUB);
+ FMA3_AVX2_VECTOR_GROUP(VFMSUBADD);
+
+ FMA3_AVX512_FULL_GROUP(VFMADD);
+ FMA3_AVX512_FULL_GROUP(VFMSUB);
+ FMA3_AVX512_FULL_GROUP(VFNMADD);
+ FMA3_AVX512_FULL_GROUP(VFNMSUB);
+
+ FMA3_AVX512_VECTOR_GROUP(VFMADDSUB);
+ FMA3_AVX512_VECTOR_GROUP(VFMSUBADD);
+}
+
+void X86InstrFMA3Info::initGroupsOnce() {
+ llvm::call_once(InitGroupsOnceFlag,
+ []() { getX86InstrFMA3Info()->initGroupsOnceImpl(); });
+}
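
The macro layer above is dense, so here is a minimal, self-contained sketch (not the actual LLVM code; the opcode values and the map payload are stand-ins) of what a single FMA3RM(...) invocation boils down to after token pasting: two static arrays of {132, 213, 231} opcodes and one call that registers all six opcodes under the same group.

#include <cstdint>
#include <iostream>
#include <unordered_map>

namespace X86 { // stand-in opcode values; the real ones come from the generated enum
enum : uint16_t { VFMADD132PSr = 1, VFMADD213PSr = 2, VFMADD231PSr = 3,
                  VFMADD132PSm = 4, VFMADD213PSm = 5, VFMADD231PSm = 6 };
}

// Simplified stand-in for X86InstrFMA3Info::initRMGroup(): the real code
// allocates an X86InstrFMA3Group and maps every opcode of the group to it.
static std::unordered_map<unsigned, const uint16_t *> OpcodeToGroup;

static void initRMGroup(const uint16_t *RegOpcodes, const uint16_t *MemOpcodes) {
  for (int Form = 0; Form < 3; ++Form) {
    OpcodeToGroup[RegOpcodes[Form]] = RegOpcodes;
    OpcodeToGroup[MemOpcodes[Form]] = RegOpcodes;
  }
}

int main() {
  // FMA3RM(VFMADD132PSr, VFMADD213PSr, VFMADD231PSr,
  //        VFMADD132PSm, VFMADD213PSm, VFMADD231PSm) expands to roughly this:
  static const uint16_t RegVFMADD132PSr[3] = {
      X86::VFMADD132PSr, X86::VFMADD213PSr, X86::VFMADD231PSr};
  static const uint16_t MemVFMADD132PSr[3] = {
      X86::VFMADD132PSm, X86::VFMADD213PSm, X86::VFMADD231PSm};
  initRMGroup(RegVFMADD132PSr, MemVFMADD132PSr);

  std::cout << OpcodeToGroup.size() << " opcodes mapped to one group\n"; // prints 6
}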
diff --git a/contrib/llvm/lib/Target/X86/X86InstrFMA3Info.h b/contrib/llvm/lib/Target/X86/X86InstrFMA3Info.h
new file mode 100644
index 0000000..025cee3
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrFMA3Info.h
@@ -0,0 +1,315 @@
+//===-- X86InstrFMA3Info.h - X86 FMA3 Instruction Information -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the implementation of the classes providing information
+// about existing X86 FMA3 opcodes, classifying and grouping them.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_UTILS_X86INSTRFMA3INFO_H
+#define LLVM_LIB_TARGET_X86_UTILS_X86INSTRFMA3INFO_H
+
+#include "X86.h"
+#include "llvm/ADT/DenseMap.h"
+#include <cassert>
+#include <set>
+
+namespace llvm {
+/// This class is used to group {132, 213, 231} forms of FMA opcodes together.
+/// Each of the groups has either 3 register opcodes, 3 memory opcodes,
+/// or 6 register and memory opcodes. Also, each group has an attributes field
+/// describing it.
+class X86InstrFMA3Group {
+private:
+ /// Reference to an array holding 3 forms of register FMA opcodes.
+ /// It may be set to nullptr if the group of FMA opcodes does not have
+ /// any register form opcodes.
+ const uint16_t *RegOpcodes;
+
+ /// Reference to an array holding 3 forms of memory FMA opcodes.
+ /// It may be set to nullptr if the group of FMA opcodes does not have
+ /// any memory form opcodes.
+ const uint16_t *MemOpcodes;
+
+ /// This bitfield specifies the attributes associated with the created
+ /// FMA groups of opcodes.
+ unsigned Attributes;
+
+ static const unsigned Form132 = 0;
+ static const unsigned Form213 = 1;
+ static const unsigned Form231 = 2;
+
+public:
+ /// This bit must be set in the 'Attributes' field of FMA group if such
+ /// group of FMA opcodes consists of FMA intrinsic opcodes.
+ static const unsigned X86FMA3Intrinsic = 0x1;
+
+ /// This bit must be set in the 'Attributes' field of FMA group if such
+ /// group of FMA opcodes consists of AVX512 opcodes accepting a k-mask and
+ /// passing the elements from the 1st operand to the result of the operation
+ /// when the corresponding bits in the k-mask are unset.
+ static const unsigned X86FMA3KMergeMasked = 0x2;
+
+ /// This bit must be set in the 'Attributes' field of FMA group if such
+ /// group of FMA opcodes consists of AVX512 opcodes accepting a k-zeromask.
+ static const unsigned X86FMA3KZeroMasked = 0x4;
+
+ /// Constructor. Creates a new group of FMA opcodes with three register form
+ /// FMA opcodes \p RegOpcodes and three memory form FMA opcodes \p MemOpcodes.
+ /// The parameters \p RegOpcodes and \p MemOpcodes may be set to nullptr,
+ /// which means that the created group of FMA opcodes does not have the
+ /// corresponding (register or memory) opcodes.
+ /// The parameter \p Attr specifies the attributes describing the created
+ /// group.
+ X86InstrFMA3Group(const uint16_t *RegOpcodes, const uint16_t *MemOpcodes,
+ unsigned Attr)
+ : RegOpcodes(RegOpcodes), MemOpcodes(MemOpcodes), Attributes(Attr) {
+ assert((RegOpcodes || MemOpcodes) &&
+ "Cannot create a group not having any opcodes.");
+ }
+
+ /// Returns a memory form opcode that is the equivalent of the given register
+ /// form opcode \p RegOpcode. 0 is returned if the group does not have
+ /// either register or memory opcodes.
+ unsigned getMemOpcode(unsigned RegOpcode) const {
+ if (!RegOpcodes || !MemOpcodes)
+ return 0;
+ for (unsigned Form = 0; Form < 3; Form++)
+ if (RegOpcodes[Form] == RegOpcode)
+ return MemOpcodes[Form];
+ return 0;
+ }
+
+ /// Returns the 132 form of FMA register opcode.
+ unsigned getReg132Opcode() const {
+ assert(RegOpcodes && "The group does not have register opcodes.");
+ return RegOpcodes[Form132];
+ }
+
+ /// Returns the 213 form of FMA register opcode.
+ unsigned getReg213Opcode() const {
+ assert(RegOpcodes && "The group does not have register opcodes.");
+ return RegOpcodes[Form213];
+ }
+
+ /// Returns the 231 form of FMA register opcode.
+ unsigned getReg231Opcode() const {
+ assert(RegOpcodes && "The group does not have register opcodes.");
+ return RegOpcodes[Form231];
+ }
+
+ /// Returns the 132 form of FMA memory opcode.
+ unsigned getMem132Opcode() const {
+ assert(MemOpcodes && "The group does not have memory opcodes.");
+ return MemOpcodes[Form132];
+ }
+
+ /// Returns the 213 form of FMA memory opcode.
+ unsigned getMem213Opcode() const {
+ assert(MemOpcodes && "The group does not have memory opcodes.");
+ return MemOpcodes[Form213];
+ }
+
+ /// Returns the 231 form of FMA memory opcode.
+ unsigned getMem231Opcode() const {
+ assert(MemOpcodes && "The group does not have memory opcodes.");
+ return MemOpcodes[Form231];
+ }
+
+ /// Returns true iff the group of FMA opcodes holds intrinsic opcodes.
+ bool isIntrinsic() const { return (Attributes & X86FMA3Intrinsic) != 0; }
+
+ /// Returns true iff the group of FMA opcodes holds k-merge-masked opcodes.
+ bool isKMergeMasked() const {
+ return (Attributes & X86FMA3KMergeMasked) != 0;
+ }
+
+ /// Returns true iff the group of FMA opcodes holds k-zero-masked opcodes.
+ bool isKZeroMasked() const { return (Attributes & X86FMA3KZeroMasked) != 0; }
+
+ /// Returns true iff the group of FMA opcodes holds any of k-masked opcodes.
+ bool isKMasked() const {
+ return (Attributes & (X86FMA3KMergeMasked | X86FMA3KZeroMasked)) != 0;
+ }
+
+ /// Returns true iff the given \p Opcode is a register opcode from the
+ /// groups of FMA opcodes.
+ bool isRegOpcodeFromGroup(unsigned Opcode) const {
+ if (!RegOpcodes)
+ return false;
+ for (unsigned Form = 0; Form < 3; Form++)
+ if (Opcode == RegOpcodes[Form])
+ return true;
+ return false;
+ }
+
+ /// Returns true iff the given \p Opcode is a memory opcode from the
+ /// groups of FMA opcodes.
+ bool isMemOpcodeFromGroup(unsigned Opcode) const {
+ if (!MemOpcodes)
+ return false;
+ for (unsigned Form = 0; Form < 3; Form++)
+ if (Opcode == MemOpcodes[Form])
+ return true;
+ return false;
+ }
+};
+
+/// This class provides information about all existing FMA3 opcodes
+///
+class X86InstrFMA3Info {
+private:
+ /// A map that is used to find the group of FMA opcodes using any FMA opcode
+ /// from the group.
+ DenseMap<unsigned, const X86InstrFMA3Group *> OpcodeToGroup;
+
+ /// Creates groups of FMA opcodes and initializes Opcode-to-Group map.
+ /// This method can be called many times, but the actual initialization is
+ /// called only once.
+ static void initGroupsOnce();
+
+ /// Creates groups of FMA opcodes and initializes Opcode-to-Group map.
+ /// This method must be called ONLY from initGroupsOnce(). Otherwise, such
+ /// a call is not thread-safe.
+ void initGroupsOnceImpl();
+
+ /// Creates one group of FMA opcodes having the register opcodes
+ /// \p RegOpcodes and memory opcodes \p MemOpcodes. The parameter \p Attr
+ /// specifies the attributes describing the created group.
+ void initRMGroup(const uint16_t *RegOpcodes,
+ const uint16_t *MemOpcodes, unsigned Attr = 0);
+
+ /// Creates one group of FMA opcodes having only the register opcodes
+ /// \p RegOpcodes. The parameter \p Attr specifies the attributes describing
+ /// the created group.
+ void initRGroup(const uint16_t *RegOpcodes, unsigned Attr = 0);
+
+ /// Creates one group of FMA opcodes having only the memory opcodes
+ /// \p MemOpcodes. The parameter \p Attr specifies the attributes describing
+ /// the created group.
+ void initMGroup(const uint16_t *MemOpcodes, unsigned Attr = 0);
+
+public:
+ /// Returns a pointer to the single object of this class. It is assumed
+ /// that only one object may exist.
+ static X86InstrFMA3Info *getX86InstrFMA3Info();
+
+ /// Constructor. Just creates an object of the class.
+ X86InstrFMA3Info() {}
+
+ /// Destructor. Deallocates the memory used for FMA3 Groups.
+ ~X86InstrFMA3Info() {
+ std::set<const X86InstrFMA3Group *> DeletedGroups;
+ auto E = OpcodeToGroup.end();
+ for (auto I = OpcodeToGroup.begin(); I != E; I++) {
+ const X86InstrFMA3Group *G = I->second;
+ if (DeletedGroups.find(G) == DeletedGroups.end()) {
+ DeletedGroups.insert(G);
+ delete G;
+ }
+ }
+ }
+
+ /// Returns a pointer to the group of FMA3 opcodes that includes the given
+ /// \p Opcode. If the given \p Opcode is not recognized as FMA3 and thus not
+ /// included in any FMA3 group, then nullptr is returned.
+ static const X86InstrFMA3Group *getFMA3Group(unsigned Opcode) {
+ // Ensure that the groups of opcodes are initialized.
+ initGroupsOnce();
+
+ // Find the group including the given opcode.
+ const X86InstrFMA3Info *FMA3Info = getX86InstrFMA3Info();
+ auto I = FMA3Info->OpcodeToGroup.find(Opcode);
+ if (I == FMA3Info->OpcodeToGroup.end())
+ return nullptr;
+
+ return I->second;
+ }
+
+ /// Returns true iff the given \p Opcode is recognized as FMA3 by this class.
+ static bool isFMA3(unsigned Opcode) {
+ return getFMA3Group(Opcode) != nullptr;
+ }
+
+ /// Iterator that is used to walk over FMA register opcodes that have
+ /// memory form equivalents.
+ class rm_iterator {
+ private:
+ /// Iterator associated with the OpcodeToGroup map. It must always be
+ /// initialized with an entry from OpcodeToGroup for which I->first
+ /// points to a register FMA opcode and I->second points to a group of
+ /// FMA opcodes having memory form equivalent of I->first.
+ DenseMap<unsigned, const X86InstrFMA3Group *>::const_iterator I;
+
+ public:
+ /// Constructor. Creates an rm_iterator. The parameter \p I must be an
+ /// iterator into the OpcodeToGroup map with I->first pointing to a
+ /// register form FMA opcode and I->second pointing to a group of FMA
+ /// opcodes holding the memory form equivalent of I->first.
+ rm_iterator(DenseMap<unsigned, const X86InstrFMA3Group *>::const_iterator I)
+ : I(I) {}
+
+ /// Returns the register form FMA opcode.
+ unsigned getRegOpcode() const { return I->first; }
+
+ /// Returns the memory form equivalent opcode for FMA register opcode
+ /// referenced by I->first.
+ unsigned getMemOpcode() const {
+ unsigned Opcode = I->first;
+ const X86InstrFMA3Group *Group = I->second;
+ return Group->getMemOpcode(Opcode);
+ }
+
+ /// Returns a reference to a group of FMA opcodes.
+ const X86InstrFMA3Group *getGroup() const { return I->second; }
+
+ bool operator==(const rm_iterator &OtherIt) const { return I == OtherIt.I; }
+ bool operator!=(const rm_iterator &OtherIt) const { return I != OtherIt.I; }
+
+ /// Increment. Advances the 'I' iterator to the next OpcodeToGroup entry
+ /// with I->first pointing to a register form FMA opcode and I->second
+ /// pointing to a group of FMA opcodes holding the memory form equivalent
+ /// of I->first.
+ rm_iterator &operator++() {
+ auto E = getX86InstrFMA3Info()->OpcodeToGroup.end();
+ for (++I; I != E; ++I) {
+ unsigned RegOpcode = I->first;
+ const X86InstrFMA3Group *Group = I->second;
+ if (Group->getMemOpcode(RegOpcode) != 0)
+ break;
+ }
+ return *this;
+ }
+ };
+
+ /// Returns an rm_iterator pointing to the first entry of the OpcodeToGroup
+ /// map whose register FMA opcode has a memory form equivalent.
+ static rm_iterator rm_begin() {
+ initGroupsOnce();
+ const X86InstrFMA3Info *FMA3Info = getX86InstrFMA3Info();
+ auto I = FMA3Info->OpcodeToGroup.begin();
+ auto E = FMA3Info->OpcodeToGroup.end();
+ while (I != E) {
+ unsigned Opcode = I->first;
+ const X86InstrFMA3Group *G = I->second;
+ if (G->getMemOpcode(Opcode) != 0)
+ break;
+ I++;
+ }
+ return rm_iterator(I);
+ }
+
+ /// Returns the past-the-end rm_iterator.
+ static rm_iterator rm_end() {
+ initGroupsOnce();
+ return rm_iterator(getX86InstrFMA3Info()->OpcodeToGroup.end());
+ }
+};
+} // namespace llvm
+
+#endif
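
As a usage note (a sketch only, not code from this patch): a caller that wants to fold a register-form FMA into its memory form would go through the interface declared above roughly as shown below. Whether intrinsic forms should be skipped is caller policy, shown here only as an example.

#include "X86InstrFMA3Info.h"

// Hypothetical helper: map a register-form FMA3 opcode to its memory form.
static unsigned foldFMA3ToMemForm(unsigned RegOpcode) {
  const llvm::X86InstrFMA3Group *Group =
      llvm::X86InstrFMA3Info::getFMA3Group(RegOpcode);
  if (!Group)                // not an FMA3 opcode at all
    return 0;
  if (Group->isIntrinsic())  // example policy: leave intrinsic forms alone
    return 0;
  return Group->getMemOpcode(RegOpcode); // 0 if the group has no memory forms
}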
diff --git a/contrib/llvm/lib/Target/X86/X86InstrFPStack.td b/contrib/llvm/lib/Target/X86/X86InstrFPStack.td
index 078dab4..10f3839 100644
--- a/contrib/llvm/lib/Target/X86/X86InstrFPStack.td
+++ b/contrib/llvm/lib/Target/X86/X86InstrFPStack.td
@@ -711,19 +711,19 @@ def : Pat<(X86fildflag addr:$src, i64), (ILD_Fp64m64 addr:$src)>;
// FP extensions map onto simple pseudo-value conversions if they are to/from
// the FP stack.
-def : Pat<(f64 (fextend RFP32:$src)), (COPY_TO_REGCLASS RFP32:$src, RFP64)>,
+def : Pat<(f64 (fpextend RFP32:$src)), (COPY_TO_REGCLASS RFP32:$src, RFP64)>,
Requires<[FPStackf32]>;
-def : Pat<(f80 (fextend RFP32:$src)), (COPY_TO_REGCLASS RFP32:$src, RFP80)>,
+def : Pat<(f80 (fpextend RFP32:$src)), (COPY_TO_REGCLASS RFP32:$src, RFP80)>,
Requires<[FPStackf32]>;
-def : Pat<(f80 (fextend RFP64:$src)), (COPY_TO_REGCLASS RFP64:$src, RFP80)>,
+def : Pat<(f80 (fpextend RFP64:$src)), (COPY_TO_REGCLASS RFP64:$src, RFP80)>,
Requires<[FPStackf64]>;
// FP truncations map onto simple pseudo-value conversions if they are to/from
// the FP stack. We have validated that only value-preserving truncations make
// it through isel.
-def : Pat<(f32 (fround RFP64:$src)), (COPY_TO_REGCLASS RFP64:$src, RFP32)>,
+def : Pat<(f32 (fpround RFP64:$src)), (COPY_TO_REGCLASS RFP64:$src, RFP32)>,
Requires<[FPStackf32]>;
-def : Pat<(f32 (fround RFP80:$src)), (COPY_TO_REGCLASS RFP80:$src, RFP32)>,
+def : Pat<(f32 (fpround RFP80:$src)), (COPY_TO_REGCLASS RFP80:$src, RFP32)>,
Requires<[FPStackf32]>;
-def : Pat<(f64 (fround RFP80:$src)), (COPY_TO_REGCLASS RFP80:$src, RFP64)>,
+def : Pat<(f64 (fpround RFP80:$src)), (COPY_TO_REGCLASS RFP80:$src, RFP64)>,
Requires<[FPStackf64]>;
diff --git a/contrib/llvm/lib/Target/X86/X86InstrFormats.td b/contrib/llvm/lib/Target/X86/X86InstrFormats.td
index 5183adc..610756a 100644
--- a/contrib/llvm/lib/Target/X86/X86InstrFormats.td
+++ b/contrib/llvm/lib/Target/X86/X86InstrFormats.td
@@ -18,43 +18,53 @@ class Format<bits<7> val> {
bits<7> Value = val;
}
-def Pseudo : Format<0>; def RawFrm : Format<1>;
-def AddRegFrm : Format<2>; def MRMDestReg : Format<3>;
-def MRMDestMem : Format<4>; def MRMSrcReg : Format<5>;
-def MRMSrcMem : Format<6>; def RawFrmMemOffs : Format<7>;
-def RawFrmSrc : Format<8>; def RawFrmDst : Format<9>;
-def RawFrmDstSrc: Format<10>;
-def RawFrmImm8 : Format<11>;
-def RawFrmImm16 : Format<12>;
-def MRMXr : Format<14>; def MRMXm : Format<15>;
-def MRM0r : Format<16>; def MRM1r : Format<17>; def MRM2r : Format<18>;
-def MRM3r : Format<19>; def MRM4r : Format<20>; def MRM5r : Format<21>;
-def MRM6r : Format<22>; def MRM7r : Format<23>;
-def MRM0m : Format<24>; def MRM1m : Format<25>; def MRM2m : Format<26>;
-def MRM3m : Format<27>; def MRM4m : Format<28>; def MRM5m : Format<29>;
-def MRM6m : Format<30>; def MRM7m : Format<31>;
-def MRM_C0 : Format<32>; def MRM_C1 : Format<33>; def MRM_C2 : Format<34>;
-def MRM_C3 : Format<35>; def MRM_C4 : Format<36>; def MRM_C5 : Format<37>;
-def MRM_C6 : Format<38>; def MRM_C7 : Format<39>; def MRM_C8 : Format<40>;
-def MRM_C9 : Format<41>; def MRM_CA : Format<42>; def MRM_CB : Format<43>;
-def MRM_CC : Format<44>; def MRM_CD : Format<45>; def MRM_CE : Format<46>;
-def MRM_CF : Format<47>; def MRM_D0 : Format<48>; def MRM_D1 : Format<49>;
-def MRM_D2 : Format<50>; def MRM_D3 : Format<51>; def MRM_D4 : Format<52>;
-def MRM_D5 : Format<53>; def MRM_D6 : Format<54>; def MRM_D7 : Format<55>;
-def MRM_D8 : Format<56>; def MRM_D9 : Format<57>; def MRM_DA : Format<58>;
-def MRM_DB : Format<59>; def MRM_DC : Format<60>; def MRM_DD : Format<61>;
-def MRM_DE : Format<62>; def MRM_DF : Format<63>; def MRM_E0 : Format<64>;
-def MRM_E1 : Format<65>; def MRM_E2 : Format<66>; def MRM_E3 : Format<67>;
-def MRM_E4 : Format<68>; def MRM_E5 : Format<69>; def MRM_E6 : Format<70>;
-def MRM_E7 : Format<71>; def MRM_E8 : Format<72>; def MRM_E9 : Format<73>;
-def MRM_EA : Format<74>; def MRM_EB : Format<75>; def MRM_EC : Format<76>;
-def MRM_ED : Format<77>; def MRM_EE : Format<78>; def MRM_EF : Format<79>;
-def MRM_F0 : Format<80>; def MRM_F1 : Format<81>; def MRM_F2 : Format<82>;
-def MRM_F3 : Format<83>; def MRM_F4 : Format<84>; def MRM_F5 : Format<85>;
-def MRM_F6 : Format<86>; def MRM_F7 : Format<87>; def MRM_F8 : Format<88>;
-def MRM_F9 : Format<89>; def MRM_FA : Format<90>; def MRM_FB : Format<91>;
-def MRM_FC : Format<92>; def MRM_FD : Format<93>; def MRM_FE : Format<94>;
-def MRM_FF : Format<95>;
+def Pseudo : Format<0>;
+def RawFrm : Format<1>;
+def AddRegFrm : Format<2>;
+def RawFrmMemOffs : Format<3>;
+def RawFrmSrc : Format<4>;
+def RawFrmDst : Format<5>;
+def RawFrmDstSrc : Format<6>;
+def RawFrmImm8 : Format<7>;
+def RawFrmImm16 : Format<8>;
+def MRMDestMem : Format<32>;
+def MRMSrcMem : Format<33>;
+def MRMSrcMem4VOp3 : Format<34>;
+def MRMSrcMemOp4 : Format<35>;
+def MRMXm : Format<39>;
+def MRM0m : Format<40>; def MRM1m : Format<41>; def MRM2m : Format<42>;
+def MRM3m : Format<43>; def MRM4m : Format<44>; def MRM5m : Format<45>;
+def MRM6m : Format<46>; def MRM7m : Format<47>;
+def MRMDestReg : Format<48>;
+def MRMSrcReg : Format<49>;
+def MRMSrcReg4VOp3 : Format<50>;
+def MRMSrcRegOp4 : Format<51>;
+def MRMXr : Format<55>;
+def MRM0r : Format<56>; def MRM1r : Format<57>; def MRM2r : Format<58>;
+def MRM3r : Format<59>; def MRM4r : Format<60>; def MRM5r : Format<61>;
+def MRM6r : Format<62>; def MRM7r : Format<63>;
+def MRM_C0 : Format<64>; def MRM_C1 : Format<65>; def MRM_C2 : Format<66>;
+def MRM_C3 : Format<67>; def MRM_C4 : Format<68>; def MRM_C5 : Format<69>;
+def MRM_C6 : Format<70>; def MRM_C7 : Format<71>; def MRM_C8 : Format<72>;
+def MRM_C9 : Format<73>; def MRM_CA : Format<74>; def MRM_CB : Format<75>;
+def MRM_CC : Format<76>; def MRM_CD : Format<77>; def MRM_CE : Format<78>;
+def MRM_CF : Format<79>; def MRM_D0 : Format<80>; def MRM_D1 : Format<81>;
+def MRM_D2 : Format<82>; def MRM_D3 : Format<83>; def MRM_D4 : Format<84>;
+def MRM_D5 : Format<85>; def MRM_D6 : Format<86>; def MRM_D7 : Format<87>;
+def MRM_D8 : Format<88>; def MRM_D9 : Format<89>; def MRM_DA : Format<90>;
+def MRM_DB : Format<91>; def MRM_DC : Format<92>; def MRM_DD : Format<93>;
+def MRM_DE : Format<94>; def MRM_DF : Format<95>; def MRM_E0 : Format<96>;
+def MRM_E1 : Format<97>; def MRM_E2 : Format<98>; def MRM_E3 : Format<99>;
+def MRM_E4 : Format<100>; def MRM_E5 : Format<101>; def MRM_E6 : Format<102>;
+def MRM_E7 : Format<103>; def MRM_E8 : Format<104>; def MRM_E9 : Format<105>;
+def MRM_EA : Format<106>; def MRM_EB : Format<107>; def MRM_EC : Format<108>;
+def MRM_ED : Format<109>; def MRM_EE : Format<110>; def MRM_EF : Format<111>;
+def MRM_F0 : Format<112>; def MRM_F1 : Format<113>; def MRM_F2 : Format<114>;
+def MRM_F3 : Format<115>; def MRM_F4 : Format<116>; def MRM_F5 : Format<117>;
+def MRM_F6 : Format<118>; def MRM_F7 : Format<119>; def MRM_F8 : Format<120>;
+def MRM_F9 : Format<121>; def MRM_FA : Format<122>; def MRM_FB : Format<123>;
+def MRM_FC : Format<124>; def MRM_FD : Format<125>; def MRM_FE : Format<126>;
+def MRM_FF : Format<127>;
// ImmType - This specifies the immediate type used by an instruction. This is
// part of the ad-hoc solution used to emit machine instruction encodings by our
@@ -65,12 +75,13 @@ class ImmType<bits<4> val> {
def NoImm : ImmType<0>;
def Imm8 : ImmType<1>;
def Imm8PCRel : ImmType<2>;
-def Imm16 : ImmType<3>;
-def Imm16PCRel : ImmType<4>;
-def Imm32 : ImmType<5>;
-def Imm32PCRel : ImmType<6>;
-def Imm32S : ImmType<7>;
-def Imm64 : ImmType<8>;
+def Imm8Reg : ImmType<3>; // Register encoded in [7:4].
+def Imm16 : ImmType<4>;
+def Imm16PCRel : ImmType<5>;
+def Imm32 : ImmType<6>;
+def Imm32PCRel : ImmType<7>;
+def Imm32S : ImmType<8>;
+def Imm64 : ImmType<9>;
// FPFormat - This specifies what form this FP instruction has. This is used by
// the Floating-Point stackifier pass.
@@ -190,8 +201,6 @@ class TAXD : TA { Prefix OpPrefix = XD; }
class VEX { Encoding OpEnc = EncVEX; }
class VEX_W { bit hasVEX_WPrefix = 1; }
class VEX_4V : VEX { bit hasVEX_4V = 1; }
-class VEX_4VOp3 : VEX { bit hasVEX_4VOp3 = 1; }
-class VEX_I8IMM { bit hasVEX_i8ImmReg = 1; }
class VEX_L { bit hasVEX_L = 1; }
class VEX_LIG { bit ignoresVEX_L = 1; }
class EVEX : VEX { Encoding OpEnc = EncEVEX; }
@@ -212,10 +221,8 @@ class EVEX_CD8<int esize, CD8VForm form> {
}
class Has3DNow0F0FOpcode { bit has3DNow0F0FOpcode = 1; }
-class MemOp4 { bit hasMemOp4Prefix = 1; }
class XOP { Encoding OpEnc = EncXOP; }
class XOP_4V : XOP { bit hasVEX_4V = 1; }
-class XOP_4VOp3 : XOP { bit hasVEX_4VOp3 = 1; }
class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins,
string AsmStr,
@@ -265,10 +272,6 @@ class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins,
bits<2> OpEncBits = OpEnc.Value;
bit hasVEX_WPrefix = 0; // Does this inst set the VEX_W field?
bit hasVEX_4V = 0; // Does this inst require the VEX.VVVV field?
- bit hasVEX_4VOp3 = 0; // Does this inst require the VEX.VVVV field to
- // encode the third operand?
- bit hasVEX_i8ImmReg = 0; // Does this inst require the last source register
- // to be encoded in a immediate field?
bit hasVEX_L = 0; // Does this inst use large (256-bit) registers?
bit ignoresVEX_L = 0; // Does this instruction ignore the L-bit
bit hasEVEX_K = 0; // Does this inst require masking?
@@ -280,7 +283,6 @@ class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins,
// assigning to bits<7>.
int CD8_EltSize = 0; // Compressed disp8 form - element-size in bytes.
bit has3DNow0F0FOpcode =0;// Wacky 3dNow! encoding?
- bit hasMemOp4Prefix = 0; // Same bit as VEX_W, but used for swapping operands
bit hasEVEX_RC = 0; // Explicitly specified rounding control in FP instruction.
bits<2> EVEX_LL;
@@ -317,19 +319,15 @@ class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins,
let TSFlags{38-31} = Opcode;
let TSFlags{39} = hasVEX_WPrefix;
let TSFlags{40} = hasVEX_4V;
- let TSFlags{41} = hasVEX_4VOp3;
- let TSFlags{42} = hasVEX_i8ImmReg;
- let TSFlags{43} = hasVEX_L;
- let TSFlags{44} = ignoresVEX_L;
- let TSFlags{45} = hasEVEX_K;
- let TSFlags{46} = hasEVEX_Z;
- let TSFlags{47} = hasEVEX_L2;
- let TSFlags{48} = hasEVEX_B;
+ let TSFlags{41} = hasVEX_L;
+ let TSFlags{42} = hasEVEX_K;
+ let TSFlags{43} = hasEVEX_Z;
+ let TSFlags{44} = hasEVEX_L2;
+ let TSFlags{45} = hasEVEX_B;
// If we run out of TSFlags bits, it's possible to encode this in 3 bits.
- let TSFlags{55-49} = CD8_Scale;
- let TSFlags{56} = has3DNow0F0FOpcode;
- let TSFlags{57} = hasMemOp4Prefix;
- let TSFlags{58} = hasEVEX_RC;
+ let TSFlags{52-46} = CD8_Scale;
+ let TSFlags{53} = has3DNow0F0FOpcode;
+ let TSFlags{54} = hasEVEX_RC;
}
class PseudoI<dag oops, dag iops, list<dag> pattern>
@@ -351,6 +349,13 @@ class Ii8 <bits<8> o, Format f, dag outs, dag ins, string asm,
let Pattern = pattern;
let CodeSize = 3;
}
+class Ii8Reg<bits<8> o, Format f, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary,
+ Domain d = GenericDomain>
+ : X86Inst<o, f, Imm8Reg, outs, ins, asm, itin, d> {
+ let Pattern = pattern;
+ let CodeSize = 3;
+}
class Ii8PCRel<bits<8> o, Format f, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = NoItinerary>
: X86Inst<o, f, Imm8PCRel, outs, ins, asm, itin> {
@@ -785,7 +790,6 @@ class AVX512AIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
: Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TAPD,
Requires<[HasAVX512]>;
class AVX512AIi8Base : TAPD {
- Domain ExeDomain = SSEPackedInt;
ImmType ImmT = Imm8;
}
class AVX512Ii8<bits<8> o, Format F, dag outs, dag ins, string asm,
@@ -850,8 +854,8 @@ class FMA3<bits<8> o, Format F, dag outs, dag ins, string asm,
// FMA4 Instruction Templates
class FMA4<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag>pattern, InstrItinClass itin = NoItinerary>
- : Ii8<o, F, outs, ins, asm, pattern, itin>, TAPD,
- VEX_4V, VEX_I8IMM, FMASC, Requires<[HasFMA4]>;
+ : Ii8Reg<o, F, outs, ins, asm, pattern, itin>, TAPD,
+ VEX_4V, FMASC, Requires<[HasFMA4]>;
// XOP 2, 3 and 4 Operand Instruction Template
class IXOP<bits<8> o, Format F, dag outs, dag ins, string asm,
@@ -859,17 +863,22 @@ class IXOP<bits<8> o, Format F, dag outs, dag ins, string asm,
: I<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>,
XOP9, Requires<[HasXOP]>;
-// XOP 2, 3 and 4 Operand Instruction Templates with imm byte
+// XOP 2 and 3 Operand Instruction Templates with imm byte
class IXOPi8<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = NoItinerary>
: Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>,
XOP8, Requires<[HasXOP]>;
+// XOP 4 Operand Instruction Templates with imm byte
+class IXOPi8Reg<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : Ii8Reg<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>,
+ XOP8, Requires<[HasXOP]>;
// XOP 5 operand instruction (VEX encoding!)
class IXOP5<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag>pattern, InstrItinClass itin = NoItinerary>
- : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TAPD,
- VEX_4V, VEX_I8IMM, Requires<[HasXOP]>;
+ : Ii8Reg<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TAPD,
+ VEX_4V, Requires<[HasXOP]>;
// X86-64 Instruction templates...
//
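
Two details in the X86InstrFormats.td hunk are easy to miss. First, the Format renumbering appears to group ModRM memory forms into values 32-47 and ModRM register forms into 48-63, so consumers of TSFlags can use a range check instead of enumerating formats. Second, the new Imm8Reg immediate type marks FMA4/XOP instructions whose extra source register travels in bits [7:4] of the immediate byte, replacing the removed VEX_I8IMM/MemOp4 bits. The helpers below are hypothetical illustrations of both points, not code from this patch.

#include <cstdint>

// Hypothetical: with the new numbering, MRMDestMem (32) .. MRM7m (47) are all
// memory ModRM forms, and MRMDestReg (48) .. MRM7r (63) are register forms.
static bool isMemModRMFormat(unsigned Format) {
  return Format >= 32 && Format <= 47;
}

// Hypothetical: for an Imm8Reg instruction the extra source register is
// encoded in the upper nibble of the immediate byte ("Register encoded in
// [7:4]"), so an encoder would pack it like this.
static uint8_t packImm8Reg(unsigned RegEncoding, uint8_t LowBits) {
  return static_cast<uint8_t>((RegEncoding & 0xF) << 4) | (LowBits & 0xF);
}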
diff --git a/contrib/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/contrib/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
index ea54f04..c5689d7 100644
--- a/contrib/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/contrib/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -29,7 +29,6 @@ def MMX_X86movw2d : SDNode<"X86ISD::MMX_MOVW2D", SDTypeProfile<1, 1,
def load_mmx : PatFrag<(ops node:$ptr), (x86mmx (load node:$ptr))>;
def load_mvmmx : PatFrag<(ops node:$ptr),
(x86mmx (MMX_X86movw2d (load node:$ptr)))>;
-def bc_mmx : PatFrag<(ops node:$in), (x86mmx (bitconvert node:$in))>;
//===----------------------------------------------------------------------===//
// SSE specific DAG Nodes.
@@ -56,8 +55,7 @@ def X86for : SDNode<"X86ISD::FOR", SDTFPBinOp,
[SDNPCommutative, SDNPAssociative]>;
def X86fxor : SDNode<"X86ISD::FXOR", SDTFPBinOp,
[SDNPCommutative, SDNPAssociative]>;
-def X86fandn : SDNode<"X86ISD::FANDN", SDTFPBinOp,
- [SDNPCommutative, SDNPAssociative]>;
+def X86fandn : SDNode<"X86ISD::FANDN", SDTFPBinOp>;
def X86frsqrt : SDNode<"X86ISD::FRSQRT", SDTFPUnaryOp>;
def X86frcp : SDNode<"X86ISD::FRCP", SDTFPUnaryOp>;
def X86frsqrt14s: SDNode<"X86ISD::FRSQRTS", SDTFPBinOp>;
@@ -67,16 +65,8 @@ def X86fhsub : SDNode<"X86ISD::FHSUB", SDTFPBinOp>;
def X86hadd : SDNode<"X86ISD::HADD", SDTIntBinOp>;
def X86hsub : SDNode<"X86ISD::HSUB", SDTIntBinOp>;
def X86comi : SDNode<"X86ISD::COMI", SDTX86CmpTest>;
-def X86comiSae : SDNode<"X86ISD::COMI", SDTX86CmpTestSae>;
def X86ucomi : SDNode<"X86ISD::UCOMI", SDTX86CmpTest>;
-def X86ucomiSae: SDNode<"X86ISD::UCOMI", SDTX86CmpTestSae>;
def X86cmps : SDNode<"X86ISD::FSETCC", SDTX86Cmps>;
-def X86cvtdq2pd: SDNode<"X86ISD::CVTDQ2PD",
- SDTypeProfile<1, 1, [SDTCisVT<0, v2f64>,
- SDTCisVT<1, v4i32>]>>;
-def X86cvtudq2pd: SDNode<"X86ISD::CVTUDQ2PD",
- SDTypeProfile<1, 1, [SDTCisVT<0, v2f64>,
- SDTCisVT<1, v4i32>]>>;
def X86pshufb : SDNode<"X86ISD::PSHUFB",
SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i8>, SDTCisSameAs<0,1>,
SDTCisSameAs<0,2>]>>;
@@ -84,7 +74,7 @@ def X86psadbw : SDNode<"X86ISD::PSADBW",
SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i64>,
SDTCVecEltisVT<1, i8>,
SDTCisSameSizeAs<0,1>,
- SDTCisSameAs<1,2>]>>;
+ SDTCisSameAs<1,2>]>, [SDNPCommutative]>;
def X86dbpsadbw : SDNode<"X86ISD::DBPSADBW",
SDTypeProfile<1, 3, [SDTCVecEltisVT<0, i16>,
SDTCVecEltisVT<1, i8>,
@@ -144,25 +134,14 @@ def X86vfpround: SDNode<"X86ISD::VFPROUND",
SDTCVecEltisVT<1, f64>,
SDTCisSameSizeAs<0, 1>]>>;
-def X86fround: SDNode<"X86ISD::VFPROUND",
- SDTypeProfile<1, 2, [SDTCVecEltisVT<0, f32>,
- SDTCisSameAs<0, 1>,
- SDTCVecEltisVT<2, f64>,
- SDTCisSameSizeAs<0, 2>]>>;
-def X86froundRnd: SDNode<"X86ISD::VFPROUND",
+def X86froundRnd: SDNode<"X86ISD::VFPROUNDS_RND",
SDTypeProfile<1, 3, [SDTCVecEltisVT<0, f32>,
SDTCisSameAs<0, 1>,
SDTCVecEltisVT<2, f64>,
SDTCisSameSizeAs<0, 2>,
SDTCisVT<3, i32>]>>;
-def X86fpext : SDNode<"X86ISD::VFPEXT",
- SDTypeProfile<1, 2, [SDTCVecEltisVT<0, f64>,
- SDTCisSameAs<0, 1>,
- SDTCVecEltisVT<2, f32>,
- SDTCisSameSizeAs<0, 2>]>>;
-
-def X86fpextRnd : SDNode<"X86ISD::VFPEXT",
+def X86fpextRnd : SDNode<"X86ISD::VFPEXTS_RND",
SDTypeProfile<1, 3, [SDTCVecEltisVT<0, f64>,
SDTCisSameAs<0, 1>,
SDTCVecEltisVT<2, f32>,
@@ -176,7 +155,8 @@ def X86pcmpeq : SDNode<"X86ISD::PCMPEQ", SDTIntBinOp, [SDNPCommutative]>;
def X86pcmpgt : SDNode<"X86ISD::PCMPGT", SDTIntBinOp>;
def X86IntCmpMask : SDTypeProfile<1, 2,
- [SDTCisVec<0>, SDTCisSameAs<1, 2>, SDTCisInt<1>]>;
+ [SDTCisVec<0>, SDTCVecEltisVT<0, i1>, SDTCisSameAs<1, 2>, SDTCisInt<1>,
+ SDTCisSameNumEltsAs<0, 1>]>;
def X86pcmpeqm : SDNode<"X86ISD::PCMPEQM", X86IntCmpMask, [SDNPCommutative]>;
def X86pcmpgtm : SDNode<"X86ISD::PCMPGTM", X86IntCmpMask>;
@@ -188,19 +168,19 @@ def X86CmpMaskCCRound :
SDTypeProfile<1, 4, [SDTCisVec<0>,SDTCVecEltisVT<0, i1>,
SDTCisVec<1>, SDTCisSameAs<2, 1>,
SDTCisSameNumEltsAs<0, 1>, SDTCisVT<3, i8>,
- SDTCisInt<4>]>;
+ SDTCisVT<4, i32>]>;
def X86CmpMaskCCScalar :
SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<1, 2>, SDTCisVT<3, i8>]>;
def X86CmpMaskCCScalarRound :
SDTypeProfile<1, 4, [SDTCisInt<0>, SDTCisSameAs<1, 2>, SDTCisVT<3, i8>,
- SDTCisInt<4>]>;
+ SDTCisVT<4, i32>]>;
def X86cmpm : SDNode<"X86ISD::CMPM", X86CmpMaskCC>;
def X86cmpmRnd : SDNode<"X86ISD::CMPM_RND", X86CmpMaskCCRound>;
def X86cmpmu : SDNode<"X86ISD::CMPMU", X86CmpMaskCC>;
-def X86cmpms : SDNode<"X86ISD::FSETCC", X86CmpMaskCCScalar>;
-def X86cmpmsRnd : SDNode<"X86ISD::FSETCC", X86CmpMaskCCScalarRound>;
+def X86cmpms : SDNode<"X86ISD::FSETCCM", X86CmpMaskCCScalar>;
+def X86cmpmsRnd : SDNode<"X86ISD::FSETCCM_RND", X86CmpMaskCCScalarRound>;
def X86vshl : SDNode<"X86ISD::VSHL",
SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
@@ -212,7 +192,9 @@ def X86vsra : SDNode<"X86ISD::VSRA",
SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
SDTCisVec<2>]>>;
-def X86vsrav : SDNode<"X86ISD::VSRAV" , SDTIntShiftOp>;
+def X86vsrav : SDNode<"X86ISD::VSRAV" ,
+ SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+ SDTCisSameAs<0,2>]>>;
def X86vshli : SDNode<"X86ISD::VSHLI", SDTIntShiftOp>;
def X86vsrli : SDNode<"X86ISD::VSRLI", SDTIntShiftOp>;
@@ -261,12 +243,12 @@ def SDTX86Testm : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>,
SDTCisSameAs<2, 1>, SDTCVecEltisVT<0, i1>,
SDTCisSameNumEltsAs<0, 1>]>;
-def X86addus : SDNode<"X86ISD::ADDUS", SDTIntBinOp>;
+def X86addus : SDNode<"X86ISD::ADDUS", SDTIntBinOp, [SDNPCommutative]>;
def X86subus : SDNode<"X86ISD::SUBUS", SDTIntBinOp>;
-def X86adds : SDNode<"X86ISD::ADDS", SDTIntBinOp>;
+def X86adds : SDNode<"X86ISD::ADDS", SDTIntBinOp, [SDNPCommutative]>;
def X86subs : SDNode<"X86ISD::SUBS", SDTIntBinOp>;
-def X86mulhrs : SDNode<"X86ISD::MULHRS" , SDTIntBinOp>;
-def X86avg : SDNode<"X86ISD::AVG" , SDTIntBinOp>;
+def X86mulhrs : SDNode<"X86ISD::MULHRS", SDTIntBinOp, [SDNPCommutative]>;
+def X86avg : SDNode<"X86ISD::AVG" , SDTIntBinOp, [SDNPCommutative]>;
def X86ptest : SDNode<"X86ISD::PTEST", SDTX86CmpPTest>;
def X86testp : SDNode<"X86ISD::TESTP", SDTX86CmpPTest>;
def X86kortest : SDNode<"X86ISD::KORTEST", SDTX86CmpPTest>;
@@ -283,7 +265,7 @@ def X86select : SDNode<"X86ISD::SELECT",
SDTCisSameAs<2, 3>,
SDTCisSameNumEltsAs<0, 1>]>>;
-def X86selects : SDNode<"X86ISD::SELECT",
+def X86selects : SDNode<"X86ISD::SELECTS",
SDTypeProfile<1, 3, [SDTCisVT<1, i1>,
SDTCisSameAs<0, 2>,
SDTCisSameAs<2, 3>]>>;
@@ -292,12 +274,14 @@ def X86pmuludq : SDNode<"X86ISD::PMULUDQ",
SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i64>,
SDTCVecEltisVT<1, i32>,
SDTCisSameSizeAs<0,1>,
- SDTCisSameAs<1,2>]>>;
+ SDTCisSameAs<1,2>]>,
+ [SDNPCommutative]>;
def X86pmuldq : SDNode<"X86ISD::PMULDQ",
SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i64>,
SDTCVecEltisVT<1, i32>,
SDTCisSameSizeAs<0,1>,
- SDTCisSameAs<1,2>]>>;
+ SDTCisSameAs<1,2>]>,
+ [SDNPCommutative]>;
def X86extrqi : SDNode<"X86ISD::EXTRQI",
SDTypeProfile<1, 3, [SDTCisVT<0, v2i64>, SDTCisSameAs<0,1>,
@@ -393,7 +377,7 @@ def X86Unpckl : SDNode<"X86ISD::UNPCKL", SDTShuff2Op>;
def X86Unpckh : SDNode<"X86ISD::UNPCKH", SDTShuff2Op>;
def X86vpmaddubsw : SDNode<"X86ISD::VPMADDUBSW" , SDTPack>;
-def X86vpmaddwd : SDNode<"X86ISD::VPMADDWD" , SDTPack>;
+def X86vpmaddwd : SDNode<"X86ISD::VPMADDWD" , SDTPack, [SDNPCommutative]>;
def X86VPermilpv : SDNode<"X86ISD::VPERMILPV", SDTShuff2OpM>;
def X86VPermilpi : SDNode<"X86ISD::VPERMILPI", SDTShuff2OpI>;
@@ -410,10 +394,12 @@ def X86VPermt2 : SDNode<"X86ISD::VPERMV3",
SDTCisSameSizeAs<0,2>,
SDTCisSameAs<0,3>]>, []>;
+// Even though the index operand should be an integer, we need to make it match
+// the destination type so that we can pattern match the masked version where
+// the index is also the passthru operand.
def X86VPermi2X : SDNode<"X86ISD::VPERMIV3",
- SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisInt<1>,
- SDTCisVec<1>, SDTCisSameNumEltsAs<0, 1>,
- SDTCisSameSizeAs<0,1>,
+ SDTypeProfile<1, 3, [SDTCisVec<0>,
+ SDTCisSameAs<0,1>,
SDTCisSameAs<0,2>,
SDTCisSameAs<0,3>]>, []>;
@@ -462,9 +448,9 @@ def X86scalef : SDNode<"X86ISD::SCALEF", SDTFPBinOpRound>;
def X86scalefs : SDNode<"X86ISD::SCALEFS", SDTFPBinOpRound>;
def X86fminRnd : SDNode<"X86ISD::FMIN_RND", SDTFPBinOpRound>;
def X86fsqrtRnd : SDNode<"X86ISD::FSQRT_RND", SDTFPUnaryOpRound>;
-def X86fsqrtRnds : SDNode<"X86ISD::FSQRT_RND", SDTFPBinOpRound>;
+def X86fsqrtRnds : SDNode<"X86ISD::FSQRTS_RND", SDTFPBinOpRound>;
def X86fgetexpRnd : SDNode<"X86ISD::FGETEXP_RND", SDTFPUnaryOpRound>;
-def X86fgetexpRnds : SDNode<"X86ISD::FGETEXP_RND", SDTFPBinOpRound>;
+def X86fgetexpRnds : SDNode<"X86ISD::FGETEXPS_RND", SDTFPBinOpRound>;
def X86Fmadd : SDNode<"X86ISD::FMADD", SDTFma>;
def X86Fnmadd : SDNode<"X86ISD::FNMADD", SDTFma>;
@@ -480,6 +466,18 @@ def X86FnmsubRnd : SDNode<"X86ISD::FNMSUB_RND", SDTFmaRound>;
def X86FmaddsubRnd : SDNode<"X86ISD::FMADDSUB_RND", SDTFmaRound>;
def X86FmsubaddRnd : SDNode<"X86ISD::FMSUBADD_RND", SDTFmaRound>;
+// Scalar FMA intrinsics with passthru bits in operand 1.
+def X86FmaddRnds1 : SDNode<"X86ISD::FMADDS1_RND", SDTFmaRound>;
+def X86FnmaddRnds1 : SDNode<"X86ISD::FNMADDS1_RND", SDTFmaRound>;
+def X86FmsubRnds1 : SDNode<"X86ISD::FMSUBS1_RND", SDTFmaRound>;
+def X86FnmsubRnds1 : SDNode<"X86ISD::FNMSUBS1_RND", SDTFmaRound>;
+
+// Scalar FMA intrinsics with passthru bits in operand 3.
+def X86FmaddRnds3 : SDNode<"X86ISD::FMADDS3_RND", SDTFmaRound>;
+def X86FnmaddRnds3 : SDNode<"X86ISD::FNMADDS3_RND", SDTFmaRound>;
+def X86FmsubRnds3 : SDNode<"X86ISD::FMSUBS3_RND", SDTFmaRound>;
+def X86FnmsubRnds3 : SDNode<"X86ISD::FNMSUBS3_RND", SDTFmaRound>;
+
def x86vpmadd52l : SDNode<"X86ISD::VPMADD52L", SDTFma>;
def x86vpmadd52h : SDNode<"X86ISD::VPMADD52H", SDTFma>;
@@ -487,11 +485,11 @@ def X86rsqrt28 : SDNode<"X86ISD::RSQRT28", SDTFPUnaryOpRound>;
def X86rcp28 : SDNode<"X86ISD::RCP28", SDTFPUnaryOpRound>;
def X86exp2 : SDNode<"X86ISD::EXP2", SDTFPUnaryOpRound>;
-def X86rsqrt28s : SDNode<"X86ISD::RSQRT28", SDTFPBinOpRound>;
-def X86rcp28s : SDNode<"X86ISD::RCP28", SDTFPBinOpRound>;
-def X86RndScales : SDNode<"X86ISD::VRNDSCALE", SDTFPBinOpImmRound>;
-def X86Reduces : SDNode<"X86ISD::VREDUCE", SDTFPBinOpImmRound>;
-def X86GetMants : SDNode<"X86ISD::VGETMANT", SDTFPBinOpImmRound>;
+def X86rsqrt28s : SDNode<"X86ISD::RSQRT28S", SDTFPBinOpRound>;
+def X86rcp28s : SDNode<"X86ISD::RCP28S", SDTFPBinOpRound>;
+def X86RndScales : SDNode<"X86ISD::VRNDSCALES", SDTFPBinOpImmRound>;
+def X86Reduces : SDNode<"X86ISD::VREDUCES", SDTFPBinOpImmRound>;
+def X86GetMants : SDNode<"X86ISD::VGETMANTS", SDTFPBinOpImmRound>;
def SDT_PCMPISTRI : SDTypeProfile<2, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>,
SDTCisVT<2, v16i8>, SDTCisVT<3, v16i8>,
@@ -515,59 +513,69 @@ def SDTintToFPRound: SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisFP<0>,
def SDTFloatToInt: SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>,
SDTCisInt<0>, SDTCisFP<1>]>;
-
def SDTFloatToIntRnd: SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>,
SDTCisInt<0>, SDTCisFP<1>,
SDTCisVT<2, i32>]>;
def SDTSFloatToIntRnd: SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisFP<1>,
SDTCisVec<1>, SDTCisVT<2, i32>]>;
+
+def SDTVintToFP: SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>,
+ SDTCisFP<0>, SDTCisInt<1>]>;
def SDTVintToFPRound: SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>,
SDTCisFP<0>, SDTCisInt<1>,
SDTCisVT<2, i32>]>;
// Scalar
-def X86SintToFpRnd : SDNode<"X86ISD::SINT_TO_FP_RND", SDTintToFPRound>;
-def X86UintToFpRnd : SDNode<"X86ISD::UINT_TO_FP_RND", SDTintToFPRound>;
+def X86SintToFpRnd : SDNode<"X86ISD::SCALAR_SINT_TO_FP_RND", SDTintToFPRound>;
+def X86UintToFpRnd : SDNode<"X86ISD::SCALAR_UINT_TO_FP_RND", SDTintToFPRound>;
-def X86cvtts2IntRnd : SDNode<"X86ISD::FP_TO_SINT_RND", SDTSFloatToIntRnd>;
-def X86cvtts2UIntRnd : SDNode<"X86ISD::FP_TO_UINT_RND", SDTSFloatToIntRnd>;
+def X86cvtts2IntRnd : SDNode<"X86ISD::CVTTS2SI_RND", SDTSFloatToIntRnd>;
+def X86cvtts2UIntRnd : SDNode<"X86ISD::CVTTS2UI_RND", SDTSFloatToIntRnd>;
-def X86cvts2si : SDNode<"X86ISD::SCALAR_FP_TO_SINT_RND", SDTSFloatToIntRnd>;
-def X86cvts2usi : SDNode<"X86ISD::SCALAR_FP_TO_UINT_RND", SDTSFloatToIntRnd>;
+def X86cvts2si : SDNode<"X86ISD::CVTS2SI_RND", SDTSFloatToIntRnd>;
+def X86cvts2usi : SDNode<"X86ISD::CVTS2UI_RND", SDTSFloatToIntRnd>;
// Vector with rounding mode
// cvtt fp-to-int staff
-def X86VFpToSintRnd : SDNode<"ISD::FP_TO_SINT", SDTFloatToIntRnd>;
-def X86VFpToUintRnd : SDNode<"ISD::FP_TO_UINT", SDTFloatToIntRnd>;
+def X86cvttp2siRnd : SDNode<"X86ISD::CVTTP2SI_RND", SDTFloatToIntRnd>;
+def X86cvttp2uiRnd : SDNode<"X86ISD::CVTTP2UI_RND", SDTFloatToIntRnd>;
-def X86VSintToFpRnd : SDNode<"ISD::SINT_TO_FP", SDTVintToFPRound>;
-def X86VUintToFpRnd : SDNode<"ISD::UINT_TO_FP", SDTVintToFPRound>;
+def X86VSintToFpRnd : SDNode<"X86ISD::SINT_TO_FP_RND", SDTVintToFPRound>;
+def X86VUintToFpRnd : SDNode<"X86ISD::UINT_TO_FP_RND", SDTVintToFPRound>;
// cvt fp-to-int staff
-def X86cvtp2IntRnd : SDNode<"X86ISD::FP_TO_SINT_RND", SDTFloatToIntRnd>;
-def X86cvtp2UIntRnd : SDNode<"X86ISD::FP_TO_UINT_RND", SDTFloatToIntRnd>;
+def X86cvtp2IntRnd : SDNode<"X86ISD::CVTP2SI_RND", SDTFloatToIntRnd>;
+def X86cvtp2UIntRnd : SDNode<"X86ISD::CVTP2UI_RND", SDTFloatToIntRnd>;
// Vector without rounding mode
-def X86cvtp2Int : SDNode<"X86ISD::FP_TO_SINT_RND", SDTFloatToInt>;
-def X86cvtp2UInt : SDNode<"X86ISD::FP_TO_UINT_RND", SDTFloatToInt>;
-def X86cvtph2ps : SDNode<"ISD::FP16_TO_FP",
+// cvtt fp-to-int stuff
+def X86cvttp2si : SDNode<"X86ISD::CVTTP2SI", SDTFloatToInt>;
+def X86cvttp2ui : SDNode<"X86ISD::CVTTP2UI", SDTFloatToInt>;
+
+def X86VSintToFP : SDNode<"X86ISD::CVTSI2P", SDTVintToFP>;
+def X86VUintToFP : SDNode<"X86ISD::CVTUI2P", SDTVintToFP>;
+
+// cvt int-to-fp stuff
+def X86cvtp2Int : SDNode<"X86ISD::CVTP2SI", SDTFloatToInt>;
+def X86cvtp2UInt : SDNode<"X86ISD::CVTP2UI", SDTFloatToInt>;
+
+def X86cvtph2ps : SDNode<"X86ISD::CVTPH2PS",
SDTypeProfile<1, 2, [SDTCVecEltisVT<0, f32>,
SDTCVecEltisVT<1, i16>,
SDTCisVT<2, i32>]> >;
-def X86cvtps2ph : SDNode<"ISD::FP_TO_FP16",
- SDTypeProfile<1, 3, [SDTCVecEltisVT<0, i16>,
+def X86cvtps2ph : SDNode<"X86ISD::CVTPS2PH",
+ SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i16>,
SDTCVecEltisVT<1, f32>,
- SDTCisVT<2, i32>,
- SDTCisVT<3, i32>]> >;
-def X86vfpextRnd : SDNode<"X86ISD::VFPEXT",
+ SDTCisVT<2, i32>]> >;
+def X86vfpextRnd : SDNode<"X86ISD::VFPEXT_RND",
SDTypeProfile<1, 2, [SDTCVecEltisVT<0, f64>,
SDTCVecEltisVT<1, f32>,
SDTCisOpSmallerThanOp<1, 0>,
SDTCisVT<2, i32>]>>;
-def X86vfproundRnd: SDNode<"X86ISD::VFPROUND",
+def X86vfproundRnd: SDNode<"X86ISD::VFPROUND_RND",
SDTypeProfile<1, 2, [SDTCVecEltisVT<0, f32>,
SDTCVecEltisVT<1, f64>,
SDTCisOpSmallerThanOp<0, 1>,
@@ -621,9 +629,6 @@ def loadv4i64 : PatFrag<(ops node:$ptr), (v4i64 (load node:$ptr))>;
// 512-bit load pattern fragments
def loadv16f32 : PatFrag<(ops node:$ptr), (v16f32 (load node:$ptr))>;
def loadv8f64 : PatFrag<(ops node:$ptr), (v8f64 (load node:$ptr))>;
-def loadv64i8 : PatFrag<(ops node:$ptr), (v64i8 (load node:$ptr))>;
-def loadv32i16 : PatFrag<(ops node:$ptr), (v32i16 (load node:$ptr))>;
-def loadv16i32 : PatFrag<(ops node:$ptr), (v16i32 (load node:$ptr))>;
def loadv8i64 : PatFrag<(ops node:$ptr), (v8i64 (load node:$ptr))>;
// 128-/256-/512-bit extload pattern fragments
@@ -631,15 +636,6 @@ def extloadv2f32 : PatFrag<(ops node:$ptr), (v2f64 (extloadvf32 node:$ptr))>;
def extloadv4f32 : PatFrag<(ops node:$ptr), (v4f64 (extloadvf32 node:$ptr))>;
def extloadv8f32 : PatFrag<(ops node:$ptr), (v8f64 (extloadvf32 node:$ptr))>;
-// These are needed to match a scalar load that is used in a vector-only
-// math instruction such as the FP logical ops: andps, andnps, orps, xorps.
-// The memory operand is required to be a 128-bit load, so it must be converted
-// from a vector to a scalar.
-def loadf32_128 : PatFrag<(ops node:$ptr),
- (f32 (extractelt (loadv4f32 node:$ptr), (iPTR 0)))>;
-def loadf64_128 : PatFrag<(ops node:$ptr),
- (f64 (extractelt (loadv2f64 node:$ptr), (iPTR 0)))>;
-
// Like 'store', but always requires 128-bit vector alignment.
def alignedstore : PatFrag<(ops node:$val, node:$ptr),
(store node:$val, node:$ptr), [{
@@ -673,11 +669,6 @@ def alignedload512 : PatFrag<(ops node:$ptr), (load node:$ptr), [{
return cast<LoadSDNode>(N)->getAlignment() >= 64;
}]>;
-def alignedloadfsf32 : PatFrag<(ops node:$ptr),
- (f32 (alignedload node:$ptr))>;
-def alignedloadfsf64 : PatFrag<(ops node:$ptr),
- (f64 (alignedload node:$ptr))>;
-
// 128-bit aligned load pattern fragments
// NOTE: all 128-bit integer vector loads are promoted to v2i64
def alignedloadv4f32 : PatFrag<(ops node:$ptr),
@@ -699,8 +690,6 @@ def alignedloadv4i64 : PatFrag<(ops node:$ptr),
// 512-bit aligned load pattern fragments
def alignedloadv16f32 : PatFrag<(ops node:$ptr),
(v16f32 (alignedload512 node:$ptr))>;
-def alignedloadv16i32 : PatFrag<(ops node:$ptr),
- (v16i32 (alignedload512 node:$ptr))>;
def alignedloadv8f64 : PatFrag<(ops node:$ptr),
(v8f64 (alignedload512 node:$ptr))>;
def alignedloadv8i64 : PatFrag<(ops node:$ptr),
@@ -717,9 +706,6 @@ def memop : PatFrag<(ops node:$ptr), (load node:$ptr), [{
|| cast<LoadSDNode>(N)->getAlignment() >= 16;
}]>;
-def memopfsf32 : PatFrag<(ops node:$ptr), (f32 (memop node:$ptr))>;
-def memopfsf64 : PatFrag<(ops node:$ptr), (f64 (memop node:$ptr))>;
-
// 128-bit memop pattern fragments
// NOTE: all 128-bit integer vector loads are promoted to v2i64
def memopv4f32 : PatFrag<(ops node:$ptr), (v4f32 (memop node:$ptr))>;
@@ -853,6 +839,7 @@ def bc_v4i64 : PatFrag<(ops node:$in), (v4i64 (bitconvert node:$in))>;
def bc_v8f32 : PatFrag<(ops node:$in), (v8f32 (bitconvert node:$in))>;
// 512-bit bitconvert pattern fragments
+def bc_v64i8 : PatFrag<(ops node:$in), (v64i8 (bitconvert node:$in))>;
def bc_v16i32 : PatFrag<(ops node:$in), (v16i32 (bitconvert node:$in))>;
def bc_v8i64 : PatFrag<(ops node:$in), (v8i64 (bitconvert node:$in))>;
def bc_v8f64 : PatFrag<(ops node:$in), (v8f64 (bitconvert node:$in))>;
@@ -873,6 +860,10 @@ def fp32imm0 : PatLeaf<(f32 fpimm), [{
return N->isExactlyValue(+0.0);
}]>;
+def fp64imm0 : PatLeaf<(f64 fpimm), [{
+ return N->isExactlyValue(+0.0);
+}]>;
+
def I8Imm : SDNodeXForm<imm, [{
// Transformation function: get the low 8 bits.
return getI8Imm((uint8_t)N->getZExtValue(), SDLoc(N));
@@ -940,30 +931,36 @@ def vinsert256_insert : PatFrag<(ops node:$bigvec, node:$smallvec,
return X86::isVINSERT256Index(N);
}], INSERT_get_vinsert256_imm>;
-def masked_load_aligned128 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+def X86mload : PatFrag<(ops node:$src1, node:$src2, node:$src3),
(masked_load node:$src1, node:$src2, node:$src3), [{
- if (auto *Load = dyn_cast<MaskedLoadSDNode>(N))
- return Load->getAlignment() >= 16;
- return false;
+ return !cast<MaskedLoadSDNode>(N)->isExpandingLoad() &&
+ cast<MaskedLoadSDNode>(N)->getExtensionType() == ISD::NON_EXTLOAD;
+}]>;
+
+def masked_load_aligned128 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+ (X86mload node:$src1, node:$src2, node:$src3), [{
+ return cast<MaskedLoadSDNode>(N)->getAlignment() >= 16;
}]>;
def masked_load_aligned256 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
- (masked_load node:$src1, node:$src2, node:$src3), [{
- if (auto *Load = dyn_cast<MaskedLoadSDNode>(N))
- return Load->getAlignment() >= 32;
- return false;
+ (X86mload node:$src1, node:$src2, node:$src3), [{
+ return cast<MaskedLoadSDNode>(N)->getAlignment() >= 32;
}]>;
def masked_load_aligned512 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
- (masked_load node:$src1, node:$src2, node:$src3), [{
- if (auto *Load = dyn_cast<MaskedLoadSDNode>(N))
- return Load->getAlignment() >= 64;
- return false;
+ (X86mload node:$src1, node:$src2, node:$src3), [{
+ return cast<MaskedLoadSDNode>(N)->getAlignment() >= 64;
}]>;
def masked_load_unaligned : PatFrag<(ops node:$src1, node:$src2, node:$src3),
(masked_load node:$src1, node:$src2, node:$src3), [{
- return isa<MaskedLoadSDNode>(N);
+ return !cast<MaskedLoadSDNode>(N)->isExpandingLoad() &&
+ cast<MaskedLoadSDNode>(N)->getExtensionType() == ISD::NON_EXTLOAD;
+}]>;
+
+def X86mExpandingLoad : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+ (masked_load node:$src1, node:$src2, node:$src3), [{
+ return cast<MaskedLoadSDNode>(N)->isExpandingLoad();
}]>;
// Masked store fragments.
@@ -971,33 +968,34 @@ def masked_load_unaligned : PatFrag<(ops node:$src1, node:$src2, node:$src3),
// do not support vector types (llvm-tblgen will fail).
def X86mstore : PatFrag<(ops node:$src1, node:$src2, node:$src3),
(masked_store node:$src1, node:$src2, node:$src3), [{
- return !cast<MaskedStoreSDNode>(N)->isTruncatingStore();
+ return (!cast<MaskedStoreSDNode>(N)->isTruncatingStore()) &&
+ (!cast<MaskedStoreSDNode>(N)->isCompressingStore());
}]>;
def masked_store_aligned128 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
(X86mstore node:$src1, node:$src2, node:$src3), [{
- if (auto *Store = dyn_cast<MaskedStoreSDNode>(N))
- return Store->getAlignment() >= 16;
- return false;
+ return cast<MaskedStoreSDNode>(N)->getAlignment() >= 16;
}]>;
def masked_store_aligned256 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
(X86mstore node:$src1, node:$src2, node:$src3), [{
- if (auto *Store = dyn_cast<MaskedStoreSDNode>(N))
- return Store->getAlignment() >= 32;
- return false;
+ return cast<MaskedStoreSDNode>(N)->getAlignment() >= 32;
}]>;
def masked_store_aligned512 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
(X86mstore node:$src1, node:$src2, node:$src3), [{
- if (auto *Store = dyn_cast<MaskedStoreSDNode>(N))
- return Store->getAlignment() >= 64;
- return false;
+ return cast<MaskedStoreSDNode>(N)->getAlignment() >= 64;
}]>;
def masked_store_unaligned : PatFrag<(ops node:$src1, node:$src2, node:$src3),
- (X86mstore node:$src1, node:$src2, node:$src3), [{
- return isa<MaskedStoreSDNode>(N);
+ (masked_store node:$src1, node:$src2, node:$src3), [{
+ return (!cast<MaskedStoreSDNode>(N)->isTruncatingStore()) &&
+ (!cast<MaskedStoreSDNode>(N)->isCompressingStore());
+}]>;
+
+def X86mCompressingStore : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+ (masked_store node:$src1, node:$src2, node:$src3), [{
+ return cast<MaskedStoreSDNode>(N)->isCompressingStore();
}]>;
// masked truncstore fragments
@@ -1022,3 +1020,80 @@ def masked_truncstorevi32 :
(X86mtruncstore node:$src1, node:$src2, node:$src3), [{
return cast<MaskedStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i32;
}]>;
+
+def X86TruncSStore : SDNode<"X86ISD::VTRUNCSTORES", SDTStore,
+ [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+
+def X86TruncUSStore : SDNode<"X86ISD::VTRUNCSTOREUS", SDTStore,
+ [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+
+def X86MTruncSStore : SDNode<"X86ISD::VMTRUNCSTORES", SDTMaskedStore,
+ [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+
+def X86MTruncUSStore : SDNode<"X86ISD::VMTRUNCSTOREUS", SDTMaskedStore,
+ [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+
+def truncstore_s_vi8 : PatFrag<(ops node:$val, node:$ptr),
+ (X86TruncSStore node:$val, node:$ptr), [{
+ return cast<TruncSStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i8;
+}]>;
+
+def truncstore_us_vi8 : PatFrag<(ops node:$val, node:$ptr),
+ (X86TruncUSStore node:$val, node:$ptr), [{
+ return cast<TruncUSStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i8;
+}]>;
+
+def truncstore_s_vi16 : PatFrag<(ops node:$val, node:$ptr),
+ (X86TruncSStore node:$val, node:$ptr), [{
+ return cast<TruncSStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i16;
+}]>;
+
+def truncstore_us_vi16 : PatFrag<(ops node:$val, node:$ptr),
+ (X86TruncUSStore node:$val, node:$ptr), [{
+ return cast<TruncUSStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i16;
+}]>;
+
+def truncstore_s_vi32 : PatFrag<(ops node:$val, node:$ptr),
+ (X86TruncSStore node:$val, node:$ptr), [{
+ return cast<TruncSStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i32;
+}]>;
+
+def truncstore_us_vi32 : PatFrag<(ops node:$val, node:$ptr),
+ (X86TruncUSStore node:$val, node:$ptr), [{
+ return cast<TruncUSStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i32;
+}]>;
+
+def masked_truncstore_s_vi8 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+ (X86MTruncSStore node:$src1, node:$src2, node:$src3), [{
+ return cast<MaskedTruncSStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i8;
+}]>;
+
+def masked_truncstore_us_vi8 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+ (X86MTruncUSStore node:$src1, node:$src2, node:$src3), [{
+ return cast<MaskedTruncUSStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i8;
+}]>;
+
+def masked_truncstore_s_vi16 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+ (X86MTruncSStore node:$src1, node:$src2, node:$src3), [{
+ return cast<MaskedTruncSStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i16;
+}]>;
+
+def masked_truncstore_us_vi16 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+ (X86MTruncUSStore node:$src1, node:$src2, node:$src3), [{
+ return cast<MaskedTruncUSStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i16;
+}]>;
+
+def masked_truncstore_s_vi32 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+ (X86MTruncSStore node:$src1, node:$src2, node:$src3), [{
+ return cast<MaskedTruncSStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i32;
+}]>;
+
+def masked_truncstore_us_vi32 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+ (X86MTruncUSStore node:$src1, node:$src2, node:$src3), [{
+ return cast<MaskedTruncUSStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i32;
+}]>;
+
+def assertzext_i1 :
+ PatFrag<(ops node:$src), (assertzext node:$src), [{
+ return cast<VTSDNode>(N->getOperand(1))->getVT() == MVT::i1;
+}]>;
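
Note (not part of the patch): the [{ ... }] bodies in the PatFrag definitions above are C++ predicates run against the candidate SDNode during instruction selection. A minimal standalone sketch of the kind of check they perform, using only the generic MaskedStoreSDNode API that the patch itself calls; the free function and its name are illustrative, not LLVM API:

    #include "llvm/CodeGen/SelectionDAGNodes.h"
    using namespace llvm;

    // Same shape as the masked_truncstore_* predicates above: look at the
    // memory VT recorded on the masked-store node and match on its scalar
    // element type (MVT::i8 / i16 / i32 in the fragments above).
    static bool maskedStoreHasScalarVT(const SDNode *N, MVT ScalarVT) {
      return cast<MaskedStoreSDNode>(N)->getMemoryVT().getScalarType() == ScalarVT;
    }

    // masked_store_unaligned additionally rejects truncating and compressing
    // stores via isTruncatingStore() / isCompressingStore(), exactly as in
    // the fragment bodies above.
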
diff --git a/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp b/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp
index 5f0aab9..627b612 100644
--- a/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -68,7 +68,7 @@ static cl::opt<unsigned>
UndefRegClearance("undef-reg-clearance",
cl::desc("How many idle instructions we would like before "
"certain undef register reads"),
- cl::init(64), cl::Hidden);
+ cl::init(128), cl::Hidden);
enum {
// Select which memory operand is being unfolded.
@@ -228,12 +228,16 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::SBB64ri32, X86::SBB64mi32, 0 },
{ X86::SBB64ri8, X86::SBB64mi8, 0 },
{ X86::SBB64rr, X86::SBB64mr, 0 },
+ { X86::SHL16r1, X86::SHL16m1, 0 },
{ X86::SHL16rCL, X86::SHL16mCL, 0 },
{ X86::SHL16ri, X86::SHL16mi, 0 },
+ { X86::SHL32r1, X86::SHL32m1, 0 },
{ X86::SHL32rCL, X86::SHL32mCL, 0 },
{ X86::SHL32ri, X86::SHL32mi, 0 },
+ { X86::SHL64r1, X86::SHL64m1, 0 },
{ X86::SHL64rCL, X86::SHL64mCL, 0 },
{ X86::SHL64ri, X86::SHL64mi, 0 },
+ { X86::SHL8r1, X86::SHL8m1, 0 },
{ X86::SHL8rCL, X86::SHL8mCL, 0 },
{ X86::SHL8ri, X86::SHL8mi, 0 },
{ X86::SHLD16rrCL, X86::SHLD16mrCL, 0 },
@@ -335,6 +339,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::MOVAPDrr, X86::MOVAPDmr, TB_FOLDED_STORE | TB_ALIGN_16 },
{ X86::MOVAPSrr, X86::MOVAPSmr, TB_FOLDED_STORE | TB_ALIGN_16 },
{ X86::MOVDQArr, X86::MOVDQAmr, TB_FOLDED_STORE | TB_ALIGN_16 },
+ { X86::MOVDQUrr, X86::MOVDQUmr, TB_FOLDED_STORE },
{ X86::MOVPDI2DIrr, X86::MOVPDI2DImr, TB_FOLDED_STORE },
{ X86::MOVPQIto64rr,X86::MOVPQI2QImr, TB_FOLDED_STORE },
{ X86::MOVSDto64rr, X86::MOVSDto64mr, TB_FOLDED_STORE },
@@ -380,6 +385,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VMOVAPDrr, X86::VMOVAPDmr, TB_FOLDED_STORE | TB_ALIGN_16 },
{ X86::VMOVAPSrr, X86::VMOVAPSmr, TB_FOLDED_STORE | TB_ALIGN_16 },
{ X86::VMOVDQArr, X86::VMOVDQAmr, TB_FOLDED_STORE | TB_ALIGN_16 },
+ { X86::VMOVDQUrr, X86::VMOVDQUmr, TB_FOLDED_STORE },
{ X86::VMOVPDI2DIrr,X86::VMOVPDI2DImr, TB_FOLDED_STORE },
{ X86::VMOVPQIto64rr, X86::VMOVPQI2QImr,TB_FOLDED_STORE },
{ X86::VMOVSDto64rr,X86::VMOVSDto64mr, TB_FOLDED_STORE },
@@ -394,10 +400,20 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VMOVAPDYrr, X86::VMOVAPDYmr, TB_FOLDED_STORE | TB_ALIGN_32 },
{ X86::VMOVAPSYrr, X86::VMOVAPSYmr, TB_FOLDED_STORE | TB_ALIGN_32 },
{ X86::VMOVDQAYrr, X86::VMOVDQAYmr, TB_FOLDED_STORE | TB_ALIGN_32 },
+ { X86::VMOVDQUYrr, X86::VMOVDQUYmr, TB_FOLDED_STORE },
{ X86::VMOVUPDYrr, X86::VMOVUPDYmr, TB_FOLDED_STORE },
{ X86::VMOVUPSYrr, X86::VMOVUPSYmr, TB_FOLDED_STORE },
// AVX-512 foldable instructions
+ { X86::VEXTRACTF32x4Zrr,X86::VEXTRACTF32x4Zmr, TB_FOLDED_STORE },
+ { X86::VEXTRACTF32x8Zrr,X86::VEXTRACTF32x8Zmr, TB_FOLDED_STORE },
+ { X86::VEXTRACTF64x2Zrr,X86::VEXTRACTF64x2Zmr, TB_FOLDED_STORE },
+ { X86::VEXTRACTF64x4Zrr,X86::VEXTRACTF64x4Zmr, TB_FOLDED_STORE },
+ { X86::VEXTRACTI32x4Zrr,X86::VEXTRACTI32x4Zmr, TB_FOLDED_STORE },
+ { X86::VEXTRACTI32x8Zrr,X86::VEXTRACTI32x8Zmr, TB_FOLDED_STORE },
+ { X86::VEXTRACTI64x2Zrr,X86::VEXTRACTI64x2Zmr, TB_FOLDED_STORE },
+ { X86::VEXTRACTI64x4Zrr,X86::VEXTRACTI64x4Zmr, TB_FOLDED_STORE },
+ { X86::VEXTRACTPSZrr, X86::VEXTRACTPSZmr, TB_FOLDED_STORE },
{ X86::VMOVPDI2DIZrr, X86::VMOVPDI2DIZmr, TB_FOLDED_STORE },
{ X86::VMOVAPDZrr, X86::VMOVAPDZmr, TB_FOLDED_STORE | TB_ALIGN_64 },
{ X86::VMOVAPSZrr, X86::VMOVAPSZmr, TB_FOLDED_STORE | TB_ALIGN_64 },
@@ -409,8 +425,27 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VMOVDQU16Zrr, X86::VMOVDQU16Zmr, TB_FOLDED_STORE },
{ X86::VMOVDQU32Zrr, X86::VMOVDQU32Zmr, TB_FOLDED_STORE },
{ X86::VMOVDQU64Zrr, X86::VMOVDQU64Zmr, TB_FOLDED_STORE },
+ { X86::VPMOVDBZrr, X86::VPMOVDBZmr, TB_FOLDED_STORE },
+ { X86::VPMOVDWZrr, X86::VPMOVDWZmr, TB_FOLDED_STORE },
+ { X86::VPMOVQDZrr, X86::VPMOVQDZmr, TB_FOLDED_STORE },
+ { X86::VPMOVQWZrr, X86::VPMOVQWZmr, TB_FOLDED_STORE },
+ { X86::VPMOVWBZrr, X86::VPMOVWBZmr, TB_FOLDED_STORE },
+ { X86::VPMOVSDBZrr, X86::VPMOVSDBZmr, TB_FOLDED_STORE },
+ { X86::VPMOVSDWZrr, X86::VPMOVSDWZmr, TB_FOLDED_STORE },
+ { X86::VPMOVSQDZrr, X86::VPMOVSQDZmr, TB_FOLDED_STORE },
+ { X86::VPMOVSQWZrr, X86::VPMOVSQWZmr, TB_FOLDED_STORE },
+ { X86::VPMOVSWBZrr, X86::VPMOVSWBZmr, TB_FOLDED_STORE },
+ { X86::VPMOVUSDBZrr, X86::VPMOVUSDBZmr, TB_FOLDED_STORE },
+ { X86::VPMOVUSDWZrr, X86::VPMOVUSDWZmr, TB_FOLDED_STORE },
+ { X86::VPMOVUSQDZrr, X86::VPMOVUSQDZmr, TB_FOLDED_STORE },
+ { X86::VPMOVUSQWZrr, X86::VPMOVUSQWZmr, TB_FOLDED_STORE },
+ { X86::VPMOVUSWBZrr, X86::VPMOVUSWBZmr, TB_FOLDED_STORE },
// AVX-512 foldable instructions (256-bit versions)
+ { X86::VEXTRACTF32x4Z256rr,X86::VEXTRACTF32x4Z256mr, TB_FOLDED_STORE },
+ { X86::VEXTRACTF64x2Z256rr,X86::VEXTRACTF64x2Z256mr, TB_FOLDED_STORE },
+ { X86::VEXTRACTI32x4Z256rr,X86::VEXTRACTI32x4Z256mr, TB_FOLDED_STORE },
+ { X86::VEXTRACTI64x2Z256rr,X86::VEXTRACTI64x2Z256mr, TB_FOLDED_STORE },
{ X86::VMOVAPDZ256rr, X86::VMOVAPDZ256mr, TB_FOLDED_STORE | TB_ALIGN_32 },
{ X86::VMOVAPSZ256rr, X86::VMOVAPSZ256mr, TB_FOLDED_STORE | TB_ALIGN_32 },
{ X86::VMOVDQA32Z256rr, X86::VMOVDQA32Z256mr, TB_FOLDED_STORE | TB_ALIGN_32 },
@@ -421,6 +456,15 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VMOVDQU16Z256rr, X86::VMOVDQU16Z256mr, TB_FOLDED_STORE },
{ X86::VMOVDQU32Z256rr, X86::VMOVDQU32Z256mr, TB_FOLDED_STORE },
{ X86::VMOVDQU64Z256rr, X86::VMOVDQU64Z256mr, TB_FOLDED_STORE },
+ { X86::VPMOVDWZ256rr, X86::VPMOVDWZ256mr, TB_FOLDED_STORE },
+ { X86::VPMOVQDZ256rr, X86::VPMOVQDZ256mr, TB_FOLDED_STORE },
+ { X86::VPMOVWBZ256rr, X86::VPMOVWBZ256mr, TB_FOLDED_STORE },
+ { X86::VPMOVSDWZ256rr, X86::VPMOVSDWZ256mr, TB_FOLDED_STORE },
+ { X86::VPMOVSQDZ256rr, X86::VPMOVSQDZ256mr, TB_FOLDED_STORE },
+ { X86::VPMOVSWBZ256rr, X86::VPMOVSWBZ256mr, TB_FOLDED_STORE },
+ { X86::VPMOVUSDWZ256rr, X86::VPMOVUSDWZ256mr, TB_FOLDED_STORE },
+ { X86::VPMOVUSQDZ256rr, X86::VPMOVUSQDZ256mr, TB_FOLDED_STORE },
+ { X86::VPMOVUSWBZ256rr, X86::VPMOVUSWBZ256mr, TB_FOLDED_STORE },
// AVX-512 foldable instructions (128-bit versions)
{ X86::VMOVAPDZ128rr, X86::VMOVAPDZ128mr, TB_FOLDED_STORE | TB_ALIGN_16 },
@@ -471,26 +515,26 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::IMUL32rri8, X86::IMUL32rmi8, 0 },
{ X86::IMUL64rri32, X86::IMUL64rmi32, 0 },
{ X86::IMUL64rri8, X86::IMUL64rmi8, 0 },
- { X86::Int_COMISDrr, X86::Int_COMISDrm, 0 },
- { X86::Int_COMISSrr, X86::Int_COMISSrm, 0 },
- { X86::CVTSD2SI64rr, X86::CVTSD2SI64rm, 0 },
- { X86::CVTSD2SIrr, X86::CVTSD2SIrm, 0 },
- { X86::CVTSS2SI64rr, X86::CVTSS2SI64rm, 0 },
- { X86::CVTSS2SIrr, X86::CVTSS2SIrm, 0 },
- { X86::CVTDQ2PDrr, X86::CVTDQ2PDrm, TB_ALIGN_16 },
+ { X86::Int_COMISDrr, X86::Int_COMISDrm, TB_NO_REVERSE },
+ { X86::Int_COMISSrr, X86::Int_COMISSrm, TB_NO_REVERSE },
+ { X86::CVTSD2SI64rr, X86::CVTSD2SI64rm, TB_NO_REVERSE },
+ { X86::CVTSD2SIrr, X86::CVTSD2SIrm, TB_NO_REVERSE },
+ { X86::CVTSS2SI64rr, X86::CVTSS2SI64rm, TB_NO_REVERSE },
+ { X86::CVTSS2SIrr, X86::CVTSS2SIrm, TB_NO_REVERSE },
+ { X86::CVTDQ2PDrr, X86::CVTDQ2PDrm, TB_NO_REVERSE },
{ X86::CVTDQ2PSrr, X86::CVTDQ2PSrm, TB_ALIGN_16 },
{ X86::CVTPD2DQrr, X86::CVTPD2DQrm, TB_ALIGN_16 },
{ X86::CVTPD2PSrr, X86::CVTPD2PSrm, TB_ALIGN_16 },
{ X86::CVTPS2DQrr, X86::CVTPS2DQrm, TB_ALIGN_16 },
- { X86::CVTPS2PDrr, X86::CVTPS2PDrm, TB_ALIGN_16 },
+ { X86::CVTPS2PDrr, X86::CVTPS2PDrm, TB_NO_REVERSE },
{ X86::CVTTPD2DQrr, X86::CVTTPD2DQrm, TB_ALIGN_16 },
{ X86::CVTTPS2DQrr, X86::CVTTPS2DQrm, TB_ALIGN_16 },
- { X86::Int_CVTTSD2SI64rr,X86::Int_CVTTSD2SI64rm, 0 },
- { X86::Int_CVTTSD2SIrr, X86::Int_CVTTSD2SIrm, 0 },
- { X86::Int_CVTTSS2SI64rr,X86::Int_CVTTSS2SI64rm, 0 },
- { X86::Int_CVTTSS2SIrr, X86::Int_CVTTSS2SIrm, 0 },
- { X86::Int_UCOMISDrr, X86::Int_UCOMISDrm, 0 },
- { X86::Int_UCOMISSrr, X86::Int_UCOMISSrm, 0 },
+ { X86::Int_CVTTSD2SI64rr,X86::Int_CVTTSD2SI64rm, TB_NO_REVERSE },
+ { X86::Int_CVTTSD2SIrr, X86::Int_CVTTSD2SIrm, TB_NO_REVERSE },
+ { X86::Int_CVTTSS2SI64rr,X86::Int_CVTTSS2SI64rm, TB_NO_REVERSE },
+ { X86::Int_CVTTSS2SIrr, X86::Int_CVTTSS2SIrm, TB_NO_REVERSE },
+ { X86::Int_UCOMISDrr, X86::Int_UCOMISDrm, TB_NO_REVERSE },
+ { X86::Int_UCOMISSrr, X86::Int_UCOMISSrm, TB_NO_REVERSE },
{ X86::MOV16rr, X86::MOV16rm, 0 },
{ X86::MOV32rr, X86::MOV32rm, 0 },
{ X86::MOV64rr, X86::MOV64rm, 0 },
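
Note (not part of the patch): most of this file's changes flip the third column of the folding tables from 0 or TB_ALIGN_16 to TB_NO_REVERSE. As a rough sketch of what that column controls, assuming the TB_* flag enum declared near the top of X86InstrInfo.cpp and its shifted byte-count encoding for TB_ALIGN_*; the entry struct and helper names below are illustrative only:

    struct FoldTableEntry { unsigned RegOp, MemOp, Flags; };  // illustrative

    // TB_NO_REVERSE keeps the MemOp -> RegOp (unfold) direction out of the
    // reverse table. The entries being flipped above are folds whose memory
    // operand is narrower than the full register form (scalar *_Int ops,
    // PMOVSX/PMOVZX, broadcasts), so unfolding them would widen the load.
    static bool canUnfold(const FoldTableEntry &E) {
      return (E.Flags & TB_NO_REVERSE) == 0;
    }

    // TB_ALIGN_16/32/64 record the minimum alignment the folded memory
    // operand must have; TB_FOLDED_STORE marks store-direction-only folds.
    static unsigned requiredAlignment(const FoldTableEntry &E) {
      return (E.Flags & TB_ALIGN_MASK) >> TB_ALIGN_SHIFT;
    }
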
@@ -499,10 +543,11 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::MOV8rr, X86::MOV8rm, 0 },
{ X86::MOVAPDrr, X86::MOVAPDrm, TB_ALIGN_16 },
{ X86::MOVAPSrr, X86::MOVAPSrm, TB_ALIGN_16 },
- { X86::MOVDDUPrr, X86::MOVDDUPrm, 0 },
+ { X86::MOVDDUPrr, X86::MOVDDUPrm, TB_NO_REVERSE },
{ X86::MOVDI2PDIrr, X86::MOVDI2PDIrm, 0 },
{ X86::MOVDI2SSrr, X86::MOVDI2SSrm, 0 },
{ X86::MOVDQArr, X86::MOVDQArm, TB_ALIGN_16 },
+ { X86::MOVDQUrr, X86::MOVDQUrm, 0 },
{ X86::MOVSHDUPrr, X86::MOVSHDUPrm, TB_ALIGN_16 },
{ X86::MOVSLDUPrr, X86::MOVSLDUPrm, TB_ALIGN_16 },
{ X86::MOVSX16rr8, X86::MOVSX16rm8, 0 },
@@ -511,51 +556,53 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::MOVSX64rr16, X86::MOVSX64rm16, 0 },
{ X86::MOVSX64rr32, X86::MOVSX64rm32, 0 },
{ X86::MOVSX64rr8, X86::MOVSX64rm8, 0 },
- { X86::MOVUPDrr, X86::MOVUPDrm, TB_ALIGN_16 },
+ { X86::MOVUPDrr, X86::MOVUPDrm, 0 },
{ X86::MOVUPSrr, X86::MOVUPSrm, 0 },
- { X86::MOVZPQILo2PQIrr, X86::MOVZPQILo2PQIrm, TB_ALIGN_16 },
+ { X86::MOVZPQILo2PQIrr, X86::MOVQI2PQIrm, TB_NO_REVERSE },
{ X86::MOVZX16rr8, X86::MOVZX16rm8, 0 },
{ X86::MOVZX32rr16, X86::MOVZX32rm16, 0 },
{ X86::MOVZX32_NOREXrr8, X86::MOVZX32_NOREXrm8, 0 },
{ X86::MOVZX32rr8, X86::MOVZX32rm8, 0 },
- { X86::PABSBrr128, X86::PABSBrm128, TB_ALIGN_16 },
- { X86::PABSDrr128, X86::PABSDrm128, TB_ALIGN_16 },
- { X86::PABSWrr128, X86::PABSWrm128, TB_ALIGN_16 },
+ { X86::PABSBrr, X86::PABSBrm, TB_ALIGN_16 },
+ { X86::PABSDrr, X86::PABSDrm, TB_ALIGN_16 },
+ { X86::PABSWrr, X86::PABSWrm, TB_ALIGN_16 },
{ X86::PCMPESTRIrr, X86::PCMPESTRIrm, TB_ALIGN_16 },
{ X86::PCMPESTRM128rr, X86::PCMPESTRM128rm, TB_ALIGN_16 },
{ X86::PCMPISTRIrr, X86::PCMPISTRIrm, TB_ALIGN_16 },
{ X86::PCMPISTRM128rr, X86::PCMPISTRM128rm, TB_ALIGN_16 },
{ X86::PHMINPOSUWrr128, X86::PHMINPOSUWrm128, TB_ALIGN_16 },
- { X86::PMOVSXBDrr, X86::PMOVSXBDrm, TB_ALIGN_16 },
- { X86::PMOVSXBQrr, X86::PMOVSXBQrm, TB_ALIGN_16 },
- { X86::PMOVSXBWrr, X86::PMOVSXBWrm, TB_ALIGN_16 },
- { X86::PMOVSXDQrr, X86::PMOVSXDQrm, TB_ALIGN_16 },
- { X86::PMOVSXWDrr, X86::PMOVSXWDrm, TB_ALIGN_16 },
- { X86::PMOVSXWQrr, X86::PMOVSXWQrm, TB_ALIGN_16 },
- { X86::PMOVZXBDrr, X86::PMOVZXBDrm, TB_ALIGN_16 },
- { X86::PMOVZXBQrr, X86::PMOVZXBQrm, TB_ALIGN_16 },
- { X86::PMOVZXBWrr, X86::PMOVZXBWrm, TB_ALIGN_16 },
- { X86::PMOVZXDQrr, X86::PMOVZXDQrm, TB_ALIGN_16 },
- { X86::PMOVZXWDrr, X86::PMOVZXWDrm, TB_ALIGN_16 },
- { X86::PMOVZXWQrr, X86::PMOVZXWQrm, TB_ALIGN_16 },
+ { X86::PMOVSXBDrr, X86::PMOVSXBDrm, TB_NO_REVERSE },
+ { X86::PMOVSXBQrr, X86::PMOVSXBQrm, TB_NO_REVERSE },
+ { X86::PMOVSXBWrr, X86::PMOVSXBWrm, TB_NO_REVERSE },
+ { X86::PMOVSXDQrr, X86::PMOVSXDQrm, TB_NO_REVERSE },
+ { X86::PMOVSXWDrr, X86::PMOVSXWDrm, TB_NO_REVERSE },
+ { X86::PMOVSXWQrr, X86::PMOVSXWQrm, TB_NO_REVERSE },
+ { X86::PMOVZXBDrr, X86::PMOVZXBDrm, TB_NO_REVERSE },
+ { X86::PMOVZXBQrr, X86::PMOVZXBQrm, TB_NO_REVERSE },
+ { X86::PMOVZXBWrr, X86::PMOVZXBWrm, TB_NO_REVERSE },
+ { X86::PMOVZXDQrr, X86::PMOVZXDQrm, TB_NO_REVERSE },
+ { X86::PMOVZXWDrr, X86::PMOVZXWDrm, TB_NO_REVERSE },
+ { X86::PMOVZXWQrr, X86::PMOVZXWQrm, TB_NO_REVERSE },
{ X86::PSHUFDri, X86::PSHUFDmi, TB_ALIGN_16 },
{ X86::PSHUFHWri, X86::PSHUFHWmi, TB_ALIGN_16 },
{ X86::PSHUFLWri, X86::PSHUFLWmi, TB_ALIGN_16 },
{ X86::PTESTrr, X86::PTESTrm, TB_ALIGN_16 },
{ X86::RCPPSr, X86::RCPPSm, TB_ALIGN_16 },
{ X86::RCPSSr, X86::RCPSSm, 0 },
- { X86::RCPSSr_Int, X86::RCPSSm_Int, 0 },
+ { X86::RCPSSr_Int, X86::RCPSSm_Int, TB_NO_REVERSE },
{ X86::ROUNDPDr, X86::ROUNDPDm, TB_ALIGN_16 },
{ X86::ROUNDPSr, X86::ROUNDPSm, TB_ALIGN_16 },
+ { X86::ROUNDSDr, X86::ROUNDSDm, 0 },
+ { X86::ROUNDSSr, X86::ROUNDSSm, 0 },
{ X86::RSQRTPSr, X86::RSQRTPSm, TB_ALIGN_16 },
{ X86::RSQRTSSr, X86::RSQRTSSm, 0 },
- { X86::RSQRTSSr_Int, X86::RSQRTSSm_Int, 0 },
+ { X86::RSQRTSSr_Int, X86::RSQRTSSm_Int, TB_NO_REVERSE },
{ X86::SQRTPDr, X86::SQRTPDm, TB_ALIGN_16 },
{ X86::SQRTPSr, X86::SQRTPSm, TB_ALIGN_16 },
{ X86::SQRTSDr, X86::SQRTSDm, 0 },
- { X86::SQRTSDr_Int, X86::SQRTSDm_Int, 0 },
+ { X86::SQRTSDr_Int, X86::SQRTSDm_Int, TB_NO_REVERSE },
{ X86::SQRTSSr, X86::SQRTSSm, 0 },
- { X86::SQRTSSr_Int, X86::SQRTSSm_Int, 0 },
+ { X86::SQRTSSr_Int, X86::SQRTSSm_Int, TB_NO_REVERSE },
{ X86::TEST16rr, X86::TEST16rm, 0 },
{ X86::TEST32rr, X86::TEST32rm, 0 },
{ X86::TEST64rr, X86::TEST64rm, 0 },
@@ -586,46 +633,47 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::PSWAPDrr, X86::PSWAPDrm, 0 },
// AVX 128-bit versions of foldable instructions
- { X86::Int_VCOMISDrr, X86::Int_VCOMISDrm, 0 },
- { X86::Int_VCOMISSrr, X86::Int_VCOMISSrm, 0 },
- { X86::Int_VUCOMISDrr, X86::Int_VUCOMISDrm, 0 },
- { X86::Int_VUCOMISSrr, X86::Int_VUCOMISSrm, 0 },
+ { X86::Int_VCOMISDrr, X86::Int_VCOMISDrm, TB_NO_REVERSE },
+ { X86::Int_VCOMISSrr, X86::Int_VCOMISSrm, TB_NO_REVERSE },
+ { X86::Int_VUCOMISDrr, X86::Int_VUCOMISDrm, TB_NO_REVERSE },
+ { X86::Int_VUCOMISSrr, X86::Int_VUCOMISSrm, TB_NO_REVERSE },
{ X86::VCVTTSD2SI64rr, X86::VCVTTSD2SI64rm, 0 },
- { X86::Int_VCVTTSD2SI64rr,X86::Int_VCVTTSD2SI64rm,0 },
+ { X86::Int_VCVTTSD2SI64rr,X86::Int_VCVTTSD2SI64rm,TB_NO_REVERSE },
{ X86::VCVTTSD2SIrr, X86::VCVTTSD2SIrm, 0 },
- { X86::Int_VCVTTSD2SIrr,X86::Int_VCVTTSD2SIrm, 0 },
+ { X86::Int_VCVTTSD2SIrr,X86::Int_VCVTTSD2SIrm, TB_NO_REVERSE },
{ X86::VCVTTSS2SI64rr, X86::VCVTTSS2SI64rm, 0 },
- { X86::Int_VCVTTSS2SI64rr,X86::Int_VCVTTSS2SI64rm,0 },
+ { X86::Int_VCVTTSS2SI64rr,X86::Int_VCVTTSS2SI64rm,TB_NO_REVERSE },
{ X86::VCVTTSS2SIrr, X86::VCVTTSS2SIrm, 0 },
- { X86::Int_VCVTTSS2SIrr,X86::Int_VCVTTSS2SIrm, 0 },
- { X86::VCVTSD2SI64rr, X86::VCVTSD2SI64rm, 0 },
- { X86::VCVTSD2SIrr, X86::VCVTSD2SIrm, 0 },
- { X86::VCVTSS2SI64rr, X86::VCVTSS2SI64rm, 0 },
- { X86::VCVTSS2SIrr, X86::VCVTSS2SIrm, 0 },
- { X86::VCVTDQ2PDrr, X86::VCVTDQ2PDrm, 0 },
+ { X86::Int_VCVTTSS2SIrr,X86::Int_VCVTTSS2SIrm, TB_NO_REVERSE },
+ { X86::VCVTSD2SI64rr, X86::VCVTSD2SI64rm, TB_NO_REVERSE },
+ { X86::VCVTSD2SIrr, X86::VCVTSD2SIrm, TB_NO_REVERSE },
+ { X86::VCVTSS2SI64rr, X86::VCVTSS2SI64rm, TB_NO_REVERSE },
+ { X86::VCVTSS2SIrr, X86::VCVTSS2SIrm, TB_NO_REVERSE },
+ { X86::VCVTDQ2PDrr, X86::VCVTDQ2PDrm, TB_NO_REVERSE },
{ X86::VCVTDQ2PSrr, X86::VCVTDQ2PSrm, 0 },
- { X86::VCVTPD2DQrr, X86::VCVTPD2DQXrm, 0 },
- { X86::VCVTPD2PSrr, X86::VCVTPD2PSXrm, 0 },
+ { X86::VCVTPD2DQrr, X86::VCVTPD2DQrm, 0 },
+ { X86::VCVTPD2PSrr, X86::VCVTPD2PSrm, 0 },
{ X86::VCVTPS2DQrr, X86::VCVTPS2DQrm, 0 },
- { X86::VCVTPS2PDrr, X86::VCVTPS2PDrm, 0 },
- { X86::VCVTTPD2DQrr, X86::VCVTTPD2DQXrm, 0 },
+ { X86::VCVTPS2PDrr, X86::VCVTPS2PDrm, TB_NO_REVERSE },
+ { X86::VCVTTPD2DQrr, X86::VCVTTPD2DQrm, 0 },
{ X86::VCVTTPS2DQrr, X86::VCVTTPS2DQrm, 0 },
{ X86::VMOV64toPQIrr, X86::VMOVQI2PQIrm, 0 },
{ X86::VMOV64toSDrr, X86::VMOV64toSDrm, 0 },
{ X86::VMOVAPDrr, X86::VMOVAPDrm, TB_ALIGN_16 },
{ X86::VMOVAPSrr, X86::VMOVAPSrm, TB_ALIGN_16 },
- { X86::VMOVDDUPrr, X86::VMOVDDUPrm, 0 },
+ { X86::VMOVDDUPrr, X86::VMOVDDUPrm, TB_NO_REVERSE },
{ X86::VMOVDI2PDIrr, X86::VMOVDI2PDIrm, 0 },
{ X86::VMOVDI2SSrr, X86::VMOVDI2SSrm, 0 },
{ X86::VMOVDQArr, X86::VMOVDQArm, TB_ALIGN_16 },
+ { X86::VMOVDQUrr, X86::VMOVDQUrm, 0 },
{ X86::VMOVSLDUPrr, X86::VMOVSLDUPrm, 0 },
{ X86::VMOVSHDUPrr, X86::VMOVSHDUPrm, 0 },
{ X86::VMOVUPDrr, X86::VMOVUPDrm, 0 },
{ X86::VMOVUPSrr, X86::VMOVUPSrm, 0 },
- { X86::VMOVZPQILo2PQIrr,X86::VMOVZPQILo2PQIrm, TB_ALIGN_16 },
- { X86::VPABSBrr128, X86::VPABSBrm128, 0 },
- { X86::VPABSDrr128, X86::VPABSDrm128, 0 },
- { X86::VPABSWrr128, X86::VPABSWrm128, 0 },
+ { X86::VMOVZPQILo2PQIrr,X86::VMOVQI2PQIrm, TB_NO_REVERSE },
+ { X86::VPABSBrr, X86::VPABSBrm, 0 },
+ { X86::VPABSDrr, X86::VPABSDrm, 0 },
+ { X86::VPABSWrr, X86::VPABSWrm, 0 },
{ X86::VPCMPESTRIrr, X86::VPCMPESTRIrm, 0 },
{ X86::VPCMPESTRM128rr, X86::VPCMPESTRM128rm, 0 },
{ X86::VPCMPISTRIrr, X86::VPCMPISTRIrm, 0 },
@@ -633,18 +681,18 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPHMINPOSUWrr128, X86::VPHMINPOSUWrm128, 0 },
{ X86::VPERMILPDri, X86::VPERMILPDmi, 0 },
{ X86::VPERMILPSri, X86::VPERMILPSmi, 0 },
- { X86::VPMOVSXBDrr, X86::VPMOVSXBDrm, 0 },
- { X86::VPMOVSXBQrr, X86::VPMOVSXBQrm, 0 },
- { X86::VPMOVSXBWrr, X86::VPMOVSXBWrm, 0 },
- { X86::VPMOVSXDQrr, X86::VPMOVSXDQrm, 0 },
- { X86::VPMOVSXWDrr, X86::VPMOVSXWDrm, 0 },
- { X86::VPMOVSXWQrr, X86::VPMOVSXWQrm, 0 },
- { X86::VPMOVZXBDrr, X86::VPMOVZXBDrm, 0 },
- { X86::VPMOVZXBQrr, X86::VPMOVZXBQrm, 0 },
- { X86::VPMOVZXBWrr, X86::VPMOVZXBWrm, 0 },
- { X86::VPMOVZXDQrr, X86::VPMOVZXDQrm, 0 },
- { X86::VPMOVZXWDrr, X86::VPMOVZXWDrm, 0 },
- { X86::VPMOVZXWQrr, X86::VPMOVZXWQrm, 0 },
+ { X86::VPMOVSXBDrr, X86::VPMOVSXBDrm, TB_NO_REVERSE },
+ { X86::VPMOVSXBQrr, X86::VPMOVSXBQrm, TB_NO_REVERSE },
+ { X86::VPMOVSXBWrr, X86::VPMOVSXBWrm, TB_NO_REVERSE },
+ { X86::VPMOVSXDQrr, X86::VPMOVSXDQrm, TB_NO_REVERSE },
+ { X86::VPMOVSXWDrr, X86::VPMOVSXWDrm, TB_NO_REVERSE },
+ { X86::VPMOVSXWQrr, X86::VPMOVSXWQrm, TB_NO_REVERSE },
+ { X86::VPMOVZXBDrr, X86::VPMOVZXBDrm, TB_NO_REVERSE },
+ { X86::VPMOVZXBQrr, X86::VPMOVZXBQrm, TB_NO_REVERSE },
+ { X86::VPMOVZXBWrr, X86::VPMOVZXBWrm, TB_NO_REVERSE },
+ { X86::VPMOVZXDQrr, X86::VPMOVZXDQrm, TB_NO_REVERSE },
+ { X86::VPMOVZXWDrr, X86::VPMOVZXWDrm, TB_NO_REVERSE },
+ { X86::VPMOVZXWQrr, X86::VPMOVZXWQrm, TB_NO_REVERSE },
{ X86::VPSHUFDri, X86::VPSHUFDmi, 0 },
{ X86::VPSHUFHWri, X86::VPSHUFHWmi, 0 },
{ X86::VPSHUFLWri, X86::VPSHUFLWmi, 0 },
@@ -661,18 +709,19 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VUCOMISSrr, X86::VUCOMISSrm, 0 },
// AVX 256-bit foldable instructions
- { X86::VCVTDQ2PDYrr, X86::VCVTDQ2PDYrm, 0 },
+ { X86::VCVTDQ2PDYrr, X86::VCVTDQ2PDYrm, TB_NO_REVERSE },
{ X86::VCVTDQ2PSYrr, X86::VCVTDQ2PSYrm, 0 },
{ X86::VCVTPD2DQYrr, X86::VCVTPD2DQYrm, 0 },
{ X86::VCVTPD2PSYrr, X86::VCVTPD2PSYrm, 0 },
{ X86::VCVTPS2DQYrr, X86::VCVTPS2DQYrm, 0 },
- { X86::VCVTPS2PDYrr, X86::VCVTPS2PDYrm, 0 },
+ { X86::VCVTPS2PDYrr, X86::VCVTPS2PDYrm, TB_NO_REVERSE },
{ X86::VCVTTPD2DQYrr, X86::VCVTTPD2DQYrm, 0 },
{ X86::VCVTTPS2DQYrr, X86::VCVTTPS2DQYrm, 0 },
{ X86::VMOVAPDYrr, X86::VMOVAPDYrm, TB_ALIGN_32 },
{ X86::VMOVAPSYrr, X86::VMOVAPSYrm, TB_ALIGN_32 },
{ X86::VMOVDDUPYrr, X86::VMOVDDUPYrm, 0 },
{ X86::VMOVDQAYrr, X86::VMOVDQAYrm, TB_ALIGN_32 },
+ { X86::VMOVDQUYrr, X86::VMOVDQUYrm, 0 },
{ X86::VMOVSLDUPYrr, X86::VMOVSLDUPYrm, 0 },
{ X86::VMOVSHDUPYrr, X86::VMOVSHDUPYrm, 0 },
{ X86::VMOVUPDYrr, X86::VMOVUPDYrm, 0 },
@@ -699,31 +748,31 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VBROADCASTSSrr, X86::VBROADCASTSSrm, TB_NO_REVERSE },
{ X86::VBROADCASTSSYrr, X86::VBROADCASTSSYrm, TB_NO_REVERSE },
{ X86::VBROADCASTSDYrr, X86::VBROADCASTSDYrm, TB_NO_REVERSE },
- { X86::VPABSBrr256, X86::VPABSBrm256, 0 },
- { X86::VPABSDrr256, X86::VPABSDrm256, 0 },
- { X86::VPABSWrr256, X86::VPABSWrm256, 0 },
- { X86::VPBROADCASTBrr, X86::VPBROADCASTBrm, 0 },
- { X86::VPBROADCASTBYrr, X86::VPBROADCASTBYrm, 0 },
- { X86::VPBROADCASTDrr, X86::VPBROADCASTDrm, 0 },
- { X86::VPBROADCASTDYrr, X86::VPBROADCASTDYrm, 0 },
- { X86::VPBROADCASTQrr, X86::VPBROADCASTQrm, 0 },
- { X86::VPBROADCASTQYrr, X86::VPBROADCASTQYrm, 0 },
- { X86::VPBROADCASTWrr, X86::VPBROADCASTWrm, 0 },
- { X86::VPBROADCASTWYrr, X86::VPBROADCASTWYrm, 0 },
+ { X86::VPABSBYrr, X86::VPABSBYrm, 0 },
+ { X86::VPABSDYrr, X86::VPABSDYrm, 0 },
+ { X86::VPABSWYrr, X86::VPABSWYrm, 0 },
+ { X86::VPBROADCASTBrr, X86::VPBROADCASTBrm, TB_NO_REVERSE },
+ { X86::VPBROADCASTBYrr, X86::VPBROADCASTBYrm, TB_NO_REVERSE },
+ { X86::VPBROADCASTDrr, X86::VPBROADCASTDrm, TB_NO_REVERSE },
+ { X86::VPBROADCASTDYrr, X86::VPBROADCASTDYrm, TB_NO_REVERSE },
+ { X86::VPBROADCASTQrr, X86::VPBROADCASTQrm, TB_NO_REVERSE },
+ { X86::VPBROADCASTQYrr, X86::VPBROADCASTQYrm, TB_NO_REVERSE },
+ { X86::VPBROADCASTWrr, X86::VPBROADCASTWrm, TB_NO_REVERSE },
+ { X86::VPBROADCASTWYrr, X86::VPBROADCASTWYrm, TB_NO_REVERSE },
{ X86::VPERMPDYri, X86::VPERMPDYmi, 0 },
{ X86::VPERMQYri, X86::VPERMQYmi, 0 },
- { X86::VPMOVSXBDYrr, X86::VPMOVSXBDYrm, 0 },
- { X86::VPMOVSXBQYrr, X86::VPMOVSXBQYrm, 0 },
+ { X86::VPMOVSXBDYrr, X86::VPMOVSXBDYrm, TB_NO_REVERSE },
+ { X86::VPMOVSXBQYrr, X86::VPMOVSXBQYrm, TB_NO_REVERSE },
{ X86::VPMOVSXBWYrr, X86::VPMOVSXBWYrm, 0 },
{ X86::VPMOVSXDQYrr, X86::VPMOVSXDQYrm, 0 },
{ X86::VPMOVSXWDYrr, X86::VPMOVSXWDYrm, 0 },
- { X86::VPMOVSXWQYrr, X86::VPMOVSXWQYrm, 0 },
- { X86::VPMOVZXBDYrr, X86::VPMOVZXBDYrm, 0 },
- { X86::VPMOVZXBQYrr, X86::VPMOVZXBQYrm, 0 },
+ { X86::VPMOVSXWQYrr, X86::VPMOVSXWQYrm, TB_NO_REVERSE },
+ { X86::VPMOVZXBDYrr, X86::VPMOVZXBDYrm, TB_NO_REVERSE },
+ { X86::VPMOVZXBQYrr, X86::VPMOVZXBQYrm, TB_NO_REVERSE },
{ X86::VPMOVZXBWYrr, X86::VPMOVZXBWYrm, 0 },
{ X86::VPMOVZXDQYrr, X86::VPMOVZXDQYrm, 0 },
{ X86::VPMOVZXWDYrr, X86::VPMOVZXWDYrm, 0 },
- { X86::VPMOVZXWQYrr, X86::VPMOVZXWQYrm, 0 },
+ { X86::VPMOVZXWQYrr, X86::VPMOVZXWQYrm, TB_NO_REVERSE },
{ X86::VPSHUFDYri, X86::VPSHUFDYmi, 0 },
{ X86::VPSHUFHWYri, X86::VPSHUFHWYmi, 0 },
{ X86::VPSHUFLWYri, X86::VPSHUFLWYmi, 0 },
@@ -817,7 +866,12 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::TZMSK64rr, X86::TZMSK64rm, 0 },
// AVX-512 foldable instructions
+ { X86::VBROADCASTSSZr, X86::VBROADCASTSSZm, TB_NO_REVERSE },
+ { X86::VBROADCASTSSZr_s, X86::VBROADCASTSSZm, TB_NO_REVERSE },
+ { X86::VBROADCASTSDZr, X86::VBROADCASTSDZm, TB_NO_REVERSE },
+ { X86::VBROADCASTSDZr_s, X86::VBROADCASTSDZm, TB_NO_REVERSE },
{ X86::VMOV64toPQIZrr, X86::VMOVQI2PQIZrm, 0 },
+ { X86::VMOVZPQILo2PQIZrr,X86::VMOVQI2PQIZrm, TB_NO_REVERSE },
{ X86::VMOVDI2SSZrr, X86::VMOVDI2SSZrm, 0 },
{ X86::VMOVAPDZrr, X86::VMOVAPDZrm, TB_ALIGN_64 },
{ X86::VMOVAPSZrr, X86::VMOVAPSZrm, TB_ALIGN_64 },
@@ -831,12 +885,31 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VMOVUPSZrr, X86::VMOVUPSZrm, 0 },
{ X86::VPABSDZrr, X86::VPABSDZrm, 0 },
{ X86::VPABSQZrr, X86::VPABSQZrm, 0 },
- { X86::VBROADCASTSSZr, X86::VBROADCASTSSZm, TB_NO_REVERSE },
- { X86::VBROADCASTSSZr_s, X86::VBROADCASTSSZm, TB_NO_REVERSE },
- { X86::VBROADCASTSDZr, X86::VBROADCASTSDZm, TB_NO_REVERSE },
- { X86::VBROADCASTSDZr_s, X86::VBROADCASTSDZm, TB_NO_REVERSE },
+ { X86::VPERMILPDZri, X86::VPERMILPDZmi, 0 },
+ { X86::VPERMILPSZri, X86::VPERMILPSZmi, 0 },
+ { X86::VPERMPDZri, X86::VPERMPDZmi, 0 },
+ { X86::VPERMQZri, X86::VPERMQZmi, 0 },
+ { X86::VPMOVSXBDZrr, X86::VPMOVSXBDZrm, 0 },
+ { X86::VPMOVSXBQZrr, X86::VPMOVSXBQZrm, TB_NO_REVERSE },
+ { X86::VPMOVSXBWZrr, X86::VPMOVSXBWZrm, 0 },
+ { X86::VPMOVSXDQZrr, X86::VPMOVSXDQZrm, 0 },
+ { X86::VPMOVSXWDZrr, X86::VPMOVSXWDZrm, 0 },
+ { X86::VPMOVSXWQZrr, X86::VPMOVSXWQZrm, 0 },
+ { X86::VPMOVZXBDZrr, X86::VPMOVZXBDZrm, 0 },
+ { X86::VPMOVZXBQZrr, X86::VPMOVZXBQZrm, TB_NO_REVERSE },
+ { X86::VPMOVZXBWZrr, X86::VPMOVZXBWZrm, 0 },
+ { X86::VPMOVZXDQZrr, X86::VPMOVZXDQZrm, 0 },
+ { X86::VPMOVZXWDZrr, X86::VPMOVZXWDZrm, 0 },
+ { X86::VPMOVZXWQZrr, X86::VPMOVZXWQZrm, 0 },
+ { X86::VPSHUFDZri, X86::VPSHUFDZmi, 0 },
+ { X86::VPSHUFHWZri, X86::VPSHUFHWZmi, 0 },
+ { X86::VPSHUFLWZri, X86::VPSHUFLWZmi, 0 },
// AVX-512 foldable instructions (256-bit versions)
+ { X86::VBROADCASTSSZ256r, X86::VBROADCASTSSZ256m, TB_NO_REVERSE },
+ { X86::VBROADCASTSSZ256r_s, X86::VBROADCASTSSZ256m, TB_NO_REVERSE },
+ { X86::VBROADCASTSDZ256r, X86::VBROADCASTSDZ256m, TB_NO_REVERSE },
+ { X86::VBROADCASTSDZ256r_s, X86::VBROADCASTSDZ256m, TB_NO_REVERSE },
{ X86::VMOVAPDZ256rr, X86::VMOVAPDZ256rm, TB_ALIGN_32 },
{ X86::VMOVAPSZ256rr, X86::VMOVAPSZ256rm, TB_ALIGN_32 },
{ X86::VMOVDQA32Z256rr, X86::VMOVDQA32Z256rm, TB_ALIGN_32 },
@@ -847,12 +920,29 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VMOVDQU64Z256rr, X86::VMOVDQU64Z256rm, 0 },
{ X86::VMOVUPDZ256rr, X86::VMOVUPDZ256rm, 0 },
{ X86::VMOVUPSZ256rr, X86::VMOVUPSZ256rm, 0 },
- { X86::VBROADCASTSSZ256r, X86::VBROADCASTSSZ256m, TB_NO_REVERSE },
- { X86::VBROADCASTSSZ256r_s, X86::VBROADCASTSSZ256m, TB_NO_REVERSE },
- { X86::VBROADCASTSDZ256r, X86::VBROADCASTSDZ256m, TB_NO_REVERSE },
- { X86::VBROADCASTSDZ256r_s, X86::VBROADCASTSDZ256m, TB_NO_REVERSE },
+ { X86::VPERMILPDZ256ri, X86::VPERMILPDZ256mi, 0 },
+ { X86::VPERMILPSZ256ri, X86::VPERMILPSZ256mi, 0 },
+ { X86::VPERMPDZ256ri, X86::VPERMPDZ256mi, 0 },
+ { X86::VPERMQZ256ri, X86::VPERMQZ256mi, 0 },
+ { X86::VPMOVSXBDZ256rr, X86::VPMOVSXBDZ256rm, TB_NO_REVERSE },
+ { X86::VPMOVSXBQZ256rr, X86::VPMOVSXBQZ256rm, TB_NO_REVERSE },
+ { X86::VPMOVSXBWZ256rr, X86::VPMOVSXBWZ256rm, 0 },
+ { X86::VPMOVSXDQZ256rr, X86::VPMOVSXDQZ256rm, 0 },
+ { X86::VPMOVSXWDZ256rr, X86::VPMOVSXWDZ256rm, 0 },
+ { X86::VPMOVSXWQZ256rr, X86::VPMOVSXWQZ256rm, TB_NO_REVERSE },
+ { X86::VPMOVZXBDZ256rr, X86::VPMOVZXBDZ256rm, TB_NO_REVERSE },
+ { X86::VPMOVZXBQZ256rr, X86::VPMOVZXBQZ256rm, TB_NO_REVERSE },
+ { X86::VPMOVZXBWZ256rr, X86::VPMOVZXBWZ256rm, 0 },
+ { X86::VPMOVZXDQZ256rr, X86::VPMOVZXDQZ256rm, 0 },
+ { X86::VPMOVZXWDZ256rr, X86::VPMOVZXWDZ256rm, 0 },
+ { X86::VPMOVZXWQZ256rr, X86::VPMOVZXWQZ256rm, TB_NO_REVERSE },
+ { X86::VPSHUFDZ256ri, X86::VPSHUFDZ256mi, 0 },
+ { X86::VPSHUFHWZ256ri, X86::VPSHUFHWZ256mi, 0 },
+ { X86::VPSHUFLWZ256ri, X86::VPSHUFLWZ256mi, 0 },
// AVX-512 foldable instructions (128-bit versions)
+ { X86::VBROADCASTSSZ128r, X86::VBROADCASTSSZ128m, TB_NO_REVERSE },
+ { X86::VBROADCASTSSZ128r_s, X86::VBROADCASTSSZ128m, TB_NO_REVERSE },
{ X86::VMOVAPDZ128rr, X86::VMOVAPDZ128rm, TB_ALIGN_16 },
{ X86::VMOVAPSZ128rr, X86::VMOVAPSZ128rm, TB_ALIGN_16 },
{ X86::VMOVDQA32Z128rr, X86::VMOVDQA32Z128rm, TB_ALIGN_16 },
@@ -863,8 +953,24 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VMOVDQU64Z128rr, X86::VMOVDQU64Z128rm, 0 },
{ X86::VMOVUPDZ128rr, X86::VMOVUPDZ128rm, 0 },
{ X86::VMOVUPSZ128rr, X86::VMOVUPSZ128rm, 0 },
- { X86::VBROADCASTSSZ128r, X86::VBROADCASTSSZ128m, TB_NO_REVERSE },
- { X86::VBROADCASTSSZ128r_s, X86::VBROADCASTSSZ128m, TB_NO_REVERSE },
+ { X86::VPERMILPDZ128ri, X86::VPERMILPDZ128mi, 0 },
+ { X86::VPERMILPSZ128ri, X86::VPERMILPSZ128mi, 0 },
+ { X86::VPMOVSXBDZ128rr, X86::VPMOVSXBDZ128rm, TB_NO_REVERSE },
+ { X86::VPMOVSXBQZ128rr, X86::VPMOVSXBQZ128rm, TB_NO_REVERSE },
+ { X86::VPMOVSXBWZ128rr, X86::VPMOVSXBWZ128rm, TB_NO_REVERSE },
+ { X86::VPMOVSXDQZ128rr, X86::VPMOVSXDQZ128rm, TB_NO_REVERSE },
+ { X86::VPMOVSXWDZ128rr, X86::VPMOVSXWDZ128rm, TB_NO_REVERSE },
+ { X86::VPMOVSXWQZ128rr, X86::VPMOVSXWQZ128rm, TB_NO_REVERSE },
+ { X86::VPMOVZXBDZ128rr, X86::VPMOVZXBDZ128rm, TB_NO_REVERSE },
+ { X86::VPMOVZXBQZ128rr, X86::VPMOVZXBQZ128rm, TB_NO_REVERSE },
+ { X86::VPMOVZXBWZ128rr, X86::VPMOVZXBWZ128rm, TB_NO_REVERSE },
+ { X86::VPMOVZXDQZ128rr, X86::VPMOVZXDQZ128rm, TB_NO_REVERSE },
+ { X86::VPMOVZXWDZ128rr, X86::VPMOVZXWDZ128rm, TB_NO_REVERSE },
+ { X86::VPMOVZXWQZ128rr, X86::VPMOVZXWQZ128rm, TB_NO_REVERSE },
+ { X86::VPSHUFDZ128ri, X86::VPSHUFDZ128mi, 0 },
+ { X86::VPSHUFHWZ128ri, X86::VPSHUFHWZ128mi, 0 },
+ { X86::VPSHUFLWZ128ri, X86::VPSHUFLWZ128mi, 0 },
+
// F16C foldable instructions
{ X86::VCVTPH2PSrr, X86::VCVTPH2PSrm, 0 },
{ X86::VCVTPH2PSYrr, X86::VCVTPH2PSYrm, 0 },
@@ -896,9 +1002,9 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::ADDPDrr, X86::ADDPDrm, TB_ALIGN_16 },
{ X86::ADDPSrr, X86::ADDPSrm, TB_ALIGN_16 },
{ X86::ADDSDrr, X86::ADDSDrm, 0 },
- { X86::ADDSDrr_Int, X86::ADDSDrm_Int, 0 },
+ { X86::ADDSDrr_Int, X86::ADDSDrm_Int, TB_NO_REVERSE },
{ X86::ADDSSrr, X86::ADDSSrm, 0 },
- { X86::ADDSSrr_Int, X86::ADDSSrm_Int, 0 },
+ { X86::ADDSSrr_Int, X86::ADDSSrm_Int, TB_NO_REVERSE },
{ X86::ADDSUBPDrr, X86::ADDSUBPDrm, TB_ALIGN_16 },
{ X86::ADDSUBPSrr, X86::ADDSUBPSrm, TB_ALIGN_16 },
{ X86::AND16rr, X86::AND16rm, 0 },
@@ -970,24 +1076,11 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::DIVPDrr, X86::DIVPDrm, TB_ALIGN_16 },
{ X86::DIVPSrr, X86::DIVPSrm, TB_ALIGN_16 },
{ X86::DIVSDrr, X86::DIVSDrm, 0 },
- { X86::DIVSDrr_Int, X86::DIVSDrm_Int, 0 },
+ { X86::DIVSDrr_Int, X86::DIVSDrm_Int, TB_NO_REVERSE },
{ X86::DIVSSrr, X86::DIVSSrm, 0 },
- { X86::DIVSSrr_Int, X86::DIVSSrm_Int, 0 },
+ { X86::DIVSSrr_Int, X86::DIVSSrm_Int, TB_NO_REVERSE },
{ X86::DPPDrri, X86::DPPDrmi, TB_ALIGN_16 },
{ X86::DPPSrri, X86::DPPSrmi, TB_ALIGN_16 },
-
- // Do not fold Fs* scalar logical op loads because there are no scalar
- // load variants for these instructions. When folded, the load is required
- // to be 128-bits, so the load size would not match.
-
- { X86::FvANDNPDrr, X86::FvANDNPDrm, TB_ALIGN_16 },
- { X86::FvANDNPSrr, X86::FvANDNPSrm, TB_ALIGN_16 },
- { X86::FvANDPDrr, X86::FvANDPDrm, TB_ALIGN_16 },
- { X86::FvANDPSrr, X86::FvANDPSrm, TB_ALIGN_16 },
- { X86::FvORPDrr, X86::FvORPDrm, TB_ALIGN_16 },
- { X86::FvORPSrr, X86::FvORPSrm, TB_ALIGN_16 },
- { X86::FvXORPDrr, X86::FvXORPDrm, TB_ALIGN_16 },
- { X86::FvXORPSrr, X86::FvXORPSrm, TB_ALIGN_16 },
{ X86::HADDPDrr, X86::HADDPDrm, TB_ALIGN_16 },
{ X86::HADDPSrr, X86::HADDPSrm, TB_ALIGN_16 },
{ X86::HSUBPDrr, X86::HSUBPDrm, TB_ALIGN_16 },
@@ -995,34 +1088,42 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::IMUL16rr, X86::IMUL16rm, 0 },
{ X86::IMUL32rr, X86::IMUL32rm, 0 },
{ X86::IMUL64rr, X86::IMUL64rm, 0 },
- { X86::Int_CMPSDrr, X86::Int_CMPSDrm, 0 },
- { X86::Int_CMPSSrr, X86::Int_CMPSSrm, 0 },
- { X86::Int_CVTSD2SSrr, X86::Int_CVTSD2SSrm, 0 },
+ { X86::Int_CMPSDrr, X86::Int_CMPSDrm, TB_NO_REVERSE },
+ { X86::Int_CMPSSrr, X86::Int_CMPSSrm, TB_NO_REVERSE },
+ { X86::Int_CVTSD2SSrr, X86::Int_CVTSD2SSrm, TB_NO_REVERSE },
{ X86::Int_CVTSI2SD64rr,X86::Int_CVTSI2SD64rm, 0 },
{ X86::Int_CVTSI2SDrr, X86::Int_CVTSI2SDrm, 0 },
{ X86::Int_CVTSI2SS64rr,X86::Int_CVTSI2SS64rm, 0 },
{ X86::Int_CVTSI2SSrr, X86::Int_CVTSI2SSrm, 0 },
- { X86::Int_CVTSS2SDrr, X86::Int_CVTSS2SDrm, 0 },
+ { X86::Int_CVTSS2SDrr, X86::Int_CVTSS2SDrm, TB_NO_REVERSE },
{ X86::MAXPDrr, X86::MAXPDrm, TB_ALIGN_16 },
+ { X86::MAXCPDrr, X86::MAXCPDrm, TB_ALIGN_16 },
{ X86::MAXPSrr, X86::MAXPSrm, TB_ALIGN_16 },
+ { X86::MAXCPSrr, X86::MAXCPSrm, TB_ALIGN_16 },
{ X86::MAXSDrr, X86::MAXSDrm, 0 },
- { X86::MAXSDrr_Int, X86::MAXSDrm_Int, 0 },
+ { X86::MAXCSDrr, X86::MAXCSDrm, 0 },
+ { X86::MAXSDrr_Int, X86::MAXSDrm_Int, TB_NO_REVERSE },
{ X86::MAXSSrr, X86::MAXSSrm, 0 },
- { X86::MAXSSrr_Int, X86::MAXSSrm_Int, 0 },
+ { X86::MAXCSSrr, X86::MAXCSSrm, 0 },
+ { X86::MAXSSrr_Int, X86::MAXSSrm_Int, TB_NO_REVERSE },
{ X86::MINPDrr, X86::MINPDrm, TB_ALIGN_16 },
+ { X86::MINCPDrr, X86::MINCPDrm, TB_ALIGN_16 },
{ X86::MINPSrr, X86::MINPSrm, TB_ALIGN_16 },
+ { X86::MINCPSrr, X86::MINCPSrm, TB_ALIGN_16 },
{ X86::MINSDrr, X86::MINSDrm, 0 },
- { X86::MINSDrr_Int, X86::MINSDrm_Int, 0 },
+ { X86::MINCSDrr, X86::MINCSDrm, 0 },
+ { X86::MINSDrr_Int, X86::MINSDrm_Int, TB_NO_REVERSE },
{ X86::MINSSrr, X86::MINSSrm, 0 },
- { X86::MINSSrr_Int, X86::MINSSrm_Int, 0 },
+ { X86::MINCSSrr, X86::MINCSSrm, 0 },
+ { X86::MINSSrr_Int, X86::MINSSrm_Int, TB_NO_REVERSE },
{ X86::MOVLHPSrr, X86::MOVHPSrm, TB_NO_REVERSE },
{ X86::MPSADBWrri, X86::MPSADBWrmi, TB_ALIGN_16 },
{ X86::MULPDrr, X86::MULPDrm, TB_ALIGN_16 },
{ X86::MULPSrr, X86::MULPSrm, TB_ALIGN_16 },
{ X86::MULSDrr, X86::MULSDrm, 0 },
- { X86::MULSDrr_Int, X86::MULSDrm_Int, 0 },
+ { X86::MULSDrr_Int, X86::MULSDrm_Int, TB_NO_REVERSE },
{ X86::MULSSrr, X86::MULSSrm, 0 },
- { X86::MULSSrr_Int, X86::MULSSrm_Int, 0 },
+ { X86::MULSSrr_Int, X86::MULSSrm_Int, TB_NO_REVERSE },
{ X86::OR16rr, X86::OR16rm, 0 },
{ X86::OR32rr, X86::OR32rm, 0 },
{ X86::OR64rr, X86::OR64rm, 0 },
@@ -1067,7 +1168,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::PINSRDrr, X86::PINSRDrm, 0 },
{ X86::PINSRQrr, X86::PINSRQrm, 0 },
{ X86::PINSRWrri, X86::PINSRWrmi, 0 },
- { X86::PMADDUBSWrr128, X86::PMADDUBSWrm128, TB_ALIGN_16 },
+ { X86::PMADDUBSWrr, X86::PMADDUBSWrm, TB_ALIGN_16 },
{ X86::PMADDWDrr, X86::PMADDWDrm, TB_ALIGN_16 },
{ X86::PMAXSWrr, X86::PMAXSWrm, TB_ALIGN_16 },
{ X86::PMAXUBrr, X86::PMAXUBrm, TB_ALIGN_16 },
@@ -1082,7 +1183,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::PMAXUDrr, X86::PMAXUDrm, TB_ALIGN_16 },
{ X86::PMAXUWrr, X86::PMAXUWrm, TB_ALIGN_16 },
{ X86::PMULDQrr, X86::PMULDQrm, TB_ALIGN_16 },
- { X86::PMULHRSWrr128, X86::PMULHRSWrm128, TB_ALIGN_16 },
+ { X86::PMULHRSWrr, X86::PMULHRSWrm, TB_ALIGN_16 },
{ X86::PMULHUWrr, X86::PMULHUWrm, TB_ALIGN_16 },
{ X86::PMULHWrr, X86::PMULHWrm, TB_ALIGN_16 },
{ X86::PMULLDrr, X86::PMULLDrm, TB_ALIGN_16 },
@@ -1119,8 +1220,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::PUNPCKLQDQrr, X86::PUNPCKLQDQrm, TB_ALIGN_16 },
{ X86::PUNPCKLWDrr, X86::PUNPCKLWDrm, TB_ALIGN_16 },
{ X86::PXORrr, X86::PXORrm, TB_ALIGN_16 },
- { X86::ROUNDSDr, X86::ROUNDSDm, 0 },
- { X86::ROUNDSSr, X86::ROUNDSSm, 0 },
+ { X86::ROUNDSDr_Int, X86::ROUNDSDm_Int, TB_NO_REVERSE },
+ { X86::ROUNDSSr_Int, X86::ROUNDSSm_Int, TB_NO_REVERSE },
{ X86::SBB32rr, X86::SBB32rm, 0 },
{ X86::SBB64rr, X86::SBB64rm, 0 },
{ X86::SHUFPDrri, X86::SHUFPDrmi, TB_ALIGN_16 },
@@ -1132,9 +1233,9 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::SUBPDrr, X86::SUBPDrm, TB_ALIGN_16 },
{ X86::SUBPSrr, X86::SUBPSrm, TB_ALIGN_16 },
{ X86::SUBSDrr, X86::SUBSDrm, 0 },
- { X86::SUBSDrr_Int, X86::SUBSDrm_Int, 0 },
+ { X86::SUBSDrr_Int, X86::SUBSDrm_Int, TB_NO_REVERSE },
{ X86::SUBSSrr, X86::SUBSSrm, 0 },
- { X86::SUBSSrr_Int, X86::SUBSSrm_Int, 0 },
+ { X86::SUBSSrr_Int, X86::SUBSSrm_Int, TB_NO_REVERSE },
// FIXME: TEST*rr -> swapped operand of TEST*mr.
{ X86::UNPCKHPDrr, X86::UNPCKHPDrm, TB_ALIGN_16 },
{ X86::UNPCKHPSrr, X86::UNPCKHPSrm, TB_ALIGN_16 },
@@ -1240,7 +1341,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
// AVX 128-bit versions of foldable instructions
{ X86::VCVTSD2SSrr, X86::VCVTSD2SSrm, 0 },
- { X86::Int_VCVTSD2SSrr, X86::Int_VCVTSD2SSrm, 0 },
+ { X86::Int_VCVTSD2SSrr, X86::Int_VCVTSD2SSrm, TB_NO_REVERSE },
{ X86::VCVTSI2SD64rr, X86::VCVTSI2SD64rm, 0 },
{ X86::Int_VCVTSI2SD64rr, X86::Int_VCVTSI2SD64rm, 0 },
{ X86::VCVTSI2SDrr, X86::VCVTSI2SDrm, 0 },
@@ -1250,21 +1351,13 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VCVTSI2SSrr, X86::VCVTSI2SSrm, 0 },
{ X86::Int_VCVTSI2SSrr, X86::Int_VCVTSI2SSrm, 0 },
{ X86::VCVTSS2SDrr, X86::VCVTSS2SDrm, 0 },
- { X86::Int_VCVTSS2SDrr, X86::Int_VCVTSS2SDrm, 0 },
- { X86::VRCPSSr, X86::VRCPSSm, 0 },
- { X86::VRCPSSr_Int, X86::VRCPSSm_Int, 0 },
- { X86::VRSQRTSSr, X86::VRSQRTSSm, 0 },
- { X86::VRSQRTSSr_Int, X86::VRSQRTSSm_Int, 0 },
- { X86::VSQRTSDr, X86::VSQRTSDm, 0 },
- { X86::VSQRTSDr_Int, X86::VSQRTSDm_Int, 0 },
- { X86::VSQRTSSr, X86::VSQRTSSm, 0 },
- { X86::VSQRTSSr_Int, X86::VSQRTSSm_Int, 0 },
+ { X86::Int_VCVTSS2SDrr, X86::Int_VCVTSS2SDrm, TB_NO_REVERSE },
{ X86::VADDPDrr, X86::VADDPDrm, 0 },
{ X86::VADDPSrr, X86::VADDPSrm, 0 },
{ X86::VADDSDrr, X86::VADDSDrm, 0 },
- { X86::VADDSDrr_Int, X86::VADDSDrm_Int, 0 },
+ { X86::VADDSDrr_Int, X86::VADDSDrm_Int, TB_NO_REVERSE },
{ X86::VADDSSrr, X86::VADDSSrm, 0 },
- { X86::VADDSSrr_Int, X86::VADDSSrm_Int, 0 },
+ { X86::VADDSSrr_Int, X86::VADDSSrm_Int, TB_NO_REVERSE },
{ X86::VADDSUBPDrr, X86::VADDSUBPDrm, 0 },
{ X86::VADDSUBPSrr, X86::VADDSUBPSrm, 0 },
{ X86::VANDNPDrr, X86::VANDNPDrm, 0 },
@@ -1282,48 +1375,45 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VDIVPDrr, X86::VDIVPDrm, 0 },
{ X86::VDIVPSrr, X86::VDIVPSrm, 0 },
{ X86::VDIVSDrr, X86::VDIVSDrm, 0 },
- { X86::VDIVSDrr_Int, X86::VDIVSDrm_Int, 0 },
+ { X86::VDIVSDrr_Int, X86::VDIVSDrm_Int, TB_NO_REVERSE },
{ X86::VDIVSSrr, X86::VDIVSSrm, 0 },
- { X86::VDIVSSrr_Int, X86::VDIVSSrm_Int, 0 },
+ { X86::VDIVSSrr_Int, X86::VDIVSSrm_Int, TB_NO_REVERSE },
{ X86::VDPPDrri, X86::VDPPDrmi, 0 },
{ X86::VDPPSrri, X86::VDPPSrmi, 0 },
- // Do not fold VFs* loads because there are no scalar load variants for
- // these instructions. When folded, the load is required to be 128-bits, so
- // the load size would not match.
- { X86::VFvANDNPDrr, X86::VFvANDNPDrm, 0 },
- { X86::VFvANDNPSrr, X86::VFvANDNPSrm, 0 },
- { X86::VFvANDPDrr, X86::VFvANDPDrm, 0 },
- { X86::VFvANDPSrr, X86::VFvANDPSrm, 0 },
- { X86::VFvORPDrr, X86::VFvORPDrm, 0 },
- { X86::VFvORPSrr, X86::VFvORPSrm, 0 },
- { X86::VFvXORPDrr, X86::VFvXORPDrm, 0 },
- { X86::VFvXORPSrr, X86::VFvXORPSrm, 0 },
{ X86::VHADDPDrr, X86::VHADDPDrm, 0 },
{ X86::VHADDPSrr, X86::VHADDPSrm, 0 },
{ X86::VHSUBPDrr, X86::VHSUBPDrm, 0 },
{ X86::VHSUBPSrr, X86::VHSUBPSrm, 0 },
- { X86::Int_VCMPSDrr, X86::Int_VCMPSDrm, 0 },
- { X86::Int_VCMPSSrr, X86::Int_VCMPSSrm, 0 },
+ { X86::Int_VCMPSDrr, X86::Int_VCMPSDrm, TB_NO_REVERSE },
+ { X86::Int_VCMPSSrr, X86::Int_VCMPSSrm, TB_NO_REVERSE },
+ { X86::VMAXCPDrr, X86::VMAXCPDrm, 0 },
+ { X86::VMAXCPSrr, X86::VMAXCPSrm, 0 },
+ { X86::VMAXCSDrr, X86::VMAXCSDrm, 0 },
+ { X86::VMAXCSSrr, X86::VMAXCSSrm, 0 },
{ X86::VMAXPDrr, X86::VMAXPDrm, 0 },
{ X86::VMAXPSrr, X86::VMAXPSrm, 0 },
{ X86::VMAXSDrr, X86::VMAXSDrm, 0 },
- { X86::VMAXSDrr_Int, X86::VMAXSDrm_Int, 0 },
+ { X86::VMAXSDrr_Int, X86::VMAXSDrm_Int, TB_NO_REVERSE },
{ X86::VMAXSSrr, X86::VMAXSSrm, 0 },
- { X86::VMAXSSrr_Int, X86::VMAXSSrm_Int, 0 },
+ { X86::VMAXSSrr_Int, X86::VMAXSSrm_Int, TB_NO_REVERSE },
+ { X86::VMINCPDrr, X86::VMINCPDrm, 0 },
+ { X86::VMINCPSrr, X86::VMINCPSrm, 0 },
+ { X86::VMINCSDrr, X86::VMINCSDrm, 0 },
+ { X86::VMINCSSrr, X86::VMINCSSrm, 0 },
{ X86::VMINPDrr, X86::VMINPDrm, 0 },
{ X86::VMINPSrr, X86::VMINPSrm, 0 },
{ X86::VMINSDrr, X86::VMINSDrm, 0 },
- { X86::VMINSDrr_Int, X86::VMINSDrm_Int, 0 },
+ { X86::VMINSDrr_Int, X86::VMINSDrm_Int, TB_NO_REVERSE },
{ X86::VMINSSrr, X86::VMINSSrm, 0 },
- { X86::VMINSSrr_Int, X86::VMINSSrm_Int, 0 },
+ { X86::VMINSSrr_Int, X86::VMINSSrm_Int, TB_NO_REVERSE },
{ X86::VMOVLHPSrr, X86::VMOVHPSrm, TB_NO_REVERSE },
{ X86::VMPSADBWrri, X86::VMPSADBWrmi, 0 },
{ X86::VMULPDrr, X86::VMULPDrm, 0 },
{ X86::VMULPSrr, X86::VMULPSrm, 0 },
{ X86::VMULSDrr, X86::VMULSDrm, 0 },
- { X86::VMULSDrr_Int, X86::VMULSDrm_Int, 0 },
+ { X86::VMULSDrr_Int, X86::VMULSDrm_Int, TB_NO_REVERSE },
{ X86::VMULSSrr, X86::VMULSSrm, 0 },
- { X86::VMULSSrr_Int, X86::VMULSSrm_Int, 0 },
+ { X86::VMULSSrr_Int, X86::VMULSSrm_Int, TB_NO_REVERSE },
{ X86::VORPDrr, X86::VORPDrm, 0 },
{ X86::VORPSrr, X86::VORPSrm, 0 },
{ X86::VPACKSSDWrr, X86::VPACKSSDWrm, 0 },
@@ -1366,7 +1456,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPINSRDrr, X86::VPINSRDrm, 0 },
{ X86::VPINSRQrr, X86::VPINSRQrm, 0 },
{ X86::VPINSRWrri, X86::VPINSRWrmi, 0 },
- { X86::VPMADDUBSWrr128, X86::VPMADDUBSWrm128, 0 },
+ { X86::VPMADDUBSWrr, X86::VPMADDUBSWrm, 0 },
{ X86::VPMADDWDrr, X86::VPMADDWDrm, 0 },
{ X86::VPMAXSWrr, X86::VPMAXSWrm, 0 },
{ X86::VPMAXUBrr, X86::VPMAXUBrm, 0 },
@@ -1381,7 +1471,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPMAXUDrr, X86::VPMAXUDrm, 0 },
{ X86::VPMAXUWrr, X86::VPMAXUWrm, 0 },
{ X86::VPMULDQrr, X86::VPMULDQrm, 0 },
- { X86::VPMULHRSWrr128, X86::VPMULHRSWrm128, 0 },
+ { X86::VPMULHRSWrr, X86::VPMULHRSWrm, 0 },
{ X86::VPMULHUWrr, X86::VPMULHUWrm, 0 },
{ X86::VPMULHWrr, X86::VPMULHWrm, 0 },
{ X86::VPMULLDrr, X86::VPMULLDrm, 0 },
@@ -1418,16 +1508,26 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPUNPCKLQDQrr, X86::VPUNPCKLQDQrm, 0 },
{ X86::VPUNPCKLWDrr, X86::VPUNPCKLWDrm, 0 },
{ X86::VPXORrr, X86::VPXORrm, 0 },
+ { X86::VRCPSSr, X86::VRCPSSm, 0 },
+ { X86::VRCPSSr_Int, X86::VRCPSSm_Int, TB_NO_REVERSE },
+ { X86::VRSQRTSSr, X86::VRSQRTSSm, 0 },
+ { X86::VRSQRTSSr_Int, X86::VRSQRTSSm_Int, TB_NO_REVERSE },
{ X86::VROUNDSDr, X86::VROUNDSDm, 0 },
+ { X86::VROUNDSDr_Int, X86::VROUNDSDm_Int, TB_NO_REVERSE },
{ X86::VROUNDSSr, X86::VROUNDSSm, 0 },
+ { X86::VROUNDSSr_Int, X86::VROUNDSSm_Int, TB_NO_REVERSE },
{ X86::VSHUFPDrri, X86::VSHUFPDrmi, 0 },
{ X86::VSHUFPSrri, X86::VSHUFPSrmi, 0 },
+ { X86::VSQRTSDr, X86::VSQRTSDm, 0 },
+ { X86::VSQRTSDr_Int, X86::VSQRTSDm_Int, TB_NO_REVERSE },
+ { X86::VSQRTSSr, X86::VSQRTSSm, 0 },
+ { X86::VSQRTSSr_Int, X86::VSQRTSSm_Int, TB_NO_REVERSE },
{ X86::VSUBPDrr, X86::VSUBPDrm, 0 },
{ X86::VSUBPSrr, X86::VSUBPSrm, 0 },
{ X86::VSUBSDrr, X86::VSUBSDrm, 0 },
- { X86::VSUBSDrr_Int, X86::VSUBSDrm_Int, 0 },
+ { X86::VSUBSDrr_Int, X86::VSUBSDrm_Int, TB_NO_REVERSE },
{ X86::VSUBSSrr, X86::VSUBSSrm, 0 },
- { X86::VSUBSSrr_Int, X86::VSUBSSrm_Int, 0 },
+ { X86::VSUBSSrr_Int, X86::VSUBSSrm_Int, TB_NO_REVERSE },
{ X86::VUNPCKHPDrr, X86::VUNPCKHPDrm, 0 },
{ X86::VUNPCKHPSrr, X86::VUNPCKHPSrm, 0 },
{ X86::VUNPCKLPDrr, X86::VUNPCKLPDrm, 0 },
@@ -1458,8 +1558,12 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VHSUBPDYrr, X86::VHSUBPDYrm, 0 },
{ X86::VHSUBPSYrr, X86::VHSUBPSYrm, 0 },
{ X86::VINSERTF128rr, X86::VINSERTF128rm, 0 },
+ { X86::VMAXCPDYrr, X86::VMAXCPDYrm, 0 },
+ { X86::VMAXCPSYrr, X86::VMAXCPSYrm, 0 },
{ X86::VMAXPDYrr, X86::VMAXPDYrm, 0 },
{ X86::VMAXPSYrr, X86::VMAXPSYrm, 0 },
+ { X86::VMINCPDYrr, X86::VMINCPDYrm, 0 },
+ { X86::VMINCPSYrr, X86::VMINCPSYrm, 0 },
{ X86::VMINPDYrr, X86::VMINPDYrm, 0 },
{ X86::VMINPSYrr, X86::VMINPSYrm, 0 },
{ X86::VMULPDYrr, X86::VMULPDYrm, 0 },
@@ -1520,7 +1624,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPHSUBDYrr, X86::VPHSUBDYrm, 0 },
{ X86::VPHSUBSWrr256, X86::VPHSUBSWrm256, 0 },
{ X86::VPHSUBWYrr, X86::VPHSUBWYrm, 0 },
- { X86::VPMADDUBSWrr256, X86::VPMADDUBSWrm256, 0 },
+ { X86::VPMADDUBSWYrr, X86::VPMADDUBSWYrm, 0 },
{ X86::VPMADDWDYrr, X86::VPMADDWDYrm, 0 },
{ X86::VPMAXSWYrr, X86::VPMAXSWYrm, 0 },
{ X86::VPMAXUBYrr, X86::VPMAXUBYrm, 0 },
@@ -1536,7 +1640,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPMAXUWYrr, X86::VPMAXUWYrm, 0 },
{ X86::VMPSADBWYrri, X86::VMPSADBWYrmi, 0 },
{ X86::VPMULDQYrr, X86::VPMULDQYrm, 0 },
- { X86::VPMULHRSWrr256, X86::VPMULHRSWrm256, 0 },
+ { X86::VPMULHRSWYrr, X86::VPMULHRSWYrm, 0 },
{ X86::VPMULHUWYrr, X86::VPMULHUWYrm, 0 },
{ X86::VPMULHWYrr, X86::VPMULHWYrm, 0 },
{ X86::VPMULLDYrr, X86::VPMULLDYrm, 0 },
@@ -1559,8 +1663,6 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPSRAWYrr, X86::VPSRAWYrm, 0 },
{ X86::VPSRAVDrr, X86::VPSRAVDrm, 0 },
{ X86::VPSRAVDYrr, X86::VPSRAVDYrm, 0 },
- { X86::VPSRAVD_Intrr, X86::VPSRAVD_Intrm, 0 },
- { X86::VPSRAVD_IntYrr, X86::VPSRAVD_IntYrm, 0 },
{ X86::VPSRLDYrr, X86::VPSRLDYrm, 0 },
{ X86::VPSRLQYrr, X86::VPSRLQYrm, 0 },
{ X86::VPSRLWYrr, X86::VPSRLWYrm, 0 },
@@ -1588,37 +1690,45 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
// FMA4 foldable patterns
{ X86::VFMADDSS4rr, X86::VFMADDSS4mr, TB_ALIGN_NONE },
+ { X86::VFMADDSS4rr_Int, X86::VFMADDSS4mr_Int, TB_NO_REVERSE },
{ X86::VFMADDSD4rr, X86::VFMADDSD4mr, TB_ALIGN_NONE },
+ { X86::VFMADDSD4rr_Int, X86::VFMADDSD4mr_Int, TB_NO_REVERSE },
{ X86::VFMADDPS4rr, X86::VFMADDPS4mr, TB_ALIGN_NONE },
{ X86::VFMADDPD4rr, X86::VFMADDPD4mr, TB_ALIGN_NONE },
- { X86::VFMADDPS4rrY, X86::VFMADDPS4mrY, TB_ALIGN_NONE },
- { X86::VFMADDPD4rrY, X86::VFMADDPD4mrY, TB_ALIGN_NONE },
+ { X86::VFMADDPS4Yrr, X86::VFMADDPS4Ymr, TB_ALIGN_NONE },
+ { X86::VFMADDPD4Yrr, X86::VFMADDPD4Ymr, TB_ALIGN_NONE },
{ X86::VFNMADDSS4rr, X86::VFNMADDSS4mr, TB_ALIGN_NONE },
+ { X86::VFNMADDSS4rr_Int, X86::VFNMADDSS4mr_Int, TB_NO_REVERSE },
{ X86::VFNMADDSD4rr, X86::VFNMADDSD4mr, TB_ALIGN_NONE },
+ { X86::VFNMADDSD4rr_Int, X86::VFNMADDSD4mr_Int, TB_NO_REVERSE },
{ X86::VFNMADDPS4rr, X86::VFNMADDPS4mr, TB_ALIGN_NONE },
{ X86::VFNMADDPD4rr, X86::VFNMADDPD4mr, TB_ALIGN_NONE },
- { X86::VFNMADDPS4rrY, X86::VFNMADDPS4mrY, TB_ALIGN_NONE },
- { X86::VFNMADDPD4rrY, X86::VFNMADDPD4mrY, TB_ALIGN_NONE },
+ { X86::VFNMADDPS4Yrr, X86::VFNMADDPS4Ymr, TB_ALIGN_NONE },
+ { X86::VFNMADDPD4Yrr, X86::VFNMADDPD4Ymr, TB_ALIGN_NONE },
{ X86::VFMSUBSS4rr, X86::VFMSUBSS4mr, TB_ALIGN_NONE },
+ { X86::VFMSUBSS4rr_Int, X86::VFMSUBSS4mr_Int, TB_NO_REVERSE },
{ X86::VFMSUBSD4rr, X86::VFMSUBSD4mr, TB_ALIGN_NONE },
+ { X86::VFMSUBSD4rr_Int, X86::VFMSUBSD4mr_Int, TB_NO_REVERSE },
{ X86::VFMSUBPS4rr, X86::VFMSUBPS4mr, TB_ALIGN_NONE },
{ X86::VFMSUBPD4rr, X86::VFMSUBPD4mr, TB_ALIGN_NONE },
- { X86::VFMSUBPS4rrY, X86::VFMSUBPS4mrY, TB_ALIGN_NONE },
- { X86::VFMSUBPD4rrY, X86::VFMSUBPD4mrY, TB_ALIGN_NONE },
+ { X86::VFMSUBPS4Yrr, X86::VFMSUBPS4Ymr, TB_ALIGN_NONE },
+ { X86::VFMSUBPD4Yrr, X86::VFMSUBPD4Ymr, TB_ALIGN_NONE },
{ X86::VFNMSUBSS4rr, X86::VFNMSUBSS4mr, TB_ALIGN_NONE },
+ { X86::VFNMSUBSS4rr_Int, X86::VFNMSUBSS4mr_Int, TB_NO_REVERSE },
{ X86::VFNMSUBSD4rr, X86::VFNMSUBSD4mr, TB_ALIGN_NONE },
+ { X86::VFNMSUBSD4rr_Int, X86::VFNMSUBSD4mr_Int, TB_NO_REVERSE },
{ X86::VFNMSUBPS4rr, X86::VFNMSUBPS4mr, TB_ALIGN_NONE },
{ X86::VFNMSUBPD4rr, X86::VFNMSUBPD4mr, TB_ALIGN_NONE },
- { X86::VFNMSUBPS4rrY, X86::VFNMSUBPS4mrY, TB_ALIGN_NONE },
- { X86::VFNMSUBPD4rrY, X86::VFNMSUBPD4mrY, TB_ALIGN_NONE },
+ { X86::VFNMSUBPS4Yrr, X86::VFNMSUBPS4Ymr, TB_ALIGN_NONE },
+ { X86::VFNMSUBPD4Yrr, X86::VFNMSUBPD4Ymr, TB_ALIGN_NONE },
{ X86::VFMADDSUBPS4rr, X86::VFMADDSUBPS4mr, TB_ALIGN_NONE },
{ X86::VFMADDSUBPD4rr, X86::VFMADDSUBPD4mr, TB_ALIGN_NONE },
- { X86::VFMADDSUBPS4rrY, X86::VFMADDSUBPS4mrY, TB_ALIGN_NONE },
- { X86::VFMADDSUBPD4rrY, X86::VFMADDSUBPD4mrY, TB_ALIGN_NONE },
+ { X86::VFMADDSUBPS4Yrr, X86::VFMADDSUBPS4Ymr, TB_ALIGN_NONE },
+ { X86::VFMADDSUBPD4Yrr, X86::VFMADDSUBPD4Ymr, TB_ALIGN_NONE },
{ X86::VFMSUBADDPS4rr, X86::VFMSUBADDPS4mr, TB_ALIGN_NONE },
{ X86::VFMSUBADDPD4rr, X86::VFMSUBADDPD4mr, TB_ALIGN_NONE },
- { X86::VFMSUBADDPS4rrY, X86::VFMSUBADDPS4mrY, TB_ALIGN_NONE },
- { X86::VFMSUBADDPD4rrY, X86::VFMSUBADDPD4mrY, TB_ALIGN_NONE },
+ { X86::VFMSUBADDPS4Yrr, X86::VFMSUBADDPS4Ymr, TB_ALIGN_NONE },
+ { X86::VFMSUBADDPD4Yrr, X86::VFMSUBADDPD4Ymr, TB_ALIGN_NONE },
// XOP foldable instructions
{ X86::VPCMOVrrr, X86::VPCMOVrmr, 0 },
@@ -1678,38 +1788,107 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::ADOX64rr, X86::ADOX64rm, 0 },
// AVX-512 foldable instructions
- { X86::VADDPSZrr, X86::VADDPSZrm, 0 },
{ X86::VADDPDZrr, X86::VADDPDZrm, 0 },
- { X86::VADDSSZrr, X86::VADDSSZrm, 0 },
- { X86::VADDSSZrr_Int, X86::VADDSSZrm_Int, 0 },
+ { X86::VADDPSZrr, X86::VADDPSZrm, 0 },
{ X86::VADDSDZrr, X86::VADDSDZrm, 0 },
- { X86::VADDSDZrr_Int, X86::VADDSDZrm_Int, 0 },
- { X86::VSUBPSZrr, X86::VSUBPSZrm, 0 },
- { X86::VSUBPDZrr, X86::VSUBPDZrm, 0 },
- { X86::VSUBSSZrr, X86::VSUBSSZrm, 0 },
- { X86::VSUBSSZrr_Int, X86::VSUBSSZrm_Int, 0 },
- { X86::VSUBSDZrr, X86::VSUBSDZrm, 0 },
- { X86::VSUBSDZrr_Int, X86::VSUBSDZrm_Int, 0 },
- { X86::VMULPSZrr, X86::VMULPSZrm, 0 },
- { X86::VMULPDZrr, X86::VMULPDZrm, 0 },
- { X86::VMULSSZrr, X86::VMULSSZrm, 0 },
- { X86::VMULSSZrr_Int, X86::VMULSSZrm_Int, 0 },
- { X86::VMULSDZrr, X86::VMULSDZrm, 0 },
- { X86::VMULSDZrr_Int, X86::VMULSDZrm_Int, 0 },
- { X86::VDIVPSZrr, X86::VDIVPSZrm, 0 },
+ { X86::VADDSDZrr_Int, X86::VADDSDZrm_Int, TB_NO_REVERSE },
+ { X86::VADDSSZrr, X86::VADDSSZrm, 0 },
+ { X86::VADDSSZrr_Int, X86::VADDSSZrm_Int, TB_NO_REVERSE },
+ { X86::VALIGNDZrri, X86::VALIGNDZrmi, 0 },
+ { X86::VALIGNQZrri, X86::VALIGNQZrmi, 0 },
+ { X86::VANDNPDZrr, X86::VANDNPDZrm, 0 },
+ { X86::VANDNPSZrr, X86::VANDNPSZrm, 0 },
+ { X86::VANDPDZrr, X86::VANDPDZrm, 0 },
+ { X86::VANDPSZrr, X86::VANDPSZrm, 0 },
+ { X86::VBROADCASTSSZrkz, X86::VBROADCASTSSZmkz, TB_NO_REVERSE },
+ { X86::VBROADCASTSDZrkz, X86::VBROADCASTSDZmkz, TB_NO_REVERSE },
+ { X86::VCMPPDZrri, X86::VCMPPDZrmi, 0 },
+ { X86::VCMPPSZrri, X86::VCMPPSZrmi, 0 },
+ { X86::VCMPSDZrr, X86::VCMPSDZrm, 0 },
+ { X86::VCMPSDZrr_Int, X86::VCMPSDZrm_Int, TB_NO_REVERSE },
+ { X86::VCMPSSZrr, X86::VCMPSSZrm, 0 },
+ { X86::VCMPSSZrr_Int, X86::VCMPSSZrm_Int, TB_NO_REVERSE },
{ X86::VDIVPDZrr, X86::VDIVPDZrm, 0 },
- { X86::VDIVSSZrr, X86::VDIVSSZrm, 0 },
- { X86::VDIVSSZrr_Int, X86::VDIVSSZrm_Int, 0 },
+ { X86::VDIVPSZrr, X86::VDIVPSZrm, 0 },
{ X86::VDIVSDZrr, X86::VDIVSDZrm, 0 },
- { X86::VDIVSDZrr_Int, X86::VDIVSDZrm_Int, 0 },
- { X86::VMINPSZrr, X86::VMINPSZrm, 0 },
- { X86::VMINPDZrr, X86::VMINPDZrm, 0 },
- { X86::VMAXPSZrr, X86::VMAXPSZrm, 0 },
+ { X86::VDIVSDZrr_Int, X86::VDIVSDZrm_Int, TB_NO_REVERSE },
+ { X86::VDIVSSZrr, X86::VDIVSSZrm, 0 },
+ { X86::VDIVSSZrr_Int, X86::VDIVSSZrm_Int, TB_NO_REVERSE },
+ { X86::VINSERTF32x4Zrr, X86::VINSERTF32x4Zrm, 0 },
+ { X86::VINSERTF32x8Zrr, X86::VINSERTF32x8Zrm, 0 },
+ { X86::VINSERTF64x2Zrr, X86::VINSERTF64x2Zrm, 0 },
+ { X86::VINSERTF64x4Zrr, X86::VINSERTF64x4Zrm, 0 },
+ { X86::VINSERTI32x4Zrr, X86::VINSERTI32x4Zrm, 0 },
+ { X86::VINSERTI32x8Zrr, X86::VINSERTI32x8Zrm, 0 },
+ { X86::VINSERTI64x2Zrr, X86::VINSERTI64x2Zrm, 0 },
+ { X86::VINSERTI64x4Zrr, X86::VINSERTI64x4Zrm, 0 },
+ { X86::VMAXCPDZrr, X86::VMAXCPDZrm, 0 },
+ { X86::VMAXCPSZrr, X86::VMAXCPSZrm, 0 },
+ { X86::VMAXCSDZrr, X86::VMAXCSDZrm, 0 },
+ { X86::VMAXCSSZrr, X86::VMAXCSSZrm, 0 },
{ X86::VMAXPDZrr, X86::VMAXPDZrm, 0 },
+ { X86::VMAXPSZrr, X86::VMAXPSZrm, 0 },
+ { X86::VMAXSDZrr, X86::VMAXSDZrm, 0 },
+ { X86::VMAXSDZrr_Int, X86::VMAXSDZrm_Int, TB_NO_REVERSE },
+ { X86::VMAXSSZrr, X86::VMAXSSZrm, 0 },
+ { X86::VMAXSSZrr_Int, X86::VMAXSSZrm_Int, TB_NO_REVERSE },
+ { X86::VMINCPDZrr, X86::VMINCPDZrm, 0 },
+ { X86::VMINCPSZrr, X86::VMINCPSZrm, 0 },
+ { X86::VMINCSDZrr, X86::VMINCSDZrm, 0 },
+ { X86::VMINCSSZrr, X86::VMINCSSZrm, 0 },
+ { X86::VMINPDZrr, X86::VMINPDZrm, 0 },
+ { X86::VMINPSZrr, X86::VMINPSZrm, 0 },
+ { X86::VMINSDZrr, X86::VMINSDZrm, 0 },
+ { X86::VMINSDZrr_Int, X86::VMINSDZrm_Int, TB_NO_REVERSE },
+ { X86::VMINSSZrr, X86::VMINSSZrm, 0 },
+ { X86::VMINSSZrr_Int, X86::VMINSSZrm_Int, TB_NO_REVERSE },
+ { X86::VMULPDZrr, X86::VMULPDZrm, 0 },
+ { X86::VMULPSZrr, X86::VMULPSZrm, 0 },
+ { X86::VMULSDZrr, X86::VMULSDZrm, 0 },
+ { X86::VMULSDZrr_Int, X86::VMULSDZrm_Int, TB_NO_REVERSE },
+ { X86::VMULSSZrr, X86::VMULSSZrm, 0 },
+ { X86::VMULSSZrr_Int, X86::VMULSSZrm_Int, TB_NO_REVERSE },
+ { X86::VORPDZrr, X86::VORPDZrm, 0 },
+ { X86::VORPSZrr, X86::VORPSZrm, 0 },
+ { X86::VPADDBZrr, X86::VPADDBZrm, 0 },
{ X86::VPADDDZrr, X86::VPADDDZrm, 0 },
{ X86::VPADDQZrr, X86::VPADDQZrm, 0 },
- { X86::VPERMPDZri, X86::VPERMPDZmi, 0 },
+ { X86::VPADDSBZrr, X86::VPADDSBZrm, 0 },
+ { X86::VPADDSWZrr, X86::VPADDSWZrm, 0 },
+ { X86::VPADDUSBZrr, X86::VPADDUSBZrm, 0 },
+ { X86::VPADDUSWZrr, X86::VPADDUSWZrm, 0 },
+ { X86::VPADDWZrr, X86::VPADDWZrm, 0 },
+ { X86::VPALIGNRZrri, X86::VPALIGNRZrmi, 0 },
+ { X86::VPANDDZrr, X86::VPANDDZrm, 0 },
+ { X86::VPANDNDZrr, X86::VPANDNDZrm, 0 },
+ { X86::VPANDNQZrr, X86::VPANDNQZrm, 0 },
+ { X86::VPANDQZrr, X86::VPANDQZrm, 0 },
+ { X86::VPCMPBZrri, X86::VPCMPBZrmi, 0 },
+ { X86::VPCMPDZrri, X86::VPCMPDZrmi, 0 },
+ { X86::VPCMPEQBZrr, X86::VPCMPEQBZrm, 0 },
+ { X86::VPCMPEQDZrr, X86::VPCMPEQDZrm, 0 },
+ { X86::VPCMPEQQZrr, X86::VPCMPEQQZrm, 0 },
+ { X86::VPCMPEQWZrr, X86::VPCMPEQWZrm, 0 },
+ { X86::VPCMPGTBZrr, X86::VPCMPGTBZrm, 0 },
+ { X86::VPCMPGTDZrr, X86::VPCMPGTDZrm, 0 },
+ { X86::VPCMPGTQZrr, X86::VPCMPGTQZrm, 0 },
+ { X86::VPCMPGTWZrr, X86::VPCMPGTWZrm, 0 },
+ { X86::VPCMPQZrri, X86::VPCMPQZrmi, 0 },
+ { X86::VPCMPUBZrri, X86::VPCMPUBZrmi, 0 },
+ { X86::VPCMPUDZrri, X86::VPCMPUDZrmi, 0 },
+ { X86::VPCMPUQZrri, X86::VPCMPUQZrmi, 0 },
+ { X86::VPCMPUWZrri, X86::VPCMPUWZrmi, 0 },
+ { X86::VPCMPWZrri, X86::VPCMPWZrmi, 0 },
+ { X86::VPERMBZrr, X86::VPERMBZrm, 0 },
+ { X86::VPERMDZrr, X86::VPERMDZrm, 0 },
+ { X86::VPERMILPDZrr, X86::VPERMILPDZrm, 0 },
+ { X86::VPERMILPSZrr, X86::VPERMILPSZrm, 0 },
+ { X86::VPERMPDZrr, X86::VPERMPDZrm, 0 },
{ X86::VPERMPSZrr, X86::VPERMPSZrm, 0 },
+ { X86::VPERMQZrr, X86::VPERMQZrm, 0 },
+ { X86::VPERMWZrr, X86::VPERMWZrm, 0 },
+ { X86::VPMADDUBSWZrr, X86::VPMADDUBSWZrm, 0 },
+ { X86::VPMADDWDZrr, X86::VPMADDWDZrm, 0 },
{ X86::VPMAXSDZrr, X86::VPMAXSDZrm, 0 },
{ X86::VPMAXSQZrr, X86::VPMAXSQZrm, 0 },
{ X86::VPMAXUDZrr, X86::VPMAXUDZrm, 0 },
@@ -1719,31 +1898,297 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPMINUDZrr, X86::VPMINUDZrm, 0 },
{ X86::VPMINUQZrr, X86::VPMINUQZrm, 0 },
{ X86::VPMULDQZrr, X86::VPMULDQZrm, 0 },
+ { X86::VPMULUDQZrr, X86::VPMULUDQZrm, 0 },
+ { X86::VPORDZrr, X86::VPORDZrm, 0 },
+ { X86::VPORQZrr, X86::VPORQZrm, 0 },
+ { X86::VPSHUFBZrr, X86::VPSHUFBZrm, 0 },
{ X86::VPSLLVDZrr, X86::VPSLLVDZrm, 0 },
{ X86::VPSLLVQZrr, X86::VPSLLVQZrm, 0 },
{ X86::VPSRAVDZrr, X86::VPSRAVDZrm, 0 },
{ X86::VPSRLVDZrr, X86::VPSRLVDZrm, 0 },
{ X86::VPSRLVQZrr, X86::VPSRLVQZrm, 0 },
+ { X86::VPSUBBZrr, X86::VPSUBBZrm, 0 },
{ X86::VPSUBDZrr, X86::VPSUBDZrm, 0 },
{ X86::VPSUBQZrr, X86::VPSUBQZrm, 0 },
+ { X86::VPSUBSBZrr, X86::VPSUBSBZrm, 0 },
+ { X86::VPSUBSWZrr, X86::VPSUBSWZrm, 0 },
+ { X86::VPSUBUSBZrr, X86::VPSUBUSBZrm, 0 },
+ { X86::VPSUBUSWZrr, X86::VPSUBUSWZrm, 0 },
+ { X86::VPSUBWZrr, X86::VPSUBWZrm, 0 },
+ { X86::VPUNPCKHBWZrr, X86::VPUNPCKHBWZrm, 0 },
+ { X86::VPUNPCKHDQZrr, X86::VPUNPCKHDQZrm, 0 },
+ { X86::VPUNPCKHQDQZrr, X86::VPUNPCKHQDQZrm, 0 },
+ { X86::VPUNPCKHWDZrr, X86::VPUNPCKHWDZrm, 0 },
+ { X86::VPUNPCKLBWZrr, X86::VPUNPCKLBWZrm, 0 },
+ { X86::VPUNPCKLDQZrr, X86::VPUNPCKLDQZrm, 0 },
+ { X86::VPUNPCKLQDQZrr, X86::VPUNPCKLQDQZrm, 0 },
+ { X86::VPUNPCKLWDZrr, X86::VPUNPCKLWDZrm, 0 },
+ { X86::VPXORDZrr, X86::VPXORDZrm, 0 },
+ { X86::VPXORQZrr, X86::VPXORQZrm, 0 },
{ X86::VSHUFPDZrri, X86::VSHUFPDZrmi, 0 },
{ X86::VSHUFPSZrri, X86::VSHUFPSZrmi, 0 },
- { X86::VALIGNQZrri, X86::VALIGNQZrmi, 0 },
- { X86::VALIGNDZrri, X86::VALIGNDZrmi, 0 },
- { X86::VPMULUDQZrr, X86::VPMULUDQZrm, 0 },
- { X86::VBROADCASTSSZrkz, X86::VBROADCASTSSZmkz, TB_NO_REVERSE },
- { X86::VBROADCASTSDZrkz, X86::VBROADCASTSDZmkz, TB_NO_REVERSE },
-
- // AVX-512{F,VL} foldable instructions
- { X86::VBROADCASTSSZ256rkz, X86::VBROADCASTSSZ256mkz, TB_NO_REVERSE },
- { X86::VBROADCASTSDZ256rkz, X86::VBROADCASTSDZ256mkz, TB_NO_REVERSE },
- { X86::VBROADCASTSSZ128rkz, X86::VBROADCASTSSZ128mkz, TB_NO_REVERSE },
+ { X86::VSUBPDZrr, X86::VSUBPDZrm, 0 },
+ { X86::VSUBPSZrr, X86::VSUBPSZrm, 0 },
+ { X86::VSUBSDZrr, X86::VSUBSDZrm, 0 },
+ { X86::VSUBSDZrr_Int, X86::VSUBSDZrm_Int, TB_NO_REVERSE },
+ { X86::VSUBSSZrr, X86::VSUBSSZrm, 0 },
+ { X86::VSUBSSZrr_Int, X86::VSUBSSZrm_Int, TB_NO_REVERSE },
+ { X86::VUNPCKHPDZrr, X86::VUNPCKHPDZrm, 0 },
+ { X86::VUNPCKHPSZrr, X86::VUNPCKHPSZrm, 0 },
+ { X86::VUNPCKLPDZrr, X86::VUNPCKLPDZrm, 0 },
+ { X86::VUNPCKLPSZrr, X86::VUNPCKLPSZrm, 0 },
+ { X86::VXORPDZrr, X86::VXORPDZrm, 0 },
+ { X86::VXORPSZrr, X86::VXORPSZrm, 0 },
// AVX-512{F,VL} foldable instructions
{ X86::VADDPDZ128rr, X86::VADDPDZ128rm, 0 },
{ X86::VADDPDZ256rr, X86::VADDPDZ256rm, 0 },
{ X86::VADDPSZ128rr, X86::VADDPSZ128rm, 0 },
{ X86::VADDPSZ256rr, X86::VADDPSZ256rm, 0 },
+ { X86::VALIGNDZ128rri, X86::VALIGNDZ128rmi, 0 },
+ { X86::VALIGNDZ256rri, X86::VALIGNDZ256rmi, 0 },
+ { X86::VALIGNQZ128rri, X86::VALIGNQZ128rmi, 0 },
+ { X86::VALIGNQZ256rri, X86::VALIGNQZ256rmi, 0 },
+ { X86::VANDNPDZ128rr, X86::VANDNPDZ128rm, 0 },
+ { X86::VANDNPDZ256rr, X86::VANDNPDZ256rm, 0 },
+ { X86::VANDNPSZ128rr, X86::VANDNPSZ128rm, 0 },
+ { X86::VANDNPSZ256rr, X86::VANDNPSZ256rm, 0 },
+ { X86::VANDPDZ128rr, X86::VANDPDZ128rm, 0 },
+ { X86::VANDPDZ256rr, X86::VANDPDZ256rm, 0 },
+ { X86::VANDPSZ128rr, X86::VANDPSZ128rm, 0 },
+ { X86::VANDPSZ256rr, X86::VANDPSZ256rm, 0 },
+ { X86::VBROADCASTSSZ128rkz, X86::VBROADCASTSSZ128mkz, TB_NO_REVERSE },
+ { X86::VBROADCASTSSZ256rkz, X86::VBROADCASTSSZ256mkz, TB_NO_REVERSE },
+ { X86::VBROADCASTSDZ256rkz, X86::VBROADCASTSDZ256mkz, TB_NO_REVERSE },
+ { X86::VCMPPDZ128rri, X86::VCMPPDZ128rmi, 0 },
+ { X86::VCMPPDZ256rri, X86::VCMPPDZ256rmi, 0 },
+ { X86::VCMPPSZ128rri, X86::VCMPPSZ128rmi, 0 },
+ { X86::VCMPPSZ256rri, X86::VCMPPSZ256rmi, 0 },
+ { X86::VDIVPDZ128rr, X86::VDIVPDZ128rm, 0 },
+ { X86::VDIVPDZ256rr, X86::VDIVPDZ256rm, 0 },
+ { X86::VDIVPSZ128rr, X86::VDIVPSZ128rm, 0 },
+ { X86::VDIVPSZ256rr, X86::VDIVPSZ256rm, 0 },
+ { X86::VINSERTF32x4Z256rr,X86::VINSERTF32x4Z256rm, 0 },
+ { X86::VINSERTF64x2Z256rr,X86::VINSERTF64x2Z256rm, 0 },
+ { X86::VINSERTI32x4Z256rr,X86::VINSERTI32x4Z256rm, 0 },
+ { X86::VINSERTI64x2Z256rr,X86::VINSERTI64x2Z256rm, 0 },
+ { X86::VMAXCPDZ128rr, X86::VMAXCPDZ128rm, 0 },
+ { X86::VMAXCPDZ256rr, X86::VMAXCPDZ256rm, 0 },
+ { X86::VMAXCPSZ128rr, X86::VMAXCPSZ128rm, 0 },
+ { X86::VMAXCPSZ256rr, X86::VMAXCPSZ256rm, 0 },
+ { X86::VMAXPDZ128rr, X86::VMAXPDZ128rm, 0 },
+ { X86::VMAXPDZ256rr, X86::VMAXPDZ256rm, 0 },
+ { X86::VMAXPSZ128rr, X86::VMAXPSZ128rm, 0 },
+ { X86::VMAXPSZ256rr, X86::VMAXPSZ256rm, 0 },
+ { X86::VMINCPDZ128rr, X86::VMINCPDZ128rm, 0 },
+ { X86::VMINCPDZ256rr, X86::VMINCPDZ256rm, 0 },
+ { X86::VMINCPSZ128rr, X86::VMINCPSZ128rm, 0 },
+ { X86::VMINCPSZ256rr, X86::VMINCPSZ256rm, 0 },
+ { X86::VMINPDZ128rr, X86::VMINPDZ128rm, 0 },
+ { X86::VMINPDZ256rr, X86::VMINPDZ256rm, 0 },
+ { X86::VMINPSZ128rr, X86::VMINPSZ128rm, 0 },
+ { X86::VMINPSZ256rr, X86::VMINPSZ256rm, 0 },
+ { X86::VMULPDZ128rr, X86::VMULPDZ128rm, 0 },
+ { X86::VMULPDZ256rr, X86::VMULPDZ256rm, 0 },
+ { X86::VMULPSZ128rr, X86::VMULPSZ128rm, 0 },
+ { X86::VMULPSZ256rr, X86::VMULPSZ256rm, 0 },
+ { X86::VORPDZ128rr, X86::VORPDZ128rm, 0 },
+ { X86::VORPDZ256rr, X86::VORPDZ256rm, 0 },
+ { X86::VORPSZ128rr, X86::VORPSZ128rm, 0 },
+ { X86::VORPSZ256rr, X86::VORPSZ256rm, 0 },
+ { X86::VPADDBZ128rr, X86::VPADDBZ128rm, 0 },
+ { X86::VPADDBZ256rr, X86::VPADDBZ256rm, 0 },
+ { X86::VPADDDZ128rr, X86::VPADDDZ128rm, 0 },
+ { X86::VPADDDZ256rr, X86::VPADDDZ256rm, 0 },
+ { X86::VPADDQZ128rr, X86::VPADDQZ128rm, 0 },
+ { X86::VPADDQZ256rr, X86::VPADDQZ256rm, 0 },
+ { X86::VPADDSBZ128rr, X86::VPADDSBZ128rm, 0 },
+ { X86::VPADDSBZ256rr, X86::VPADDSBZ256rm, 0 },
+ { X86::VPADDSWZ128rr, X86::VPADDSWZ128rm, 0 },
+ { X86::VPADDSWZ256rr, X86::VPADDSWZ256rm, 0 },
+ { X86::VPADDUSBZ128rr, X86::VPADDUSBZ128rm, 0 },
+ { X86::VPADDUSBZ256rr, X86::VPADDUSBZ256rm, 0 },
+ { X86::VPADDUSWZ128rr, X86::VPADDUSWZ128rm, 0 },
+ { X86::VPADDUSWZ256rr, X86::VPADDUSWZ256rm, 0 },
+ { X86::VPADDWZ128rr, X86::VPADDWZ128rm, 0 },
+ { X86::VPADDWZ256rr, X86::VPADDWZ256rm, 0 },
+ { X86::VPALIGNRZ128rri, X86::VPALIGNRZ128rmi, 0 },
+ { X86::VPALIGNRZ256rri, X86::VPALIGNRZ256rmi, 0 },
+ { X86::VPANDDZ128rr, X86::VPANDDZ128rm, 0 },
+ { X86::VPANDDZ256rr, X86::VPANDDZ256rm, 0 },
+ { X86::VPANDNDZ128rr, X86::VPANDNDZ128rm, 0 },
+ { X86::VPANDNDZ256rr, X86::VPANDNDZ256rm, 0 },
+ { X86::VPANDNQZ128rr, X86::VPANDNQZ128rm, 0 },
+ { X86::VPANDNQZ256rr, X86::VPANDNQZ256rm, 0 },
+ { X86::VPANDQZ128rr, X86::VPANDQZ128rm, 0 },
+ { X86::VPANDQZ256rr, X86::VPANDQZ256rm, 0 },
+ { X86::VPCMPBZ128rri, X86::VPCMPBZ128rmi, 0 },
+ { X86::VPCMPBZ256rri, X86::VPCMPBZ256rmi, 0 },
+ { X86::VPCMPDZ128rri, X86::VPCMPDZ128rmi, 0 },
+ { X86::VPCMPDZ256rri, X86::VPCMPDZ256rmi, 0 },
+ { X86::VPCMPEQBZ128rr, X86::VPCMPEQBZ128rm, 0 },
+ { X86::VPCMPEQBZ256rr, X86::VPCMPEQBZ256rm, 0 },
+ { X86::VPCMPEQDZ128rr, X86::VPCMPEQDZ128rm, 0 },
+ { X86::VPCMPEQDZ256rr, X86::VPCMPEQDZ256rm, 0 },
+ { X86::VPCMPEQQZ128rr, X86::VPCMPEQQZ128rm, 0 },
+ { X86::VPCMPEQQZ256rr, X86::VPCMPEQQZ256rm, 0 },
+ { X86::VPCMPEQWZ128rr, X86::VPCMPEQWZ128rm, 0 },
+ { X86::VPCMPEQWZ256rr, X86::VPCMPEQWZ256rm, 0 },
+ { X86::VPCMPGTBZ128rr, X86::VPCMPGTBZ128rm, 0 },
+ { X86::VPCMPGTBZ256rr, X86::VPCMPGTBZ256rm, 0 },
+ { X86::VPCMPGTDZ128rr, X86::VPCMPGTDZ128rm, 0 },
+ { X86::VPCMPGTDZ256rr, X86::VPCMPGTDZ256rm, 0 },
+ { X86::VPCMPGTQZ128rr, X86::VPCMPGTQZ128rm, 0 },
+ { X86::VPCMPGTQZ256rr, X86::VPCMPGTQZ256rm, 0 },
+ { X86::VPCMPGTWZ128rr, X86::VPCMPGTWZ128rm, 0 },
+ { X86::VPCMPGTWZ256rr, X86::VPCMPGTWZ256rm, 0 },
+ { X86::VPCMPQZ128rri, X86::VPCMPQZ128rmi, 0 },
+ { X86::VPCMPQZ256rri, X86::VPCMPQZ256rmi, 0 },
+ { X86::VPCMPUBZ128rri, X86::VPCMPUBZ128rmi, 0 },
+ { X86::VPCMPUBZ256rri, X86::VPCMPUBZ256rmi, 0 },
+ { X86::VPCMPUDZ128rri, X86::VPCMPUDZ128rmi, 0 },
+ { X86::VPCMPUDZ256rri, X86::VPCMPUDZ256rmi, 0 },
+ { X86::VPCMPUQZ128rri, X86::VPCMPUQZ128rmi, 0 },
+ { X86::VPCMPUQZ256rri, X86::VPCMPUQZ256rmi, 0 },
+ { X86::VPCMPUWZ128rri, X86::VPCMPUWZ128rmi, 0 },
+ { X86::VPCMPUWZ256rri, X86::VPCMPUWZ256rmi, 0 },
+ { X86::VPCMPWZ128rri, X86::VPCMPWZ128rmi, 0 },
+ { X86::VPCMPWZ256rri, X86::VPCMPWZ256rmi, 0 },
+ { X86::VPERMBZ128rr, X86::VPERMBZ128rm, 0 },
+ { X86::VPERMBZ256rr, X86::VPERMBZ256rm, 0 },
+ { X86::VPERMDZ256rr, X86::VPERMDZ256rm, 0 },
+ { X86::VPERMILPDZ128rr, X86::VPERMILPDZ128rm, 0 },
+ { X86::VPERMILPDZ256rr, X86::VPERMILPDZ256rm, 0 },
+ { X86::VPERMILPSZ128rr, X86::VPERMILPSZ128rm, 0 },
+ { X86::VPERMILPSZ256rr, X86::VPERMILPSZ256rm, 0 },
+ { X86::VPERMPDZ256rr, X86::VPERMPDZ256rm, 0 },
+ { X86::VPERMPSZ256rr, X86::VPERMPSZ256rm, 0 },
+ { X86::VPERMQZ256rr, X86::VPERMQZ256rm, 0 },
+ { X86::VPERMWZ128rr, X86::VPERMWZ128rm, 0 },
+ { X86::VPERMWZ256rr, X86::VPERMWZ256rm, 0 },
+ { X86::VPMADDUBSWZ128rr, X86::VPMADDUBSWZ128rm, 0 },
+ { X86::VPMADDUBSWZ256rr, X86::VPMADDUBSWZ256rm, 0 },
+ { X86::VPMADDWDZ128rr, X86::VPMADDWDZ128rm, 0 },
+ { X86::VPMADDWDZ256rr, X86::VPMADDWDZ256rm, 0 },
+ { X86::VPORDZ128rr, X86::VPORDZ128rm, 0 },
+ { X86::VPORDZ256rr, X86::VPORDZ256rm, 0 },
+ { X86::VPORQZ128rr, X86::VPORQZ128rm, 0 },
+ { X86::VPORQZ256rr, X86::VPORQZ256rm, 0 },
+ { X86::VPSHUFBZ128rr, X86::VPSHUFBZ128rm, 0 },
+ { X86::VPSHUFBZ256rr, X86::VPSHUFBZ256rm, 0 },
+ { X86::VPSUBBZ128rr, X86::VPSUBBZ128rm, 0 },
+ { X86::VPSUBBZ256rr, X86::VPSUBBZ256rm, 0 },
+ { X86::VPSUBDZ128rr, X86::VPSUBDZ128rm, 0 },
+ { X86::VPSUBDZ256rr, X86::VPSUBDZ256rm, 0 },
+ { X86::VPSUBQZ128rr, X86::VPSUBQZ128rm, 0 },
+ { X86::VPSUBQZ256rr, X86::VPSUBQZ256rm, 0 },
+ { X86::VPSUBSBZ128rr, X86::VPSUBSBZ128rm, 0 },
+ { X86::VPSUBSBZ256rr, X86::VPSUBSBZ256rm, 0 },
+ { X86::VPSUBSWZ128rr, X86::VPSUBSWZ128rm, 0 },
+ { X86::VPSUBSWZ256rr, X86::VPSUBSWZ256rm, 0 },
+ { X86::VPSUBUSBZ128rr, X86::VPSUBUSBZ128rm, 0 },
+ { X86::VPSUBUSBZ256rr, X86::VPSUBUSBZ256rm, 0 },
+ { X86::VPSUBUSWZ128rr, X86::VPSUBUSWZ128rm, 0 },
+ { X86::VPSUBUSWZ256rr, X86::VPSUBUSWZ256rm, 0 },
+ { X86::VPSUBWZ128rr, X86::VPSUBWZ128rm, 0 },
+ { X86::VPSUBWZ256rr, X86::VPSUBWZ256rm, 0 },
+ { X86::VPUNPCKHBWZ128rr, X86::VPUNPCKHBWZ128rm, 0 },
+ { X86::VPUNPCKHBWZ256rr, X86::VPUNPCKHBWZ256rm, 0 },
+ { X86::VPUNPCKHDQZ128rr, X86::VPUNPCKHDQZ128rm, 0 },
+ { X86::VPUNPCKHDQZ256rr, X86::VPUNPCKHDQZ256rm, 0 },
+ { X86::VPUNPCKHQDQZ128rr, X86::VPUNPCKHQDQZ128rm, 0 },
+ { X86::VPUNPCKHQDQZ256rr, X86::VPUNPCKHQDQZ256rm, 0 },
+ { X86::VPUNPCKHWDZ128rr, X86::VPUNPCKHWDZ128rm, 0 },
+ { X86::VPUNPCKHWDZ256rr, X86::VPUNPCKHWDZ256rm, 0 },
+ { X86::VPUNPCKLBWZ128rr, X86::VPUNPCKLBWZ128rm, 0 },
+ { X86::VPUNPCKLBWZ256rr, X86::VPUNPCKLBWZ256rm, 0 },
+ { X86::VPUNPCKLDQZ128rr, X86::VPUNPCKLDQZ128rm, 0 },
+ { X86::VPUNPCKLDQZ256rr, X86::VPUNPCKLDQZ256rm, 0 },
+ { X86::VPUNPCKLQDQZ128rr, X86::VPUNPCKLQDQZ128rm, 0 },
+ { X86::VPUNPCKLQDQZ256rr, X86::VPUNPCKLQDQZ256rm, 0 },
+ { X86::VPUNPCKLWDZ128rr, X86::VPUNPCKLWDZ128rm, 0 },
+ { X86::VPUNPCKLWDZ256rr, X86::VPUNPCKLWDZ256rm, 0 },
+ { X86::VPXORDZ128rr, X86::VPXORDZ128rm, 0 },
+ { X86::VPXORDZ256rr, X86::VPXORDZ256rm, 0 },
+ { X86::VPXORQZ128rr, X86::VPXORQZ128rm, 0 },
+ { X86::VPXORQZ256rr, X86::VPXORQZ256rm, 0 },
+ { X86::VSUBPDZ128rr, X86::VSUBPDZ128rm, 0 },
+ { X86::VSUBPDZ256rr, X86::VSUBPDZ256rm, 0 },
+ { X86::VSUBPSZ128rr, X86::VSUBPSZ128rm, 0 },
+ { X86::VSUBPSZ256rr, X86::VSUBPSZ256rm, 0 },
+ { X86::VUNPCKHPDZ128rr, X86::VUNPCKHPDZ128rm, 0 },
+ { X86::VUNPCKHPDZ256rr, X86::VUNPCKHPDZ256rm, 0 },
+ { X86::VUNPCKHPSZ128rr, X86::VUNPCKHPSZ128rm, 0 },
+ { X86::VUNPCKHPSZ256rr, X86::VUNPCKHPSZ256rm, 0 },
+ { X86::VUNPCKLPDZ128rr, X86::VUNPCKLPDZ128rm, 0 },
+ { X86::VUNPCKLPDZ256rr, X86::VUNPCKLPDZ256rm, 0 },
+ { X86::VUNPCKLPSZ128rr, X86::VUNPCKLPSZ128rm, 0 },
+ { X86::VUNPCKLPSZ256rr, X86::VUNPCKLPSZ256rm, 0 },
+ { X86::VXORPDZ128rr, X86::VXORPDZ128rm, 0 },
+ { X86::VXORPDZ256rr, X86::VXORPDZ256rm, 0 },
+ { X86::VXORPSZ128rr, X86::VXORPSZ128rm, 0 },
+ { X86::VXORPSZ256rr, X86::VXORPSZ256rm, 0 },
+
+ // AVX-512 masked foldable instructions
+ { X86::VPERMILPDZrikz, X86::VPERMILPDZmikz, 0 },
+ { X86::VPERMILPSZrikz, X86::VPERMILPSZmikz, 0 },
+ { X86::VPERMPDZrikz, X86::VPERMPDZmikz, 0 },
+ { X86::VPERMQZrikz, X86::VPERMQZmikz, 0 },
+ { X86::VPMOVSXBDZrrkz, X86::VPMOVSXBDZrmkz, 0 },
+ { X86::VPMOVSXBQZrrkz, X86::VPMOVSXBQZrmkz, TB_NO_REVERSE },
+ { X86::VPMOVSXBWZrrkz, X86::VPMOVSXBWZrmkz, 0 },
+ { X86::VPMOVSXDQZrrkz, X86::VPMOVSXDQZrmkz, 0 },
+ { X86::VPMOVSXWDZrrkz, X86::VPMOVSXWDZrmkz, 0 },
+ { X86::VPMOVSXWQZrrkz, X86::VPMOVSXWQZrmkz, 0 },
+ { X86::VPMOVZXBDZrrkz, X86::VPMOVZXBDZrmkz, 0 },
+ { X86::VPMOVZXBQZrrkz, X86::VPMOVZXBQZrmkz, TB_NO_REVERSE },
+ { X86::VPMOVZXBWZrrkz, X86::VPMOVZXBWZrmkz, 0 },
+ { X86::VPMOVZXDQZrrkz, X86::VPMOVZXDQZrmkz, 0 },
+ { X86::VPMOVZXWDZrrkz, X86::VPMOVZXWDZrmkz, 0 },
+ { X86::VPMOVZXWQZrrkz, X86::VPMOVZXWQZrmkz, 0 },
+ { X86::VPSHUFDZrikz, X86::VPSHUFDZmikz, 0 },
+ { X86::VPSHUFHWZrikz, X86::VPSHUFHWZmikz, 0 },
+ { X86::VPSHUFLWZrikz, X86::VPSHUFLWZmikz, 0 },
+
+ // AVX-512VL 256-bit masked foldable instructions
+ { X86::VPERMILPDZ256rikz, X86::VPERMILPDZ256mikz, 0 },
+ { X86::VPERMILPSZ256rikz, X86::VPERMILPSZ256mikz, 0 },
+ { X86::VPERMPDZ256rikz, X86::VPERMPDZ256mikz, 0 },
+ { X86::VPERMQZ256rikz, X86::VPERMQZ256mikz, 0 },
+ { X86::VPMOVSXBDZ256rrkz, X86::VPMOVSXBDZ256rmkz, TB_NO_REVERSE },
+ { X86::VPMOVSXBQZ256rrkz, X86::VPMOVSXBQZ256rmkz, TB_NO_REVERSE },
+ { X86::VPMOVSXBWZ256rrkz, X86::VPMOVSXBWZ256rmkz, 0 },
+ { X86::VPMOVSXDQZ256rrkz, X86::VPMOVSXDQZ256rmkz, 0 },
+ { X86::VPMOVSXWDZ256rrkz, X86::VPMOVSXWDZ256rmkz, 0 },
+ { X86::VPMOVSXWQZ256rrkz, X86::VPMOVSXWQZ256rmkz, TB_NO_REVERSE },
+ { X86::VPMOVZXBDZ256rrkz, X86::VPMOVZXBDZ256rmkz, TB_NO_REVERSE },
+ { X86::VPMOVZXBQZ256rrkz, X86::VPMOVZXBQZ256rmkz, TB_NO_REVERSE },
+ { X86::VPMOVZXBWZ256rrkz, X86::VPMOVZXBWZ256rmkz, 0 },
+ { X86::VPMOVZXDQZ256rrkz, X86::VPMOVZXDQZ256rmkz, 0 },
+ { X86::VPMOVZXWDZ256rrkz, X86::VPMOVZXWDZ256rmkz, 0 },
+ { X86::VPMOVZXWQZ256rrkz, X86::VPMOVZXWQZ256rmkz, TB_NO_REVERSE },
+ { X86::VPSHUFDZ256rikz, X86::VPSHUFDZ256mikz, 0 },
+ { X86::VPSHUFHWZ256rikz, X86::VPSHUFHWZ256mikz, 0 },
+ { X86::VPSHUFLWZ256rikz, X86::VPSHUFLWZ256mikz, 0 },
+
+ // AVX-512VL 128-bit masked foldable instructions
+ { X86::VPERMILPDZ128rikz, X86::VPERMILPDZ128mikz, 0 },
+ { X86::VPERMILPSZ128rikz, X86::VPERMILPSZ128mikz, 0 },
+ { X86::VPMOVSXBDZ128rrkz, X86::VPMOVSXBDZ128rmkz, TB_NO_REVERSE },
+ { X86::VPMOVSXBQZ128rrkz, X86::VPMOVSXBQZ128rmkz, TB_NO_REVERSE },
+ { X86::VPMOVSXBWZ128rrkz, X86::VPMOVSXBWZ128rmkz, TB_NO_REVERSE },
+ { X86::VPMOVSXDQZ128rrkz, X86::VPMOVSXDQZ128rmkz, TB_NO_REVERSE },
+ { X86::VPMOVSXWDZ128rrkz, X86::VPMOVSXWDZ128rmkz, TB_NO_REVERSE },
+ { X86::VPMOVSXWQZ128rrkz, X86::VPMOVSXWQZ128rmkz, TB_NO_REVERSE },
+ { X86::VPMOVZXBDZ128rrkz, X86::VPMOVZXBDZ128rmkz, TB_NO_REVERSE },
+ { X86::VPMOVZXBQZ128rrkz, X86::VPMOVZXBQZ128rmkz, TB_NO_REVERSE },
+ { X86::VPMOVZXBWZ128rrkz, X86::VPMOVZXBWZ128rmkz, TB_NO_REVERSE },
+ { X86::VPMOVZXDQZ128rrkz, X86::VPMOVZXDQZ128rmkz, TB_NO_REVERSE },
+ { X86::VPMOVZXWDZ128rrkz, X86::VPMOVZXWDZ128rmkz, TB_NO_REVERSE },
+ { X86::VPMOVZXWQZ128rrkz, X86::VPMOVZXWQZ128rmkz, TB_NO_REVERSE },
+ { X86::VPSHUFDZ128rikz, X86::VPSHUFDZ128mikz, 0 },
+ { X86::VPSHUFHWZ128rikz, X86::VPSHUFHWZ128mikz, 0 },
+ { X86::VPSHUFLWZ128rikz, X86::VPSHUFLWZ128mikz, 0 },
// AES foldable instructions
{ X86::AESDECLASTrr, X86::AESDECLASTrm, TB_ALIGN_16 },
@@ -1773,170 +2218,47 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
}
static const X86MemoryFoldTableEntry MemoryFoldTable3[] = {
- // FMA foldable instructions
- { X86::VFMADDSSr231r, X86::VFMADDSSr231m, TB_ALIGN_NONE },
- { X86::VFMADDSSr231r_Int, X86::VFMADDSSr231m_Int, TB_ALIGN_NONE },
- { X86::VFMADDSDr231r, X86::VFMADDSDr231m, TB_ALIGN_NONE },
- { X86::VFMADDSDr231r_Int, X86::VFMADDSDr231m_Int, TB_ALIGN_NONE },
- { X86::VFMADDSSr132r, X86::VFMADDSSr132m, TB_ALIGN_NONE },
- { X86::VFMADDSSr132r_Int, X86::VFMADDSSr132m_Int, TB_ALIGN_NONE },
- { X86::VFMADDSDr132r, X86::VFMADDSDr132m, TB_ALIGN_NONE },
- { X86::VFMADDSDr132r_Int, X86::VFMADDSDr132m_Int, TB_ALIGN_NONE },
- { X86::VFMADDSSr213r, X86::VFMADDSSr213m, TB_ALIGN_NONE },
- { X86::VFMADDSSr213r_Int, X86::VFMADDSSr213m_Int, TB_ALIGN_NONE },
- { X86::VFMADDSDr213r, X86::VFMADDSDr213m, TB_ALIGN_NONE },
- { X86::VFMADDSDr213r_Int, X86::VFMADDSDr213m_Int, TB_ALIGN_NONE },
-
- { X86::VFMADDPSr231r, X86::VFMADDPSr231m, TB_ALIGN_NONE },
- { X86::VFMADDPDr231r, X86::VFMADDPDr231m, TB_ALIGN_NONE },
- { X86::VFMADDPSr132r, X86::VFMADDPSr132m, TB_ALIGN_NONE },
- { X86::VFMADDPDr132r, X86::VFMADDPDr132m, TB_ALIGN_NONE },
- { X86::VFMADDPSr213r, X86::VFMADDPSr213m, TB_ALIGN_NONE },
- { X86::VFMADDPDr213r, X86::VFMADDPDr213m, TB_ALIGN_NONE },
- { X86::VFMADDPSr231rY, X86::VFMADDPSr231mY, TB_ALIGN_NONE },
- { X86::VFMADDPDr231rY, X86::VFMADDPDr231mY, TB_ALIGN_NONE },
- { X86::VFMADDPSr132rY, X86::VFMADDPSr132mY, TB_ALIGN_NONE },
- { X86::VFMADDPDr132rY, X86::VFMADDPDr132mY, TB_ALIGN_NONE },
- { X86::VFMADDPSr213rY, X86::VFMADDPSr213mY, TB_ALIGN_NONE },
- { X86::VFMADDPDr213rY, X86::VFMADDPDr213mY, TB_ALIGN_NONE },
-
- { X86::VFNMADDSSr231r, X86::VFNMADDSSr231m, TB_ALIGN_NONE },
- { X86::VFNMADDSSr231r_Int, X86::VFNMADDSSr231m_Int, TB_ALIGN_NONE },
- { X86::VFNMADDSDr231r, X86::VFNMADDSDr231m, TB_ALIGN_NONE },
- { X86::VFNMADDSDr231r_Int, X86::VFNMADDSDr231m_Int, TB_ALIGN_NONE },
- { X86::VFNMADDSSr132r, X86::VFNMADDSSr132m, TB_ALIGN_NONE },
- { X86::VFNMADDSSr132r_Int, X86::VFNMADDSSr132m_Int, TB_ALIGN_NONE },
- { X86::VFNMADDSDr132r, X86::VFNMADDSDr132m, TB_ALIGN_NONE },
- { X86::VFNMADDSDr132r_Int, X86::VFNMADDSDr132m_Int, TB_ALIGN_NONE },
- { X86::VFNMADDSSr213r, X86::VFNMADDSSr213m, TB_ALIGN_NONE },
- { X86::VFNMADDSSr213r_Int, X86::VFNMADDSSr213m_Int, TB_ALIGN_NONE },
- { X86::VFNMADDSDr213r, X86::VFNMADDSDr213m, TB_ALIGN_NONE },
- { X86::VFNMADDSDr213r_Int, X86::VFNMADDSDr213m_Int, TB_ALIGN_NONE },
-
- { X86::VFNMADDPSr231r, X86::VFNMADDPSr231m, TB_ALIGN_NONE },
- { X86::VFNMADDPDr231r, X86::VFNMADDPDr231m, TB_ALIGN_NONE },
- { X86::VFNMADDPSr132r, X86::VFNMADDPSr132m, TB_ALIGN_NONE },
- { X86::VFNMADDPDr132r, X86::VFNMADDPDr132m, TB_ALIGN_NONE },
- { X86::VFNMADDPSr213r, X86::VFNMADDPSr213m, TB_ALIGN_NONE },
- { X86::VFNMADDPDr213r, X86::VFNMADDPDr213m, TB_ALIGN_NONE },
- { X86::VFNMADDPSr231rY, X86::VFNMADDPSr231mY, TB_ALIGN_NONE },
- { X86::VFNMADDPDr231rY, X86::VFNMADDPDr231mY, TB_ALIGN_NONE },
- { X86::VFNMADDPSr132rY, X86::VFNMADDPSr132mY, TB_ALIGN_NONE },
- { X86::VFNMADDPDr132rY, X86::VFNMADDPDr132mY, TB_ALIGN_NONE },
- { X86::VFNMADDPSr213rY, X86::VFNMADDPSr213mY, TB_ALIGN_NONE },
- { X86::VFNMADDPDr213rY, X86::VFNMADDPDr213mY, TB_ALIGN_NONE },
-
- { X86::VFMSUBSSr231r, X86::VFMSUBSSr231m, TB_ALIGN_NONE },
- { X86::VFMSUBSSr231r_Int, X86::VFMSUBSSr231m_Int, TB_ALIGN_NONE },
- { X86::VFMSUBSDr231r, X86::VFMSUBSDr231m, TB_ALIGN_NONE },
- { X86::VFMSUBSDr231r_Int, X86::VFMSUBSDr231m_Int, TB_ALIGN_NONE },
- { X86::VFMSUBSSr132r, X86::VFMSUBSSr132m, TB_ALIGN_NONE },
- { X86::VFMSUBSSr132r_Int, X86::VFMSUBSSr132m_Int, TB_ALIGN_NONE },
- { X86::VFMSUBSDr132r, X86::VFMSUBSDr132m, TB_ALIGN_NONE },
- { X86::VFMSUBSDr132r_Int, X86::VFMSUBSDr132m_Int, TB_ALIGN_NONE },
- { X86::VFMSUBSSr213r, X86::VFMSUBSSr213m, TB_ALIGN_NONE },
- { X86::VFMSUBSSr213r_Int, X86::VFMSUBSSr213m_Int, TB_ALIGN_NONE },
- { X86::VFMSUBSDr213r, X86::VFMSUBSDr213m, TB_ALIGN_NONE },
- { X86::VFMSUBSDr213r_Int, X86::VFMSUBSDr213m_Int, TB_ALIGN_NONE },
-
- { X86::VFMSUBPSr231r, X86::VFMSUBPSr231m, TB_ALIGN_NONE },
- { X86::VFMSUBPDr231r, X86::VFMSUBPDr231m, TB_ALIGN_NONE },
- { X86::VFMSUBPSr132r, X86::VFMSUBPSr132m, TB_ALIGN_NONE },
- { X86::VFMSUBPDr132r, X86::VFMSUBPDr132m, TB_ALIGN_NONE },
- { X86::VFMSUBPSr213r, X86::VFMSUBPSr213m, TB_ALIGN_NONE },
- { X86::VFMSUBPDr213r, X86::VFMSUBPDr213m, TB_ALIGN_NONE },
- { X86::VFMSUBPSr231rY, X86::VFMSUBPSr231mY, TB_ALIGN_NONE },
- { X86::VFMSUBPDr231rY, X86::VFMSUBPDr231mY, TB_ALIGN_NONE },
- { X86::VFMSUBPSr132rY, X86::VFMSUBPSr132mY, TB_ALIGN_NONE },
- { X86::VFMSUBPDr132rY, X86::VFMSUBPDr132mY, TB_ALIGN_NONE },
- { X86::VFMSUBPSr213rY, X86::VFMSUBPSr213mY, TB_ALIGN_NONE },
- { X86::VFMSUBPDr213rY, X86::VFMSUBPDr213mY, TB_ALIGN_NONE },
-
- { X86::VFNMSUBSSr231r, X86::VFNMSUBSSr231m, TB_ALIGN_NONE },
- { X86::VFNMSUBSSr231r_Int, X86::VFNMSUBSSr231m_Int, TB_ALIGN_NONE },
- { X86::VFNMSUBSDr231r, X86::VFNMSUBSDr231m, TB_ALIGN_NONE },
- { X86::VFNMSUBSDr231r_Int, X86::VFNMSUBSDr231m_Int, TB_ALIGN_NONE },
- { X86::VFNMSUBSSr132r, X86::VFNMSUBSSr132m, TB_ALIGN_NONE },
- { X86::VFNMSUBSSr132r_Int, X86::VFNMSUBSSr132m_Int, TB_ALIGN_NONE },
- { X86::VFNMSUBSDr132r, X86::VFNMSUBSDr132m, TB_ALIGN_NONE },
- { X86::VFNMSUBSDr132r_Int, X86::VFNMSUBSDr132m_Int, TB_ALIGN_NONE },
- { X86::VFNMSUBSSr213r, X86::VFNMSUBSSr213m, TB_ALIGN_NONE },
- { X86::VFNMSUBSSr213r_Int, X86::VFNMSUBSSr213m_Int, TB_ALIGN_NONE },
- { X86::VFNMSUBSDr213r, X86::VFNMSUBSDr213m, TB_ALIGN_NONE },
- { X86::VFNMSUBSDr213r_Int, X86::VFNMSUBSDr213m_Int, TB_ALIGN_NONE },
-
- { X86::VFNMSUBPSr231r, X86::VFNMSUBPSr231m, TB_ALIGN_NONE },
- { X86::VFNMSUBPDr231r, X86::VFNMSUBPDr231m, TB_ALIGN_NONE },
- { X86::VFNMSUBPSr132r, X86::VFNMSUBPSr132m, TB_ALIGN_NONE },
- { X86::VFNMSUBPDr132r, X86::VFNMSUBPDr132m, TB_ALIGN_NONE },
- { X86::VFNMSUBPSr213r, X86::VFNMSUBPSr213m, TB_ALIGN_NONE },
- { X86::VFNMSUBPDr213r, X86::VFNMSUBPDr213m, TB_ALIGN_NONE },
- { X86::VFNMSUBPSr231rY, X86::VFNMSUBPSr231mY, TB_ALIGN_NONE },
- { X86::VFNMSUBPDr231rY, X86::VFNMSUBPDr231mY, TB_ALIGN_NONE },
- { X86::VFNMSUBPSr132rY, X86::VFNMSUBPSr132mY, TB_ALIGN_NONE },
- { X86::VFNMSUBPDr132rY, X86::VFNMSUBPDr132mY, TB_ALIGN_NONE },
- { X86::VFNMSUBPSr213rY, X86::VFNMSUBPSr213mY, TB_ALIGN_NONE },
- { X86::VFNMSUBPDr213rY, X86::VFNMSUBPDr213mY, TB_ALIGN_NONE },
-
- { X86::VFMADDSUBPSr231r, X86::VFMADDSUBPSr231m, TB_ALIGN_NONE },
- { X86::VFMADDSUBPDr231r, X86::VFMADDSUBPDr231m, TB_ALIGN_NONE },
- { X86::VFMADDSUBPSr132r, X86::VFMADDSUBPSr132m, TB_ALIGN_NONE },
- { X86::VFMADDSUBPDr132r, X86::VFMADDSUBPDr132m, TB_ALIGN_NONE },
- { X86::VFMADDSUBPSr213r, X86::VFMADDSUBPSr213m, TB_ALIGN_NONE },
- { X86::VFMADDSUBPDr213r, X86::VFMADDSUBPDr213m, TB_ALIGN_NONE },
- { X86::VFMADDSUBPSr231rY, X86::VFMADDSUBPSr231mY, TB_ALIGN_NONE },
- { X86::VFMADDSUBPDr231rY, X86::VFMADDSUBPDr231mY, TB_ALIGN_NONE },
- { X86::VFMADDSUBPSr132rY, X86::VFMADDSUBPSr132mY, TB_ALIGN_NONE },
- { X86::VFMADDSUBPDr132rY, X86::VFMADDSUBPDr132mY, TB_ALIGN_NONE },
- { X86::VFMADDSUBPSr213rY, X86::VFMADDSUBPSr213mY, TB_ALIGN_NONE },
- { X86::VFMADDSUBPDr213rY, X86::VFMADDSUBPDr213mY, TB_ALIGN_NONE },
-
- { X86::VFMSUBADDPSr231r, X86::VFMSUBADDPSr231m, TB_ALIGN_NONE },
- { X86::VFMSUBADDPDr231r, X86::VFMSUBADDPDr231m, TB_ALIGN_NONE },
- { X86::VFMSUBADDPSr132r, X86::VFMSUBADDPSr132m, TB_ALIGN_NONE },
- { X86::VFMSUBADDPDr132r, X86::VFMSUBADDPDr132m, TB_ALIGN_NONE },
- { X86::VFMSUBADDPSr213r, X86::VFMSUBADDPSr213m, TB_ALIGN_NONE },
- { X86::VFMSUBADDPDr213r, X86::VFMSUBADDPDr213m, TB_ALIGN_NONE },
- { X86::VFMSUBADDPSr231rY, X86::VFMSUBADDPSr231mY, TB_ALIGN_NONE },
- { X86::VFMSUBADDPDr231rY, X86::VFMSUBADDPDr231mY, TB_ALIGN_NONE },
- { X86::VFMSUBADDPSr132rY, X86::VFMSUBADDPSr132mY, TB_ALIGN_NONE },
- { X86::VFMSUBADDPDr132rY, X86::VFMSUBADDPDr132mY, TB_ALIGN_NONE },
- { X86::VFMSUBADDPSr213rY, X86::VFMSUBADDPSr213mY, TB_ALIGN_NONE },
- { X86::VFMSUBADDPDr213rY, X86::VFMSUBADDPDr213mY, TB_ALIGN_NONE },
-
// FMA4 foldable patterns
{ X86::VFMADDSS4rr, X86::VFMADDSS4rm, TB_ALIGN_NONE },
+ { X86::VFMADDSS4rr_Int, X86::VFMADDSS4rm_Int, TB_NO_REVERSE },
{ X86::VFMADDSD4rr, X86::VFMADDSD4rm, TB_ALIGN_NONE },
+ { X86::VFMADDSD4rr_Int, X86::VFMADDSD4rm_Int, TB_NO_REVERSE },
{ X86::VFMADDPS4rr, X86::VFMADDPS4rm, TB_ALIGN_NONE },
{ X86::VFMADDPD4rr, X86::VFMADDPD4rm, TB_ALIGN_NONE },
- { X86::VFMADDPS4rrY, X86::VFMADDPS4rmY, TB_ALIGN_NONE },
- { X86::VFMADDPD4rrY, X86::VFMADDPD4rmY, TB_ALIGN_NONE },
+ { X86::VFMADDPS4Yrr, X86::VFMADDPS4Yrm, TB_ALIGN_NONE },
+ { X86::VFMADDPD4Yrr, X86::VFMADDPD4Yrm, TB_ALIGN_NONE },
{ X86::VFNMADDSS4rr, X86::VFNMADDSS4rm, TB_ALIGN_NONE },
+ { X86::VFNMADDSS4rr_Int, X86::VFNMADDSS4rm_Int, TB_NO_REVERSE },
{ X86::VFNMADDSD4rr, X86::VFNMADDSD4rm, TB_ALIGN_NONE },
+ { X86::VFNMADDSD4rr_Int, X86::VFNMADDSD4rm_Int, TB_NO_REVERSE },
{ X86::VFNMADDPS4rr, X86::VFNMADDPS4rm, TB_ALIGN_NONE },
{ X86::VFNMADDPD4rr, X86::VFNMADDPD4rm, TB_ALIGN_NONE },
- { X86::VFNMADDPS4rrY, X86::VFNMADDPS4rmY, TB_ALIGN_NONE },
- { X86::VFNMADDPD4rrY, X86::VFNMADDPD4rmY, TB_ALIGN_NONE },
+ { X86::VFNMADDPS4Yrr, X86::VFNMADDPS4Yrm, TB_ALIGN_NONE },
+ { X86::VFNMADDPD4Yrr, X86::VFNMADDPD4Yrm, TB_ALIGN_NONE },
{ X86::VFMSUBSS4rr, X86::VFMSUBSS4rm, TB_ALIGN_NONE },
+ { X86::VFMSUBSS4rr_Int, X86::VFMSUBSS4rm_Int, TB_NO_REVERSE },
{ X86::VFMSUBSD4rr, X86::VFMSUBSD4rm, TB_ALIGN_NONE },
+ { X86::VFMSUBSD4rr_Int, X86::VFMSUBSD4rm_Int, TB_NO_REVERSE },
{ X86::VFMSUBPS4rr, X86::VFMSUBPS4rm, TB_ALIGN_NONE },
{ X86::VFMSUBPD4rr, X86::VFMSUBPD4rm, TB_ALIGN_NONE },
- { X86::VFMSUBPS4rrY, X86::VFMSUBPS4rmY, TB_ALIGN_NONE },
- { X86::VFMSUBPD4rrY, X86::VFMSUBPD4rmY, TB_ALIGN_NONE },
+ { X86::VFMSUBPS4Yrr, X86::VFMSUBPS4Yrm, TB_ALIGN_NONE },
+ { X86::VFMSUBPD4Yrr, X86::VFMSUBPD4Yrm, TB_ALIGN_NONE },
{ X86::VFNMSUBSS4rr, X86::VFNMSUBSS4rm, TB_ALIGN_NONE },
+ { X86::VFNMSUBSS4rr_Int, X86::VFNMSUBSS4rm_Int, TB_NO_REVERSE },
{ X86::VFNMSUBSD4rr, X86::VFNMSUBSD4rm, TB_ALIGN_NONE },
+ { X86::VFNMSUBSD4rr_Int, X86::VFNMSUBSD4rm_Int, TB_NO_REVERSE },
{ X86::VFNMSUBPS4rr, X86::VFNMSUBPS4rm, TB_ALIGN_NONE },
{ X86::VFNMSUBPD4rr, X86::VFNMSUBPD4rm, TB_ALIGN_NONE },
- { X86::VFNMSUBPS4rrY, X86::VFNMSUBPS4rmY, TB_ALIGN_NONE },
- { X86::VFNMSUBPD4rrY, X86::VFNMSUBPD4rmY, TB_ALIGN_NONE },
+ { X86::VFNMSUBPS4Yrr, X86::VFNMSUBPS4Yrm, TB_ALIGN_NONE },
+ { X86::VFNMSUBPD4Yrr, X86::VFNMSUBPD4Yrm, TB_ALIGN_NONE },
{ X86::VFMADDSUBPS4rr, X86::VFMADDSUBPS4rm, TB_ALIGN_NONE },
{ X86::VFMADDSUBPD4rr, X86::VFMADDSUBPD4rm, TB_ALIGN_NONE },
- { X86::VFMADDSUBPS4rrY, X86::VFMADDSUBPS4rmY, TB_ALIGN_NONE },
- { X86::VFMADDSUBPD4rrY, X86::VFMADDSUBPD4rmY, TB_ALIGN_NONE },
+ { X86::VFMADDSUBPS4Yrr, X86::VFMADDSUBPS4Yrm, TB_ALIGN_NONE },
+ { X86::VFMADDSUBPD4Yrr, X86::VFMADDSUBPD4Yrm, TB_ALIGN_NONE },
{ X86::VFMSUBADDPS4rr, X86::VFMSUBADDPS4rm, TB_ALIGN_NONE },
{ X86::VFMSUBADDPD4rr, X86::VFMSUBADDPD4rm, TB_ALIGN_NONE },
- { X86::VFMSUBADDPS4rrY, X86::VFMSUBADDPS4rmY, TB_ALIGN_NONE },
- { X86::VFMSUBADDPD4rrY, X86::VFMSUBADDPD4rmY, TB_ALIGN_NONE },
+ { X86::VFMSUBADDPS4Yrr, X86::VFMSUBADDPS4Yrm, TB_ALIGN_NONE },
+ { X86::VFMSUBADDPD4Yrr, X86::VFMSUBADDPD4Yrm, TB_ALIGN_NONE },
// XOP foldable instructions
{ X86::VPCMOVrrr, X86::VPCMOVrrm, 0 },
@@ -1947,11 +2269,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPERMIL2PSrrY, X86::VPERMIL2PSrmY, 0 },
{ X86::VPPERMrrr, X86::VPPERMrrm, 0 },
- // AVX-512 VPERMI instructions with 3 source operands.
- { X86::VPERMI2Drr, X86::VPERMI2Drm, 0 },
- { X86::VPERMI2Qrr, X86::VPERMI2Qrm, 0 },
- { X86::VPERMI2PSrr, X86::VPERMI2PSrm, 0 },
- { X86::VPERMI2PDrr, X86::VPERMI2PDrm, 0 },
+ // AVX-512 instructions with 3 source operands.
{ X86::VBLENDMPDZrr, X86::VBLENDMPDZrm, 0 },
{ X86::VBLENDMPSZrr, X86::VBLENDMPSZrm, 0 },
{ X86::VPBLENDMDZrr, X86::VPBLENDMDZrm, 0 },
@@ -1961,45 +2279,349 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VBROADCASTSSZ256rk, X86::VBROADCASTSSZ256mk, TB_NO_REVERSE },
{ X86::VBROADCASTSDZ256rk, X86::VBROADCASTSDZ256mk, TB_NO_REVERSE },
{ X86::VBROADCASTSSZ128rk, X86::VBROADCASTSSZ128mk, TB_NO_REVERSE },
- // AVX-512 arithmetic instructions
- { X86::VADDPSZrrkz, X86::VADDPSZrmkz, 0 },
+ { X86::VPERMI2Brr, X86::VPERMI2Brm, 0 },
+ { X86::VPERMI2Drr, X86::VPERMI2Drm, 0 },
+ { X86::VPERMI2PSrr, X86::VPERMI2PSrm, 0 },
+ { X86::VPERMI2PDrr, X86::VPERMI2PDrm, 0 },
+ { X86::VPERMI2Qrr, X86::VPERMI2Qrm, 0 },
+ { X86::VPERMI2Wrr, X86::VPERMI2Wrm, 0 },
+ { X86::VPERMT2Brr, X86::VPERMT2Brm, 0 },
+ { X86::VPERMT2Drr, X86::VPERMT2Drm, 0 },
+ { X86::VPERMT2PSrr, X86::VPERMT2PSrm, 0 },
+ { X86::VPERMT2PDrr, X86::VPERMT2PDrm, 0 },
+ { X86::VPERMT2Qrr, X86::VPERMT2Qrm, 0 },
+ { X86::VPERMT2Wrr, X86::VPERMT2Wrm, 0 },
+ { X86::VPTERNLOGDZrri, X86::VPTERNLOGDZrmi, 0 },
+ { X86::VPTERNLOGQZrri, X86::VPTERNLOGQZrmi, 0 },
+
+ // AVX-512VL 256-bit instructions with 3 source operands.
+ { X86::VPERMI2B256rr, X86::VPERMI2B256rm, 0 },
+ { X86::VPERMI2D256rr, X86::VPERMI2D256rm, 0 },
+ { X86::VPERMI2PD256rr, X86::VPERMI2PD256rm, 0 },
+ { X86::VPERMI2PS256rr, X86::VPERMI2PS256rm, 0 },
+ { X86::VPERMI2Q256rr, X86::VPERMI2Q256rm, 0 },
+ { X86::VPERMI2W256rr, X86::VPERMI2W256rm, 0 },
+ { X86::VPERMT2B256rr, X86::VPERMT2B256rm, 0 },
+ { X86::VPERMT2D256rr, X86::VPERMT2D256rm, 0 },
+ { X86::VPERMT2PD256rr, X86::VPERMT2PD256rm, 0 },
+ { X86::VPERMT2PS256rr, X86::VPERMT2PS256rm, 0 },
+ { X86::VPERMT2Q256rr, X86::VPERMT2Q256rm, 0 },
+ { X86::VPERMT2W256rr, X86::VPERMT2W256rm, 0 },
+ { X86::VPTERNLOGDZ256rri, X86::VPTERNLOGDZ256rmi, 0 },
+ { X86::VPTERNLOGQZ256rri, X86::VPTERNLOGQZ256rmi, 0 },
+
+ // AVX-512VL 128-bit instructions with 3 source operands.
+ { X86::VPERMI2B128rr, X86::VPERMI2B128rm, 0 },
+ { X86::VPERMI2D128rr, X86::VPERMI2D128rm, 0 },
+ { X86::VPERMI2PD128rr, X86::VPERMI2PD128rm, 0 },
+ { X86::VPERMI2PS128rr, X86::VPERMI2PS128rm, 0 },
+ { X86::VPERMI2Q128rr, X86::VPERMI2Q128rm, 0 },
+ { X86::VPERMI2W128rr, X86::VPERMI2W128rm, 0 },
+ { X86::VPERMT2B128rr, X86::VPERMT2B128rm, 0 },
+ { X86::VPERMT2D128rr, X86::VPERMT2D128rm, 0 },
+ { X86::VPERMT2PD128rr, X86::VPERMT2PD128rm, 0 },
+ { X86::VPERMT2PS128rr, X86::VPERMT2PS128rm, 0 },
+ { X86::VPERMT2Q128rr, X86::VPERMT2Q128rm, 0 },
+ { X86::VPERMT2W128rr, X86::VPERMT2W128rm, 0 },
+ { X86::VPTERNLOGDZ128rri, X86::VPTERNLOGDZ128rmi, 0 },
+ { X86::VPTERNLOGQZ128rri, X86::VPTERNLOGQZ128rmi, 0 },
+
+ // AVX-512 masked instructions
{ X86::VADDPDZrrkz, X86::VADDPDZrmkz, 0 },
- { X86::VSUBPSZrrkz, X86::VSUBPSZrmkz, 0 },
- { X86::VSUBPDZrrkz, X86::VSUBPDZrmkz, 0 },
- { X86::VMULPSZrrkz, X86::VMULPSZrmkz, 0 },
- { X86::VMULPDZrrkz, X86::VMULPDZrmkz, 0 },
- { X86::VDIVPSZrrkz, X86::VDIVPSZrmkz, 0 },
+ { X86::VADDPSZrrkz, X86::VADDPSZrmkz, 0 },
+ { X86::VALIGNDZrrikz, X86::VALIGNDZrmikz, 0 },
+ { X86::VALIGNQZrrikz, X86::VALIGNQZrmikz, 0 },
+ { X86::VANDNPDZrrkz, X86::VANDNPDZrmkz, 0 },
+ { X86::VANDNPSZrrkz, X86::VANDNPSZrmkz, 0 },
+ { X86::VANDPDZrrkz, X86::VANDPDZrmkz, 0 },
+ { X86::VANDPSZrrkz, X86::VANDPSZrmkz, 0 },
{ X86::VDIVPDZrrkz, X86::VDIVPDZrmkz, 0 },
- { X86::VMINPSZrrkz, X86::VMINPSZrmkz, 0 },
- { X86::VMINPDZrrkz, X86::VMINPDZrmkz, 0 },
- { X86::VMAXPSZrrkz, X86::VMAXPSZrmkz, 0 },
+ { X86::VDIVPSZrrkz, X86::VDIVPSZrmkz, 0 },
+ { X86::VINSERTF32x4Zrrkz, X86::VINSERTF32x4Zrmkz, 0 },
+ { X86::VINSERTF32x8Zrrkz, X86::VINSERTF32x8Zrmkz, 0 },
+ { X86::VINSERTF64x2Zrrkz, X86::VINSERTF64x2Zrmkz, 0 },
+ { X86::VINSERTF64x4Zrrkz, X86::VINSERTF64x4Zrmkz, 0 },
+ { X86::VINSERTI32x4Zrrkz, X86::VINSERTI32x4Zrmkz, 0 },
+ { X86::VINSERTI32x8Zrrkz, X86::VINSERTI32x8Zrmkz, 0 },
+ { X86::VINSERTI64x2Zrrkz, X86::VINSERTI64x2Zrmkz, 0 },
+ { X86::VINSERTI64x4Zrrkz, X86::VINSERTI64x4Zrmkz, 0 },
+ { X86::VMAXCPDZrrkz, X86::VMAXCPDZrmkz, 0 },
+ { X86::VMAXCPSZrrkz, X86::VMAXCPSZrmkz, 0 },
{ X86::VMAXPDZrrkz, X86::VMAXPDZrmkz, 0 },
- // AVX-512{F,VL} arithmetic instructions 256-bit
- { X86::VADDPSZ256rrkz, X86::VADDPSZ256rmkz, 0 },
+ { X86::VMAXPSZrrkz, X86::VMAXPSZrmkz, 0 },
+ { X86::VMINCPDZrrkz, X86::VMINCPDZrmkz, 0 },
+ { X86::VMINCPSZrrkz, X86::VMINCPSZrmkz, 0 },
+ { X86::VMINPDZrrkz, X86::VMINPDZrmkz, 0 },
+ { X86::VMINPSZrrkz, X86::VMINPSZrmkz, 0 },
+ { X86::VMULPDZrrkz, X86::VMULPDZrmkz, 0 },
+ { X86::VMULPSZrrkz, X86::VMULPSZrmkz, 0 },
+ { X86::VORPDZrrkz, X86::VORPDZrmkz, 0 },
+ { X86::VORPSZrrkz, X86::VORPSZrmkz, 0 },
+ { X86::VPADDBZrrkz, X86::VPADDBZrmkz, 0 },
+ { X86::VPADDDZrrkz, X86::VPADDDZrmkz, 0 },
+ { X86::VPADDQZrrkz, X86::VPADDQZrmkz, 0 },
+ { X86::VPADDSBZrrkz, X86::VPADDSBZrmkz, 0 },
+ { X86::VPADDSWZrrkz, X86::VPADDSWZrmkz, 0 },
+ { X86::VPADDUSBZrrkz, X86::VPADDUSBZrmkz, 0 },
+ { X86::VPADDUSWZrrkz, X86::VPADDUSWZrmkz, 0 },
+ { X86::VPADDWZrrkz, X86::VPADDWZrmkz, 0 },
+ { X86::VPALIGNRZrrikz, X86::VPALIGNRZrmikz, 0 },
+ { X86::VPANDDZrrkz, X86::VPANDDZrmkz, 0 },
+ { X86::VPANDNDZrrkz, X86::VPANDNDZrmkz, 0 },
+ { X86::VPANDNQZrrkz, X86::VPANDNQZrmkz, 0 },
+ { X86::VPANDQZrrkz, X86::VPANDQZrmkz, 0 },
+ { X86::VPERMBZrrkz, X86::VPERMBZrmkz, 0 },
+ { X86::VPERMDZrrkz, X86::VPERMDZrmkz, 0 },
+ { X86::VPERMILPDZrrkz, X86::VPERMILPDZrmkz, 0 },
+ { X86::VPERMILPSZrrkz, X86::VPERMILPSZrmkz, 0 },
+ { X86::VPERMPDZrrkz, X86::VPERMPDZrmkz, 0 },
+ { X86::VPERMPSZrrkz, X86::VPERMPSZrmkz, 0 },
+ { X86::VPERMQZrrkz, X86::VPERMQZrmkz, 0 },
+ { X86::VPERMWZrrkz, X86::VPERMWZrmkz, 0 },
+ { X86::VPMADDUBSWZrrkz, X86::VPMADDUBSWZrmkz, 0 },
+ { X86::VPMADDWDZrrkz, X86::VPMADDWDZrmkz, 0 },
+ { X86::VPORDZrrkz, X86::VPORDZrmkz, 0 },
+ { X86::VPORQZrrkz, X86::VPORQZrmkz, 0 },
+ { X86::VPSHUFBZrrkz, X86::VPSHUFBZrmkz, 0 },
+ { X86::VPSUBBZrrkz, X86::VPSUBBZrmkz, 0 },
+ { X86::VPSUBDZrrkz, X86::VPSUBDZrmkz, 0 },
+ { X86::VPSUBQZrrkz, X86::VPSUBQZrmkz, 0 },
+ { X86::VPSUBSBZrrkz, X86::VPSUBSBZrmkz, 0 },
+ { X86::VPSUBSWZrrkz, X86::VPSUBSWZrmkz, 0 },
+ { X86::VPSUBUSBZrrkz, X86::VPSUBUSBZrmkz, 0 },
+ { X86::VPSUBUSWZrrkz, X86::VPSUBUSWZrmkz, 0 },
+ { X86::VPSUBWZrrkz, X86::VPSUBWZrmkz, 0 },
+ { X86::VPUNPCKHBWZrrkz, X86::VPUNPCKHBWZrmkz, 0 },
+ { X86::VPUNPCKHDQZrrkz, X86::VPUNPCKHDQZrmkz, 0 },
+ { X86::VPUNPCKHQDQZrrkz, X86::VPUNPCKHQDQZrmkz, 0 },
+ { X86::VPUNPCKHWDZrrkz, X86::VPUNPCKHWDZrmkz, 0 },
+ { X86::VPUNPCKLBWZrrkz, X86::VPUNPCKLBWZrmkz, 0 },
+ { X86::VPUNPCKLDQZrrkz, X86::VPUNPCKLDQZrmkz, 0 },
+ { X86::VPUNPCKLQDQZrrkz, X86::VPUNPCKLQDQZrmkz, 0 },
+ { X86::VPUNPCKLWDZrrkz, X86::VPUNPCKLWDZrmkz, 0 },
+ { X86::VPXORDZrrkz, X86::VPXORDZrmkz, 0 },
+ { X86::VPXORQZrrkz, X86::VPXORQZrmkz, 0 },
+ { X86::VSUBPDZrrkz, X86::VSUBPDZrmkz, 0 },
+ { X86::VSUBPSZrrkz, X86::VSUBPSZrmkz, 0 },
+ { X86::VUNPCKHPDZrrkz, X86::VUNPCKHPDZrmkz, 0 },
+ { X86::VUNPCKHPSZrrkz, X86::VUNPCKHPSZrmkz, 0 },
+ { X86::VUNPCKLPDZrrkz, X86::VUNPCKLPDZrmkz, 0 },
+ { X86::VUNPCKLPSZrrkz, X86::VUNPCKLPSZrmkz, 0 },
+ { X86::VXORPDZrrkz, X86::VXORPDZrmkz, 0 },
+ { X86::VXORPSZrrkz, X86::VXORPSZrmkz, 0 },
+
+ // AVX-512{F,VL} masked arithmetic instructions 256-bit
{ X86::VADDPDZ256rrkz, X86::VADDPDZ256rmkz, 0 },
- { X86::VSUBPSZ256rrkz, X86::VSUBPSZ256rmkz, 0 },
- { X86::VSUBPDZ256rrkz, X86::VSUBPDZ256rmkz, 0 },
- { X86::VMULPSZ256rrkz, X86::VMULPSZ256rmkz, 0 },
- { X86::VMULPDZ256rrkz, X86::VMULPDZ256rmkz, 0 },
- { X86::VDIVPSZ256rrkz, X86::VDIVPSZ256rmkz, 0 },
+ { X86::VADDPSZ256rrkz, X86::VADDPSZ256rmkz, 0 },
+ { X86::VALIGNDZ256rrikz, X86::VALIGNDZ256rmikz, 0 },
+ { X86::VALIGNQZ256rrikz, X86::VALIGNQZ256rmikz, 0 },
+ { X86::VANDNPDZ256rrkz, X86::VANDNPDZ256rmkz, 0 },
+ { X86::VANDNPSZ256rrkz, X86::VANDNPSZ256rmkz, 0 },
+ { X86::VANDPDZ256rrkz, X86::VANDPDZ256rmkz, 0 },
+ { X86::VANDPSZ256rrkz, X86::VANDPSZ256rmkz, 0 },
{ X86::VDIVPDZ256rrkz, X86::VDIVPDZ256rmkz, 0 },
- { X86::VMINPSZ256rrkz, X86::VMINPSZ256rmkz, 0 },
- { X86::VMINPDZ256rrkz, X86::VMINPDZ256rmkz, 0 },
- { X86::VMAXPSZ256rrkz, X86::VMAXPSZ256rmkz, 0 },
+ { X86::VDIVPSZ256rrkz, X86::VDIVPSZ256rmkz, 0 },
+ { X86::VINSERTF32x4Z256rrkz, X86::VINSERTF32x4Z256rmkz, 0 },
+ { X86::VINSERTF64x2Z256rrkz, X86::VINSERTF64x2Z256rmkz, 0 },
+ { X86::VINSERTI32x4Z256rrkz, X86::VINSERTI32x4Z256rmkz, 0 },
+ { X86::VINSERTI64x2Z256rrkz, X86::VINSERTI64x2Z256rmkz, 0 },
+ { X86::VMAXCPDZ256rrkz, X86::VMAXCPDZ256rmkz, 0 },
+ { X86::VMAXCPSZ256rrkz, X86::VMAXCPSZ256rmkz, 0 },
{ X86::VMAXPDZ256rrkz, X86::VMAXPDZ256rmkz, 0 },
- // AVX-512{F,VL} arithmetic instructions 128-bit
- { X86::VADDPSZ128rrkz, X86::VADDPSZ128rmkz, 0 },
+ { X86::VMAXPSZ256rrkz, X86::VMAXPSZ256rmkz, 0 },
+ { X86::VMINCPDZ256rrkz, X86::VMINCPDZ256rmkz, 0 },
+ { X86::VMINCPSZ256rrkz, X86::VMINCPSZ256rmkz, 0 },
+ { X86::VMINPDZ256rrkz, X86::VMINPDZ256rmkz, 0 },
+ { X86::VMINPSZ256rrkz, X86::VMINPSZ256rmkz, 0 },
+ { X86::VMULPDZ256rrkz, X86::VMULPDZ256rmkz, 0 },
+ { X86::VMULPSZ256rrkz, X86::VMULPSZ256rmkz, 0 },
+ { X86::VORPDZ256rrkz, X86::VORPDZ256rmkz, 0 },
+ { X86::VORPSZ256rrkz, X86::VORPSZ256rmkz, 0 },
+ { X86::VPADDBZ256rrkz, X86::VPADDBZ256rmkz, 0 },
+ { X86::VPADDDZ256rrkz, X86::VPADDDZ256rmkz, 0 },
+ { X86::VPADDQZ256rrkz, X86::VPADDQZ256rmkz, 0 },
+ { X86::VPADDSBZ256rrkz, X86::VPADDSBZ256rmkz, 0 },
+ { X86::VPADDSWZ256rrkz, X86::VPADDSWZ256rmkz, 0 },
+ { X86::VPADDUSBZ256rrkz, X86::VPADDUSBZ256rmkz, 0 },
+ { X86::VPADDUSWZ256rrkz, X86::VPADDUSWZ256rmkz, 0 },
+ { X86::VPADDWZ256rrkz, X86::VPADDWZ256rmkz, 0 },
+ { X86::VPALIGNRZ256rrikz, X86::VPALIGNRZ256rmikz, 0 },
+ { X86::VPANDDZ256rrkz, X86::VPANDDZ256rmkz, 0 },
+ { X86::VPANDNDZ256rrkz, X86::VPANDNDZ256rmkz, 0 },
+ { X86::VPANDNQZ256rrkz, X86::VPANDNQZ256rmkz, 0 },
+ { X86::VPANDQZ256rrkz, X86::VPANDQZ256rmkz, 0 },
+ { X86::VPERMBZ256rrkz, X86::VPERMBZ256rmkz, 0 },
+ { X86::VPERMDZ256rrkz, X86::VPERMDZ256rmkz, 0 },
+ { X86::VPERMILPDZ256rrkz, X86::VPERMILPDZ256rmkz, 0 },
+ { X86::VPERMILPSZ256rrkz, X86::VPERMILPSZ256rmkz, 0 },
+ { X86::VPERMPDZ256rrkz, X86::VPERMPDZ256rmkz, 0 },
+ { X86::VPERMPSZ256rrkz, X86::VPERMPSZ256rmkz, 0 },
+ { X86::VPERMQZ256rrkz, X86::VPERMQZ256rmkz, 0 },
+ { X86::VPERMWZ256rrkz, X86::VPERMWZ256rmkz, 0 },
+ { X86::VPMADDUBSWZ256rrkz, X86::VPMADDUBSWZ256rmkz, 0 },
+ { X86::VPMADDWDZ256rrkz, X86::VPMADDWDZ256rmkz, 0 },
+ { X86::VPORDZ256rrkz, X86::VPORDZ256rmkz, 0 },
+ { X86::VPORQZ256rrkz, X86::VPORQZ256rmkz, 0 },
+ { X86::VPSHUFBZ256rrkz, X86::VPSHUFBZ256rmkz, 0 },
+ { X86::VPSUBBZ256rrkz, X86::VPSUBBZ256rmkz, 0 },
+ { X86::VPSUBDZ256rrkz, X86::VPSUBDZ256rmkz, 0 },
+ { X86::VPSUBQZ256rrkz, X86::VPSUBQZ256rmkz, 0 },
+ { X86::VPSUBSBZ256rrkz, X86::VPSUBSBZ256rmkz, 0 },
+ { X86::VPSUBSWZ256rrkz, X86::VPSUBSWZ256rmkz, 0 },
+ { X86::VPSUBUSBZ256rrkz, X86::VPSUBUSBZ256rmkz, 0 },
+ { X86::VPSUBUSWZ256rrkz, X86::VPSUBUSWZ256rmkz, 0 },
+ { X86::VPSUBWZ256rrkz, X86::VPSUBWZ256rmkz, 0 },
+ { X86::VPUNPCKHBWZ256rrkz, X86::VPUNPCKHBWZ256rmkz, 0 },
+ { X86::VPUNPCKHDQZ256rrkz, X86::VPUNPCKHDQZ256rmkz, 0 },
+ { X86::VPUNPCKHQDQZ256rrkz, X86::VPUNPCKHQDQZ256rmkz, 0 },
+ { X86::VPUNPCKHWDZ256rrkz, X86::VPUNPCKHWDZ256rmkz, 0 },
+ { X86::VPUNPCKLBWZ256rrkz, X86::VPUNPCKLBWZ256rmkz, 0 },
+ { X86::VPUNPCKLDQZ256rrkz, X86::VPUNPCKLDQZ256rmkz, 0 },
+ { X86::VPUNPCKLQDQZ256rrkz, X86::VPUNPCKLQDQZ256rmkz, 0 },
+ { X86::VPUNPCKLWDZ256rrkz, X86::VPUNPCKLWDZ256rmkz, 0 },
+ { X86::VPXORDZ256rrkz, X86::VPXORDZ256rmkz, 0 },
+ { X86::VPXORQZ256rrkz, X86::VPXORQZ256rmkz, 0 },
+ { X86::VSUBPDZ256rrkz, X86::VSUBPDZ256rmkz, 0 },
+ { X86::VSUBPSZ256rrkz, X86::VSUBPSZ256rmkz, 0 },
+ { X86::VUNPCKHPDZ256rrkz, X86::VUNPCKHPDZ256rmkz, 0 },
+ { X86::VUNPCKHPSZ256rrkz, X86::VUNPCKHPSZ256rmkz, 0 },
+ { X86::VUNPCKLPDZ256rrkz, X86::VUNPCKLPDZ256rmkz, 0 },
+ { X86::VUNPCKLPSZ256rrkz, X86::VUNPCKLPSZ256rmkz, 0 },
+ { X86::VXORPDZ256rrkz, X86::VXORPDZ256rmkz, 0 },
+ { X86::VXORPSZ256rrkz, X86::VXORPSZ256rmkz, 0 },
+
+ // AVX-512{F,VL} masked arithmetic instructions 128-bit
{ X86::VADDPDZ128rrkz, X86::VADDPDZ128rmkz, 0 },
- { X86::VSUBPSZ128rrkz, X86::VSUBPSZ128rmkz, 0 },
- { X86::VSUBPDZ128rrkz, X86::VSUBPDZ128rmkz, 0 },
- { X86::VMULPSZ128rrkz, X86::VMULPSZ128rmkz, 0 },
- { X86::VMULPDZ128rrkz, X86::VMULPDZ128rmkz, 0 },
- { X86::VDIVPSZ128rrkz, X86::VDIVPSZ128rmkz, 0 },
+ { X86::VADDPSZ128rrkz, X86::VADDPSZ128rmkz, 0 },
+ { X86::VALIGNDZ128rrikz, X86::VALIGNDZ128rmikz, 0 },
+ { X86::VALIGNQZ128rrikz, X86::VALIGNQZ128rmikz, 0 },
+ { X86::VANDNPDZ128rrkz, X86::VANDNPDZ128rmkz, 0 },
+ { X86::VANDNPSZ128rrkz, X86::VANDNPSZ128rmkz, 0 },
+ { X86::VANDPDZ128rrkz, X86::VANDPDZ128rmkz, 0 },
+ { X86::VANDPSZ128rrkz, X86::VANDPSZ128rmkz, 0 },
{ X86::VDIVPDZ128rrkz, X86::VDIVPDZ128rmkz, 0 },
- { X86::VMINPSZ128rrkz, X86::VMINPSZ128rmkz, 0 },
- { X86::VMINPDZ128rrkz, X86::VMINPDZ128rmkz, 0 },
+ { X86::VDIVPSZ128rrkz, X86::VDIVPSZ128rmkz, 0 },
+ { X86::VMAXCPDZ128rrkz, X86::VMAXCPDZ128rmkz, 0 },
+ { X86::VMAXCPSZ128rrkz, X86::VMAXCPSZ128rmkz, 0 },
+ { X86::VMAXPDZ128rrkz, X86::VMAXPDZ128rmkz, 0 },
{ X86::VMAXPSZ128rrkz, X86::VMAXPSZ128rmkz, 0 },
- { X86::VMAXPDZ128rrkz, X86::VMAXPDZ128rmkz, 0 }
+ { X86::VMINCPDZ128rrkz, X86::VMINCPDZ128rmkz, 0 },
+ { X86::VMINCPSZ128rrkz, X86::VMINCPSZ128rmkz, 0 },
+ { X86::VMINPDZ128rrkz, X86::VMINPDZ128rmkz, 0 },
+ { X86::VMINPSZ128rrkz, X86::VMINPSZ128rmkz, 0 },
+ { X86::VMULPDZ128rrkz, X86::VMULPDZ128rmkz, 0 },
+ { X86::VMULPSZ128rrkz, X86::VMULPSZ128rmkz, 0 },
+ { X86::VORPDZ128rrkz, X86::VORPDZ128rmkz, 0 },
+ { X86::VORPSZ128rrkz, X86::VORPSZ128rmkz, 0 },
+ { X86::VPADDBZ128rrkz, X86::VPADDBZ128rmkz, 0 },
+ { X86::VPADDDZ128rrkz, X86::VPADDDZ128rmkz, 0 },
+ { X86::VPADDQZ128rrkz, X86::VPADDQZ128rmkz, 0 },
+ { X86::VPADDSBZ128rrkz, X86::VPADDSBZ128rmkz, 0 },
+ { X86::VPADDSWZ128rrkz, X86::VPADDSWZ128rmkz, 0 },
+ { X86::VPADDUSBZ128rrkz, X86::VPADDUSBZ128rmkz, 0 },
+ { X86::VPADDUSWZ128rrkz, X86::VPADDUSWZ128rmkz, 0 },
+ { X86::VPADDWZ128rrkz, X86::VPADDWZ128rmkz, 0 },
+ { X86::VPALIGNRZ128rrikz, X86::VPALIGNRZ128rmikz, 0 },
+ { X86::VPANDDZ128rrkz, X86::VPANDDZ128rmkz, 0 },
+ { X86::VPANDNDZ128rrkz, X86::VPANDNDZ128rmkz, 0 },
+ { X86::VPANDNQZ128rrkz, X86::VPANDNQZ128rmkz, 0 },
+ { X86::VPANDQZ128rrkz, X86::VPANDQZ128rmkz, 0 },
+ { X86::VPERMBZ128rrkz, X86::VPERMBZ128rmkz, 0 },
+ { X86::VPERMILPDZ128rrkz, X86::VPERMILPDZ128rmkz, 0 },
+ { X86::VPERMILPSZ128rrkz, X86::VPERMILPSZ128rmkz, 0 },
+ { X86::VPERMWZ128rrkz, X86::VPERMWZ128rmkz, 0 },
+ { X86::VPMADDUBSWZ128rrkz, X86::VPMADDUBSWZ128rmkz, 0 },
+ { X86::VPMADDWDZ128rrkz, X86::VPMADDWDZ128rmkz, 0 },
+ { X86::VPORDZ128rrkz, X86::VPORDZ128rmkz, 0 },
+ { X86::VPORQZ128rrkz, X86::VPORQZ128rmkz, 0 },
+ { X86::VPSHUFBZ128rrkz, X86::VPSHUFBZ128rmkz, 0 },
+ { X86::VPSUBBZ128rrkz, X86::VPSUBBZ128rmkz, 0 },
+ { X86::VPSUBDZ128rrkz, X86::VPSUBDZ128rmkz, 0 },
+ { X86::VPSUBQZ128rrkz, X86::VPSUBQZ128rmkz, 0 },
+ { X86::VPSUBSBZ128rrkz, X86::VPSUBSBZ128rmkz, 0 },
+ { X86::VPSUBSWZ128rrkz, X86::VPSUBSWZ128rmkz, 0 },
+ { X86::VPSUBUSBZ128rrkz, X86::VPSUBUSBZ128rmkz, 0 },
+ { X86::VPSUBUSWZ128rrkz, X86::VPSUBUSWZ128rmkz, 0 },
+ { X86::VPSUBWZ128rrkz, X86::VPSUBWZ128rmkz, 0 },
+ { X86::VPUNPCKHBWZ128rrkz, X86::VPUNPCKHBWZ128rmkz, 0 },
+ { X86::VPUNPCKHDQZ128rrkz, X86::VPUNPCKHDQZ128rmkz, 0 },
+ { X86::VPUNPCKHQDQZ128rrkz, X86::VPUNPCKHQDQZ128rmkz, 0 },
+ { X86::VPUNPCKHWDZ128rrkz, X86::VPUNPCKHWDZ128rmkz, 0 },
+ { X86::VPUNPCKLBWZ128rrkz, X86::VPUNPCKLBWZ128rmkz, 0 },
+ { X86::VPUNPCKLDQZ128rrkz, X86::VPUNPCKLDQZ128rmkz, 0 },
+ { X86::VPUNPCKLQDQZ128rrkz, X86::VPUNPCKLQDQZ128rmkz, 0 },
+ { X86::VPUNPCKLWDZ128rrkz, X86::VPUNPCKLWDZ128rmkz, 0 },
+ { X86::VPXORDZ128rrkz, X86::VPXORDZ128rmkz, 0 },
+ { X86::VPXORQZ128rrkz, X86::VPXORQZ128rmkz, 0 },
+ { X86::VSUBPDZ128rrkz, X86::VSUBPDZ128rmkz, 0 },
+ { X86::VSUBPSZ128rrkz, X86::VSUBPSZ128rmkz, 0 },
+ { X86::VUNPCKHPDZ128rrkz, X86::VUNPCKHPDZ128rmkz, 0 },
+ { X86::VUNPCKHPSZ128rrkz, X86::VUNPCKHPSZ128rmkz, 0 },
+ { X86::VUNPCKLPDZ128rrkz, X86::VUNPCKLPDZ128rmkz, 0 },
+ { X86::VUNPCKLPSZ128rrkz, X86::VUNPCKLPSZ128rmkz, 0 },
+ { X86::VXORPDZ128rrkz, X86::VXORPDZ128rmkz, 0 },
+ { X86::VXORPSZ128rrkz, X86::VXORPSZ128rmkz, 0 },
+
+ // AVX-512 masked foldable instructions
+ { X86::VPERMILPDZrik, X86::VPERMILPDZmik, 0 },
+ { X86::VPERMILPSZrik, X86::VPERMILPSZmik, 0 },
+ { X86::VPERMPDZrik, X86::VPERMPDZmik, 0 },
+ { X86::VPERMQZrik, X86::VPERMQZmik, 0 },
+ { X86::VPMOVSXBDZrrk, X86::VPMOVSXBDZrmk, 0 },
+ { X86::VPMOVSXBQZrrk, X86::VPMOVSXBQZrmk, TB_NO_REVERSE },
+ { X86::VPMOVSXBWZrrk, X86::VPMOVSXBWZrmk, 0 },
+ { X86::VPMOVSXDQZrrk, X86::VPMOVSXDQZrmk, 0 },
+ { X86::VPMOVSXWDZrrk, X86::VPMOVSXWDZrmk, 0 },
+ { X86::VPMOVSXWQZrrk, X86::VPMOVSXWQZrmk, 0 },
+ { X86::VPMOVZXBDZrrk, X86::VPMOVZXBDZrmk, 0 },
+ { X86::VPMOVZXBQZrrk, X86::VPMOVZXBQZrmk, TB_NO_REVERSE },
+ { X86::VPMOVZXBWZrrk, X86::VPMOVZXBWZrmk, 0 },
+ { X86::VPMOVZXDQZrrk, X86::VPMOVZXDQZrmk, 0 },
+ { X86::VPMOVZXWDZrrk, X86::VPMOVZXWDZrmk, 0 },
+ { X86::VPMOVZXWQZrrk, X86::VPMOVZXWQZrmk, 0 },
+ { X86::VPSHUFDZrik, X86::VPSHUFDZmik, 0 },
+ { X86::VPSHUFHWZrik, X86::VPSHUFHWZmik, 0 },
+ { X86::VPSHUFLWZrik, X86::VPSHUFLWZmik, 0 },
+
+ // AVX-512VL 256-bit masked foldable instructions
+ { X86::VPERMILPDZ256rik, X86::VPERMILPDZ256mik, 0 },
+ { X86::VPERMILPSZ256rik, X86::VPERMILPSZ256mik, 0 },
+ { X86::VPERMPDZ256rik, X86::VPERMPDZ256mik, 0 },
+ { X86::VPERMQZ256rik, X86::VPERMQZ256mik, 0 },
+ { X86::VPMOVSXBDZ256rrk, X86::VPMOVSXBDZ256rmk, TB_NO_REVERSE },
+ { X86::VPMOVSXBQZ256rrk, X86::VPMOVSXBQZ256rmk, TB_NO_REVERSE },
+ { X86::VPMOVSXBWZ256rrk, X86::VPMOVSXBWZ256rmk, 0 },
+ { X86::VPMOVSXDQZ256rrk, X86::VPMOVSXDQZ256rmk, 0 },
+ { X86::VPMOVSXWDZ256rrk, X86::VPMOVSXWDZ256rmk, 0 },
+ { X86::VPMOVSXWQZ256rrk, X86::VPMOVSXWQZ256rmk, TB_NO_REVERSE },
+ { X86::VPMOVZXBDZ256rrk, X86::VPMOVZXBDZ256rmk, TB_NO_REVERSE },
+ { X86::VPMOVZXBQZ256rrk, X86::VPMOVZXBQZ256rmk, TB_NO_REVERSE },
+ { X86::VPMOVZXBWZ256rrk, X86::VPMOVZXBWZ256rmk, 0 },
+ { X86::VPMOVZXDQZ256rrk, X86::VPMOVZXDQZ256rmk, 0 },
+ { X86::VPMOVZXWDZ256rrk, X86::VPMOVZXWDZ256rmk, 0 },
+ { X86::VPMOVZXWQZ256rrk, X86::VPMOVZXWQZ256rmk, TB_NO_REVERSE },
+ { X86::VPSHUFDZ256rik, X86::VPSHUFDZ256mik, 0 },
+ { X86::VPSHUFHWZ256rik, X86::VPSHUFHWZ256mik, 0 },
+ { X86::VPSHUFLWZ256rik, X86::VPSHUFLWZ256mik, 0 },
+
+ // AVX-512VL 128-bit masked foldable instructions
+ { X86::VPERMILPDZ128rik, X86::VPERMILPDZ128mik, 0 },
+ { X86::VPERMILPSZ128rik, X86::VPERMILPSZ128mik, 0 },
+ { X86::VPMOVSXBDZ128rrk, X86::VPMOVSXBDZ128rmk, TB_NO_REVERSE },
+ { X86::VPMOVSXBQZ128rrk, X86::VPMOVSXBQZ128rmk, TB_NO_REVERSE },
+ { X86::VPMOVSXBWZ128rrk, X86::VPMOVSXBWZ128rmk, TB_NO_REVERSE },
+ { X86::VPMOVSXDQZ128rrk, X86::VPMOVSXDQZ128rmk, TB_NO_REVERSE },
+ { X86::VPMOVSXWDZ128rrk, X86::VPMOVSXWDZ128rmk, TB_NO_REVERSE },
+ { X86::VPMOVSXWQZ128rrk, X86::VPMOVSXWQZ128rmk, TB_NO_REVERSE },
+ { X86::VPMOVZXBDZ128rrk, X86::VPMOVZXBDZ128rmk, TB_NO_REVERSE },
+ { X86::VPMOVZXBQZ128rrk, X86::VPMOVZXBQZ128rmk, TB_NO_REVERSE },
+ { X86::VPMOVZXBWZ128rrk, X86::VPMOVZXBWZ128rmk, TB_NO_REVERSE },
+ { X86::VPMOVZXDQZ128rrk, X86::VPMOVZXDQZ128rmk, TB_NO_REVERSE },
+ { X86::VPMOVZXWDZ128rrk, X86::VPMOVZXWDZ128rmk, TB_NO_REVERSE },
+ { X86::VPMOVZXWQZ128rrk, X86::VPMOVZXWQZ128rmk, TB_NO_REVERSE },
+ { X86::VPSHUFDZ128rik, X86::VPSHUFDZ128mik, 0 },
+ { X86::VPSHUFHWZ128rik, X86::VPSHUFHWZ128mik, 0 },
+ { X86::VPSHUFLWZ128rik, X86::VPSHUFLWZ128mik, 0 },
};
for (X86MemoryFoldTableEntry Entry : MemoryFoldTable3) {
@@ -2008,47 +2630,348 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
// Index 3, folded load
Entry.Flags | TB_INDEX_3 | TB_FOLDED_LOAD);
}
+ auto I = X86InstrFMA3Info::rm_begin();
+ auto E = X86InstrFMA3Info::rm_end();
+ for (; I != E; ++I) {
+ if (!I.getGroup()->isKMasked()) {
+ // Intrinsic forms need to pass TB_NO_REVERSE.
+ if (I.getGroup()->isIntrinsic()) {
+ AddTableEntry(RegOp2MemOpTable3, MemOp2RegOpTable,
+ I.getRegOpcode(), I.getMemOpcode(),
+ TB_ALIGN_NONE | TB_INDEX_3 | TB_FOLDED_LOAD | TB_NO_REVERSE);
+ } else {
+ AddTableEntry(RegOp2MemOpTable3, MemOp2RegOpTable,
+ I.getRegOpcode(), I.getMemOpcode(),
+ TB_ALIGN_NONE | TB_INDEX_3 | TB_FOLDED_LOAD);
+ }
+ }
+ }
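  // [Editor's note, not part of the original patch; a gloss on the tables
  // above based on the flags visible in this hunk.] The third column of each
  // row is a TB_* flag word: TB_INDEX_n plus TB_FOLDED_LOAD mark which operand
  // position receives the folded load, TB_ALIGN_16 / TB_ALIGN_NONE state the
  // required alignment of the memory operand, and TB_NO_REVERSE forbids
  // unfolding the memory form back into the register form (typically the _Int
  // intrinsic forms and broadcast/extend patterns whose memory operand is
  // narrower than the destination register). The loop above over
  // X86InstrFMA3Info replaces the hand-written VFMADD*/VFMSUB* rows deleted
  // earlier in this hunk; a second loop further down does the same for the
  // masked FMA forms in the four-operand table.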
static const X86MemoryFoldTableEntry MemoryFoldTable4[] = {
- // AVX-512 foldable instructions
- { X86::VADDPSZrrk, X86::VADDPSZrmk, 0 },
+ // AVX-512 foldable masked instructions
{ X86::VADDPDZrrk, X86::VADDPDZrmk, 0 },
- { X86::VSUBPSZrrk, X86::VSUBPSZrmk, 0 },
- { X86::VSUBPDZrrk, X86::VSUBPDZrmk, 0 },
- { X86::VMULPSZrrk, X86::VMULPSZrmk, 0 },
- { X86::VMULPDZrrk, X86::VMULPDZrmk, 0 },
- { X86::VDIVPSZrrk, X86::VDIVPSZrmk, 0 },
+ { X86::VADDPSZrrk, X86::VADDPSZrmk, 0 },
+ { X86::VALIGNDZrrik, X86::VALIGNDZrmik, 0 },
+ { X86::VALIGNQZrrik, X86::VALIGNQZrmik, 0 },
+ { X86::VANDNPDZrrk, X86::VANDNPDZrmk, 0 },
+ { X86::VANDNPSZrrk, X86::VANDNPSZrmk, 0 },
+ { X86::VANDPDZrrk, X86::VANDPDZrmk, 0 },
+ { X86::VANDPSZrrk, X86::VANDPSZrmk, 0 },
{ X86::VDIVPDZrrk, X86::VDIVPDZrmk, 0 },
- { X86::VMINPSZrrk, X86::VMINPSZrmk, 0 },
- { X86::VMINPDZrrk, X86::VMINPDZrmk, 0 },
- { X86::VMAXPSZrrk, X86::VMAXPSZrmk, 0 },
+ { X86::VDIVPSZrrk, X86::VDIVPSZrmk, 0 },
+ { X86::VINSERTF32x4Zrrk, X86::VINSERTF32x4Zrmk, 0 },
+ { X86::VINSERTF32x8Zrrk, X86::VINSERTF32x8Zrmk, 0 },
+ { X86::VINSERTF64x2Zrrk, X86::VINSERTF64x2Zrmk, 0 },
+ { X86::VINSERTF64x4Zrrk, X86::VINSERTF64x4Zrmk, 0 },
+ { X86::VINSERTI32x4Zrrk, X86::VINSERTI32x4Zrmk, 0 },
+ { X86::VINSERTI32x8Zrrk, X86::VINSERTI32x8Zrmk, 0 },
+ { X86::VINSERTI64x2Zrrk, X86::VINSERTI64x2Zrmk, 0 },
+ { X86::VINSERTI64x4Zrrk, X86::VINSERTI64x4Zrmk, 0 },
+ { X86::VMAXCPDZrrk, X86::VMAXCPDZrmk, 0 },
+ { X86::VMAXCPSZrrk, X86::VMAXCPSZrmk, 0 },
{ X86::VMAXPDZrrk, X86::VMAXPDZrmk, 0 },
- // AVX-512{F,VL} foldable instructions 256-bit
- { X86::VADDPSZ256rrk, X86::VADDPSZ256rmk, 0 },
+ { X86::VMAXPSZrrk, X86::VMAXPSZrmk, 0 },
+ { X86::VMINCPDZrrk, X86::VMINCPDZrmk, 0 },
+ { X86::VMINCPSZrrk, X86::VMINCPSZrmk, 0 },
+ { X86::VMINPDZrrk, X86::VMINPDZrmk, 0 },
+ { X86::VMINPSZrrk, X86::VMINPSZrmk, 0 },
+ { X86::VMULPDZrrk, X86::VMULPDZrmk, 0 },
+ { X86::VMULPSZrrk, X86::VMULPSZrmk, 0 },
+ { X86::VORPDZrrk, X86::VORPDZrmk, 0 },
+ { X86::VORPSZrrk, X86::VORPSZrmk, 0 },
+ { X86::VPADDBZrrk, X86::VPADDBZrmk, 0 },
+ { X86::VPADDDZrrk, X86::VPADDDZrmk, 0 },
+ { X86::VPADDQZrrk, X86::VPADDQZrmk, 0 },
+ { X86::VPADDSBZrrk, X86::VPADDSBZrmk, 0 },
+ { X86::VPADDSWZrrk, X86::VPADDSWZrmk, 0 },
+ { X86::VPADDUSBZrrk, X86::VPADDUSBZrmk, 0 },
+ { X86::VPADDUSWZrrk, X86::VPADDUSWZrmk, 0 },
+ { X86::VPADDWZrrk, X86::VPADDWZrmk, 0 },
+ { X86::VPALIGNRZrrik, X86::VPALIGNRZrmik, 0 },
+ { X86::VPANDDZrrk, X86::VPANDDZrmk, 0 },
+ { X86::VPANDNDZrrk, X86::VPANDNDZrmk, 0 },
+ { X86::VPANDNQZrrk, X86::VPANDNQZrmk, 0 },
+ { X86::VPANDQZrrk, X86::VPANDQZrmk, 0 },
+ { X86::VPERMBZrrk, X86::VPERMBZrmk, 0 },
+ { X86::VPERMDZrrk, X86::VPERMDZrmk, 0 },
+ { X86::VPERMI2Brrk, X86::VPERMI2Brmk, 0 },
+ { X86::VPERMI2Drrk, X86::VPERMI2Drmk, 0 },
+ { X86::VPERMI2PSrrk, X86::VPERMI2PSrmk, 0 },
+ { X86::VPERMI2PDrrk, X86::VPERMI2PDrmk, 0 },
+ { X86::VPERMI2Qrrk, X86::VPERMI2Qrmk, 0 },
+ { X86::VPERMI2Wrrk, X86::VPERMI2Wrmk, 0 },
+ { X86::VPERMILPDZrrk, X86::VPERMILPDZrmk, 0 },
+ { X86::VPERMILPSZrrk, X86::VPERMILPSZrmk, 0 },
+ { X86::VPERMPDZrrk, X86::VPERMPDZrmk, 0 },
+ { X86::VPERMPSZrrk, X86::VPERMPSZrmk, 0 },
+ { X86::VPERMQZrrk, X86::VPERMQZrmk, 0 },
+ { X86::VPERMT2Brrk, X86::VPERMT2Brmk, 0 },
+ { X86::VPERMT2Drrk, X86::VPERMT2Drmk, 0 },
+ { X86::VPERMT2PSrrk, X86::VPERMT2PSrmk, 0 },
+ { X86::VPERMT2PDrrk, X86::VPERMT2PDrmk, 0 },
+ { X86::VPERMT2Qrrk, X86::VPERMT2Qrmk, 0 },
+ { X86::VPERMT2Wrrk, X86::VPERMT2Wrmk, 0 },
+ { X86::VPERMWZrrk, X86::VPERMWZrmk, 0 },
+ { X86::VPMADDUBSWZrrk, X86::VPMADDUBSWZrmk, 0 },
+ { X86::VPMADDWDZrrk, X86::VPMADDWDZrmk, 0 },
+ { X86::VPORDZrrk, X86::VPORDZrmk, 0 },
+ { X86::VPORQZrrk, X86::VPORQZrmk, 0 },
+ { X86::VPSHUFBZrrk, X86::VPSHUFBZrmk, 0 },
+ { X86::VPSUBBZrrk, X86::VPSUBBZrmk, 0 },
+ { X86::VPSUBDZrrk, X86::VPSUBDZrmk, 0 },
+ { X86::VPSUBQZrrk, X86::VPSUBQZrmk, 0 },
+ { X86::VPSUBSBZrrk, X86::VPSUBSBZrmk, 0 },
+ { X86::VPSUBSWZrrk, X86::VPSUBSWZrmk, 0 },
+ { X86::VPSUBUSBZrrk, X86::VPSUBUSBZrmk, 0 },
+ { X86::VPSUBUSWZrrk, X86::VPSUBUSWZrmk, 0 },
+ { X86::VPTERNLOGDZrrik, X86::VPTERNLOGDZrmik, 0 },
+ { X86::VPTERNLOGQZrrik, X86::VPTERNLOGQZrmik, 0 },
+ { X86::VPUNPCKHBWZrrk, X86::VPUNPCKHBWZrmk, 0 },
+ { X86::VPUNPCKHDQZrrk, X86::VPUNPCKHDQZrmk, 0 },
+ { X86::VPUNPCKHQDQZrrk, X86::VPUNPCKHQDQZrmk, 0 },
+ { X86::VPUNPCKHWDZrrk, X86::VPUNPCKHWDZrmk, 0 },
+ { X86::VPUNPCKLBWZrrk, X86::VPUNPCKLBWZrmk, 0 },
+ { X86::VPUNPCKLDQZrrk, X86::VPUNPCKLDQZrmk, 0 },
+ { X86::VPUNPCKLQDQZrrk, X86::VPUNPCKLQDQZrmk, 0 },
+ { X86::VPUNPCKLWDZrrk, X86::VPUNPCKLWDZrmk, 0 },
+ { X86::VPXORDZrrk, X86::VPXORDZrmk, 0 },
+ { X86::VPXORQZrrk, X86::VPXORQZrmk, 0 },
+ { X86::VSUBPDZrrk, X86::VSUBPDZrmk, 0 },
+ { X86::VSUBPSZrrk, X86::VSUBPSZrmk, 0 },
+ { X86::VUNPCKHPDZrrk, X86::VUNPCKHPDZrmk, 0 },
+ { X86::VUNPCKHPSZrrk, X86::VUNPCKHPSZrmk, 0 },
+ { X86::VUNPCKLPDZrrk, X86::VUNPCKLPDZrmk, 0 },
+ { X86::VUNPCKLPSZrrk, X86::VUNPCKLPSZrmk, 0 },
+ { X86::VXORPDZrrk, X86::VXORPDZrmk, 0 },
+ { X86::VXORPSZrrk, X86::VXORPSZrmk, 0 },
+
+ // AVX-512{F,VL} foldable masked instructions 256-bit
{ X86::VADDPDZ256rrk, X86::VADDPDZ256rmk, 0 },
- { X86::VSUBPSZ256rrk, X86::VSUBPSZ256rmk, 0 },
- { X86::VSUBPDZ256rrk, X86::VSUBPDZ256rmk, 0 },
- { X86::VMULPSZ256rrk, X86::VMULPSZ256rmk, 0 },
- { X86::VMULPDZ256rrk, X86::VMULPDZ256rmk, 0 },
- { X86::VDIVPSZ256rrk, X86::VDIVPSZ256rmk, 0 },
+ { X86::VADDPSZ256rrk, X86::VADDPSZ256rmk, 0 },
+ { X86::VALIGNDZ256rrik, X86::VALIGNDZ256rmik, 0 },
+ { X86::VALIGNQZ256rrik, X86::VALIGNQZ256rmik, 0 },
+ { X86::VANDNPDZ256rrk, X86::VANDNPDZ256rmk, 0 },
+ { X86::VANDNPSZ256rrk, X86::VANDNPSZ256rmk, 0 },
+ { X86::VANDPDZ256rrk, X86::VANDPDZ256rmk, 0 },
+ { X86::VANDPSZ256rrk, X86::VANDPSZ256rmk, 0 },
{ X86::VDIVPDZ256rrk, X86::VDIVPDZ256rmk, 0 },
- { X86::VMINPSZ256rrk, X86::VMINPSZ256rmk, 0 },
- { X86::VMINPDZ256rrk, X86::VMINPDZ256rmk, 0 },
- { X86::VMAXPSZ256rrk, X86::VMAXPSZ256rmk, 0 },
+ { X86::VDIVPSZ256rrk, X86::VDIVPSZ256rmk, 0 },
+ { X86::VINSERTF32x4Z256rrk,X86::VINSERTF32x4Z256rmk, 0 },
+ { X86::VINSERTF64x2Z256rrk,X86::VINSERTF64x2Z256rmk, 0 },
+ { X86::VINSERTI32x4Z256rrk,X86::VINSERTI32x4Z256rmk, 0 },
+ { X86::VINSERTI64x2Z256rrk,X86::VINSERTI64x2Z256rmk, 0 },
+ { X86::VMAXCPDZ256rrk, X86::VMAXCPDZ256rmk, 0 },
+ { X86::VMAXCPSZ256rrk, X86::VMAXCPSZ256rmk, 0 },
{ X86::VMAXPDZ256rrk, X86::VMAXPDZ256rmk, 0 },
+ { X86::VMAXPSZ256rrk, X86::VMAXPSZ256rmk, 0 },
+ { X86::VMINCPDZ256rrk, X86::VMINCPDZ256rmk, 0 },
+ { X86::VMINCPSZ256rrk, X86::VMINCPSZ256rmk, 0 },
+ { X86::VMINPDZ256rrk, X86::VMINPDZ256rmk, 0 },
+ { X86::VMINPSZ256rrk, X86::VMINPSZ256rmk, 0 },
+ { X86::VMULPDZ256rrk, X86::VMULPDZ256rmk, 0 },
+ { X86::VMULPSZ256rrk, X86::VMULPSZ256rmk, 0 },
+ { X86::VORPDZ256rrk, X86::VORPDZ256rmk, 0 },
+ { X86::VORPSZ256rrk, X86::VORPSZ256rmk, 0 },
+ { X86::VPADDBZ256rrk, X86::VPADDBZ256rmk, 0 },
+ { X86::VPADDDZ256rrk, X86::VPADDDZ256rmk, 0 },
+ { X86::VPADDQZ256rrk, X86::VPADDQZ256rmk, 0 },
+ { X86::VPADDSBZ256rrk, X86::VPADDSBZ256rmk, 0 },
+ { X86::VPADDSWZ256rrk, X86::VPADDSWZ256rmk, 0 },
+ { X86::VPADDUSBZ256rrk, X86::VPADDUSBZ256rmk, 0 },
+ { X86::VPADDUSWZ256rrk, X86::VPADDUSWZ256rmk, 0 },
+ { X86::VPADDWZ256rrk, X86::VPADDWZ256rmk, 0 },
+ { X86::VPALIGNRZ256rrik, X86::VPALIGNRZ256rmik, 0 },
+ { X86::VPANDDZ256rrk, X86::VPANDDZ256rmk, 0 },
+ { X86::VPANDNDZ256rrk, X86::VPANDNDZ256rmk, 0 },
+ { X86::VPANDNQZ256rrk, X86::VPANDNQZ256rmk, 0 },
+ { X86::VPANDQZ256rrk, X86::VPANDQZ256rmk, 0 },
+ { X86::VPERMBZ256rrk, X86::VPERMBZ256rmk, 0 },
+ { X86::VPERMDZ256rrk, X86::VPERMDZ256rmk, 0 },
+ { X86::VPERMI2B256rrk, X86::VPERMI2B256rmk, 0 },
+ { X86::VPERMI2D256rrk, X86::VPERMI2D256rmk, 0 },
+ { X86::VPERMI2PD256rrk, X86::VPERMI2PD256rmk, 0 },
+ { X86::VPERMI2PS256rrk, X86::VPERMI2PS256rmk, 0 },
+ { X86::VPERMI2Q256rrk, X86::VPERMI2Q256rmk, 0 },
+ { X86::VPERMI2W256rrk, X86::VPERMI2W256rmk, 0 },
+ { X86::VPERMILPDZ256rrk, X86::VPERMILPDZ256rmk, 0 },
+ { X86::VPERMILPSZ256rrk, X86::VPERMILPSZ256rmk, 0 },
+ { X86::VPERMPDZ256rrk, X86::VPERMPDZ256rmk, 0 },
+ { X86::VPERMPSZ256rrk, X86::VPERMPSZ256rmk, 0 },
+ { X86::VPERMQZ256rrk, X86::VPERMQZ256rmk, 0 },
+ { X86::VPERMT2B256rrk, X86::VPERMT2B256rmk, 0 },
+ { X86::VPERMT2D256rrk, X86::VPERMT2D256rmk, 0 },
+ { X86::VPERMT2PD256rrk, X86::VPERMT2PD256rmk, 0 },
+ { X86::VPERMT2PS256rrk, X86::VPERMT2PS256rmk, 0 },
+ { X86::VPERMT2Q256rrk, X86::VPERMT2Q256rmk, 0 },
+ { X86::VPERMT2W256rrk, X86::VPERMT2W256rmk, 0 },
+ { X86::VPERMWZ256rrk, X86::VPERMWZ256rmk, 0 },
+ { X86::VPMADDUBSWZ256rrk, X86::VPMADDUBSWZ256rmk, 0 },
+ { X86::VPMADDWDZ256rrk, X86::VPMADDWDZ256rmk, 0 },
+ { X86::VPORDZ256rrk, X86::VPORDZ256rmk, 0 },
+ { X86::VPORQZ256rrk, X86::VPORQZ256rmk, 0 },
+ { X86::VPSHUFBZ256rrk, X86::VPSHUFBZ256rmk, 0 },
+ { X86::VPSUBBZ256rrk, X86::VPSUBBZ256rmk, 0 },
+ { X86::VPSUBDZ256rrk, X86::VPSUBDZ256rmk, 0 },
+ { X86::VPSUBQZ256rrk, X86::VPSUBQZ256rmk, 0 },
+ { X86::VPSUBSBZ256rrk, X86::VPSUBSBZ256rmk, 0 },
+ { X86::VPSUBSWZ256rrk, X86::VPSUBSWZ256rmk, 0 },
+ { X86::VPSUBUSBZ256rrk, X86::VPSUBUSBZ256rmk, 0 },
+ { X86::VPSUBUSWZ256rrk, X86::VPSUBUSWZ256rmk, 0 },
+ { X86::VPSUBWZ256rrk, X86::VPSUBWZ256rmk, 0 },
+ { X86::VPTERNLOGDZ256rrik, X86::VPTERNLOGDZ256rmik, 0 },
+ { X86::VPTERNLOGQZ256rrik, X86::VPTERNLOGQZ256rmik, 0 },
+ { X86::VPUNPCKHBWZ256rrk, X86::VPUNPCKHBWZ256rmk, 0 },
+ { X86::VPUNPCKHDQZ256rrk, X86::VPUNPCKHDQZ256rmk, 0 },
+ { X86::VPUNPCKHQDQZ256rrk, X86::VPUNPCKHQDQZ256rmk, 0 },
+ { X86::VPUNPCKHWDZ256rrk, X86::VPUNPCKHWDZ256rmk, 0 },
+ { X86::VPUNPCKLBWZ256rrk, X86::VPUNPCKLBWZ256rmk, 0 },
+ { X86::VPUNPCKLDQZ256rrk, X86::VPUNPCKLDQZ256rmk, 0 },
+ { X86::VPUNPCKLQDQZ256rrk, X86::VPUNPCKLQDQZ256rmk, 0 },
+ { X86::VPUNPCKLWDZ256rrk, X86::VPUNPCKLWDZ256rmk, 0 },
+ { X86::VPXORDZ256rrk, X86::VPXORDZ256rmk, 0 },
+ { X86::VPXORQZ256rrk, X86::VPXORQZ256rmk, 0 },
+ { X86::VSUBPDZ256rrk, X86::VSUBPDZ256rmk, 0 },
+ { X86::VSUBPSZ256rrk, X86::VSUBPSZ256rmk, 0 },
+ { X86::VUNPCKHPDZ256rrk, X86::VUNPCKHPDZ256rmk, 0 },
+ { X86::VUNPCKHPSZ256rrk, X86::VUNPCKHPSZ256rmk, 0 },
+ { X86::VUNPCKLPDZ256rrk, X86::VUNPCKLPDZ256rmk, 0 },
+ { X86::VUNPCKLPSZ256rrk, X86::VUNPCKLPSZ256rmk, 0 },
+ { X86::VXORPDZ256rrk, X86::VXORPDZ256rmk, 0 },
+ { X86::VXORPSZ256rrk, X86::VXORPSZ256rmk, 0 },
+
  // AVX-512{F,VL} foldable masked instructions 128-bit
- { X86::VADDPSZ128rrk, X86::VADDPSZ128rmk, 0 },
{ X86::VADDPDZ128rrk, X86::VADDPDZ128rmk, 0 },
- { X86::VSUBPSZ128rrk, X86::VSUBPSZ128rmk, 0 },
- { X86::VSUBPDZ128rrk, X86::VSUBPDZ128rmk, 0 },
- { X86::VMULPSZ128rrk, X86::VMULPSZ128rmk, 0 },
- { X86::VMULPDZ128rrk, X86::VMULPDZ128rmk, 0 },
- { X86::VDIVPSZ128rrk, X86::VDIVPSZ128rmk, 0 },
+ { X86::VADDPSZ128rrk, X86::VADDPSZ128rmk, 0 },
+ { X86::VALIGNDZ128rrik, X86::VALIGNDZ128rmik, 0 },
+ { X86::VALIGNQZ128rrik, X86::VALIGNQZ128rmik, 0 },
+ { X86::VANDNPDZ128rrk, X86::VANDNPDZ128rmk, 0 },
+ { X86::VANDNPSZ128rrk, X86::VANDNPSZ128rmk, 0 },
+ { X86::VANDPDZ128rrk, X86::VANDPDZ128rmk, 0 },
+ { X86::VANDPSZ128rrk, X86::VANDPSZ128rmk, 0 },
{ X86::VDIVPDZ128rrk, X86::VDIVPDZ128rmk, 0 },
- { X86::VMINPSZ128rrk, X86::VMINPSZ128rmk, 0 },
- { X86::VMINPDZ128rrk, X86::VMINPDZ128rmk, 0 },
+ { X86::VDIVPSZ128rrk, X86::VDIVPSZ128rmk, 0 },
+ { X86::VMAXCPDZ128rrk, X86::VMAXCPDZ128rmk, 0 },
+ { X86::VMAXCPSZ128rrk, X86::VMAXCPSZ128rmk, 0 },
+ { X86::VMAXPDZ128rrk, X86::VMAXPDZ128rmk, 0 },
{ X86::VMAXPSZ128rrk, X86::VMAXPSZ128rmk, 0 },
- { X86::VMAXPDZ128rrk, X86::VMAXPDZ128rmk, 0 }
+ { X86::VMINCPDZ128rrk, X86::VMINCPDZ128rmk, 0 },
+ { X86::VMINCPSZ128rrk, X86::VMINCPSZ128rmk, 0 },
+ { X86::VMINPDZ128rrk, X86::VMINPDZ128rmk, 0 },
+ { X86::VMINPSZ128rrk, X86::VMINPSZ128rmk, 0 },
+ { X86::VMULPDZ128rrk, X86::VMULPDZ128rmk, 0 },
+ { X86::VMULPSZ128rrk, X86::VMULPSZ128rmk, 0 },
+ { X86::VORPDZ128rrk, X86::VORPDZ128rmk, 0 },
+ { X86::VORPSZ128rrk, X86::VORPSZ128rmk, 0 },
+ { X86::VPADDBZ128rrk, X86::VPADDBZ128rmk, 0 },
+ { X86::VPADDDZ128rrk, X86::VPADDDZ128rmk, 0 },
+ { X86::VPADDQZ128rrk, X86::VPADDQZ128rmk, 0 },
+ { X86::VPADDSBZ128rrk, X86::VPADDSBZ128rmk, 0 },
+ { X86::VPADDSWZ128rrk, X86::VPADDSWZ128rmk, 0 },
+ { X86::VPADDUSBZ128rrk, X86::VPADDUSBZ128rmk, 0 },
+ { X86::VPADDUSWZ128rrk, X86::VPADDUSWZ128rmk, 0 },
+ { X86::VPADDWZ128rrk, X86::VPADDWZ128rmk, 0 },
+ { X86::VPALIGNRZ128rrik, X86::VPALIGNRZ128rmik, 0 },
+ { X86::VPANDDZ128rrk, X86::VPANDDZ128rmk, 0 },
+ { X86::VPANDNDZ128rrk, X86::VPANDNDZ128rmk, 0 },
+ { X86::VPANDNQZ128rrk, X86::VPANDNQZ128rmk, 0 },
+ { X86::VPANDQZ128rrk, X86::VPANDQZ128rmk, 0 },
+ { X86::VPERMBZ128rrk, X86::VPERMBZ128rmk, 0 },
+ { X86::VPERMI2B128rrk, X86::VPERMI2B128rmk, 0 },
+ { X86::VPERMI2D128rrk, X86::VPERMI2D128rmk, 0 },
+ { X86::VPERMI2PD128rrk, X86::VPERMI2PD128rmk, 0 },
+ { X86::VPERMI2PS128rrk, X86::VPERMI2PS128rmk, 0 },
+ { X86::VPERMI2Q128rrk, X86::VPERMI2Q128rmk, 0 },
+ { X86::VPERMI2W128rrk, X86::VPERMI2W128rmk, 0 },
+ { X86::VPERMILPDZ128rrk, X86::VPERMILPDZ128rmk, 0 },
+ { X86::VPERMILPSZ128rrk, X86::VPERMILPSZ128rmk, 0 },
+ { X86::VPERMT2B128rrk, X86::VPERMT2B128rmk, 0 },
+ { X86::VPERMT2D128rrk, X86::VPERMT2D128rmk, 0 },
+ { X86::VPERMT2PD128rrk, X86::VPERMT2PD128rmk, 0 },
+ { X86::VPERMT2PS128rrk, X86::VPERMT2PS128rmk, 0 },
+ { X86::VPERMT2Q128rrk, X86::VPERMT2Q128rmk, 0 },
+ { X86::VPERMT2W128rrk, X86::VPERMT2W128rmk, 0 },
+ { X86::VPERMWZ128rrk, X86::VPERMWZ128rmk, 0 },
+ { X86::VPMADDUBSWZ128rrk, X86::VPMADDUBSWZ128rmk, 0 },
+ { X86::VPMADDWDZ128rrk, X86::VPMADDWDZ128rmk, 0 },
+ { X86::VPORDZ128rrk, X86::VPORDZ128rmk, 0 },
+ { X86::VPORQZ128rrk, X86::VPORQZ128rmk, 0 },
+ { X86::VPSHUFBZ128rrk, X86::VPSHUFBZ128rmk, 0 },
+ { X86::VPSUBBZ128rrk, X86::VPSUBBZ128rmk, 0 },
+ { X86::VPSUBDZ128rrk, X86::VPSUBDZ128rmk, 0 },
+ { X86::VPSUBQZ128rrk, X86::VPSUBQZ128rmk, 0 },
+ { X86::VPSUBSBZ128rrk, X86::VPSUBSBZ128rmk, 0 },
+ { X86::VPSUBSWZ128rrk, X86::VPSUBSWZ128rmk, 0 },
+ { X86::VPSUBUSBZ128rrk, X86::VPSUBUSBZ128rmk, 0 },
+ { X86::VPSUBUSWZ128rrk, X86::VPSUBUSWZ128rmk, 0 },
+ { X86::VPSUBWZ128rrk, X86::VPSUBWZ128rmk, 0 },
+ { X86::VPTERNLOGDZ128rrik, X86::VPTERNLOGDZ128rmik, 0 },
+ { X86::VPTERNLOGQZ128rrik, X86::VPTERNLOGQZ128rmik, 0 },
+ { X86::VPUNPCKHBWZ128rrk, X86::VPUNPCKHBWZ128rmk, 0 },
+ { X86::VPUNPCKHDQZ128rrk, X86::VPUNPCKHDQZ128rmk, 0 },
+ { X86::VPUNPCKHQDQZ128rrk, X86::VPUNPCKHQDQZ128rmk, 0 },
+ { X86::VPUNPCKHWDZ128rrk, X86::VPUNPCKHWDZ128rmk, 0 },
+ { X86::VPUNPCKLBWZ128rrk, X86::VPUNPCKLBWZ128rmk, 0 },
+ { X86::VPUNPCKLDQZ128rrk, X86::VPUNPCKLDQZ128rmk, 0 },
+ { X86::VPUNPCKLQDQZ128rrk, X86::VPUNPCKLQDQZ128rmk, 0 },
+ { X86::VPUNPCKLWDZ128rrk, X86::VPUNPCKLWDZ128rmk, 0 },
+ { X86::VPXORDZ128rrk, X86::VPXORDZ128rmk, 0 },
+ { X86::VPXORQZ128rrk, X86::VPXORQZ128rmk, 0 },
+ { X86::VSUBPDZ128rrk, X86::VSUBPDZ128rmk, 0 },
+ { X86::VSUBPSZ128rrk, X86::VSUBPSZ128rmk, 0 },
+ { X86::VUNPCKHPDZ128rrk, X86::VUNPCKHPDZ128rmk, 0 },
+ { X86::VUNPCKHPSZ128rrk, X86::VUNPCKHPSZ128rmk, 0 },
+ { X86::VUNPCKLPDZ128rrk, X86::VUNPCKLPDZ128rmk, 0 },
+ { X86::VUNPCKLPSZ128rrk, X86::VUNPCKLPSZ128rmk, 0 },
+ { X86::VXORPDZ128rrk, X86::VXORPDZ128rmk, 0 },
+ { X86::VXORPSZ128rrk, X86::VXORPSZ128rmk, 0 },
+
+ // 512-bit three source instructions with zero masking.
+ { X86::VPERMI2Brrkz, X86::VPERMI2Brmkz, 0 },
+ { X86::VPERMI2Drrkz, X86::VPERMI2Drmkz, 0 },
+ { X86::VPERMI2PSrrkz, X86::VPERMI2PSrmkz, 0 },
+ { X86::VPERMI2PDrrkz, X86::VPERMI2PDrmkz, 0 },
+ { X86::VPERMI2Qrrkz, X86::VPERMI2Qrmkz, 0 },
+ { X86::VPERMI2Wrrkz, X86::VPERMI2Wrmkz, 0 },
+ { X86::VPERMT2Brrkz, X86::VPERMT2Brmkz, 0 },
+ { X86::VPERMT2Drrkz, X86::VPERMT2Drmkz, 0 },
+ { X86::VPERMT2PSrrkz, X86::VPERMT2PSrmkz, 0 },
+ { X86::VPERMT2PDrrkz, X86::VPERMT2PDrmkz, 0 },
+ { X86::VPERMT2Qrrkz, X86::VPERMT2Qrmkz, 0 },
+ { X86::VPERMT2Wrrkz, X86::VPERMT2Wrmkz, 0 },
+ { X86::VPTERNLOGDZrrikz, X86::VPTERNLOGDZrmikz, 0 },
+ { X86::VPTERNLOGQZrrikz, X86::VPTERNLOGQZrmikz, 0 },
+
+ // 256-bit three source instructions with zero masking.
+ { X86::VPERMI2B256rrkz, X86::VPERMI2B256rmkz, 0 },
+ { X86::VPERMI2D256rrkz, X86::VPERMI2D256rmkz, 0 },
+ { X86::VPERMI2PD256rrkz, X86::VPERMI2PD256rmkz, 0 },
+ { X86::VPERMI2PS256rrkz, X86::VPERMI2PS256rmkz, 0 },
+ { X86::VPERMI2Q256rrkz, X86::VPERMI2Q256rmkz, 0 },
+ { X86::VPERMI2W256rrkz, X86::VPERMI2W256rmkz, 0 },
+ { X86::VPERMT2B256rrkz, X86::VPERMT2B256rmkz, 0 },
+ { X86::VPERMT2D256rrkz, X86::VPERMT2D256rmkz, 0 },
+ { X86::VPERMT2PD256rrkz, X86::VPERMT2PD256rmkz, 0 },
+ { X86::VPERMT2PS256rrkz, X86::VPERMT2PS256rmkz, 0 },
+ { X86::VPERMT2Q256rrkz, X86::VPERMT2Q256rmkz, 0 },
+ { X86::VPERMT2W256rrkz, X86::VPERMT2W256rmkz, 0 },
+ { X86::VPTERNLOGDZ256rrikz,X86::VPTERNLOGDZ256rmikz, 0 },
+ { X86::VPTERNLOGQZ256rrikz,X86::VPTERNLOGQZ256rmikz, 0 },
+
+ // 128-bit three source instructions with zero masking.
+ { X86::VPERMI2B128rrkz, X86::VPERMI2B128rmkz, 0 },
+ { X86::VPERMI2D128rrkz, X86::VPERMI2D128rmkz, 0 },
+ { X86::VPERMI2PD128rrkz, X86::VPERMI2PD128rmkz, 0 },
+ { X86::VPERMI2PS128rrkz, X86::VPERMI2PS128rmkz, 0 },
+ { X86::VPERMI2Q128rrkz, X86::VPERMI2Q128rmkz, 0 },
+ { X86::VPERMI2W128rrkz, X86::VPERMI2W128rmkz, 0 },
+ { X86::VPERMT2B128rrkz, X86::VPERMT2B128rmkz, 0 },
+ { X86::VPERMT2D128rrkz, X86::VPERMT2D128rmkz, 0 },
+ { X86::VPERMT2PD128rrkz, X86::VPERMT2PD128rmkz, 0 },
+ { X86::VPERMT2PS128rrkz, X86::VPERMT2PS128rmkz, 0 },
+ { X86::VPERMT2Q128rrkz, X86::VPERMT2Q128rmkz, 0 },
+ { X86::VPERMT2W128rrkz, X86::VPERMT2W128rmkz, 0 },
+ { X86::VPTERNLOGDZ128rrikz,X86::VPTERNLOGDZ128rmikz, 0 },
+ { X86::VPTERNLOGQZ128rrikz,X86::VPTERNLOGQZ128rmikz, 0 },
};
for (X86MemoryFoldTableEntry Entry : MemoryFoldTable4) {
@@ -2057,21 +2980,35 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
// Index 4, folded load
Entry.Flags | TB_INDEX_4 | TB_FOLDED_LOAD);
}
+ for (I = X86InstrFMA3Info::rm_begin(); I != E; ++I) {
+ if (I.getGroup()->isKMasked()) {
+ // Intrinsics need to pass TB_NO_REVERSE.
+ if (I.getGroup()->isIntrinsic()) {
+ AddTableEntry(RegOp2MemOpTable4, MemOp2RegOpTable,
+ I.getRegOpcode(), I.getMemOpcode(),
+ TB_ALIGN_NONE | TB_INDEX_4 | TB_FOLDED_LOAD | TB_NO_REVERSE);
+ } else {
+ AddTableEntry(RegOp2MemOpTable4, MemOp2RegOpTable,
+ I.getRegOpcode(), I.getMemOpcode(),
+ TB_ALIGN_NONE | TB_INDEX_4 | TB_FOLDED_LOAD);
+ }
+ }
+ }
}
void
X86InstrInfo::AddTableEntry(RegOp2MemOpTableType &R2MTable,
MemOp2RegOpTableType &M2RTable,
uint16_t RegOp, uint16_t MemOp, uint16_t Flags) {
- if ((Flags & TB_NO_FORWARD) == 0) {
- assert(!R2MTable.count(RegOp) && "Duplicate entry!");
- R2MTable[RegOp] = std::make_pair(MemOp, Flags);
- }
- if ((Flags & TB_NO_REVERSE) == 0) {
- assert(!M2RTable.count(MemOp) &&
- "Duplicated entries in unfolding maps?");
- M2RTable[MemOp] = std::make_pair(RegOp, Flags);
- }
+ if ((Flags & TB_NO_FORWARD) == 0) {
+ assert(!R2MTable.count(RegOp) && "Duplicate entry!");
+ R2MTable[RegOp] = std::make_pair(MemOp, Flags);
+ }
+ if ((Flags & TB_NO_REVERSE) == 0) {
+ assert(!M2RTable.count(MemOp) &&
+ "Duplicated entries in unfolding maps?");
+ M2RTable[MemOp] = std::make_pair(RegOp, Flags);
+ }
}
bool
@@ -2235,9 +3172,13 @@ static bool isFrameLoadOpcode(int Opcode) {
case X86::VMOVAPSZrm:
case X86::VMOVAPSZ128rm:
case X86::VMOVAPSZ256rm:
+ case X86::VMOVAPSZ128rm_NOVLX:
+ case X86::VMOVAPSZ256rm_NOVLX:
case X86::VMOVUPSZrm:
case X86::VMOVUPSZ128rm:
case X86::VMOVUPSZ256rm:
+ case X86::VMOVUPSZ128rm_NOVLX:
+ case X86::VMOVUPSZ256rm_NOVLX:
case X86::VMOVAPDZrm:
case X86::VMOVAPDZ128rm:
case X86::VMOVAPDZ256rm:
@@ -2305,9 +3246,13 @@ static bool isFrameStoreOpcode(int Opcode) {
case X86::VMOVUPSZmr:
case X86::VMOVUPSZ128mr:
case X86::VMOVUPSZ256mr:
+ case X86::VMOVUPSZ128mr_NOVLX:
+ case X86::VMOVUPSZ256mr_NOVLX:
case X86::VMOVAPSZmr:
case X86::VMOVAPSZ128mr:
case X86::VMOVAPSZ256mr:
+ case X86::VMOVAPSZ128mr_NOVLX:
+ case X86::VMOVAPSZ256mr_NOVLX:
case X86::VMOVUPDZmr:
case X86::VMOVUPDZ128mr:
case X86::VMOVUPDZ256mr:
@@ -2409,6 +3354,7 @@ bool X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI,
switch (MI.getOpcode()) {
default: break;
case X86::MOV8rm:
+ case X86::MOV8rm_NOREX:
case X86::MOV16rm:
case X86::MOV32rm:
case X86::MOV64rm:
@@ -2418,6 +3364,7 @@ bool X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI,
case X86::MOVAPSrm:
case X86::MOVUPSrm:
case X86::MOVAPDrm:
+ case X86::MOVUPDrm:
case X86::MOVDQArm:
case X86::MOVDQUrm:
case X86::VMOVSSrm:
@@ -2425,25 +3372,27 @@ bool X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI,
case X86::VMOVAPSrm:
case X86::VMOVUPSrm:
case X86::VMOVAPDrm:
+ case X86::VMOVUPDrm:
case X86::VMOVDQArm:
case X86::VMOVDQUrm:
case X86::VMOVAPSYrm:
case X86::VMOVUPSYrm:
case X86::VMOVAPDYrm:
+ case X86::VMOVUPDYrm:
case X86::VMOVDQAYrm:
case X86::VMOVDQUYrm:
case X86::MMX_MOVD64rm:
case X86::MMX_MOVQ64rm:
- case X86::FsVMOVAPSrm:
- case X86::FsVMOVAPDrm:
- case X86::FsMOVAPSrm:
- case X86::FsMOVAPDrm:
// AVX-512
+ case X86::VMOVSSZrm:
+ case X86::VMOVSDZrm:
case X86::VMOVAPDZ128rm:
case X86::VMOVAPDZ256rm:
case X86::VMOVAPDZrm:
case X86::VMOVAPSZ128rm:
case X86::VMOVAPSZ256rm:
+ case X86::VMOVAPSZ128rm_NOVLX:
+ case X86::VMOVAPSZ256rm_NOVLX:
case X86::VMOVAPSZrm:
case X86::VMOVDQA32Z128rm:
case X86::VMOVDQA32Z256rm:
@@ -2463,15 +3412,20 @@ bool X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI,
case X86::VMOVDQU8Z128rm:
case X86::VMOVDQU8Z256rm:
case X86::VMOVDQU8Zrm:
+ case X86::VMOVUPDZ128rm:
+ case X86::VMOVUPDZ256rm:
+ case X86::VMOVUPDZrm:
case X86::VMOVUPSZ128rm:
case X86::VMOVUPSZ256rm:
+ case X86::VMOVUPSZ128rm_NOVLX:
+ case X86::VMOVUPSZ256rm_NOVLX:
case X86::VMOVUPSZrm: {
// Loads from constant pools are trivially rematerializable.
if (MI.getOperand(1 + X86::AddrBaseReg).isReg() &&
MI.getOperand(1 + X86::AddrScaleAmt).isImm() &&
MI.getOperand(1 + X86::AddrIndexReg).isReg() &&
MI.getOperand(1 + X86::AddrIndexReg).getReg() == 0 &&
- MI.isInvariantLoad(AA)) {
+ MI.isDereferenceableInvariantLoad(AA)) {
unsigned BaseReg = MI.getOperand(1 + X86::AddrBaseReg).getReg();
if (BaseReg == 0 || BaseReg == X86::RIP)
return true;
@@ -2694,24 +3648,8 @@ bool X86InstrInfo::classifyLEAReg(MachineInstr &MI, const MachineOperand &Src,
ImplicitOp.setImplicit();
NewSrc = getX86SubSuperRegister(Src.getReg(), 64);
- MachineBasicBlock::LivenessQueryResult LQR =
- MI.getParent()->computeRegisterLiveness(&getRegisterInfo(), NewSrc, MI);
-
- switch (LQR) {
- case MachineBasicBlock::LQR_Unknown:
- // We can't give sane liveness flags to the instruction, abandon LEA
- // formation.
- return false;
- case MachineBasicBlock::LQR_Live:
- isKill = MI.killsRegister(SrcReg);
- isUndef = false;
- break;
- default:
- // The physreg itself is dead, so we have to use it as an <undef>.
- isKill = false;
- isUndef = true;
- break;
- }
+ isKill = Src.isKill();
+ isUndef = Src.isUndef();
} else {
// Virtual register of the wrong class, we have to create a temporary 64-bit
// vreg to feed into the LEA.
@@ -3079,7 +4017,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
NewMI = addOffset(BuildMI(MF, MI.getDebugLoc(), get(X86::LEA64r))
.addOperand(Dest)
.addOperand(Src),
- MI.getOperand(2).getImm());
+ MI.getOperand(2));
break;
case X86::ADD32ri:
case X86::ADD32ri8:
@@ -3102,7 +4040,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
if (ImplicitOp.getReg() != 0)
MIB.addOperand(ImplicitOp);
- NewMI = addOffset(MIB, MI.getOperand(2).getImm());
+ NewMI = addOffset(MIB, MI.getOperand(2));
break;
}
case X86::ADD16ri:
@@ -3116,7 +4054,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
NewMI = addOffset(BuildMI(MF, MI.getDebugLoc(), get(X86::LEA16r))
.addOperand(Dest)
.addOperand(Src),
- MI.getOperand(2).getImm());
+ MI.getOperand(2));
break;
}
@@ -3133,156 +4071,236 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
return NewMI;
}
-/// Returns true if the given instruction opcode is FMA3.
-/// Otherwise, returns false.
-/// The second parameter is optional and is used as the second return from
-/// the function. It is set to true if the given instruction has FMA3 opcode
-/// that is used for lowering of scalar FMA intrinsics, and it is set to false
-/// otherwise.
-static bool isFMA3(unsigned Opcode, bool *IsIntrinsic = nullptr) {
- if (IsIntrinsic)
- *IsIntrinsic = false;
+/// This determines which of the three possible cases of a three source
+/// commute the source indexes correspond to, taking into account any mask
+/// operands. It also prevents commuting a passthru operand. Returns -1 if
+/// the commute isn't possible.
+/// Case 0 - Possible to commute the first and second operands.
+/// Case 1 - Possible to commute the first and third operands.
+/// Case 2 - Possible to commute the second and third operands.
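+/// For example, with no mask operands the sources are simply operands 1, 2
+/// and 3; with a k-mask the layout is assumed to be
+/// (dst, src1, k-mask, src2, src3), so Case 2 commutes operands 3 and 4.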
+static int getThreeSrcCommuteCase(uint64_t TSFlags, unsigned SrcOpIdx1,
+ unsigned SrcOpIdx2) {
+ // Put the lowest index to SrcOpIdx1 to simplify the checks below.
+ if (SrcOpIdx1 > SrcOpIdx2)
+ std::swap(SrcOpIdx1, SrcOpIdx2);
- switch (Opcode) {
- case X86::VFMADDSDr132r: case X86::VFMADDSDr132m:
- case X86::VFMADDSSr132r: case X86::VFMADDSSr132m:
- case X86::VFMSUBSDr132r: case X86::VFMSUBSDr132m:
- case X86::VFMSUBSSr132r: case X86::VFMSUBSSr132m:
- case X86::VFNMADDSDr132r: case X86::VFNMADDSDr132m:
- case X86::VFNMADDSSr132r: case X86::VFNMADDSSr132m:
- case X86::VFNMSUBSDr132r: case X86::VFNMSUBSDr132m:
- case X86::VFNMSUBSSr132r: case X86::VFNMSUBSSr132m:
-
- case X86::VFMADDSDr213r: case X86::VFMADDSDr213m:
- case X86::VFMADDSSr213r: case X86::VFMADDSSr213m:
- case X86::VFMSUBSDr213r: case X86::VFMSUBSDr213m:
- case X86::VFMSUBSSr213r: case X86::VFMSUBSSr213m:
- case X86::VFNMADDSDr213r: case X86::VFNMADDSDr213m:
- case X86::VFNMADDSSr213r: case X86::VFNMADDSSr213m:
- case X86::VFNMSUBSDr213r: case X86::VFNMSUBSDr213m:
- case X86::VFNMSUBSSr213r: case X86::VFNMSUBSSr213m:
-
- case X86::VFMADDSDr231r: case X86::VFMADDSDr231m:
- case X86::VFMADDSSr231r: case X86::VFMADDSSr231m:
- case X86::VFMSUBSDr231r: case X86::VFMSUBSDr231m:
- case X86::VFMSUBSSr231r: case X86::VFMSUBSSr231m:
- case X86::VFNMADDSDr231r: case X86::VFNMADDSDr231m:
- case X86::VFNMADDSSr231r: case X86::VFNMADDSSr231m:
- case X86::VFNMSUBSDr231r: case X86::VFNMSUBSDr231m:
- case X86::VFNMSUBSSr231r: case X86::VFNMSUBSSr231m:
-
- case X86::VFMADDSUBPDr132r: case X86::VFMADDSUBPDr132m:
- case X86::VFMADDSUBPSr132r: case X86::VFMADDSUBPSr132m:
- case X86::VFMSUBADDPDr132r: case X86::VFMSUBADDPDr132m:
- case X86::VFMSUBADDPSr132r: case X86::VFMSUBADDPSr132m:
- case X86::VFMADDSUBPDr132rY: case X86::VFMADDSUBPDr132mY:
- case X86::VFMADDSUBPSr132rY: case X86::VFMADDSUBPSr132mY:
- case X86::VFMSUBADDPDr132rY: case X86::VFMSUBADDPDr132mY:
- case X86::VFMSUBADDPSr132rY: case X86::VFMSUBADDPSr132mY:
-
- case X86::VFMADDPDr132r: case X86::VFMADDPDr132m:
- case X86::VFMADDPSr132r: case X86::VFMADDPSr132m:
- case X86::VFMSUBPDr132r: case X86::VFMSUBPDr132m:
- case X86::VFMSUBPSr132r: case X86::VFMSUBPSr132m:
- case X86::VFNMADDPDr132r: case X86::VFNMADDPDr132m:
- case X86::VFNMADDPSr132r: case X86::VFNMADDPSr132m:
- case X86::VFNMSUBPDr132r: case X86::VFNMSUBPDr132m:
- case X86::VFNMSUBPSr132r: case X86::VFNMSUBPSr132m:
- case X86::VFMADDPDr132rY: case X86::VFMADDPDr132mY:
- case X86::VFMADDPSr132rY: case X86::VFMADDPSr132mY:
- case X86::VFMSUBPDr132rY: case X86::VFMSUBPDr132mY:
- case X86::VFMSUBPSr132rY: case X86::VFMSUBPSr132mY:
- case X86::VFNMADDPDr132rY: case X86::VFNMADDPDr132mY:
- case X86::VFNMADDPSr132rY: case X86::VFNMADDPSr132mY:
- case X86::VFNMSUBPDr132rY: case X86::VFNMSUBPDr132mY:
- case X86::VFNMSUBPSr132rY: case X86::VFNMSUBPSr132mY:
-
- case X86::VFMADDSUBPDr213r: case X86::VFMADDSUBPDr213m:
- case X86::VFMADDSUBPSr213r: case X86::VFMADDSUBPSr213m:
- case X86::VFMSUBADDPDr213r: case X86::VFMSUBADDPDr213m:
- case X86::VFMSUBADDPSr213r: case X86::VFMSUBADDPSr213m:
- case X86::VFMADDSUBPDr213rY: case X86::VFMADDSUBPDr213mY:
- case X86::VFMADDSUBPSr213rY: case X86::VFMADDSUBPSr213mY:
- case X86::VFMSUBADDPDr213rY: case X86::VFMSUBADDPDr213mY:
- case X86::VFMSUBADDPSr213rY: case X86::VFMSUBADDPSr213mY:
-
- case X86::VFMADDPDr213r: case X86::VFMADDPDr213m:
- case X86::VFMADDPSr213r: case X86::VFMADDPSr213m:
- case X86::VFMSUBPDr213r: case X86::VFMSUBPDr213m:
- case X86::VFMSUBPSr213r: case X86::VFMSUBPSr213m:
- case X86::VFNMADDPDr213r: case X86::VFNMADDPDr213m:
- case X86::VFNMADDPSr213r: case X86::VFNMADDPSr213m:
- case X86::VFNMSUBPDr213r: case X86::VFNMSUBPDr213m:
- case X86::VFNMSUBPSr213r: case X86::VFNMSUBPSr213m:
- case X86::VFMADDPDr213rY: case X86::VFMADDPDr213mY:
- case X86::VFMADDPSr213rY: case X86::VFMADDPSr213mY:
- case X86::VFMSUBPDr213rY: case X86::VFMSUBPDr213mY:
- case X86::VFMSUBPSr213rY: case X86::VFMSUBPSr213mY:
- case X86::VFNMADDPDr213rY: case X86::VFNMADDPDr213mY:
- case X86::VFNMADDPSr213rY: case X86::VFNMADDPSr213mY:
- case X86::VFNMSUBPDr213rY: case X86::VFNMSUBPDr213mY:
- case X86::VFNMSUBPSr213rY: case X86::VFNMSUBPSr213mY:
-
- case X86::VFMADDSUBPDr231r: case X86::VFMADDSUBPDr231m:
- case X86::VFMADDSUBPSr231r: case X86::VFMADDSUBPSr231m:
- case X86::VFMSUBADDPDr231r: case X86::VFMSUBADDPDr231m:
- case X86::VFMSUBADDPSr231r: case X86::VFMSUBADDPSr231m:
- case X86::VFMADDSUBPDr231rY: case X86::VFMADDSUBPDr231mY:
- case X86::VFMADDSUBPSr231rY: case X86::VFMADDSUBPSr231mY:
- case X86::VFMSUBADDPDr231rY: case X86::VFMSUBADDPDr231mY:
- case X86::VFMSUBADDPSr231rY: case X86::VFMSUBADDPSr231mY:
-
- case X86::VFMADDPDr231r: case X86::VFMADDPDr231m:
- case X86::VFMADDPSr231r: case X86::VFMADDPSr231m:
- case X86::VFMSUBPDr231r: case X86::VFMSUBPDr231m:
- case X86::VFMSUBPSr231r: case X86::VFMSUBPSr231m:
- case X86::VFNMADDPDr231r: case X86::VFNMADDPDr231m:
- case X86::VFNMADDPSr231r: case X86::VFNMADDPSr231m:
- case X86::VFNMSUBPDr231r: case X86::VFNMSUBPDr231m:
- case X86::VFNMSUBPSr231r: case X86::VFNMSUBPSr231m:
- case X86::VFMADDPDr231rY: case X86::VFMADDPDr231mY:
- case X86::VFMADDPSr231rY: case X86::VFMADDPSr231mY:
- case X86::VFMSUBPDr231rY: case X86::VFMSUBPDr231mY:
- case X86::VFMSUBPSr231rY: case X86::VFMSUBPSr231mY:
- case X86::VFNMADDPDr231rY: case X86::VFNMADDPDr231mY:
- case X86::VFNMADDPSr231rY: case X86::VFNMADDPSr231mY:
- case X86::VFNMSUBPDr231rY: case X86::VFNMSUBPDr231mY:
- case X86::VFNMSUBPSr231rY: case X86::VFNMSUBPSr231mY:
- return true;
+ unsigned Op1 = 1, Op2 = 2, Op3 = 3;
+ if (X86II::isKMasked(TSFlags)) {
+ // The k-mask operand cannot be commuted.
+ if (SrcOpIdx1 == 2)
+ return -1;
+
+ // For k-zero-masked operations it is Ok to commute the first vector
+ // operand.
+  // For regular k-masked operations a conservative choice is made, because
+  // the elements of the first vector operand for which the corresponding bit
+  // in the k-mask operand is set to 0 are copied to the result of the
+  // instruction.
+ // TODO/FIXME: The commute still may be legal if it is known that the
+ // k-mask operand is set to either all ones or all zeroes.
+ // It is also Ok to commute the 1st operand if all users of MI use only
+ // the elements enabled by the k-mask operand. For example,
+ // v4 = VFMADD213PSZrk v1, k, v2, v3; // v1[i] = k[i] ? v2[i]*v1[i]+v3[i]
+ // : v1[i];
+ // VMOVAPSZmrk <mem_addr>, k, v4; // this is the ONLY user of v4 ->
+ // // Ok, to commute v1 in FMADD213PSZrk.
+ if (X86II::isKMergeMasked(TSFlags) && SrcOpIdx1 == Op1)
+ return -1;
+ Op2++;
+ Op3++;
+ }
+
+ if (SrcOpIdx1 == Op1 && SrcOpIdx2 == Op2)
+ return 0;
+ if (SrcOpIdx1 == Op1 && SrcOpIdx2 == Op3)
+ return 1;
+ if (SrcOpIdx1 == Op2 && SrcOpIdx2 == Op3)
+ return 2;
+ return -1;
+}
- case X86::VFMADDSDr132r_Int: case X86::VFMADDSDr132m_Int:
- case X86::VFMADDSSr132r_Int: case X86::VFMADDSSr132m_Int:
- case X86::VFMSUBSDr132r_Int: case X86::VFMSUBSDr132m_Int:
- case X86::VFMSUBSSr132r_Int: case X86::VFMSUBSSr132m_Int:
- case X86::VFNMADDSDr132r_Int: case X86::VFNMADDSDr132m_Int:
- case X86::VFNMADDSSr132r_Int: case X86::VFNMADDSSr132m_Int:
- case X86::VFNMSUBSDr132r_Int: case X86::VFNMSUBSDr132m_Int:
- case X86::VFNMSUBSSr132r_Int: case X86::VFNMSUBSSr132m_Int:
-
- case X86::VFMADDSDr213r_Int: case X86::VFMADDSDr213m_Int:
- case X86::VFMADDSSr213r_Int: case X86::VFMADDSSr213m_Int:
- case X86::VFMSUBSDr213r_Int: case X86::VFMSUBSDr213m_Int:
- case X86::VFMSUBSSr213r_Int: case X86::VFMSUBSSr213m_Int:
- case X86::VFNMADDSDr213r_Int: case X86::VFNMADDSDr213m_Int:
- case X86::VFNMADDSSr213r_Int: case X86::VFNMADDSSr213m_Int:
- case X86::VFNMSUBSDr213r_Int: case X86::VFNMSUBSDr213m_Int:
- case X86::VFNMSUBSSr213r_Int: case X86::VFNMSUBSSr213m_Int:
-
- case X86::VFMADDSDr231r_Int: case X86::VFMADDSDr231m_Int:
- case X86::VFMADDSSr231r_Int: case X86::VFMADDSSr231m_Int:
- case X86::VFMSUBSDr231r_Int: case X86::VFMSUBSDr231m_Int:
- case X86::VFMSUBSSr231r_Int: case X86::VFMSUBSSr231m_Int:
- case X86::VFNMADDSDr231r_Int: case X86::VFNMADDSDr231m_Int:
- case X86::VFNMADDSSr231r_Int: case X86::VFNMADDSSr231m_Int:
- case X86::VFNMSUBSDr231r_Int: case X86::VFNMSUBSDr231m_Int:
- case X86::VFNMSUBSSr231r_Int: case X86::VFNMSUBSSr231m_Int:
- if (IsIntrinsic)
- *IsIntrinsic = true;
- return true;
- default:
- return false;
+unsigned X86InstrInfo::getFMA3OpcodeToCommuteOperands(
+ const MachineInstr &MI, unsigned SrcOpIdx1, unsigned SrcOpIdx2,
+ const X86InstrFMA3Group &FMA3Group) const {
+
+ unsigned Opc = MI.getOpcode();
+
+ // Put the lowest index to SrcOpIdx1 to simplify the checks below.
+ if (SrcOpIdx1 > SrcOpIdx2)
+ std::swap(SrcOpIdx1, SrcOpIdx2);
+
+ // TODO: Commuting the 1st operand of FMA*_Int requires some additional
+ // analysis. The commute optimization is legal only if all users of FMA*_Int
+  // use only the lowest element of the FMA*_Int instruction. Such analysis is
+  // not implemented yet, so just return 0 in that case.
+  // When such analysis becomes available, this will be the right place to
+  // call it.
+ if (FMA3Group.isIntrinsic() && SrcOpIdx1 == 1)
+ return 0;
+
+ // Determine which case this commute is or if it can't be done.
+ int Case = getThreeSrcCommuteCase(MI.getDesc().TSFlags, SrcOpIdx1, SrcOpIdx2);
+ if (Case < 0)
+ return 0;
+
+ // Define the FMA forms mapping array that helps to map input FMA form
+ // to output FMA form to preserve the operation semantics after
+ // commuting the operands.
+ const unsigned Form132Index = 0;
+ const unsigned Form213Index = 1;
+ const unsigned Form231Index = 2;
+ static const unsigned FormMapping[][3] = {
+ // 0: SrcOpIdx1 == 1 && SrcOpIdx2 == 2;
+ // FMA132 A, C, b; ==> FMA231 C, A, b;
+ // FMA213 B, A, c; ==> FMA213 A, B, c;
+ // FMA231 C, A, b; ==> FMA132 A, C, b;
+ { Form231Index, Form213Index, Form132Index },
+ // 1: SrcOpIdx1 == 1 && SrcOpIdx2 == 3;
+ // FMA132 A, c, B; ==> FMA132 B, c, A;
+ // FMA213 B, a, C; ==> FMA231 C, a, B;
+ // FMA231 C, a, B; ==> FMA213 B, a, C;
+ { Form132Index, Form231Index, Form213Index },
+ // 2: SrcOpIdx1 == 2 && SrcOpIdx2 == 3;
+ // FMA132 a, C, B; ==> FMA213 a, B, C;
+ // FMA213 b, A, C; ==> FMA132 b, C, A;
+ // FMA231 c, A, B; ==> FMA231 c, B, A;
+ { Form213Index, Form132Index, Form231Index }
+ };
+
+ unsigned FMAForms[3];
+ if (FMA3Group.isRegOpcodeFromGroup(Opc)) {
+ FMAForms[0] = FMA3Group.getReg132Opcode();
+ FMAForms[1] = FMA3Group.getReg213Opcode();
+ FMAForms[2] = FMA3Group.getReg231Opcode();
+ } else {
+ FMAForms[0] = FMA3Group.getMem132Opcode();
+ FMAForms[1] = FMA3Group.getMem213Opcode();
+ FMAForms[2] = FMA3Group.getMem231Opcode();
+ }
+ unsigned FormIndex;
+ for (FormIndex = 0; FormIndex < 3; FormIndex++)
+ if (Opc == FMAForms[FormIndex])
+ break;
+
+ // Everything is ready, just adjust the FMA opcode and return it.
+ FormIndex = FormMapping[Case][FormIndex];
+ return FMAForms[FormIndex];
+}
+
+static bool commuteVPTERNLOG(MachineInstr &MI, unsigned SrcOpIdx1,
+ unsigned SrcOpIdx2) {
+ uint64_t TSFlags = MI.getDesc().TSFlags;
+
+ // Determine which case this commute is or if it can't be done.
+ int Case = getThreeSrcCommuteCase(TSFlags, SrcOpIdx1, SrcOpIdx2);
+ if (Case < 0)
+ return false;
+
+ // For each case we need to swap two pairs of bits in the final immediate.
+ static const uint8_t SwapMasks[3][4] = {
+ { 0x04, 0x10, 0x08, 0x20 }, // Swap bits 2/4 and 3/5.
+ { 0x02, 0x10, 0x08, 0x40 }, // Swap bits 1/4 and 3/6.
+ { 0x02, 0x04, 0x20, 0x40 }, // Swap bits 1/2 and 5/6.
+ };
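+  // Why these masks: the immediate is a truth table indexed by the bit triple
+  // (Src1, Src2, Src3), with Src1 as the most significant bit. Commuting two
+  // sources permutes the rows of that table; e.g. swapping Src2 and Src3
+  // exchanges rows 0b001<->0b010 and 0b101<->0b110, i.e. immediate bits 1/2
+  // and 5/6, which is exactly the third mask set above.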
+
+ uint8_t Imm = MI.getOperand(MI.getNumOperands()-1).getImm();
+ // Clear out the bits we are swapping.
+ uint8_t NewImm = Imm & ~(SwapMasks[Case][0] | SwapMasks[Case][1] |
+ SwapMasks[Case][2] | SwapMasks[Case][3]);
+ // If the immediate had a bit of the pair set, then set the opposite bit.
+ if (Imm & SwapMasks[Case][0]) NewImm |= SwapMasks[Case][1];
+ if (Imm & SwapMasks[Case][1]) NewImm |= SwapMasks[Case][0];
+ if (Imm & SwapMasks[Case][2]) NewImm |= SwapMasks[Case][3];
+ if (Imm & SwapMasks[Case][3]) NewImm |= SwapMasks[Case][2];
+ MI.getOperand(MI.getNumOperands()-1).setImm(NewImm);
+
+ return true;
+}
+
+// Returns true if this is a VPERMI2 or VPERMT2 instruction that can be
+// commuted.
+static bool isCommutableVPERMV3Instruction(unsigned Opcode) {
+#define VPERM_CASES(Suffix) \
+ case X86::VPERMI2##Suffix##128rr: case X86::VPERMT2##Suffix##128rr: \
+ case X86::VPERMI2##Suffix##256rr: case X86::VPERMT2##Suffix##256rr: \
+ case X86::VPERMI2##Suffix##rr: case X86::VPERMT2##Suffix##rr: \
+ case X86::VPERMI2##Suffix##128rm: case X86::VPERMT2##Suffix##128rm: \
+ case X86::VPERMI2##Suffix##256rm: case X86::VPERMT2##Suffix##256rm: \
+ case X86::VPERMI2##Suffix##rm: case X86::VPERMT2##Suffix##rm: \
+ case X86::VPERMI2##Suffix##128rrkz: case X86::VPERMT2##Suffix##128rrkz: \
+ case X86::VPERMI2##Suffix##256rrkz: case X86::VPERMT2##Suffix##256rrkz: \
+ case X86::VPERMI2##Suffix##rrkz: case X86::VPERMT2##Suffix##rrkz: \
+ case X86::VPERMI2##Suffix##128rmkz: case X86::VPERMT2##Suffix##128rmkz: \
+ case X86::VPERMI2##Suffix##256rmkz: case X86::VPERMT2##Suffix##256rmkz: \
+ case X86::VPERMI2##Suffix##rmkz: case X86::VPERMT2##Suffix##rmkz:
+
+#define VPERM_CASES_BROADCAST(Suffix) \
+ VPERM_CASES(Suffix) \
+ case X86::VPERMI2##Suffix##128rmb: case X86::VPERMT2##Suffix##128rmb: \
+ case X86::VPERMI2##Suffix##256rmb: case X86::VPERMT2##Suffix##256rmb: \
+ case X86::VPERMI2##Suffix##rmb: case X86::VPERMT2##Suffix##rmb: \
+ case X86::VPERMI2##Suffix##128rmbkz: case X86::VPERMT2##Suffix##128rmbkz: \
+ case X86::VPERMI2##Suffix##256rmbkz: case X86::VPERMT2##Suffix##256rmbkz: \
+ case X86::VPERMI2##Suffix##rmbkz: case X86::VPERMT2##Suffix##rmbkz:
+
+ switch (Opcode) {
+ default: return false;
+ VPERM_CASES(B)
+ VPERM_CASES_BROADCAST(D)
+ VPERM_CASES_BROADCAST(PD)
+ VPERM_CASES_BROADCAST(PS)
+ VPERM_CASES_BROADCAST(Q)
+ VPERM_CASES(W)
+ return true;
}
- llvm_unreachable("Opcode not handled by the switch");
+#undef VPERM_CASES_BROADCAST
+#undef VPERM_CASES
+}
+
+// Returns commuted opcode for VPERMI2 and VPERMT2 instructions by switching
+// from the I opcode to the T opcode and vice versa.
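+// The switch is safe because VPERMI2 ties the index operand to the
+// destination while VPERMT2 ties the first table operand, so swapping those
+// two operands together with the I<->T opcode change preserves the result.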
+static unsigned getCommutedVPERMV3Opcode(unsigned Opcode) {
+#define VPERM_CASES(Orig, New) \
+ case X86::Orig##128rr: return X86::New##128rr; \
+ case X86::Orig##128rrkz: return X86::New##128rrkz; \
+ case X86::Orig##128rm: return X86::New##128rm; \
+ case X86::Orig##128rmkz: return X86::New##128rmkz; \
+ case X86::Orig##256rr: return X86::New##256rr; \
+ case X86::Orig##256rrkz: return X86::New##256rrkz; \
+ case X86::Orig##256rm: return X86::New##256rm; \
+ case X86::Orig##256rmkz: return X86::New##256rmkz; \
+ case X86::Orig##rr: return X86::New##rr; \
+ case X86::Orig##rrkz: return X86::New##rrkz; \
+ case X86::Orig##rm: return X86::New##rm; \
+ case X86::Orig##rmkz: return X86::New##rmkz;
+
+#define VPERM_CASES_BROADCAST(Orig, New) \
+ VPERM_CASES(Orig, New) \
+ case X86::Orig##128rmb: return X86::New##128rmb; \
+ case X86::Orig##128rmbkz: return X86::New##128rmbkz; \
+ case X86::Orig##256rmb: return X86::New##256rmb; \
+ case X86::Orig##256rmbkz: return X86::New##256rmbkz; \
+ case X86::Orig##rmb: return X86::New##rmb; \
+ case X86::Orig##rmbkz: return X86::New##rmbkz;
+
+ switch (Opcode) {
+ VPERM_CASES(VPERMI2B, VPERMT2B)
+ VPERM_CASES_BROADCAST(VPERMI2D, VPERMT2D)
+ VPERM_CASES_BROADCAST(VPERMI2PD, VPERMT2PD)
+ VPERM_CASES_BROADCAST(VPERMI2PS, VPERMT2PS)
+ VPERM_CASES_BROADCAST(VPERMI2Q, VPERMT2Q)
+ VPERM_CASES(VPERMI2W, VPERMT2W)
+ VPERM_CASES(VPERMT2B, VPERMI2B)
+ VPERM_CASES_BROADCAST(VPERMT2D, VPERMI2D)
+ VPERM_CASES_BROADCAST(VPERMT2PD, VPERMI2PD)
+ VPERM_CASES_BROADCAST(VPERMT2PS, VPERMI2PS)
+ VPERM_CASES_BROADCAST(VPERMT2Q, VPERMI2Q)
+ VPERM_CASES(VPERMT2W, VPERMI2W)
+ }
+
+ llvm_unreachable("Unreachable!");
+#undef VPERM_CASES_BROADCAST
+#undef VPERM_CASES
}
MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
@@ -3352,6 +4370,39 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
OpIdx1, OpIdx2);
}
+ case X86::MOVSDrr:
+ case X86::MOVSSrr:
+ case X86::VMOVSDrr:
+ case X86::VMOVSSrr:{
+ // On SSE41 or later we can commute a MOVSS/MOVSD to a BLENDPS/BLENDPD.
+ if (!Subtarget.hasSSE41())
+ return nullptr;
+
+ unsigned Mask, Opc;
+ switch (MI.getOpcode()) {
+ default: llvm_unreachable("Unreachable!");
+ case X86::MOVSDrr: Opc = X86::BLENDPDrri; Mask = 0x02; break;
+ case X86::MOVSSrr: Opc = X86::BLENDPSrri; Mask = 0x0E; break;
+ case X86::VMOVSDrr: Opc = X86::VBLENDPDrri; Mask = 0x02; break;
+ case X86::VMOVSSrr: Opc = X86::VBLENDPSrri; Mask = 0x0E; break;
+ }
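+    // The masks are chosen for the operand order after the commute performed
+    // below: BLENDPD/BLENDPS take element i from the second source when imm
+    // bit i is set, so 0x02/0x0E keep the low element from the new first
+    // source and the remaining element(s) from the new second source, which
+    // matches the MOVSD/MOVSS semantics.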
+
+    // MOVSD/MOVSS's 2nd operand is in the FR64/FR32 reg class - we need to
+    // copy it over to a VR128 class like the 1st operand to use BLENDPD/BLENDPS.
+ auto &MRI = MI.getParent()->getParent()->getRegInfo();
+ auto VR128RC = MRI.getRegClass(MI.getOperand(1).getReg());
+ unsigned VR128 = MRI.createVirtualRegister(VR128RC);
+ BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(TargetOpcode::COPY),
+ VR128)
+ .addReg(MI.getOperand(2).getReg());
+
+ auto &WorkingMI = cloneIfNew(MI);
+ WorkingMI.setDesc(get(Opc));
+ WorkingMI.getOperand(2).setReg(VR128);
+ WorkingMI.addOperand(MachineOperand::CreateImm(Mask));
+ return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
+ OpIdx1, OpIdx2);
+ }
case X86::PCLMULQDQrr:
case X86::VPCLMULQDQrr:{
// SRC1 64bits = Imm[0] ? SRC1[127:64] : SRC1[63:0]
@@ -3364,12 +4415,24 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
OpIdx1, OpIdx2);
}
+ case X86::CMPSDrr:
+ case X86::CMPSSrr:
case X86::CMPPDrri:
case X86::CMPPSrri:
+ case X86::VCMPSDrr:
+ case X86::VCMPSSrr:
case X86::VCMPPDrri:
case X86::VCMPPSrri:
case X86::VCMPPDYrri:
- case X86::VCMPPSYrri: {
+ case X86::VCMPPSYrri:
+ case X86::VCMPSDZrr:
+ case X86::VCMPSSZrr:
+ case X86::VCMPPDZrri:
+ case X86::VCMPPSZrri:
+ case X86::VCMPPDZ128rri:
+ case X86::VCMPPSZ128rri:
+ case X86::VCMPPDZ256rri:
+ case X86::VCMPPSZ256rri: {
// Float comparison can be safely commuted for
// Ordered/Unordered/Equal/NotEqual tests
unsigned Imm = MI.getOperand(3).getImm() & 0x7;
@@ -3383,6 +4446,37 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
return nullptr;
}
}
+ case X86::VPCMPBZ128rri: case X86::VPCMPUBZ128rri:
+ case X86::VPCMPBZ256rri: case X86::VPCMPUBZ256rri:
+ case X86::VPCMPBZrri: case X86::VPCMPUBZrri:
+ case X86::VPCMPDZ128rri: case X86::VPCMPUDZ128rri:
+ case X86::VPCMPDZ256rri: case X86::VPCMPUDZ256rri:
+ case X86::VPCMPDZrri: case X86::VPCMPUDZrri:
+ case X86::VPCMPQZ128rri: case X86::VPCMPUQZ128rri:
+ case X86::VPCMPQZ256rri: case X86::VPCMPUQZ256rri:
+ case X86::VPCMPQZrri: case X86::VPCMPUQZrri:
+ case X86::VPCMPWZ128rri: case X86::VPCMPUWZ128rri:
+ case X86::VPCMPWZ256rri: case X86::VPCMPUWZ256rri:
+ case X86::VPCMPWZrri: case X86::VPCMPUWZrri: {
+ // Flip comparison mode immediate (if necessary).
+ unsigned Imm = MI.getOperand(3).getImm() & 0x7;
+ switch (Imm) {
+ default: llvm_unreachable("Unreachable!");
+ case 0x01: Imm = 0x06; break; // LT -> NLE
+ case 0x02: Imm = 0x05; break; // LE -> NLT
+ case 0x05: Imm = 0x02; break; // NLT -> LE
+ case 0x06: Imm = 0x01; break; // NLE -> LT
+ case 0x00: // EQ
+ case 0x03: // FALSE
+ case 0x04: // NE
+ case 0x07: // TRUE
+ break;
+ }
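+    // EQ, FALSE, NE and TRUE are symmetric in their operands, so the
+    // immediate needs no change when the sources are commuted.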
+ auto &WorkingMI = cloneIfNew(MI);
+ WorkingMI.getOperand(3).setImm(Imm);
+ return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
+ OpIdx1, OpIdx2);
+ }
case X86::VPCOMBri: case X86::VPCOMUBri:
case X86::VPCOMDri: case X86::VPCOMUDri:
case X86::VPCOMQri: case X86::VPCOMUQri:
@@ -3390,6 +4484,7 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
// Flip comparison mode immediate (if necessary).
unsigned Imm = MI.getOperand(3).getImm() & 0x7;
switch (Imm) {
+ default: llvm_unreachable("Unreachable!");
case 0x00: Imm = 0x02; break; // LT -> GT
case 0x01: Imm = 0x03; break; // LE -> GE
case 0x02: Imm = 0x00; break; // GT -> LT
@@ -3398,7 +4493,6 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
case 0x05: // NE
case 0x06: // FALSE
case 0x07: // TRUE
- default:
break;
}
auto &WorkingMI = cloneIfNew(MI);
@@ -3417,6 +4511,22 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
OpIdx1, OpIdx2);
}
+ case X86::MOVHLPSrr:
+ case X86::UNPCKHPDrr: {
+ if (!Subtarget.hasSSE2())
+ return nullptr;
+
+ unsigned Opc = MI.getOpcode();
+ switch (Opc) {
+ default: llvm_unreachable("Unreachable!");
+ case X86::MOVHLPSrr: Opc = X86::UNPCKHPDrr; break;
+ case X86::UNPCKHPDrr: Opc = X86::MOVHLPSrr; break;
+ }
+ auto &WorkingMI = cloneIfNew(MI);
+ WorkingMI.setDesc(get(Opc));
+ return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
+ OpIdx1, OpIdx2);
+ }
case X86::CMOVB16rr: case X86::CMOVB32rr: case X86::CMOVB64rr:
case X86::CMOVAE16rr: case X86::CMOVAE32rr: case X86::CMOVAE64rr:
case X86::CMOVE16rr: case X86::CMOVE32rr: case X86::CMOVE64rr:
@@ -3490,9 +4600,44 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
OpIdx1, OpIdx2);
}
- default:
- if (isFMA3(MI.getOpcode())) {
- unsigned Opc = getFMA3OpcodeToCommuteOperands(MI, OpIdx1, OpIdx2);
+ case X86::VPTERNLOGDZrri: case X86::VPTERNLOGDZrmi:
+ case X86::VPTERNLOGDZ128rri: case X86::VPTERNLOGDZ128rmi:
+ case X86::VPTERNLOGDZ256rri: case X86::VPTERNLOGDZ256rmi:
+ case X86::VPTERNLOGQZrri: case X86::VPTERNLOGQZrmi:
+ case X86::VPTERNLOGQZ128rri: case X86::VPTERNLOGQZ128rmi:
+ case X86::VPTERNLOGQZ256rri: case X86::VPTERNLOGQZ256rmi:
+ case X86::VPTERNLOGDZrrik: case X86::VPTERNLOGDZrmik:
+ case X86::VPTERNLOGDZ128rrik: case X86::VPTERNLOGDZ128rmik:
+ case X86::VPTERNLOGDZ256rrik: case X86::VPTERNLOGDZ256rmik:
+ case X86::VPTERNLOGQZrrik: case X86::VPTERNLOGQZrmik:
+ case X86::VPTERNLOGQZ128rrik: case X86::VPTERNLOGQZ128rmik:
+ case X86::VPTERNLOGQZ256rrik: case X86::VPTERNLOGQZ256rmik:
+ case X86::VPTERNLOGDZrrikz: case X86::VPTERNLOGDZrmikz:
+ case X86::VPTERNLOGDZ128rrikz: case X86::VPTERNLOGDZ128rmikz:
+ case X86::VPTERNLOGDZ256rrikz: case X86::VPTERNLOGDZ256rmikz:
+ case X86::VPTERNLOGQZrrikz: case X86::VPTERNLOGQZrmikz:
+ case X86::VPTERNLOGQZ128rrikz: case X86::VPTERNLOGQZ128rmikz:
+ case X86::VPTERNLOGQZ256rrikz: case X86::VPTERNLOGQZ256rmikz: {
+ auto &WorkingMI = cloneIfNew(MI);
+ if (!commuteVPTERNLOG(WorkingMI, OpIdx1, OpIdx2))
+ return nullptr;
+ return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
+ OpIdx1, OpIdx2);
+ }
+ default: {
+ if (isCommutableVPERMV3Instruction(MI.getOpcode())) {
+ unsigned Opc = getCommutedVPERMV3Opcode(MI.getOpcode());
+ auto &WorkingMI = cloneIfNew(MI);
+ WorkingMI.setDesc(get(Opc));
+ return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
+ OpIdx1, OpIdx2);
+ }
+
+ const X86InstrFMA3Group *FMA3Group =
+ X86InstrFMA3Info::getFMA3Group(MI.getOpcode());
+ if (FMA3Group) {
+ unsigned Opc =
+ getFMA3OpcodeToCommuteOperands(MI, OpIdx1, OpIdx2, *FMA3Group);
if (Opc == 0)
return nullptr;
auto &WorkingMI = cloneIfNew(MI);
@@ -3503,22 +4648,54 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2);
}
+ }
}
-bool X86InstrInfo::findFMA3CommutedOpIndices(MachineInstr &MI,
- unsigned &SrcOpIdx1,
- unsigned &SrcOpIdx2) const {
+bool X86InstrInfo::findFMA3CommutedOpIndices(
+ const MachineInstr &MI, unsigned &SrcOpIdx1, unsigned &SrcOpIdx2,
+ const X86InstrFMA3Group &FMA3Group) const {
- unsigned RegOpsNum = isMem(MI, 3) ? 2 : 3;
+ if (!findThreeSrcCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2))
+ return false;
+
+  // Check if we can adjust the opcode to preserve the semantics when
+  // commuting the register operands.
+ return getFMA3OpcodeToCommuteOperands(MI, SrcOpIdx1, SrcOpIdx2, FMA3Group) != 0;
+}
+
+bool X86InstrInfo::findThreeSrcCommutedOpIndices(const MachineInstr &MI,
+ unsigned &SrcOpIdx1,
+ unsigned &SrcOpIdx2) const {
+ uint64_t TSFlags = MI.getDesc().TSFlags;
+
+ unsigned FirstCommutableVecOp = 1;
+ unsigned LastCommutableVecOp = 3;
+ unsigned KMaskOp = 0;
+ if (X86II::isKMasked(TSFlags)) {
+ // The k-mask operand has index = 2 for masked and zero-masked operations.
+ KMaskOp = 2;
+
+ // The operand with index = 1 is used as a source for those elements for
+ // which the corresponding bit in the k-mask is set to 0.
+ if (X86II::isKMergeMasked(TSFlags))
+ FirstCommutableVecOp = 3;
+
+ LastCommutableVecOp++;
+ }
+
+ if (isMem(MI, LastCommutableVecOp))
+ LastCommutableVecOp--;
// Only the operands in the [FirstCommutableVecOp, LastCommutableVecOp] range
// are commutable.
// Also, the value 'CommuteAnyOperandIndex' is valid here as it means
// that the operand is not specified/fixed.
if (SrcOpIdx1 != CommuteAnyOperandIndex &&
- (SrcOpIdx1 < 1 || SrcOpIdx1 > RegOpsNum))
+ (SrcOpIdx1 < FirstCommutableVecOp || SrcOpIdx1 > LastCommutableVecOp ||
+ SrcOpIdx1 == KMaskOp))
return false;
if (SrcOpIdx2 != CommuteAnyOperandIndex &&
- (SrcOpIdx2 < 1 || SrcOpIdx2 > RegOpsNum))
+ (SrcOpIdx2 < FirstCommutableVecOp || SrcOpIdx2 > LastCommutableVecOp ||
+ SrcOpIdx2 == KMaskOp))
return false;
// Look for two different register operands assumed to be commutable
@@ -3533,7 +4710,7 @@ bool X86InstrInfo::findFMA3CommutedOpIndices(MachineInstr &MI,
if (SrcOpIdx1 == SrcOpIdx2)
// Both of operands are not fixed. By default set one of commutable
// operands to the last register operand of the instruction.
- CommutableOpIdx2 = RegOpsNum;
+ CommutableOpIdx2 = LastCommutableVecOp;
else if (SrcOpIdx2 == CommuteAnyOperandIndex)
// Only one of operands is not fixed.
CommutableOpIdx2 = SrcOpIdx1;
@@ -3541,7 +4718,12 @@ bool X86InstrInfo::findFMA3CommutedOpIndices(MachineInstr &MI,
// CommutableOpIdx2 is well defined now. Let's choose another commutable
// operand and assign its index to CommutableOpIdx1.
unsigned Op2Reg = MI.getOperand(CommutableOpIdx2).getReg();
- for (CommutableOpIdx1 = RegOpsNum; CommutableOpIdx1 > 0; CommutableOpIdx1--) {
+ for (CommutableOpIdx1 = LastCommutableVecOp;
+ CommutableOpIdx1 >= FirstCommutableVecOp; CommutableOpIdx1--) {
+ // Just ignore and skip the k-mask operand.
+ if (CommutableOpIdx1 == KMaskOp)
+ continue;
+
// The commuted operands must have different registers.
// Otherwise, the commute transformation does not change anything and
// is useless then.
@@ -3550,7 +4732,7 @@ bool X86InstrInfo::findFMA3CommutedOpIndices(MachineInstr &MI,
}
// No appropriate commutable operands were found.
- if (CommutableOpIdx1 == 0)
+ if (CommutableOpIdx1 < FirstCommutableVecOp)
return false;
// Assign the found pair of commutable indices to SrcOpIdx1 and SrcOpidx2
@@ -3560,208 +4742,34 @@ bool X86InstrInfo::findFMA3CommutedOpIndices(MachineInstr &MI,
return false;
}
- // Check if we can adjust the opcode to preserve the semantics when
- // commute the register operands.
- return getFMA3OpcodeToCommuteOperands(MI, SrcOpIdx1, SrcOpIdx2) != 0;
-}
-
-unsigned X86InstrInfo::getFMA3OpcodeToCommuteOperands(
- MachineInstr &MI, unsigned SrcOpIdx1, unsigned SrcOpIdx2) const {
- unsigned Opc = MI.getOpcode();
-
- // Define the array that holds FMA opcodes in groups
- // of 3 opcodes(132, 213, 231) in each group.
- static const uint16_t RegularOpcodeGroups[][3] = {
- { X86::VFMADDSSr132r, X86::VFMADDSSr213r, X86::VFMADDSSr231r },
- { X86::VFMADDSDr132r, X86::VFMADDSDr213r, X86::VFMADDSDr231r },
- { X86::VFMADDPSr132r, X86::VFMADDPSr213r, X86::VFMADDPSr231r },
- { X86::VFMADDPDr132r, X86::VFMADDPDr213r, X86::VFMADDPDr231r },
- { X86::VFMADDPSr132rY, X86::VFMADDPSr213rY, X86::VFMADDPSr231rY },
- { X86::VFMADDPDr132rY, X86::VFMADDPDr213rY, X86::VFMADDPDr231rY },
- { X86::VFMADDSSr132m, X86::VFMADDSSr213m, X86::VFMADDSSr231m },
- { X86::VFMADDSDr132m, X86::VFMADDSDr213m, X86::VFMADDSDr231m },
- { X86::VFMADDPSr132m, X86::VFMADDPSr213m, X86::VFMADDPSr231m },
- { X86::VFMADDPDr132m, X86::VFMADDPDr213m, X86::VFMADDPDr231m },
- { X86::VFMADDPSr132mY, X86::VFMADDPSr213mY, X86::VFMADDPSr231mY },
- { X86::VFMADDPDr132mY, X86::VFMADDPDr213mY, X86::VFMADDPDr231mY },
-
- { X86::VFMSUBSSr132r, X86::VFMSUBSSr213r, X86::VFMSUBSSr231r },
- { X86::VFMSUBSDr132r, X86::VFMSUBSDr213r, X86::VFMSUBSDr231r },
- { X86::VFMSUBPSr132r, X86::VFMSUBPSr213r, X86::VFMSUBPSr231r },
- { X86::VFMSUBPDr132r, X86::VFMSUBPDr213r, X86::VFMSUBPDr231r },
- { X86::VFMSUBPSr132rY, X86::VFMSUBPSr213rY, X86::VFMSUBPSr231rY },
- { X86::VFMSUBPDr132rY, X86::VFMSUBPDr213rY, X86::VFMSUBPDr231rY },
- { X86::VFMSUBSSr132m, X86::VFMSUBSSr213m, X86::VFMSUBSSr231m },
- { X86::VFMSUBSDr132m, X86::VFMSUBSDr213m, X86::VFMSUBSDr231m },
- { X86::VFMSUBPSr132m, X86::VFMSUBPSr213m, X86::VFMSUBPSr231m },
- { X86::VFMSUBPDr132m, X86::VFMSUBPDr213m, X86::VFMSUBPDr231m },
- { X86::VFMSUBPSr132mY, X86::VFMSUBPSr213mY, X86::VFMSUBPSr231mY },
- { X86::VFMSUBPDr132mY, X86::VFMSUBPDr213mY, X86::VFMSUBPDr231mY },
-
- { X86::VFNMADDSSr132r, X86::VFNMADDSSr213r, X86::VFNMADDSSr231r },
- { X86::VFNMADDSDr132r, X86::VFNMADDSDr213r, X86::VFNMADDSDr231r },
- { X86::VFNMADDPSr132r, X86::VFNMADDPSr213r, X86::VFNMADDPSr231r },
- { X86::VFNMADDPDr132r, X86::VFNMADDPDr213r, X86::VFNMADDPDr231r },
- { X86::VFNMADDPSr132rY, X86::VFNMADDPSr213rY, X86::VFNMADDPSr231rY },
- { X86::VFNMADDPDr132rY, X86::VFNMADDPDr213rY, X86::VFNMADDPDr231rY },
- { X86::VFNMADDSSr132m, X86::VFNMADDSSr213m, X86::VFNMADDSSr231m },
- { X86::VFNMADDSDr132m, X86::VFNMADDSDr213m, X86::VFNMADDSDr231m },
- { X86::VFNMADDPSr132m, X86::VFNMADDPSr213m, X86::VFNMADDPSr231m },
- { X86::VFNMADDPDr132m, X86::VFNMADDPDr213m, X86::VFNMADDPDr231m },
- { X86::VFNMADDPSr132mY, X86::VFNMADDPSr213mY, X86::VFNMADDPSr231mY },
- { X86::VFNMADDPDr132mY, X86::VFNMADDPDr213mY, X86::VFNMADDPDr231mY },
-
- { X86::VFNMSUBSSr132r, X86::VFNMSUBSSr213r, X86::VFNMSUBSSr231r },
- { X86::VFNMSUBSDr132r, X86::VFNMSUBSDr213r, X86::VFNMSUBSDr231r },
- { X86::VFNMSUBPSr132r, X86::VFNMSUBPSr213r, X86::VFNMSUBPSr231r },
- { X86::VFNMSUBPDr132r, X86::VFNMSUBPDr213r, X86::VFNMSUBPDr231r },
- { X86::VFNMSUBPSr132rY, X86::VFNMSUBPSr213rY, X86::VFNMSUBPSr231rY },
- { X86::VFNMSUBPDr132rY, X86::VFNMSUBPDr213rY, X86::VFNMSUBPDr231rY },
- { X86::VFNMSUBSSr132m, X86::VFNMSUBSSr213m, X86::VFNMSUBSSr231m },
- { X86::VFNMSUBSDr132m, X86::VFNMSUBSDr213m, X86::VFNMSUBSDr231m },
- { X86::VFNMSUBPSr132m, X86::VFNMSUBPSr213m, X86::VFNMSUBPSr231m },
- { X86::VFNMSUBPDr132m, X86::VFNMSUBPDr213m, X86::VFNMSUBPDr231m },
- { X86::VFNMSUBPSr132mY, X86::VFNMSUBPSr213mY, X86::VFNMSUBPSr231mY },
- { X86::VFNMSUBPDr132mY, X86::VFNMSUBPDr213mY, X86::VFNMSUBPDr231mY },
-
- { X86::VFMADDSUBPSr132r, X86::VFMADDSUBPSr213r, X86::VFMADDSUBPSr231r },
- { X86::VFMADDSUBPDr132r, X86::VFMADDSUBPDr213r, X86::VFMADDSUBPDr231r },
- { X86::VFMADDSUBPSr132rY, X86::VFMADDSUBPSr213rY, X86::VFMADDSUBPSr231rY },
- { X86::VFMADDSUBPDr132rY, X86::VFMADDSUBPDr213rY, X86::VFMADDSUBPDr231rY },
- { X86::VFMADDSUBPSr132m, X86::VFMADDSUBPSr213m, X86::VFMADDSUBPSr231m },
- { X86::VFMADDSUBPDr132m, X86::VFMADDSUBPDr213m, X86::VFMADDSUBPDr231m },
- { X86::VFMADDSUBPSr132mY, X86::VFMADDSUBPSr213mY, X86::VFMADDSUBPSr231mY },
- { X86::VFMADDSUBPDr132mY, X86::VFMADDSUBPDr213mY, X86::VFMADDSUBPDr231mY },
-
- { X86::VFMSUBADDPSr132r, X86::VFMSUBADDPSr213r, X86::VFMSUBADDPSr231r },
- { X86::VFMSUBADDPDr132r, X86::VFMSUBADDPDr213r, X86::VFMSUBADDPDr231r },
- { X86::VFMSUBADDPSr132rY, X86::VFMSUBADDPSr213rY, X86::VFMSUBADDPSr231rY },
- { X86::VFMSUBADDPDr132rY, X86::VFMSUBADDPDr213rY, X86::VFMSUBADDPDr231rY },
- { X86::VFMSUBADDPSr132m, X86::VFMSUBADDPSr213m, X86::VFMSUBADDPSr231m },
- { X86::VFMSUBADDPDr132m, X86::VFMSUBADDPDr213m, X86::VFMSUBADDPDr231m },
- { X86::VFMSUBADDPSr132mY, X86::VFMSUBADDPSr213mY, X86::VFMSUBADDPSr231mY },
- { X86::VFMSUBADDPDr132mY, X86::VFMSUBADDPDr213mY, X86::VFMSUBADDPDr231mY }
- };
-
- // Define the array that holds FMA*_Int opcodes in groups
- // of 3 opcodes(132, 213, 231) in each group.
- static const uint16_t IntrinOpcodeGroups[][3] = {
- { X86::VFMADDSSr132r_Int, X86::VFMADDSSr213r_Int, X86::VFMADDSSr231r_Int },
- { X86::VFMADDSDr132r_Int, X86::VFMADDSDr213r_Int, X86::VFMADDSDr231r_Int },
- { X86::VFMADDSSr132m_Int, X86::VFMADDSSr213m_Int, X86::VFMADDSSr231m_Int },
- { X86::VFMADDSDr132m_Int, X86::VFMADDSDr213m_Int, X86::VFMADDSDr231m_Int },
-
- { X86::VFMSUBSSr132r_Int, X86::VFMSUBSSr213r_Int, X86::VFMSUBSSr231r_Int },
- { X86::VFMSUBSDr132r_Int, X86::VFMSUBSDr213r_Int, X86::VFMSUBSDr231r_Int },
- { X86::VFMSUBSSr132m_Int, X86::VFMSUBSSr213m_Int, X86::VFMSUBSSr231m_Int },
- { X86::VFMSUBSDr132m_Int, X86::VFMSUBSDr213m_Int, X86::VFMSUBSDr231m_Int },
-
- { X86::VFNMADDSSr132r_Int, X86::VFNMADDSSr213r_Int, X86::VFNMADDSSr231r_Int },
- { X86::VFNMADDSDr132r_Int, X86::VFNMADDSDr213r_Int, X86::VFNMADDSDr231r_Int },
- { X86::VFNMADDSSr132m_Int, X86::VFNMADDSSr213m_Int, X86::VFNMADDSSr231m_Int },
- { X86::VFNMADDSDr132m_Int, X86::VFNMADDSDr213m_Int, X86::VFNMADDSDr231m_Int },
-
- { X86::VFNMSUBSSr132r_Int, X86::VFNMSUBSSr213r_Int, X86::VFNMSUBSSr231r_Int },
- { X86::VFNMSUBSDr132r_Int, X86::VFNMSUBSDr213r_Int, X86::VFNMSUBSDr231r_Int },
- { X86::VFNMSUBSSr132m_Int, X86::VFNMSUBSSr213m_Int, X86::VFNMSUBSSr231m_Int },
- { X86::VFNMSUBSDr132m_Int, X86::VFNMSUBSDr213m_Int, X86::VFNMSUBSDr231m_Int },
- };
-
- const unsigned Form132Index = 0;
- const unsigned Form213Index = 1;
- const unsigned Form231Index = 2;
- const unsigned FormsNum = 3;
-
- bool IsIntrinOpcode;
- isFMA3(Opc, &IsIntrinOpcode);
-
- size_t GroupsNum;
- const uint16_t (*OpcodeGroups)[3];
- if (IsIntrinOpcode) {
- GroupsNum = array_lengthof(IntrinOpcodeGroups);
- OpcodeGroups = IntrinOpcodeGroups;
- } else {
- GroupsNum = array_lengthof(RegularOpcodeGroups);
- OpcodeGroups = RegularOpcodeGroups;
- }
-
- const uint16_t *FoundOpcodesGroup = nullptr;
- size_t FormIndex;
-
- // Look for the input opcode in the corresponding opcodes table.
- for (size_t GroupIndex = 0; GroupIndex < GroupsNum && !FoundOpcodesGroup;
- ++GroupIndex) {
- for (FormIndex = 0; FormIndex < FormsNum; ++FormIndex) {
- if (OpcodeGroups[GroupIndex][FormIndex] == Opc) {
- FoundOpcodesGroup = OpcodeGroups[GroupIndex];
- break;
- }
- }
- }
-
- // The input opcode does not match with any of the opcodes from the tables.
- // The unsupported FMA opcode must be added to one of the two opcode groups
- // defined above.
- assert(FoundOpcodesGroup != nullptr && "Unexpected FMA3 opcode");
-
- // Put the lowest index to SrcOpIdx1 to simplify the checks below.
- if (SrcOpIdx1 > SrcOpIdx2)
- std::swap(SrcOpIdx1, SrcOpIdx2);
-
- // TODO: Commuting the 1st operand of FMA*_Int requires some additional
- // analysis. The commute optimization is legal only if all users of FMA*_Int
- // use only the lowest element of the FMA*_Int instruction. Such analysis are
- // not implemented yet. So, just return 0 in that case.
- // When such analysis are available this place will be the right place for
- // calling it.
- if (IsIntrinOpcode && SrcOpIdx1 == 1)
- return 0;
-
- unsigned Case;
- if (SrcOpIdx1 == 1 && SrcOpIdx2 == 2)
- Case = 0;
- else if (SrcOpIdx1 == 1 && SrcOpIdx2 == 3)
- Case = 1;
- else if (SrcOpIdx1 == 2 && SrcOpIdx2 == 3)
- Case = 2;
- else
- return 0;
-
- // Define the FMA forms mapping array that helps to map input FMA form
- // to output FMA form to preserve the operation semantics after
- // commuting the operands.
- static const unsigned FormMapping[][3] = {
- // 0: SrcOpIdx1 == 1 && SrcOpIdx2 == 2;
- // FMA132 A, C, b; ==> FMA231 C, A, b;
- // FMA213 B, A, c; ==> FMA213 A, B, c;
- // FMA231 C, A, b; ==> FMA132 A, C, b;
- { Form231Index, Form213Index, Form132Index },
- // 1: SrcOpIdx1 == 1 && SrcOpIdx2 == 3;
- // FMA132 A, c, B; ==> FMA132 B, c, A;
- // FMA213 B, a, C; ==> FMA231 C, a, B;
- // FMA231 C, a, B; ==> FMA213 B, a, C;
- { Form132Index, Form231Index, Form213Index },
- // 2: SrcOpIdx1 == 2 && SrcOpIdx2 == 3;
- // FMA132 a, C, B; ==> FMA213 a, B, C;
- // FMA213 b, A, C; ==> FMA132 b, C, A;
- // FMA231 c, A, B; ==> FMA231 c, B, A;
- { Form213Index, Form132Index, Form231Index }
- };
-
- // Everything is ready, just adjust the FMA opcode and return it.
- FormIndex = FormMapping[Case][FormIndex];
- return FoundOpcodesGroup[FormIndex];
+ return true;
}
bool X86InstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1,
unsigned &SrcOpIdx2) const {
+ const MCInstrDesc &Desc = MI.getDesc();
+ if (!Desc.isCommutable())
+ return false;
+
switch (MI.getOpcode()) {
+ case X86::CMPSDrr:
+ case X86::CMPSSrr:
case X86::CMPPDrri:
case X86::CMPPSrri:
+ case X86::VCMPSDrr:
+ case X86::VCMPSSrr:
case X86::VCMPPDrri:
case X86::VCMPPSrri:
case X86::VCMPPDYrri:
- case X86::VCMPPSYrri: {
+ case X86::VCMPPSYrri:
+ case X86::VCMPSDZrr:
+ case X86::VCMPSSZrr:
+ case X86::VCMPPDZrri:
+ case X86::VCMPPSZrri:
+ case X86::VCMPPDZ128rri:
+ case X86::VCMPPSZ128rri:
+ case X86::VCMPPDZ256rri:
+ case X86::VCMPPSZ256rri: {
// Float comparison can be safely commuted for
// Ordered/Unordered/Equal/NotEqual tests
unsigned Imm = MI.getOperand(3).getImm() & 0x7;
@@ -3776,9 +4784,73 @@ bool X86InstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1,
}
return false;
}
+ case X86::MOVSDrr:
+ case X86::MOVSSrr:
+ case X86::VMOVSDrr:
+ case X86::VMOVSSrr: {
+ if (Subtarget.hasSSE41())
+ return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
+ return false;
+ }
+ case X86::VPTERNLOGDZrri: case X86::VPTERNLOGDZrmi:
+ case X86::VPTERNLOGDZ128rri: case X86::VPTERNLOGDZ128rmi:
+ case X86::VPTERNLOGDZ256rri: case X86::VPTERNLOGDZ256rmi:
+ case X86::VPTERNLOGQZrri: case X86::VPTERNLOGQZrmi:
+ case X86::VPTERNLOGQZ128rri: case X86::VPTERNLOGQZ128rmi:
+ case X86::VPTERNLOGQZ256rri: case X86::VPTERNLOGQZ256rmi:
+ case X86::VPTERNLOGDZrrik: case X86::VPTERNLOGDZrmik:
+ case X86::VPTERNLOGDZ128rrik: case X86::VPTERNLOGDZ128rmik:
+ case X86::VPTERNLOGDZ256rrik: case X86::VPTERNLOGDZ256rmik:
+ case X86::VPTERNLOGQZrrik: case X86::VPTERNLOGQZrmik:
+ case X86::VPTERNLOGQZ128rrik: case X86::VPTERNLOGQZ128rmik:
+ case X86::VPTERNLOGQZ256rrik: case X86::VPTERNLOGQZ256rmik:
+ case X86::VPTERNLOGDZrrikz: case X86::VPTERNLOGDZrmikz:
+ case X86::VPTERNLOGDZ128rrikz: case X86::VPTERNLOGDZ128rmikz:
+ case X86::VPTERNLOGDZ256rrikz: case X86::VPTERNLOGDZ256rmikz:
+ case X86::VPTERNLOGQZrrikz: case X86::VPTERNLOGQZrmikz:
+ case X86::VPTERNLOGQZ128rrikz: case X86::VPTERNLOGQZ128rmikz:
+ case X86::VPTERNLOGQZ256rrikz: case X86::VPTERNLOGQZ256rmikz:
+ return findThreeSrcCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
default:
- if (isFMA3(MI.getOpcode()))
- return findFMA3CommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
+ const X86InstrFMA3Group *FMA3Group =
+ X86InstrFMA3Info::getFMA3Group(MI.getOpcode());
+ if (FMA3Group)
+ return findFMA3CommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2, *FMA3Group);
+
+      // Handle masked instructions, since we need to skip over the mask input
+ // and the preserved input.
+ if (Desc.TSFlags & X86II::EVEX_K) {
+ // First assume that the first input is the mask operand and skip past it.
+ unsigned CommutableOpIdx1 = Desc.getNumDefs() + 1;
+ unsigned CommutableOpIdx2 = Desc.getNumDefs() + 2;
+ // Check if the first input is tied. If there isn't one then we only
+ // need to skip the mask operand which we did above.
+ if ((MI.getDesc().getOperandConstraint(Desc.getNumDefs(),
+ MCOI::TIED_TO) != -1)) {
+        // If this is a zero masking instruction with a tied operand, we need to
+ // move the first index back to the first input since this must
+ // be a 3 input instruction and we want the first two non-mask inputs.
+ // Otherwise this is a 2 input instruction with a preserved input and
+ // mask, so we need to move the indices to skip one more input.
+ if (Desc.TSFlags & X86II::EVEX_Z)
+ --CommutableOpIdx1;
+ else {
+ ++CommutableOpIdx1;
+ ++CommutableOpIdx2;
+ }
+ }
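+      // The resulting index pairs (assuming a single def) are:
+      //   zero-masked, untied src:  (dst, k-mask, src1, src2)        -> 2, 3
+      //   zero-masked, tied src:    (dst, src1, k-mask, src2, src3)  -> 1, 3
+      //   merge-masked (pass-thru): (dst, src0, k-mask, src1, src2)  -> 3, 4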
+
+ if (!fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2,
+ CommutableOpIdx1, CommutableOpIdx2))
+ return false;
+
+ if (!MI.getOperand(SrcOpIdx1).isReg() ||
+ !MI.getOperand(SrcOpIdx2).isReg())
+ // No idea.
+ return false;
+ return true;
+ }
+
return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
}
return false;
@@ -4296,7 +5368,10 @@ bool X86InstrInfo::analyzeBranchPredicate(MachineBasicBlock &MBB,
return true;
}
-unsigned X86InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
+unsigned X86InstrInfo::removeBranch(MachineBasicBlock &MBB,
+ int *BytesRemoved) const {
+ assert(!BytesRemoved && "code size not handled");
+
MachineBasicBlock::iterator I = MBB.end();
unsigned Count = 0;
@@ -4316,15 +5391,17 @@ unsigned X86InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
return Count;
}
-unsigned X86InstrInfo::InsertBranch(MachineBasicBlock &MBB,
+unsigned X86InstrInfo::insertBranch(MachineBasicBlock &MBB,
MachineBasicBlock *TBB,
MachineBasicBlock *FBB,
ArrayRef<MachineOperand> Cond,
- const DebugLoc &DL) const {
+ const DebugLoc &DL,
+ int *BytesAdded) const {
// Shouldn't be a fall through.
- assert(TBB && "InsertBranch must not be told to insert a fallthrough");
+ assert(TBB && "insertBranch must not be told to insert a fallthrough");
assert((Cond.size() == 1 || Cond.size() == 0) &&
"X86 branch conditions have one component!");
+ assert(!BytesAdded && "code size not handled");
if (Cond.empty()) {
// Unconditional branch?
@@ -4430,16 +5507,63 @@ static bool isHReg(unsigned Reg) {
}
// Try and copy between VR128/VR64 and GR64 registers.
-static unsigned CopyToFromAsymmetricReg(unsigned DestReg, unsigned SrcReg,
+static unsigned CopyToFromAsymmetricReg(unsigned &DestReg, unsigned &SrcReg,
const X86Subtarget &Subtarget) {
+ bool HasAVX = Subtarget.hasAVX();
+ bool HasAVX512 = Subtarget.hasAVX512();
+
+ // SrcReg(MaskReg) -> DestReg(GR64)
+ // SrcReg(MaskReg) -> DestReg(GR32)
+ // SrcReg(MaskReg) -> DestReg(GR16)
+ // SrcReg(MaskReg) -> DestReg(GR8)
+
+  // All KMASK RegClasses hold the same k registers, so any one of them can be
+  // used for the test.
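+  // KMOVW is baseline AVX-512F, KMOVB needs DQI, and KMOVD/KMOVQ need BWI,
+  // which is what the feature checks below select between; the GPR operand of
+  // KMOVB/KMOVW/KMOVD is always a 32-bit register, hence the widening of the
+  // GR8/GR16 registers.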
+ if (X86::VK16RegClass.contains(SrcReg)) {
+ if (X86::GR64RegClass.contains(DestReg)) {
+ assert(Subtarget.hasBWI());
+ return X86::KMOVQrk;
+ }
+ if (X86::GR32RegClass.contains(DestReg))
+ return Subtarget.hasBWI() ? X86::KMOVDrk : X86::KMOVWrk;
+ if (X86::GR16RegClass.contains(DestReg)) {
+ DestReg = getX86SubSuperRegister(DestReg, 32);
+ return X86::KMOVWrk;
+ }
+ if (X86::GR8RegClass.contains(DestReg)) {
+ DestReg = getX86SubSuperRegister(DestReg, 32);
+ return Subtarget.hasDQI() ? X86::KMOVBrk : X86::KMOVWrk;
+ }
+ }
+
+ // SrcReg(GR64) -> DestReg(MaskReg)
+ // SrcReg(GR32) -> DestReg(MaskReg)
+ // SrcReg(GR16) -> DestReg(MaskReg)
+ // SrcReg(GR8) -> DestReg(MaskReg)
+
+  // All KMASK RegClasses hold the same k registers, so any one of them can be
+  // used for the test.
+ if (X86::VK16RegClass.contains(DestReg)) {
+ if (X86::GR64RegClass.contains(SrcReg)) {
+ assert(Subtarget.hasBWI());
+ return X86::KMOVQkr;
+ }
+ if (X86::GR32RegClass.contains(SrcReg))
+ return Subtarget.hasBWI() ? X86::KMOVDkr : X86::KMOVWkr;
+ if (X86::GR16RegClass.contains(SrcReg)) {
+ SrcReg = getX86SubSuperRegister(SrcReg, 32);
+ return X86::KMOVWkr;
+ }
+ if (X86::GR8RegClass.contains(SrcReg)) {
+ SrcReg = getX86SubSuperRegister(SrcReg, 32);
+ return Subtarget.hasDQI() ? X86::KMOVBkr : X86::KMOVWkr;
+ }
+ }
+
// SrcReg(VR128) -> DestReg(GR64)
// SrcReg(VR64) -> DestReg(GR64)
// SrcReg(GR64) -> DestReg(VR128)
// SrcReg(GR64) -> DestReg(VR64)
- bool HasAVX = Subtarget.hasAVX();
- bool HasAVX512 = Subtarget.hasAVX512();
if (X86::GR64RegClass.contains(DestReg)) {
if (X86::VR128XRegClass.contains(SrcReg))
// Copy from a VR128 register to a GR64 register.
@@ -4479,96 +5603,13 @@ static unsigned CopyToFromAsymmetricReg(unsigned DestReg, unsigned SrcReg,
return 0;
}
-static bool isMaskRegClass(const TargetRegisterClass *RC) {
- // All KMASK RegClasses hold the same k registers, can be tested against anyone.
- return X86::VK16RegClass.hasSubClassEq(RC);
-}
-
-static bool MaskRegClassContains(unsigned Reg) {
- // All KMASK RegClasses hold the same k registers, can be tested against anyone.
- return X86::VK16RegClass.contains(Reg);
-}
-
-static bool GRRegClassContains(unsigned Reg) {
- return X86::GR64RegClass.contains(Reg) ||
- X86::GR32RegClass.contains(Reg) ||
- X86::GR16RegClass.contains(Reg) ||
- X86::GR8RegClass.contains(Reg);
-}
-static
-unsigned copyPhysRegOpcode_AVX512_DQ(unsigned& DestReg, unsigned& SrcReg) {
- if (MaskRegClassContains(SrcReg) && X86::GR8RegClass.contains(DestReg)) {
- DestReg = getX86SubSuperRegister(DestReg, 32);
- return X86::KMOVBrk;
- }
- if (MaskRegClassContains(DestReg) && X86::GR8RegClass.contains(SrcReg)) {
- SrcReg = getX86SubSuperRegister(SrcReg, 32);
- return X86::KMOVBkr;
- }
- return 0;
-}
-
-static
-unsigned copyPhysRegOpcode_AVX512_BW(unsigned& DestReg, unsigned& SrcReg) {
- if (MaskRegClassContains(SrcReg) && MaskRegClassContains(DestReg))
- return X86::KMOVQkk;
- if (MaskRegClassContains(SrcReg) && X86::GR32RegClass.contains(DestReg))
- return X86::KMOVDrk;
- if (MaskRegClassContains(SrcReg) && X86::GR64RegClass.contains(DestReg))
- return X86::KMOVQrk;
- if (MaskRegClassContains(DestReg) && X86::GR32RegClass.contains(SrcReg))
- return X86::KMOVDkr;
- if (MaskRegClassContains(DestReg) && X86::GR64RegClass.contains(SrcReg))
- return X86::KMOVQkr;
- return 0;
-}
-
-static
-unsigned copyPhysRegOpcode_AVX512(unsigned& DestReg, unsigned& SrcReg,
- const X86Subtarget &Subtarget)
-{
- if (Subtarget.hasDQI())
- if (auto Opc = copyPhysRegOpcode_AVX512_DQ(DestReg, SrcReg))
- return Opc;
- if (Subtarget.hasBWI())
- if (auto Opc = copyPhysRegOpcode_AVX512_BW(DestReg, SrcReg))
- return Opc;
- if (X86::VR128XRegClass.contains(DestReg, SrcReg)) {
- if (Subtarget.hasVLX())
- return X86::VMOVAPSZ128rr;
- DestReg = get512BitSuperRegister(DestReg);
- SrcReg = get512BitSuperRegister(SrcReg);
- return X86::VMOVAPSZrr;
- }
- if (X86::VR256XRegClass.contains(DestReg, SrcReg)) {
- if (Subtarget.hasVLX())
- return X86::VMOVAPSZ256rr;
- DestReg = get512BitSuperRegister(DestReg);
- SrcReg = get512BitSuperRegister(SrcReg);
- return X86::VMOVAPSZrr;
- }
- if (X86::VR512RegClass.contains(DestReg, SrcReg))
- return X86::VMOVAPSZrr;
- if (MaskRegClassContains(DestReg) && MaskRegClassContains(SrcReg))
- return X86::KMOVWkk;
- if (MaskRegClassContains(DestReg) && GRRegClassContains(SrcReg)) {
- SrcReg = getX86SubSuperRegister(SrcReg, 32);
- return X86::KMOVWkr;
- }
- if (GRRegClassContains(DestReg) && MaskRegClassContains(SrcReg)) {
- DestReg = getX86SubSuperRegister(DestReg, 32);
- return X86::KMOVWrk;
- }
- return 0;
-}
-
void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
const DebugLoc &DL, unsigned DestReg,
unsigned SrcReg, bool KillSrc) const {
// First deal with the normal symmetric copies.
bool HasAVX = Subtarget.hasAVX();
- bool HasAVX512 = Subtarget.hasAVX512();
+ bool HasVLX = Subtarget.hasVLX();
unsigned Opc = 0;
if (X86::GR64RegClass.contains(DestReg, SrcReg))
Opc = X86::MOV64rr;
@@ -4590,12 +5631,41 @@ void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
}
else if (X86::VR64RegClass.contains(DestReg, SrcReg))
Opc = X86::MMX_MOVQ64rr;
- else if (HasAVX512)
- Opc = copyPhysRegOpcode_AVX512(DestReg, SrcReg, Subtarget);
- else if (X86::VR128RegClass.contains(DestReg, SrcReg))
- Opc = HasAVX ? X86::VMOVAPSrr : X86::MOVAPSrr;
- else if (X86::VR256RegClass.contains(DestReg, SrcReg))
- Opc = X86::VMOVAPSYrr;
+ else if (X86::VR128XRegClass.contains(DestReg, SrcReg)) {
+ if (HasVLX)
+ Opc = X86::VMOVAPSZ128rr;
+ else if (X86::VR128RegClass.contains(DestReg, SrcReg))
+ Opc = HasAVX ? X86::VMOVAPSrr : X86::MOVAPSrr;
+ else {
+      // If this is an extended register and we don't have VLX, we need to
+      // use a 512-bit move.
+ Opc = X86::VMOVAPSZrr;
+ const TargetRegisterInfo *TRI = &getRegisterInfo();
+ DestReg = TRI->getMatchingSuperReg(DestReg, X86::sub_xmm,
+ &X86::VR512RegClass);
+ SrcReg = TRI->getMatchingSuperReg(SrcReg, X86::sub_xmm,
+ &X86::VR512RegClass);
+ }
+ } else if (X86::VR256XRegClass.contains(DestReg, SrcReg)) {
+ if (HasVLX)
+ Opc = X86::VMOVAPSZ256rr;
+ else if (X86::VR256RegClass.contains(DestReg, SrcReg))
+ Opc = X86::VMOVAPSYrr;
+ else {
+      // If this is an extended register and we don't have VLX, we need to
+      // use a 512-bit move.
+ Opc = X86::VMOVAPSZrr;
+ const TargetRegisterInfo *TRI = &getRegisterInfo();
+ DestReg = TRI->getMatchingSuperReg(DestReg, X86::sub_ymm,
+ &X86::VR512RegClass);
+ SrcReg = TRI->getMatchingSuperReg(SrcReg, X86::sub_ymm,
+ &X86::VR512RegClass);
+ }
+ } else if (X86::VR512RegClass.contains(DestReg, SrcReg))
+ Opc = X86::VMOVAPSZrr;
+  // All KMASK RegClasses hold the same k registers, so testing against any
+  // one of them is sufficient.
+ else if (X86::VK16RegClass.contains(DestReg, SrcReg))
+ Opc = Subtarget.hasBWI() ? X86::KMOVQkk : X86::KMOVWkk;
if (!Opc)
Opc = CopyToFromAsymmetricReg(DestReg, SrcReg, Subtarget);
@@ -4708,37 +5778,15 @@ void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
llvm_unreachable("Cannot emit physreg copy instruction");
}
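// Illustrative summary of the VR128X handling in copyPhysReg above (an
// annotation, not taken from the sources): with VLX, XMM copies use
// VMOVAPSZ128rr directly; if both registers are XMM0-15, a plain (V)MOVAPSrr
// suffices; otherwise an XMM16-31 register is involved, which VEX cannot
// encode, so the copy is widened to the containing ZMM registers, e.g.
//   %xmm17 = COPY %xmm16   -->   vmovaps %zmm16, %zmm17   (VMOVAPSZrr)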
-static unsigned getLoadStoreMaskRegOpcode(const TargetRegisterClass *RC,
- bool load) {
- switch (RC->getSize()) {
- default:
- llvm_unreachable("Unknown spill size");
- case 2:
- return load ? X86::KMOVWkm : X86::KMOVWmk;
- case 4:
- return load ? X86::KMOVDkm : X86::KMOVDmk;
- case 8:
- return load ? X86::KMOVQkm : X86::KMOVQmk;
- }
-}
-
static unsigned getLoadStoreRegOpcode(unsigned Reg,
const TargetRegisterClass *RC,
bool isStackAligned,
const X86Subtarget &STI,
bool load) {
- if (STI.hasAVX512()) {
- if (isMaskRegClass(RC))
- return getLoadStoreMaskRegOpcode(RC, load);
- if (RC->getSize() == 4 && X86::FR32XRegClass.hasSubClassEq(RC))
- return load ? X86::VMOVSSZrm : X86::VMOVSSZmr;
- if (RC->getSize() == 8 && X86::FR64XRegClass.hasSubClassEq(RC))
- return load ? X86::VMOVSDZrm : X86::VMOVSDZmr;
- if (X86::VR512RegClass.hasSubClassEq(RC))
- return load ? X86::VMOVUPSZrm : X86::VMOVUPSZmr;
- }
-
bool HasAVX = STI.hasAVX();
+ bool HasAVX512 = STI.hasAVX512();
+ bool HasVLX = STI.hasVLX();
+
switch (RC->getSize()) {
default:
llvm_unreachable("Unknown spill size");
@@ -4751,69 +5799,85 @@ static unsigned getLoadStoreRegOpcode(unsigned Reg,
return load ? X86::MOV8rm_NOREX : X86::MOV8mr_NOREX;
return load ? X86::MOV8rm : X86::MOV8mr;
case 2:
+ if (X86::VK16RegClass.hasSubClassEq(RC))
+ return load ? X86::KMOVWkm : X86::KMOVWmk;
assert(X86::GR16RegClass.hasSubClassEq(RC) && "Unknown 2-byte regclass");
return load ? X86::MOV16rm : X86::MOV16mr;
case 4:
if (X86::GR32RegClass.hasSubClassEq(RC))
return load ? X86::MOV32rm : X86::MOV32mr;
- if (X86::FR32RegClass.hasSubClassEq(RC))
+ if (X86::FR32XRegClass.hasSubClassEq(RC))
return load ?
- (HasAVX ? X86::VMOVSSrm : X86::MOVSSrm) :
- (HasAVX ? X86::VMOVSSmr : X86::MOVSSmr);
+ (HasAVX512 ? X86::VMOVSSZrm : HasAVX ? X86::VMOVSSrm : X86::MOVSSrm) :
+ (HasAVX512 ? X86::VMOVSSZmr : HasAVX ? X86::VMOVSSmr : X86::MOVSSmr);
if (X86::RFP32RegClass.hasSubClassEq(RC))
return load ? X86::LD_Fp32m : X86::ST_Fp32m;
+ if (X86::VK32RegClass.hasSubClassEq(RC))
+ return load ? X86::KMOVDkm : X86::KMOVDmk;
llvm_unreachable("Unknown 4-byte regclass");
case 8:
if (X86::GR64RegClass.hasSubClassEq(RC))
return load ? X86::MOV64rm : X86::MOV64mr;
- if (X86::FR64RegClass.hasSubClassEq(RC))
+ if (X86::FR64XRegClass.hasSubClassEq(RC))
return load ?
- (HasAVX ? X86::VMOVSDrm : X86::MOVSDrm) :
- (HasAVX ? X86::VMOVSDmr : X86::MOVSDmr);
+ (HasAVX512 ? X86::VMOVSDZrm : HasAVX ? X86::VMOVSDrm : X86::MOVSDrm) :
+ (HasAVX512 ? X86::VMOVSDZmr : HasAVX ? X86::VMOVSDmr : X86::MOVSDmr);
if (X86::VR64RegClass.hasSubClassEq(RC))
return load ? X86::MMX_MOVQ64rm : X86::MMX_MOVQ64mr;
if (X86::RFP64RegClass.hasSubClassEq(RC))
return load ? X86::LD_Fp64m : X86::ST_Fp64m;
+ if (X86::VK64RegClass.hasSubClassEq(RC))
+ return load ? X86::KMOVQkm : X86::KMOVQmk;
llvm_unreachable("Unknown 8-byte regclass");
case 10:
assert(X86::RFP80RegClass.hasSubClassEq(RC) && "Unknown 10-byte regclass");
return load ? X86::LD_Fp80m : X86::ST_FpP80m;
case 16: {
- assert((X86::VR128RegClass.hasSubClassEq(RC) ||
- X86::VR128XRegClass.hasSubClassEq(RC))&& "Unknown 16-byte regclass");
+ assert(X86::VR128XRegClass.hasSubClassEq(RC) && "Unknown 16-byte regclass");
// If stack is realigned we can use aligned stores.
- if (X86::VR128RegClass.hasSubClassEq(RC)) {
- if (isStackAligned)
- return load ? (HasAVX ? X86::VMOVAPSrm : X86::MOVAPSrm)
- : (HasAVX ? X86::VMOVAPSmr : X86::MOVAPSmr);
- else
- return load ? (HasAVX ? X86::VMOVUPSrm : X86::MOVUPSrm)
- : (HasAVX ? X86::VMOVUPSmr : X86::MOVUPSmr);
- }
- assert(STI.hasVLX() && "Using extended register requires VLX");
if (isStackAligned)
- return load ? X86::VMOVAPSZ128rm : X86::VMOVAPSZ128mr;
+ return load ?
+ (HasVLX ? X86::VMOVAPSZ128rm :
+ HasAVX512 ? X86::VMOVAPSZ128rm_NOVLX :
+ HasAVX ? X86::VMOVAPSrm :
+ X86::MOVAPSrm):
+ (HasVLX ? X86::VMOVAPSZ128mr :
+ HasAVX512 ? X86::VMOVAPSZ128mr_NOVLX :
+ HasAVX ? X86::VMOVAPSmr :
+ X86::MOVAPSmr);
else
- return load ? X86::VMOVUPSZ128rm : X86::VMOVUPSZ128mr;
+ return load ?
+ (HasVLX ? X86::VMOVUPSZ128rm :
+ HasAVX512 ? X86::VMOVUPSZ128rm_NOVLX :
+ HasAVX ? X86::VMOVUPSrm :
+ X86::MOVUPSrm):
+ (HasVLX ? X86::VMOVUPSZ128mr :
+ HasAVX512 ? X86::VMOVUPSZ128mr_NOVLX :
+ HasAVX ? X86::VMOVUPSmr :
+ X86::MOVUPSmr);
}
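// A compact reading of the 16-byte opcode selection above (annotation only):
//   aligned reload:   VLX -> VMOVAPSZ128rm, AVX512F -> VMOVAPSZ128rm_NOVLX,
//                     AVX -> VMOVAPSrm, otherwise MOVAPSrm
//   unaligned reload: the same pattern with the UPS forms
// The _NOVLX opcodes are pseudos that expandPostRAPseudo rewrites once the
// concrete register is known (see expandNOVLXLoad/expandNOVLXStore below).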
case 32:
- assert((X86::VR256RegClass.hasSubClassEq(RC) ||
- X86::VR256XRegClass.hasSubClassEq(RC)) && "Unknown 32-byte regclass");
+ assert(X86::VR256XRegClass.hasSubClassEq(RC) && "Unknown 32-byte regclass");
// If stack is realigned we can use aligned stores.
- if (X86::VR256RegClass.hasSubClassEq(RC)) {
- if (isStackAligned)
- return load ? X86::VMOVAPSYrm : X86::VMOVAPSYmr;
- else
- return load ? X86::VMOVUPSYrm : X86::VMOVUPSYmr;
- }
- assert(STI.hasVLX() && "Using extended register requires VLX");
if (isStackAligned)
- return load ? X86::VMOVAPSZ256rm : X86::VMOVAPSZ256mr;
+ return load ?
+ (HasVLX ? X86::VMOVAPSZ256rm :
+ HasAVX512 ? X86::VMOVAPSZ256rm_NOVLX :
+ X86::VMOVAPSYrm) :
+ (HasVLX ? X86::VMOVAPSZ256mr :
+ HasAVX512 ? X86::VMOVAPSZ256mr_NOVLX :
+ X86::VMOVAPSYmr);
else
- return load ? X86::VMOVUPSZ256rm : X86::VMOVUPSZ256mr;
+ return load ?
+ (HasVLX ? X86::VMOVUPSZ256rm :
+ HasAVX512 ? X86::VMOVUPSZ256rm_NOVLX :
+ X86::VMOVUPSYrm) :
+ (HasVLX ? X86::VMOVUPSZ256mr :
+ HasAVX512 ? X86::VMOVUPSZ256mr_NOVLX :
+ X86::VMOVUPSYmr);
case 64:
assert(X86::VR512RegClass.hasSubClassEq(RC) && "Unknown 64-byte regclass");
- assert(STI.hasVLX() && "Using 512-bit register requires AVX512");
+ assert(STI.hasAVX512() && "Using 512-bit register requires AVX512");
if (isStackAligned)
return load ? X86::VMOVAPSZrm : X86::VMOVAPSZmr;
else
@@ -4851,8 +5915,7 @@ bool X86InstrInfo::getMemOpBaseRegImmOfs(MachineInstr &MemOp, unsigned &BaseReg,
Offset = DispMO.getImm();
- return MemOp.getOperand(MemRefBegin + X86::AddrIndexReg).getReg() ==
- X86::NoRegister;
+ return true;
}
static unsigned getStoreRegOpcode(unsigned SrcReg,
@@ -4876,7 +5939,7 @@ void X86InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const {
const MachineFunction &MF = *MBB.getParent();
- assert(MF.getFrameInfo()->getObjectSize(FrameIdx) >= RC->getSize() &&
+ assert(MF.getFrameInfo().getObjectSize(FrameIdx) >= RC->getSize() &&
"Stack slot too small for store");
unsigned Alignment = std::max<uint32_t>(RC->getSize(), 16);
bool isAligned =
@@ -4954,6 +6017,8 @@ bool X86InstrInfo::analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
case X86::CMP16ri:
case X86::CMP16ri8:
case X86::CMP8ri:
+ if (!MI.getOperand(1).isImm())
+ return false;
SrcReg = MI.getOperand(0).getReg();
SrcReg2 = 0;
CmpMask = ~0;
@@ -4985,6 +6050,8 @@ bool X86InstrInfo::analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
case X86::SUB16ri:
case X86::SUB16ri8:
case X86::SUB8ri:
+ if (!MI.getOperand(2).isImm())
+ return false;
SrcReg = MI.getOperand(1).getReg();
SrcReg2 = 0;
CmpMask = ~0;
@@ -5263,9 +6330,9 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg,
// If the definition is in this basic block, RE points to the definition;
// otherwise, RE is the rend of the basic block.
MachineBasicBlock::reverse_iterator
- RI = MachineBasicBlock::reverse_iterator(I),
+ RI = ++I.getReverse(),
RE = CmpInstr.getParent() == MI->getParent()
- ? MachineBasicBlock::reverse_iterator(++Def) /* points to MI */
+ ? Def.getReverse() /* points to MI */
: CmpInstr.getParent()->rend();
MachineInstr *Movr0Inst = nullptr;
for (; RI != RE; ++RI) {
@@ -5411,9 +6478,8 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg,
if (Movr0Inst) {
// Look backwards until we find a def that doesn't use the current EFLAGS.
Def = Sub;
- MachineBasicBlock::reverse_iterator
- InsertI = MachineBasicBlock::reverse_iterator(++Def),
- InsertE = Sub->getParent()->rend();
+ MachineBasicBlock::reverse_iterator InsertI = Def.getReverse(),
+ InsertE = Sub->getParent()->rend();
for (; InsertI != InsertE; ++InsertI) {
MachineInstr *Instr = &*InsertI;
if (!Instr->readsRegister(X86::EFLAGS, TRI) &&
@@ -5455,14 +6521,6 @@ MachineInstr *X86InstrInfo::optimizeLoadInstr(MachineInstr &MI,
const MachineRegisterInfo *MRI,
unsigned &FoldAsLoadDefReg,
MachineInstr *&DefMI) const {
- if (FoldAsLoadDefReg == 0)
- return nullptr;
- // To be conservative, if there exists another load, clear the load candidate.
- if (MI.mayLoad()) {
- FoldAsLoadDefReg = 0;
- return nullptr;
- }
-
// Check whether we can move DefMI here.
DefMI = MRI->getVRegDef(FoldAsLoadDefReg);
assert(DefMI);
@@ -5471,27 +6529,24 @@ MachineInstr *X86InstrInfo::optimizeLoadInstr(MachineInstr &MI,
return nullptr;
// Collect information about virtual register operands of MI.
- unsigned SrcOperandId = 0;
- bool FoundSrcOperand = false;
- for (unsigned i = 0, e = MI.getDesc().getNumOperands(); i != e; ++i) {
+ SmallVector<unsigned, 1> SrcOperandIds;
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
MachineOperand &MO = MI.getOperand(i);
if (!MO.isReg())
continue;
unsigned Reg = MO.getReg();
if (Reg != FoldAsLoadDefReg)
continue;
- // Do not fold if we have a subreg use or a def or multiple uses.
- if (MO.getSubReg() || MO.isDef() || FoundSrcOperand)
+ // Do not fold if we have a subreg use or a def.
+ if (MO.getSubReg() || MO.isDef())
return nullptr;
-
- SrcOperandId = i;
- FoundSrcOperand = true;
+ SrcOperandIds.push_back(i);
}
- if (!FoundSrcOperand)
+ if (SrcOperandIds.empty())
return nullptr;
// Check whether we can fold the def into SrcOperandId.
- if (MachineInstr *FoldMI = foldMemoryOperand(MI, SrcOperandId, *DefMI)) {
+ if (MachineInstr *FoldMI = foldMemoryOperand(MI, SrcOperandIds, *DefMI)) {
FoldAsLoadDefReg = 0;
return FoldMI;
}
@@ -5553,7 +6608,9 @@ static bool expandMOV32r1(MachineInstrBuilder &MIB, const TargetInstrInfo &TII,
return true;
}
-bool X86InstrInfo::ExpandMOVImmSExti8(MachineInstrBuilder &MIB) const {
+static bool ExpandMOVImmSExti8(MachineInstrBuilder &MIB,
+ const TargetInstrInfo &TII,
+ const X86Subtarget &Subtarget) {
MachineBasicBlock &MBB = *MIB->getParent();
DebugLoc DL = MIB->getDebugLoc();
int64_t Imm = MIB->getOperand(1).getImm();
@@ -5570,23 +6627,23 @@ bool X86InstrInfo::ExpandMOVImmSExti8(MachineInstrBuilder &MIB) const {
X86MachineFunctionInfo *X86FI =
MBB.getParent()->getInfo<X86MachineFunctionInfo>();
if (X86FI->getUsesRedZone()) {
- MIB->setDesc(get(MIB->getOpcode() == X86::MOV32ImmSExti8 ? X86::MOV32ri
- : X86::MOV64ri));
+ MIB->setDesc(TII.get(MIB->getOpcode() ==
+ X86::MOV32ImmSExti8 ? X86::MOV32ri : X86::MOV64ri));
return true;
}
// 64-bit mode doesn't have 32-bit push/pop, so use 64-bit operations and
// widen the register if necessary.
StackAdjustment = 8;
- BuildMI(MBB, I, DL, get(X86::PUSH64i8)).addImm(Imm);
- MIB->setDesc(get(X86::POP64r));
+ BuildMI(MBB, I, DL, TII.get(X86::PUSH64i8)).addImm(Imm);
+ MIB->setDesc(TII.get(X86::POP64r));
MIB->getOperand(0)
.setReg(getX86SubSuperRegister(MIB->getOperand(0).getReg(), 64));
} else {
assert(MIB->getOpcode() == X86::MOV32ImmSExti8);
StackAdjustment = 4;
- BuildMI(MBB, I, DL, get(X86::PUSH32i8)).addImm(Imm);
- MIB->setDesc(get(X86::POP32r));
+ BuildMI(MBB, I, DL, TII.get(X86::PUSH32i8)).addImm(Imm);
+ MIB->setDesc(TII.get(X86::POP32r));
}
// Build CFI if necessary.
@@ -5616,7 +6673,9 @@ static void expandLoadStackGuard(MachineInstrBuilder &MIB,
unsigned Reg = MIB->getOperand(0).getReg();
const GlobalValue *GV =
cast<GlobalValue>((*MIB->memoperands_begin())->getValue());
- auto Flags = MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant;
+ auto Flags = MachineMemOperand::MOLoad |
+ MachineMemOperand::MODereferenceable |
+ MachineMemOperand::MOInvariant;
MachineMemOperand *MMO = MBB.getParent()->getMachineMemOperand(
MachinePointerInfo::getGOT(*MBB.getParent()), Flags, 8, 8);
MachineBasicBlock::iterator I = MIB.getInstr();
@@ -5629,6 +6688,53 @@ static void expandLoadStackGuard(MachineInstrBuilder &MIB,
MIB.addReg(Reg, RegState::Kill).addImm(1).addReg(0).addImm(0).addReg(0);
}
+// This is used to handle spills for 128/256-bit registers when we have AVX512,
+// but not VLX. If the destination is an extended register, we need an
+// instruction that loads the lower 128/256 bits but is available with only
+// AVX512F.
+static bool expandNOVLXLoad(MachineInstrBuilder &MIB,
+ const TargetRegisterInfo *TRI,
+ const MCInstrDesc &LoadDesc,
+ const MCInstrDesc &BroadcastDesc,
+ unsigned SubIdx) {
+ unsigned DestReg = MIB->getOperand(0).getReg();
+ // Check if DestReg is XMM16-31 or YMM16-31.
+ if (TRI->getEncodingValue(DestReg) < 16) {
+ // We can use a normal VEX encoded load.
+ MIB->setDesc(LoadDesc);
+ } else {
+ // Use a 128/256-bit VBROADCAST instruction.
+ MIB->setDesc(BroadcastDesc);
+ // Change the destination to a 512-bit register.
+ DestReg = TRI->getMatchingSuperReg(DestReg, SubIdx, &X86::VR512RegClass);
+ MIB->getOperand(0).setReg(DestReg);
+ }
+ return true;
+}
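// Illustrative expansions of the load case (an annotation, assuming a stack
// reload addressed through %rsp; not taken from the sources):
//   %xmm3  = VMOVAPSZ128rm_NOVLX  -->  vmovaps (%rsp), %xmm3
//   %xmm16 = VMOVAPSZ128rm_NOVLX  -->  vbroadcastf32x4 (%rsp), %zmm16
// The broadcast duplicates the 128-bit value into every lane of %zmm16; only
// the low lane (%xmm16) is the reloaded value that later code reads.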
+
+// This is used to handle spills for 128/256-bit registers when we have AVX512,
+// but not VLX. If the source is an extended register, we need an instruction
+// that stores the lower 128/256 bits but is available with only AVX512F.
+static bool expandNOVLXStore(MachineInstrBuilder &MIB,
+ const TargetRegisterInfo *TRI,
+ const MCInstrDesc &StoreDesc,
+ const MCInstrDesc &ExtractDesc,
+ unsigned SubIdx) {
+ unsigned SrcReg = MIB->getOperand(X86::AddrNumOperands).getReg();
+  // Check if SrcReg is XMM16-31 or YMM16-31.
+ if (TRI->getEncodingValue(SrcReg) < 16) {
+ // We can use a normal VEX encoded store.
+ MIB->setDesc(StoreDesc);
+ } else {
+ // Use a VEXTRACTF instruction.
+ MIB->setDesc(ExtractDesc);
+    // Change the source to a 512-bit register.
+ SrcReg = TRI->getMatchingSuperReg(SrcReg, SubIdx, &X86::VR512RegClass);
+ MIB->getOperand(X86::AddrNumOperands).setReg(SrcReg);
+ MIB.addImm(0x0); // Append immediate to extract from the lower bits.
+ }
+
+ return true;
+}
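// Illustrative expansions of the store case (an annotation, assuming a stack
// spill addressed through %rsp; not taken from the sources):
//   VMOVAPSZ128mr_NOVLX %xmm3   -->  vmovaps %xmm3, (%rsp)
//   VMOVAPSZ128mr_NOVLX %xmm16  -->  vextractf32x4 $0, %zmm16, (%rsp)
// The appended immediate 0 selects the low 128-bit lane of the 512-bit
// source, which is exactly the spilled XMM value.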
bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
bool HasAVX = Subtarget.hasAVX();
MachineInstrBuilder MIB(*MI.getParent()->getParent(), MI);
@@ -5641,7 +6747,7 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
return expandMOV32r1(MIB, *this, /*MinusOne=*/ true);
case X86::MOV32ImmSExti8:
case X86::MOV64ImmSExti8:
- return ExpandMOVImmSExti8(MIB);
+ return ExpandMOVImmSExti8(MIB, *this, Subtarget);
case X86::SETB_C8r:
return Expand2AddrUndef(MIB, get(X86::SBB8rr));
case X86::SETB_C16r:
@@ -5663,6 +6769,9 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
return Expand2AddrUndef(MIB, get(X86::VPXORDZ256rr));
case X86::AVX512_512_SET0:
return Expand2AddrUndef(MIB, get(X86::VPXORDZrr));
+ case X86::AVX512_FsFLD0SS:
+ case X86::AVX512_FsFLD0SD:
+ return Expand2AddrUndef(MIB, get(X86::VXORPSZ128rr));
case X86::V_SETALLONES:
return Expand2AddrUndef(MIB, get(HasAVX ? X86::VPCMPEQDrr : X86::PCMPEQDrr));
case X86::AVX2_SETALLONES:
@@ -5676,6 +6785,45 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
.addReg(Reg, RegState::Undef).addImm(0xff);
return true;
}
+ case X86::AVX512_512_SEXT_MASK_32:
+ case X86::AVX512_512_SEXT_MASK_64: {
+ unsigned Reg = MIB->getOperand(0).getReg();
+ unsigned MaskReg = MIB->getOperand(1).getReg();
+ unsigned MaskState = getRegState(MIB->getOperand(1));
+ unsigned Opc = (MI.getOpcode() == X86::AVX512_512_SEXT_MASK_64) ?
+ X86::VPTERNLOGQZrrikz : X86::VPTERNLOGDZrrikz;
+ MI.RemoveOperand(1);
+ MIB->setDesc(get(Opc));
+ // VPTERNLOG needs 3 register inputs and an immediate.
+ // 0xff will return 1s for any input.
+ MIB.addReg(Reg, RegState::Undef).addReg(MaskReg, MaskState)
+ .addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef).addImm(0xff);
+ return true;
+ }
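// Per-element effect of the expansion above (annotation only): with
// truth-table immediate 0xff VPTERNLOG produces all-ones for every active
// element, and the zeroing mask {z} clears the inactive ones, so
//   element[i] = mask[i] ? ~0 : 0
// which is the intended sign-extension of each mask bit into a vector element.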
+ case X86::VMOVAPSZ128rm_NOVLX:
+ return expandNOVLXLoad(MIB, &getRegisterInfo(), get(X86::VMOVAPSrm),
+ get(X86::VBROADCASTF32X4rm), X86::sub_xmm);
+ case X86::VMOVUPSZ128rm_NOVLX:
+ return expandNOVLXLoad(MIB, &getRegisterInfo(), get(X86::VMOVUPSrm),
+ get(X86::VBROADCASTF32X4rm), X86::sub_xmm);
+ case X86::VMOVAPSZ256rm_NOVLX:
+ return expandNOVLXLoad(MIB, &getRegisterInfo(), get(X86::VMOVAPSYrm),
+ get(X86::VBROADCASTF64X4rm), X86::sub_ymm);
+ case X86::VMOVUPSZ256rm_NOVLX:
+ return expandNOVLXLoad(MIB, &getRegisterInfo(), get(X86::VMOVUPSYrm),
+ get(X86::VBROADCASTF64X4rm), X86::sub_ymm);
+ case X86::VMOVAPSZ128mr_NOVLX:
+ return expandNOVLXStore(MIB, &getRegisterInfo(), get(X86::VMOVAPSmr),
+ get(X86::VEXTRACTF32x4Zmr), X86::sub_xmm);
+ case X86::VMOVUPSZ128mr_NOVLX:
+ return expandNOVLXStore(MIB, &getRegisterInfo(), get(X86::VMOVUPSmr),
+ get(X86::VEXTRACTF32x4Zmr), X86::sub_xmm);
+ case X86::VMOVAPSZ256mr_NOVLX:
+ return expandNOVLXStore(MIB, &getRegisterInfo(), get(X86::VMOVAPSYmr),
+ get(X86::VEXTRACTF64x4Zmr), X86::sub_ymm);
+ case X86::VMOVUPSZ256mr_NOVLX:
+ return expandNOVLXStore(MIB, &getRegisterInfo(), get(X86::VMOVUPSYmr),
+ get(X86::VEXTRACTF64x4Zmr), X86::sub_ymm);
case X86::TEST8ri_NOREX:
MI.setDesc(get(X86::TEST8ri));
return true;
@@ -5801,6 +6949,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandCustom(
switch (MI.getOpcode()) {
case X86::INSERTPSrr:
case X86::VINSERTPSrr:
+ case X86::VINSERTPSZrr:
// Attempt to convert the load of inserted vector into a fold load
// of a single float.
if (OpNum == 2) {
@@ -5814,8 +6963,9 @@ MachineInstr *X86InstrInfo::foldMemoryOperandCustom(
int PtrOffset = SrcIdx * 4;
unsigned NewImm = (DstIdx << 4) | ZMask;
unsigned NewOpCode =
- (MI.getOpcode() == X86::VINSERTPSrr ? X86::VINSERTPSrm
- : X86::INSERTPSrm);
+ (MI.getOpcode() == X86::VINSERTPSZrr) ? X86::VINSERTPSZrm :
+ (MI.getOpcode() == X86::VINSERTPSrr) ? X86::VINSERTPSrm :
+ X86::INSERTPSrm;
MachineInstr *NewMI =
FuseInst(MF, NewOpCode, OpNum, MOs, InsertPt, MI, *this, PtrOffset);
NewMI->getOperand(NewMI->getNumOperands() - 1).setImm(NewImm);
@@ -5825,6 +6975,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandCustom(
break;
case X86::MOVHLPSrr:
case X86::VMOVHLPSrr:
+ case X86::VMOVHLPSZrr:
// Move the upper 64-bits of the second operand to the lower 64-bits.
// To fold the load, adjust the pointer to the upper and use (V)MOVLPS.
// TODO: In most cases AVX doesn't have a 8-byte alignment requirement.
@@ -5832,8 +6983,9 @@ MachineInstr *X86InstrInfo::foldMemoryOperandCustom(
unsigned RCSize = getRegClass(MI.getDesc(), OpNum, &RI, MF)->getSize();
if (Size <= RCSize && 8 <= Align) {
unsigned NewOpCode =
- (MI.getOpcode() == X86::VMOVHLPSrr ? X86::VMOVLPSrm
- : X86::MOVLPSrm);
+ (MI.getOpcode() == X86::VMOVHLPSZrr) ? X86::VMOVLPSZ128rm :
+ (MI.getOpcode() == X86::VMOVHLPSrr) ? X86::VMOVLPSrm :
+ X86::MOVLPSrm;
MachineInstr *NewMI =
FuseInst(MF, NewOpCode, OpNum, MOs, InsertPt, MI, *this, 8);
return NewMI;
@@ -6042,12 +7194,8 @@ static bool hasPartialRegUpdate(unsigned Opcode) {
case X86::CVTSI2SD64rm:
case X86::CVTSD2SSrr:
case X86::CVTSD2SSrm:
- case X86::Int_CVTSD2SSrr:
- case X86::Int_CVTSD2SSrm:
case X86::CVTSS2SDrr:
case X86::CVTSS2SDrm:
- case X86::Int_CVTSS2SDrr:
- case X86::Int_CVTSS2SDrm:
case X86::MOVHPDrm:
case X86::MOVHPSrm:
case X86::MOVLPDrm:
@@ -6058,10 +7206,8 @@ static bool hasPartialRegUpdate(unsigned Opcode) {
case X86::RCPSSm_Int:
case X86::ROUNDSDr:
case X86::ROUNDSDm:
- case X86::ROUNDSDr_Int:
case X86::ROUNDSSr:
case X86::ROUNDSSm:
- case X86::ROUNDSSr_Int:
case X86::RSQRTSSr:
case X86::RSQRTSSm:
case X86::RSQRTSSr_Int:
@@ -6134,28 +7280,95 @@ static bool hasUndefRegUpdate(unsigned Opcode) {
case X86::Int_VCVTSS2SDrr:
case X86::Int_VCVTSS2SDrm:
case X86::VRCPSSr:
+ case X86::VRCPSSr_Int:
case X86::VRCPSSm:
case X86::VRCPSSm_Int:
case X86::VROUNDSDr:
case X86::VROUNDSDm:
case X86::VROUNDSDr_Int:
+ case X86::VROUNDSDm_Int:
case X86::VROUNDSSr:
case X86::VROUNDSSm:
case X86::VROUNDSSr_Int:
+ case X86::VROUNDSSm_Int:
case X86::VRSQRTSSr:
+ case X86::VRSQRTSSr_Int:
case X86::VRSQRTSSm:
case X86::VRSQRTSSm_Int:
case X86::VSQRTSSr:
+ case X86::VSQRTSSr_Int:
case X86::VSQRTSSm:
case X86::VSQRTSSm_Int:
case X86::VSQRTSDr:
+ case X86::VSQRTSDr_Int:
case X86::VSQRTSDm:
case X86::VSQRTSDm_Int:
- // AVX-512
+ // AVX-512
+ case X86::VCVTSI2SSZrr:
+ case X86::VCVTSI2SSZrm:
+ case X86::VCVTSI2SSZrr_Int:
+ case X86::VCVTSI2SSZrrb_Int:
+ case X86::VCVTSI2SSZrm_Int:
+ case X86::VCVTSI642SSZrr:
+ case X86::VCVTSI642SSZrm:
+ case X86::VCVTSI642SSZrr_Int:
+ case X86::VCVTSI642SSZrrb_Int:
+ case X86::VCVTSI642SSZrm_Int:
+ case X86::VCVTSI2SDZrr:
+ case X86::VCVTSI2SDZrm:
+ case X86::VCVTSI2SDZrr_Int:
+ case X86::VCVTSI2SDZrrb_Int:
+ case X86::VCVTSI2SDZrm_Int:
+ case X86::VCVTSI642SDZrr:
+ case X86::VCVTSI642SDZrm:
+ case X86::VCVTSI642SDZrr_Int:
+ case X86::VCVTSI642SDZrrb_Int:
+ case X86::VCVTSI642SDZrm_Int:
+ case X86::VCVTUSI2SSZrr:
+ case X86::VCVTUSI2SSZrm:
+ case X86::VCVTUSI2SSZrr_Int:
+ case X86::VCVTUSI2SSZrrb_Int:
+ case X86::VCVTUSI2SSZrm_Int:
+ case X86::VCVTUSI642SSZrr:
+ case X86::VCVTUSI642SSZrm:
+ case X86::VCVTUSI642SSZrr_Int:
+ case X86::VCVTUSI642SSZrrb_Int:
+ case X86::VCVTUSI642SSZrm_Int:
+ case X86::VCVTUSI2SDZrr:
+ case X86::VCVTUSI2SDZrm:
+ case X86::VCVTUSI2SDZrr_Int:
+ case X86::VCVTUSI2SDZrm_Int:
+ case X86::VCVTUSI642SDZrr:
+ case X86::VCVTUSI642SDZrm:
+ case X86::VCVTUSI642SDZrr_Int:
+ case X86::VCVTUSI642SDZrrb_Int:
+ case X86::VCVTUSI642SDZrm_Int:
case X86::VCVTSD2SSZrr:
+ case X86::VCVTSD2SSZrrb:
case X86::VCVTSD2SSZrm:
case X86::VCVTSS2SDZrr:
+ case X86::VCVTSS2SDZrrb:
case X86::VCVTSS2SDZrm:
+ case X86::VRNDSCALESDr:
+ case X86::VRNDSCALESDrb:
+ case X86::VRNDSCALESDm:
+ case X86::VRNDSCALESSr:
+ case X86::VRNDSCALESSrb:
+ case X86::VRNDSCALESSm:
+ case X86::VRCP14SSrr:
+ case X86::VRCP14SSrm:
+ case X86::VRSQRT14SSrr:
+ case X86::VRSQRT14SSrm:
+ case X86::VSQRTSSZr:
+ case X86::VSQRTSSZr_Int:
+ case X86::VSQRTSSZrb_Int:
+ case X86::VSQRTSSZm:
+ case X86::VSQRTSSZm_Int:
+ case X86::VSQRTSDZr:
+ case X86::VSQRTSDZr_Int:
+ case X86::VSQRTSDZrb_Int:
+ case X86::VSQRTSDZm:
+ case X86::VSQRTSDZm_Int:
return true;
}
@@ -6233,9 +7446,17 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI,
if (!MF.getFunction()->optForSize() && hasPartialRegUpdate(MI.getOpcode()))
return nullptr;
- const MachineFrameInfo *MFI = MF.getFrameInfo();
- unsigned Size = MFI->getObjectSize(FrameIndex);
- unsigned Alignment = MFI->getObjectAlignment(FrameIndex);
+ // Don't fold subreg spills, or reloads that use a high subreg.
+ for (auto Op : Ops) {
+ MachineOperand &MO = MI.getOperand(Op);
+ auto SubReg = MO.getSubReg();
+ if (SubReg && (MO.isDef() || SubReg == X86::sub_8bit_hi))
+ return nullptr;
+ }
+
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+ unsigned Size = MFI.getObjectSize(FrameIndex);
+ unsigned Alignment = MFI.getObjectAlignment(FrameIndex);
// If the function stack isn't realigned we don't want to fold instructions
// that need increased alignment.
if (!RI.needsStackRealignment(MF))
@@ -6295,15 +7516,26 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI,
// instruction isn't scalar (SS).
switch (UserOpc) {
case X86::ADDSSrr_Int: case X86::VADDSSrr_Int: case X86::VADDSSZrr_Int:
+ case X86::Int_CMPSSrr: case X86::Int_VCMPSSrr: case X86::VCMPSSZrr_Int:
case X86::DIVSSrr_Int: case X86::VDIVSSrr_Int: case X86::VDIVSSZrr_Int:
+ case X86::MAXSSrr_Int: case X86::VMAXSSrr_Int: case X86::VMAXSSZrr_Int:
+ case X86::MINSSrr_Int: case X86::VMINSSrr_Int: case X86::VMINSSZrr_Int:
case X86::MULSSrr_Int: case X86::VMULSSrr_Int: case X86::VMULSSZrr_Int:
case X86::SUBSSrr_Int: case X86::VSUBSSrr_Int: case X86::VSUBSSZrr_Int:
- case X86::VFMADDSSr132r_Int: case X86::VFNMADDSSr132r_Int:
- case X86::VFMADDSSr213r_Int: case X86::VFNMADDSSr213r_Int:
- case X86::VFMADDSSr231r_Int: case X86::VFNMADDSSr231r_Int:
- case X86::VFMSUBSSr132r_Int: case X86::VFNMSUBSSr132r_Int:
- case X86::VFMSUBSSr213r_Int: case X86::VFNMSUBSSr213r_Int:
- case X86::VFMSUBSSr231r_Int: case X86::VFNMSUBSSr231r_Int:
+ case X86::VFMADDSS4rr_Int: case X86::VFNMADDSS4rr_Int:
+ case X86::VFMSUBSS4rr_Int: case X86::VFNMSUBSS4rr_Int:
+ case X86::VFMADD132SSr_Int: case X86::VFNMADD132SSr_Int:
+ case X86::VFMADD213SSr_Int: case X86::VFNMADD213SSr_Int:
+ case X86::VFMADD231SSr_Int: case X86::VFNMADD231SSr_Int:
+ case X86::VFMSUB132SSr_Int: case X86::VFNMSUB132SSr_Int:
+ case X86::VFMSUB213SSr_Int: case X86::VFNMSUB213SSr_Int:
+ case X86::VFMSUB231SSr_Int: case X86::VFNMSUB231SSr_Int:
+ case X86::VFMADD132SSZr_Int: case X86::VFNMADD132SSZr_Int:
+ case X86::VFMADD213SSZr_Int: case X86::VFNMADD213SSZr_Int:
+ case X86::VFMADD231SSZr_Int: case X86::VFNMADD231SSZr_Int:
+ case X86::VFMSUB132SSZr_Int: case X86::VFNMSUB132SSZr_Int:
+ case X86::VFMSUB213SSZr_Int: case X86::VFNMSUB213SSZr_Int:
+ case X86::VFMSUB231SSZr_Int: case X86::VFNMSUB231SSZr_Int:
return false;
default:
return true;
@@ -6317,15 +7549,26 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI,
// instruction isn't scalar (SD).
switch (UserOpc) {
case X86::ADDSDrr_Int: case X86::VADDSDrr_Int: case X86::VADDSDZrr_Int:
+ case X86::Int_CMPSDrr: case X86::Int_VCMPSDrr: case X86::VCMPSDZrr_Int:
case X86::DIVSDrr_Int: case X86::VDIVSDrr_Int: case X86::VDIVSDZrr_Int:
+ case X86::MAXSDrr_Int: case X86::VMAXSDrr_Int: case X86::VMAXSDZrr_Int:
+ case X86::MINSDrr_Int: case X86::VMINSDrr_Int: case X86::VMINSDZrr_Int:
case X86::MULSDrr_Int: case X86::VMULSDrr_Int: case X86::VMULSDZrr_Int:
case X86::SUBSDrr_Int: case X86::VSUBSDrr_Int: case X86::VSUBSDZrr_Int:
- case X86::VFMADDSDr132r_Int: case X86::VFNMADDSDr132r_Int:
- case X86::VFMADDSDr213r_Int: case X86::VFNMADDSDr213r_Int:
- case X86::VFMADDSDr231r_Int: case X86::VFNMADDSDr231r_Int:
- case X86::VFMSUBSDr132r_Int: case X86::VFNMSUBSDr132r_Int:
- case X86::VFMSUBSDr213r_Int: case X86::VFNMSUBSDr213r_Int:
- case X86::VFMSUBSDr231r_Int: case X86::VFNMSUBSDr231r_Int:
+ case X86::VFMADDSD4rr_Int: case X86::VFNMADDSD4rr_Int:
+ case X86::VFMSUBSD4rr_Int: case X86::VFNMSUBSD4rr_Int:
+ case X86::VFMADD132SDr_Int: case X86::VFNMADD132SDr_Int:
+ case X86::VFMADD213SDr_Int: case X86::VFNMADD213SDr_Int:
+ case X86::VFMADD231SDr_Int: case X86::VFNMADD231SDr_Int:
+ case X86::VFMSUB132SDr_Int: case X86::VFNMSUB132SDr_Int:
+ case X86::VFMSUB213SDr_Int: case X86::VFNMSUB213SDr_Int:
+ case X86::VFMSUB231SDr_Int: case X86::VFNMSUB231SDr_Int:
+ case X86::VFMADD132SDZr_Int: case X86::VFNMADD132SDZr_Int:
+ case X86::VFMADD213SDZr_Int: case X86::VFNMADD213SDZr_Int:
+ case X86::VFMADD231SDZr_Int: case X86::VFNMADD231SDZr_Int:
+ case X86::VFMSUB132SDZr_Int: case X86::VFNMSUB132SDZr_Int:
+ case X86::VFMSUB213SDZr_Int: case X86::VFNMSUB213SDZr_Int:
+ case X86::VFMSUB231SDZr_Int: case X86::VFNMSUB231SDZr_Int:
return false;
default:
return true;
@@ -6339,6 +7582,14 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
MachineBasicBlock::iterator InsertPt, MachineInstr &LoadMI,
LiveIntervals *LIS) const {
+
+ // TODO: Support the case where LoadMI loads a wide register, but MI
+ // only uses a subreg.
+ for (auto Op : Ops) {
+ if (MI.getOperand(Op).getSubReg())
+ return nullptr;
+ }
+
// If loading from a FrameIndex, fold directly from the FrameIndex.
unsigned NumOps = LoadMI.getDesc().getNumOperands();
int FrameIndex;
@@ -6376,9 +7627,11 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
Alignment = 16;
break;
case X86::FsFLD0SD:
+ case X86::AVX512_FsFLD0SD:
Alignment = 8;
break;
case X86::FsFLD0SS:
+ case X86::AVX512_FsFLD0SS:
Alignment = 4;
break;
default:
@@ -6415,7 +7668,9 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
case X86::AVX512_512_SET0:
case X86::AVX512_512_SETALLONES:
case X86::FsFLD0SD:
- case X86::FsFLD0SS: {
+ case X86::AVX512_FsFLD0SD:
+ case X86::FsFLD0SS:
+ case X86::AVX512_FsFLD0SS: {
// Folding a V_SET0 or V_SETALLONES as a load, to ease register pressure.
// Create a constant-pool entry and operands to load from it.
@@ -6441,9 +7696,9 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
MachineConstantPool &MCP = *MF.getConstantPool();
Type *Ty;
unsigned Opc = LoadMI.getOpcode();
- if (Opc == X86::FsFLD0SS)
+ if (Opc == X86::FsFLD0SS || Opc == X86::AVX512_FsFLD0SS)
Ty = Type::getFloatTy(MF.getFunction()->getContext());
- else if (Opc == X86::FsFLD0SD)
+ else if (Opc == X86::FsFLD0SD || Opc == X86::AVX512_FsFLD0SD)
Ty = Type::getDoubleTy(MF.getFunction()->getContext());
else if (Opc == X86::AVX512_512_SET0 || Opc == X86::AVX512_512_SETALLONES)
Ty = VectorType::get(Type::getInt32Ty(MF.getFunction()->getContext()),16);
@@ -6649,7 +7904,7 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
return false;
// FIXME: If a VR128 can have size 32, we should be checking if a 32-byte
// memory access is slow above.
- unsigned Alignment = RC->getSize() == 32 ? 32 : 16;
+ unsigned Alignment = std::max<uint32_t>(RC->getSize(), 16);
bool isAligned = (*MMOs.first) &&
(*MMOs.first)->getAlignment() >= Alignment;
Load = DAG.getMachineNode(getLoadRegOpcode(0, RC, isAligned, Subtarget), dl,
@@ -6694,7 +7949,7 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
return false;
// FIXME: If a VR128 can have size 32, we should be checking if a 32-byte
// memory access is slow above.
- unsigned Alignment = RC->getSize() == 32 ? 32 : 16;
+ unsigned Alignment = std::max<uint32_t>(RC->getSize(), 16);
bool isAligned = (*MMOs.first) &&
(*MMOs.first)->getAlignment() >= Alignment;
SDNode *Store =
@@ -6746,8 +8001,6 @@ X86InstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2,
case X86::MOVSDrm:
case X86::MMX_MOVD64rm:
case X86::MMX_MOVQ64rm:
- case X86::FsMOVAPSrm:
- case X86::FsMOVAPDrm:
case X86::MOVAPSrm:
case X86::MOVUPSrm:
case X86::MOVAPDrm:
@@ -6757,8 +8010,6 @@ X86InstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2,
// AVX load instructions
case X86::VMOVSSrm:
case X86::VMOVSDrm:
- case X86::FsVMOVAPSrm:
- case X86::FsVMOVAPDrm:
case X86::VMOVAPSrm:
case X86::VMOVUPSrm:
case X86::VMOVAPDrm:
@@ -6776,6 +8027,8 @@ X86InstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2,
case X86::VMOVSDZrm:
case X86::VMOVAPSZ128rm:
case X86::VMOVUPSZ128rm:
+ case X86::VMOVAPSZ128rm_NOVLX:
+ case X86::VMOVUPSZ128rm_NOVLX:
case X86::VMOVAPDZ128rm:
case X86::VMOVUPDZ128rm:
case X86::VMOVDQU8Z128rm:
@@ -6786,6 +8039,8 @@ X86InstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2,
case X86::VMOVDQU64Z128rm:
case X86::VMOVAPSZ256rm:
case X86::VMOVUPSZ256rm:
+ case X86::VMOVAPSZ256rm_NOVLX:
+ case X86::VMOVUPSZ256rm_NOVLX:
case X86::VMOVAPDZ256rm:
case X86::VMOVUPDZ256rm:
case X86::VMOVDQU8Z256rm:
@@ -6823,8 +8078,6 @@ X86InstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2,
case X86::MOVSDrm:
case X86::MMX_MOVD64rm:
case X86::MMX_MOVQ64rm:
- case X86::FsMOVAPSrm:
- case X86::FsMOVAPDrm:
case X86::MOVAPSrm:
case X86::MOVUPSrm:
case X86::MOVAPDrm:
@@ -6834,8 +8087,6 @@ X86InstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2,
// AVX load instructions
case X86::VMOVSSrm:
case X86::VMOVSDrm:
- case X86::FsVMOVAPSrm:
- case X86::FsVMOVAPDrm:
case X86::VMOVAPSrm:
case X86::VMOVUPSrm:
case X86::VMOVAPDrm:
@@ -6853,6 +8104,8 @@ X86InstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2,
case X86::VMOVSDZrm:
case X86::VMOVAPSZ128rm:
case X86::VMOVUPSZ128rm:
+ case X86::VMOVAPSZ128rm_NOVLX:
+ case X86::VMOVUPSZ128rm_NOVLX:
case X86::VMOVAPDZ128rm:
case X86::VMOVUPDZ128rm:
case X86::VMOVDQU8Z128rm:
@@ -6863,6 +8116,8 @@ X86InstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2,
case X86::VMOVDQU64Z128rm:
case X86::VMOVAPSZ256rm:
case X86::VMOVUPSZ256rm:
+ case X86::VMOVAPSZ256rm_NOVLX:
+ case X86::VMOVUPSZ256rm_NOVLX:
case X86::VMOVAPDZ256rm:
case X86::VMOVUPDZ256rm:
case X86::VMOVDQU8Z256rm:
@@ -6960,8 +8215,8 @@ bool X86InstrInfo::shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2,
return true;
}
-bool X86InstrInfo::shouldScheduleAdjacent(MachineInstr &First,
- MachineInstr &Second) const {
+bool X86InstrInfo::shouldScheduleAdjacent(const MachineInstr &First,
+ const MachineInstr &Second) const {
// Check if this processor supports macro-fusion. Since this is a minor
// heuristic, we haven't specifically reserved a feature. hasAVX is a decent
// proxy for SandyBridge+.
@@ -7120,7 +8375,7 @@ bool X86InstrInfo::shouldScheduleAdjacent(MachineInstr &First,
}
bool X86InstrInfo::
-ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const {
+reverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const {
assert(Cond.size() == 1 && "Invalid X86 branch condition!");
X86::CondCode CC = static_cast<X86::CondCode>(Cond[0].getImm());
Cond[0].setImm(GetOppositeBranchCondition(CC));
@@ -7168,7 +8423,10 @@ static const uint16_t ReplaceableInstrs[][3] = {
{ X86::MOVAPSrr, X86::MOVAPDrr, X86::MOVDQArr },
{ X86::MOVUPSmr, X86::MOVUPDmr, X86::MOVDQUmr },
{ X86::MOVUPSrm, X86::MOVUPDrm, X86::MOVDQUrm },
- { X86::MOVLPSmr, X86::MOVLPDmr, X86::MOVPQI2QImr },
+ { X86::MOVLPSmr, X86::MOVLPDmr, X86::MOVPQI2QImr },
+ { X86::MOVSSmr, X86::MOVSSmr, X86::MOVPDI2DImr },
+ { X86::MOVSDrm, X86::MOVSDrm, X86::MOVQI2PQIrm },
+ { X86::MOVSSrm, X86::MOVSSrm, X86::MOVDI2PDIrm },
{ X86::MOVNTPSmr, X86::MOVNTPDmr, X86::MOVNTDQmr },
{ X86::ANDNPSrm, X86::ANDNPDrm, X86::PANDNrm },
{ X86::ANDNPSrr, X86::ANDNPDrr, X86::PANDNrr },
@@ -7184,7 +8442,10 @@ static const uint16_t ReplaceableInstrs[][3] = {
{ X86::VMOVAPSrr, X86::VMOVAPDrr, X86::VMOVDQArr },
{ X86::VMOVUPSmr, X86::VMOVUPDmr, X86::VMOVDQUmr },
{ X86::VMOVUPSrm, X86::VMOVUPDrm, X86::VMOVDQUrm },
- { X86::VMOVLPSmr, X86::VMOVLPDmr, X86::VMOVPQI2QImr },
+ { X86::VMOVLPSmr, X86::VMOVLPDmr, X86::VMOVPQI2QImr },
+ { X86::VMOVSSmr, X86::VMOVSSmr, X86::VMOVPDI2DImr },
+ { X86::VMOVSDrm, X86::VMOVSDrm, X86::VMOVQI2PQIrm },
+ { X86::VMOVSSrm, X86::VMOVSSrm, X86::VMOVDI2PDIrm },
{ X86::VMOVNTPSmr, X86::VMOVNTPDmr, X86::VMOVNTDQmr },
{ X86::VANDNPSrm, X86::VANDNPDrm, X86::VPANDNrm },
{ X86::VANDNPSrr, X86::VANDNPDrr, X86::VPANDNrr },
@@ -7200,7 +8461,26 @@ static const uint16_t ReplaceableInstrs[][3] = {
{ X86::VMOVAPSYrr, X86::VMOVAPDYrr, X86::VMOVDQAYrr },
{ X86::VMOVUPSYmr, X86::VMOVUPDYmr, X86::VMOVDQUYmr },
{ X86::VMOVUPSYrm, X86::VMOVUPDYrm, X86::VMOVDQUYrm },
- { X86::VMOVNTPSYmr, X86::VMOVNTPDYmr, X86::VMOVNTDQYmr }
+ { X86::VMOVNTPSYmr, X86::VMOVNTPDYmr, X86::VMOVNTDQYmr },
+ // AVX512 support
+ { X86::VMOVLPSZ128mr, X86::VMOVLPDZ128mr, X86::VMOVPQI2QIZmr },
+ { X86::VMOVNTPSZ128mr, X86::VMOVNTPDZ128mr, X86::VMOVNTDQZ128mr },
+  { X86::VMOVNTPSZ256mr, X86::VMOVNTPDZ256mr, X86::VMOVNTDQZ256mr },
+ { X86::VMOVNTPSZmr, X86::VMOVNTPDZmr, X86::VMOVNTDQZmr },
+ { X86::VMOVSDZmr, X86::VMOVSDZmr, X86::VMOVPQI2QIZmr },
+ { X86::VMOVSSZmr, X86::VMOVSSZmr, X86::VMOVPDI2DIZmr },
+ { X86::VMOVSDZrm, X86::VMOVSDZrm, X86::VMOVQI2PQIZrm },
+ { X86::VMOVSSZrm, X86::VMOVSSZrm, X86::VMOVDI2PDIZrm },
+ { X86::VBROADCASTSSZ128r, X86::VBROADCASTSSZ128r, X86::VPBROADCASTDZ128r },
+ { X86::VBROADCASTSSZ128m, X86::VBROADCASTSSZ128m, X86::VPBROADCASTDZ128m },
+ { X86::VBROADCASTSSZ256r, X86::VBROADCASTSSZ256r, X86::VPBROADCASTDZ256r },
+ { X86::VBROADCASTSSZ256m, X86::VBROADCASTSSZ256m, X86::VPBROADCASTDZ256m },
+ { X86::VBROADCASTSSZr, X86::VBROADCASTSSZr, X86::VPBROADCASTDZr },
+ { X86::VBROADCASTSSZm, X86::VBROADCASTSSZm, X86::VPBROADCASTDZm },
+ { X86::VBROADCASTSDZ256r, X86::VBROADCASTSDZ256r, X86::VPBROADCASTQZ256r },
+ { X86::VBROADCASTSDZ256m, X86::VBROADCASTSDZ256m, X86::VPBROADCASTQZ256m },
+ { X86::VBROADCASTSDZr, X86::VBROADCASTSDZr, X86::VPBROADCASTQZr },
+ { X86::VBROADCASTSDZm, X86::VBROADCASTSDZm, X86::VPBROADCASTQZm },
};
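// Annotation (not taken from the sources): each row of these tables lists the
// PackedSingle / PackedDouble / PackedInt spellings of the same operation.
// The execution-domain pass uses them to keep a chain of instructions in one
// domain and avoid bypass delays, for example rewriting
//   vxorps %xmm1, %xmm2, %xmm0   -->   vpxor %xmm1, %xmm2, %xmm0
// when the surrounding computation is integer.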
static const uint16_t ReplaceableInstrsAVX2[][3] = {
@@ -7224,22 +8504,257 @@ static const uint16_t ReplaceableInstrsAVX2[][3] = {
{ X86::VBROADCASTSSYrr, X86::VBROADCASTSSYrr, X86::VPBROADCASTDYrr},
{ X86::VBROADCASTSSYrm, X86::VBROADCASTSSYrm, X86::VPBROADCASTDYrm},
{ X86::VBROADCASTSDYrr, X86::VBROADCASTSDYrr, X86::VPBROADCASTQYrr},
- { X86::VBROADCASTSDYrm, X86::VBROADCASTSDYrm, X86::VPBROADCASTQYrm}
+ { X86::VBROADCASTSDYrm, X86::VBROADCASTSDYrm, X86::VPBROADCASTQYrm},
+ { X86::VBROADCASTF128, X86::VBROADCASTF128, X86::VBROADCASTI128 },
+};
+
+static const uint16_t ReplaceableInstrsAVX512[][4] = {
+ // Two integer columns for 64-bit and 32-bit elements.
+ //PackedSingle PackedDouble PackedInt PackedInt
+ { X86::VMOVAPSZ128mr, X86::VMOVAPDZ128mr, X86::VMOVDQA64Z128mr, X86::VMOVDQA32Z128mr },
+ { X86::VMOVAPSZ128rm, X86::VMOVAPDZ128rm, X86::VMOVDQA64Z128rm, X86::VMOVDQA32Z128rm },
+ { X86::VMOVAPSZ128rr, X86::VMOVAPDZ128rr, X86::VMOVDQA64Z128rr, X86::VMOVDQA32Z128rr },
+ { X86::VMOVUPSZ128mr, X86::VMOVUPDZ128mr, X86::VMOVDQU64Z128mr, X86::VMOVDQU32Z128mr },
+ { X86::VMOVUPSZ128rm, X86::VMOVUPDZ128rm, X86::VMOVDQU64Z128rm, X86::VMOVDQU32Z128rm },
+ { X86::VMOVAPSZ256mr, X86::VMOVAPDZ256mr, X86::VMOVDQA64Z256mr, X86::VMOVDQA32Z256mr },
+ { X86::VMOVAPSZ256rm, X86::VMOVAPDZ256rm, X86::VMOVDQA64Z256rm, X86::VMOVDQA32Z256rm },
+ { X86::VMOVAPSZ256rr, X86::VMOVAPDZ256rr, X86::VMOVDQA64Z256rr, X86::VMOVDQA32Z256rr },
+ { X86::VMOVUPSZ256mr, X86::VMOVUPDZ256mr, X86::VMOVDQU64Z256mr, X86::VMOVDQU32Z256mr },
+ { X86::VMOVUPSZ256rm, X86::VMOVUPDZ256rm, X86::VMOVDQU64Z256rm, X86::VMOVDQU32Z256rm },
+ { X86::VMOVAPSZmr, X86::VMOVAPDZmr, X86::VMOVDQA64Zmr, X86::VMOVDQA32Zmr },
+ { X86::VMOVAPSZrm, X86::VMOVAPDZrm, X86::VMOVDQA64Zrm, X86::VMOVDQA32Zrm },
+ { X86::VMOVAPSZrr, X86::VMOVAPDZrr, X86::VMOVDQA64Zrr, X86::VMOVDQA32Zrr },
+ { X86::VMOVUPSZmr, X86::VMOVUPDZmr, X86::VMOVDQU64Zmr, X86::VMOVDQU32Zmr },
+ { X86::VMOVUPSZrm, X86::VMOVUPDZrm, X86::VMOVDQU64Zrm, X86::VMOVDQU32Zrm },
+};
+
+static const uint16_t ReplaceableInstrsAVX512DQ[][4] = {
+ // Two integer columns for 64-bit and 32-bit elements.
+ //PackedSingle PackedDouble PackedInt PackedInt
+ { X86::VANDNPSZ128rm, X86::VANDNPDZ128rm, X86::VPANDNQZ128rm, X86::VPANDNDZ128rm },
+ { X86::VANDNPSZ128rr, X86::VANDNPDZ128rr, X86::VPANDNQZ128rr, X86::VPANDNDZ128rr },
+ { X86::VANDPSZ128rm, X86::VANDPDZ128rm, X86::VPANDQZ128rm, X86::VPANDDZ128rm },
+ { X86::VANDPSZ128rr, X86::VANDPDZ128rr, X86::VPANDQZ128rr, X86::VPANDDZ128rr },
+ { X86::VORPSZ128rm, X86::VORPDZ128rm, X86::VPORQZ128rm, X86::VPORDZ128rm },
+ { X86::VORPSZ128rr, X86::VORPDZ128rr, X86::VPORQZ128rr, X86::VPORDZ128rr },
+ { X86::VXORPSZ128rm, X86::VXORPDZ128rm, X86::VPXORQZ128rm, X86::VPXORDZ128rm },
+ { X86::VXORPSZ128rr, X86::VXORPDZ128rr, X86::VPXORQZ128rr, X86::VPXORDZ128rr },
+ { X86::VANDNPSZ256rm, X86::VANDNPDZ256rm, X86::VPANDNQZ256rm, X86::VPANDNDZ256rm },
+ { X86::VANDNPSZ256rr, X86::VANDNPDZ256rr, X86::VPANDNQZ256rr, X86::VPANDNDZ256rr },
+ { X86::VANDPSZ256rm, X86::VANDPDZ256rm, X86::VPANDQZ256rm, X86::VPANDDZ256rm },
+ { X86::VANDPSZ256rr, X86::VANDPDZ256rr, X86::VPANDQZ256rr, X86::VPANDDZ256rr },
+ { X86::VORPSZ256rm, X86::VORPDZ256rm, X86::VPORQZ256rm, X86::VPORDZ256rm },
+ { X86::VORPSZ256rr, X86::VORPDZ256rr, X86::VPORQZ256rr, X86::VPORDZ256rr },
+ { X86::VXORPSZ256rm, X86::VXORPDZ256rm, X86::VPXORQZ256rm, X86::VPXORDZ256rm },
+ { X86::VXORPSZ256rr, X86::VXORPDZ256rr, X86::VPXORQZ256rr, X86::VPXORDZ256rr },
+ { X86::VANDNPSZrm, X86::VANDNPDZrm, X86::VPANDNQZrm, X86::VPANDNDZrm },
+ { X86::VANDNPSZrr, X86::VANDNPDZrr, X86::VPANDNQZrr, X86::VPANDNDZrr },
+ { X86::VANDPSZrm, X86::VANDPDZrm, X86::VPANDQZrm, X86::VPANDDZrm },
+ { X86::VANDPSZrr, X86::VANDPDZrr, X86::VPANDQZrr, X86::VPANDDZrr },
+ { X86::VORPSZrm, X86::VORPDZrm, X86::VPORQZrm, X86::VPORDZrm },
+ { X86::VORPSZrr, X86::VORPDZrr, X86::VPORQZrr, X86::VPORDZrr },
+ { X86::VXORPSZrm, X86::VXORPDZrm, X86::VPXORQZrm, X86::VPXORDZrm },
+ { X86::VXORPSZrr, X86::VXORPDZrr, X86::VPXORQZrr, X86::VPXORDZrr },
+};
+
+static const uint16_t ReplaceableInstrsAVX512DQMasked[][4] = {
+ // Two integer columns for 64-bit and 32-bit elements.
+ //PackedSingle PackedDouble
+ //PackedInt PackedInt
+ { X86::VANDNPSZ128rmk, X86::VANDNPDZ128rmk,
+ X86::VPANDNQZ128rmk, X86::VPANDNDZ128rmk },
+ { X86::VANDNPSZ128rmkz, X86::VANDNPDZ128rmkz,
+ X86::VPANDNQZ128rmkz, X86::VPANDNDZ128rmkz },
+ { X86::VANDNPSZ128rrk, X86::VANDNPDZ128rrk,
+ X86::VPANDNQZ128rrk, X86::VPANDNDZ128rrk },
+ { X86::VANDNPSZ128rrkz, X86::VANDNPDZ128rrkz,
+ X86::VPANDNQZ128rrkz, X86::VPANDNDZ128rrkz },
+ { X86::VANDPSZ128rmk, X86::VANDPDZ128rmk,
+ X86::VPANDQZ128rmk, X86::VPANDDZ128rmk },
+ { X86::VANDPSZ128rmkz, X86::VANDPDZ128rmkz,
+ X86::VPANDQZ128rmkz, X86::VPANDDZ128rmkz },
+ { X86::VANDPSZ128rrk, X86::VANDPDZ128rrk,
+ X86::VPANDQZ128rrk, X86::VPANDDZ128rrk },
+ { X86::VANDPSZ128rrkz, X86::VANDPDZ128rrkz,
+ X86::VPANDQZ128rrkz, X86::VPANDDZ128rrkz },
+ { X86::VORPSZ128rmk, X86::VORPDZ128rmk,
+ X86::VPORQZ128rmk, X86::VPORDZ128rmk },
+ { X86::VORPSZ128rmkz, X86::VORPDZ128rmkz,
+ X86::VPORQZ128rmkz, X86::VPORDZ128rmkz },
+ { X86::VORPSZ128rrk, X86::VORPDZ128rrk,
+ X86::VPORQZ128rrk, X86::VPORDZ128rrk },
+ { X86::VORPSZ128rrkz, X86::VORPDZ128rrkz,
+ X86::VPORQZ128rrkz, X86::VPORDZ128rrkz },
+ { X86::VXORPSZ128rmk, X86::VXORPDZ128rmk,
+ X86::VPXORQZ128rmk, X86::VPXORDZ128rmk },
+ { X86::VXORPSZ128rmkz, X86::VXORPDZ128rmkz,
+ X86::VPXORQZ128rmkz, X86::VPXORDZ128rmkz },
+ { X86::VXORPSZ128rrk, X86::VXORPDZ128rrk,
+ X86::VPXORQZ128rrk, X86::VPXORDZ128rrk },
+ { X86::VXORPSZ128rrkz, X86::VXORPDZ128rrkz,
+ X86::VPXORQZ128rrkz, X86::VPXORDZ128rrkz },
+ { X86::VANDNPSZ256rmk, X86::VANDNPDZ256rmk,
+ X86::VPANDNQZ256rmk, X86::VPANDNDZ256rmk },
+ { X86::VANDNPSZ256rmkz, X86::VANDNPDZ256rmkz,
+ X86::VPANDNQZ256rmkz, X86::VPANDNDZ256rmkz },
+ { X86::VANDNPSZ256rrk, X86::VANDNPDZ256rrk,
+ X86::VPANDNQZ256rrk, X86::VPANDNDZ256rrk },
+ { X86::VANDNPSZ256rrkz, X86::VANDNPDZ256rrkz,
+ X86::VPANDNQZ256rrkz, X86::VPANDNDZ256rrkz },
+ { X86::VANDPSZ256rmk, X86::VANDPDZ256rmk,
+ X86::VPANDQZ256rmk, X86::VPANDDZ256rmk },
+ { X86::VANDPSZ256rmkz, X86::VANDPDZ256rmkz,
+ X86::VPANDQZ256rmkz, X86::VPANDDZ256rmkz },
+ { X86::VANDPSZ256rrk, X86::VANDPDZ256rrk,
+ X86::VPANDQZ256rrk, X86::VPANDDZ256rrk },
+ { X86::VANDPSZ256rrkz, X86::VANDPDZ256rrkz,
+ X86::VPANDQZ256rrkz, X86::VPANDDZ256rrkz },
+ { X86::VORPSZ256rmk, X86::VORPDZ256rmk,
+ X86::VPORQZ256rmk, X86::VPORDZ256rmk },
+ { X86::VORPSZ256rmkz, X86::VORPDZ256rmkz,
+ X86::VPORQZ256rmkz, X86::VPORDZ256rmkz },
+ { X86::VORPSZ256rrk, X86::VORPDZ256rrk,
+ X86::VPORQZ256rrk, X86::VPORDZ256rrk },
+ { X86::VORPSZ256rrkz, X86::VORPDZ256rrkz,
+ X86::VPORQZ256rrkz, X86::VPORDZ256rrkz },
+ { X86::VXORPSZ256rmk, X86::VXORPDZ256rmk,
+ X86::VPXORQZ256rmk, X86::VPXORDZ256rmk },
+ { X86::VXORPSZ256rmkz, X86::VXORPDZ256rmkz,
+ X86::VPXORQZ256rmkz, X86::VPXORDZ256rmkz },
+ { X86::VXORPSZ256rrk, X86::VXORPDZ256rrk,
+ X86::VPXORQZ256rrk, X86::VPXORDZ256rrk },
+ { X86::VXORPSZ256rrkz, X86::VXORPDZ256rrkz,
+ X86::VPXORQZ256rrkz, X86::VPXORDZ256rrkz },
+ { X86::VANDNPSZrmk, X86::VANDNPDZrmk,
+ X86::VPANDNQZrmk, X86::VPANDNDZrmk },
+ { X86::VANDNPSZrmkz, X86::VANDNPDZrmkz,
+ X86::VPANDNQZrmkz, X86::VPANDNDZrmkz },
+ { X86::VANDNPSZrrk, X86::VANDNPDZrrk,
+ X86::VPANDNQZrrk, X86::VPANDNDZrrk },
+ { X86::VANDNPSZrrkz, X86::VANDNPDZrrkz,
+ X86::VPANDNQZrrkz, X86::VPANDNDZrrkz },
+ { X86::VANDPSZrmk, X86::VANDPDZrmk,
+ X86::VPANDQZrmk, X86::VPANDDZrmk },
+ { X86::VANDPSZrmkz, X86::VANDPDZrmkz,
+ X86::VPANDQZrmkz, X86::VPANDDZrmkz },
+ { X86::VANDPSZrrk, X86::VANDPDZrrk,
+ X86::VPANDQZrrk, X86::VPANDDZrrk },
+ { X86::VANDPSZrrkz, X86::VANDPDZrrkz,
+ X86::VPANDQZrrkz, X86::VPANDDZrrkz },
+ { X86::VORPSZrmk, X86::VORPDZrmk,
+ X86::VPORQZrmk, X86::VPORDZrmk },
+ { X86::VORPSZrmkz, X86::VORPDZrmkz,
+ X86::VPORQZrmkz, X86::VPORDZrmkz },
+ { X86::VORPSZrrk, X86::VORPDZrrk,
+ X86::VPORQZrrk, X86::VPORDZrrk },
+ { X86::VORPSZrrkz, X86::VORPDZrrkz,
+ X86::VPORQZrrkz, X86::VPORDZrrkz },
+ { X86::VXORPSZrmk, X86::VXORPDZrmk,
+ X86::VPXORQZrmk, X86::VPXORDZrmk },
+ { X86::VXORPSZrmkz, X86::VXORPDZrmkz,
+ X86::VPXORQZrmkz, X86::VPXORDZrmkz },
+ { X86::VXORPSZrrk, X86::VXORPDZrrk,
+ X86::VPXORQZrrk, X86::VPXORDZrrk },
+ { X86::VXORPSZrrkz, X86::VXORPDZrrkz,
+ X86::VPXORQZrrkz, X86::VPXORDZrrkz },
+ // Broadcast loads can be handled the same as masked operations to avoid
+ // changing element size.
+ { X86::VANDNPSZ128rmb, X86::VANDNPDZ128rmb,
+ X86::VPANDNQZ128rmb, X86::VPANDNDZ128rmb },
+ { X86::VANDPSZ128rmb, X86::VANDPDZ128rmb,
+ X86::VPANDQZ128rmb, X86::VPANDDZ128rmb },
+ { X86::VORPSZ128rmb, X86::VORPDZ128rmb,
+ X86::VPORQZ128rmb, X86::VPORDZ128rmb },
+ { X86::VXORPSZ128rmb, X86::VXORPDZ128rmb,
+ X86::VPXORQZ128rmb, X86::VPXORDZ128rmb },
+ { X86::VANDNPSZ256rmb, X86::VANDNPDZ256rmb,
+ X86::VPANDNQZ256rmb, X86::VPANDNDZ256rmb },
+ { X86::VANDPSZ256rmb, X86::VANDPDZ256rmb,
+ X86::VPANDQZ256rmb, X86::VPANDDZ256rmb },
+ { X86::VORPSZ256rmb, X86::VORPDZ256rmb,
+ X86::VPORQZ256rmb, X86::VPORDZ256rmb },
+ { X86::VXORPSZ256rmb, X86::VXORPDZ256rmb,
+ X86::VPXORQZ256rmb, X86::VPXORDZ256rmb },
+ { X86::VANDNPSZrmb, X86::VANDNPDZrmb,
+ X86::VPANDNQZrmb, X86::VPANDNDZrmb },
+ { X86::VANDPSZrmb, X86::VANDPDZrmb,
+ X86::VPANDQZrmb, X86::VPANDDZrmb },
+ { X86::VANDPSZrmb, X86::VANDPDZrmb,
+ X86::VPANDQZrmb, X86::VPANDDZrmb },
+ { X86::VORPSZrmb, X86::VORPDZrmb,
+ X86::VPORQZrmb, X86::VPORDZrmb },
+ { X86::VXORPSZrmb, X86::VXORPDZrmb,
+ X86::VPXORQZrmb, X86::VPXORDZrmb },
+ { X86::VANDNPSZ128rmbk, X86::VANDNPDZ128rmbk,
+ X86::VPANDNQZ128rmbk, X86::VPANDNDZ128rmbk },
+ { X86::VANDPSZ128rmbk, X86::VANDPDZ128rmbk,
+ X86::VPANDQZ128rmbk, X86::VPANDDZ128rmbk },
+ { X86::VORPSZ128rmbk, X86::VORPDZ128rmbk,
+ X86::VPORQZ128rmbk, X86::VPORDZ128rmbk },
+ { X86::VXORPSZ128rmbk, X86::VXORPDZ128rmbk,
+ X86::VPXORQZ128rmbk, X86::VPXORDZ128rmbk },
+ { X86::VANDNPSZ256rmbk, X86::VANDNPDZ256rmbk,
+ X86::VPANDNQZ256rmbk, X86::VPANDNDZ256rmbk },
+ { X86::VANDPSZ256rmbk, X86::VANDPDZ256rmbk,
+ X86::VPANDQZ256rmbk, X86::VPANDDZ256rmbk },
+ { X86::VORPSZ256rmbk, X86::VORPDZ256rmbk,
+ X86::VPORQZ256rmbk, X86::VPORDZ256rmbk },
+ { X86::VXORPSZ256rmbk, X86::VXORPDZ256rmbk,
+ X86::VPXORQZ256rmbk, X86::VPXORDZ256rmbk },
+ { X86::VANDNPSZrmbk, X86::VANDNPDZrmbk,
+ X86::VPANDNQZrmbk, X86::VPANDNDZrmbk },
+ { X86::VANDPSZrmbk, X86::VANDPDZrmbk,
+ X86::VPANDQZrmbk, X86::VPANDDZrmbk },
+ { X86::VANDPSZrmbk, X86::VANDPDZrmbk,
+ X86::VPANDQZrmbk, X86::VPANDDZrmbk },
+ { X86::VORPSZrmbk, X86::VORPDZrmbk,
+ X86::VPORQZrmbk, X86::VPORDZrmbk },
+ { X86::VXORPSZrmbk, X86::VXORPDZrmbk,
+ X86::VPXORQZrmbk, X86::VPXORDZrmbk },
+ { X86::VANDNPSZ128rmbkz,X86::VANDNPDZ128rmbkz,
+ X86::VPANDNQZ128rmbkz,X86::VPANDNDZ128rmbkz},
+ { X86::VANDPSZ128rmbkz, X86::VANDPDZ128rmbkz,
+ X86::VPANDQZ128rmbkz, X86::VPANDDZ128rmbkz },
+ { X86::VORPSZ128rmbkz, X86::VORPDZ128rmbkz,
+ X86::VPORQZ128rmbkz, X86::VPORDZ128rmbkz },
+ { X86::VXORPSZ128rmbkz, X86::VXORPDZ128rmbkz,
+ X86::VPXORQZ128rmbkz, X86::VPXORDZ128rmbkz },
+ { X86::VANDNPSZ256rmbkz,X86::VANDNPDZ256rmbkz,
+ X86::VPANDNQZ256rmbkz,X86::VPANDNDZ256rmbkz},
+ { X86::VANDPSZ256rmbkz, X86::VANDPDZ256rmbkz,
+ X86::VPANDQZ256rmbkz, X86::VPANDDZ256rmbkz },
+ { X86::VORPSZ256rmbkz, X86::VORPDZ256rmbkz,
+ X86::VPORQZ256rmbkz, X86::VPORDZ256rmbkz },
+ { X86::VXORPSZ256rmbkz, X86::VXORPDZ256rmbkz,
+ X86::VPXORQZ256rmbkz, X86::VPXORDZ256rmbkz },
+ { X86::VANDNPSZrmbkz, X86::VANDNPDZrmbkz,
+ X86::VPANDNQZrmbkz, X86::VPANDNDZrmbkz },
+ { X86::VANDPSZrmbkz, X86::VANDPDZrmbkz,
+ X86::VPANDQZrmbkz, X86::VPANDDZrmbkz },
+ { X86::VANDPSZrmbkz, X86::VANDPDZrmbkz,
+ X86::VPANDQZrmbkz, X86::VPANDDZrmbkz },
+ { X86::VORPSZrmbkz, X86::VORPDZrmbkz,
+ X86::VPORQZrmbkz, X86::VPORDZrmbkz },
+ { X86::VXORPSZrmbkz, X86::VXORPDZrmbkz,
+ X86::VPXORQZrmbkz, X86::VPXORDZrmbkz },
};
// FIXME: Some shuffle and unpack instructions have equivalents in different
// domains, but they require a bit more work than just switching opcodes.
-static const uint16_t *lookup(unsigned opcode, unsigned domain) {
- for (const uint16_t (&Row)[3] : ReplaceableInstrs)
+static const uint16_t *lookup(unsigned opcode, unsigned domain,
+ ArrayRef<uint16_t[3]> Table) {
+ for (const uint16_t (&Row)[3] : Table)
if (Row[domain-1] == opcode)
return Row;
return nullptr;
}
-static const uint16_t *lookupAVX2(unsigned opcode, unsigned domain) {
- for (const uint16_t (&Row)[3] : ReplaceableInstrsAVX2)
- if (Row[domain-1] == opcode)
+static const uint16_t *lookupAVX512(unsigned opcode, unsigned domain,
+ ArrayRef<uint16_t[4]> Table) {
+ // If this is the integer domain make sure to check both integer columns.
+ for (const uint16_t (&Row)[4] : Table)
+ if (Row[domain-1] == opcode || (domain == 3 && Row[3] == opcode))
return Row;
return nullptr;
}
@@ -7247,12 +8762,25 @@ static const uint16_t *lookupAVX2(unsigned opcode, unsigned domain) {
std::pair<uint16_t, uint16_t>
X86InstrInfo::getExecutionDomain(const MachineInstr &MI) const {
uint16_t domain = (MI.getDesc().TSFlags >> X86II::SSEDomainShift) & 3;
- bool hasAVX2 = Subtarget.hasAVX2();
+ unsigned opcode = MI.getOpcode();
uint16_t validDomains = 0;
- if (domain && lookup(MI.getOpcode(), domain))
- validDomains = 0xe;
- else if (domain && lookupAVX2(MI.getOpcode(), domain))
- validDomains = hasAVX2 ? 0xe : 0x6;
+ if (domain) {
+ if (lookup(MI.getOpcode(), domain, ReplaceableInstrs)) {
+ validDomains = 0xe;
+ } else if (lookup(opcode, domain, ReplaceableInstrsAVX2)) {
+ validDomains = Subtarget.hasAVX2() ? 0xe : 0x6;
+ } else if (lookupAVX512(opcode, domain, ReplaceableInstrsAVX512)) {
+ validDomains = 0xe;
+ } else if (lookupAVX512(opcode, domain, ReplaceableInstrsAVX512DQ)) {
+ validDomains = Subtarget.hasDQI() ? 0xe : 0x8;
+ } else if (const uint16_t *table = lookupAVX512(opcode, domain,
+ ReplaceableInstrsAVX512DQMasked)) {
+ if (domain == 1 || (domain == 3 && table[3] == opcode))
+ validDomains = Subtarget.hasDQI() ? 0xa : 0x8;
+ else
+ validDomains = Subtarget.hasDQI() ? 0xc : 0x8;
+ }
+ }
return std::make_pair(domain, validDomains);
}
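// Annotation (not taken from the sources): the second element of the returned
// pair is a bitmask of legal domains, with bit N set when domain N is usable
// (1 = PackedSingle, 2 = PackedDouble, 3 = PackedInt). So 0xe allows all
// three, 0x6 allows PS/PD only, 0x8 pins the instruction to the integer
// domain, and 0xa / 0xc pair the integer form with PS or PD respectively.
// A minimal standalone sketch of how such a mask is interpreted:
#include <cstdint>
static bool domainAllowed(uint16_t ValidDomains, unsigned Domain) {
  // Domain is 1..3 as above; bit 0 would be the "no domain" case.
  return (ValidDomains >> Domain) & 1;
}
// e.g. domainAllowed(0x6, 3) == false: without AVX2 a 256-bit FP operation
// cannot be moved into the integer domain.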
@@ -7260,11 +8788,32 @@ void X86InstrInfo::setExecutionDomain(MachineInstr &MI, unsigned Domain) const {
assert(Domain>0 && Domain<4 && "Invalid execution domain");
uint16_t dom = (MI.getDesc().TSFlags >> X86II::SSEDomainShift) & 3;
assert(dom && "Not an SSE instruction");
- const uint16_t *table = lookup(MI.getOpcode(), dom);
+ const uint16_t *table = lookup(MI.getOpcode(), dom, ReplaceableInstrs);
if (!table) { // try the other table
assert((Subtarget.hasAVX2() || Domain < 3) &&
"256-bit vector operations only available in AVX2");
- table = lookupAVX2(MI.getOpcode(), dom);
+ table = lookup(MI.getOpcode(), dom, ReplaceableInstrsAVX2);
+ }
+ if (!table) { // try the AVX512 table
+ assert(Subtarget.hasAVX512() && "Requires AVX-512");
+ table = lookupAVX512(MI.getOpcode(), dom, ReplaceableInstrsAVX512);
+ // Don't change integer Q instructions to D instructions.
+ if (table && Domain == 3 && table[3] == MI.getOpcode())
+ Domain = 4;
+ }
+ if (!table) { // try the AVX512DQ table
+ assert((Subtarget.hasDQI() || Domain >= 3) && "Requires AVX-512DQ");
+ table = lookupAVX512(MI.getOpcode(), dom, ReplaceableInstrsAVX512DQ);
+ // Don't change integer Q instructions to D instructions and
+    // use D instructions if we started with a PS instruction.
+ if (table && Domain == 3 && (dom == 1 || table[3] == MI.getOpcode()))
+ Domain = 4;
+ }
+ if (!table) { // try the AVX512DQMasked table
+ assert((Subtarget.hasDQI() || Domain >= 3) && "Requires AVX-512DQ");
+ table = lookupAVX512(MI.getOpcode(), dom, ReplaceableInstrsAVX512DQMasked);
+ if (table && Domain == 3 && (dom == 1 || table[3] == MI.getOpcode()))
+ Domain = 4;
}
assert(table && "Cannot change domain");
MI.setDesc(get(table[Domain - 1]));
@@ -7275,32 +8824,6 @@ void X86InstrInfo::getNoopForMachoTarget(MCInst &NopInst) const {
NopInst.setOpcode(X86::NOOP);
}
-// This code must remain in sync with getJumpInstrTableEntryBound in this class!
-// In particular, getJumpInstrTableEntryBound must always return an upper bound
-// on the encoding lengths of the instructions generated by
-// getUnconditionalBranch and getTrap.
-void X86InstrInfo::getUnconditionalBranch(
- MCInst &Branch, const MCSymbolRefExpr *BranchTarget) const {
- Branch.setOpcode(X86::JMP_1);
- Branch.addOperand(MCOperand::createExpr(BranchTarget));
-}
-
-// This code must remain in sync with getJumpInstrTableEntryBound in this class!
-// In particular, getJumpInstrTableEntryBound must always return an upper bound
-// on the encoding lengths of the instructions generated by
-// getUnconditionalBranch and getTrap.
-void X86InstrInfo::getTrap(MCInst &MI) const {
- MI.setOpcode(X86::TRAP);
-}
-
-// See getTrap and getUnconditionalBranch for conditions on the value returned
-// by this function.
-unsigned X86InstrInfo::getJumpInstrTableEntryBound() const {
- // 5 bytes suffice: JMP_4 Symbol@PLT is uses 1 byte (E9) for the JMP_4 and 4
- // bytes for the symbol offset. And TRAP is ud2, which is two bytes (0F 0B).
- return 5;
-}
-
bool X86InstrInfo::isHighLatencyDef(int opc) const {
switch (opc) {
default: return false;
@@ -7934,6 +9457,28 @@ X86InstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
return makeArrayRef(TargetFlags);
}
+bool X86InstrInfo::isTailCall(const MachineInstr &Inst) const {
+ switch (Inst.getOpcode()) {
+ case X86::TCRETURNdi:
+ case X86::TCRETURNmi:
+ case X86::TCRETURNri:
+ case X86::TCRETURNdi64:
+ case X86::TCRETURNmi64:
+ case X86::TCRETURNri64:
+ case X86::TAILJMPd:
+ case X86::TAILJMPm:
+ case X86::TAILJMPr:
+ case X86::TAILJMPd64:
+ case X86::TAILJMPm64:
+ case X86::TAILJMPr64:
+ case X86::TAILJMPm64_REX:
+ case X86::TAILJMPr64_REX:
+ return true;
+ default:
+ return false;
+ }
+}
+
namespace {
/// Create Global Base Reg pass. This initializes the PIC
/// global base register for x86-32.
@@ -7991,7 +9536,7 @@ namespace {
return true;
}
- const char *getPassName() const override {
+ StringRef getPassName() const override {
return "X86 PIC Global Base Reg Initialization";
}
@@ -8105,7 +9650,7 @@ namespace {
return Copy;
}
- const char *getPassName() const override {
+ StringRef getPassName() const override {
return "Local Dynamic TLS Access Clean-up";
}
diff --git a/contrib/llvm/lib/Target/X86/X86InstrInfo.h b/contrib/llvm/lib/Target/X86/X86InstrInfo.h
index a8a9f62..acfdef4 100644
--- a/contrib/llvm/lib/Target/X86/X86InstrInfo.h
+++ b/contrib/llvm/lib/Target/X86/X86InstrInfo.h
@@ -15,6 +15,7 @@
#define LLVM_LIB_TARGET_X86_X86INSTRINFO_H
#include "MCTargetDesc/X86BaseInfo.h"
+#include "X86InstrFMA3Info.h"
#include "X86RegisterInfo.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/Target/TargetInstrInfo.h"
@@ -265,7 +266,7 @@ public:
unsigned &SrcOpIdx2) const override;
/// Returns true if the routine could find two commutable operands
- /// in the given FMA instruction. Otherwise, returns false.
+ /// in the given FMA instruction \p MI. Otherwise, returns false.
///
/// \p SrcOpIdx1 and \p SrcOpIdx2 are INPUT and OUTPUT arguments.
/// The output indices of the commuted operands are returned in these
@@ -274,10 +275,12 @@ public:
/// value 'CommuteAnyOperandIndex' which means that the corresponding
/// operand index is not set and this method is free to pick any of
/// available commutable operands.
+  /// The parameter \p FMA3Group holds a reference to the group of related
+  /// FMA3 opcodes, including the register/memory forms of the 132/213/231
+  /// opcodes.
///
/// For example, calling this method this way:
/// unsigned Idx1 = 1, Idx2 = CommuteAnyOperandIndex;
- /// findFMA3CommutedOpIndices(MI, Idx1, Idx2);
+ /// findFMA3CommutedOpIndices(MI, Idx1, Idx2, FMA3Group);
/// can be interpreted as a query asking if the operand #1 can be swapped
/// with any other available operand (e.g. operand #2, operand #3, etc.).
///
@@ -286,21 +289,30 @@ public:
/// FMA213 #1, #2, #3
/// results into instruction with adjusted opcode:
/// FMA231 #3, #2, #1
- bool findFMA3CommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1,
- unsigned &SrcOpIdx2) const;
+ bool findFMA3CommutedOpIndices(const MachineInstr &MI,
+ unsigned &SrcOpIdx1,
+ unsigned &SrcOpIdx2,
+ const X86InstrFMA3Group &FMA3Group) const;
/// Returns an adjusted FMA opcode that must be used in FMA instruction that
- /// performs the same computations as the given MI but which has the operands
- /// \p SrcOpIdx1 and \p SrcOpIdx2 commuted.
+ /// performs the same computations as the given \p MI but which has the
+ /// operands \p SrcOpIdx1 and \p SrcOpIdx2 commuted.
/// It may return 0 if it is unsafe to commute the operands.
+ /// Note that a machine instruction (instead of its opcode) is passed as the
+ /// first parameter to make it possible to analyze the instruction's uses and
+ /// commute the first operand of FMA even when it seems unsafe when you look
+ /// at the opcode. For example, it is OK to commute the first operand of
+ /// VFMADD*SD_Int if ONLY the lowest 64-bit element of the result is used.
///
/// The returned FMA opcode may differ from the opcode in the given \p MI.
/// For example, commuting the operands #1 and #3 in the following FMA
/// FMA213 #1, #2, #3
/// results into instruction with adjusted opcode:
/// FMA231 #3, #2, #1
- unsigned getFMA3OpcodeToCommuteOperands(MachineInstr &MI, unsigned SrcOpIdx1,
- unsigned SrcOpIdx2) const;
+ unsigned getFMA3OpcodeToCommuteOperands(const MachineInstr &MI,
+ unsigned SrcOpIdx1,
+ unsigned SrcOpIdx2,
+ const X86InstrFMA3Group &FMA3Group) const;
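A minimal standalone sketch of why the commute described above maps a 213 form onto a 231 form. This is not LLVM code; it only models the conventional vfmaddNNN numbering, in which the digits name the two multiplicands and the addend (132: src1*src3+src2, 213: src2*src1+src3, 231: src2*src3+src1).

#include <cassert>
#include <cmath>

// Toy models of the two forms, operands numbered as in the doc comment.
static double fma213(double op1, double op2, double op3) {
  return std::fma(op2, op1, op3); // dst = src2 * src1 + src3
}
static double fma231(double op1, double op2, double op3) {
  return std::fma(op2, op3, op1); // dst = src2 * src3 + src1
}

int main() {
  double a = 1.5, b = -2.0, c = 0.25;
  // FMA213 #1,#2,#3 computes the same value as FMA231 #3,#2,#1,
  // i.e. commuting operands #1 and #3 requires switching 213 -> 231.
  assert(fma213(a, b, c) == fma231(c, b, a));
  return 0;
}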
// Branch analysis.
bool isUnpredicatedTerminator(const MachineInstr &MI) const override;
@@ -316,10 +328,12 @@ public:
TargetInstrInfo::MachineBranchPredicate &MBP,
bool AllowModify = false) const override;
- unsigned RemoveBranch(MachineBasicBlock &MBB) const override;
- unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+ unsigned removeBranch(MachineBasicBlock &MBB,
+ int *BytesRemoved = nullptr) const override;
+ unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond,
- const DebugLoc &DL) const override;
+ const DebugLoc &DL,
+ int *BytesAdded = nullptr) const override;
bool canInsertSelect(const MachineBasicBlock&, ArrayRef<MachineOperand> Cond,
unsigned, unsigned, int&, int&, int&) const override;
void insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
@@ -357,6 +371,10 @@ public:
bool expandPostRAPseudo(MachineInstr &MI) const override;
+ /// Check whether the target can fold a load that feeds a subreg operand
+ /// (or a subreg operand that feeds a store).
+ bool isSubregFoldable() const override { return true; }
+
/// foldMemoryOperand - If this target supports it, fold a load or store of
/// the specified stack slot into the specified machine instruction for the
/// specified operand(s). If this is possible, the target should perform the
@@ -418,13 +436,13 @@ public:
int64_t Offset1, int64_t Offset2,
unsigned NumLoads) const override;
- bool shouldScheduleAdjacent(MachineInstr &First,
- MachineInstr &Second) const override;
+ bool shouldScheduleAdjacent(const MachineInstr &First,
+ const MachineInstr &Second) const override;
void getNoopForMachoTarget(MCInst &NopInst) const override;
bool
- ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override;
+ reverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override;
/// isSafeToMoveRegClassDefs - Return true if it's safe to move a machine
/// instruction that defines the specified register class.
@@ -467,14 +485,6 @@ public:
unsigned Size, unsigned Alignment,
bool AllowCommute) const;
- void
- getUnconditionalBranch(MCInst &Branch,
- const MCSymbolRefExpr *BranchTarget) const override;
-
- void getTrap(MCInst &MI) const override;
-
- unsigned getJumpInstrTableEntryBound() const override;
-
bool isHighLatencyDef(int opc) const override;
bool hasHighOperandLatency(const TargetSchedModel &SchedModel,
@@ -529,6 +539,8 @@ public:
ArrayRef<std::pair<unsigned, const char *>>
getSerializableDirectMachineOperandTargetFlags() const override;
+ bool isTailCall(const MachineInstr &Inst) const override;
+
protected:
/// Commutes the operands in the given instruction by changing the operands
/// order and/or changing the instruction's opcode and/or the immediate value
@@ -564,8 +576,24 @@ private:
bool isFrameOperand(const MachineInstr &MI, unsigned int Op,
int &FrameIndex) const;
- /// Expand the MOVImmSExti8 pseudo-instructions.
- bool ExpandMOVImmSExti8(MachineInstrBuilder &MIB) const;
+ /// Returns true iff the routine could find two commutable operands in the
+ /// given machine instruction with 3 vector inputs.
+ /// The 'SrcOpIdx1' and 'SrcOpIdx2' are INPUT and OUTPUT arguments. Their
+ /// input values can be re-defined in this method only if they are not
+ /// pre-defined, which is designated by assigning them the special value
+ /// 'CommuteAnyOperandIndex'.
+ /// If both indices are pre-defined and refer to some operands, then the
+ /// method simply returns true if the corresponding operands are commutable
+ /// and returns false otherwise.
+ ///
+ /// For example, calling this method this way:
+ /// unsigned Op1 = 1, Op2 = CommuteAnyOperandIndex;
+ /// findThreeSrcCommutedOpIndices(MI, Op1, Op2);
+ /// can be interpreted as a query asking to find an operand that would be
+ /// commutable with operand #1.
+ bool findThreeSrcCommutedOpIndices(const MachineInstr &MI,
+ unsigned &SrcOpIdx1,
+ unsigned &SrcOpIdx2) const;
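A hypothetical, self-contained model of the index-query protocol described in the comment above: a slot holding the "any operand" sentinel is an OUTPUT the routine may fill, while a concrete index is an INPUT constraint it must respect. This is not the LLVM implementation, and kCommuteAnyOperandIndex is a stand-in name rather than LLVM's constant; it only illustrates the calling convention for an instruction whose three source operands are mutually commutable.

#include <cassert>

static const unsigned kCommuteAnyOperandIndex = ~0u; // stand-in sentinel

// Toy query for an instruction whose source operands 1, 2 and 3 are all
// mutually commutable: fill in any wildcard slot, validate fixed ones.
static bool findCommutedOpIndices(unsigned &Idx1, unsigned &Idx2) {
  if (Idx1 == kCommuteAnyOperandIndex && Idx2 == kCommuteAnyOperandIndex) {
    Idx1 = 1;
    Idx2 = 2; // free to pick any commutable pair
    return true;
  }
  if (Idx2 == kCommuteAnyOperandIndex) {
    Idx2 = (Idx1 == 1) ? 2 : 1; // pick a partner for the fixed operand
    return true;
  }
  if (Idx1 == kCommuteAnyOperandIndex) {
    Idx1 = (Idx2 == 1) ? 2 : 1;
    return true;
  }
  // Both indices pre-defined: just report whether they are commutable.
  return Idx1 >= 1 && Idx1 <= 3 && Idx2 >= 1 && Idx2 <= 3 && Idx1 != Idx2;
}

int main() {
  unsigned Op1 = 1, Op2 = kCommuteAnyOperandIndex;
  assert(findCommutedOpIndices(Op1, Op2) && Op1 == 1 && Op2 == 2);
  return 0;
}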
};
} // End llvm namespace
diff --git a/contrib/llvm/lib/Target/X86/X86InstrInfo.td b/contrib/llvm/lib/Target/X86/X86InstrInfo.td
index b19a8f3..3803671 100644
--- a/contrib/llvm/lib/Target/X86/X86InstrInfo.td
+++ b/contrib/llvm/lib/Target/X86/X86InstrInfo.td
@@ -765,6 +765,12 @@ def tls64baseaddr : ComplexPattern<i64, 5, "selectTLSADDRAddr",
def vectoraddr : ComplexPattern<iPTR, 5, "selectVectorAddr", [],[SDNPWantParent]>;
+// A relocatable immediate is either an immediate operand or an operand that can
+// be relocated by the linker to an immediate, such as a regular symbol in
+// non-PIC code.
+def relocImm : ComplexPattern<iAny, 1, "selectRelocImm", [imm, X86Wrapper], [],
+ 0>;
+
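As a rough illustration of the kind of operand relocImm is meant to match (an assumption drawn from the comment above, not from inspecting selectRelocImm): in non-PIC code, taking the address of an ordinary global can be encoded as a mov whose immediate the linker relocates.

// Hypothetical example; the exact lowering depends on the target, code model
// and PIC setting, but non-PIC builds commonly emit something like
//   mov $g_counter, %eax
// where the linker patches the immediate with the symbol's address.
int g_counter;

int *addr_of_counter() {
  return &g_counter;
}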
//===----------------------------------------------------------------------===//
// X86 Instruction Predicate Definitions.
def TruePredicate : Predicate<"true">;
@@ -832,6 +838,7 @@ def HasTBM : Predicate<"Subtarget->hasTBM()">;
def HasMOVBE : Predicate<"Subtarget->hasMOVBE()">;
def HasRDRAND : Predicate<"Subtarget->hasRDRAND()">;
def HasF16C : Predicate<"Subtarget->hasF16C()">;
+def NoF16C : Predicate<"!Subtarget->hasF16C()">;
def HasFSGSBase : Predicate<"Subtarget->hasFSGSBase()">;
def HasLZCNT : Predicate<"Subtarget->hasLZCNT()">;
def HasBMI : Predicate<"Subtarget->hasBMI()">;
@@ -876,8 +883,6 @@ def IsNaCl : Predicate<"Subtarget->isTargetNaCl()">;
def NotNaCl : Predicate<"!Subtarget->isTargetNaCl()">;
def SmallCode : Predicate<"TM.getCodeModel() == CodeModel::Small">;
def KernelCode : Predicate<"TM.getCodeModel() == CodeModel::Kernel">;
-def FarData : Predicate<"TM.getCodeModel() != CodeModel::Small &&"
- "TM.getCodeModel() != CodeModel::Kernel">;
def NearData : Predicate<"TM.getCodeModel() == CodeModel::Small ||"
"TM.getCodeModel() == CodeModel::Kernel">;
def IsNotPIC : Predicate<"!TM.isPositionIndependent()">;
@@ -889,6 +894,7 @@ def CallImmAddr : Predicate<"Subtarget->isLegalToCallImmediateAddr()">;
def FavorMemIndirectCall : Predicate<"!Subtarget->callRegIndirect()">;
def NotSlowIncDec : Predicate<"!Subtarget->slowIncDec()">;
def HasFastMem32 : Predicate<"!Subtarget->isUnalignedMem32Slow()">;
+def HasFastLZCNT : Predicate<"Subtarget->hasFastLZCNT()">;
def HasMFence : Predicate<"Subtarget->hasMFence()">;
//===----------------------------------------------------------------------===//
@@ -923,6 +929,7 @@ def X86_COND_S : PatLeaf<(i8 15)>;
def i16immSExt8 : ImmLeaf<i16, [{ return isInt<8>(Imm); }]>;
def i32immSExt8 : ImmLeaf<i32, [{ return isInt<8>(Imm); }]>;
def i64immSExt8 : ImmLeaf<i64, [{ return isInt<8>(Imm); }]>;
+def i64immSExt32 : ImmLeaf<i64, [{ return isInt<32>(Imm); }]>;
// If we have multiple users of an immediate, it's much smaller to reuse
// the register, rather than encode the immediate in every instruction.
@@ -941,13 +948,16 @@ def i64immSExt8 : ImmLeaf<i64, [{ return isInt<8>(Imm); }]>;
// Eventually, it would be nice to allow ConstantHoisting to merge constants
// globally for potentially added savings.
//
-def imm8_su : PatLeaf<(i8 imm), [{
+def imm8_su : PatLeaf<(i8 relocImm), [{
+ return !shouldAvoidImmediateInstFormsForSize(N);
+}]>;
+def imm16_su : PatLeaf<(i16 relocImm), [{
return !shouldAvoidImmediateInstFormsForSize(N);
}]>;
-def imm16_su : PatLeaf<(i16 imm), [{
+def imm32_su : PatLeaf<(i32 relocImm), [{
return !shouldAvoidImmediateInstFormsForSize(N);
}]>;
-def imm32_su : PatLeaf<(i32 imm), [{
+def i64immSExt32_su : PatLeaf<(i64immSExt32), [{
return !shouldAvoidImmediateInstFormsForSize(N);
}]>;
@@ -957,10 +967,9 @@ def i16immSExt8_su : PatLeaf<(i16immSExt8), [{
def i32immSExt8_su : PatLeaf<(i32immSExt8), [{
return !shouldAvoidImmediateInstFormsForSize(N);
}]>;
-
-
-def i64immSExt32 : ImmLeaf<i64, [{ return isInt<32>(Imm); }]>;
-
+def i64immSExt8_su : PatLeaf<(i64immSExt8), [{
+ return !shouldAvoidImmediateInstFormsForSize(N);
+}]>;
// i64immZExt32 predicate - True if the 64-bit immediate fits in a 32-bit
// unsigned field.
@@ -1375,7 +1384,7 @@ def MOV16ri : Ii16<0xB8, AddRegFrm, (outs GR16:$dst), (ins i16imm:$src),
[(set GR16:$dst, imm:$src)], IIC_MOV>, OpSize16;
def MOV32ri : Ii32<0xB8, AddRegFrm, (outs GR32:$dst), (ins i32imm:$src),
"mov{l}\t{$src, $dst|$dst, $src}",
- [(set GR32:$dst, imm:$src)], IIC_MOV>, OpSize32;
+ [(set GR32:$dst, relocImm:$src)], IIC_MOV>, OpSize32;
def MOV64ri32 : RIi32S<0xC7, MRM0r, (outs GR64:$dst), (ins i64i32imm:$src),
"mov{q}\t{$src, $dst|$dst, $src}",
[(set GR64:$dst, i64immSExt32:$src)], IIC_MOV>;
@@ -1383,7 +1392,7 @@ def MOV64ri32 : RIi32S<0xC7, MRM0r, (outs GR64:$dst), (ins i64i32imm:$src),
let isReMaterializable = 1 in {
def MOV64ri : RIi64<0xB8, AddRegFrm, (outs GR64:$dst), (ins i64imm:$src),
"movabs{q}\t{$src, $dst|$dst, $src}",
- [(set GR64:$dst, imm:$src)], IIC_MOV>;
+ [(set GR64:$dst, relocImm:$src)], IIC_MOV>;
}
// Longer forms that use a ModR/M byte. Needed for disassembler
@@ -1409,7 +1418,7 @@ def MOV32mi : Ii32<0xC7, MRM0m, (outs), (ins i32mem:$dst, i32imm:$src),
[(store (i32 imm32_su:$src), addr:$dst)], IIC_MOV_MEM>, OpSize32;
def MOV64mi32 : RIi32S<0xC7, MRM0m, (outs), (ins i64mem:$dst, i64i32imm:$src),
"mov{q}\t{$src, $dst|$dst, $src}",
- [(store i64immSExt32:$src, addr:$dst)], IIC_MOV_MEM>;
+ [(store i64immSExt32_su:$src, addr:$dst)], IIC_MOV_MEM>;
} // SchedRW
let hasSideEffects = 0 in {
@@ -2251,14 +2260,14 @@ let Predicates = [HasBMI] in {
multiclass bmi_bextr_bzhi<bits<8> opc, string mnemonic, RegisterClass RC,
X86MemOperand x86memop, Intrinsic Int,
PatFrag ld_frag> {
- def rr : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
+ def rr : I<opc, MRMSrcReg4VOp3, (outs RC:$dst), (ins RC:$src1, RC:$src2),
!strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set RC:$dst, (Int RC:$src1, RC:$src2)), (implicit EFLAGS)]>,
- T8PS, VEX_4VOp3;
- def rm : I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src1, RC:$src2),
+ T8PS, VEX;
+ def rm : I<opc, MRMSrcMem4VOp3, (outs RC:$dst), (ins x86memop:$src1, RC:$src2),
!strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set RC:$dst, (Int (ld_frag addr:$src1), RC:$src2)),
- (implicit EFLAGS)]>, T8PS, VEX_4VOp3;
+ (implicit EFLAGS)]>, T8PS, VEX;
}
let Predicates = [HasBMI], Defs = [EFLAGS] in {
@@ -2626,6 +2635,12 @@ def : MnemonicAlias<"ret", "retw", "att">, Requires<[In16BitMode]>;
def : MnemonicAlias<"ret", "retl", "att">, Requires<[In32BitMode]>;
def : MnemonicAlias<"ret", "retq", "att">, Requires<[In64BitMode]>;
+// Apply 'ret' behavior to 'retn'
+def : MnemonicAlias<"retn", "retw", "att">, Requires<[In16BitMode]>;
+def : MnemonicAlias<"retn", "retl", "att">, Requires<[In32BitMode]>;
+def : MnemonicAlias<"retn", "retq", "att">, Requires<[In64BitMode]>;
+def : MnemonicAlias<"retn", "ret", "intel">;
+
def : MnemonicAlias<"sal", "shl", "intel">;
def : MnemonicAlias<"salb", "shlb", "att">;
def : MnemonicAlias<"salw", "shlw", "att">;
diff --git a/contrib/llvm/lib/Target/X86/X86InstrMMX.td b/contrib/llvm/lib/Target/X86/X86InstrMMX.td
index 8d70691..0bb1068 100644
--- a/contrib/llvm/lib/Target/X86/X86InstrMMX.td
+++ b/contrib/llvm/lib/Target/X86/X86InstrMMX.td
@@ -150,8 +150,9 @@ multiclass SS3I_unop_rm_int_mm<bits<8> opc, string OpcodeStr,
/// Binary MMX instructions requiring SSSE3.
let ImmT = NoImm, Constraints = "$src1 = $dst" in {
multiclass SS3I_binop_rm_int_mm<bits<8> opc, string OpcodeStr,
- Intrinsic IntId64, OpndItins itins> {
- let isCommutable = 0 in
+ Intrinsic IntId64, OpndItins itins,
+ bit Commutable = 0> {
+ let isCommutable = Commutable in
def rr64 : MMXSS38I<opc, MRMSrcReg, (outs VR64:$dst),
(ins VR64:$src1, VR64:$src2),
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
@@ -418,9 +419,9 @@ defm MMX_PMULHUW : MMXI_binop_rm_int<0xE4, "pmulhuw", int_x86_mmx_pmulhu_w,
let Predicates = [HasSSE2] in
defm MMX_PMULUDQ : MMXI_binop_rm_int<0xF4, "pmuludq", int_x86_mmx_pmulu_dq,
MMX_PMUL_ITINS, 1>;
-let isCommutable = 1 in
defm MMX_PMULHRSW : SS3I_binop_rm_int_mm<0x0B, "pmulhrsw",
- int_x86_ssse3_pmul_hr_sw, MMX_PMUL_ITINS>;
+ int_x86_ssse3_pmul_hr_sw,
+ MMX_PMUL_ITINS, 1>;
// -- Miscellanea
defm MMX_PMADDWD : MMXI_binop_rm_int<0xF5, "pmaddwd", int_x86_mmx_pmadd_wd,
diff --git a/contrib/llvm/lib/Target/X86/X86InstrSSE.td b/contrib/llvm/lib/Target/X86/X86InstrSSE.td
index f91764a..1812d01 100644
--- a/contrib/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/contrib/llvm/lib/Target/X86/X86InstrSSE.td
@@ -33,7 +33,6 @@ class ShiftOpndItins<InstrItinClass arg_rr, InstrItinClass arg_rm,
InstrItinClass ri = arg_ri;
}
-
// scalar
let Sched = WriteFAdd in {
def SSE_ALU_F32S : OpndItins<
@@ -259,26 +258,24 @@ multiclass sse12_fp_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode,
}
/// sse12_fp_scalar_int - SSE 1 & 2 scalar instructions intrinsics class
-multiclass sse12_fp_scalar_int<bits<8> opc, string OpcodeStr, RegisterClass RC,
- string asm, string SSEVer, string FPSizeStr,
- Operand memopr, ComplexPattern mem_cpat,
- Domain d, OpndItins itins, bit Is2Addr = 1> {
-let isCodeGenOnly = 1 in {
+multiclass sse12_fp_scalar_int<bits<8> opc, string OpcodeStr,
+ SDPatternOperator Int, RegisterClass RC,
+ string asm, Operand memopr,
+ ComplexPattern mem_cpat, Domain d,
+ OpndItins itins, bit Is2Addr = 1> {
+let isCodeGenOnly = 1, hasSideEffects = 0 in {
def rr_Int : SI_Int<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
!if(Is2Addr,
!strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set RC:$dst, (!cast<Intrinsic>(
- !strconcat("int_x86_sse", SSEVer, "_", OpcodeStr, FPSizeStr))
- RC:$src1, RC:$src2))], itins.rr, d>,
+ [(set RC:$dst, (Int RC:$src1, RC:$src2))], itins.rr, d>,
Sched<[itins.Sched]>;
+ let mayLoad = 1 in
def rm_Int : SI_Int<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, memopr:$src2),
!if(Is2Addr,
!strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set RC:$dst, (!cast<Intrinsic>(!strconcat("int_x86_sse",
- SSEVer, "_", OpcodeStr, FPSizeStr))
- RC:$src1, mem_cpat:$src2))], itins.rm, d>,
+ [(set RC:$dst, (Int RC:$src1, mem_cpat:$src2))], itins.rm, d>,
Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
}
@@ -372,13 +369,9 @@ def : Pat<(insert_subvector undef, (v16i8 VR128:$src), (iPTR 0)),
// Implicitly promote a 32-bit scalar to a vector.
def : Pat<(v4f32 (scalar_to_vector FR32:$src)),
(COPY_TO_REGCLASS FR32:$src, VR128)>;
-def : Pat<(v8f32 (scalar_to_vector FR32:$src)),
- (COPY_TO_REGCLASS FR32:$src, VR128)>;
// Implicitly promote a 64-bit scalar to a vector.
def : Pat<(v2f64 (scalar_to_vector FR64:$src)),
(COPY_TO_REGCLASS FR64:$src, VR128)>;
-def : Pat<(v4f64 (scalar_to_vector FR64:$src)),
- (COPY_TO_REGCLASS FR64:$src, VR128)>;
// Bitcasts between 128-bit vector types. Return the original type since
// no instruction is needed for the conversion
@@ -453,9 +446,9 @@ def : Pat<(v4f64 (bitconvert (v8f32 VR256:$src))), (v4f64 VR256:$src)>;
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
isPseudo = 1, SchedRW = [WriteZero] in {
def FsFLD0SS : I<0, Pseudo, (outs FR32:$dst), (ins), "",
- [(set FR32:$dst, fp32imm0)]>, Requires<[HasSSE1]>;
+ [(set FR32:$dst, fp32imm0)]>, Requires<[HasSSE1, NoVLX_Or_NoDQI]>;
def FsFLD0SD : I<0, Pseudo, (outs FR64:$dst), (ins), "",
- [(set FR64:$dst, fpimm0)]>, Requires<[HasSSE2]>;
+ [(set FR64:$dst, fpimm0)]>, Requires<[HasSSE2, NoVLX_Or_NoDQI]>;
}
//===----------------------------------------------------------------------===//
@@ -512,6 +505,7 @@ let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
multiclass sse12_move_rr<RegisterClass RC, SDNode OpNode, ValueType vt,
X86MemOperand x86memop, string base_opc,
string asm_opr, Domain d = GenericDomain> {
+ let isCommutable = 1 in
def rr : SI<0x10, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, RC:$src2),
!strconcat(base_opc, asm_opr),
@@ -590,6 +584,8 @@ let Predicates = [UseAVX] in {
(COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128)>;
def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))),
(COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128)>;
+ def : Pat<(v4f32 (X86vzload addr:$src)),
+ (COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128)>;
// MOVSDrm zeros the high parts of the register; represent this
// with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0
@@ -609,6 +605,8 @@ let Predicates = [UseAVX] in {
def : Pat<(v8f32 (X86vzmovl (insert_subvector undef,
(v4f32 (scalar_to_vector (loadf32 addr:$src))), (iPTR 0)))),
(SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_xmm)>;
+ def : Pat<(v8f32 (X86vzload addr:$src)),
+ (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_xmm)>;
def : Pat<(v4f64 (X86vzmovl (insert_subvector undef,
(v2f64 (scalar_to_vector (loadf64 addr:$src))), (iPTR 0)))),
(SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_xmm)>;
@@ -697,6 +695,8 @@ let Predicates = [UseSSE1] in {
(COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>;
def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))),
(COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>;
+ def : Pat<(v4f32 (X86vzload addr:$src)),
+ (COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>;
}
// Extract and store.
@@ -771,13 +771,12 @@ def : InstAlias<"vmovsd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
multiclass sse12_mov_packed<bits<8> opc, RegisterClass RC,
X86MemOperand x86memop, PatFrag ld_frag,
string asm, Domain d,
- OpndItins itins,
- bit IsReMaterializable = 1> {
+ OpndItins itins> {
let hasSideEffects = 0 in
def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
!strconcat(asm, "\t{$src, $dst|$dst, $src}"), [], itins.rr, d>,
Sched<[WriteFShuffle]>;
-let canFoldAsLoad = 1, isReMaterializable = IsReMaterializable in
+let canFoldAsLoad = 1, isReMaterializable = 1 in
def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
!strconcat(asm, "\t{$src, $dst|$dst, $src}"),
[(set RC:$dst, (ld_frag addr:$src))], itins.rm, d>,
@@ -795,7 +794,7 @@ defm VMOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32,
"movups", SSEPackedSingle, SSE_MOVU_ITINS>,
PS, VEX;
defm VMOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64,
- "movupd", SSEPackedDouble, SSE_MOVU_ITINS, 0>,
+ "movupd", SSEPackedDouble, SSE_MOVU_ITINS>,
PD, VEX;
defm VMOVAPSY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv8f32,
@@ -808,7 +807,7 @@ defm VMOVUPSY : sse12_mov_packed<0x10, VR256, f256mem, loadv8f32,
"movups", SSEPackedSingle, SSE_MOVU_ITINS>,
PS, VEX, VEX_L;
defm VMOVUPDY : sse12_mov_packed<0x10, VR256, f256mem, loadv4f64,
- "movupd", SSEPackedDouble, SSE_MOVU_ITINS, 0>,
+ "movupd", SSEPackedDouble, SSE_MOVU_ITINS>,
PD, VEX, VEX_L;
}
@@ -825,7 +824,7 @@ defm MOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64,
"movapd", SSEPackedDouble, SSE_MOVA_ITINS>,
PD;
defm MOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64,
- "movupd", SSEPackedDouble, SSE_MOVU_ITINS, 0>,
+ "movupd", SSEPackedDouble, SSE_MOVU_ITINS>,
PD;
}
@@ -1028,7 +1027,7 @@ let Predicates = [HasAVX, NoVLX] in {
(VMOVUPSmr addr:$dst, (v16i8 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
}
-let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
+let Predicates = [HasAVX, NoVLX] in {
// 128-bit load/store
def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst),
(VMOVAPSmr addr:$dst, VR128:$src)>;
@@ -1077,29 +1076,6 @@ let Predicates = [UseSSE1] in {
(MOVUPSmr addr:$dst, VR128:$src)>;
}
-// Alias instruction to load FR32 or FR64 from f128mem using movaps. Upper
-// bits are disregarded. FIXME: Set encoding to pseudo!
-let canFoldAsLoad = 1, isReMaterializable = 1, SchedRW = [WriteLoad] in {
-let isCodeGenOnly = 1 in {
- def FsVMOVAPSrm : VPSI<0x28, MRMSrcMem, (outs FR32:$dst), (ins f128mem:$src),
- "movaps\t{$src, $dst|$dst, $src}",
- [(set FR32:$dst, (alignedloadfsf32 addr:$src))],
- IIC_SSE_MOVA_P_RM>, VEX;
- def FsVMOVAPDrm : VPDI<0x28, MRMSrcMem, (outs FR64:$dst), (ins f128mem:$src),
- "movapd\t{$src, $dst|$dst, $src}",
- [(set FR64:$dst, (alignedloadfsf64 addr:$src))],
- IIC_SSE_MOVA_P_RM>, VEX;
- def FsMOVAPSrm : PSI<0x28, MRMSrcMem, (outs FR32:$dst), (ins f128mem:$src),
- "movaps\t{$src, $dst|$dst, $src}",
- [(set FR32:$dst, (alignedloadfsf32 addr:$src))],
- IIC_SSE_MOVA_P_RM>;
- def FsMOVAPDrm : PDI<0x28, MRMSrcMem, (outs FR64:$dst), (ins f128mem:$src),
- "movapd\t{$src, $dst|$dst, $src}",
- [(set FR64:$dst, (alignedloadfsf64 addr:$src))],
- IIC_SSE_MOVA_P_RM>;
-}
-}
-
//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Move Low packed FP Instructions
//===----------------------------------------------------------------------===//
@@ -1300,6 +1276,7 @@ let Predicates = [UseAVX] in {
def : Pat<(v2f64 (X86Unpckl VR128:$src1,
(scalar_to_vector (loadf64 addr:$src2)))),
(VMOVHPDrm VR128:$src1, addr:$src2)>;
+
// Also handle an i64 load because that may get selected as a faster way to
// load the data.
def : Pat<(v2f64 (X86Unpckl VR128:$src1,
@@ -1307,6 +1284,11 @@ let Predicates = [UseAVX] in {
(VMOVHPDrm VR128:$src1, addr:$src2)>;
def : Pat<(store (f64 (extractelt
+ (bc_v2f64 (v4f32 (X86Movhlps VR128:$src, VR128:$src))),
+ (iPTR 0))), addr:$dst),
+ (VMOVHPDmr addr:$dst, VR128:$src)>;
+
+ def : Pat<(store (f64 (extractelt
(v2f64 (X86VPermilpi VR128:$src, (i8 1))),
(iPTR 0))), addr:$dst),
(VMOVHPDmr addr:$dst, VR128:$src)>;
@@ -1332,6 +1314,7 @@ let Predicates = [UseSSE2] in {
def : Pat<(v2f64 (X86Unpckl VR128:$src1,
(scalar_to_vector (loadf64 addr:$src2)))),
(MOVHPDrm VR128:$src1, addr:$src2)>;
+
// Also handle an i64 load because that may get selected as a faster way to
// load the data.
def : Pat<(v2f64 (X86Unpckl VR128:$src1,
@@ -1339,6 +1322,11 @@ let Predicates = [UseSSE2] in {
(MOVHPDrm VR128:$src1, addr:$src2)>;
def : Pat<(store (f64 (extractelt
+ (bc_v2f64 (v4f32 (X86Movhlps VR128:$src, VR128:$src))),
+ (iPTR 0))), addr:$dst),
+ (MOVHPDmr addr:$dst, VR128:$src)>;
+
+ def : Pat<(store (f64 (extractelt
(v2f64 (X86Shufp VR128:$src, VR128:$src, (i8 1))),
(iPTR 0))), addr:$dst),
(MOVHPDmr addr:$dst, VR128:$src)>;
@@ -1371,6 +1359,7 @@ let Constraints = "$src1 = $dst", AddedComplexity = 20 in {
[(set VR128:$dst,
(v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))],
IIC_SSE_MOV_LH>, Sched<[WriteFShuffle]>;
+ let isCommutable = 1 in
def MOVHLPSrr : PSI<0x12, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2),
"movhlps\t{$src2, $dst|$dst, $src2}",
@@ -1449,15 +1438,18 @@ multiclass sse12_cvt_s<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
itins.rm>, Sched<[itins.Sched.Folded]>;
}
-multiclass sse12_cvt_p<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
- X86MemOperand x86memop, string asm, Domain d,
- OpndItins itins> {
+multiclass sse12_cvt_p<bits<8> opc, RegisterClass RC, X86MemOperand x86memop,
+ ValueType DstTy, ValueType SrcTy, PatFrag ld_frag,
+ string asm, Domain d, OpndItins itins> {
let hasSideEffects = 0 in {
- def rr : I<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), asm,
- [], itins.rr, d>, Sched<[itins.Sched]>;
+ def rr : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src), asm,
+ [(set RC:$dst, (DstTy (sint_to_fp (SrcTy RC:$src))))],
+ itins.rr, d>, Sched<[itins.Sched]>;
let mayLoad = 1 in
- def rm : I<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), asm,
- [], itins.rm, d>, Sched<[itins.Sched.Folded]>;
+ def rm : I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), asm,
+ [(set RC:$dst, (DstTy (sint_to_fp
+ (SrcTy (bitconvert (ld_frag addr:$src))))))],
+ itins.rm, d>, Sched<[itins.Sched.Folded]>;
}
}
@@ -1730,16 +1722,16 @@ defm CVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse_cvtss2si64,
ssmem, sse_load_f32, "cvtss2si",
SSE_CVT_SS2SI_64>, XS, REX_W;
-defm VCVTDQ2PS : sse12_cvt_p<0x5B, VR128, VR128, i128mem,
+defm VCVTDQ2PS : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, loadv2i64,
"vcvtdq2ps\t{$src, $dst|$dst, $src}",
SSEPackedSingle, SSE_CVT_PS>,
- PS, VEX, Requires<[HasAVX]>;
-defm VCVTDQ2PSY : sse12_cvt_p<0x5B, VR256, VR256, i256mem,
+ PS, VEX, Requires<[HasAVX, NoVLX]>;
+defm VCVTDQ2PSY : sse12_cvt_p<0x5B, VR256, i256mem, v8f32, v8i32, loadv4i64,
"vcvtdq2ps\t{$src, $dst|$dst, $src}",
SSEPackedSingle, SSE_CVT_PS>,
- PS, VEX, VEX_L, Requires<[HasAVX]>;
+ PS, VEX, VEX_L, Requires<[HasAVX, NoVLX]>;
-defm CVTDQ2PS : sse12_cvt_p<0x5B, VR128, VR128, i128mem,
+defm CVTDQ2PS : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, memopv2i64,
"cvtdq2ps\t{$src, $dst|$dst, $src}",
SSEPackedSingle, SSE_CVT_PS>,
PS, Requires<[UseSSE2]>;
@@ -1798,16 +1790,16 @@ def VCVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst),
Sched<[WriteCvtF2FLd, ReadAfterLd]>;
}
-def : Pat<(f32 (fround FR64:$src)), (VCVTSD2SSrr FR64:$src, FR64:$src)>,
+def : Pat<(f32 (fpround FR64:$src)), (VCVTSD2SSrr FR64:$src, FR64:$src)>,
Requires<[UseAVX]>;
def CVTSD2SSrr : SDI<0x5A, MRMSrcReg, (outs FR32:$dst), (ins FR64:$src),
"cvtsd2ss\t{$src, $dst|$dst, $src}",
- [(set FR32:$dst, (fround FR64:$src))],
+ [(set FR32:$dst, (fpround FR64:$src))],
IIC_SSE_CVT_Scalar_RR>, Sched<[WriteCvtF2F]>;
def CVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst), (ins f64mem:$src),
"cvtsd2ss\t{$src, $dst|$dst, $src}",
- [(set FR32:$dst, (fround (loadf64 addr:$src)))],
+ [(set FR32:$dst, (fpround (loadf64 addr:$src)))],
IIC_SSE_CVT_Scalar_RM>,
XD,
Requires<[UseSSE2, OptForSize]>, Sched<[WriteCvtF2FLd]>;
@@ -1864,9 +1856,9 @@ def VCVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst),
Sched<[WriteCvtF2FLd, ReadAfterLd]>;
}
-def : Pat<(f64 (fextend FR32:$src)),
+def : Pat<(f64 (fpextend FR32:$src)),
(VCVTSS2SDrr FR32:$src, FR32:$src)>, Requires<[UseAVX]>;
-def : Pat<(fextend (loadf32 addr:$src)),
+def : Pat<(fpextend (loadf32 addr:$src)),
(VCVTSS2SDrm (f32 (IMPLICIT_DEF)), addr:$src)>, Requires<[UseAVX]>;
def : Pat<(extloadf32 addr:$src),
@@ -1878,7 +1870,7 @@ def : Pat<(extloadf32 addr:$src),
def CVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR32:$src),
"cvtss2sd\t{$src, $dst|$dst, $src}",
- [(set FR64:$dst, (fextend FR32:$src))],
+ [(set FR64:$dst, (fpextend FR32:$src))],
IIC_SSE_CVT_Scalar_RR>, XS,
Requires<[UseSSE2]>, Sched<[WriteCvtF2F]>;
def CVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), (ins f32mem:$src),
@@ -1887,12 +1879,12 @@ def CVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), (ins f32mem:$src),
IIC_SSE_CVT_Scalar_RM>, XS,
Requires<[UseSSE2, OptForSize]>, Sched<[WriteCvtF2FLd]>;
-// extload f32 -> f64. This matches load+fextend because we have a hack in
+// extload f32 -> f64. This matches load+fpextend because we have a hack in
// the isel (PreprocessForFPConvert) that can introduce loads after dag
// combine.
-// Since these loads aren't folded into the fextend, we have to match it
+// Since these loads aren't folded into the fpextend, we have to match it
// explicitly here.
-def : Pat<(fextend (loadf32 addr:$src)),
+def : Pat<(fpextend (loadf32 addr:$src)),
(CVTSS2SDrm addr:$src)>, Requires<[UseSSE2]>;
def : Pat<(extloadf32 addr:$src),
(CVTSS2SDrr (MOVSSrm addr:$src))>, Requires<[UseSSE2, OptForSpeed]>;
@@ -1930,6 +1922,79 @@ def Int_CVTSS2SDrm: I<0x5A, MRMSrcMem,
}
} // isCodeGenOnly = 1
+// Patterns used for matching (v)cvtsi2ss, (v)cvtsi2sd, (v)cvtsd2ss and
+// (v)cvtss2sd intrinsic sequences from clang which produce unnecessary
+// vmovs{s,d} instructions
+let Predicates = [UseAVX] in {
+def : Pat<(v4f32 (X86Movss
+ (v4f32 VR128:$dst),
+ (v4f32 (scalar_to_vector
+ (f32 (fpround (f64 (extractelt VR128:$src, (iPTR 0))))))))),
+ (Int_VCVTSD2SSrr VR128:$dst, VR128:$src)>;
+
+def : Pat<(v2f64 (X86Movsd
+ (v2f64 VR128:$dst),
+ (v2f64 (scalar_to_vector
+ (f64 (fpextend (f32 (extractelt VR128:$src, (iPTR 0))))))))),
+ (Int_VCVTSS2SDrr VR128:$dst, VR128:$src)>;
+
+def : Pat<(v4f32 (X86Movss
+ (v4f32 VR128:$dst),
+ (v4f32 (scalar_to_vector (f32 (sint_to_fp GR64:$src)))))),
+ (Int_VCVTSI2SS64rr VR128:$dst, GR64:$src)>;
+
+def : Pat<(v4f32 (X86Movss
+ (v4f32 VR128:$dst),
+ (v4f32 (scalar_to_vector (f32 (sint_to_fp GR32:$src)))))),
+ (Int_VCVTSI2SSrr VR128:$dst, GR32:$src)>;
+
+def : Pat<(v2f64 (X86Movsd
+ (v2f64 VR128:$dst),
+ (v2f64 (scalar_to_vector (f64 (sint_to_fp GR64:$src)))))),
+ (Int_VCVTSI2SD64rr VR128:$dst, GR64:$src)>;
+
+def : Pat<(v2f64 (X86Movsd
+ (v2f64 VR128:$dst),
+ (v2f64 (scalar_to_vector (f64 (sint_to_fp GR32:$src)))))),
+ (Int_VCVTSI2SDrr VR128:$dst, GR32:$src)>;
+} // Predicates = [UseAVX]
+
+let Predicates = [UseSSE2] in {
+def : Pat<(v4f32 (X86Movss
+ (v4f32 VR128:$dst),
+ (v4f32 (scalar_to_vector
+ (f32 (fpround (f64 (extractelt VR128:$src, (iPTR 0))))))))),
+ (Int_CVTSD2SSrr VR128:$dst, VR128:$src)>;
+
+def : Pat<(v2f64 (X86Movsd
+ (v2f64 VR128:$dst),
+ (v2f64 (scalar_to_vector
+ (f64 (fpextend (f32 (extractelt VR128:$src, (iPTR 0))))))))),
+ (Int_CVTSS2SDrr VR128:$dst, VR128:$src)>;
+
+def : Pat<(v2f64 (X86Movsd
+ (v2f64 VR128:$dst),
+ (v2f64 (scalar_to_vector (f64 (sint_to_fp GR64:$src)))))),
+ (Int_CVTSI2SD64rr VR128:$dst, GR64:$src)>;
+
+def : Pat<(v2f64 (X86Movsd
+ (v2f64 VR128:$dst),
+ (v2f64 (scalar_to_vector (f64 (sint_to_fp GR32:$src)))))),
+ (Int_CVTSI2SDrr VR128:$dst, GR32:$src)>;
+} // Predicates = [UseSSE2]
+
+let Predicates = [UseSSE1] in {
+def : Pat<(v4f32 (X86Movss
+ (v4f32 VR128:$dst),
+ (v4f32 (scalar_to_vector (f32 (sint_to_fp GR64:$src)))))),
+ (Int_CVTSI2SS64rr VR128:$dst, GR64:$src)>;
+
+def : Pat<(v4f32 (X86Movss
+ (v4f32 VR128:$dst),
+ (v4f32 (scalar_to_vector (f32 (sint_to_fp GR32:$src)))))),
+ (Int_CVTSI2SSrr VR128:$dst, GR32:$src)>;
+} // Predicates = [UseSSE1]
+
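As a rough guide to the source the patterns above target (an assumption about representative clang intrinsic output, not a statement about any specific compiler version), the scalar-conversion intrinsics merge their result into the low lane of an existing vector, which is where the unnecessary vmovs{s,d} mentioned in the comment can appear.

#include <emmintrin.h>

// cvtsd2ss of the low double of 'src', merged into the low lane of 'dst'.
__m128 lower_sd_to_ss(__m128 dst, __m128d src) {
  return _mm_cvtsd_ss(dst, src);
}

// cvtsi2sd of a 32-bit integer, merged into the low lane of 'dst'.
__m128d int_to_low_sd(__m128d dst, int x) {
  return _mm_cvtsi32_sd(dst, x);
}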
// Convert packed single/double fp to doubleword
def VCVTPS2DQrr : VPDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvtps2dq\t{$src, $dst|$dst, $src}",
@@ -1962,134 +2027,98 @@ def CVTPS2DQrm : PDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
// Convert Packed Double FP to Packed DW Integers
-let Predicates = [HasAVX] in {
+let Predicates = [HasAVX, NoVLX] in {
// The assembler can recognize rr 256-bit instructions by seeing a ymm
// register, but the same isn't true when using memory operands instead.
// Provide other assembly rr and rm forms to address this explicitly.
def VCVTPD2DQrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"vcvtpd2dq\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (int_x86_sse2_cvtpd2dq VR128:$src))]>,
+ [(set VR128:$dst,
+ (v4i32 (X86cvtp2Int (v2f64 VR128:$src))))]>,
VEX, Sched<[WriteCvtF2I]>;
// XMM only
def : InstAlias<"vcvtpd2dqx\t{$src, $dst|$dst, $src}",
(VCVTPD2DQrr VR128:$dst, VR128:$src), 0>;
-def VCVTPD2DQXrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
- "vcvtpd2dqx\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst,
- (int_x86_sse2_cvtpd2dq (loadv2f64 addr:$src)))]>, VEX,
- Sched<[WriteCvtF2ILd]>;
+def VCVTPD2DQrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ "vcvtpd2dq{x}\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v4i32 (X86cvtp2Int (loadv2f64 addr:$src))))]>, VEX,
+ Sched<[WriteCvtF2ILd]>;
+def : InstAlias<"vcvtpd2dqx\t{$src, $dst|$dst, $src}",
+ (VCVTPD2DQrm VR128:$dst, f128mem:$src), 0>;
// YMM only
def VCVTPD2DQYrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
- "vcvtpd2dq{y}\t{$src, $dst|$dst, $src}",
+ "vcvtpd2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
- (int_x86_avx_cvt_pd2dq_256 VR256:$src))]>, VEX, VEX_L,
- Sched<[WriteCvtF2I]>;
+ (v4i32 (X86cvtp2Int (v4f64 VR256:$src))))]>,
+ VEX, VEX_L, Sched<[WriteCvtF2I]>;
def VCVTPD2DQYrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
"vcvtpd2dq{y}\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
- (int_x86_avx_cvt_pd2dq_256 (loadv4f64 addr:$src)))]>,
+ (v4i32 (X86cvtp2Int (loadv4f64 addr:$src))))]>,
VEX, VEX_L, Sched<[WriteCvtF2ILd]>;
-def : InstAlias<"vcvtpd2dq\t{$src, $dst|$dst, $src}",
+def : InstAlias<"vcvtpd2dqy\t{$src, $dst|$dst, $src}",
(VCVTPD2DQYrr VR128:$dst, VR256:$src), 0>;
+def : InstAlias<"vcvtpd2dqy\t{$src, $dst|$dst, $src}",
+ (VCVTPD2DQYrm VR128:$dst, f256mem:$src), 0>;
}
def CVTPD2DQrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
"cvtpd2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
- (int_x86_sse2_cvtpd2dq (memopv2f64 addr:$src)))],
+ (v4i32 (X86cvtp2Int (memopv2f64 addr:$src))))],
IIC_SSE_CVT_PD_RM>, Sched<[WriteCvtF2ILd]>;
def CVTPD2DQrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvtpd2dq\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (int_x86_sse2_cvtpd2dq VR128:$src))],
+ [(set VR128:$dst,
+ (v4i32 (X86cvtp2Int (v2f64 VR128:$src))))],
IIC_SSE_CVT_PD_RR>, Sched<[WriteCvtF2I]>;
// Convert with truncation packed single/double fp to doubleword
// SSE2 packed instructions with XS prefix
+let Predicates = [HasAVX, NoVLX] in {
def VCVTTPS2DQrr : VS2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvttps2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
- (int_x86_sse2_cvttps2dq VR128:$src))],
+ (v4i32 (fp_to_sint (v4f32 VR128:$src))))],
IIC_SSE_CVT_PS_RR>, VEX, Sched<[WriteCvtF2I]>;
def VCVTTPS2DQrm : VS2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
"cvttps2dq\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (int_x86_sse2_cvttps2dq
- (loadv4f32 addr:$src)))],
+ [(set VR128:$dst,
+ (v4i32 (fp_to_sint (loadv4f32 addr:$src))))],
IIC_SSE_CVT_PS_RM>, VEX, Sched<[WriteCvtF2ILd]>;
def VCVTTPS2DQYrr : VS2SI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
"cvttps2dq\t{$src, $dst|$dst, $src}",
[(set VR256:$dst,
- (int_x86_avx_cvtt_ps2dq_256 VR256:$src))],
+ (v8i32 (fp_to_sint (v8f32 VR256:$src))))],
IIC_SSE_CVT_PS_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>;
def VCVTTPS2DQYrm : VS2SI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
"cvttps2dq\t{$src, $dst|$dst, $src}",
- [(set VR256:$dst, (int_x86_avx_cvtt_ps2dq_256
- (loadv8f32 addr:$src)))],
+ [(set VR256:$dst,
+ (v8i32 (fp_to_sint (loadv8f32 addr:$src))))],
IIC_SSE_CVT_PS_RM>, VEX, VEX_L,
Sched<[WriteCvtF2ILd]>;
+}
def CVTTPS2DQrr : S2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvttps2dq\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (int_x86_sse2_cvttps2dq VR128:$src))],
+ [(set VR128:$dst,
+ (v4i32 (fp_to_sint (v4f32 VR128:$src))))],
IIC_SSE_CVT_PS_RR>, Sched<[WriteCvtF2I]>;
def CVTTPS2DQrm : S2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
"cvttps2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
- (int_x86_sse2_cvttps2dq (memopv4f32 addr:$src)))],
+ (v4i32 (fp_to_sint (memopv4f32 addr:$src))))],
IIC_SSE_CVT_PS_RM>, Sched<[WriteCvtF2ILd]>;
-let Predicates = [HasAVX] in {
- def : Pat<(int_x86_sse2_cvtdq2ps VR128:$src),
- (VCVTDQ2PSrr VR128:$src)>;
- def : Pat<(int_x86_sse2_cvtdq2ps (bc_v4i32 (loadv2i64 addr:$src))),
- (VCVTDQ2PSrm addr:$src)>;
-}
-
-let Predicates = [HasAVX, NoVLX] in {
- def : Pat<(v4f32 (sint_to_fp (v4i32 VR128:$src))),
- (VCVTDQ2PSrr VR128:$src)>;
- def : Pat<(v4f32 (sint_to_fp (bc_v4i32 (loadv2i64 addr:$src)))),
- (VCVTDQ2PSrm addr:$src)>;
-
- def : Pat<(v4i32 (fp_to_sint (v4f32 VR128:$src))),
- (VCVTTPS2DQrr VR128:$src)>;
- def : Pat<(v4i32 (fp_to_sint (loadv4f32 addr:$src))),
- (VCVTTPS2DQrm addr:$src)>;
-
- def : Pat<(v8f32 (sint_to_fp (v8i32 VR256:$src))),
- (VCVTDQ2PSYrr VR256:$src)>;
- def : Pat<(v8f32 (sint_to_fp (bc_v8i32 (loadv4i64 addr:$src)))),
- (VCVTDQ2PSYrm addr:$src)>;
-
- def : Pat<(v8i32 (fp_to_sint (v8f32 VR256:$src))),
- (VCVTTPS2DQYrr VR256:$src)>;
- def : Pat<(v8i32 (fp_to_sint (loadv8f32 addr:$src))),
- (VCVTTPS2DQYrm addr:$src)>;
-}
-
-let Predicates = [UseSSE2] in {
- def : Pat<(v4f32 (sint_to_fp (v4i32 VR128:$src))),
- (CVTDQ2PSrr VR128:$src)>;
- def : Pat<(v4f32 (sint_to_fp (bc_v4i32 (memopv2i64 addr:$src)))),
- (CVTDQ2PSrm addr:$src)>;
-
- def : Pat<(int_x86_sse2_cvtdq2ps VR128:$src),
- (CVTDQ2PSrr VR128:$src)>;
- def : Pat<(int_x86_sse2_cvtdq2ps (bc_v4i32 (memopv2i64 addr:$src))),
- (CVTDQ2PSrm addr:$src)>;
-
- def : Pat<(v4i32 (fp_to_sint (v4f32 VR128:$src))),
- (CVTTPS2DQrr VR128:$src)>;
- def : Pat<(v4i32 (fp_to_sint (memopv4f32 addr:$src))),
- (CVTTPS2DQrm addr:$src)>;
-}
-
+let Predicates = [HasAVX, NoVLX] in
def VCVTTPD2DQrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvttpd2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
- (int_x86_sse2_cvttpd2dq VR128:$src))],
- IIC_SSE_CVT_PD_RR>, VEX, Sched<[WriteCvtF2I]>;
+ (v4i32 (X86cvttp2si (v2f64 VR128:$src))))],
+ IIC_SSE_CVT_PD_RR>, VEX, Sched<[WriteCvtF2I]>;
// The assembler can recognize rr 256-bit instructions by seeing a ymm
// register, but the same isn't true when using memory operands instead.
@@ -2098,66 +2127,92 @@ def VCVTTPD2DQrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
// XMM only
def : InstAlias<"vcvttpd2dqx\t{$src, $dst|$dst, $src}",
(VCVTTPD2DQrr VR128:$dst, VR128:$src), 0>;
-def VCVTTPD2DQXrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
- "cvttpd2dqx\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (int_x86_sse2_cvttpd2dq
- (loadv2f64 addr:$src)))],
- IIC_SSE_CVT_PD_RM>, VEX, Sched<[WriteCvtF2ILd]>;
+let Predicates = [HasAVX, NoVLX] in
+def VCVTTPD2DQrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ "cvttpd2dq{x}\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v4i32 (X86cvttp2si (loadv2f64 addr:$src))))],
+ IIC_SSE_CVT_PD_RM>, VEX, Sched<[WriteCvtF2ILd]>;
+def : InstAlias<"vcvttpd2dqx\t{$src, $dst|$dst, $src}",
+ (VCVTTPD2DQrm VR128:$dst, f128mem:$src), 0>;
// YMM only
+let Predicates = [HasAVX, NoVLX] in {
def VCVTTPD2DQYrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
- "cvttpd2dq{y}\t{$src, $dst|$dst, $src}",
+ "cvttpd2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
- (int_x86_avx_cvtt_pd2dq_256 VR256:$src))],
+ (v4i32 (fp_to_sint (v4f64 VR256:$src))))],
IIC_SSE_CVT_PD_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>;
def VCVTTPD2DQYrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
"cvttpd2dq{y}\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
- (int_x86_avx_cvtt_pd2dq_256 (loadv4f64 addr:$src)))],
+ (v4i32 (fp_to_sint (loadv4f64 addr:$src))))],
IIC_SSE_CVT_PD_RM>, VEX, VEX_L, Sched<[WriteCvtF2ILd]>;
-def : InstAlias<"vcvttpd2dq\t{$src, $dst|$dst, $src}",
+}
+def : InstAlias<"vcvttpd2dqy\t{$src, $dst|$dst, $src}",
(VCVTTPD2DQYrr VR128:$dst, VR256:$src), 0>;
+def : InstAlias<"vcvttpd2dqy\t{$src, $dst|$dst, $src}",
+ (VCVTTPD2DQYrm VR128:$dst, f256mem:$src), 0>;
let Predicates = [HasAVX, NoVLX] in {
- def : Pat<(v4i32 (fp_to_sint (v4f64 VR256:$src))),
- (VCVTTPD2DQYrr VR256:$src)>;
- def : Pat<(v4i32 (fp_to_sint (loadv4f64 addr:$src))),
- (VCVTTPD2DQYrm addr:$src)>;
+ let AddedComplexity = 15 in {
+ def : Pat<(X86vzmovl (v2i64 (bitconvert
+ (v4i32 (X86cvtp2Int (v2f64 VR128:$src)))))),
+ (VCVTPD2DQrr VR128:$src)>;
+ def : Pat<(X86vzmovl (v2i64 (bitconvert
+ (v4i32 (X86cvttp2si (v2f64 VR128:$src)))))),
+ (VCVTTPD2DQrr VR128:$src)>;
+ }
} // Predicates = [HasAVX]
def CVTTPD2DQrr : PDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvttpd2dq\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (int_x86_sse2_cvttpd2dq VR128:$src))],
+ [(set VR128:$dst,
+ (v4i32 (X86cvttp2si (v2f64 VR128:$src))))],
IIC_SSE_CVT_PD_RR>, Sched<[WriteCvtF2I]>;
def CVTTPD2DQrm : PDI<0xE6, MRMSrcMem, (outs VR128:$dst),(ins f128mem:$src),
"cvttpd2dq\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (int_x86_sse2_cvttpd2dq
- (memopv2f64 addr:$src)))],
- IIC_SSE_CVT_PD_RM>,
- Sched<[WriteCvtF2ILd]>;
+ [(set VR128:$dst,
+ (v4i32 (X86cvttp2si (memopv2f64 addr:$src))))],
+ IIC_SSE_CVT_PD_RM>, Sched<[WriteCvtF2ILd]>;
+
+let Predicates = [UseSSE2] in {
+ let AddedComplexity = 15 in {
+ def : Pat<(X86vzmovl (v2i64 (bitconvert
+ (v4i32 (X86cvtp2Int (v2f64 VR128:$src)))))),
+ (CVTPD2DQrr VR128:$src)>;
+ def : Pat<(X86vzmovl (v2i64 (bitconvert
+ (v4i32 (X86cvttp2si (v2f64 VR128:$src)))))),
+ (CVTTPD2DQrr VR128:$src)>;
+ }
+} // Predicates = [UseSSE2]
// Convert packed single to packed double
-let Predicates = [HasAVX] in {
+let Predicates = [HasAVX, NoVLX] in {
// SSE2 instructions without OpSize prefix
def VCVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"vcvtps2pd\t{$src, $dst|$dst, $src}",
- [], IIC_SSE_CVT_PD_RR>, PS, VEX, Sched<[WriteCvtF2F]>;
+ [(set VR128:$dst, (v2f64 (X86vfpext (v4f32 VR128:$src))))],
+ IIC_SSE_CVT_PD_RR>, PS, VEX, Sched<[WriteCvtF2F]>;
def VCVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
"vcvtps2pd\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))],
IIC_SSE_CVT_PD_RM>, PS, VEX, Sched<[WriteCvtF2FLd]>;
def VCVTPS2PDYrr : I<0x5A, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
"vcvtps2pd\t{$src, $dst|$dst, $src}",
- [], IIC_SSE_CVT_PD_RR>, PS, VEX, VEX_L, Sched<[WriteCvtF2F]>;
+ [(set VR256:$dst, (v4f64 (fpextend (v4f32 VR128:$src))))],
+ IIC_SSE_CVT_PD_RR>, PS, VEX, VEX_L, Sched<[WriteCvtF2F]>;
def VCVTPS2PDYrm : I<0x5A, MRMSrcMem, (outs VR256:$dst), (ins f128mem:$src),
"vcvtps2pd\t{$src, $dst|$dst, $src}",
- [], IIC_SSE_CVT_PD_RM>, PS, VEX, VEX_L, Sched<[WriteCvtF2FLd]>;
+ [(set VR256:$dst, (v4f64 (extloadv4f32 addr:$src)))],
+ IIC_SSE_CVT_PD_RM>, PS, VEX, VEX_L, Sched<[WriteCvtF2FLd]>;
}
let Predicates = [UseSSE2] in {
def CVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvtps2pd\t{$src, $dst|$dst, $src}",
- [], IIC_SSE_CVT_PD_RR>, PS, Sched<[WriteCvtF2F]>;
+ [(set VR128:$dst, (v2f64 (X86vfpext (v4f32 VR128:$src))))],
+ IIC_SSE_CVT_PD_RR>, PS, Sched<[WriteCvtF2F]>;
def CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
"cvtps2pd\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))],
@@ -2165,136 +2220,118 @@ def CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
}
// Convert Packed DW Integers to Packed Double FP
-let Predicates = [HasAVX] in {
+let Predicates = [HasAVX, NoVLX] in {
let hasSideEffects = 0, mayLoad = 1 in
def VCVTDQ2PDrm : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
"vcvtdq2pd\t{$src, $dst|$dst, $src}",
- []>, VEX, Sched<[WriteCvtI2FLd]>;
+ [(set VR128:$dst,
+ (v2f64 (X86VSintToFP (bc_v4i32 (loadv2i64 addr:$src)))))]>,
+ VEX, Sched<[WriteCvtI2FLd]>;
def VCVTDQ2PDrr : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"vcvtdq2pd\t{$src, $dst|$dst, $src}",
- []>, VEX, Sched<[WriteCvtI2F]>;
+ [(set VR128:$dst,
+ (v2f64 (X86VSintToFP (v4i32 VR128:$src))))]>,
+ VEX, Sched<[WriteCvtI2F]>;
def VCVTDQ2PDYrm : S2SI<0xE6, MRMSrcMem, (outs VR256:$dst), (ins i128mem:$src),
"vcvtdq2pd\t{$src, $dst|$dst, $src}",
- []>, VEX, VEX_L, Sched<[WriteCvtI2FLd]>;
+ [(set VR256:$dst,
+ (v4f64 (sint_to_fp (bc_v4i32 (loadv2i64 addr:$src)))))]>,
+ VEX, VEX_L, Sched<[WriteCvtI2FLd]>;
def VCVTDQ2PDYrr : S2SI<0xE6, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
"vcvtdq2pd\t{$src, $dst|$dst, $src}",
- []>, VEX, VEX_L, Sched<[WriteCvtI2F]>;
+ [(set VR256:$dst,
+ (v4f64 (sint_to_fp (v4i32 VR128:$src))))]>,
+ VEX, VEX_L, Sched<[WriteCvtI2F]>;
}
let hasSideEffects = 0, mayLoad = 1 in
def CVTDQ2PDrm : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
- "cvtdq2pd\t{$src, $dst|$dst, $src}", [],
+ "cvtdq2pd\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v2f64 (X86VSintToFP (bc_v4i32 (loadv2i64 addr:$src)))))],
IIC_SSE_CVT_PD_RR>, Sched<[WriteCvtI2FLd]>;
def CVTDQ2PDrr : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
- "cvtdq2pd\t{$src, $dst|$dst, $src}", [],
+ "cvtdq2pd\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v2f64 (X86VSintToFP (v4i32 VR128:$src))))],
IIC_SSE_CVT_PD_RM>, Sched<[WriteCvtI2F]>;
// AVX register conversion intrinsics
-let Predicates = [HasAVX] in {
- def : Pat<(v2f64 (X86cvtdq2pd (v4i32 VR128:$src))),
- (VCVTDQ2PDrr VR128:$src)>;
- def : Pat<(v2f64 (X86cvtdq2pd (bc_v4i32 (loadv2i64 addr:$src)))),
- (VCVTDQ2PDrm addr:$src)>;
- def : Pat<(v2f64 (X86cvtdq2pd (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
+let Predicates = [HasAVX, NoVLX] in {
+ def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
(VCVTDQ2PDrm addr:$src)>;
-
- def : Pat<(v4f64 (sint_to_fp (v4i32 VR128:$src))),
- (VCVTDQ2PDYrr VR128:$src)>;
- def : Pat<(v4f64 (sint_to_fp (bc_v4i32 (loadv2i64 addr:$src)))),
- (VCVTDQ2PDYrm addr:$src)>;
-} // Predicates = [HasAVX]
+} // Predicates = [HasAVX, NoVLX]
// SSE2 register conversion intrinsics
-let Predicates = [HasSSE2] in {
- def : Pat<(v2f64 (X86cvtdq2pd (v4i32 VR128:$src))),
- (CVTDQ2PDrr VR128:$src)>;
- def : Pat<(v2f64 (X86cvtdq2pd (bc_v4i32 (loadv2i64 addr:$src)))),
- (CVTDQ2PDrm addr:$src)>;
- def : Pat<(v2f64 (X86cvtdq2pd (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
+let Predicates = [UseSSE2] in {
+ def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
(CVTDQ2PDrm addr:$src)>;
-} // Predicates = [HasSSE2]
+} // Predicates = [UseSSE2]
// Convert packed double to packed single
// The assembler can recognize rr 256-bit instructions by seeing a ymm
// register, but the same isn't true when using memory operands instead.
// Provide other assembly rr and rm forms to address this explicitly.
+let Predicates = [HasAVX, NoVLX] in
def VCVTPD2PSrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvtpd2ps\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (int_x86_sse2_cvtpd2ps VR128:$src))],
+ [(set VR128:$dst, (X86vfpround (v2f64 VR128:$src)))],
IIC_SSE_CVT_PD_RR>, VEX, Sched<[WriteCvtF2F]>;
// XMM only
def : InstAlias<"vcvtpd2psx\t{$src, $dst|$dst, $src}",
(VCVTPD2PSrr VR128:$dst, VR128:$src), 0>;
-def VCVTPD2PSXrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
- "cvtpd2psx\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst,
- (int_x86_sse2_cvtpd2ps (loadv2f64 addr:$src)))],
- IIC_SSE_CVT_PD_RM>, VEX, Sched<[WriteCvtF2FLd]>;
+let Predicates = [HasAVX, NoVLX] in
+def VCVTPD2PSrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ "cvtpd2ps{x}\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (X86vfpround (loadv2f64 addr:$src)))],
+ IIC_SSE_CVT_PD_RM>, VEX, Sched<[WriteCvtF2FLd]>;
+def : InstAlias<"vcvtpd2psx\t{$src, $dst|$dst, $src}",
+ (VCVTPD2PSrm VR128:$dst, f128mem:$src), 0>;
// YMM only
+let Predicates = [HasAVX, NoVLX] in {
def VCVTPD2PSYrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
- "cvtpd2ps{y}\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst,
- (int_x86_avx_cvt_pd2_ps_256 VR256:$src))],
+ "cvtpd2ps\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (fpround VR256:$src))],
IIC_SSE_CVT_PD_RR>, VEX, VEX_L, Sched<[WriteCvtF2F]>;
def VCVTPD2PSYrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
"cvtpd2ps{y}\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst,
- (int_x86_avx_cvt_pd2_ps_256 (loadv4f64 addr:$src)))],
+ [(set VR128:$dst, (fpround (loadv4f64 addr:$src)))],
IIC_SSE_CVT_PD_RM>, VEX, VEX_L, Sched<[WriteCvtF2FLd]>;
-def : InstAlias<"vcvtpd2ps\t{$src, $dst|$dst, $src}",
+}
+def : InstAlias<"vcvtpd2psy\t{$src, $dst|$dst, $src}",
(VCVTPD2PSYrr VR128:$dst, VR256:$src), 0>;
+def : InstAlias<"vcvtpd2psy\t{$src, $dst|$dst, $src}",
+ (VCVTPD2PSYrm VR128:$dst, f256mem:$src), 0>;
def CVTPD2PSrr : PDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvtpd2ps\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (int_x86_sse2_cvtpd2ps VR128:$src))],
+ [(set VR128:$dst, (X86vfpround (v2f64 VR128:$src)))],
IIC_SSE_CVT_PD_RR>, Sched<[WriteCvtF2F]>;
def CVTPD2PSrm : PDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
"cvtpd2ps\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst,
- (int_x86_sse2_cvtpd2ps (memopv2f64 addr:$src)))],
+ [(set VR128:$dst, (X86vfpround (memopv2f64 addr:$src)))],
IIC_SSE_CVT_PD_RM>, Sched<[WriteCvtF2FLd]>;
-
// AVX 256-bit register conversion intrinsics
// FIXME: Migrate SSE conversion intrinsics matching to use patterns as below
// whenever possible to avoid declaring two versions of each one.
-let Predicates = [HasAVX] in {
- def : Pat<(int_x86_avx_cvtdq2_ps_256 VR256:$src),
- (VCVTDQ2PSYrr VR256:$src)>;
- def : Pat<(int_x86_avx_cvtdq2_ps_256 (bitconvert (loadv4i64 addr:$src))),
- (VCVTDQ2PSYrm addr:$src)>;
-}
let Predicates = [HasAVX, NoVLX] in {
- // Match fround and fextend for 128/256-bit conversions
- def : Pat<(v4f32 (X86vfpround (v2f64 VR128:$src))),
+ // Match fpround and fpextend for 128/256-bit conversions
+ let AddedComplexity = 15 in
+ def : Pat<(X86vzmovl (v2f64 (bitconvert
+ (v4f32 (X86vfpround (v2f64 VR128:$src)))))),
(VCVTPD2PSrr VR128:$src)>;
- def : Pat<(v4f32 (X86vfpround (loadv2f64 addr:$src))),
- (VCVTPD2PSXrm addr:$src)>;
- def : Pat<(v4f32 (fround (v4f64 VR256:$src))),
- (VCVTPD2PSYrr VR256:$src)>;
- def : Pat<(v4f32 (fround (loadv4f64 addr:$src))),
- (VCVTPD2PSYrm addr:$src)>;
-
- def : Pat<(v2f64 (X86vfpext (v4f32 VR128:$src))),
- (VCVTPS2PDrr VR128:$src)>;
- def : Pat<(v4f64 (fextend (v4f32 VR128:$src))),
- (VCVTPS2PDYrr VR128:$src)>;
- def : Pat<(v4f64 (extloadv4f32 addr:$src)),
- (VCVTPS2PDYrm addr:$src)>;
}
let Predicates = [UseSSE2] in {
- // Match fround and fextend for 128 conversions
- def : Pat<(v4f32 (X86vfpround (v2f64 VR128:$src))),
+ // Match fpround and fpextend for 128 conversions
+ let AddedComplexity = 15 in
+ def : Pat<(X86vzmovl (v2f64 (bitconvert
+ (v4f32 (X86vfpround (v2f64 VR128:$src)))))),
(CVTPD2PSrr VR128:$src)>;
- def : Pat<(v4f32 (X86vfpround (memopv2f64 addr:$src))),
- (CVTPD2PSrm addr:$src)>;
-
- def : Pat<(v2f64 (X86vfpext (v4f32 VR128:$src))),
- (CVTPS2PDrr VR128:$src)>;
}
//===----------------------------------------------------------------------===//
@@ -2306,6 +2343,7 @@ multiclass sse12_cmp_scalar<RegisterClass RC, X86MemOperand x86memop,
Operand CC, SDNode OpNode, ValueType VT,
PatFrag ld_frag, string asm, string asm_alt,
OpndItins itins, ImmLeaf immLeaf> {
+ let isCommutable = 1 in
def rr : SIi8<0xC2, MRMSrcReg,
(outs RC:$dst), (ins RC:$src1, RC:$src2, CC:$cc), asm,
[(set RC:$dst, (OpNode (VT RC:$src1), RC:$src2, immLeaf:$cc))],
@@ -2351,9 +2389,9 @@ let Constraints = "$src1 = $dst" in {
SSE_ALU_F64S, i8immZExt3>, XD;
}
-multiclass sse12_cmp_scalar_int<X86MemOperand x86memop, Operand CC,
+multiclass sse12_cmp_scalar_int<Operand memop, Operand CC,
Intrinsic Int, string asm, OpndItins itins,
- ImmLeaf immLeaf> {
+ ImmLeaf immLeaf, ComplexPattern mem_cpat> {
def rr : SIi8<0xC2, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src, CC:$cc), asm,
[(set VR128:$dst, (Int VR128:$src1,
@@ -2361,30 +2399,30 @@ multiclass sse12_cmp_scalar_int<X86MemOperand x86memop, Operand CC,
itins.rr>,
Sched<[itins.Sched]>;
def rm : SIi8<0xC2, MRMSrcMem, (outs VR128:$dst),
- (ins VR128:$src1, x86memop:$src, CC:$cc), asm,
+ (ins VR128:$src1, memop:$src, CC:$cc), asm,
[(set VR128:$dst, (Int VR128:$src1,
- (load addr:$src), immLeaf:$cc))],
+ mem_cpat:$src, immLeaf:$cc))],
itins.rm>,
Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
let isCodeGenOnly = 1 in {
// Aliases to match intrinsics which expect XMM operand(s).
- defm Int_VCMPSS : sse12_cmp_scalar_int<f32mem, AVXCC, int_x86_sse_cmp_ss,
+ defm Int_VCMPSS : sse12_cmp_scalar_int<ssmem, AVXCC, int_x86_sse_cmp_ss,
"cmp${cc}ss\t{$src, $src1, $dst|$dst, $src1, $src}",
- SSE_ALU_F32S, i8immZExt5>,
+ SSE_ALU_F32S, i8immZExt5, sse_load_f32>,
XS, VEX_4V;
- defm Int_VCMPSD : sse12_cmp_scalar_int<f64mem, AVXCC, int_x86_sse2_cmp_sd,
+ defm Int_VCMPSD : sse12_cmp_scalar_int<sdmem, AVXCC, int_x86_sse2_cmp_sd,
"cmp${cc}sd\t{$src, $src1, $dst|$dst, $src1, $src}",
- SSE_ALU_F32S, i8immZExt5>, // same latency as f32
+ SSE_ALU_F32S, i8immZExt5, sse_load_f64>, // same latency as f32
XD, VEX_4V;
let Constraints = "$src1 = $dst" in {
- defm Int_CMPSS : sse12_cmp_scalar_int<f32mem, SSECC, int_x86_sse_cmp_ss,
+ defm Int_CMPSS : sse12_cmp_scalar_int<ssmem, SSECC, int_x86_sse_cmp_ss,
"cmp${cc}ss\t{$src, $dst|$dst, $src}",
- SSE_ALU_F32S, i8immZExt3>, XS;
- defm Int_CMPSD : sse12_cmp_scalar_int<f64mem, SSECC, int_x86_sse2_cmp_sd,
+ SSE_ALU_F32S, i8immZExt3, sse_load_f32>, XS;
+ defm Int_CMPSD : sse12_cmp_scalar_int<sdmem, SSECC, int_x86_sse2_cmp_sd,
"cmp${cc}sd\t{$src, $dst|$dst, $src}",
- SSE_ALU_F64S, i8immZExt3>,
+ SSE_ALU_F64S, i8immZExt3, sse_load_f64>,
XD;
}
}
@@ -2407,6 +2445,23 @@ multiclass sse12_ord_cmp<bits<8> opc, RegisterClass RC, SDNode OpNode,
Sched<[WriteFAddLd, ReadAfterLd]>;
}
+// sse12_ord_cmp_int - Intrinsic version of sse12_ord_cmp
+multiclass sse12_ord_cmp_int<bits<8> opc, RegisterClass RC, SDNode OpNode,
+ ValueType vt, Operand memop,
+ ComplexPattern mem_cpat, string OpcodeStr> {
+ def rr: SI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
+ [(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))],
+ IIC_SSE_COMIS_RR>,
+ Sched<[WriteFAdd]>;
+ def rm: SI<opc, MRMSrcMem, (outs), (ins RC:$src1, memop:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
+ [(set EFLAGS, (OpNode (vt RC:$src1),
+ mem_cpat:$src2))],
+ IIC_SSE_COMIS_RM>,
+ Sched<[WriteFAddLd, ReadAfterLd]>;
+}
+
let Defs = [EFLAGS] in {
defm VUCOMISS : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32,
"ucomiss">, PS, VEX, VEX_LIG;
@@ -2420,15 +2475,15 @@ let Defs = [EFLAGS] in {
}
let isCodeGenOnly = 1 in {
- defm Int_VUCOMISS : sse12_ord_cmp<0x2E, VR128, X86ucomi, v4f32, f128mem,
- load, "ucomiss">, PS, VEX;
- defm Int_VUCOMISD : sse12_ord_cmp<0x2E, VR128, X86ucomi, v2f64, f128mem,
- load, "ucomisd">, PD, VEX;
-
- defm Int_VCOMISS : sse12_ord_cmp<0x2F, VR128, X86comi, v4f32, f128mem,
- load, "comiss">, PS, VEX;
- defm Int_VCOMISD : sse12_ord_cmp<0x2F, VR128, X86comi, v2f64, f128mem,
- load, "comisd">, PD, VEX;
+ defm Int_VUCOMISS : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v4f32, ssmem,
+ sse_load_f32, "ucomiss">, PS, VEX;
+ defm Int_VUCOMISD : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v2f64, sdmem,
+ sse_load_f64, "ucomisd">, PD, VEX;
+
+ defm Int_VCOMISS : sse12_ord_cmp_int<0x2F, VR128, X86comi, v4f32, ssmem,
+ sse_load_f32, "comiss">, PS, VEX;
+ defm Int_VCOMISD : sse12_ord_cmp_int<0x2F, VR128, X86comi, v2f64, sdmem,
+ sse_load_f64, "comisd">, PD, VEX;
}
defm UCOMISS : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32,
"ucomiss">, PS;
@@ -2443,15 +2498,15 @@ let Defs = [EFLAGS] in {
}
let isCodeGenOnly = 1 in {
- defm Int_UCOMISS : sse12_ord_cmp<0x2E, VR128, X86ucomi, v4f32, f128mem,
- load, "ucomiss">, PS;
- defm Int_UCOMISD : sse12_ord_cmp<0x2E, VR128, X86ucomi, v2f64, f128mem,
- load, "ucomisd">, PD;
-
- defm Int_COMISS : sse12_ord_cmp<0x2F, VR128, X86comi, v4f32, f128mem, load,
- "comiss">, PS;
- defm Int_COMISD : sse12_ord_cmp<0x2F, VR128, X86comi, v2f64, f128mem, load,
- "comisd">, PD;
+ defm Int_UCOMISS : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v4f32, ssmem,
+ sse_load_f32, "ucomiss">, PS;
+ defm Int_UCOMISD : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v2f64, sdmem,
+ sse_load_f64, "ucomisd">, PD;
+
+ defm Int_COMISS : sse12_ord_cmp_int<0x2F, VR128, X86comi, v4f32, ssmem,
+ sse_load_f32, "comiss">, PS;
+ defm Int_COMISD : sse12_ord_cmp_int<0x2F, VR128, X86comi, v2f64, sdmem,
+ sse_load_f64, "comisd">, PD;
}
} // Defs = [EFLAGS]
@@ -2641,7 +2696,8 @@ let Predicates = [UseSSE2] in {
multiclass sse12_unpack_interleave<bits<8> opc, SDNode OpNode, ValueType vt,
PatFrag mem_frag, RegisterClass RC,
X86MemOperand x86memop, string asm,
- Domain d> {
+ Domain d, bit IsCommutable = 0> {
+ let isCommutable = IsCommutable in
def rr : PI<opc, MRMSrcReg,
(outs RC:$dst), (ins RC:$src1, RC:$src2),
asm, [(set RC:$dst,
@@ -2689,7 +2745,7 @@ let Constraints = "$src1 = $dst" in {
SSEPackedSingle>, PS;
defm UNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, memopv2f64,
VR128, f128mem, "unpckhpd\t{$src2, $dst|$dst, $src2}",
- SSEPackedDouble>, PD;
+ SSEPackedDouble, 1>, PD;
defm UNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, memopv4f32,
VR128, f128mem, "unpcklps\t{$src2, $dst|$dst, $src2}",
SSEPackedSingle>, PS;
@@ -2810,84 +2866,6 @@ defm PANDN : PDI_binop_all<0xDF, "pandn", X86andnp, v2i64, v4i64,
// SSE 1 & 2 - Logical Instructions
//===----------------------------------------------------------------------===//
-// Multiclass for scalars using the X86 logical operation aliases for FP.
-multiclass sse12_fp_packed_scalar_logical_alias<
- bits<8> opc, string OpcodeStr, SDNode OpNode, OpndItins itins> {
- defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode,
- FR32, f32, f128mem, loadf32_128, SSEPackedSingle, itins, 0>,
- PS, VEX_4V;
-
- defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode,
- FR64, f64, f128mem, loadf64_128, SSEPackedDouble, itins, 0>,
- PD, VEX_4V;
-
- let Constraints = "$src1 = $dst" in {
- defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, FR32,
- f32, f128mem, memopfsf32_128, SSEPackedSingle, itins>, PS;
-
- defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, FR64,
- f64, f128mem, memopfsf64_128, SSEPackedDouble, itins>, PD;
- }
-}
-
-let isCodeGenOnly = 1 in {
- defm FsAND : sse12_fp_packed_scalar_logical_alias<0x54, "and", X86fand,
- SSE_BIT_ITINS_P>;
- defm FsOR : sse12_fp_packed_scalar_logical_alias<0x56, "or", X86for,
- SSE_BIT_ITINS_P>;
- defm FsXOR : sse12_fp_packed_scalar_logical_alias<0x57, "xor", X86fxor,
- SSE_BIT_ITINS_P>;
-
- let isCommutable = 0 in
- defm FsANDN : sse12_fp_packed_scalar_logical_alias<0x55, "andn", X86fandn,
- SSE_BIT_ITINS_P>;
-}
-
-// Multiclass for vectors using the X86 logical operation aliases for FP.
-multiclass sse12_fp_packed_vector_logical_alias<
- bits<8> opc, string OpcodeStr, SDNode OpNode, OpndItins itins> {
- let Predicates = [HasAVX, NoVLX_Or_NoDQI] in {
- defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode,
- VR128, v4f32, f128mem, loadv4f32, SSEPackedSingle, itins, 0>,
- PS, VEX_4V;
-
- defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode,
- VR128, v2f64, f128mem, loadv2f64, SSEPackedDouble, itins, 0>,
- PD, VEX_4V;
-
- defm V#NAME#PSY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode,
- VR256, v8f32, f256mem, loadv8f32, SSEPackedSingle, itins, 0>,
- PS, VEX_4V, VEX_L;
-
- defm V#NAME#PDY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode,
- VR256, v4f64, f256mem, loadv4f64, SSEPackedDouble, itins, 0>,
- PD, VEX_4V, VEX_L;
- }
-
- let Constraints = "$src1 = $dst" in {
- defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR128,
- v4f32, f128mem, memopv4f32, SSEPackedSingle, itins>,
- PS;
-
- defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR128,
- v2f64, f128mem, memopv2f64, SSEPackedDouble, itins>,
- PD;
- }
-}
-
-let isCodeGenOnly = 1 in {
- defm FvAND : sse12_fp_packed_vector_logical_alias<0x54, "and", X86fand,
- SSE_BIT_ITINS_P>;
- defm FvOR : sse12_fp_packed_vector_logical_alias<0x56, "or", X86for,
- SSE_BIT_ITINS_P>;
- defm FvXOR : sse12_fp_packed_vector_logical_alias<0x57, "xor", X86fxor,
- SSE_BIT_ITINS_P>;
-
- let isCommutable = 0 in
- defm FvANDN : sse12_fp_packed_vector_logical_alias<0x55, "andn", X86fandn,
- SSE_BIT_ITINS_P>;
-}
-
/// sse12_fp_packed_logical - SSE 1 & 2 packed FP logical ops
///
multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr,
@@ -2895,7 +2873,8 @@ multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr,
let Predicates = [HasAVX, NoVLX] in {
defm V#NAME#PSY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedSingle,
!strconcat(OpcodeStr, "ps"), f256mem,
- [(set VR256:$dst, (v4i64 (OpNode VR256:$src1, VR256:$src2)))],
+ [(set VR256:$dst, (OpNode (bc_v4i64 (v8f32 VR256:$src1)),
+ (bc_v4i64 (v8f32 VR256:$src2))))],
[(set VR256:$dst, (OpNode (bc_v4i64 (v8f32 VR256:$src1)),
(loadv4i64 addr:$src2)))], 0>, PS, VEX_4V, VEX_L;
@@ -2907,12 +2886,10 @@ multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr,
(loadv4i64 addr:$src2)))], 0>,
PD, VEX_4V, VEX_L;
- // In AVX no need to add a pattern for 128-bit logical rr ps, because they
- // are all promoted to v2i64, and the patterns are covered by the int
- // version. This is needed in SSE only, because v2i64 isn't supported on
- // SSE1, but only on SSE2.
defm V#NAME#PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle,
- !strconcat(OpcodeStr, "ps"), f128mem, [],
+ !strconcat(OpcodeStr, "ps"), f128mem,
+ [(set VR128:$dst, (OpNode (bc_v2i64 (v4f32 VR128:$src1)),
+ (bc_v2i64 (v4f32 VR128:$src2))))],
[(set VR128:$dst, (OpNode (bc_v2i64 (v4f32 VR128:$src1)),
(loadv2i64 addr:$src2)))], 0>, PS, VEX_4V;
@@ -2928,7 +2905,8 @@ multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr,
let Constraints = "$src1 = $dst" in {
defm PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle,
!strconcat(OpcodeStr, "ps"), f128mem,
- [(set VR128:$dst, (v2i64 (OpNode VR128:$src1, VR128:$src2)))],
+ [(set VR128:$dst, (OpNode (bc_v2i64 (v4f32 VR128:$src1)),
+ (bc_v2i64 (v4f32 VR128:$src2))))],
[(set VR128:$dst, (OpNode (bc_v2i64 (v4f32 VR128:$src1)),
(memopv2i64 addr:$src2)))]>, PS;
@@ -2947,19 +2925,124 @@ defm XOR : sse12_fp_packed_logical<0x57, "xor", xor>;
let isCommutable = 0 in
defm ANDN : sse12_fp_packed_logical<0x55, "andn", X86andnp>;
-// AVX1 requires type coercions in order to fold loads directly into logical
-// operations.
+// If only AVX1 is supported, we need to handle integer operations with
+// floating point instructions since the integer versions aren't available.
let Predicates = [HasAVX1Only] in {
- def : Pat<(bc_v8f32 (and VR256:$src1, (loadv4i64 addr:$src2))),
+ def : Pat<(v4i64 (and VR256:$src1, VR256:$src2)),
+ (VANDPSYrr VR256:$src1, VR256:$src2)>;
+ def : Pat<(v4i64 (or VR256:$src1, VR256:$src2)),
+ (VORPSYrr VR256:$src1, VR256:$src2)>;
+ def : Pat<(v4i64 (xor VR256:$src1, VR256:$src2)),
+ (VXORPSYrr VR256:$src1, VR256:$src2)>;
+ def : Pat<(v4i64 (X86andnp VR256:$src1, VR256:$src2)),
+ (VANDNPSYrr VR256:$src1, VR256:$src2)>;
+
+ def : Pat<(and VR256:$src1, (loadv4i64 addr:$src2)),
(VANDPSYrm VR256:$src1, addr:$src2)>;
- def : Pat<(bc_v8f32 (or VR256:$src1, (loadv4i64 addr:$src2))),
+ def : Pat<(or VR256:$src1, (loadv4i64 addr:$src2)),
(VORPSYrm VR256:$src1, addr:$src2)>;
- def : Pat<(bc_v8f32 (xor VR256:$src1, (loadv4i64 addr:$src2))),
+ def : Pat<(xor VR256:$src1, (loadv4i64 addr:$src2)),
(VXORPSYrm VR256:$src1, addr:$src2)>;
- def : Pat<(bc_v8f32 (X86andnp VR256:$src1, (loadv4i64 addr:$src2))),
+ def : Pat<(X86andnp VR256:$src1, (loadv4i64 addr:$src2)),
(VANDNPSYrm VR256:$src1, addr:$src2)>;
}
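As the comment above notes, with AVX1 but no AVX2 there are no 256-bit integer logical instructions, so v4i64 and/or/xor/andn are matched to the float-domain VANDPS/VORPS/VXORPS/VANDNPS forms; the bit pattern of the result is identical either way. A minimal illustration in C intrinsics (the helper name is made up for this sketch):

    #include <immintrin.h>

    /* Sketch only: a 256-bit integer AND expressed through the float-domain
       VANDPS, which is all AVX1 provides; the casts are bit-pattern no-ops. */
    static __m256i and_v4i64_avx1(__m256i a, __m256i b) {
      return _mm256_castps_si256(
          _mm256_and_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
    }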
+let Predicates = [HasAVX, NoVLX_Or_NoDQI] in {
+ // Use packed logical operations for scalar ops.
+ def : Pat<(f64 (X86fand FR64:$src1, FR64:$src2)),
+ (COPY_TO_REGCLASS (VANDPDrr
+ (COPY_TO_REGCLASS FR64:$src1, VR128),
+ (COPY_TO_REGCLASS FR64:$src2, VR128)), FR64)>;
+ def : Pat<(f64 (X86for FR64:$src1, FR64:$src2)),
+ (COPY_TO_REGCLASS (VORPDrr
+ (COPY_TO_REGCLASS FR64:$src1, VR128),
+ (COPY_TO_REGCLASS FR64:$src2, VR128)), FR64)>;
+ def : Pat<(f64 (X86fxor FR64:$src1, FR64:$src2)),
+ (COPY_TO_REGCLASS (VXORPDrr
+ (COPY_TO_REGCLASS FR64:$src1, VR128),
+ (COPY_TO_REGCLASS FR64:$src2, VR128)), FR64)>;
+ def : Pat<(f64 (X86fandn FR64:$src1, FR64:$src2)),
+ (COPY_TO_REGCLASS (VANDNPDrr
+ (COPY_TO_REGCLASS FR64:$src1, VR128),
+ (COPY_TO_REGCLASS FR64:$src2, VR128)), FR64)>;
+
+ def : Pat<(f32 (X86fand FR32:$src1, FR32:$src2)),
+ (COPY_TO_REGCLASS (VANDPSrr
+ (COPY_TO_REGCLASS FR32:$src1, VR128),
+ (COPY_TO_REGCLASS FR32:$src2, VR128)), FR32)>;
+ def : Pat<(f32 (X86for FR32:$src1, FR32:$src2)),
+ (COPY_TO_REGCLASS (VORPSrr
+ (COPY_TO_REGCLASS FR32:$src1, VR128),
+ (COPY_TO_REGCLASS FR32:$src2, VR128)), FR32)>;
+ def : Pat<(f32 (X86fxor FR32:$src1, FR32:$src2)),
+ (COPY_TO_REGCLASS (VXORPSrr
+ (COPY_TO_REGCLASS FR32:$src1, VR128),
+ (COPY_TO_REGCLASS FR32:$src2, VR128)), FR32)>;
+ def : Pat<(f32 (X86fandn FR32:$src1, FR32:$src2)),
+ (COPY_TO_REGCLASS (VANDNPSrr
+ (COPY_TO_REGCLASS FR32:$src1, VR128),
+ (COPY_TO_REGCLASS FR32:$src2, VR128)), FR32)>;
+}
+
+let Predicates = [UseSSE1] in {
+ // Use packed logical operations for scalar ops.
+ def : Pat<(f32 (X86fand FR32:$src1, FR32:$src2)),
+ (COPY_TO_REGCLASS (ANDPSrr
+ (COPY_TO_REGCLASS FR32:$src1, VR128),
+ (COPY_TO_REGCLASS FR32:$src2, VR128)), FR32)>;
+ def : Pat<(f32 (X86for FR32:$src1, FR32:$src2)),
+ (COPY_TO_REGCLASS (ORPSrr
+ (COPY_TO_REGCLASS FR32:$src1, VR128),
+ (COPY_TO_REGCLASS FR32:$src2, VR128)), FR32)>;
+ def : Pat<(f32 (X86fxor FR32:$src1, FR32:$src2)),
+ (COPY_TO_REGCLASS (XORPSrr
+ (COPY_TO_REGCLASS FR32:$src1, VR128),
+ (COPY_TO_REGCLASS FR32:$src2, VR128)), FR32)>;
+ def : Pat<(f32 (X86fandn FR32:$src1, FR32:$src2)),
+ (COPY_TO_REGCLASS (ANDNPSrr
+ (COPY_TO_REGCLASS FR32:$src1, VR128),
+ (COPY_TO_REGCLASS FR32:$src2, VR128)), FR32)>;
+}
+
+let Predicates = [UseSSE2] in {
+ // Use packed logical operations for scalar ops.
+ def : Pat<(f64 (X86fand FR64:$src1, FR64:$src2)),
+ (COPY_TO_REGCLASS (ANDPDrr
+ (COPY_TO_REGCLASS FR64:$src1, VR128),
+ (COPY_TO_REGCLASS FR64:$src2, VR128)), FR64)>;
+ def : Pat<(f64 (X86for FR64:$src1, FR64:$src2)),
+ (COPY_TO_REGCLASS (ORPDrr
+ (COPY_TO_REGCLASS FR64:$src1, VR128),
+ (COPY_TO_REGCLASS FR64:$src2, VR128)), FR64)>;
+ def : Pat<(f64 (X86fxor FR64:$src1, FR64:$src2)),
+ (COPY_TO_REGCLASS (XORPDrr
+ (COPY_TO_REGCLASS FR64:$src1, VR128),
+ (COPY_TO_REGCLASS FR64:$src2, VR128)), FR64)>;
+ def : Pat<(f64 (X86fandn FR64:$src1, FR64:$src2)),
+ (COPY_TO_REGCLASS (ANDNPDrr
+ (COPY_TO_REGCLASS FR64:$src1, VR128),
+ (COPY_TO_REGCLASS FR64:$src2, VR128)), FR64)>;
+}
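The COPY_TO_REGCLASS patterns above implement scalar FP logical ops with the packed ANDPS/ANDPD family, since these instructions have no scalar form; only the low element of the result is used, so whatever the upper lanes contain is irrelevant. A rough C-level analogue of where such an f32 X86fand node typically comes from (the helper name and mask constant are illustrative):

    #include <emmintrin.h>

    /* Sketch: fabs via ANDPS on lane 0, a common source of a scalar X86fand. */
    static float fabs_via_andps(float x) {
      const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff));
      return _mm_cvtss_f32(_mm_and_ps(_mm_set_ss(x), mask));
    }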
+
+// Patterns for packed operations when we don't have integer type available.
+def : Pat<(v4f32 (X86fand VR128:$src1, VR128:$src2)),
+ (ANDPSrr VR128:$src1, VR128:$src2)>;
+def : Pat<(v4f32 (X86for VR128:$src1, VR128:$src2)),
+ (ORPSrr VR128:$src1, VR128:$src2)>;
+def : Pat<(v4f32 (X86fxor VR128:$src1, VR128:$src2)),
+ (XORPSrr VR128:$src1, VR128:$src2)>;
+def : Pat<(v4f32 (X86fandn VR128:$src1, VR128:$src2)),
+ (ANDNPSrr VR128:$src1, VR128:$src2)>;
+
+def : Pat<(X86fand VR128:$src1, (memopv4f32 addr:$src2)),
+ (ANDPSrm VR128:$src1, addr:$src2)>;
+def : Pat<(X86for VR128:$src1, (memopv4f32 addr:$src2)),
+ (ORPSrm VR128:$src1, addr:$src2)>;
+def : Pat<(X86fxor VR128:$src1, (memopv4f32 addr:$src2)),
+ (XORPSrm VR128:$src1, addr:$src2)>;
+def : Pat<(X86fandn VR128:$src1, (memopv4f32 addr:$src2)),
+ (ANDNPSrm VR128:$src1, addr:$src2)>;
+
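One detail worth keeping in mind when reading the ANDN patterns: X86fandn/X86andnp and the ANDNPS instruction negate the first operand, i.e. they compute (~a) & b. A per-lane reference sketch (function name is illustrative):

    /* Reference semantics of ANDNPS/PANDN per 32-bit lane: NOT of the first
       source, ANDed with the second. */
    static unsigned andnps_lane_ref(unsigned a, unsigned b) {
      return ~a & b;
    }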
//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Arithmetic Instructions
//===----------------------------------------------------------------------===//
@@ -3025,20 +3108,22 @@ multiclass basic_sse12_fp_binop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
}
multiclass basic_sse12_fp_binop_s_int<bits<8> opc, string OpcodeStr,
+ SDPatternOperator IntSS,
+ SDPatternOperator IntSD,
SizeItins itins> {
- defm V#NAME#SS : sse12_fp_scalar_int<opc, OpcodeStr, VR128,
- !strconcat(OpcodeStr, "ss"), "", "_ss", ssmem, sse_load_f32,
+ defm V#NAME#SS : sse12_fp_scalar_int<opc, OpcodeStr, IntSS, VR128,
+ !strconcat(OpcodeStr, "ss"), ssmem, sse_load_f32,
SSEPackedSingle, itins.s, 0>, XS, VEX_4V, VEX_LIG;
- defm V#NAME#SD : sse12_fp_scalar_int<opc, OpcodeStr, VR128,
- !strconcat(OpcodeStr, "sd"), "2", "_sd", sdmem, sse_load_f64,
+ defm V#NAME#SD : sse12_fp_scalar_int<opc, OpcodeStr, IntSD, VR128,
+ !strconcat(OpcodeStr, "sd"), sdmem, sse_load_f64,
SSEPackedDouble, itins.d, 0>, XD, VEX_4V, VEX_LIG;
let Constraints = "$src1 = $dst" in {
- defm SS : sse12_fp_scalar_int<opc, OpcodeStr, VR128,
- !strconcat(OpcodeStr, "ss"), "", "_ss", ssmem, sse_load_f32,
+ defm SS : sse12_fp_scalar_int<opc, OpcodeStr, IntSS, VR128,
+ !strconcat(OpcodeStr, "ss"), ssmem, sse_load_f32,
SSEPackedSingle, itins.s>, XS;
- defm SD : sse12_fp_scalar_int<opc, OpcodeStr, VR128,
- !strconcat(OpcodeStr, "sd"), "2", "_sd", sdmem, sse_load_f64,
+ defm SD : sse12_fp_scalar_int<opc, OpcodeStr, IntSD, VR128,
+ !strconcat(OpcodeStr, "sd"), sdmem, sse_load_f64,
SSEPackedDouble, itins.d>, XD;
}
}
@@ -3046,23 +3131,29 @@ multiclass basic_sse12_fp_binop_s_int<bits<8> opc, string OpcodeStr,
// Binary Arithmetic instructions
defm ADD : basic_sse12_fp_binop_p<0x58, "add", fadd, SSE_ALU_ITINS_P>,
basic_sse12_fp_binop_s<0x58, "add", fadd, SSE_ALU_ITINS_S>,
- basic_sse12_fp_binop_s_int<0x58, "add", SSE_ALU_ITINS_S>;
+ basic_sse12_fp_binop_s_int<0x58, "add", null_frag, null_frag,
+ SSE_ALU_ITINS_S>;
defm MUL : basic_sse12_fp_binop_p<0x59, "mul", fmul, SSE_MUL_ITINS_P>,
basic_sse12_fp_binop_s<0x59, "mul", fmul, SSE_MUL_ITINS_S>,
- basic_sse12_fp_binop_s_int<0x59, "mul", SSE_MUL_ITINS_S>;
+ basic_sse12_fp_binop_s_int<0x59, "mul", null_frag, null_frag,
+ SSE_MUL_ITINS_S>;
let isCommutable = 0 in {
defm SUB : basic_sse12_fp_binop_p<0x5C, "sub", fsub, SSE_ALU_ITINS_P>,
basic_sse12_fp_binop_s<0x5C, "sub", fsub, SSE_ALU_ITINS_S>,
- basic_sse12_fp_binop_s_int<0x5C, "sub", SSE_ALU_ITINS_S>;
+ basic_sse12_fp_binop_s_int<0x5C, "sub", null_frag, null_frag,
+ SSE_ALU_ITINS_S>;
defm DIV : basic_sse12_fp_binop_p<0x5E, "div", fdiv, SSE_DIV_ITINS_P>,
basic_sse12_fp_binop_s<0x5E, "div", fdiv, SSE_DIV_ITINS_S>,
- basic_sse12_fp_binop_s_int<0x5E, "div", SSE_DIV_ITINS_S>;
+ basic_sse12_fp_binop_s_int<0x5E, "div", null_frag, null_frag,
+ SSE_DIV_ITINS_S>;
defm MAX : basic_sse12_fp_binop_p<0x5F, "max", X86fmax, SSE_ALU_ITINS_P>,
basic_sse12_fp_binop_s<0x5F, "max", X86fmax, SSE_ALU_ITINS_S>,
- basic_sse12_fp_binop_s_int<0x5F, "max", SSE_ALU_ITINS_S>;
+ basic_sse12_fp_binop_s_int<0x5F, "max", int_x86_sse_max_ss,
+ int_x86_sse2_max_sd, SSE_ALU_ITINS_S>;
defm MIN : basic_sse12_fp_binop_p<0x5D, "min", X86fmin, SSE_ALU_ITINS_P>,
basic_sse12_fp_binop_s<0x5D, "min", X86fmin, SSE_ALU_ITINS_S>,
- basic_sse12_fp_binop_s_int<0x5D, "min", SSE_ALU_ITINS_S>;
+ basic_sse12_fp_binop_s_int<0x5D, "min", int_x86_sse_min_ss,
+ int_x86_sse2_min_sd, SSE_ALU_ITINS_S>;
}
let isCodeGenOnly = 1 in {
@@ -3145,9 +3236,15 @@ multiclass scalar_math_f32_patterns<SDNode Op, string OpcPrefix> {
}
- // Repeat everything for AVX, except for the movss + scalar combo...
- // because that one shouldn't occur with AVX codegen?
- let Predicates = [HasAVX] in {
+ // Repeat everything for AVX.
+ let Predicates = [UseAVX] in {
+ // extracted scalar math op with insert via movss
+ def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
+ (Op (f32 (extractelt (v4f32 VR128:$dst), (iPTR 0))),
+ FR32:$src))))),
+ (!cast<I>("V"#OpcPrefix#SSrr_Int) v4f32:$dst,
+ (COPY_TO_REGCLASS FR32:$src, VR128))>;
+
// extracted scalar math op with insert via blend
def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
(Op (f32 (extractelt (v4f32 VR128:$dst), (iPTR 0))),
@@ -3203,7 +3300,7 @@ multiclass scalar_math_f64_patterns<SDNode Op, string OpcPrefix> {
}
// Repeat everything for AVX.
- let Predicates = [HasAVX] in {
+ let Predicates = [UseAVX] in {
// extracted scalar math op with insert via movsd
def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector
(Op (f64 (extractelt (v2f64 VR128:$dst), (iPTR 0))),
@@ -3287,8 +3384,8 @@ def SSE_RCPS : OpndItins<
/// the HW instructions are 2 operand / destructive.
multiclass sse_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
ValueType vt, ValueType ScalarVT,
- X86MemOperand x86memop, Operand vec_memop,
- ComplexPattern mem_cpat, Intrinsic Intr,
+ X86MemOperand x86memop,
+ Intrinsic Intr,
SDNode OpNode, Domain d, OpndItins itins,
Predicate target, string Suffix> {
let hasSideEffects = 0 in {
@@ -3308,23 +3405,17 @@ multiclass sse_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
[]>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
let mayLoad = 1 in
- def m_Int : I<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, vec_memop:$src2),
+ def m_Int : I<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, x86memop:$src2),
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
[]>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
}
let Predicates = [target] in {
- def : Pat<(vt (OpNode mem_cpat:$src)),
- (vt (COPY_TO_REGCLASS (vt (!cast<Instruction>(NAME#Suffix##m_Int)
- (vt (IMPLICIT_DEF)), mem_cpat:$src)), RC))>;
// These are unary operations, but they are modeled as having 2 source operands
// because the high elements of the destination are unchanged in SSE.
def : Pat<(Intr VR128:$src),
(!cast<Instruction>(NAME#Suffix##r_Int) VR128:$src, VR128:$src)>;
- def : Pat<(Intr (load addr:$src)),
- (vt (COPY_TO_REGCLASS(!cast<Instruction>(NAME#Suffix##m)
- addr:$src), VR128))>;
}
// We don't want to fold scalar loads into these instructions unless
// optimizing for size. This is because the folded instruction will have a
@@ -3334,16 +3425,15 @@ multiclass sse_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
// which has a clobber before the rcp, vs.
// rcpss mem, %xmm0
let Predicates = [target, OptForSize] in {
- def : Pat<(Intr mem_cpat:$src),
+ def : Pat<(Intr (scalar_to_vector (ScalarVT (load addr:$src2)))),
(!cast<Instruction>(NAME#Suffix##m_Int)
- (vt (IMPLICIT_DEF)), mem_cpat:$src)>;
+ (vt (IMPLICIT_DEF)), addr:$src2)>;
}
}
multiclass avx_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
ValueType vt, ValueType ScalarVT,
- X86MemOperand x86memop, Operand vec_memop,
- ComplexPattern mem_cpat,
+ X86MemOperand x86memop,
Intrinsic Intr, SDNode OpNode, Domain d,
OpndItins itins, string Suffix> {
let hasSideEffects = 0 in {
@@ -3361,7 +3451,7 @@ multiclass avx_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
[]>, Sched<[itins.Sched.Folded]>;
let mayLoad = 1 in
def m_Int : I<opc, MRMSrcMem, (outs VR128:$dst),
- (ins VR128:$src1, vec_memop:$src2),
+ (ins VR128:$src1, x86memop:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[]>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
@@ -3382,21 +3472,18 @@ multiclass avx_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
}
let Predicates = [HasAVX] in {
def : Pat<(Intr VR128:$src),
- (!cast<Instruction>("V"#NAME#Suffix##r_Int) (vt (IMPLICIT_DEF)),
+ (!cast<Instruction>("V"#NAME#Suffix##r_Int) VR128:$src,
VR128:$src)>;
}
let Predicates = [HasAVX, OptForSize] in {
- def : Pat<(Intr mem_cpat:$src),
+ def : Pat<(Intr (scalar_to_vector (ScalarVT (load addr:$src2)))),
(!cast<Instruction>("V"#NAME#Suffix##m_Int)
- (vt (IMPLICIT_DEF)), mem_cpat:$src)>;
+ (vt (IMPLICIT_DEF)), addr:$src2)>;
}
let Predicates = [UseAVX, OptForSize] in {
def : Pat<(ScalarVT (OpNode (load addr:$src))),
(!cast<Instruction>("V"#NAME#Suffix##m) (ScalarVT (IMPLICIT_DEF)),
addr:$src)>;
- def : Pat<(vt (OpNode mem_cpat:$src)),
- (!cast<Instruction>("V"#NAME#Suffix##m_Int) (vt (IMPLICIT_DEF)),
- mem_cpat:$src)>;
}
}
@@ -3475,11 +3562,10 @@ let Predicates = [HasAVX] in {
multiclass sse1_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
OpndItins itins> {
defm SS : sse_fp_unop_s<opc, OpcodeStr##ss, FR32, v4f32, f32, f32mem,
- ssmem, sse_load_f32,
!cast<Intrinsic>("int_x86_sse_"##OpcodeStr##_ss), OpNode,
SSEPackedSingle, itins, UseSSE1, "SS">, XS;
defm V#NAME#SS : avx_fp_unop_s<opc, "v"#OpcodeStr##ss, FR32, v4f32, f32,
- f32mem, ssmem, sse_load_f32,
+ f32mem,
!cast<Intrinsic>("int_x86_sse_"##OpcodeStr##_ss), OpNode,
SSEPackedSingle, itins, "SS">, XS, VEX_4V, VEX_LIG;
}
@@ -3487,11 +3573,10 @@ multiclass sse1_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
OpndItins itins> {
defm SD : sse_fp_unop_s<opc, OpcodeStr##sd, FR64, v2f64, f64, f64mem,
- sdmem, sse_load_f64,
!cast<Intrinsic>("int_x86_sse2_"##OpcodeStr##_sd),
OpNode, SSEPackedDouble, itins, UseSSE2, "SD">, XD;
defm V#NAME#SD : avx_fp_unop_s<opc, "v"#OpcodeStr##sd, FR64, v2f64, f64,
- f64mem, sdmem, sse_load_f64,
+ f64mem,
!cast<Intrinsic>("int_x86_sse2_"##OpcodeStr##_sd),
OpNode, SSEPackedDouble, itins, "SD">,
XD, VEX_4V, VEX_LIG;
@@ -3805,13 +3890,14 @@ def VMOVDQUYmr : I<0x7F, MRMDestMem, (outs), (ins i256mem:$dst, VR256:$src),
}
let SchedRW = [WriteMove] in {
-let hasSideEffects = 0 in
+let hasSideEffects = 0 in {
def MOVDQArr : PDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RR>;
def MOVDQUrr : I<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"movdqu\t{$src, $dst|$dst, $src}",
[], IIC_SSE_MOVU_P_RR>, XS, Requires<[UseSSE2]>;
+}
// For Disassembler
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in {
@@ -3874,85 +3960,12 @@ def SSE_PMADD : OpndItins<
let ExeDomain = SSEPackedInt in { // SSE integer instructions
-multiclass PDI_binop_rm_int<bits<8> opc, string OpcodeStr, Intrinsic IntId,
- RegisterClass RC, PatFrag memop_frag,
- X86MemOperand x86memop,
- OpndItins itins,
- bit IsCommutable = 0,
- bit Is2Addr = 1> {
- let isCommutable = IsCommutable in
- def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
- (ins RC:$src1, RC:$src2),
- !if(Is2Addr,
- !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
- !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set RC:$dst, (IntId RC:$src1, RC:$src2))], itins.rr>,
- Sched<[itins.Sched]>;
- def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
- (ins RC:$src1, x86memop:$src2),
- !if(Is2Addr,
- !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
- !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set RC:$dst, (IntId RC:$src1, (bitconvert (memop_frag addr:$src2))))],
- itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
-}
-
-multiclass PDI_binop_all_int<bits<8> opc, string OpcodeStr, Intrinsic IntId128,
- Intrinsic IntId256, OpndItins itins,
- bit IsCommutable = 0> {
-let Predicates = [HasAVX] in
- defm V#NAME : PDI_binop_rm_int<opc, !strconcat("v", OpcodeStr), IntId128,
- VR128, loadv2i64, i128mem, itins,
- IsCommutable, 0>, VEX_4V;
-
-let Constraints = "$src1 = $dst" in
- defm NAME : PDI_binop_rm_int<opc, OpcodeStr, IntId128, VR128, memopv2i64,
- i128mem, itins, IsCommutable, 1>;
-
-let Predicates = [HasAVX2] in
- defm V#NAME#Y : PDI_binop_rm_int<opc, !strconcat("v", OpcodeStr), IntId256,
- VR256, loadv4i64, i256mem, itins,
- IsCommutable, 0>, VEX_4V, VEX_L;
-}
-
-multiclass PDI_binop_rmi<bits<8> opc, bits<8> opc2, Format ImmForm,
- string OpcodeStr, SDNode OpNode,
- SDNode OpNode2, RegisterClass RC,
- ValueType DstVT, ValueType SrcVT,
- PatFrag ld_frag, ShiftOpndItins itins,
- bit Is2Addr = 1> {
- // src2 is always 128-bit
- def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
- (ins RC:$src1, VR128:$src2),
- !if(Is2Addr,
- !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
- !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set RC:$dst, (DstVT (OpNode RC:$src1, (SrcVT VR128:$src2))))],
- itins.rr>, Sched<[WriteVecShift]>;
- def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
- (ins RC:$src1, i128mem:$src2),
- !if(Is2Addr,
- !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
- !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set RC:$dst, (DstVT (OpNode RC:$src1,
- (SrcVT (bitconvert (ld_frag addr:$src2))))))], itins.rm>,
- Sched<[WriteVecShiftLd, ReadAfterLd]>;
- def ri : PDIi8<opc2, ImmForm, (outs RC:$dst),
- (ins RC:$src1, u8imm:$src2),
- !if(Is2Addr,
- !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
- !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set RC:$dst, (DstVT (OpNode2 RC:$src1, (i8 imm:$src2))))], itins.ri>,
- Sched<[WriteVecShift]>;
-}
-
/// PDI_binop_rm2 - Simple SSE2 binary operator with different src and dst types
multiclass PDI_binop_rm2<bits<8> opc, string OpcodeStr, SDNode OpNode,
ValueType DstVT, ValueType SrcVT, RegisterClass RC,
PatFrag memop_frag, X86MemOperand x86memop,
- OpndItins itins,
- bit IsCommutable = 0, bit Is2Addr = 1> {
- let isCommutable = IsCommutable in
+ OpndItins itins, bit Is2Addr = 1> {
+ let isCommutable = 1 in
def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, RC:$src2),
!if(Is2Addr,
@@ -3984,9 +3997,9 @@ defm PADDSB : PDI_binop_all<0xEC, "paddsb", X86adds, v16i8, v32i8,
defm PADDSW : PDI_binop_all<0xED, "paddsw", X86adds, v8i16, v16i16,
SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
defm PADDUSB : PDI_binop_all<0xDC, "paddusb", X86addus, v16i8, v32i8,
- SSE_INTALU_ITINS_P, 0, NoVLX_Or_NoBWI>;
+ SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
defm PADDUSW : PDI_binop_all<0xDD, "paddusw", X86addus, v8i16, v16i16,
- SSE_INTALU_ITINS_P, 0, NoVLX_Or_NoBWI>;
+ SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
defm PMULLW : PDI_binop_all<0xD5, "pmullw", mul, v8i16, v16i16,
SSE_INTMUL_ITINS_P, 1, NoVLX_Or_NoBWI>;
defm PMULHUW : PDI_binop_all<0xE4, "pmulhuw", mulhu, v8i16, v16i16,
@@ -4022,184 +4035,141 @@ defm PAVGB : PDI_binop_all<0xE0, "pavgb", X86avg, v16i8, v32i8,
defm PAVGW : PDI_binop_all<0xE3, "pavgw", X86avg, v8i16, v16i16,
SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
-// Intrinsic forms
-defm PMADDWD : PDI_binop_all_int<0xF5, "pmaddwd", int_x86_sse2_pmadd_wd,
- int_x86_avx2_pmadd_wd, SSE_PMADD, 1>;
+let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
+defm VPMADDWD : PDI_binop_rm2<0xF5, "vpmaddwd", X86vpmaddwd, v4i32, v8i16, VR128,
+ loadv2i64, i128mem, SSE_PMADD, 0>, VEX_4V;
+
+let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
+defm VPMADDWDY : PDI_binop_rm2<0xF5, "vpmaddwd", X86vpmaddwd, v8i32, v16i16,
+ VR256, loadv4i64, i256mem, SSE_PMADD,
+ 0>, VEX_4V, VEX_L;
+let Constraints = "$src1 = $dst" in
+defm PMADDWD : PDI_binop_rm2<0xF5, "pmaddwd", X86vpmaddwd, v4i32, v8i16, VR128,
+ memopv2i64, i128mem, SSE_PMADD>;
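The (V)PMADDWD definitions above now go through the X86vpmaddwd node rather than the target intrinsics; the instruction multiplies adjacent signed 16-bit pairs and sums each pair into a signed 32-bit lane, which is why a v4i32 destination type is paired with a v8i16 source type. A scalar reference sketch (the helper name is illustrative):

    /* Reference semantics of (v)pmaddwd for one 128-bit register. */
    static void pmaddwd_ref(const short a[8], const short b[8], int out[4]) {
      for (int i = 0; i < 4; ++i)
        out[i] = (int)a[2*i] * b[2*i] + (int)a[2*i + 1] * b[2*i + 1];
    }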
let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
defm VPSADBW : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v2i64, v16i8, VR128,
- loadv2i64, i128mem, SSE_INTMUL_ITINS_P, 1, 0>,
+ loadv2i64, i128mem, SSE_INTMUL_ITINS_P, 0>,
VEX_4V;
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
defm VPSADBWY : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v4i64, v32i8, VR256,
- loadv4i64, i256mem, SSE_INTMUL_ITINS_P, 1, 0>,
+ loadv4i64, i256mem, SSE_INTMUL_ITINS_P, 0>,
VEX_4V, VEX_L;
let Constraints = "$src1 = $dst" in
defm PSADBW : PDI_binop_rm2<0xF6, "psadbw", X86psadbw, v2i64, v16i8, VR128,
- memopv2i64, i128mem, SSE_INTALU_ITINS_P, 1>;
+ memopv2i64, i128mem, SSE_INTALU_ITINS_P>;
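PSADBW sums absolute differences of unsigned bytes: each 64-bit result lane holds the sum over its eight byte pairs, zero-extended, hence the v2i64/v16i8 type pairing above. Reference for one lane (name illustrative):

    /* Reference semantics of (v)psadbw for one 64-bit lane. */
    static unsigned long long psadbw_lane_ref(const unsigned char a[8],
                                              const unsigned char b[8]) {
      unsigned long long sum = 0;
      for (int i = 0; i < 8; ++i)
        sum += (unsigned)(a[i] > b[i] ? a[i] - b[i] : b[i] - a[i]);
      return sum;
    }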
let Predicates = [HasAVX, NoVLX] in
defm VPMULUDQ : PDI_binop_rm2<0xF4, "vpmuludq", X86pmuludq, v2i64, v4i32, VR128,
- loadv2i64, i128mem, SSE_INTMUL_ITINS_P, 1, 0>,
+ loadv2i64, i128mem, SSE_INTMUL_ITINS_P, 0>,
VEX_4V;
let Predicates = [HasAVX2, NoVLX] in
defm VPMULUDQY : PDI_binop_rm2<0xF4, "vpmuludq", X86pmuludq, v4i64, v8i32,
VR256, loadv4i64, i256mem,
- SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V, VEX_L;
+ SSE_INTMUL_ITINS_P, 0>, VEX_4V, VEX_L;
let Constraints = "$src1 = $dst" in
defm PMULUDQ : PDI_binop_rm2<0xF4, "pmuludq", X86pmuludq, v2i64, v4i32, VR128,
- memopv2i64, i128mem, SSE_INTMUL_ITINS_P, 1>;
+ memopv2i64, i128mem, SSE_INTMUL_ITINS_P>;
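PMULUDQ multiplies only the low (even) unsigned 32-bit element of each 64-bit lane and produces a full 64-bit product, which is why the patterns pair a v2i64 destination with a v4i32 source type. Reference for one lane (name illustrative):

    /* Reference semantics of (v)pmuludq for one 64-bit lane: only the low
       32 bits of each source lane participate. */
    static unsigned long long pmuludq_lane_ref(unsigned long long a,
                                               unsigned long long b) {
      return (unsigned long long)(unsigned)a * (unsigned)b;
    }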
//===---------------------------------------------------------------------===//
// SSE2 - Packed Integer Logical Instructions
//===---------------------------------------------------------------------===//
-let Predicates = [HasAVX, NoVLX] in {
-defm VPSLLD : PDI_binop_rmi<0xF2, 0x72, MRM6r, "vpslld", X86vshl, X86vshli,
- VR128, v4i32, v4i32, loadv2i64,
- SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
-defm VPSLLQ : PDI_binop_rmi<0xF3, 0x73, MRM6r, "vpsllq", X86vshl, X86vshli,
- VR128, v2i64, v2i64, loadv2i64,
- SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
-
-defm VPSRLD : PDI_binop_rmi<0xD2, 0x72, MRM2r, "vpsrld", X86vsrl, X86vsrli,
- VR128, v4i32, v4i32, loadv2i64,
- SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
-defm VPSRLQ : PDI_binop_rmi<0xD3, 0x73, MRM2r, "vpsrlq", X86vsrl, X86vsrli,
- VR128, v2i64, v2i64, loadv2i64,
- SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
-
-defm VPSRAD : PDI_binop_rmi<0xE2, 0x72, MRM4r, "vpsrad", X86vsra, X86vsrai,
- VR128, v4i32, v4i32, loadv2i64,
- SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
-} // Predicates = [HasAVX, NoVLX]
+multiclass PDI_binop_rmi<bits<8> opc, bits<8> opc2, Format ImmForm,
+ string OpcodeStr, SDNode OpNode,
+ SDNode OpNode2, RegisterClass RC,
+ ValueType DstVT, ValueType SrcVT,
+ PatFrag ld_frag, bit Is2Addr = 1> {
+ // src2 is always 128-bit
+ def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, VR128:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst, (DstVT (OpNode RC:$src1, (SrcVT VR128:$src2))))],
+ SSE_INTSHIFT_ITINS_P.rr>, Sched<[WriteVecShift]>;
+ def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, i128mem:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst, (DstVT (OpNode RC:$src1,
+ (SrcVT (bitconvert (ld_frag addr:$src2))))))],
+ SSE_INTSHIFT_ITINS_P.rm>, Sched<[WriteVecShiftLd, ReadAfterLd]>;
+ def ri : PDIi8<opc2, ImmForm, (outs RC:$dst),
+ (ins RC:$src1, u8imm:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst, (DstVT (OpNode2 RC:$src1, (i8 imm:$src2))))],
+ SSE_INTSHIFT_ITINS_P.ri>, Sched<[WriteVecShift]>;
+}
-let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
-defm VPSLLW : PDI_binop_rmi<0xF1, 0x71, MRM6r, "vpsllw", X86vshl, X86vshli,
- VR128, v8i16, v8i16, loadv2i64,
- SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
-defm VPSRLW : PDI_binop_rmi<0xD1, 0x71, MRM2r, "vpsrlw", X86vsrl, X86vsrli,
- VR128, v8i16, v8i16, loadv2i64,
- SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
-defm VPSRAW : PDI_binop_rmi<0xE1, 0x71, MRM4r, "vpsraw", X86vsra, X86vsrai,
- VR128, v8i16, v8i16, loadv2i64,
- SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
-} // Predicates = [HasAVX, NoVLX_Or_NoBWI]
-
-
-let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift] ,
- Predicates = [HasAVX, NoVLX_Or_NoBWI]in {
- // 128-bit logical shifts.
- def VPSLLDQri : PDIi8<0x73, MRM7r,
- (outs VR128:$dst), (ins VR128:$src1, u8imm:$src2),
- "vpslldq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- [(set VR128:$dst,
- (v16i8 (X86vshldq VR128:$src1, (i8 imm:$src2))))]>,
- VEX_4V;
- def VPSRLDQri : PDIi8<0x73, MRM3r,
- (outs VR128:$dst), (ins VR128:$src1, u8imm:$src2),
- "vpsrldq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- [(set VR128:$dst,
- (v16i8 (X86vshrdq VR128:$src1, (i8 imm:$src2))))]>,
- VEX_4V;
- // PSRADQri doesn't exist in SSE[1-3].
-} // Predicates = [HasAVX, NoVLX_Or_NoBWI]
+multiclass PDI_binop_rmi_all<bits<8> opc, bits<8> opc2, Format ImmForm,
+ string OpcodeStr, SDNode OpNode,
+ SDNode OpNode2, ValueType DstVT128,
+ ValueType DstVT256, ValueType SrcVT,
+ Predicate prd> {
+let Predicates = [HasAVX, prd] in
+ defm V#NAME : PDI_binop_rmi<opc, opc2, ImmForm, !strconcat("v", OpcodeStr),
+ OpNode, OpNode2, VR128, DstVT128, SrcVT,
+ loadv2i64, 0>, VEX_4V;
+let Predicates = [HasAVX2, prd] in
+ defm V#NAME#Y : PDI_binop_rmi<opc, opc2, ImmForm, !strconcat("v", OpcodeStr),
+ OpNode, OpNode2, VR256, DstVT256, SrcVT,
+ loadv2i64, 0>, VEX_4V, VEX_L;
+let Constraints = "$src1 = $dst" in
+ defm NAME : PDI_binop_rmi<opc, opc2, ImmForm, OpcodeStr, OpNode, OpNode2,
+ VR128, DstVT128, SrcVT, memopv2i64>;
+}
-let Predicates = [HasAVX2, NoVLX] in {
-defm VPSLLDY : PDI_binop_rmi<0xF2, 0x72, MRM6r, "vpslld", X86vshl, X86vshli,
- VR256, v8i32, v4i32, loadv2i64,
- SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
-defm VPSLLQY : PDI_binop_rmi<0xF3, 0x73, MRM6r, "vpsllq", X86vshl, X86vshli,
- VR256, v4i64, v2i64, loadv2i64,
- SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
-
-defm VPSRLDY : PDI_binop_rmi<0xD2, 0x72, MRM2r, "vpsrld", X86vsrl, X86vsrli,
- VR256, v8i32, v4i32, loadv2i64,
- SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
-defm VPSRLQY : PDI_binop_rmi<0xD3, 0x73, MRM2r, "vpsrlq", X86vsrl, X86vsrli,
- VR256, v4i64, v2i64, loadv2i64,
- SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
-
-defm VPSRADY : PDI_binop_rmi<0xE2, 0x72, MRM4r, "vpsrad", X86vsra, X86vsrai,
- VR256, v8i32, v4i32, loadv2i64,
- SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
-}// Predicates = [HasAVX2, NoVLX]
+multiclass PDI_binop_ri<bits<8> opc, Format ImmForm, string OpcodeStr,
+ SDNode OpNode, RegisterClass RC, ValueType VT,
+ bit Is2Addr = 1> {
+ def ri : PDIi8<opc, ImmForm, (outs RC:$dst), (ins RC:$src1, u8imm:$src2),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set RC:$dst, (VT (OpNode RC:$src1, (i8 imm:$src2))))],
+ IIC_SSE_INTSHDQ_P_RI>, Sched<[WriteVecShift]>;
+}
-let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
-defm VPSLLWY : PDI_binop_rmi<0xF1, 0x71, MRM6r, "vpsllw", X86vshl, X86vshli,
- VR256, v16i16, v8i16, loadv2i64,
- SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
-defm VPSRLWY : PDI_binop_rmi<0xD1, 0x71, MRM2r, "vpsrlw", X86vsrl, X86vsrli,
- VR256, v16i16, v8i16, loadv2i64,
- SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
-defm VPSRAWY : PDI_binop_rmi<0xE1, 0x71, MRM4r, "vpsraw", X86vsra, X86vsrai,
- VR256, v16i16, v8i16, loadv2i64,
- SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
-}// Predicates = [HasAVX2, NoVLX_Or_NoBWI]
-
-let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift], hasSideEffects = 0 ,
- Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
- // 256-bit logical shifts.
- def VPSLLDQYri : PDIi8<0x73, MRM7r,
- (outs VR256:$dst), (ins VR256:$src1, u8imm:$src2),
- "vpslldq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- [(set VR256:$dst,
- (v32i8 (X86vshldq VR256:$src1, (i8 imm:$src2))))]>,
- VEX_4V, VEX_L;
- def VPSRLDQYri : PDIi8<0x73, MRM3r,
- (outs VR256:$dst), (ins VR256:$src1, u8imm:$src2),
- "vpsrldq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- [(set VR256:$dst,
- (v32i8 (X86vshrdq VR256:$src1, (i8 imm:$src2))))]>,
- VEX_4V, VEX_L;
- // PSRADQYri doesn't exist in SSE[1-3].
-} // Predicates = [HasAVX2, NoVLX_Or_NoBWI]
+multiclass PDI_binop_ri_all<bits<8> opc, Format ImmForm, string OpcodeStr,
+ SDNode OpNode> {
+let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
+ defm V#NAME : PDI_binop_ri<opc, ImmForm, !strconcat("v", OpcodeStr), OpNode,
+ VR128, v16i8, 0>, VEX_4V;
+let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
+ defm V#NAME#Y : PDI_binop_ri<opc, ImmForm, !strconcat("v", OpcodeStr), OpNode,
+ VR256, v32i8, 0>, VEX_4V, VEX_L;
+let Constraints = "$src1 = $dst" in
+ defm NAME : PDI_binop_ri<opc, ImmForm, OpcodeStr, OpNode, VR128, v16i8>;
+}
-let Constraints = "$src1 = $dst" in {
-defm PSLLW : PDI_binop_rmi<0xF1, 0x71, MRM6r, "psllw", X86vshl, X86vshli,
- VR128, v8i16, v8i16, memopv2i64,
- SSE_INTSHIFT_ITINS_P>;
-defm PSLLD : PDI_binop_rmi<0xF2, 0x72, MRM6r, "pslld", X86vshl, X86vshli,
- VR128, v4i32, v4i32, memopv2i64,
- SSE_INTSHIFT_ITINS_P>;
-defm PSLLQ : PDI_binop_rmi<0xF3, 0x73, MRM6r, "psllq", X86vshl, X86vshli,
- VR128, v2i64, v2i64, memopv2i64,
- SSE_INTSHIFT_ITINS_P>;
-
-defm PSRLW : PDI_binop_rmi<0xD1, 0x71, MRM2r, "psrlw", X86vsrl, X86vsrli,
- VR128, v8i16, v8i16, memopv2i64,
- SSE_INTSHIFT_ITINS_P>;
-defm PSRLD : PDI_binop_rmi<0xD2, 0x72, MRM2r, "psrld", X86vsrl, X86vsrli,
- VR128, v4i32, v4i32, memopv2i64,
- SSE_INTSHIFT_ITINS_P>;
-defm PSRLQ : PDI_binop_rmi<0xD3, 0x73, MRM2r, "psrlq", X86vsrl, X86vsrli,
- VR128, v2i64, v2i64, memopv2i64,
- SSE_INTSHIFT_ITINS_P>;
-
-defm PSRAW : PDI_binop_rmi<0xE1, 0x71, MRM4r, "psraw", X86vsra, X86vsrai,
- VR128, v8i16, v8i16, memopv2i64,
- SSE_INTSHIFT_ITINS_P>;
-defm PSRAD : PDI_binop_rmi<0xE2, 0x72, MRM4r, "psrad", X86vsra, X86vsrai,
- VR128, v4i32, v4i32, memopv2i64,
- SSE_INTSHIFT_ITINS_P>;
-
-let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift], hasSideEffects = 0 in {
- // 128-bit logical shifts.
- def PSLLDQri : PDIi8<0x73, MRM7r,
- (outs VR128:$dst), (ins VR128:$src1, u8imm:$src2),
- "pslldq\t{$src2, $dst|$dst, $src2}",
- [(set VR128:$dst,
- (v16i8 (X86vshldq VR128:$src1, (i8 imm:$src2))))],
- IIC_SSE_INTSHDQ_P_RI>;
- def PSRLDQri : PDIi8<0x73, MRM3r,
- (outs VR128:$dst), (ins VR128:$src1, u8imm:$src2),
- "psrldq\t{$src2, $dst|$dst, $src2}",
- [(set VR128:$dst,
- (v16i8 (X86vshrdq VR128:$src1, (i8 imm:$src2))))],
- IIC_SSE_INTSHDQ_P_RI>;
+let ExeDomain = SSEPackedInt in {
+ defm PSLLW : PDI_binop_rmi_all<0xF1, 0x71, MRM6r, "psllw", X86vshl, X86vshli,
+ v8i16, v16i16, v8i16, NoVLX_Or_NoBWI>;
+ defm PSLLD : PDI_binop_rmi_all<0xF2, 0x72, MRM6r, "pslld", X86vshl, X86vshli,
+ v4i32, v8i32, v4i32, NoVLX>;
+ defm PSLLQ : PDI_binop_rmi_all<0xF3, 0x73, MRM6r, "psllq", X86vshl, X86vshli,
+ v2i64, v4i64, v2i64, NoVLX>;
+
+ defm PSRLW : PDI_binop_rmi_all<0xD1, 0x71, MRM2r, "psrlw", X86vsrl, X86vsrli,
+ v8i16, v16i16, v8i16, NoVLX_Or_NoBWI>;
+ defm PSRLD : PDI_binop_rmi_all<0xD2, 0x72, MRM2r, "psrld", X86vsrl, X86vsrli,
+ v4i32, v8i32, v4i32, NoVLX>;
+ defm PSRLQ : PDI_binop_rmi_all<0xD3, 0x73, MRM2r, "psrlq", X86vsrl, X86vsrli,
+ v2i64, v4i64, v2i64, NoVLX>;
+
+ defm PSRAW : PDI_binop_rmi_all<0xE1, 0x71, MRM4r, "psraw", X86vsra, X86vsrai,
+ v8i16, v16i16, v8i16, NoVLX_Or_NoBWI>;
+ defm PSRAD : PDI_binop_rmi_all<0xE2, 0x72, MRM4r, "psrad", X86vsra, X86vsrai,
+ v4i32, v8i32, v4i32, NoVLX>;
+
+ defm PSLLDQ : PDI_binop_ri_all<0x73, MRM7r, "pslldq", X86vshldq>;
+ defm PSRLDQ : PDI_binop_ri_all<0x73, MRM3r, "psrldq", X86vshrdq>;
// PSRADQri doesn't exist in SSE[1-3].
-}
-} // Constraints = "$src1 = $dst"
+} // ExeDomain = SSEPackedInt
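As the "src2 is always 128-bit" comment in PDI_binop_rmi notes, the register forms of these shifts take their count from an XMM register even for the 256-bit variants, the whole low 64 bits of that register are used as the count, and logical shifts by a count at or above the element width clear the lane. A reference sketch for psrld (helper name illustrative):

    /* Reference semantics of psrld xmm, xmm: one shared count taken from the
       low 64 bits of src2; oversized counts clear every lane. */
    static void psrld_ref(unsigned lanes[4], unsigned long long count) {
      for (int i = 0; i < 4; ++i)
        lanes[i] = count >= 32 ? 0u : lanes[i] >> count;
    }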
//===---------------------------------------------------------------------===//
// SSE2 - Packed Integer Comparison Instructions
@@ -4651,6 +4621,7 @@ def MASKMOVDQU64 : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
//===---------------------------------------------------------------------===//
// Move Int Doubleword to Packed Double Int
//
+let ExeDomain = SSEPackedInt in {
def VMOVDI2PDIrr : VS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
"movd\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
@@ -4701,11 +4672,12 @@ def MOV64toSDrr : RS2I<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
"mov{d|q}\t{$src, $dst|$dst, $src}",
[(set FR64:$dst, (bitconvert GR64:$src))],
IIC_SSE_MOVDQ>, Sched<[WriteMove]>;
+} // ExeDomain = SSEPackedInt
//===---------------------------------------------------------------------===//
// Move Int Doubleword to Single Scalar
//
-let isCodeGenOnly = 1 in {
+let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
def VMOVDI2SSrr : VS2I<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
"movd\t{$src, $dst|$dst, $src}",
[(set FR32:$dst, (bitconvert GR32:$src))],
@@ -4725,11 +4697,12 @@ let isCodeGenOnly = 1 in {
"movd\t{$src, $dst|$dst, $src}",
[(set FR32:$dst, (bitconvert (loadi32 addr:$src)))],
IIC_SSE_MOVDQ>, Sched<[WriteLoad]>;
-}
+} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1
//===---------------------------------------------------------------------===//
// Move Packed Doubleword Int to Packed Double Int
//
+let ExeDomain = SSEPackedInt in {
def VMOVPDI2DIrr : VS2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src),
"movd\t{$src, $dst|$dst, $src}",
[(set GR32:$dst, (extractelt (v4i32 VR128:$src),
@@ -4751,6 +4724,7 @@ def MOVPDI2DImr : S2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR128:$src),
[(store (i32 (extractelt (v4i32 VR128:$src),
(iPTR 0))), addr:$dst)],
IIC_SSE_MOVDQ>, Sched<[WriteStore]>;
+} // ExeDomain = SSEPackedInt
def : Pat<(v8i32 (X86Vinsert (v8i32 immAllZerosV), GR32:$src2, (iPTR 0))),
(SUBREG_TO_REG (i32 0), (VMOVDI2PDIrr GR32:$src2), sub_xmm)>;
@@ -4767,6 +4741,7 @@ def : Pat<(v4i64 (X86Vinsert undef, GR64:$src2, (iPTR 0))),
//===---------------------------------------------------------------------===//
// Move Packed Doubleword Int first element to Doubleword Int
//
+let ExeDomain = SSEPackedInt in {
let SchedRW = [WriteMove] in {
def VMOVPQIto64rr : VRS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
"movq\t{$src, $dst|$dst, $src}",
@@ -4791,11 +4766,12 @@ let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in
def MOVPQIto64rm : RS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
"mov{d|q}\t{$src, $dst|$dst, $src}",
[], IIC_SSE_MOVDQ>, Sched<[WriteStore]>;
+} // ExeDomain = SSEPackedInt
//===---------------------------------------------------------------------===//
// Bitcast FR64 <-> GR64
//
-let isCodeGenOnly = 1 in {
+let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
let Predicates = [UseAVX] in
def VMOV64toSDrm : VS2SI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src),
"movq\t{$src, $dst|$dst, $src}",
@@ -4822,12 +4798,12 @@ let isCodeGenOnly = 1 in {
"movq\t{$src, $dst|$dst, $src}",
[(store (i64 (bitconvert FR64:$src)), addr:$dst)],
IIC_SSE_MOVDQ>, Sched<[WriteStore]>;
-}
+} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1
//===---------------------------------------------------------------------===//
// Move Scalar Single to Double Int
//
-let isCodeGenOnly = 1 in {
+let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
def VMOVSS2DIrr : VS2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src),
"movd\t{$src, $dst|$dst, $src}",
[(set GR32:$dst, (bitconvert FR32:$src))],
@@ -4844,7 +4820,7 @@ let isCodeGenOnly = 1 in {
"movd\t{$src, $dst|$dst, $src}",
[(store (i32 (bitconvert FR32:$src)), addr:$dst)],
IIC_SSE_MOVDQ>, Sched<[WriteStore]>;
-}
+} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1
let Predicates = [UseAVX] in {
let AddedComplexity = 15 in {
@@ -4867,9 +4843,13 @@ let Predicates = [UseAVX] in {
(VMOVDI2PDIrm addr:$src)>;
def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))),
(VMOVDI2PDIrm addr:$src)>;
+ def : Pat<(v4i32 (X86vzload addr:$src)),
+ (VMOVDI2PDIrm addr:$src)>;
def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
(v4i32 (scalar_to_vector (loadi32 addr:$src))), (iPTR 0)))),
(SUBREG_TO_REG (i32 0), (VMOVDI2PDIrm addr:$src), sub_xmm)>;
+ def : Pat<(v8i32 (X86vzload addr:$src)),
+ (SUBREG_TO_REG (i64 0), (VMOVDI2PDIrm addr:$src), sub_xmm)>;
}
// Use regular 128-bit instructions to match 256-bit scalar_to_vec+zext.
def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
@@ -4892,6 +4872,8 @@ let Predicates = [UseSSE2] in {
(MOVDI2PDIrm addr:$src)>;
def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))),
(MOVDI2PDIrm addr:$src)>;
+ def : Pat<(v4i32 (X86vzload addr:$src)),
+ (MOVDI2PDIrm addr:$src)>;
}
}
@@ -4960,43 +4942,30 @@ def MOVPQI2QIrr : S2I<0xD6, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
def : InstAlias<"vmovq\t{$src, $dst|$dst, $src}",
(VMOVPQI2QIrr VR128L:$dst, VR128H:$src), 0>;
-//===---------------------------------------------------------------------===//
-// Store / copy lower 64-bits of a XMM register.
-//
-let ExeDomain = SSEPackedInt, isCodeGenOnly = 1, AddedComplexity = 20 in {
-def VMOVZQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
- "vmovq\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst,
- (v2i64 (X86vzmovl (v2i64 (scalar_to_vector
- (loadi64 addr:$src))))))],
- IIC_SSE_MOVDQ>,
- XS, VEX, Requires<[UseAVX]>, Sched<[WriteLoad]>;
-
-def MOVZQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
- "movq\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst,
- (v2i64 (X86vzmovl (v2i64 (scalar_to_vector
- (loadi64 addr:$src))))))],
- IIC_SSE_MOVDQ>,
- XS, Requires<[UseSSE2]>, Sched<[WriteLoad]>;
-} // ExeDomain, isCodeGenOnly, AddedComplexity
-
let Predicates = [UseAVX], AddedComplexity = 20 in {
+ def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector (loadi64 addr:$src))))),
+ (VMOVQI2PQIrm addr:$src)>;
+ def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))),
+ (VMOVQI2PQIrm addr:$src)>;
def : Pat<(v2i64 (X86vzmovl (bc_v2i64 (loadv4f32 addr:$src)))),
- (VMOVZQI2PQIrm addr:$src)>;
+ (VMOVQI2PQIrm addr:$src)>;
def : Pat<(v2i64 (X86vzload addr:$src)),
- (VMOVZQI2PQIrm addr:$src)>;
+ (VMOVQI2PQIrm addr:$src)>;
def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
(v2i64 (scalar_to_vector (loadi64 addr:$src))), (iPTR 0)))),
- (SUBREG_TO_REG (i64 0), (VMOVZQI2PQIrm addr:$src), sub_xmm)>;
+ (SUBREG_TO_REG (i64 0), (VMOVQI2PQIrm addr:$src), sub_xmm)>;
def : Pat<(v4i64 (X86vzload addr:$src)),
- (SUBREG_TO_REG (i64 0), (VMOVZQI2PQIrm addr:$src), sub_xmm)>;
+ (SUBREG_TO_REG (i64 0), (VMOVQI2PQIrm addr:$src), sub_xmm)>;
}
let Predicates = [UseSSE2], AddedComplexity = 20 in {
+ def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector (loadi64 addr:$src))))),
+ (MOVQI2PQIrm addr:$src)>;
+ def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))),
+ (MOVQI2PQIrm addr:$src)>;
def : Pat<(v2i64 (X86vzmovl (bc_v2i64 (loadv4f32 addr:$src)))),
- (MOVZQI2PQIrm addr:$src)>;
- def : Pat<(v2i64 (X86vzload addr:$src)), (MOVZQI2PQIrm addr:$src)>;
+ (MOVQI2PQIrm addr:$src)>;
+ def : Pat<(v2i64 (X86vzload addr:$src)), (MOVQI2PQIrm addr:$src)>;
}
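The X86vzmovl/X86vzload patterns above now select the plain (V)MOVQI2PQIrm load, whose semantics are exactly a zero-extending 64-bit load into lane 0, so the separate MOVZQI2PQIrm definitions removed here were redundant. Reference sketch (name illustrative):

    /* Reference semantics of movq xmm, m64: load 64 bits into lane 0 and
       zero the upper lane. */
    static void movq_load_ref(const unsigned long long *src,
                              unsigned long long dst[2]) {
      dst[0] = *src;
      dst[1] = 0;
    }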
//===---------------------------------------------------------------------===//
@@ -5018,24 +4987,6 @@ def MOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
XS, Requires<[UseSSE2]>;
} // ExeDomain, SchedRW
-let ExeDomain = SSEPackedInt, isCodeGenOnly = 1, SchedRW = [WriteVecLogicLd] in {
-let AddedComplexity = 20 in
-def VMOVZPQILo2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
- "vmovq\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (v2i64 (X86vzmovl
- (loadv2i64 addr:$src))))],
- IIC_SSE_MOVDQ>,
- XS, VEX, Requires<[UseAVX]>;
-let AddedComplexity = 20 in {
-def MOVZPQILo2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
- "movq\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (v2i64 (X86vzmovl
- (loadv2i64 addr:$src))))],
- IIC_SSE_MOVDQ>,
- XS, Requires<[UseSSE2]>;
-}
-} // ExeDomain, isCodeGenOnly, SchedRW
-
let AddedComplexity = 20 in {
let Predicates = [UseAVX] in {
def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
@@ -5167,12 +5118,12 @@ let Predicates = [HasAVX] in {
(VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
}
-let Predicates = [UseAVX, OptForSize] in {
- def : Pat<(v2f64 (X86VBroadcast (loadf64 addr:$src))),
- (VMOVDDUPrm addr:$src)>;
- def : Pat<(v2i64 (X86VBroadcast (loadi64 addr:$src))),
- (VMOVDDUPrm addr:$src)>;
-}
+let Predicates = [HasAVX, NoVLX] in
+def : Pat<(v2f64 (X86VBroadcast (loadf64 addr:$src))),
+ (VMOVDDUPrm addr:$src)>;
+let Predicates = [HasAVX1Only] in
+def : Pat<(v2i64 (X86VBroadcast (loadi64 addr:$src))),
+ (VMOVDDUPrm addr:$src)>;
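VMOVDDUPrm loads 64 bits and duplicates them into both halves of the XMM register, which is why a 64-bit broadcast from memory can be selected to it as above. Reference sketch (name illustrative):

    /* Reference semantics of movddup xmm, m64: one load, both lanes. */
    static void movddup_ref(const double *src, double dst[2]) {
      dst[0] = dst[1] = *src;
    }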
let Predicates = [UseSSE3] in {
def : Pat<(X86Movddup (memopv2f64 addr:$src)),
@@ -5370,35 +5321,35 @@ let Constraints = "$src1 = $dst" in {
/// SS3I_unop_rm_int - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}.
multiclass SS3I_unop_rm<bits<8> opc, string OpcodeStr, ValueType vt,
SDNode OpNode, PatFrag ld_frag> {
- def rr128 : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
- (ins VR128:$src),
- !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set VR128:$dst, (vt (OpNode VR128:$src)))],
- IIC_SSE_PABS_RR>, Sched<[WriteVecALU]>;
+ def rr : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (vt (OpNode VR128:$src)))],
+ IIC_SSE_PABS_RR>, Sched<[WriteVecALU]>;
- def rm128 : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
- (ins i128mem:$src),
- !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set VR128:$dst,
- (vt (OpNode (bitconvert (ld_frag addr:$src)))))],
- IIC_SSE_PABS_RM>, Sched<[WriteVecALULd]>;
+ def rm : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins i128mem:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst,
+ (vt (OpNode (bitconvert (ld_frag addr:$src)))))],
+ IIC_SSE_PABS_RM>, Sched<[WriteVecALULd]>;
}
/// SS3I_unop_rm_int_y - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}.
multiclass SS3I_unop_rm_y<bits<8> opc, string OpcodeStr, ValueType vt,
SDNode OpNode> {
- def rr256 : SS38I<opc, MRMSrcReg, (outs VR256:$dst),
- (ins VR256:$src),
- !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set VR256:$dst, (vt (OpNode VR256:$src)))]>,
- Sched<[WriteVecALU]>;
+ def Yrr : SS38I<opc, MRMSrcReg, (outs VR256:$dst),
+ (ins VR256:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR256:$dst, (vt (OpNode VR256:$src)))]>,
+ Sched<[WriteVecALU]>;
- def rm256 : SS38I<opc, MRMSrcMem, (outs VR256:$dst),
- (ins i256mem:$src),
- !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set VR256:$dst,
- (vt (OpNode (bitconvert (loadv4i64 addr:$src)))))]>,
- Sched<[WriteVecALULd]>;
+ def Yrm : SS38I<opc, MRMSrcMem, (outs VR256:$dst),
+ (ins i256mem:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR256:$dst,
+ (vt (OpNode (bitconvert (loadv4i64 addr:$src)))))]>,
+ Sched<[WriteVecALULd]>;
}
// Helper fragments to match sext vXi1 to vXiY.
@@ -5419,19 +5370,21 @@ let Predicates = [HasAVX, NoVLX] in {
defm VPABSD : SS3I_unop_rm<0x1E, "vpabsd", v4i32, X86Abs, loadv2i64>, VEX;
}
-let Predicates = [HasAVX] in {
+let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
def : Pat<(xor
(bc_v2i64 (v16i1sextv16i8)),
(bc_v2i64 (add (v16i8 VR128:$src), (v16i1sextv16i8)))),
- (VPABSBrr128 VR128:$src)>;
+ (VPABSBrr VR128:$src)>;
def : Pat<(xor
(bc_v2i64 (v8i1sextv8i16)),
(bc_v2i64 (add (v8i16 VR128:$src), (v8i1sextv8i16)))),
- (VPABSWrr128 VR128:$src)>;
+ (VPABSWrr VR128:$src)>;
+}
+let Predicates = [HasAVX, NoVLX] in {
def : Pat<(xor
(bc_v2i64 (v4i1sextv4i32)),
(bc_v2i64 (add (v4i32 VR128:$src), (v4i1sextv4i32)))),
- (VPABSDrr128 VR128:$src)>;
+ (VPABSDrr VR128:$src)>;
}
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
@@ -5442,19 +5395,21 @@ let Predicates = [HasAVX2, NoVLX] in {
defm VPABSD : SS3I_unop_rm_y<0x1E, "vpabsd", v8i32, X86Abs>, VEX, VEX_L;
}
-let Predicates = [HasAVX2] in {
+let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
def : Pat<(xor
(bc_v4i64 (v32i1sextv32i8)),
(bc_v4i64 (add (v32i8 VR256:$src), (v32i1sextv32i8)))),
- (VPABSBrr256 VR256:$src)>;
+ (VPABSBYrr VR256:$src)>;
def : Pat<(xor
(bc_v4i64 (v16i1sextv16i16)),
(bc_v4i64 (add (v16i16 VR256:$src), (v16i1sextv16i16)))),
- (VPABSWrr256 VR256:$src)>;
+ (VPABSWYrr VR256:$src)>;
+}
+let Predicates = [HasAVX2, NoVLX] in {
def : Pat<(xor
(bc_v4i64 (v8i1sextv8i32)),
(bc_v4i64 (add (v8i32 VR256:$src), (v8i1sextv8i32)))),
- (VPABSDrr256 VR256:$src)>;
+ (VPABSDYrr VR256:$src)>;
}
defm PABSB : SS3I_unop_rm<0x1C, "pabsb", v16i8, X86Abs, memopv2i64>;
@@ -5465,15 +5420,15 @@ let Predicates = [UseSSSE3] in {
def : Pat<(xor
(bc_v2i64 (v16i1sextv16i8)),
(bc_v2i64 (add (v16i8 VR128:$src), (v16i1sextv16i8)))),
- (PABSBrr128 VR128:$src)>;
+ (PABSBrr VR128:$src)>;
def : Pat<(xor
(bc_v2i64 (v8i1sextv8i16)),
(bc_v2i64 (add (v8i16 VR128:$src), (v8i1sextv8i16)))),
- (PABSWrr128 VR128:$src)>;
+ (PABSWrr VR128:$src)>;
def : Pat<(xor
(bc_v2i64 (v4i1sextv4i32)),
(bc_v2i64 (add (v4i32 VR128:$src), (v4i1sextv4i32)))),
- (PABSDrr128 VR128:$src)>;
+ (PABSDrr VR128:$src)>;
}
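The PABS patterns above match the standard branch-free absolute-value idiom: with s the all-ones or all-zeros sign mask produced by an arithmetic shift, |x| == (x + s) ^ s, which is exactly the xor-of-add shape in the patterns. A scalar sketch for 16-bit elements (helper name is illustrative; an arithmetic right shift on signed types is assumed):

    /* The idiom matched by the pabs patterns: s is 0 or -1 depending on the
       sign of x, and (x + s) ^ s equals |x| (the INT16_MIN case wraps, which
       matches the instruction). */
    static short abs16_ref(short x) {
      short s = (short)(x >> 15);   /* arithmetic shift assumed */
      return (short)((x + s) ^ s);
    }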
//===---------------------------------------------------------------------===//
@@ -5506,16 +5461,16 @@ def SSE_PMULHRSW : OpndItins<
/// SS3I_binop_rm - Simple SSSE3 bin op
multiclass SS3I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
- ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
- X86MemOperand x86memop, OpndItins itins,
- bit Is2Addr = 1> {
+ ValueType DstVT, ValueType OpVT, RegisterClass RC,
+ PatFrag memop_frag, X86MemOperand x86memop,
+ OpndItins itins, bit Is2Addr = 1> {
let isCommutable = 1 in
def rr : SS38I<opc, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, RC:$src2),
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))], itins.rr>,
+ [(set RC:$dst, (DstVT (OpNode (OpVT RC:$src1), RC:$src2)))], itins.rr>,
Sched<[itins.Sched]>;
def rm : SS38I<opc, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, x86memop:$src2),
@@ -5523,7 +5478,7 @@ multiclass SS3I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
[(set RC:$dst,
- (OpVT (OpNode RC:$src1,
+ (DstVT (OpNode (OpVT RC:$src1),
(bitconvert (memop_frag addr:$src2)))))], itins.rm>,
Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
@@ -5568,18 +5523,32 @@ multiclass SS3I_binop_rm_int_y<bits<8> opc, string OpcodeStr,
Sched<[Sched.Folded, ReadAfterLd]>;
}
+let ImmT = NoImm, Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
+let isCommutable = 0 in {
+ defm VPSHUFB : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v16i8, v16i8,
+ VR128, loadv2i64, i128mem,
+ SSE_PSHUFB, 0>, VEX_4V;
+ defm VPMADDUBSW : SS3I_binop_rm<0x04, "vpmaddubsw", X86vpmaddubsw, v8i16,
+ v16i8, VR128, loadv2i64, i128mem,
+ SSE_PMADD, 0>, VEX_4V;
+}
+defm VPMULHRSW : SS3I_binop_rm<0x0B, "vpmulhrsw", X86mulhrs, v8i16, v8i16,
+ VR128, loadv2i64, i128mem,
+ SSE_PMULHRSW, 0>, VEX_4V;
+}
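The three SSSE3 operations grouped above can be summarized with scalar reference sketches (helper names are illustrative): pshufb picks or zeroes bytes, pmaddubsw is an unsigned-by-signed byte multiply with a pairwise saturating add into 16 bits, and pmulhrsw is a rounded high multiply, ((a*b >> 14) + 1) >> 1, truncated to 16 bits.

    /* pshufb, per result byte, within one 128-bit lane. */
    static unsigned char pshufb_byte_ref(const unsigned char src[16],
                                         unsigned char sel) {
      return (sel & 0x80) ? 0 : src[sel & 0x0f];
    }

    /* pmaddubsw: unsigned bytes of the first operand times signed bytes of
       the second, adjacent pairs added with signed 16-bit saturation. */
    static short pmaddubsw_pair_ref(unsigned char a0, unsigned char a1,
                                    signed char b0, signed char b1) {
      int sum = (int)a0 * b0 + (int)a1 * b1;
      if (sum > 32767) sum = 32767;
      if (sum < -32768) sum = -32768;
      return (short)sum;
    }

    /* pmulhrsw: rounded Q15-style high multiply, low 16 bits kept. */
    static short pmulhrsw_ref(short a, short b) {
      return (short)(((((int)a * b) >> 14) + 1) >> 1);
    }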
+
let ImmT = NoImm, Predicates = [HasAVX] in {
let isCommutable = 0 in {
- defm VPHADDW : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v8i16, VR128,
+ defm VPHADDW : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v8i16, v8i16, VR128,
loadv2i64, i128mem,
SSE_PHADDSUBW, 0>, VEX_4V;
- defm VPHADDD : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v4i32, VR128,
+ defm VPHADDD : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v4i32, v4i32, VR128,
loadv2i64, i128mem,
SSE_PHADDSUBD, 0>, VEX_4V;
- defm VPHSUBW : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v8i16, VR128,
+ defm VPHSUBW : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v8i16, v8i16, VR128,
loadv2i64, i128mem,
SSE_PHADDSUBW, 0>, VEX_4V;
- defm VPHSUBD : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v4i32, VR128,
+ defm VPHSUBD : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v4i32, v4i32, VR128,
loadv2i64, i128mem,
SSE_PHADDSUBD, 0>, VEX_4V;
defm VPSIGNB : SS3I_binop_rm_int<0x08, "vpsignb",
@@ -5591,36 +5560,41 @@ let isCommutable = 0 in {
defm VPSIGND : SS3I_binop_rm_int<0x0A, "vpsignd",
int_x86_ssse3_psign_d_128,
SSE_PSIGN, loadv2i64, 0>, VEX_4V;
- defm VPSHUFB : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v16i8, VR128,
- loadv2i64, i128mem,
- SSE_PSHUFB, 0>, VEX_4V;
defm VPHADDSW : SS3I_binop_rm_int<0x03, "vphaddsw",
int_x86_ssse3_phadd_sw_128,
SSE_PHADDSUBSW, loadv2i64, 0>, VEX_4V;
defm VPHSUBSW : SS3I_binop_rm_int<0x07, "vphsubsw",
int_x86_ssse3_phsub_sw_128,
SSE_PHADDSUBSW, loadv2i64, 0>, VEX_4V;
- defm VPMADDUBSW : SS3I_binop_rm_int<0x04, "vpmaddubsw",
- int_x86_ssse3_pmadd_ub_sw_128,
- SSE_PMADD, loadv2i64, 0>, VEX_4V;
}
-defm VPMULHRSW : SS3I_binop_rm_int<0x0B, "vpmulhrsw",
- int_x86_ssse3_pmul_hr_sw_128,
- SSE_PMULHRSW, loadv2i64, 0>, VEX_4V;
+}
+
+let ImmT = NoImm, Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
+let isCommutable = 0 in {
+ defm VPSHUFBY : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v32i8, v32i8,
+ VR256, loadv4i64, i256mem,
+ SSE_PSHUFB, 0>, VEX_4V, VEX_L;
+ defm VPMADDUBSWY : SS3I_binop_rm<0x04, "vpmaddubsw", X86vpmaddubsw, v16i16,
+ v32i8, VR256, loadv4i64, i256mem,
+ SSE_PMADD, 0>, VEX_4V, VEX_L;
+}
+defm VPMULHRSWY : SS3I_binop_rm<0x0B, "vpmulhrsw", X86mulhrs, v16i16, v16i16,
+ VR256, loadv4i64, i256mem,
+ SSE_PMULHRSW, 0>, VEX_4V, VEX_L;
}
let ImmT = NoImm, Predicates = [HasAVX2] in {
let isCommutable = 0 in {
- defm VPHADDWY : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v16i16, VR256,
- loadv4i64, i256mem,
+ defm VPHADDWY : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v16i16, v16i16,
+ VR256, loadv4i64, i256mem,
SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
- defm VPHADDDY : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v8i32, VR256,
+ defm VPHADDDY : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v8i32, v8i32, VR256,
loadv4i64, i256mem,
SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
- defm VPHSUBWY : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v16i16, VR256,
- loadv4i64, i256mem,
+ defm VPHSUBWY : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v16i16, v16i16,
+ VR256, loadv4i64, i256mem,
SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
- defm VPHSUBDY : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v8i32, VR256,
+ defm VPHSUBDY : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v8i32, v8i32, VR256,
loadv4i64, i256mem,
SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
defm VPSIGNBY : SS3I_binop_rm_int_y<0x08, "vpsignb", int_x86_avx2_psign_b,
@@ -5629,34 +5603,25 @@ let isCommutable = 0 in {
WriteVecALU>, VEX_4V, VEX_L;
defm VPSIGNDY : SS3I_binop_rm_int_y<0x0A, "vpsignd", int_x86_avx2_psign_d,
WriteVecALU>, VEX_4V, VEX_L;
- defm VPSHUFBY : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v32i8, VR256,
- loadv4i64, i256mem,
- SSE_PSHUFB, 0>, VEX_4V, VEX_L;
defm VPHADDSW : SS3I_binop_rm_int_y<0x03, "vphaddsw",
int_x86_avx2_phadd_sw,
WriteVecALU>, VEX_4V, VEX_L;
defm VPHSUBSW : SS3I_binop_rm_int_y<0x07, "vphsubsw",
int_x86_avx2_phsub_sw,
WriteVecALU>, VEX_4V, VEX_L;
- defm VPMADDUBSW : SS3I_binop_rm_int_y<0x04, "vpmaddubsw",
- int_x86_avx2_pmadd_ub_sw,
- WriteVecIMul>, VEX_4V, VEX_L;
}
-defm VPMULHRSW : SS3I_binop_rm_int_y<0x0B, "vpmulhrsw",
- int_x86_avx2_pmul_hr_sw,
- WriteVecIMul>, VEX_4V, VEX_L;
}
// None of these have i8 immediate fields.
let ImmT = NoImm, Constraints = "$src1 = $dst" in {
let isCommutable = 0 in {
- defm PHADDW : SS3I_binop_rm<0x01, "phaddw", X86hadd, v8i16, VR128,
+ defm PHADDW : SS3I_binop_rm<0x01, "phaddw", X86hadd, v8i16, v8i16, VR128,
memopv2i64, i128mem, SSE_PHADDSUBW>;
- defm PHADDD : SS3I_binop_rm<0x02, "phaddd", X86hadd, v4i32, VR128,
+ defm PHADDD : SS3I_binop_rm<0x02, "phaddd", X86hadd, v4i32, v4i32, VR128,
memopv2i64, i128mem, SSE_PHADDSUBD>;
- defm PHSUBW : SS3I_binop_rm<0x05, "phsubw", X86hsub, v8i16, VR128,
+ defm PHSUBW : SS3I_binop_rm<0x05, "phsubw", X86hsub, v8i16, v8i16, VR128,
memopv2i64, i128mem, SSE_PHADDSUBW>;
- defm PHSUBD : SS3I_binop_rm<0x06, "phsubd", X86hsub, v4i32, VR128,
+ defm PHSUBD : SS3I_binop_rm<0x06, "phsubd", X86hsub, v4i32, v4i32, VR128,
memopv2i64, i128mem, SSE_PHADDSUBD>;
defm PSIGNB : SS3I_binop_rm_int<0x08, "psignb", int_x86_ssse3_psign_b_128,
SSE_PSIGN, memopv2i64>;
@@ -5664,7 +5629,7 @@ let isCommutable = 0 in {
SSE_PSIGN, memopv2i64>;
defm PSIGND : SS3I_binop_rm_int<0x0A, "psignd", int_x86_ssse3_psign_d_128,
SSE_PSIGN, memopv2i64>;
- defm PSHUFB : SS3I_binop_rm<0x00, "pshufb", X86pshufb, v16i8, VR128,
+ defm PSHUFB : SS3I_binop_rm<0x00, "pshufb", X86pshufb, v16i8, v16i8, VR128,
memopv2i64, i128mem, SSE_PSHUFB>;
defm PHADDSW : SS3I_binop_rm_int<0x03, "phaddsw",
int_x86_ssse3_phadd_sw_128,
@@ -5672,13 +5637,12 @@ let isCommutable = 0 in {
defm PHSUBSW : SS3I_binop_rm_int<0x07, "phsubsw",
int_x86_ssse3_phsub_sw_128,
SSE_PHADDSUBSW, memopv2i64>;
- defm PMADDUBSW : SS3I_binop_rm_int<0x04, "pmaddubsw",
- int_x86_ssse3_pmadd_ub_sw_128,
- SSE_PMADD, memopv2i64>;
+ defm PMADDUBSW : SS3I_binop_rm<0x04, "pmaddubsw", X86vpmaddubsw, v8i16,
+ v16i8, VR128, memopv2i64, i128mem,
+ SSE_PMADD>;
}
-defm PMULHRSW : SS3I_binop_rm_int<0x0B, "pmulhrsw",
- int_x86_ssse3_pmul_hr_sw_128,
- SSE_PMULHRSW, memopv2i64>;
+defm PMULHRSW : SS3I_binop_rm<0x0B, "pmulhrsw", X86mulhrs, v8i16, v8i16,
+ VR128, memopv2i64, i128mem, SSE_PMULHRSW>;
}
//===---------------------------------------------------------------------===//
@@ -5895,8 +5859,6 @@ multiclass SS41I_pmovx_avx2_patterns<string OpcPrefix, string ExtTy, SDNode ExtO
(!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
def : Pat<(v16i16 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
(!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
- def : Pat<(v16i16 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
- (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
}
let Predicates = [HasAVX, NoVLX] in {
def : Pat<(v8i32 (ExtOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
@@ -5923,8 +5885,6 @@ multiclass SS41I_pmovx_avx2_patterns<string OpcPrefix, string ExtTy, SDNode ExtO
(!cast<I>(OpcPrefix#WDYrm) addr:$src)>;
def : Pat<(v8i32 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))),
(!cast<I>(OpcPrefix#WDYrm) addr:$src)>;
- def : Pat<(v8i32 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))),
- (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;
def : Pat<(v4i64 (ExtOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
(!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
@@ -5941,8 +5901,6 @@ multiclass SS41I_pmovx_avx2_patterns<string OpcPrefix, string ExtTy, SDNode ExtO
(!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
def : Pat<(v4i64 (ExtOp (v4i32 (vzload_v2i64 addr:$src)))),
(!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
- def : Pat<(v4i64 (ExtOp (bc_v4i32 (loadv2i64 addr:$src)))),
- (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
}
}
@@ -6342,10 +6300,10 @@ let Predicates = [UseAVX] in {
// SSE4.1 - Round Instructions
//===----------------------------------------------------------------------===//
-multiclass sse41_fp_unop_rm<bits<8> opcps, bits<8> opcpd, string OpcodeStr,
- X86MemOperand x86memop, RegisterClass RC,
- PatFrag mem_frag32, PatFrag mem_frag64,
- Intrinsic V4F32Int, Intrinsic V2F64Int> {
+multiclass sse41_fp_unop_p<bits<8> opcps, bits<8> opcpd, string OpcodeStr,
+ X86MemOperand x86memop, RegisterClass RC,
+ PatFrag mem_frag32, PatFrag mem_frag64,
+ Intrinsic V4F32Int, Intrinsic V2F64Int> {
let ExeDomain = SSEPackedSingle in {
// Intrinsic operation, reg.
// Vector intrinsic operation, reg
@@ -6386,24 +6344,73 @@ let ExeDomain = SSEPackedDouble in {
} // ExeDomain = SSEPackedDouble
}
-multiclass sse41_fp_binop_rm<bits<8> opcss, bits<8> opcsd,
- string OpcodeStr,
- Intrinsic F32Int,
- Intrinsic F64Int, bit Is2Addr = 1> {
-let ExeDomain = GenericDomain in {
- // Operation, reg.
- let hasSideEffects = 0 in
+multiclass avx_fp_unop_rm<bits<8> opcss, bits<8> opcsd,
+ string OpcodeStr> {
+let ExeDomain = GenericDomain, hasSideEffects = 0 in {
def SSr : SS4AIi8<opcss, MRMSrcReg,
- (outs FR32:$dst), (ins FR32:$src1, FR32:$src2, i32u8imm:$src3),
- !if(Is2Addr,
- !strconcat(OpcodeStr,
- "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
- !strconcat(OpcodeStr,
- "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+ (outs FR32:$dst), (ins FR32:$src1, FR32:$src2, i32u8imm:$src3),
+ !strconcat(OpcodeStr,
+ "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[]>, Sched<[WriteFAdd]>;
- // Intrinsic operation, reg.
- let isCodeGenOnly = 1 in
+ let mayLoad = 1 in
+ def SSm : SS4AIi8<opcss, MRMSrcMem,
+ (outs FR32:$dst), (ins FR32:$src1, f32mem:$src2, i32u8imm:$src3),
+ !strconcat(OpcodeStr,
+ "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ []>, Sched<[WriteFAddLd, ReadAfterLd]>;
+
+ def SDr : SS4AIi8<opcsd, MRMSrcReg,
+ (outs FR64:$dst), (ins FR64:$src1, FR64:$src2, i32u8imm:$src3),
+ !strconcat(OpcodeStr,
+ "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ []>, Sched<[WriteFAdd]>;
+
+ let mayLoad = 1 in
+ def SDm : SS4AIi8<opcsd, MRMSrcMem,
+ (outs FR64:$dst), (ins FR64:$src1, f64mem:$src2, i32u8imm:$src3),
+ !strconcat(OpcodeStr,
+ "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ []>, Sched<[WriteFAddLd, ReadAfterLd]>;
+} // ExeDomain = GenericDomain, hasSideEffects = 0
+}
+
+multiclass sse41_fp_unop_s<bits<8> opcss, bits<8> opcsd,
+ string OpcodeStr> {
+let ExeDomain = GenericDomain, hasSideEffects = 0 in {
+ def SSr : SS4AIi8<opcss, MRMSrcReg,
+ (outs FR32:$dst), (ins FR32:$src1, i32u8imm:$src2),
+ !strconcat(OpcodeStr,
+ "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ []>, Sched<[WriteFAdd]>;
+
+ let mayLoad = 1 in
+ def SSm : SS4AIi8<opcss, MRMSrcMem,
+ (outs FR32:$dst), (ins f32mem:$src1, i32u8imm:$src2),
+ !strconcat(OpcodeStr,
+ "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ []>, Sched<[WriteFAddLd, ReadAfterLd]>;
+
+ def SDr : SS4AIi8<opcsd, MRMSrcReg,
+ (outs FR64:$dst), (ins FR64:$src1, i32u8imm:$src2),
+ !strconcat(OpcodeStr,
+ "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ []>, Sched<[WriteFAdd]>;
+
+ let mayLoad = 1 in
+ def SDm : SS4AIi8<opcsd, MRMSrcMem,
+ (outs FR64:$dst), (ins f64mem:$src1, i32u8imm:$src2),
+ !strconcat(OpcodeStr,
+ "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ []>, Sched<[WriteFAddLd, ReadAfterLd]>;
+} // ExeDomain = GenericDomain, hasSideEffects = 0
+}
+
+multiclass sse41_fp_binop_s<bits<8> opcss, bits<8> opcsd,
+ string OpcodeStr,
+ Intrinsic F32Int,
+ Intrinsic F64Int, bit Is2Addr = 1> {
+let ExeDomain = GenericDomain, isCodeGenOnly = 1 in {
def SSr_Int : SS4AIi8<opcss, MRMSrcReg,
(outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32u8imm:$src3),
!if(Is2Addr,
@@ -6414,8 +6421,7 @@ let ExeDomain = GenericDomain in {
[(set VR128:$dst, (F32Int VR128:$src1, VR128:$src2, imm:$src3))]>,
Sched<[WriteFAdd]>;
- // Intrinsic operation, mem.
- def SSm : SS4AIi8<opcss, MRMSrcMem,
+ def SSm_Int : SS4AIi8<opcss, MRMSrcMem,
(outs VR128:$dst), (ins VR128:$src1, ssmem:$src2, i32u8imm:$src3),
!if(Is2Addr,
!strconcat(OpcodeStr,
@@ -6426,19 +6432,6 @@ let ExeDomain = GenericDomain in {
(F32Int VR128:$src1, sse_load_f32:$src2, imm:$src3))]>,
Sched<[WriteFAddLd, ReadAfterLd]>;
- // Operation, reg.
- let hasSideEffects = 0 in
- def SDr : SS4AIi8<opcsd, MRMSrcReg,
- (outs FR64:$dst), (ins FR64:$src1, FR64:$src2, i32u8imm:$src3),
- !if(Is2Addr,
- !strconcat(OpcodeStr,
- "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
- !strconcat(OpcodeStr,
- "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
- []>, Sched<[WriteFAdd]>;
-
- // Intrinsic operation, reg.
- let isCodeGenOnly = 1 in
def SDr_Int : SS4AIi8<opcsd, MRMSrcReg,
(outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32u8imm:$src3),
!if(Is2Addr,
@@ -6449,8 +6442,7 @@ let ExeDomain = GenericDomain in {
[(set VR128:$dst, (F64Int VR128:$src1, VR128:$src2, imm:$src3))]>,
Sched<[WriteFAdd]>;
- // Intrinsic operation, mem.
- def SDm : SS4AIi8<opcsd, MRMSrcMem,
+ def SDm_Int : SS4AIi8<opcsd, MRMSrcMem,
(outs VR128:$dst), (ins VR128:$src1, sdmem:$src2, i32u8imm:$src3),
!if(Is2Addr,
!strconcat(OpcodeStr,
@@ -6460,23 +6452,24 @@ let ExeDomain = GenericDomain in {
[(set VR128:$dst,
(F64Int VR128:$src1, sse_load_f64:$src2, imm:$src3))]>,
Sched<[WriteFAddLd, ReadAfterLd]>;
-} // ExeDomain = GenericDomain
+} // ExeDomain = GenericDomain, isCodeGenOnly = 1
}
// FP round - roundss, roundps, roundsd, roundpd
let Predicates = [HasAVX] in {
// Intrinsic form
- defm VROUND : sse41_fp_unop_rm<0x08, 0x09, "vround", f128mem, VR128,
- loadv4f32, loadv2f64,
- int_x86_sse41_round_ps,
- int_x86_sse41_round_pd>, VEX;
- defm VROUNDY : sse41_fp_unop_rm<0x08, 0x09, "vround", f256mem, VR256,
- loadv8f32, loadv4f64,
- int_x86_avx_round_ps_256,
- int_x86_avx_round_pd_256>, VEX, VEX_L;
- defm VROUND : sse41_fp_binop_rm<0x0A, 0x0B, "vround",
- int_x86_sse41_round_ss,
- int_x86_sse41_round_sd, 0>, VEX_4V, VEX_LIG;
+ defm VROUND : sse41_fp_unop_p<0x08, 0x09, "vround", f128mem, VR128,
+ loadv4f32, loadv2f64,
+ int_x86_sse41_round_ps,
+ int_x86_sse41_round_pd>, VEX;
+ defm VROUNDY : sse41_fp_unop_p<0x08, 0x09, "vround", f256mem, VR256,
+ loadv8f32, loadv4f64,
+ int_x86_avx_round_ps_256,
+ int_x86_avx_round_pd_256>, VEX, VEX_L;
+ defm VROUND : sse41_fp_binop_s<0x0A, 0x0B, "vround",
+ int_x86_sse41_round_ss,
+ int_x86_sse41_round_sd, 0>, VEX_4V, VEX_LIG;
+ defm VROUND : avx_fp_unop_rm<0x0A, 0x0B, "vround">, VEX_4V, VEX_LIG;
}
let Predicates = [UseAVX] in {
@@ -6548,34 +6541,37 @@ let Predicates = [HasAVX] in {
(VROUNDYPDr VR256:$src, (i32 0xB))>;
}
-defm ROUND : sse41_fp_unop_rm<0x08, 0x09, "round", f128mem, VR128,
- memopv4f32, memopv2f64,
- int_x86_sse41_round_ps, int_x86_sse41_round_pd>;
+defm ROUND : sse41_fp_unop_p<0x08, 0x09, "round", f128mem, VR128,
+ memopv4f32, memopv2f64, int_x86_sse41_round_ps,
+ int_x86_sse41_round_pd>;
+
+defm ROUND : sse41_fp_unop_s<0x0A, 0x0B, "round">;
+
let Constraints = "$src1 = $dst" in
-defm ROUND : sse41_fp_binop_rm<0x0A, 0x0B, "round",
+defm ROUND : sse41_fp_binop_s<0x0A, 0x0B, "round",
int_x86_sse41_round_ss, int_x86_sse41_round_sd>;
let Predicates = [UseSSE41] in {
def : Pat<(ffloor FR32:$src),
- (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x9))>;
+ (ROUNDSSr FR32:$src, (i32 0x9))>;
def : Pat<(f64 (ffloor FR64:$src)),
- (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x9))>;
+ (ROUNDSDr FR64:$src, (i32 0x9))>;
def : Pat<(f32 (fnearbyint FR32:$src)),
- (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xC))>;
+ (ROUNDSSr FR32:$src, (i32 0xC))>;
def : Pat<(f64 (fnearbyint FR64:$src)),
- (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xC))>;
+ (ROUNDSDr FR64:$src, (i32 0xC))>;
def : Pat<(f32 (fceil FR32:$src)),
- (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xA))>;
+ (ROUNDSSr FR32:$src, (i32 0xA))>;
def : Pat<(f64 (fceil FR64:$src)),
- (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xA))>;
+ (ROUNDSDr FR64:$src, (i32 0xA))>;
def : Pat<(f32 (frint FR32:$src)),
- (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x4))>;
+ (ROUNDSSr FR32:$src, (i32 0x4))>;
def : Pat<(f64 (frint FR64:$src)),
- (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x4))>;
+ (ROUNDSDr FR64:$src, (i32 0x4))>;
def : Pat<(f32 (ftrunc FR32:$src)),
- (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xB))>;
+ (ROUNDSSr FR32:$src, (i32 0xB))>;
def : Pat<(f64 (ftrunc FR64:$src)),
- (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xB))>;
+ (ROUNDSDr FR64:$src, (i32 0xB))>;
def : Pat<(v4f32 (ffloor VR128:$src)),
(ROUNDPSr VR128:$src, (i32 0x9))>;
@@ -6867,10 +6863,10 @@ let Constraints = "$src1 = $dst" in {
let Predicates = [HasAVX, NoVLX] in {
defm VPMULLD : SS48I_binop_rm<0x40, "vpmulld", mul, v4i32, VR128,
- memopv2i64, i128mem, 0, SSE_PMULLD_ITINS>,
+ loadv2i64, i128mem, 0, SSE_PMULLD_ITINS>,
VEX_4V;
defm VPCMPEQQ : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v2i64, VR128,
- memopv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
+ loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
VEX_4V;
}
let Predicates = [HasAVX2] in {
@@ -7029,22 +7025,22 @@ multiclass SS41I_quaternary_int_avx<bits<8> opc, string OpcodeStr,
RegisterClass RC, X86MemOperand x86memop,
PatFrag mem_frag, Intrinsic IntId,
X86FoldableSchedWrite Sched> {
- def rr : Ii8<opc, MRMSrcReg, (outs RC:$dst),
+ def rr : Ii8Reg<opc, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, RC:$src2, RC:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set RC:$dst, (IntId RC:$src1, RC:$src2, RC:$src3))],
- NoItinerary, SSEPackedInt>, TAPD, VEX_4V, VEX_I8IMM,
+ NoItinerary, SSEPackedInt>, TAPD, VEX_4V,
Sched<[Sched]>;
- def rm : Ii8<opc, MRMSrcMem, (outs RC:$dst),
+ def rm : Ii8Reg<opc, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, x86memop:$src2, RC:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set RC:$dst,
(IntId RC:$src1, (bitconvert (mem_frag addr:$src2)),
RC:$src3))],
- NoItinerary, SSEPackedInt>, TAPD, VEX_4V, VEX_I8IMM,
+ NoItinerary, SSEPackedInt>, TAPD, VEX_4V,
Sched<[Sched.Folded, ReadAfterLd]>;
}
@@ -7139,17 +7135,6 @@ let Predicates = [UseAVX] in {
(VBLENDPDYrri (v4f64 (AVX_SET0)), VR256:$src, (i8 1))>;
}
- def : Pat<(v8f32 (X86vzmovl (insert_subvector undef,
- (v4f32 (scalar_to_vector FR32:$src)), (iPTR 0)))),
- (SUBREG_TO_REG (i32 0),
- (v4f32 (VMOVSSrr (v4f32 (V_SET0)), FR32:$src)),
- sub_xmm)>;
- def : Pat<(v4f64 (X86vzmovl (insert_subvector undef,
- (v2f64 (scalar_to_vector FR64:$src)), (iPTR 0)))),
- (SUBREG_TO_REG (i64 0),
- (v2f64 (VMOVSDrr (v2f64 (V_SET0)), FR64:$src)),
- sub_xmm)>;
-
// These will incur an FP/int domain crossing penalty, but it may be the only
// way without AVX2. Do not add any complexity because we may be able to match
// more optimal patterns defined earlier in this file.
@@ -7744,6 +7729,7 @@ defm : pclmul_alias<"lqlq", 0x00>;
let Predicates = [HasSSE4A] in {
+let ExeDomain = SSEPackedInt in {
let Constraints = "$src = $dst" in {
def EXTRQI : Ii8<0x78, MRMXr, (outs VR128:$dst),
(ins VR128:$src, u8imm:$len, u8imm:$idx),
@@ -7767,6 +7753,7 @@ def INSERTQ : I<0x79, MRMSrcReg, (outs VR128:$dst),
[(set VR128:$dst, (int_x86_sse4a_insertq VR128:$src,
VR128:$mask))]>, XD;
}
+} // ExeDomain = SSEPackedInt
// Non-temporal (unaligned) scalar stores.
let AddedComplexity = 400 in { // Prefer non-temporal versions
@@ -7832,23 +7819,50 @@ let ExeDomain = SSEPackedDouble, Predicates = [HasAVX2, NoVLX] in
def VBROADCASTSDYrr : avx2_broadcast_rr<0x19, "vbroadcastsd", VR256,
v4f64, v2f64, WriteFShuffle256>, VEX_L;
+//===----------------------------------------------------------------------===//
+// VBROADCAST*128 - Load from memory and broadcast 128-bit vector to both
+// halves of a 256-bit vector.
+//
let mayLoad = 1, hasSideEffects = 0, Predicates = [HasAVX2] in
def VBROADCASTI128 : AVX8I<0x5A, MRMSrcMem, (outs VR256:$dst),
(ins i128mem:$src),
"vbroadcasti128\t{$src, $dst|$dst, $src}", []>,
Sched<[WriteLoad]>, VEX, VEX_L;
+let mayLoad = 1, hasSideEffects = 0, Predicates = [HasAVX] in
def VBROADCASTF128 : AVX8I<0x1A, MRMSrcMem, (outs VR256:$dst),
(ins f128mem:$src),
- "vbroadcastf128\t{$src, $dst|$dst, $src}",
- [(set VR256:$dst,
- (int_x86_avx_vbroadcastf128_pd_256 addr:$src))]>,
+ "vbroadcastf128\t{$src, $dst|$dst, $src}", []>,
Sched<[WriteFShuffleLd]>, VEX, VEX_L;
-let Predicates = [HasAVX] in
-def : Pat<(int_x86_avx_vbroadcastf128_ps_256 addr:$src),
+let Predicates = [HasAVX2, NoVLX] in {
+def : Pat<(v4i64 (X86SubVBroadcast (loadv2i64 addr:$src))),
+ (VBROADCASTI128 addr:$src)>;
+def : Pat<(v8i32 (X86SubVBroadcast (bc_v4i32 (loadv2i64 addr:$src)))),
+ (VBROADCASTI128 addr:$src)>;
+def : Pat<(v16i16 (X86SubVBroadcast (bc_v8i16 (loadv2i64 addr:$src)))),
+ (VBROADCASTI128 addr:$src)>;
+def : Pat<(v32i8 (X86SubVBroadcast (bc_v16i8 (loadv2i64 addr:$src)))),
+ (VBROADCASTI128 addr:$src)>;
+}
+
+let Predicates = [HasAVX, NoVLX] in {
+def : Pat<(v4f64 (X86SubVBroadcast (loadv2f64 addr:$src))),
(VBROADCASTF128 addr:$src)>;
+def : Pat<(v8f32 (X86SubVBroadcast (loadv4f32 addr:$src))),
+ (VBROADCASTF128 addr:$src)>;
+}
+let Predicates = [HasAVX1Only] in {
+def : Pat<(v4i64 (X86SubVBroadcast (loadv2i64 addr:$src))),
+ (VBROADCASTF128 addr:$src)>;
+def : Pat<(v8i32 (X86SubVBroadcast (bc_v4i32 (loadv2i64 addr:$src)))),
+ (VBROADCASTF128 addr:$src)>;
+def : Pat<(v16i16 (X86SubVBroadcast (bc_v8i16 (loadv2i64 addr:$src)))),
+ (VBROADCASTF128 addr:$src)>;
+def : Pat<(v32i8 (X86SubVBroadcast (bc_v16i8 (loadv2i64 addr:$src)))),
+ (VBROADCASTF128 addr:$src)>;
+}
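
For orientation, the X86SubVBroadcast load patterns above all select a 128-bit broadcast that replicates the loaded 16 bytes into both halves of a 256-bit register. A minimal C++ intrinsics sketch of that behavior (the wrapper name is illustrative, not part of the patch):

#include <immintrin.h>

// Load 16 bytes and replicate them into both 128-bit halves of a 256-bit
// vector; this is roughly what the VBROADCASTF128/VBROADCASTI128 patterns
// above cover for X86SubVBroadcast of a loaded operand.
static __m256 broadcast_f128(const float *p) {
  return _mm256_broadcast_ps(reinterpret_cast<const __m128 *>(p));
}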
//===----------------------------------------------------------------------===//
// VINSERTF128 - Insert packed floating-point values
@@ -7865,63 +7879,29 @@ def VINSERTF128rm : AVXAIi8<0x18, MRMSrcMem, (outs VR256:$dst),
[]>, Sched<[WriteFShuffleLd, ReadAfterLd]>, VEX_4V, VEX_L;
}
-let Predicates = [HasAVX, NoVLX] in {
-def : Pat<(vinsert128_insert:$ins (v8f32 VR256:$src1), (v4f32 VR128:$src2),
- (iPTR imm)),
- (VINSERTF128rr VR256:$src1, VR128:$src2,
- (INSERT_get_vinsert128_imm VR256:$ins))>;
-def : Pat<(vinsert128_insert:$ins (v4f64 VR256:$src1), (v2f64 VR128:$src2),
+multiclass vinsert_lowering<string InstrStr, ValueType From, ValueType To,
+ PatFrag memop_frag> {
+ def : Pat<(vinsert128_insert:$ins (To VR256:$src1), (From VR128:$src2),
(iPTR imm)),
- (VINSERTF128rr VR256:$src1, VR128:$src2,
- (INSERT_get_vinsert128_imm VR256:$ins))>;
+ (!cast<Instruction>(InstrStr#rr) VR256:$src1, VR128:$src2,
+ (INSERT_get_vinsert128_imm VR256:$ins))>;
+ def : Pat<(vinsert128_insert:$ins (To VR256:$src1),
+ (From (bitconvert (memop_frag addr:$src2))),
+ (iPTR imm)),
+ (!cast<Instruction>(InstrStr#rm) VR256:$src1, addr:$src2,
+ (INSERT_get_vinsert128_imm VR256:$ins))>;
+}
-def : Pat<(vinsert128_insert:$ins (v8f32 VR256:$src1), (loadv4f32 addr:$src2),
- (iPTR imm)),
- (VINSERTF128rm VR256:$src1, addr:$src2,
- (INSERT_get_vinsert128_imm VR256:$ins))>;
-def : Pat<(vinsert128_insert:$ins (v4f64 VR256:$src1), (loadv2f64 addr:$src2),
- (iPTR imm)),
- (VINSERTF128rm VR256:$src1, addr:$src2,
- (INSERT_get_vinsert128_imm VR256:$ins))>;
+let Predicates = [HasAVX, NoVLX] in {
+ defm : vinsert_lowering<"VINSERTF128", v4f32, v8f32, loadv4f32>;
+ defm : vinsert_lowering<"VINSERTF128", v2f64, v4f64, loadv2f64>;
}
let Predicates = [HasAVX1Only] in {
-def : Pat<(vinsert128_insert:$ins (v4i64 VR256:$src1), (v2i64 VR128:$src2),
- (iPTR imm)),
- (VINSERTF128rr VR256:$src1, VR128:$src2,
- (INSERT_get_vinsert128_imm VR256:$ins))>;
-def : Pat<(vinsert128_insert:$ins (v8i32 VR256:$src1), (v4i32 VR128:$src2),
- (iPTR imm)),
- (VINSERTF128rr VR256:$src1, VR128:$src2,
- (INSERT_get_vinsert128_imm VR256:$ins))>;
-def : Pat<(vinsert128_insert:$ins (v32i8 VR256:$src1), (v16i8 VR128:$src2),
- (iPTR imm)),
- (VINSERTF128rr VR256:$src1, VR128:$src2,
- (INSERT_get_vinsert128_imm VR256:$ins))>;
-def : Pat<(vinsert128_insert:$ins (v16i16 VR256:$src1), (v8i16 VR128:$src2),
- (iPTR imm)),
- (VINSERTF128rr VR256:$src1, VR128:$src2,
- (INSERT_get_vinsert128_imm VR256:$ins))>;
-
-def : Pat<(vinsert128_insert:$ins (v4i64 VR256:$src1), (loadv2i64 addr:$src2),
- (iPTR imm)),
- (VINSERTF128rm VR256:$src1, addr:$src2,
- (INSERT_get_vinsert128_imm VR256:$ins))>;
-def : Pat<(vinsert128_insert:$ins (v8i32 VR256:$src1),
- (bc_v4i32 (loadv2i64 addr:$src2)),
- (iPTR imm)),
- (VINSERTF128rm VR256:$src1, addr:$src2,
- (INSERT_get_vinsert128_imm VR256:$ins))>;
-def : Pat<(vinsert128_insert:$ins (v32i8 VR256:$src1),
- (bc_v16i8 (loadv2i64 addr:$src2)),
- (iPTR imm)),
- (VINSERTF128rm VR256:$src1, addr:$src2,
- (INSERT_get_vinsert128_imm VR256:$ins))>;
-def : Pat<(vinsert128_insert:$ins (v16i16 VR256:$src1),
- (bc_v8i16 (loadv2i64 addr:$src2)),
- (iPTR imm)),
- (VINSERTF128rm VR256:$src1, addr:$src2,
- (INSERT_get_vinsert128_imm VR256:$ins))>;
+ defm : vinsert_lowering<"VINSERTF128", v2i64, v4i64, loadv2i64>;
+ defm : vinsert_lowering<"VINSERTF128", v4i32, v8i32, loadv2i64>;
+ defm : vinsert_lowering<"VINSERTF128", v8i16, v16i16, loadv2i64>;
+ defm : vinsert_lowering<"VINSERTF128", v16i8, v32i8, loadv2i64>;
}
//===----------------------------------------------------------------------===//
@@ -7939,61 +7919,28 @@ def VEXTRACTF128mr : AVXAIi8<0x19, MRMDestMem, (outs),
[]>, Sched<[WriteStore]>, VEX, VEX_L;
}
+multiclass vextract_lowering<string InstrStr, ValueType From, ValueType To> {
+ def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
+ (To (!cast<Instruction>(InstrStr#rr)
+ (From VR256:$src1),
+ (EXTRACT_get_vextract128_imm VR128:$ext)))>;
+ def : Pat<(store (To (vextract128_extract:$ext (From VR256:$src1),
+ (iPTR imm))), addr:$dst),
+ (!cast<Instruction>(InstrStr#mr) addr:$dst, VR256:$src1,
+ (EXTRACT_get_vextract128_imm VR128:$ext))>;
+}
+
// AVX1 patterns
let Predicates = [HasAVX, NoVLX] in {
-def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
- (v4f32 (VEXTRACTF128rr
- (v8f32 VR256:$src1),
- (EXTRACT_get_vextract128_imm VR128:$ext)))>;
-def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
- (v2f64 (VEXTRACTF128rr
- (v4f64 VR256:$src1),
- (EXTRACT_get_vextract128_imm VR128:$ext)))>;
-
-def : Pat<(store (v4f32 (vextract128_extract:$ext (v8f32 VR256:$src1),
- (iPTR imm))), addr:$dst),
- (VEXTRACTF128mr addr:$dst, VR256:$src1,
- (EXTRACT_get_vextract128_imm VR128:$ext))>;
-def : Pat<(store (v2f64 (vextract128_extract:$ext (v4f64 VR256:$src1),
- (iPTR imm))), addr:$dst),
- (VEXTRACTF128mr addr:$dst, VR256:$src1,
- (EXTRACT_get_vextract128_imm VR128:$ext))>;
+ defm : vextract_lowering<"VEXTRACTF128", v8f32, v4f32>;
+ defm : vextract_lowering<"VEXTRACTF128", v4f64, v2f64>;
}
let Predicates = [HasAVX1Only] in {
-def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
- (v2i64 (VEXTRACTF128rr
- (v4i64 VR256:$src1),
- (EXTRACT_get_vextract128_imm VR128:$ext)))>;
-def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
- (v4i32 (VEXTRACTF128rr
- (v8i32 VR256:$src1),
- (EXTRACT_get_vextract128_imm VR128:$ext)))>;
-def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
- (v8i16 (VEXTRACTF128rr
- (v16i16 VR256:$src1),
- (EXTRACT_get_vextract128_imm VR128:$ext)))>;
-def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
- (v16i8 (VEXTRACTF128rr
- (v32i8 VR256:$src1),
- (EXTRACT_get_vextract128_imm VR128:$ext)))>;
-
-def : Pat<(store (v2i64 (vextract128_extract:$ext (v4i64 VR256:$src1),
- (iPTR imm))), addr:$dst),
- (VEXTRACTF128mr addr:$dst, VR256:$src1,
- (EXTRACT_get_vextract128_imm VR128:$ext))>;
-def : Pat<(store (v4i32 (vextract128_extract:$ext (v8i32 VR256:$src1),
- (iPTR imm))), addr:$dst),
- (VEXTRACTF128mr addr:$dst, VR256:$src1,
- (EXTRACT_get_vextract128_imm VR128:$ext))>;
-def : Pat<(store (v8i16 (vextract128_extract:$ext (v16i16 VR256:$src1),
- (iPTR imm))), addr:$dst),
- (VEXTRACTF128mr addr:$dst, VR256:$src1,
- (EXTRACT_get_vextract128_imm VR128:$ext))>;
-def : Pat<(store (v16i8 (vextract128_extract:$ext (v32i8 VR256:$src1),
- (iPTR imm))), addr:$dst),
- (VEXTRACTF128mr addr:$dst, VR256:$src1,
- (EXTRACT_get_vextract128_imm VR128:$ext))>;
+ defm : vextract_lowering<"VEXTRACTF128", v4i64, v2i64>;
+ defm : vextract_lowering<"VEXTRACTF128", v8i32, v4i32>;
+ defm : vextract_lowering<"VEXTRACTF128", v16i16, v8i16>;
+ defm : vextract_lowering<"VEXTRACTF128", v32i8, v16i8>;
}
//===----------------------------------------------------------------------===//
@@ -8239,7 +8186,7 @@ let Predicates = [HasF16C] in {
}
// Patterns for matching conversions from float to half-float and vice versa.
-let Predicates = [HasF16C] in {
+let Predicates = [HasF16C, NoVLX] in {
// Use MXCSR.RC for rounding instead of explicitly specifying the default
// rounding mode (Nearest-Even, encoded as 0). Both are equivalent in the
// configurations we support (the default). However, falling back to MXCSR is
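
As the comment carried by this hunk explains, the half-float conversion patterns defer rounding to MXCSR.RC rather than encoding an explicit round-to-nearest-even immediate. A minimal C++ illustration of the same choice at the intrinsic level (assuming an F16C-capable target; the wrapper name is hypothetical):

#include <immintrin.h>

// Convert four floats to half precision using the current MXCSR rounding
// mode (round-to-nearest-even by default) instead of a fixed rounding
// immediate, mirroring the comment above.
static __m128i floats_to_halves(__m128 v) {
  return _mm_cvtps_ph(v, _MM_FROUND_CUR_DIRECTION);
}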
@@ -8334,7 +8281,7 @@ defm VPBROADCASTD : avx2_broadcast<0x58, "vpbroadcastd", i32mem, loadi32,
defm VPBROADCASTQ : avx2_broadcast<0x59, "vpbroadcastq", i64mem, loadi64,
v2i64, v4i64, NoVLX>;
-let Predicates = [HasAVX2] in {
+let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
// loadi16 is tricky to fold, because !isTypeDesirableForOp, justifiably.
// This means we'll encounter truncated i32 loads; match that here.
def : Pat<(v8i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))),
@@ -8347,7 +8294,9 @@ let Predicates = [HasAVX2] in {
def : Pat<(v16i16 (X86VBroadcast
(i16 (trunc (i32 (zextloadi16 addr:$src)))))),
(VPBROADCASTWYrm addr:$src)>;
+}
+let Predicates = [HasAVX2] in {
// Provide aliases for broadcast from the same register class that
// automatically does the extract.
def : Pat<(v8f32 (X86VBroadcast (v8f32 VR256:$src))),
@@ -8361,36 +8310,38 @@ let Predicates = [HasAVX2] in {
let Predicates = [HasAVX2, NoVLX] in {
// Provide fallback in case the load node that is used in the patterns above
// is used by additional users, which prevents the pattern selection.
- let AddedComplexity = 20 in {
def : Pat<(v4f32 (X86VBroadcast FR32:$src)),
(VBROADCASTSSrr (COPY_TO_REGCLASS FR32:$src, VR128))>;
def : Pat<(v8f32 (X86VBroadcast FR32:$src)),
(VBROADCASTSSYrr (COPY_TO_REGCLASS FR32:$src, VR128))>;
def : Pat<(v4f64 (X86VBroadcast FR64:$src)),
(VBROADCASTSDYrr (COPY_TO_REGCLASS FR64:$src, VR128))>;
- }
}
-let Predicates = [HasAVX2, NoVLX_Or_NoBWI], AddedComplexity = 20 in {
+let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
def : Pat<(v16i8 (X86VBroadcast GR8:$src)),
(VPBROADCASTBrr (COPY_TO_REGCLASS
- (i32 (SUBREG_TO_REG (i32 0), GR8:$src, sub_8bit)),
+ (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
+ GR8:$src, sub_8bit)),
VR128))>;
def : Pat<(v32i8 (X86VBroadcast GR8:$src)),
(VPBROADCASTBYrr (COPY_TO_REGCLASS
- (i32 (SUBREG_TO_REG (i32 0), GR8:$src, sub_8bit)),
+ (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
+ GR8:$src, sub_8bit)),
VR128))>;
def : Pat<(v8i16 (X86VBroadcast GR16:$src)),
(VPBROADCASTWrr (COPY_TO_REGCLASS
- (i32 (SUBREG_TO_REG (i32 0), GR16:$src, sub_16bit)),
+ (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
+ GR16:$src, sub_16bit)),
VR128))>;
def : Pat<(v16i16 (X86VBroadcast GR16:$src)),
(VPBROADCASTWYrr (COPY_TO_REGCLASS
- (i32 (SUBREG_TO_REG (i32 0), GR16:$src, sub_16bit)),
+ (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
+ GR16:$src, sub_16bit)),
VR128))>;
}
-let Predicates = [HasAVX2, NoVLX], AddedComplexity = 20 in {
+let Predicates = [HasAVX2, NoVLX] in {
def : Pat<(v4i32 (X86VBroadcast GR32:$src)),
(VBROADCASTSSrr (COPY_TO_REGCLASS GR32:$src, VR128))>;
def : Pat<(v8i32 (X86VBroadcast GR32:$src)),
@@ -8418,13 +8369,13 @@ def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))),
// Provide fallback in case the load node that is used in the patterns above
// is used by additional users, which prevents the pattern selection.
-let Predicates = [HasAVX], AddedComplexity = 20 in {
+let Predicates = [HasAVX, NoVLX] in {
// 128bit broadcasts:
def : Pat<(v2f64 (X86VBroadcast f64:$src)),
(VMOVDDUPrr (COPY_TO_REGCLASS FR64:$src, VR128))>;
}
-let Predicates = [HasAVX, NoVLX], AddedComplexity = 20 in {
+let Predicates = [HasAVX1Only] in {
def : Pat<(v4f32 (X86VBroadcast FR32:$src)),
(VPSHUFDri (COPY_TO_REGCLASS FR32:$src, VR128), 0)>;
def : Pat<(v8f32 (X86VBroadcast FR32:$src)),
@@ -8560,42 +8511,10 @@ def VINSERTI128rm : AVX2AIi8<0x38, MRMSrcMem, (outs VR256:$dst),
}
let Predicates = [HasAVX2, NoVLX] in {
-def : Pat<(vinsert128_insert:$ins (v4i64 VR256:$src1), (v2i64 VR128:$src2),
- (iPTR imm)),
- (VINSERTI128rr VR256:$src1, VR128:$src2,
- (INSERT_get_vinsert128_imm VR256:$ins))>;
-def : Pat<(vinsert128_insert:$ins (v8i32 VR256:$src1), (v4i32 VR128:$src2),
- (iPTR imm)),
- (VINSERTI128rr VR256:$src1, VR128:$src2,
- (INSERT_get_vinsert128_imm VR256:$ins))>;
-def : Pat<(vinsert128_insert:$ins (v32i8 VR256:$src1), (v16i8 VR128:$src2),
- (iPTR imm)),
- (VINSERTI128rr VR256:$src1, VR128:$src2,
- (INSERT_get_vinsert128_imm VR256:$ins))>;
-def : Pat<(vinsert128_insert:$ins (v16i16 VR256:$src1), (v8i16 VR128:$src2),
- (iPTR imm)),
- (VINSERTI128rr VR256:$src1, VR128:$src2,
- (INSERT_get_vinsert128_imm VR256:$ins))>;
-
-def : Pat<(vinsert128_insert:$ins (v4i64 VR256:$src1), (loadv2i64 addr:$src2),
- (iPTR imm)),
- (VINSERTI128rm VR256:$src1, addr:$src2,
- (INSERT_get_vinsert128_imm VR256:$ins))>;
-def : Pat<(vinsert128_insert:$ins (v8i32 VR256:$src1),
- (bc_v4i32 (loadv2i64 addr:$src2)),
- (iPTR imm)),
- (VINSERTI128rm VR256:$src1, addr:$src2,
- (INSERT_get_vinsert128_imm VR256:$ins))>;
-def : Pat<(vinsert128_insert:$ins (v32i8 VR256:$src1),
- (bc_v16i8 (loadv2i64 addr:$src2)),
- (iPTR imm)),
- (VINSERTI128rm VR256:$src1, addr:$src2,
- (INSERT_get_vinsert128_imm VR256:$ins))>;
-def : Pat<(vinsert128_insert:$ins (v16i16 VR256:$src1),
- (bc_v8i16 (loadv2i64 addr:$src2)),
- (iPTR imm)),
- (VINSERTI128rm VR256:$src1, addr:$src2,
- (INSERT_get_vinsert128_imm VR256:$ins))>;
+ defm : vinsert_lowering<"VINSERTI128", v2i64, v4i64, loadv2i64>;
+ defm : vinsert_lowering<"VINSERTI128", v4i32, v8i32, loadv2i64>;
+ defm : vinsert_lowering<"VINSERTI128", v8i16, v16i16, loadv2i64>;
+ defm : vinsert_lowering<"VINSERTI128", v16i8, v32i8, loadv2i64>;
}
//===----------------------------------------------------------------------===//
@@ -8612,39 +8531,10 @@ def VEXTRACTI128mr : AVX2AIi8<0x39, MRMDestMem, (outs),
Sched<[WriteStore]>, VEX, VEX_L;
let Predicates = [HasAVX2, NoVLX] in {
-def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
- (v2i64 (VEXTRACTI128rr
- (v4i64 VR256:$src1),
- (EXTRACT_get_vextract128_imm VR128:$ext)))>;
-def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
- (v4i32 (VEXTRACTI128rr
- (v8i32 VR256:$src1),
- (EXTRACT_get_vextract128_imm VR128:$ext)))>;
-def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
- (v8i16 (VEXTRACTI128rr
- (v16i16 VR256:$src1),
- (EXTRACT_get_vextract128_imm VR128:$ext)))>;
-def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
- (v16i8 (VEXTRACTI128rr
- (v32i8 VR256:$src1),
- (EXTRACT_get_vextract128_imm VR128:$ext)))>;
-
-def : Pat<(store (v2i64 (vextract128_extract:$ext (v4i64 VR256:$src1),
- (iPTR imm))), addr:$dst),
- (VEXTRACTI128mr addr:$dst, VR256:$src1,
- (EXTRACT_get_vextract128_imm VR128:$ext))>;
-def : Pat<(store (v4i32 (vextract128_extract:$ext (v8i32 VR256:$src1),
- (iPTR imm))), addr:$dst),
- (VEXTRACTI128mr addr:$dst, VR256:$src1,
- (EXTRACT_get_vextract128_imm VR128:$ext))>;
-def : Pat<(store (v8i16 (vextract128_extract:$ext (v16i16 VR256:$src1),
- (iPTR imm))), addr:$dst),
- (VEXTRACTI128mr addr:$dst, VR256:$src1,
- (EXTRACT_get_vextract128_imm VR128:$ext))>;
-def : Pat<(store (v16i8 (vextract128_extract:$ext (v32i8 VR256:$src1),
- (iPTR imm))), addr:$dst),
- (VEXTRACTI128mr addr:$dst, VR256:$src1,
- (EXTRACT_get_vextract128_imm VR128:$ext))>;
+ defm : vextract_lowering<"VEXTRACTI128", v4i64, v2i64>;
+ defm : vextract_lowering<"VEXTRACTI128", v8i32, v4i32>;
+ defm : vextract_lowering<"VEXTRACTI128", v16i16, v8i16>;
+ defm : vextract_lowering<"VEXTRACTI128", v32i8, v16i8>;
}
//===----------------------------------------------------------------------===//
@@ -8689,12 +8579,12 @@ multiclass maskmov_lowering<string InstrStr, RegisterClass RC, ValueType VT,
def: Pat<(X86mstore addr:$ptr, (MaskVT RC:$mask), (VT RC:$src)),
(!cast<Instruction>(InstrStr#"mr") addr:$ptr, RC:$mask, RC:$src)>;
// masked load
- def: Pat<(VT (masked_load addr:$ptr, (MaskVT RC:$mask), undef)),
+ def: Pat<(VT (X86mload addr:$ptr, (MaskVT RC:$mask), undef)),
(!cast<Instruction>(InstrStr#"rm") RC:$mask, addr:$ptr)>;
- def: Pat<(VT (masked_load addr:$ptr, (MaskVT RC:$mask),
+ def: Pat<(VT (X86mload addr:$ptr, (MaskVT RC:$mask),
(VT (bitconvert (ZeroVT immAllZerosV))))),
(!cast<Instruction>(InstrStr#"rm") RC:$mask, addr:$ptr)>;
- def: Pat<(VT (masked_load addr:$ptr, (MaskVT RC:$mask), (VT RC:$src0))),
+ def: Pat<(VT (X86mload addr:$ptr, (MaskVT RC:$mask), (VT RC:$src0))),
(!cast<Instruction>(BlendStr#"rr")
RC:$src0,
(!cast<Instruction>(InstrStr#"rm") RC:$mask, addr:$ptr),
@@ -8719,6 +8609,51 @@ let Predicates = [HasAVX2] in {
defm : maskmov_lowering<"VPMASKMOVD", VR128, v4i32, v4i32, "VBLENDVPS", v4i32>;
defm : maskmov_lowering<"VPMASKMOVQ", VR128, v2i64, v2i64, "VBLENDVPD", v4i32>;
}
+
+//===----------------------------------------------------------------------===//
+// SubVector Broadcasts
+// Provide fallback in case the load node that is used in the patterns above
+// is used by additional users, which prevents the pattern selection.
+
+let Predicates = [HasAVX2, NoVLX] in {
+def : Pat<(v4i64 (X86SubVBroadcast (v2i64 VR128:$src))),
+ (VINSERTI128rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
+ (v2i64 VR128:$src), 1)>;
+def : Pat<(v8i32 (X86SubVBroadcast (v4i32 VR128:$src))),
+ (VINSERTI128rr (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
+ (v4i32 VR128:$src), 1)>;
+def : Pat<(v16i16 (X86SubVBroadcast (v8i16 VR128:$src))),
+ (VINSERTI128rr (INSERT_SUBREG (v16i16 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
+ (v8i16 VR128:$src), 1)>;
+def : Pat<(v32i8 (X86SubVBroadcast (v16i8 VR128:$src))),
+ (VINSERTI128rr (INSERT_SUBREG (v32i8 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
+ (v16i8 VR128:$src), 1)>;
+}
+
+let Predicates = [HasAVX, NoVLX] in {
+def : Pat<(v4f64 (X86SubVBroadcast (v2f64 VR128:$src))),
+ (VINSERTF128rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
+ (v2f64 VR128:$src), 1)>;
+def : Pat<(v8f32 (X86SubVBroadcast (v4f32 VR128:$src))),
+ (VINSERTF128rr (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
+ (v4f32 VR128:$src), 1)>;
+}
+
+let Predicates = [HasAVX1Only] in {
+def : Pat<(v4i64 (X86SubVBroadcast (v2i64 VR128:$src))),
+ (VINSERTF128rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
+ (v2i64 VR128:$src), 1)>;
+def : Pat<(v8i32 (X86SubVBroadcast (v4i32 VR128:$src))),
+ (VINSERTF128rr (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
+ (v4i32 VR128:$src), 1)>;
+def : Pat<(v16i16 (X86SubVBroadcast (v8i16 VR128:$src))),
+ (VINSERTF128rr (INSERT_SUBREG (v16i16 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
+ (v8i16 VR128:$src), 1)>;
+def : Pat<(v32i8 (X86SubVBroadcast (v16i8 VR128:$src))),
+ (VINSERTF128rr (INSERT_SUBREG (v32i8 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
+ (v16i8 VR128:$src), 1)>;
+}
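
The register-operand fallbacks above lower X86SubVBroadcast to a 128-bit insert over an IMPLICIT_DEF, i.e. they duplicate an XMM value into both halves of a YMM value. A short C++ intrinsics sketch of the equivalent operation (function name illustrative):

#include <immintrin.h>

// Duplicate a 128-bit vector into both halves of a 256-bit vector, matching
// the effect of the VINSERTF128rr/VINSERTI128rr fallback patterns above when
// the broadcast source is already in a register.
static __m256 splat_low_128(__m128 x) {
  return _mm256_insertf128_ps(_mm256_castps128_ps256(x), x, 1);
}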
+
//===----------------------------------------------------------------------===//
// Variable Bit Shifts
//
@@ -8758,23 +8693,35 @@ let Predicates = [HasAVX2, NoVLX] in {
defm VPSRLVD : avx2_var_shift<0x45, "vpsrlvd", srl, v4i32, v8i32>;
defm VPSRLVQ : avx2_var_shift<0x45, "vpsrlvq", srl, v2i64, v4i64>, VEX_W;
defm VPSRAVD : avx2_var_shift<0x46, "vpsravd", sra, v4i32, v8i32>;
- let isCodeGenOnly = 1 in
- defm VPSRAVD_Int : avx2_var_shift<0x46, "vpsravd", X86vsrav, v4i32, v8i32>;
+
+ def : Pat<(v4i32 (X86vsrav VR128:$src1, VR128:$src2)),
+ (VPSRAVDrr VR128:$src1, VR128:$src2)>;
+ def : Pat<(v4i32 (X86vsrav VR128:$src1,
+ (bitconvert (loadv2i64 addr:$src2)))),
+ (VPSRAVDrm VR128:$src1, addr:$src2)>;
+ def : Pat<(v8i32 (X86vsrav VR256:$src1, VR256:$src2)),
+ (VPSRAVDYrr VR256:$src1, VR256:$src2)>;
+ def : Pat<(v8i32 (X86vsrav VR256:$src1,
+ (bitconvert (loadv4i64 addr:$src2)))),
+ (VPSRAVDYrm VR256:$src1, addr:$src2)>;
}
+
+
+
//===----------------------------------------------------------------------===//
// VGATHER - GATHER Operations
multiclass avx2_gather<bits<8> opc, string OpcodeStr, RegisterClass RC256,
X86MemOperand memop128, X86MemOperand memop256> {
- def rm : AVX28I<opc, MRMSrcMem, (outs VR128:$dst, VR128:$mask_wb),
+ def rm : AVX28I<opc, MRMSrcMem4VOp3, (outs VR128:$dst, VR128:$mask_wb),
(ins VR128:$src1, memop128:$src2, VR128:$mask),
!strconcat(OpcodeStr,
"\t{$mask, $src2, $dst|$dst, $src2, $mask}"),
- []>, VEX_4VOp3;
- def Yrm : AVX28I<opc, MRMSrcMem, (outs RC256:$dst, RC256:$mask_wb),
+ []>, VEX;
+ def Yrm : AVX28I<opc, MRMSrcMem4VOp3, (outs RC256:$dst, RC256:$mask_wb),
(ins RC256:$src1, memop256:$src2, RC256:$mask),
!strconcat(OpcodeStr,
"\t{$mask, $src2, $dst|$dst, $src2, $mask}"),
- []>, VEX_4VOp3, VEX_L;
+ []>, VEX, VEX_L;
}
let mayLoad = 1, hasSideEffects = 0, Constraints
diff --git a/contrib/llvm/lib/Target/X86/X86InstrShiftRotate.td b/contrib/llvm/lib/Target/X86/X86InstrShiftRotate.td
index c1df978..e2be735 100644
--- a/contrib/llvm/lib/Target/X86/X86InstrShiftRotate.td
+++ b/contrib/llvm/lib/Target/X86/X86InstrShiftRotate.td
@@ -591,37 +591,38 @@ def ROR64rCL : RI<0xD3, MRM1r, (outs GR64:$dst), (ins GR64:$src1),
def ROR8ri : Ii8<0xC0, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1, u8imm:$src2),
"ror{b}\t{$src2, $dst|$dst, $src2}",
- [(set GR8:$dst, (rotr GR8:$src1, (i8 imm:$src2)))], IIC_SR>;
+ [(set GR8:$dst, (rotr GR8:$src1, (i8 relocImm:$src2)))],
+ IIC_SR>;
def ROR16ri : Ii8<0xC1, MRM1r, (outs GR16:$dst), (ins GR16:$src1, u8imm:$src2),
"ror{w}\t{$src2, $dst|$dst, $src2}",
- [(set GR16:$dst, (rotr GR16:$src1, (i8 imm:$src2)))],
+ [(set GR16:$dst, (rotr GR16:$src1, (i8 relocImm:$src2)))],
IIC_SR>, OpSize16;
def ROR32ri : Ii8<0xC1, MRM1r, (outs GR32:$dst), (ins GR32:$src1, u8imm:$src2),
"ror{l}\t{$src2, $dst|$dst, $src2}",
- [(set GR32:$dst, (rotr GR32:$src1, (i8 imm:$src2)))],
+ [(set GR32:$dst, (rotr GR32:$src1, (i8 relocImm:$src2)))],
IIC_SR>, OpSize32;
def ROR64ri : RIi8<0xC1, MRM1r, (outs GR64:$dst),
(ins GR64:$src1, u8imm:$src2),
"ror{q}\t{$src2, $dst|$dst, $src2}",
- [(set GR64:$dst, (rotr GR64:$src1, (i8 imm:$src2)))],
+ [(set GR64:$dst, (rotr GR64:$src1, (i8 relocImm:$src2)))],
IIC_SR>;
// Rotate by 1
def ROR8r1 : I<0xD0, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1),
"ror{b}\t$dst",
- [(set GR8:$dst, (rotr GR8:$src1, (i8 1)))],
+ [(set GR8:$dst, (rotl GR8:$src1, (i8 7)))],
IIC_SR>;
def ROR16r1 : I<0xD1, MRM1r, (outs GR16:$dst), (ins GR16:$src1),
"ror{w}\t$dst",
- [(set GR16:$dst, (rotr GR16:$src1, (i8 1)))],
+ [(set GR16:$dst, (rotl GR16:$src1, (i8 15)))],
IIC_SR>, OpSize16;
def ROR32r1 : I<0xD1, MRM1r, (outs GR32:$dst), (ins GR32:$src1),
"ror{l}\t$dst",
- [(set GR32:$dst, (rotr GR32:$src1, (i8 1)))],
+ [(set GR32:$dst, (rotl GR32:$src1, (i8 31)))],
IIC_SR>, OpSize32;
def ROR64r1 : RI<0xD1, MRM1r, (outs GR64:$dst), (ins GR64:$src1),
"ror{q}\t$dst",
- [(set GR64:$dst, (rotr GR64:$src1, (i8 1)))],
+ [(set GR64:$dst, (rotl GR64:$src1, (i8 63)))],
IIC_SR>;
} // Constraints = "$src = $dst", SchedRW
@@ -873,19 +874,19 @@ let hasSideEffects = 0 in {
multiclass bmi_shift<string asm, RegisterClass RC, X86MemOperand x86memop> {
let hasSideEffects = 0 in {
- def rr : I<0xF7, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
+ def rr : I<0xF7, MRMSrcReg4VOp3, (outs RC:$dst), (ins RC:$src1, RC:$src2),
!strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>,
- VEX_4VOp3, Sched<[WriteShift]>;
+ VEX, Sched<[WriteShift]>;
let mayLoad = 1 in
- def rm : I<0xF7, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src1, RC:$src2),
+ def rm : I<0xF7, MRMSrcMem4VOp3,
+ (outs RC:$dst), (ins x86memop:$src1, RC:$src2),
!strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>,
- VEX_4VOp3,
- Sched<[WriteShiftLd,
- // x86memop:$src1
- ReadDefault, ReadDefault, ReadDefault, ReadDefault,
- ReadDefault,
- // RC:$src1
- ReadAfterLd]>;
+ VEX, Sched<[WriteShiftLd,
+ // x86memop:$src1
+ ReadDefault, ReadDefault, ReadDefault, ReadDefault,
+ ReadDefault,
+ // RC:$src1
+ ReadAfterLd]>;
}
}
diff --git a/contrib/llvm/lib/Target/X86/X86InstrSystem.td b/contrib/llvm/lib/Target/X86/X86InstrSystem.td
index 6667bd2..9265d64 100644
--- a/contrib/llvm/lib/Target/X86/X86InstrSystem.td
+++ b/contrib/llvm/lib/Target/X86/X86InstrSystem.td
@@ -23,7 +23,7 @@ let Defs = [RAX, RCX, RDX] in
// CPU flow control instructions
-let isTerminator = 1, isBarrier = 1, hasCtrlDep = 1 in {
+let mayLoad = 1, mayStore = 0, hasSideEffects = 1 in {
def TRAP : I<0x0B, RawFrm, (outs), (ins), "ud2", [(trap)]>, TB;
def UD2B : I<0xB9, RawFrm, (outs), (ins), "ud2b", []>, TB;
}
@@ -481,8 +481,11 @@ let Defs = [EDX, EAX], Uses = [ECX] in
def XGETBV : I<0x01, MRM_D0, (outs), (ins), "xgetbv", []>, TB;
let Uses = [EDX, EAX, ECX] in
- def XSETBV : I<0x01, MRM_D1, (outs), (ins), "xsetbv", []>, TB;
-}
+ def XSETBV : I<0x01, MRM_D1, (outs), (ins),
+ "xsetbv",
+ [(int_x86_xsetbv ECX, EDX, EAX)]>, TB;
+
+} // HasXSAVE
let Uses = [EDX, EAX] in {
let Predicates = [HasXSAVE] in {
diff --git a/contrib/llvm/lib/Target/X86/X86InstrTablesInfo.h b/contrib/llvm/lib/Target/X86/X86InstrTablesInfo.h
new file mode 100755
index 0000000..415a891
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrTablesInfo.h
@@ -0,0 +1,1162 @@
+//===-- X86InstrTablesInfo.h - X86 Instruction Tables -----------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the X86 instruction information tables used for EVEX
+// to VEX compression.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_X86INSTRTABLESINFO_H
+#define LLVM_LIB_TARGET_X86_X86INSTRTABLESINFO_H
+
+using namespace llvm;
+
+struct X86EvexToVexCompressTableEntry {
+ uint16_t EvexOpcode;
+ uint16_t VexOpcode;
+};
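
The tables that follow are keyed by EVEX opcode, so a consumer only needs to search the entries for a match and emit the paired VEX opcode. A self-contained C++ sketch of such a lookup; the linear scan and the helper name are illustrative assumptions, not the actual pass code:

#include <cstdint>

// Mirrors the entry type defined above so this sketch compiles standalone.
struct EvexToVexEntry {
  uint16_t EvexOpcode;
  uint16_t VexOpcode;
};

// Return the VEX opcode paired with EvexOpcode, or -1 if no entry exists.
// A linear scan is used purely for illustration.
static int lookupVexOpcode(const EvexToVexEntry *Table, unsigned NumEntries,
                           uint16_t EvexOpcode) {
  for (unsigned I = 0; I != NumEntries; ++I)
    if (Table[I].EvexOpcode == EvexOpcode)
      return Table[I].VexOpcode;
  return -1;
}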
+
+
+
+// X86 EVEX encoded instructions that have a VEX 128 encoding
+// (table format: <EVEX opcode, VEX-128 opcode>).
+static const X86EvexToVexCompressTableEntry X86EvexToVex128CompressTable[] = {
+ // EVEX scalar with corresponding VEX.
+ { X86::Int_VCOMISDZrm , X86::Int_VCOMISDrm },
+ { X86::Int_VCOMISDZrr , X86::Int_VCOMISDrr },
+ { X86::Int_VCOMISSZrm , X86::Int_VCOMISSrm },
+ { X86::Int_VCOMISSZrr , X86::Int_VCOMISSrr },
+ { X86::Int_VUCOMISDZrm , X86::Int_VUCOMISDrm },
+ { X86::Int_VUCOMISDZrr , X86::Int_VUCOMISDrr },
+ { X86::Int_VUCOMISSZrm , X86::Int_VUCOMISSrm },
+ { X86::Int_VUCOMISSZrr , X86::Int_VUCOMISSrr },
+ { X86::VADDSDZrm , X86::VADDSDrm },
+ { X86::VADDSDZrm_Int , X86::VADDSDrm_Int },
+ { X86::VADDSDZrr , X86::VADDSDrr },
+ { X86::VADDSDZrr_Int , X86::VADDSDrr_Int },
+ { X86::VADDSSZrm , X86::VADDSSrm },
+ { X86::VADDSSZrm_Int , X86::VADDSSrm_Int },
+ { X86::VADDSSZrr , X86::VADDSSrr },
+ { X86::VADDSSZrr_Int , X86::VADDSSrr_Int },
+ { X86::VCOMISDZrm , X86::VCOMISDrm },
+ { X86::VCOMISDZrr , X86::VCOMISDrr },
+ { X86::VCOMISSZrm , X86::VCOMISSrm },
+ { X86::VCOMISSZrr , X86::VCOMISSrr },
+ { X86::VCVTSD2SI64Zrm , X86::VCVTSD2SI64rm },
+ { X86::VCVTSD2SI64Zrr , X86::VCVTSD2SI64rr },
+ { X86::VCVTSD2SIZrm , X86::VCVTSD2SIrm },
+ { X86::VCVTSD2SIZrr , X86::VCVTSD2SIrr },
+ { X86::VCVTSD2SSZrm , X86::VCVTSD2SSrm },
+ { X86::VCVTSD2SSZrr , X86::VCVTSD2SSrr },
+ { X86::VCVTSI2SDZrm , X86::VCVTSI2SDrm },
+ { X86::VCVTSI2SDZrm_Int , X86::Int_VCVTSI2SDrm },
+ { X86::VCVTSI2SDZrr , X86::VCVTSI2SDrr },
+ { X86::VCVTSI2SDZrr_Int , X86::Int_VCVTSI2SDrr },
+ { X86::VCVTSI2SSZrm , X86::VCVTSI2SSrm },
+ { X86::VCVTSI2SSZrm_Int , X86::Int_VCVTSI2SSrm },
+ { X86::VCVTSI2SSZrr , X86::VCVTSI2SSrr },
+ { X86::VCVTSI2SSZrr_Int , X86::Int_VCVTSI2SSrr },
+ { X86::VCVTSS2SDZrm , X86::VCVTSS2SDrm },
+ { X86::VCVTSS2SDZrr , X86::VCVTSS2SDrr },
+ { X86::VCVTSS2SI64Zrm , X86::VCVTSS2SI64rm },
+ { X86::VCVTSS2SI64Zrr , X86::VCVTSS2SI64rr },
+ { X86::VCVTSS2SIZrm , X86::VCVTSS2SIrm },
+ { X86::VCVTSS2SIZrr , X86::VCVTSS2SIrr },
+ { X86::VCVTTSD2SI64Zrm , X86::VCVTTSD2SI64rm },
+ { X86::VCVTTSD2SI64Zrm_Int , X86::Int_VCVTTSD2SI64rm },
+ { X86::VCVTTSD2SI64Zrr , X86::VCVTTSD2SI64rr },
+ { X86::VCVTTSD2SI64Zrr_Int , X86::Int_VCVTTSD2SI64rr },
+ { X86::VCVTTSD2SIZrm , X86::VCVTTSD2SIrm },
+ { X86::VCVTTSD2SIZrm_Int , X86::Int_VCVTTSD2SIrm },
+ { X86::VCVTTSD2SIZrr , X86::VCVTTSD2SIrr },
+ { X86::VCVTTSD2SIZrr_Int , X86::Int_VCVTTSD2SIrr },
+ { X86::VCVTTSS2SI64Zrm , X86::VCVTTSS2SI64rm },
+ { X86::VCVTTSS2SI64Zrm_Int , X86::Int_VCVTTSS2SI64rm },
+ { X86::VCVTTSS2SI64Zrr , X86::VCVTTSS2SI64rr },
+ { X86::VCVTTSS2SI64Zrr_Int , X86::Int_VCVTTSS2SI64rr },
+ { X86::VCVTTSS2SIZrm , X86::VCVTTSS2SIrm },
+ { X86::VCVTTSS2SIZrm_Int , X86::Int_VCVTTSS2SIrm },
+ { X86::VCVTTSS2SIZrr , X86::VCVTTSS2SIrr },
+ { X86::VCVTTSS2SIZrr_Int , X86::Int_VCVTTSS2SIrr },
+ { X86::VDIVSDZrm , X86::VDIVSDrm },
+ { X86::VDIVSDZrm_Int , X86::VDIVSDrm_Int },
+ { X86::VDIVSDZrr , X86::VDIVSDrr },
+ { X86::VDIVSDZrr_Int , X86::VDIVSDrr_Int },
+ { X86::VDIVSSZrm , X86::VDIVSSrm },
+ { X86::VDIVSSZrm_Int , X86::VDIVSSrm_Int },
+ { X86::VDIVSSZrr , X86::VDIVSSrr },
+ { X86::VDIVSSZrr_Int , X86::VDIVSSrr_Int },
+ { X86::VFMADD132SDZm , X86::VFMADD132SDm },
+ { X86::VFMADD132SDZm_Int , X86::VFMADD132SDm_Int },
+ { X86::VFMADD132SDZr , X86::VFMADD132SDr },
+ { X86::VFMADD132SDZr_Int , X86::VFMADD132SDr_Int },
+ { X86::VFMADD132SSZm , X86::VFMADD132SSm },
+ { X86::VFMADD132SSZm_Int , X86::VFMADD132SSm_Int },
+ { X86::VFMADD132SSZr , X86::VFMADD132SSr },
+ { X86::VFMADD132SSZr_Int , X86::VFMADD132SSr_Int },
+ { X86::VFMADD213SDZm , X86::VFMADD213SDm },
+ { X86::VFMADD213SDZm_Int , X86::VFMADD213SDm_Int },
+ { X86::VFMADD213SDZr , X86::VFMADD213SDr },
+ { X86::VFMADD213SDZr_Int , X86::VFMADD213SDr_Int },
+ { X86::VFMADD213SSZm , X86::VFMADD213SSm },
+ { X86::VFMADD213SSZm_Int , X86::VFMADD213SSm_Int },
+ { X86::VFMADD213SSZr , X86::VFMADD213SSr },
+ { X86::VFMADD213SSZr_Int , X86::VFMADD213SSr_Int },
+ { X86::VFMADD231SDZm , X86::VFMADD231SDm },
+ { X86::VFMADD231SDZm_Int , X86::VFMADD231SDm_Int },
+ { X86::VFMADD231SDZr , X86::VFMADD231SDr },
+ { X86::VFMADD231SDZr_Int , X86::VFMADD231SDr_Int },
+ { X86::VFMADD231SSZm , X86::VFMADD231SSm },
+ { X86::VFMADD231SSZm_Int , X86::VFMADD231SSm_Int },
+ { X86::VFMADD231SSZr , X86::VFMADD231SSr },
+ { X86::VFMADD231SSZr_Int , X86::VFMADD231SSr_Int },
+ { X86::VFMSUB132SDZm , X86::VFMSUB132SDm },
+ { X86::VFMSUB132SDZm_Int , X86::VFMSUB132SDm_Int },
+ { X86::VFMSUB132SDZr , X86::VFMSUB132SDr },
+ { X86::VFMSUB132SDZr_Int , X86::VFMSUB132SDr_Int },
+ { X86::VFMSUB132SSZm , X86::VFMSUB132SSm },
+ { X86::VFMSUB132SSZm_Int , X86::VFMSUB132SSm_Int },
+ { X86::VFMSUB132SSZr , X86::VFMSUB132SSr },
+ { X86::VFMSUB132SSZr_Int , X86::VFMSUB132SSr_Int },
+ { X86::VFMSUB213SDZm , X86::VFMSUB213SDm },
+ { X86::VFMSUB213SDZm_Int , X86::VFMSUB213SDm_Int },
+ { X86::VFMSUB213SDZr , X86::VFMSUB213SDr },
+ { X86::VFMSUB213SDZr_Int , X86::VFMSUB213SDr_Int },
+ { X86::VFMSUB213SSZm , X86::VFMSUB213SSm },
+ { X86::VFMSUB213SSZm_Int , X86::VFMSUB213SSm_Int },
+ { X86::VFMSUB213SSZr , X86::VFMSUB213SSr },
+ { X86::VFMSUB213SSZr_Int , X86::VFMSUB213SSr_Int },
+ { X86::VFMSUB231SDZm , X86::VFMSUB231SDm },
+ { X86::VFMSUB231SDZm_Int , X86::VFMSUB231SDm_Int },
+ { X86::VFMSUB231SDZr , X86::VFMSUB231SDr },
+ { X86::VFMSUB231SDZr_Int , X86::VFMSUB231SDr_Int },
+ { X86::VFMSUB231SSZm , X86::VFMSUB231SSm },
+ { X86::VFMSUB231SSZm_Int , X86::VFMSUB231SSm_Int },
+ { X86::VFMSUB231SSZr , X86::VFMSUB231SSr },
+ { X86::VFMSUB231SSZr_Int , X86::VFMSUB231SSr_Int },
+ { X86::VFNMADD132SDZm , X86::VFNMADD132SDm },
+ { X86::VFNMADD132SDZm_Int , X86::VFNMADD132SDm_Int },
+ { X86::VFNMADD132SDZr , X86::VFNMADD132SDr },
+ { X86::VFNMADD132SDZr_Int , X86::VFNMADD132SDr_Int },
+ { X86::VFNMADD132SSZm , X86::VFNMADD132SSm },
+ { X86::VFNMADD132SSZm_Int , X86::VFNMADD132SSm_Int },
+ { X86::VFNMADD132SSZr , X86::VFNMADD132SSr },
+ { X86::VFNMADD132SSZr_Int , X86::VFNMADD132SSr_Int },
+ { X86::VFNMADD213SDZm , X86::VFNMADD213SDm },
+ { X86::VFNMADD213SDZm_Int , X86::VFNMADD213SDm_Int },
+ { X86::VFNMADD213SDZr , X86::VFNMADD213SDr },
+ { X86::VFNMADD213SDZr_Int , X86::VFNMADD213SDr_Int },
+ { X86::VFNMADD213SSZm , X86::VFNMADD213SSm },
+ { X86::VFNMADD213SSZm_Int , X86::VFNMADD213SSm_Int },
+ { X86::VFNMADD213SSZr , X86::VFNMADD213SSr },
+ { X86::VFNMADD213SSZr_Int , X86::VFNMADD213SSr_Int },
+ { X86::VFNMADD231SDZm , X86::VFNMADD231SDm },
+ { X86::VFNMADD231SDZm_Int , X86::VFNMADD231SDm_Int },
+ { X86::VFNMADD231SDZr , X86::VFNMADD231SDr },
+ { X86::VFNMADD231SDZr_Int , X86::VFNMADD231SDr_Int },
+ { X86::VFNMADD231SSZm , X86::VFNMADD231SSm },
+ { X86::VFNMADD231SSZm_Int , X86::VFNMADD231SSm_Int },
+ { X86::VFNMADD231SSZr , X86::VFNMADD231SSr },
+ { X86::VFNMADD231SSZr_Int , X86::VFNMADD231SSr_Int },
+ { X86::VFNMSUB132SDZm , X86::VFNMSUB132SDm },
+ { X86::VFNMSUB132SDZm_Int , X86::VFNMSUB132SDm_Int },
+ { X86::VFNMSUB132SDZr , X86::VFNMSUB132SDr },
+ { X86::VFNMSUB132SDZr_Int , X86::VFNMSUB132SDr_Int },
+ { X86::VFNMSUB132SSZm , X86::VFNMSUB132SSm },
+ { X86::VFNMSUB132SSZm_Int , X86::VFNMSUB132SSm_Int },
+ { X86::VFNMSUB132SSZr , X86::VFNMSUB132SSr },
+ { X86::VFNMSUB132SSZr_Int , X86::VFNMSUB132SSr_Int },
+ { X86::VFNMSUB213SDZm , X86::VFNMSUB213SDm },
+ { X86::VFNMSUB213SDZm_Int , X86::VFNMSUB213SDm_Int },
+ { X86::VFNMSUB213SDZr , X86::VFNMSUB213SDr },
+ { X86::VFNMSUB213SDZr_Int , X86::VFNMSUB213SDr_Int },
+ { X86::VFNMSUB213SSZm , X86::VFNMSUB213SSm },
+ { X86::VFNMSUB213SSZm_Int , X86::VFNMSUB213SSm_Int },
+ { X86::VFNMSUB213SSZr , X86::VFNMSUB213SSr },
+ { X86::VFNMSUB213SSZr_Int , X86::VFNMSUB213SSr_Int },
+ { X86::VFNMSUB231SDZm , X86::VFNMSUB231SDm },
+ { X86::VFNMSUB231SDZm_Int , X86::VFNMSUB231SDm_Int },
+ { X86::VFNMSUB231SDZr , X86::VFNMSUB231SDr },
+ { X86::VFNMSUB231SDZr_Int , X86::VFNMSUB231SDr_Int },
+ { X86::VFNMSUB231SSZm , X86::VFNMSUB231SSm },
+ { X86::VFNMSUB231SSZm_Int , X86::VFNMSUB231SSm_Int },
+ { X86::VFNMSUB231SSZr , X86::VFNMSUB231SSr },
+ { X86::VFNMSUB231SSZr_Int , X86::VFNMSUB231SSr_Int },
+ { X86::VMAXCSDZrm , X86::VMAXCSDrm },
+ { X86::VMAXCSDZrr , X86::VMAXCSDrr },
+ { X86::VMAXCSSZrm , X86::VMAXCSSrm },
+ { X86::VMAXCSSZrr , X86::VMAXCSSrr },
+ { X86::VMAXSDZrm , X86::VMAXSDrm },
+ { X86::VMAXSDZrm_Int , X86::VMAXSDrm_Int },
+ { X86::VMAXSDZrr , X86::VMAXSDrr },
+ { X86::VMAXSDZrr_Int , X86::VMAXSDrr_Int },
+ { X86::VMAXSSZrm , X86::VMAXSSrm },
+ { X86::VMAXSSZrm_Int , X86::VMAXSSrm_Int },
+ { X86::VMAXSSZrr , X86::VMAXSSrr },
+ { X86::VMAXSSZrr_Int , X86::VMAXSSrr_Int },
+ { X86::VMINCSDZrm , X86::VMINCSDrm },
+ { X86::VMINCSDZrr , X86::VMINCSDrr },
+ { X86::VMINCSSZrm , X86::VMINCSSrm },
+ { X86::VMINCSSZrr , X86::VMINCSSrr },
+ { X86::VMINSDZrm , X86::VMINSDrm },
+ { X86::VMINSDZrm_Int , X86::VMINSDrm_Int },
+ { X86::VMINSDZrr , X86::VMINSDrr },
+ { X86::VMINSDZrr_Int , X86::VMINSDrr_Int },
+ { X86::VMINSSZrm , X86::VMINSSrm },
+ { X86::VMINSSZrm_Int , X86::VMINSSrm_Int },
+ { X86::VMINSSZrr , X86::VMINSSrr },
+ { X86::VMINSSZrr_Int , X86::VMINSSrr_Int },
+ { X86::VMOV64toSDZrr , X86::VMOV64toSDrr },
+ { X86::VMOVDI2SSZrm , X86::VMOVDI2SSrm },
+ { X86::VMOVDI2SSZrr , X86::VMOVDI2SSrr },
+ { X86::VMOVSDZmr , X86::VMOVSDmr },
+ { X86::VMOVSDZrm , X86::VMOVSDrm },
+ { X86::VMOVSDZrr , X86::VMOVSDrr },
+ { X86::VMOVSSZmr , X86::VMOVSSmr },
+ { X86::VMOVSSZrm , X86::VMOVSSrm },
+ { X86::VMOVSSZrr , X86::VMOVSSrr },
+ { X86::VMOVSSZrr_REV , X86::VMOVSSrr_REV },
+ { X86::VMULSDZrm , X86::VMULSDrm },
+ { X86::VMULSDZrm_Int , X86::VMULSDrm_Int },
+ { X86::VMULSDZrr , X86::VMULSDrr },
+ { X86::VMULSDZrr_Int , X86::VMULSDrr_Int },
+ { X86::VMULSSZrm , X86::VMULSSrm },
+ { X86::VMULSSZrm_Int , X86::VMULSSrm_Int },
+ { X86::VMULSSZrr , X86::VMULSSrr },
+ { X86::VMULSSZrr_Int , X86::VMULSSrr_Int },
+ { X86::VSQRTSDZm , X86::VSQRTSDm },
+ { X86::VSQRTSDZm_Int , X86::VSQRTSDm_Int },
+ { X86::VSQRTSDZr , X86::VSQRTSDr },
+ { X86::VSQRTSDZr_Int , X86::VSQRTSDr_Int },
+ { X86::VSQRTSSZm , X86::VSQRTSSm },
+ { X86::VSQRTSSZm_Int , X86::VSQRTSSm_Int },
+ { X86::VSQRTSSZr , X86::VSQRTSSr },
+ { X86::VSQRTSSZr_Int , X86::VSQRTSSr_Int },
+ { X86::VSUBSDZrm , X86::VSUBSDrm },
+ { X86::VSUBSDZrm_Int , X86::VSUBSDrm_Int },
+ { X86::VSUBSDZrr , X86::VSUBSDrr },
+ { X86::VSUBSDZrr_Int , X86::VSUBSDrr_Int },
+ { X86::VSUBSSZrm , X86::VSUBSSrm },
+ { X86::VSUBSSZrm_Int , X86::VSUBSSrm_Int },
+ { X86::VSUBSSZrr , X86::VSUBSSrr },
+ { X86::VSUBSSZrr_Int , X86::VSUBSSrr_Int },
+ { X86::VUCOMISDZrm , X86::VUCOMISDrm },
+ { X86::VUCOMISDZrr , X86::VUCOMISDrr },
+ { X86::VUCOMISSZrm , X86::VUCOMISSrm },
+ { X86::VUCOMISSZrr , X86::VUCOMISSrr },
+
+ { X86::VMOV64toPQIZrr , X86::VMOV64toPQIrr },
+ { X86::VMOV64toSDZrr , X86::VMOV64toSDrr },
+ { X86::VMOVDI2PDIZrm , X86::VMOVDI2PDIrm },
+ { X86::VMOVDI2PDIZrr , X86::VMOVDI2PDIrr },
+ { X86::VMOVLHPSZrr , X86::VMOVLHPSrr },
+ { X86::VMOVHLPSZrr , X86::VMOVHLPSrr },
+ { X86::VMOVPDI2DIZmr , X86::VMOVPDI2DImr },
+ { X86::VMOVPDI2DIZrr , X86::VMOVPDI2DIrr },
+ { X86::VMOVPQI2QIZmr , X86::VMOVPQI2QImr },
+ { X86::VMOVPQIto64Zrr , X86::VMOVPQIto64rr },
+ { X86::VMOVQI2PQIZrm , X86::VMOVQI2PQIrm },
+ { X86::VMOVZPQILo2PQIZrr , X86::VMOVZPQILo2PQIrr },
+
+ { X86::VPEXTRBZmr , X86::VPEXTRBmr },
+ { X86::VPEXTRBZrr , X86::VPEXTRBrr },
+ { X86::VPEXTRDZmr , X86::VPEXTRDmr },
+ { X86::VPEXTRDZrr , X86::VPEXTRDrr },
+ { X86::VPEXTRQZmr , X86::VPEXTRQmr },
+ { X86::VPEXTRQZrr , X86::VPEXTRQrr },
+ { X86::VPEXTRWZmr , X86::VPEXTRWmr },
+ { X86::VPEXTRWZrr , X86::VPEXTRWri },
+
+ { X86::VPINSRBZrm , X86::VPINSRBrm },
+ { X86::VPINSRBZrr , X86::VPINSRBrr },
+ { X86::VPINSRDZrm , X86::VPINSRDrm },
+ { X86::VPINSRDZrr , X86::VPINSRDrr },
+ { X86::VPINSRQZrm , X86::VPINSRQrm },
+ { X86::VPINSRQZrr , X86::VPINSRQrr },
+ { X86::VPINSRWZrm , X86::VPINSRWrmi },
+ { X86::VPINSRWZrr , X86::VPINSRWrri },
+
+ // EVEX 128 with corresponding VEX.
+ { X86::VADDPDZ128rm , X86::VADDPDrm },
+ { X86::VADDPDZ128rr , X86::VADDPDrr },
+ { X86::VADDPSZ128rm , X86::VADDPSrm },
+ { X86::VADDPSZ128rr , X86::VADDPSrr },
+ { X86::VANDNPDZ128rm , X86::VANDNPDrm },
+ { X86::VANDNPDZ128rr , X86::VANDNPDrr },
+ { X86::VANDNPSZ128rm , X86::VANDNPSrm },
+ { X86::VANDNPSZ128rr , X86::VANDNPSrr },
+ { X86::VANDPDZ128rm , X86::VANDPDrm },
+ { X86::VANDPDZ128rr , X86::VANDPDrr },
+ { X86::VANDPSZ128rm , X86::VANDPSrm },
+ { X86::VANDPSZ128rr , X86::VANDPSrr },
+ { X86::VBROADCASTSSZ128m , X86::VBROADCASTSSrm },
+ { X86::VBROADCASTSSZ128r , X86::VBROADCASTSSrr },
+ { X86::VBROADCASTSSZ128r_s , X86::VBROADCASTSSrr },
+ { X86::VCVTDQ2PDZ128rm , X86::VCVTDQ2PDrm },
+ { X86::VCVTDQ2PDZ128rr , X86::VCVTDQ2PDrr },
+ { X86::VCVTDQ2PSZ128rm , X86::VCVTDQ2PSrm },
+ { X86::VCVTDQ2PSZ128rr , X86::VCVTDQ2PSrr },
+ { X86::VCVTPD2DQZ128rm , X86::VCVTPD2DQrm },
+ { X86::VCVTPD2DQZ128rr , X86::VCVTPD2DQrr },
+ { X86::VCVTPD2PSZ128rm , X86::VCVTPD2PSrm },
+ { X86::VCVTPD2PSZ128rr , X86::VCVTPD2PSrr },
+ { X86::VCVTPH2PSZ128rm , X86::VCVTPH2PSrm },
+ { X86::VCVTPH2PSZ128rr , X86::VCVTPH2PSrr },
+ { X86::VCVTPS2DQZ128rm , X86::VCVTPS2DQrm },
+ { X86::VCVTPS2DQZ128rr , X86::VCVTPS2DQrr },
+ { X86::VCVTPS2PDZ128rm , X86::VCVTPS2PDrm },
+ { X86::VCVTPS2PDZ128rr , X86::VCVTPS2PDrr },
+ { X86::VCVTPS2PHZ128mr , X86::VCVTPS2PHmr },
+ { X86::VCVTPS2PHZ128rr , X86::VCVTPS2PHrr },
+ { X86::VCVTTPD2DQZ128rm , X86::VCVTTPD2DQrm },
+ { X86::VCVTTPD2DQZ128rr , X86::VCVTTPD2DQrr },
+ { X86::VCVTTPS2DQZ128rm , X86::VCVTTPS2DQrm },
+ { X86::VCVTTPS2DQZ128rr , X86::VCVTTPS2DQrr },
+ { X86::VDIVPDZ128rm , X86::VDIVPDrm },
+ { X86::VDIVPDZ128rr , X86::VDIVPDrr },
+ { X86::VDIVPSZ128rm , X86::VDIVPSrm },
+ { X86::VDIVPSZ128rr , X86::VDIVPSrr },
+ { X86::VFMADD132PDZ128m , X86::VFMADD132PDm },
+ { X86::VFMADD132PDZ128r , X86::VFMADD132PDr },
+ { X86::VFMADD132PSZ128m , X86::VFMADD132PSm },
+ { X86::VFMADD132PSZ128r , X86::VFMADD132PSr },
+ { X86::VFMADD213PDZ128m , X86::VFMADD213PDm },
+ { X86::VFMADD213PDZ128r , X86::VFMADD213PDr },
+ { X86::VFMADD213PSZ128m , X86::VFMADD213PSm },
+ { X86::VFMADD213PSZ128r , X86::VFMADD213PSr },
+ { X86::VFMADD231PDZ128m , X86::VFMADD231PDm },
+ { X86::VFMADD231PDZ128r , X86::VFMADD231PDr },
+ { X86::VFMADD231PSZ128m , X86::VFMADD231PSm },
+ { X86::VFMADD231PSZ128r , X86::VFMADD231PSr },
+ { X86::VFMADDSUB132PDZ128m , X86::VFMADDSUB132PDm },
+ { X86::VFMADDSUB132PDZ128r , X86::VFMADDSUB132PDr },
+ { X86::VFMADDSUB132PSZ128m , X86::VFMADDSUB132PSm },
+ { X86::VFMADDSUB132PSZ128r , X86::VFMADDSUB132PSr },
+ { X86::VFMADDSUB213PDZ128m , X86::VFMADDSUB213PDm },
+ { X86::VFMADDSUB213PDZ128r , X86::VFMADDSUB213PDr },
+ { X86::VFMADDSUB213PSZ128m , X86::VFMADDSUB213PSm },
+ { X86::VFMADDSUB213PSZ128r , X86::VFMADDSUB213PSr },
+ { X86::VFMADDSUB231PDZ128m , X86::VFMADDSUB231PDm },
+ { X86::VFMADDSUB231PDZ128r , X86::VFMADDSUB231PDr },
+ { X86::VFMADDSUB231PSZ128m , X86::VFMADDSUB231PSm },
+ { X86::VFMADDSUB231PSZ128r , X86::VFMADDSUB231PSr },
+ { X86::VFMSUB132PDZ128m , X86::VFMSUB132PDm },
+ { X86::VFMSUB132PDZ128r , X86::VFMSUB132PDr },
+ { X86::VFMSUB132PSZ128m , X86::VFMSUB132PSm },
+ { X86::VFMSUB132PSZ128r , X86::VFMSUB132PSr },
+ { X86::VFMSUB213PDZ128m , X86::VFMSUB213PDm },
+ { X86::VFMSUB213PDZ128r , X86::VFMSUB213PDr },
+ { X86::VFMSUB213PSZ128m , X86::VFMSUB213PSm },
+ { X86::VFMSUB213PSZ128r , X86::VFMSUB213PSr },
+ { X86::VFMSUB231PDZ128m , X86::VFMSUB231PDm },
+ { X86::VFMSUB231PDZ128r , X86::VFMSUB231PDr },
+ { X86::VFMSUB231PSZ128m , X86::VFMSUB231PSm },
+ { X86::VFMSUB231PSZ128r , X86::VFMSUB231PSr },
+ { X86::VFMSUBADD132PDZ128m , X86::VFMSUBADD132PDm },
+ { X86::VFMSUBADD132PDZ128r , X86::VFMSUBADD132PDr },
+ { X86::VFMSUBADD132PSZ128m , X86::VFMSUBADD132PSm },
+ { X86::VFMSUBADD132PSZ128r , X86::VFMSUBADD132PSr },
+ { X86::VFMSUBADD213PDZ128m , X86::VFMSUBADD213PDm },
+ { X86::VFMSUBADD213PDZ128r , X86::VFMSUBADD213PDr },
+ { X86::VFMSUBADD213PSZ128m , X86::VFMSUBADD213PSm },
+ { X86::VFMSUBADD213PSZ128r , X86::VFMSUBADD213PSr },
+ { X86::VFMSUBADD231PDZ128m , X86::VFMSUBADD231PDm },
+ { X86::VFMSUBADD231PDZ128r , X86::VFMSUBADD231PDr },
+ { X86::VFMSUBADD231PSZ128m , X86::VFMSUBADD231PSm },
+ { X86::VFMSUBADD231PSZ128r , X86::VFMSUBADD231PSr },
+ { X86::VFNMADD132PDZ128m , X86::VFNMADD132PDm },
+ { X86::VFNMADD132PDZ128r , X86::VFNMADD132PDr },
+ { X86::VFNMADD132PSZ128m , X86::VFNMADD132PSm },
+ { X86::VFNMADD132PSZ128r , X86::VFNMADD132PSr },
+ { X86::VFNMADD213PDZ128m , X86::VFNMADD213PDm },
+ { X86::VFNMADD213PDZ128r , X86::VFNMADD213PDr },
+ { X86::VFNMADD213PSZ128m , X86::VFNMADD213PSm },
+ { X86::VFNMADD213PSZ128r , X86::VFNMADD213PSr },
+ { X86::VFNMADD231PDZ128m , X86::VFNMADD231PDm },
+ { X86::VFNMADD231PDZ128r , X86::VFNMADD231PDr },
+ { X86::VFNMADD231PSZ128m , X86::VFNMADD231PSm },
+ { X86::VFNMADD231PSZ128r , X86::VFNMADD231PSr },
+ { X86::VFNMSUB132PDZ128m , X86::VFNMSUB132PDm },
+ { X86::VFNMSUB132PDZ128r , X86::VFNMSUB132PDr },
+ { X86::VFNMSUB132PSZ128m , X86::VFNMSUB132PSm },
+ { X86::VFNMSUB132PSZ128r , X86::VFNMSUB132PSr },
+ { X86::VFNMSUB213PDZ128m , X86::VFNMSUB213PDm },
+ { X86::VFNMSUB213PDZ128r , X86::VFNMSUB213PDr },
+ { X86::VFNMSUB213PSZ128m , X86::VFNMSUB213PSm },
+ { X86::VFNMSUB213PSZ128r , X86::VFNMSUB213PSr },
+ { X86::VFNMSUB231PDZ128m , X86::VFNMSUB231PDm },
+ { X86::VFNMSUB231PDZ128r , X86::VFNMSUB231PDr },
+ { X86::VFNMSUB231PSZ128m , X86::VFNMSUB231PSm },
+ { X86::VFNMSUB231PSZ128r , X86::VFNMSUB231PSr },
+ { X86::VMAXCPDZ128rm , X86::VMAXCPDrm },
+ { X86::VMAXCPDZ128rr , X86::VMAXCPDrr },
+ { X86::VMAXCPSZ128rm , X86::VMAXCPSrm },
+ { X86::VMAXCPSZ128rr , X86::VMAXCPSrr },
+ { X86::VMAXPDZ128rm , X86::VMAXPDrm },
+ { X86::VMAXPDZ128rr , X86::VMAXPDrr },
+ { X86::VMAXPSZ128rm , X86::VMAXPSrm },
+ { X86::VMAXPSZ128rr , X86::VMAXPSrr },
+ { X86::VMINCPDZ128rm , X86::VMINCPDrm },
+ { X86::VMINCPDZ128rr , X86::VMINCPDrr },
+ { X86::VMINCPSZ128rm , X86::VMINCPSrm },
+ { X86::VMINCPSZ128rr , X86::VMINCPSrr },
+ { X86::VMINPDZ128rm , X86::VMINPDrm },
+ { X86::VMINPDZ128rr , X86::VMINPDrr },
+ { X86::VMINPSZ128rm , X86::VMINPSrm },
+ { X86::VMINPSZ128rr , X86::VMINPSrr },
+ { X86::VMOVAPDZ128mr , X86::VMOVAPDmr },
+ { X86::VMOVAPDZ128rm , X86::VMOVAPDrm },
+ { X86::VMOVAPDZ128rr , X86::VMOVAPDrr },
+ { X86::VMOVAPDZ128rr_REV , X86::VMOVAPDrr_REV },
+ { X86::VMOVAPSZ128mr , X86::VMOVAPSmr },
+ { X86::VMOVAPSZ128rm , X86::VMOVAPSrm },
+ { X86::VMOVAPSZ128rr , X86::VMOVAPSrr },
+ { X86::VMOVAPSZ128rr_REV , X86::VMOVAPSrr_REV },
+ { X86::VMOVDDUPZ128rm , X86::VMOVDDUPrm },
+ { X86::VMOVDDUPZ128rr , X86::VMOVDDUPrr },
+ { X86::VMOVDQA32Z128mr , X86::VMOVDQAmr },
+ { X86::VMOVDQA32Z128rm , X86::VMOVDQArm },
+ { X86::VMOVDQA32Z128rr , X86::VMOVDQArr },
+ { X86::VMOVDQA32Z128rr_REV , X86::VMOVDQArr_REV },
+ { X86::VMOVDQA64Z128mr , X86::VMOVDQAmr },
+ { X86::VMOVDQA64Z128rm , X86::VMOVDQArm },
+ { X86::VMOVDQA64Z128rr , X86::VMOVDQArr },
+ { X86::VMOVDQA64Z128rr_REV , X86::VMOVDQArr_REV },
+ { X86::VMOVDQU16Z128mr , X86::VMOVDQUmr },
+ { X86::VMOVDQU16Z128rm , X86::VMOVDQUrm },
+ { X86::VMOVDQU16Z128rr , X86::VMOVDQUrr },
+ { X86::VMOVDQU16Z128rr_REV , X86::VMOVDQUrr_REV },
+ { X86::VMOVDQU32Z128mr , X86::VMOVDQUmr },
+ { X86::VMOVDQU32Z128rm , X86::VMOVDQUrm },
+ { X86::VMOVDQU32Z128rr , X86::VMOVDQUrr },
+ { X86::VMOVDQU32Z128rr_REV , X86::VMOVDQUrr_REV },
+ { X86::VMOVDQU64Z128mr , X86::VMOVDQUmr },
+ { X86::VMOVDQU64Z128rm , X86::VMOVDQUrm },
+ { X86::VMOVDQU64Z128rr , X86::VMOVDQUrr },
+ { X86::VMOVDQU64Z128rr_REV , X86::VMOVDQUrr_REV },
+ { X86::VMOVDQU8Z128mr , X86::VMOVDQUmr },
+ { X86::VMOVDQU8Z128rm , X86::VMOVDQUrm },
+ { X86::VMOVDQU8Z128rr , X86::VMOVDQUrr },
+ { X86::VMOVDQU8Z128rr_REV , X86::VMOVDQUrr_REV },
+ { X86::VMOVHPDZ128mr , X86::VMOVHPDmr },
+ { X86::VMOVHPDZ128rm , X86::VMOVHPDrm },
+ { X86::VMOVHPSZ128mr , X86::VMOVHPSmr },
+ { X86::VMOVHPSZ128rm , X86::VMOVHPSrm },
+ { X86::VMOVLPDZ128mr , X86::VMOVLPDmr },
+ { X86::VMOVLPDZ128rm , X86::VMOVLPDrm },
+ { X86::VMOVLPSZ128mr , X86::VMOVLPSmr },
+ { X86::VMOVLPSZ128rm , X86::VMOVLPSrm },
+ { X86::VMOVNTDQAZ128rm , X86::VMOVNTDQArm },
+ { X86::VMOVNTDQZ128mr , X86::VMOVNTDQmr },
+ { X86::VMOVNTPDZ128mr , X86::VMOVNTPDmr },
+ { X86::VMOVNTPSZ128mr , X86::VMOVNTPSmr },
+ { X86::VMOVSHDUPZ128rm , X86::VMOVSHDUPrm },
+ { X86::VMOVSHDUPZ128rr , X86::VMOVSHDUPrr },
+ { X86::VMOVSLDUPZ128rm , X86::VMOVSLDUPrm },
+ { X86::VMOVSLDUPZ128rr , X86::VMOVSLDUPrr },
+ { X86::VMOVUPDZ128mr , X86::VMOVUPDmr },
+ { X86::VMOVUPDZ128rm , X86::VMOVUPDrm },
+ { X86::VMOVUPDZ128rr , X86::VMOVUPDrr },
+ { X86::VMOVUPDZ128rr_REV , X86::VMOVUPDrr_REV },
+ { X86::VMOVUPSZ128mr , X86::VMOVUPSmr },
+ { X86::VMOVUPSZ128rm , X86::VMOVUPSrm },
+ { X86::VMOVUPSZ128rr , X86::VMOVUPSrr },
+ { X86::VMOVUPSZ128rr_REV , X86::VMOVUPSrr_REV },
+ { X86::VMULPDZ128rm , X86::VMULPDrm },
+ { X86::VMULPDZ128rr , X86::VMULPDrr },
+ { X86::VMULPSZ128rm , X86::VMULPSrm },
+ { X86::VMULPSZ128rr , X86::VMULPSrr },
+ { X86::VORPDZ128rm , X86::VORPDrm },
+ { X86::VORPDZ128rr , X86::VORPDrr },
+ { X86::VORPSZ128rm , X86::VORPSrm },
+ { X86::VORPSZ128rr , X86::VORPSrr },
+ { X86::VPABSBZ128rm , X86::VPABSBrm },
+ { X86::VPABSBZ128rr , X86::VPABSBrr },
+ { X86::VPABSDZ128rm , X86::VPABSDrm },
+ { X86::VPABSDZ128rr , X86::VPABSDrr },
+ { X86::VPABSWZ128rm , X86::VPABSWrm },
+ { X86::VPABSWZ128rr , X86::VPABSWrr },
+ { X86::VPACKSSDWZ128rm , X86::VPACKSSDWrm },
+ { X86::VPACKSSDWZ128rr , X86::VPACKSSDWrr },
+ { X86::VPACKSSWBZ128rm , X86::VPACKSSWBrm },
+ { X86::VPACKSSWBZ128rr , X86::VPACKSSWBrr },
+ { X86::VPACKUSDWZ128rm , X86::VPACKUSDWrm },
+ { X86::VPACKUSDWZ128rr , X86::VPACKUSDWrr },
+ { X86::VPACKUSWBZ128rm , X86::VPACKUSWBrm },
+ { X86::VPACKUSWBZ128rr , X86::VPACKUSWBrr },
+ { X86::VPADDBZ128rm , X86::VPADDBrm },
+ { X86::VPADDBZ128rr , X86::VPADDBrr },
+ { X86::VPADDDZ128rm , X86::VPADDDrm },
+ { X86::VPADDDZ128rr , X86::VPADDDrr },
+ { X86::VPADDQZ128rm , X86::VPADDQrm },
+ { X86::VPADDQZ128rr , X86::VPADDQrr },
+ { X86::VPADDSBZ128rm , X86::VPADDSBrm },
+ { X86::VPADDSBZ128rr , X86::VPADDSBrr },
+ { X86::VPADDSWZ128rm , X86::VPADDSWrm },
+ { X86::VPADDSWZ128rr , X86::VPADDSWrr },
+ { X86::VPADDUSBZ128rm , X86::VPADDUSBrm },
+ { X86::VPADDUSBZ128rr , X86::VPADDUSBrr },
+ { X86::VPADDUSWZ128rm , X86::VPADDUSWrm },
+ { X86::VPADDUSWZ128rr , X86::VPADDUSWrr },
+ { X86::VPADDWZ128rm , X86::VPADDWrm },
+ { X86::VPADDWZ128rr , X86::VPADDWrr },
+ { X86::VPALIGNRZ128rmi , X86::VPALIGNRrmi },
+ { X86::VPALIGNRZ128rri , X86::VPALIGNRrri },
+ { X86::VPANDDZ128rm , X86::VPANDrm },
+ { X86::VPANDDZ128rr , X86::VPANDrr },
+ { X86::VPANDQZ128rm , X86::VPANDrm },
+ { X86::VPANDQZ128rr , X86::VPANDrr },
+ { X86::VPAVGBZ128rm , X86::VPAVGBrm },
+ { X86::VPAVGBZ128rr , X86::VPAVGBrr },
+ { X86::VPAVGWZ128rm , X86::VPAVGWrm },
+ { X86::VPAVGWZ128rr , X86::VPAVGWrr },
+ { X86::VPBROADCASTBZ128m , X86::VPBROADCASTBrm },
+ { X86::VPBROADCASTBZ128r , X86::VPBROADCASTBrr },
+ { X86::VPBROADCASTDZ128m , X86::VPBROADCASTDrm },
+ { X86::VPBROADCASTDZ128r , X86::VPBROADCASTDrr },
+ { X86::VPBROADCASTQZ128m , X86::VPBROADCASTQrm },
+ { X86::VPBROADCASTQZ128r , X86::VPBROADCASTQrr },
+ { X86::VPBROADCASTWZ128m , X86::VPBROADCASTWrm },
+ { X86::VPBROADCASTWZ128r , X86::VPBROADCASTWrr },
+ { X86::VPERMILPDZ128mi , X86::VPERMILPDmi },
+ { X86::VPERMILPDZ128ri , X86::VPERMILPDri },
+ { X86::VPERMILPDZ128rm , X86::VPERMILPDrm },
+ { X86::VPERMILPDZ128rr , X86::VPERMILPDrr },
+ { X86::VPERMILPSZ128mi , X86::VPERMILPSmi },
+ { X86::VPERMILPSZ128ri , X86::VPERMILPSri },
+ { X86::VPERMILPSZ128rm , X86::VPERMILPSrm },
+ { X86::VPERMILPSZ128rr , X86::VPERMILPSrr },
+ { X86::VPMADDUBSWZ128rm , X86::VPMADDUBSWrm },
+ { X86::VPMADDUBSWZ128rr , X86::VPMADDUBSWrr },
+ { X86::VPMADDWDZ128rm , X86::VPMADDWDrm },
+ { X86::VPMADDWDZ128rr , X86::VPMADDWDrr },
+ { X86::VPMAXSBZ128rm , X86::VPMAXSBrm },
+ { X86::VPMAXSBZ128rr , X86::VPMAXSBrr },
+ { X86::VPMAXSDZ128rm , X86::VPMAXSDrm },
+ { X86::VPMAXSDZ128rr , X86::VPMAXSDrr },
+ { X86::VPMAXSWZ128rm , X86::VPMAXSWrm },
+ { X86::VPMAXSWZ128rr , X86::VPMAXSWrr },
+ { X86::VPMAXUBZ128rm , X86::VPMAXUBrm },
+ { X86::VPMAXUBZ128rr , X86::VPMAXUBrr },
+ { X86::VPMAXUDZ128rm , X86::VPMAXUDrm },
+ { X86::VPMAXUDZ128rr , X86::VPMAXUDrr },
+ { X86::VPMAXUWZ128rm , X86::VPMAXUWrm },
+ { X86::VPMAXUWZ128rr , X86::VPMAXUWrr },
+ { X86::VPMINSBZ128rm , X86::VPMINSBrm },
+ { X86::VPMINSBZ128rr , X86::VPMINSBrr },
+ { X86::VPMINSDZ128rm , X86::VPMINSDrm },
+ { X86::VPMINSDZ128rr , X86::VPMINSDrr },
+ { X86::VPMINSWZ128rm , X86::VPMINSWrm },
+ { X86::VPMINSWZ128rr , X86::VPMINSWrr },
+ { X86::VPMINUBZ128rm , X86::VPMINUBrm },
+ { X86::VPMINUBZ128rr , X86::VPMINUBrr },
+ { X86::VPMINUDZ128rm , X86::VPMINUDrm },
+ { X86::VPMINUDZ128rr , X86::VPMINUDrr },
+ { X86::VPMINUWZ128rm , X86::VPMINUWrm },
+ { X86::VPMINUWZ128rr , X86::VPMINUWrr },
+ { X86::VPMOVSXBDZ128rm , X86::VPMOVSXBDrm },
+ { X86::VPMOVSXBDZ128rr , X86::VPMOVSXBDrr },
+ { X86::VPMOVSXBQZ128rm , X86::VPMOVSXBQrm },
+ { X86::VPMOVSXBQZ128rr , X86::VPMOVSXBQrr },
+ { X86::VPMOVSXBWZ128rm , X86::VPMOVSXBWrm },
+ { X86::VPMOVSXBWZ128rr , X86::VPMOVSXBWrr },
+ { X86::VPMOVSXDQZ128rm , X86::VPMOVSXDQrm },
+ { X86::VPMOVSXDQZ128rr , X86::VPMOVSXDQrr },
+ { X86::VPMOVSXWDZ128rm , X86::VPMOVSXWDrm },
+ { X86::VPMOVSXWDZ128rr , X86::VPMOVSXWDrr },
+ { X86::VPMOVSXWQZ128rm , X86::VPMOVSXWQrm },
+ { X86::VPMOVSXWQZ128rr , X86::VPMOVSXWQrr },
+ { X86::VPMOVZXBDZ128rm , X86::VPMOVZXBDrm },
+ { X86::VPMOVZXBDZ128rr , X86::VPMOVZXBDrr },
+ { X86::VPMOVZXBQZ128rm , X86::VPMOVZXBQrm },
+ { X86::VPMOVZXBQZ128rr , X86::VPMOVZXBQrr },
+ { X86::VPMOVZXBWZ128rm , X86::VPMOVZXBWrm },
+ { X86::VPMOVZXBWZ128rr , X86::VPMOVZXBWrr },
+ { X86::VPMOVZXDQZ128rm , X86::VPMOVZXDQrm },
+ { X86::VPMOVZXDQZ128rr , X86::VPMOVZXDQrr },
+ { X86::VPMOVZXWDZ128rm , X86::VPMOVZXWDrm },
+ { X86::VPMOVZXWDZ128rr , X86::VPMOVZXWDrr },
+ { X86::VPMOVZXWQZ128rm , X86::VPMOVZXWQrm },
+ { X86::VPMOVZXWQZ128rr , X86::VPMOVZXWQrr },
+ { X86::VPMULDQZ128rm , X86::VPMULDQrm },
+ { X86::VPMULDQZ128rr , X86::VPMULDQrr },
+ { X86::VPMULHRSWZ128rm , X86::VPMULHRSWrm },
+ { X86::VPMULHRSWZ128rr , X86::VPMULHRSWrr },
+ { X86::VPMULHUWZ128rm , X86::VPMULHUWrm },
+ { X86::VPMULHUWZ128rr , X86::VPMULHUWrr },
+ { X86::VPMULHWZ128rm , X86::VPMULHWrm },
+ { X86::VPMULHWZ128rr , X86::VPMULHWrr },
+ { X86::VPMULLDZ128rm , X86::VPMULLDrm },
+ { X86::VPMULLDZ128rr , X86::VPMULLDrr },
+ { X86::VPMULLWZ128rm , X86::VPMULLWrm },
+ { X86::VPMULLWZ128rr , X86::VPMULLWrr },
+ { X86::VPMULUDQZ128rm , X86::VPMULUDQrm },
+ { X86::VPMULUDQZ128rr , X86::VPMULUDQrr },
+ { X86::VPORDZ128rm , X86::VPORrm },
+ { X86::VPORDZ128rr , X86::VPORrr },
+ { X86::VPORQZ128rm , X86::VPORrm },
+ { X86::VPORQZ128rr , X86::VPORrr },
+ { X86::VPSADBWZ128rm , X86::VPSADBWrm },
+ { X86::VPSADBWZ128rr , X86::VPSADBWrr },
+ { X86::VPSHUFBZ128rm , X86::VPSHUFBrm },
+ { X86::VPSHUFBZ128rr , X86::VPSHUFBrr },
+ { X86::VPSHUFDZ128mi , X86::VPSHUFDmi },
+ { X86::VPSHUFDZ128ri , X86::VPSHUFDri },
+ { X86::VPSHUFHWZ128mi , X86::VPSHUFHWmi },
+ { X86::VPSHUFHWZ128ri , X86::VPSHUFHWri },
+ { X86::VPSHUFLWZ128mi , X86::VPSHUFLWmi },
+ { X86::VPSHUFLWZ128ri , X86::VPSHUFLWri },
+ { X86::VPSLLDQZ128rr , X86::VPSLLDQri },
+ { X86::VPSLLDZ128ri , X86::VPSLLDri },
+ { X86::VPSLLDZ128rm , X86::VPSLLDrm },
+ { X86::VPSLLDZ128rr , X86::VPSLLDrr },
+ { X86::VPSLLQZ128ri , X86::VPSLLQri },
+ { X86::VPSLLQZ128rm , X86::VPSLLQrm },
+ { X86::VPSLLQZ128rr , X86::VPSLLQrr },
+ { X86::VPSLLVDZ128rm , X86::VPSLLVDrm },
+ { X86::VPSLLVDZ128rr , X86::VPSLLVDrr },
+ { X86::VPSLLVQZ128rm , X86::VPSLLVQrm },
+ { X86::VPSLLVQZ128rr , X86::VPSLLVQrr },
+ { X86::VPSLLWZ128ri , X86::VPSLLWri },
+ { X86::VPSLLWZ128rm , X86::VPSLLWrm },
+ { X86::VPSLLWZ128rr , X86::VPSLLWrr },
+ { X86::VPSRADZ128ri , X86::VPSRADri },
+ { X86::VPSRADZ128rm , X86::VPSRADrm },
+ { X86::VPSRADZ128rr , X86::VPSRADrr },
+ { X86::VPSRAVDZ128rm , X86::VPSRAVDrm },
+ { X86::VPSRAVDZ128rr , X86::VPSRAVDrr },
+ { X86::VPSRAWZ128ri , X86::VPSRAWri },
+ { X86::VPSRAWZ128rm , X86::VPSRAWrm },
+ { X86::VPSRAWZ128rr , X86::VPSRAWrr },
+ { X86::VPSRLDQZ128rr , X86::VPSRLDQri },
+ { X86::VPSRLDZ128ri , X86::VPSRLDri },
+ { X86::VPSRLDZ128rm , X86::VPSRLDrm },
+ { X86::VPSRLDZ128rr , X86::VPSRLDrr },
+ { X86::VPSRLQZ128ri , X86::VPSRLQri },
+ { X86::VPSRLQZ128rm , X86::VPSRLQrm },
+ { X86::VPSRLQZ128rr , X86::VPSRLQrr },
+ { X86::VPSRLVDZ128rm , X86::VPSRLVDrm },
+ { X86::VPSRLVDZ128rr , X86::VPSRLVDrr },
+ { X86::VPSRLVQZ128rm , X86::VPSRLVQrm },
+ { X86::VPSRLVQZ128rr , X86::VPSRLVQrr },
+ { X86::VPSRLWZ128ri , X86::VPSRLWri },
+ { X86::VPSRLWZ128rm , X86::VPSRLWrm },
+ { X86::VPSRLWZ128rr , X86::VPSRLWrr },
+ { X86::VPSUBBZ128rm , X86::VPSUBBrm },
+ { X86::VPSUBBZ128rr , X86::VPSUBBrr },
+ { X86::VPSUBDZ128rm , X86::VPSUBDrm },
+ { X86::VPSUBDZ128rr , X86::VPSUBDrr },
+ { X86::VPSUBQZ128rm , X86::VPSUBQrm },
+ { X86::VPSUBQZ128rr , X86::VPSUBQrr },
+ { X86::VPSUBSBZ128rm , X86::VPSUBSBrm },
+ { X86::VPSUBSBZ128rr , X86::VPSUBSBrr },
+ { X86::VPSUBSWZ128rm , X86::VPSUBSWrm },
+ { X86::VPSUBSWZ128rr , X86::VPSUBSWrr },
+ { X86::VPSUBUSBZ128rm , X86::VPSUBUSBrm },
+ { X86::VPSUBUSBZ128rr , X86::VPSUBUSBrr },
+ { X86::VPSUBUSWZ128rm , X86::VPSUBUSWrm },
+ { X86::VPSUBUSWZ128rr , X86::VPSUBUSWrr },
+ { X86::VPSUBWZ128rm , X86::VPSUBWrm },
+ { X86::VPSUBWZ128rr , X86::VPSUBWrr },
+ { X86::VPUNPCKHBWZ128rm , X86::VPUNPCKHBWrm },
+ { X86::VPUNPCKHBWZ128rr , X86::VPUNPCKHBWrr },
+ { X86::VPUNPCKHDQZ128rm , X86::VPUNPCKHDQrm },
+ { X86::VPUNPCKHDQZ128rr , X86::VPUNPCKHDQrr },
+ { X86::VPUNPCKHQDQZ128rm , X86::VPUNPCKHQDQrm },
+ { X86::VPUNPCKHQDQZ128rr , X86::VPUNPCKHQDQrr },
+ { X86::VPUNPCKHWDZ128rm , X86::VPUNPCKHWDrm },
+ { X86::VPUNPCKHWDZ128rr , X86::VPUNPCKHWDrr },
+ { X86::VPUNPCKLBWZ128rm , X86::VPUNPCKLBWrm },
+ { X86::VPUNPCKLBWZ128rr , X86::VPUNPCKLBWrr },
+ { X86::VPUNPCKLDQZ128rm , X86::VPUNPCKLDQrm },
+ { X86::VPUNPCKLDQZ128rr , X86::VPUNPCKLDQrr },
+ { X86::VPUNPCKLQDQZ128rm , X86::VPUNPCKLQDQrm },
+ { X86::VPUNPCKLQDQZ128rr , X86::VPUNPCKLQDQrr },
+ { X86::VPUNPCKLWDZ128rm , X86::VPUNPCKLWDrm },
+ { X86::VPUNPCKLWDZ128rr , X86::VPUNPCKLWDrr },
+ { X86::VPXORDZ128rm , X86::VPXORrm },
+ { X86::VPXORDZ128rr , X86::VPXORrr },
+ { X86::VPXORQZ128rm , X86::VPXORrm },
+ { X86::VPXORQZ128rr , X86::VPXORrr },
+ { X86::VSHUFPDZ128rmi , X86::VSHUFPDrmi },
+ { X86::VSHUFPDZ128rri , X86::VSHUFPDrri },
+ { X86::VSHUFPSZ128rmi , X86::VSHUFPSrmi },
+ { X86::VSHUFPSZ128rri , X86::VSHUFPSrri },
+ { X86::VSQRTPDZ128m , X86::VSQRTPDm },
+ { X86::VSQRTPDZ128r , X86::VSQRTPDr },
+ { X86::VSQRTPSZ128m , X86::VSQRTPSm },
+ { X86::VSQRTPSZ128r , X86::VSQRTPSr },
+ { X86::VSUBPDZ128rm , X86::VSUBPDrm },
+ { X86::VSUBPDZ128rr , X86::VSUBPDrr },
+ { X86::VSUBPSZ128rm , X86::VSUBPSrm },
+ { X86::VSUBPSZ128rr , X86::VSUBPSrr },
+ { X86::VUNPCKHPDZ128rm , X86::VUNPCKHPDrm },
+ { X86::VUNPCKHPDZ128rr , X86::VUNPCKHPDrr },
+ { X86::VUNPCKHPSZ128rm , X86::VUNPCKHPSrm },
+ { X86::VUNPCKHPSZ128rr , X86::VUNPCKHPSrr },
+ { X86::VUNPCKLPDZ128rm , X86::VUNPCKLPDrm },
+ { X86::VUNPCKLPDZ128rr , X86::VUNPCKLPDrr },
+ { X86::VUNPCKLPSZ128rm , X86::VUNPCKLPSrm },
+ { X86::VUNPCKLPSZ128rr , X86::VUNPCKLPSrr },
+ { X86::VXORPDZ128rm , X86::VXORPDrm },
+ { X86::VXORPDZ128rr , X86::VXORPDrr },
+ { X86::VXORPSZ128rm , X86::VXORPSrm },
+ { X86::VXORPSZ128rr , X86::VXORPSrr },
+};
+
+
+// X86 EVEX encoded instructions that have a VEX 256 encoding
+// (table format: <EVEX opcode, VEX-256 opcode>).
+ static const X86EvexToVexCompressTableEntry X86EvexToVex256CompressTable[] = {
+ { X86::VADDPDZ256rm , X86::VADDPDYrm },
+ { X86::VADDPDZ256rr , X86::VADDPDYrr },
+ { X86::VADDPSZ256rm , X86::VADDPSYrm },
+ { X86::VADDPSZ256rr , X86::VADDPSYrr },
+ { X86::VANDNPDZ256rm , X86::VANDNPDYrm },
+ { X86::VANDNPDZ256rr , X86::VANDNPDYrr },
+ { X86::VANDNPSZ256rm , X86::VANDNPSYrm },
+ { X86::VANDNPSZ256rr , X86::VANDNPSYrr },
+ { X86::VANDPDZ256rm , X86::VANDPDYrm },
+ { X86::VANDPDZ256rr , X86::VANDPDYrr },
+ { X86::VANDPSZ256rm , X86::VANDPSYrm },
+ { X86::VANDPSZ256rr , X86::VANDPSYrr },
+ { X86::VBROADCASTSDZ256m , X86::VBROADCASTSDYrm },
+ { X86::VBROADCASTSDZ256r , X86::VBROADCASTSDYrr },
+ { X86::VBROADCASTSDZ256r_s , X86::VBROADCASTSDYrr },
+ { X86::VBROADCASTSSZ256m , X86::VBROADCASTSSYrm },
+ { X86::VBROADCASTSSZ256r , X86::VBROADCASTSSYrr },
+ { X86::VBROADCASTSSZ256r_s , X86::VBROADCASTSSYrr },
+ { X86::VCVTDQ2PDZ256rm , X86::VCVTDQ2PDYrm },
+ { X86::VCVTDQ2PDZ256rr , X86::VCVTDQ2PDYrr },
+ { X86::VCVTDQ2PSZ256rm , X86::VCVTDQ2PSYrm },
+ { X86::VCVTDQ2PSZ256rr , X86::VCVTDQ2PSYrr },
+ { X86::VCVTPD2DQZ256rm , X86::VCVTPD2DQYrm },
+ { X86::VCVTPD2DQZ256rr , X86::VCVTPD2DQYrr },
+ { X86::VCVTPD2PSZ256rm , X86::VCVTPD2PSYrm },
+ { X86::VCVTPD2PSZ256rr , X86::VCVTPD2PSYrr },
+ { X86::VCVTPH2PSZ256rm , X86::VCVTPH2PSYrm },
+ { X86::VCVTPH2PSZ256rr , X86::VCVTPH2PSYrr },
+ { X86::VCVTPS2DQZ256rm , X86::VCVTPS2DQYrm },
+ { X86::VCVTPS2DQZ256rr , X86::VCVTPS2DQYrr },
+ { X86::VCVTPS2PDZ256rm , X86::VCVTPS2PDYrm },
+ { X86::VCVTPS2PDZ256rr , X86::VCVTPS2PDYrr },
+ { X86::VCVTPS2PHZ256mr , X86::VCVTPS2PHYmr },
+ { X86::VCVTPS2PHZ256rr , X86::VCVTPS2PHYrr },
+ { X86::VCVTTPD2DQZ256rm , X86::VCVTTPD2DQYrm },
+ { X86::VCVTTPD2DQZ256rr , X86::VCVTTPD2DQYrr },
+ { X86::VCVTTPS2DQZ256rm , X86::VCVTTPS2DQYrm },
+ { X86::VCVTTPS2DQZ256rr , X86::VCVTTPS2DQYrr },
+ { X86::VDIVPDZ256rm , X86::VDIVPDYrm },
+ { X86::VDIVPDZ256rr , X86::VDIVPDYrr },
+ { X86::VDIVPSZ256rm , X86::VDIVPSYrm },
+ { X86::VDIVPSZ256rr , X86::VDIVPSYrr },
+ { X86::VEXTRACTF32x4Z256mr , X86::VEXTRACTF128mr },
+ { X86::VEXTRACTF64x2Z256mr , X86::VEXTRACTF128mr },
+ { X86::VEXTRACTF32x4Z256rr , X86::VEXTRACTF128rr },
+ { X86::VEXTRACTF64x2Z256rr , X86::VEXTRACTF128rr },
+ { X86::VEXTRACTI32x4Z256mr , X86::VEXTRACTI128mr },
+ { X86::VEXTRACTI64x2Z256mr , X86::VEXTRACTI128mr },
+ { X86::VEXTRACTI32x4Z256rr , X86::VEXTRACTI128rr },
+ { X86::VEXTRACTI64x2Z256rr , X86::VEXTRACTI128rr },
+ { X86::VFMADD132PDZ256m , X86::VFMADD132PDYm },
+ { X86::VFMADD132PDZ256r , X86::VFMADD132PDYr },
+ { X86::VFMADD132PSZ256m , X86::VFMADD132PSYm },
+ { X86::VFMADD132PSZ256r , X86::VFMADD132PSYr },
+ { X86::VFMADD213PDZ256m , X86::VFMADD213PDYm },
+ { X86::VFMADD213PDZ256r , X86::VFMADD213PDYr },
+ { X86::VFMADD213PSZ256m , X86::VFMADD213PSYm },
+ { X86::VFMADD213PSZ256r , X86::VFMADD213PSYr },
+ { X86::VFMADD231PDZ256m , X86::VFMADD231PDYm },
+ { X86::VFMADD231PDZ256r , X86::VFMADD231PDYr },
+ { X86::VFMADD231PSZ256m , X86::VFMADD231PSYm },
+ { X86::VFMADD231PSZ256r , X86::VFMADD231PSYr },
+ { X86::VFMADDSUB132PDZ256m , X86::VFMADDSUB132PDYm },
+ { X86::VFMADDSUB132PDZ256r , X86::VFMADDSUB132PDYr },
+ { X86::VFMADDSUB132PSZ256m , X86::VFMADDSUB132PSYm },
+ { X86::VFMADDSUB132PSZ256r , X86::VFMADDSUB132PSYr },
+ { X86::VFMADDSUB213PDZ256m , X86::VFMADDSUB213PDYm },
+ { X86::VFMADDSUB213PDZ256r , X86::VFMADDSUB213PDYr },
+ { X86::VFMADDSUB213PSZ256m , X86::VFMADDSUB213PSYm },
+ { X86::VFMADDSUB213PSZ256r , X86::VFMADDSUB213PSYr },
+ { X86::VFMADDSUB231PDZ256m , X86::VFMADDSUB231PDYm },
+ { X86::VFMADDSUB231PDZ256r , X86::VFMADDSUB231PDYr },
+ { X86::VFMADDSUB231PSZ256m , X86::VFMADDSUB231PSYm },
+ { X86::VFMADDSUB231PSZ256r , X86::VFMADDSUB231PSYr },
+ { X86::VFMSUB132PDZ256m , X86::VFMSUB132PDYm },
+ { X86::VFMSUB132PDZ256r , X86::VFMSUB132PDYr },
+ { X86::VFMSUB132PSZ256m , X86::VFMSUB132PSYm },
+ { X86::VFMSUB132PSZ256r , X86::VFMSUB132PSYr },
+ { X86::VFMSUB213PDZ256m , X86::VFMSUB213PDYm },
+ { X86::VFMSUB213PDZ256r , X86::VFMSUB213PDYr },
+ { X86::VFMSUB213PSZ256m , X86::VFMSUB213PSYm },
+ { X86::VFMSUB213PSZ256r , X86::VFMSUB213PSYr },
+ { X86::VFMSUB231PDZ256m , X86::VFMSUB231PDYm },
+ { X86::VFMSUB231PDZ256r , X86::VFMSUB231PDYr },
+ { X86::VFMSUB231PSZ256m , X86::VFMSUB231PSYm },
+ { X86::VFMSUB231PSZ256r , X86::VFMSUB231PSYr },
+ { X86::VFMSUBADD132PDZ256m , X86::VFMSUBADD132PDYm },
+ { X86::VFMSUBADD132PDZ256r , X86::VFMSUBADD132PDYr },
+ { X86::VFMSUBADD132PSZ256m , X86::VFMSUBADD132PSYm },
+ { X86::VFMSUBADD132PSZ256r , X86::VFMSUBADD132PSYr },
+ { X86::VFMSUBADD213PDZ256m , X86::VFMSUBADD213PDYm },
+ { X86::VFMSUBADD213PDZ256r , X86::VFMSUBADD213PDYr },
+ { X86::VFMSUBADD213PSZ256m , X86::VFMSUBADD213PSYm },
+ { X86::VFMSUBADD213PSZ256r , X86::VFMSUBADD213PSYr },
+ { X86::VFMSUBADD231PDZ256m , X86::VFMSUBADD231PDYm },
+ { X86::VFMSUBADD231PDZ256r , X86::VFMSUBADD231PDYr },
+ { X86::VFMSUBADD231PSZ256m , X86::VFMSUBADD231PSYm },
+ { X86::VFMSUBADD231PSZ256r , X86::VFMSUBADD231PSYr },
+ { X86::VFNMADD132PDZ256m , X86::VFNMADD132PDYm },
+ { X86::VFNMADD132PDZ256r , X86::VFNMADD132PDYr },
+ { X86::VFNMADD132PSZ256m , X86::VFNMADD132PSYm },
+ { X86::VFNMADD132PSZ256r , X86::VFNMADD132PSYr },
+ { X86::VFNMADD213PDZ256m , X86::VFNMADD213PDYm },
+ { X86::VFNMADD213PDZ256r , X86::VFNMADD213PDYr },
+ { X86::VFNMADD213PSZ256m , X86::VFNMADD213PSYm },
+ { X86::VFNMADD213PSZ256r , X86::VFNMADD213PSYr },
+ { X86::VFNMADD231PDZ256m , X86::VFNMADD231PDYm },
+ { X86::VFNMADD231PDZ256r , X86::VFNMADD231PDYr },
+ { X86::VFNMADD231PSZ256m , X86::VFNMADD231PSYm },
+ { X86::VFNMADD231PSZ256r , X86::VFNMADD231PSYr },
+ { X86::VFNMSUB132PDZ256m , X86::VFNMSUB132PDYm },
+ { X86::VFNMSUB132PDZ256r , X86::VFNMSUB132PDYr },
+ { X86::VFNMSUB132PSZ256m , X86::VFNMSUB132PSYm },
+ { X86::VFNMSUB132PSZ256r , X86::VFNMSUB132PSYr },
+ { X86::VFNMSUB213PDZ256m , X86::VFNMSUB213PDYm },
+ { X86::VFNMSUB213PDZ256r , X86::VFNMSUB213PDYr },
+ { X86::VFNMSUB213PSZ256m , X86::VFNMSUB213PSYm },
+ { X86::VFNMSUB213PSZ256r , X86::VFNMSUB213PSYr },
+ { X86::VFNMSUB231PDZ256m , X86::VFNMSUB231PDYm },
+ { X86::VFNMSUB231PDZ256r , X86::VFNMSUB231PDYr },
+ { X86::VFNMSUB231PSZ256m , X86::VFNMSUB231PSYm },
+ { X86::VFNMSUB231PSZ256r , X86::VFNMSUB231PSYr },
+ { X86::VINSERTF32x4Z256rm , X86::VINSERTF128rm },
+ { X86::VINSERTF64x2Z256rm , X86::VINSERTF128rm },
+ { X86::VINSERTF32x4Z256rr , X86::VINSERTF128rr },
+ { X86::VINSERTF64x2Z256rr , X86::VINSERTF128rr },
+ { X86::VINSERTI32x4Z256rm , X86::VINSERTI128rm },
+ { X86::VINSERTI64x2Z256rm , X86::VINSERTI128rm },
+ { X86::VINSERTI32x4Z256rr , X86::VINSERTI128rr },
+ { X86::VINSERTI64x2Z256rr , X86::VINSERTI128rr },
+ { X86::VMAXCPDZ256rm , X86::VMAXCPDYrm },
+ { X86::VMAXCPDZ256rr , X86::VMAXCPDYrr },
+ { X86::VMAXCPSZ256rm , X86::VMAXCPSYrm },
+ { X86::VMAXCPSZ256rr , X86::VMAXCPSYrr },
+ { X86::VMAXPDZ256rm , X86::VMAXPDYrm },
+ { X86::VMAXPDZ256rr , X86::VMAXPDYrr },
+ { X86::VMAXPSZ256rm , X86::VMAXPSYrm },
+ { X86::VMAXPSZ256rr , X86::VMAXPSYrr },
+ { X86::VMINCPDZ256rm , X86::VMINCPDYrm },
+ { X86::VMINCPDZ256rr , X86::VMINCPDYrr },
+ { X86::VMINCPSZ256rm , X86::VMINCPSYrm },
+ { X86::VMINCPSZ256rr , X86::VMINCPSYrr },
+ { X86::VMINPDZ256rm , X86::VMINPDYrm },
+ { X86::VMINPDZ256rr , X86::VMINPDYrr },
+ { X86::VMINPSZ256rm , X86::VMINPSYrm },
+ { X86::VMINPSZ256rr , X86::VMINPSYrr },
+ { X86::VMOVAPDZ256mr , X86::VMOVAPDYmr },
+ { X86::VMOVAPDZ256rm , X86::VMOVAPDYrm },
+ { X86::VMOVAPDZ256rr , X86::VMOVAPDYrr },
+ { X86::VMOVAPDZ256rr_REV , X86::VMOVAPDYrr_REV },
+ { X86::VMOVAPSZ256mr , X86::VMOVAPSYmr },
+ { X86::VMOVAPSZ256rm , X86::VMOVAPSYrm },
+ { X86::VMOVAPSZ256rr , X86::VMOVAPSYrr },
+ { X86::VMOVAPSZ256rr_REV , X86::VMOVAPSYrr_REV },
+ { X86::VMOVDDUPZ256rm , X86::VMOVDDUPYrm },
+ { X86::VMOVDDUPZ256rr , X86::VMOVDDUPYrr },
+ { X86::VMOVDQA32Z256mr , X86::VMOVDQAYmr },
+ { X86::VMOVDQA32Z256rm , X86::VMOVDQAYrm },
+ { X86::VMOVDQA32Z256rr , X86::VMOVDQAYrr },
+ { X86::VMOVDQA32Z256rr_REV , X86::VMOVDQAYrr_REV },
+ { X86::VMOVDQA64Z256mr , X86::VMOVDQAYmr },
+ { X86::VMOVDQA64Z256rm , X86::VMOVDQAYrm },
+ { X86::VMOVDQA64Z256rr , X86::VMOVDQAYrr },
+ { X86::VMOVDQA64Z256rr_REV , X86::VMOVDQAYrr_REV },
+ { X86::VMOVDQU16Z256mr , X86::VMOVDQUYmr },
+ { X86::VMOVDQU16Z256rm , X86::VMOVDQUYrm },
+ { X86::VMOVDQU16Z256rr , X86::VMOVDQUYrr },
+ { X86::VMOVDQU16Z256rr_REV , X86::VMOVDQUYrr_REV },
+ { X86::VMOVDQU32Z256mr , X86::VMOVDQUYmr },
+ { X86::VMOVDQU32Z256rm , X86::VMOVDQUYrm },
+ { X86::VMOVDQU32Z256rr , X86::VMOVDQUYrr },
+ { X86::VMOVDQU32Z256rr_REV , X86::VMOVDQUYrr_REV },
+ { X86::VMOVDQU64Z256mr , X86::VMOVDQUYmr },
+ { X86::VMOVDQU64Z256rm , X86::VMOVDQUYrm },
+ { X86::VMOVDQU64Z256rr , X86::VMOVDQUYrr },
+ { X86::VMOVDQU64Z256rr_REV , X86::VMOVDQUYrr_REV },
+ { X86::VMOVDQU8Z256mr , X86::VMOVDQUYmr },
+ { X86::VMOVDQU8Z256rm , X86::VMOVDQUYrm },
+ { X86::VMOVDQU8Z256rr , X86::VMOVDQUYrr },
+ { X86::VMOVDQU8Z256rr_REV , X86::VMOVDQUYrr_REV },
+ { X86::VMOVNTDQAZ256rm , X86::VMOVNTDQAYrm },
+ { X86::VMOVNTDQZ256mr , X86::VMOVNTDQYmr },
+ { X86::VMOVNTPDZ256mr , X86::VMOVNTPDYmr },
+ { X86::VMOVNTPSZ256mr , X86::VMOVNTPSYmr },
+ { X86::VMOVSHDUPZ256rm , X86::VMOVSHDUPYrm },
+ { X86::VMOVSHDUPZ256rr , X86::VMOVSHDUPYrr },
+ { X86::VMOVSLDUPZ256rm , X86::VMOVSLDUPYrm },
+ { X86::VMOVSLDUPZ256rr , X86::VMOVSLDUPYrr },
+ { X86::VMOVUPDZ256mr , X86::VMOVUPDYmr },
+ { X86::VMOVUPDZ256rm , X86::VMOVUPDYrm },
+ { X86::VMOVUPDZ256rr , X86::VMOVUPDYrr },
+ { X86::VMOVUPDZ256rr_REV , X86::VMOVUPDYrr_REV },
+ { X86::VMOVUPSZ256mr , X86::VMOVUPSYmr },
+ { X86::VMOVUPSZ256rm , X86::VMOVUPSYrm },
+ { X86::VMOVUPSZ256rr , X86::VMOVUPSYrr },
+ { X86::VMOVUPSZ256rr_REV , X86::VMOVUPSYrr_REV },
+ { X86::VMULPDZ256rm , X86::VMULPDYrm },
+ { X86::VMULPDZ256rr , X86::VMULPDYrr },
+ { X86::VMULPSZ256rm , X86::VMULPSYrm },
+ { X86::VMULPSZ256rr , X86::VMULPSYrr },
+ { X86::VORPDZ256rm , X86::VORPDYrm },
+ { X86::VORPDZ256rr , X86::VORPDYrr },
+ { X86::VORPSZ256rm , X86::VORPSYrm },
+ { X86::VORPSZ256rr , X86::VORPSYrr },
+ { X86::VPABSBZ256rm , X86::VPABSBYrm },
+ { X86::VPABSBZ256rr , X86::VPABSBYrr },
+ { X86::VPABSDZ256rm , X86::VPABSDYrm },
+ { X86::VPABSDZ256rr , X86::VPABSDYrr },
+ { X86::VPABSWZ256rm , X86::VPABSWYrm },
+ { X86::VPABSWZ256rr , X86::VPABSWYrr },
+ { X86::VPACKSSDWZ256rm , X86::VPACKSSDWYrm },
+ { X86::VPACKSSDWZ256rr , X86::VPACKSSDWYrr },
+ { X86::VPACKSSWBZ256rm , X86::VPACKSSWBYrm },
+ { X86::VPACKSSWBZ256rr , X86::VPACKSSWBYrr },
+ { X86::VPACKUSDWZ256rm , X86::VPACKUSDWYrm },
+ { X86::VPACKUSDWZ256rr , X86::VPACKUSDWYrr },
+ { X86::VPACKUSWBZ256rm , X86::VPACKUSWBYrm },
+ { X86::VPACKUSWBZ256rr , X86::VPACKUSWBYrr },
+ { X86::VPADDBZ256rm , X86::VPADDBYrm },
+ { X86::VPADDBZ256rr , X86::VPADDBYrr },
+ { X86::VPADDDZ256rm , X86::VPADDDYrm },
+ { X86::VPADDDZ256rr , X86::VPADDDYrr },
+ { X86::VPADDQZ256rm , X86::VPADDQYrm },
+ { X86::VPADDQZ256rr , X86::VPADDQYrr },
+ { X86::VPADDSBZ256rm , X86::VPADDSBYrm },
+ { X86::VPADDSBZ256rr , X86::VPADDSBYrr },
+ { X86::VPADDSWZ256rm , X86::VPADDSWYrm },
+ { X86::VPADDSWZ256rr , X86::VPADDSWYrr },
+ { X86::VPADDUSBZ256rm , X86::VPADDUSBYrm },
+ { X86::VPADDUSBZ256rr , X86::VPADDUSBYrr },
+ { X86::VPADDUSWZ256rm , X86::VPADDUSWYrm },
+ { X86::VPADDUSWZ256rr , X86::VPADDUSWYrr },
+ { X86::VPADDWZ256rm , X86::VPADDWYrm },
+ { X86::VPADDWZ256rr , X86::VPADDWYrr },
+ { X86::VPALIGNRZ256rmi , X86::VPALIGNRYrmi },
+ { X86::VPALIGNRZ256rri , X86::VPALIGNRYrri },
+ { X86::VPANDDZ256rm , X86::VPANDYrm },
+ { X86::VPANDDZ256rr , X86::VPANDYrr },
+ { X86::VPANDQZ256rm , X86::VPANDYrm },
+ { X86::VPANDQZ256rr , X86::VPANDYrr },
+ { X86::VPAVGBZ256rm , X86::VPAVGBYrm },
+ { X86::VPAVGBZ256rr , X86::VPAVGBYrr },
+ { X86::VPAVGWZ256rm , X86::VPAVGWYrm },
+ { X86::VPAVGWZ256rr , X86::VPAVGWYrr },
+ { X86::VPBROADCASTBZ256m , X86::VPBROADCASTBYrm },
+ { X86::VPBROADCASTBZ256r , X86::VPBROADCASTBYrr },
+ { X86::VPBROADCASTDZ256m , X86::VPBROADCASTDYrm },
+ { X86::VPBROADCASTDZ256r , X86::VPBROADCASTDYrr },
+ { X86::VPBROADCASTQZ256m , X86::VPBROADCASTQYrm },
+ { X86::VPBROADCASTQZ256r , X86::VPBROADCASTQYrr },
+ { X86::VPBROADCASTWZ256m , X86::VPBROADCASTWYrm },
+ { X86::VPBROADCASTWZ256r , X86::VPBROADCASTWYrr },
+ { X86::VPERMDZ256rm , X86::VPERMDYrm },
+ { X86::VPERMDZ256rr , X86::VPERMDYrr },
+ { X86::VPERMILPDZ256mi , X86::VPERMILPDYmi },
+ { X86::VPERMILPDZ256ri , X86::VPERMILPDYri },
+ { X86::VPERMILPDZ256rm , X86::VPERMILPDYrm },
+ { X86::VPERMILPDZ256rr , X86::VPERMILPDYrr },
+ { X86::VPERMILPSZ256mi , X86::VPERMILPSYmi },
+ { X86::VPERMILPSZ256ri , X86::VPERMILPSYri },
+ { X86::VPERMILPSZ256rm , X86::VPERMILPSYrm },
+ { X86::VPERMILPSZ256rr , X86::VPERMILPSYrr },
+ { X86::VPERMPDZ256mi , X86::VPERMPDYmi },
+ { X86::VPERMPDZ256ri , X86::VPERMPDYri },
+ { X86::VPERMPSZ256rm , X86::VPERMPSYrm },
+ { X86::VPERMPSZ256rr , X86::VPERMPSYrr },
+ { X86::VPERMQZ256mi , X86::VPERMQYmi },
+ { X86::VPERMQZ256ri , X86::VPERMQYri },
+ { X86::VPMADDUBSWZ256rm , X86::VPMADDUBSWYrm },
+ { X86::VPMADDUBSWZ256rr , X86::VPMADDUBSWYrr },
+ { X86::VPMADDWDZ256rm , X86::VPMADDWDYrm },
+ { X86::VPMADDWDZ256rr , X86::VPMADDWDYrr },
+ { X86::VPMAXSBZ256rm , X86::VPMAXSBYrm },
+ { X86::VPMAXSBZ256rr , X86::VPMAXSBYrr },
+ { X86::VPMAXSDZ256rm , X86::VPMAXSDYrm },
+ { X86::VPMAXSDZ256rr , X86::VPMAXSDYrr },
+ { X86::VPMAXSWZ256rm , X86::VPMAXSWYrm },
+ { X86::VPMAXSWZ256rr , X86::VPMAXSWYrr },
+ { X86::VPMAXUBZ256rm , X86::VPMAXUBYrm },
+ { X86::VPMAXUBZ256rr , X86::VPMAXUBYrr },
+ { X86::VPMAXUDZ256rm , X86::VPMAXUDYrm },
+ { X86::VPMAXUDZ256rr , X86::VPMAXUDYrr },
+ { X86::VPMAXUWZ256rm , X86::VPMAXUWYrm },
+ { X86::VPMAXUWZ256rr , X86::VPMAXUWYrr },
+ { X86::VPMINSBZ256rm , X86::VPMINSBYrm },
+ { X86::VPMINSBZ256rr , X86::VPMINSBYrr },
+ { X86::VPMINSDZ256rm , X86::VPMINSDYrm },
+ { X86::VPMINSDZ256rr , X86::VPMINSDYrr },
+ { X86::VPMINSWZ256rm , X86::VPMINSWYrm },
+ { X86::VPMINSWZ256rr , X86::VPMINSWYrr },
+ { X86::VPMINUBZ256rm , X86::VPMINUBYrm },
+ { X86::VPMINUBZ256rr , X86::VPMINUBYrr },
+ { X86::VPMINUDZ256rm , X86::VPMINUDYrm },
+ { X86::VPMINUDZ256rr , X86::VPMINUDYrr },
+ { X86::VPMINUWZ256rm , X86::VPMINUWYrm },
+ { X86::VPMINUWZ256rr , X86::VPMINUWYrr },
+ { X86::VPMOVSXBDZ256rm , X86::VPMOVSXBDYrm },
+ { X86::VPMOVSXBDZ256rr , X86::VPMOVSXBDYrr },
+ { X86::VPMOVSXBQZ256rm , X86::VPMOVSXBQYrm },
+ { X86::VPMOVSXBQZ256rr , X86::VPMOVSXBQYrr },
+ { X86::VPMOVSXBWZ256rm , X86::VPMOVSXBWYrm },
+ { X86::VPMOVSXBWZ256rr , X86::VPMOVSXBWYrr },
+ { X86::VPMOVSXDQZ256rm , X86::VPMOVSXDQYrm },
+ { X86::VPMOVSXDQZ256rr , X86::VPMOVSXDQYrr },
+ { X86::VPMOVSXWDZ256rm , X86::VPMOVSXWDYrm },
+ { X86::VPMOVSXWDZ256rr , X86::VPMOVSXWDYrr },
+ { X86::VPMOVSXWQZ256rm , X86::VPMOVSXWQYrm },
+ { X86::VPMOVSXWQZ256rr , X86::VPMOVSXWQYrr },
+ { X86::VPMOVZXBDZ256rm , X86::VPMOVZXBDYrm },
+ { X86::VPMOVZXBDZ256rr , X86::VPMOVZXBDYrr },
+ { X86::VPMOVZXBQZ256rm , X86::VPMOVZXBQYrm },
+ { X86::VPMOVZXBQZ256rr , X86::VPMOVZXBQYrr },
+ { X86::VPMOVZXBWZ256rm , X86::VPMOVZXBWYrm },
+ { X86::VPMOVZXBWZ256rr , X86::VPMOVZXBWYrr },
+ { X86::VPMOVZXDQZ256rm , X86::VPMOVZXDQYrm },
+ { X86::VPMOVZXDQZ256rr , X86::VPMOVZXDQYrr },
+ { X86::VPMOVZXWDZ256rm , X86::VPMOVZXWDYrm },
+ { X86::VPMOVZXWDZ256rr , X86::VPMOVZXWDYrr },
+ { X86::VPMOVZXWQZ256rm , X86::VPMOVZXWQYrm },
+ { X86::VPMOVZXWQZ256rr , X86::VPMOVZXWQYrr },
+ { X86::VPMULDQZ256rm , X86::VPMULDQYrm },
+ { X86::VPMULDQZ256rr , X86::VPMULDQYrr },
+ { X86::VPMULHRSWZ256rm , X86::VPMULHRSWYrm },
+ { X86::VPMULHRSWZ256rr , X86::VPMULHRSWYrr },
+ { X86::VPMULHUWZ256rm , X86::VPMULHUWYrm },
+ { X86::VPMULHUWZ256rr , X86::VPMULHUWYrr },
+ { X86::VPMULHWZ256rm , X86::VPMULHWYrm },
+ { X86::VPMULHWZ256rr , X86::VPMULHWYrr },
+ { X86::VPMULLDZ256rm , X86::VPMULLDYrm },
+ { X86::VPMULLDZ256rr , X86::VPMULLDYrr },
+ { X86::VPMULLWZ256rm , X86::VPMULLWYrm },
+ { X86::VPMULLWZ256rr , X86::VPMULLWYrr },
+ { X86::VPMULUDQZ256rm , X86::VPMULUDQYrm },
+ { X86::VPMULUDQZ256rr , X86::VPMULUDQYrr },
+ { X86::VPORDZ256rm , X86::VPORYrm },
+ { X86::VPORDZ256rr , X86::VPORYrr },
+ { X86::VPORQZ256rm , X86::VPORYrm },
+ { X86::VPORQZ256rr , X86::VPORYrr },
+ { X86::VPSADBWZ256rm , X86::VPSADBWYrm },
+ { X86::VPSADBWZ256rr , X86::VPSADBWYrr },
+ { X86::VPSHUFBZ256rm , X86::VPSHUFBYrm },
+ { X86::VPSHUFBZ256rr , X86::VPSHUFBYrr },
+ { X86::VPSHUFDZ256mi , X86::VPSHUFDYmi },
+ { X86::VPSHUFDZ256ri , X86::VPSHUFDYri },
+ { X86::VPSHUFHWZ256mi , X86::VPSHUFHWYmi },
+ { X86::VPSHUFHWZ256ri , X86::VPSHUFHWYri },
+ { X86::VPSHUFLWZ256mi , X86::VPSHUFLWYmi },
+ { X86::VPSHUFLWZ256ri , X86::VPSHUFLWYri },
+ { X86::VPSLLDQZ256rr , X86::VPSLLDQYri },
+ { X86::VPSLLDZ256ri , X86::VPSLLDYri },
+ { X86::VPSLLDZ256rm , X86::VPSLLDYrm },
+ { X86::VPSLLDZ256rr , X86::VPSLLDYrr },
+ { X86::VPSLLQZ256ri , X86::VPSLLQYri },
+ { X86::VPSLLQZ256rm , X86::VPSLLQYrm },
+ { X86::VPSLLQZ256rr , X86::VPSLLQYrr },
+ { X86::VPSLLVDZ256rm , X86::VPSLLVDYrm },
+ { X86::VPSLLVDZ256rr , X86::VPSLLVDYrr },
+ { X86::VPSLLVQZ256rm , X86::VPSLLVQYrm },
+ { X86::VPSLLVQZ256rr , X86::VPSLLVQYrr },
+ { X86::VPSLLWZ256ri , X86::VPSLLWYri },
+ { X86::VPSLLWZ256rm , X86::VPSLLWYrm },
+ { X86::VPSLLWZ256rr , X86::VPSLLWYrr },
+ { X86::VPSRADZ256ri , X86::VPSRADYri },
+ { X86::VPSRADZ256rm , X86::VPSRADYrm },
+ { X86::VPSRADZ256rr , X86::VPSRADYrr },
+ { X86::VPSRAVDZ256rm , X86::VPSRAVDYrm },
+ { X86::VPSRAVDZ256rr , X86::VPSRAVDYrr },
+ { X86::VPSRAWZ256ri , X86::VPSRAWYri },
+ { X86::VPSRAWZ256rm , X86::VPSRAWYrm },
+ { X86::VPSRAWZ256rr , X86::VPSRAWYrr },
+ { X86::VPSRLDQZ256rr , X86::VPSRLDQYri },
+ { X86::VPSRLDZ256ri , X86::VPSRLDYri },
+ { X86::VPSRLDZ256rm , X86::VPSRLDYrm },
+ { X86::VPSRLDZ256rr , X86::VPSRLDYrr },
+ { X86::VPSRLQZ256ri , X86::VPSRLQYri },
+ { X86::VPSRLQZ256rm , X86::VPSRLQYrm },
+ { X86::VPSRLQZ256rr , X86::VPSRLQYrr },
+ { X86::VPSRLVDZ256rm , X86::VPSRLVDYrm },
+ { X86::VPSRLVDZ256rr , X86::VPSRLVDYrr },
+ { X86::VPSRLVQZ256rm , X86::VPSRLVQYrm },
+ { X86::VPSRLVQZ256rr , X86::VPSRLVQYrr },
+ { X86::VPSRLWZ256ri , X86::VPSRLWYri },
+ { X86::VPSRLWZ256rm , X86::VPSRLWYrm },
+ { X86::VPSRLWZ256rr , X86::VPSRLWYrr },
+ { X86::VPSUBBZ256rm , X86::VPSUBBYrm },
+ { X86::VPSUBBZ256rr , X86::VPSUBBYrr },
+ { X86::VPSUBDZ256rm , X86::VPSUBDYrm },
+ { X86::VPSUBDZ256rr , X86::VPSUBDYrr },
+ { X86::VPSUBQZ256rm , X86::VPSUBQYrm },
+ { X86::VPSUBQZ256rr , X86::VPSUBQYrr },
+ { X86::VPSUBSBZ256rm , X86::VPSUBSBYrm },
+ { X86::VPSUBSBZ256rr , X86::VPSUBSBYrr },
+ { X86::VPSUBSWZ256rm , X86::VPSUBSWYrm },
+ { X86::VPSUBSWZ256rr , X86::VPSUBSWYrr },
+ { X86::VPSUBUSBZ256rm , X86::VPSUBUSBYrm },
+ { X86::VPSUBUSBZ256rr , X86::VPSUBUSBYrr },
+ { X86::VPSUBUSWZ256rm , X86::VPSUBUSWYrm },
+ { X86::VPSUBUSWZ256rr , X86::VPSUBUSWYrr },
+ { X86::VPSUBWZ256rm , X86::VPSUBWYrm },
+ { X86::VPSUBWZ256rr , X86::VPSUBWYrr },
+ { X86::VPUNPCKHBWZ256rm , X86::VPUNPCKHBWYrm },
+ { X86::VPUNPCKHBWZ256rr , X86::VPUNPCKHBWYrr },
+ { X86::VPUNPCKHDQZ256rm , X86::VPUNPCKHDQYrm },
+ { X86::VPUNPCKHDQZ256rr , X86::VPUNPCKHDQYrr },
+ { X86::VPUNPCKHQDQZ256rm , X86::VPUNPCKHQDQYrm },
+ { X86::VPUNPCKHQDQZ256rr , X86::VPUNPCKHQDQYrr },
+ { X86::VPUNPCKHWDZ256rm , X86::VPUNPCKHWDYrm },
+ { X86::VPUNPCKHWDZ256rr , X86::VPUNPCKHWDYrr },
+ { X86::VPUNPCKLBWZ256rm , X86::VPUNPCKLBWYrm },
+ { X86::VPUNPCKLBWZ256rr , X86::VPUNPCKLBWYrr },
+ { X86::VPUNPCKLDQZ256rm , X86::VPUNPCKLDQYrm },
+ { X86::VPUNPCKLDQZ256rr , X86::VPUNPCKLDQYrr },
+ { X86::VPUNPCKLQDQZ256rm , X86::VPUNPCKLQDQYrm },
+ { X86::VPUNPCKLQDQZ256rr , X86::VPUNPCKLQDQYrr },
+ { X86::VPUNPCKLWDZ256rm , X86::VPUNPCKLWDYrm },
+ { X86::VPUNPCKLWDZ256rr , X86::VPUNPCKLWDYrr },
+ { X86::VPXORDZ256rm , X86::VPXORYrm },
+ { X86::VPXORDZ256rr , X86::VPXORYrr },
+ { X86::VPXORQZ256rm , X86::VPXORYrm },
+ { X86::VPXORQZ256rr , X86::VPXORYrr },
+ { X86::VSHUFPDZ256rmi , X86::VSHUFPDYrmi },
+ { X86::VSHUFPDZ256rri , X86::VSHUFPDYrri },
+ { X86::VSHUFPSZ256rmi , X86::VSHUFPSYrmi },
+ { X86::VSHUFPSZ256rri , X86::VSHUFPSYrri },
+ { X86::VSQRTPDZ256m , X86::VSQRTPDYm },
+ { X86::VSQRTPDZ256r , X86::VSQRTPDYr },
+ { X86::VSQRTPSZ256m , X86::VSQRTPSYm },
+ { X86::VSQRTPSZ256r , X86::VSQRTPSYr },
+ { X86::VSUBPDZ256rm , X86::VSUBPDYrm },
+ { X86::VSUBPDZ256rr , X86::VSUBPDYrr },
+ { X86::VSUBPSZ256rm , X86::VSUBPSYrm },
+ { X86::VSUBPSZ256rr , X86::VSUBPSYrr },
+ { X86::VUNPCKHPDZ256rm , X86::VUNPCKHPDYrm },
+ { X86::VUNPCKHPDZ256rr , X86::VUNPCKHPDYrr },
+ { X86::VUNPCKHPSZ256rm , X86::VUNPCKHPSYrm },
+ { X86::VUNPCKHPSZ256rr , X86::VUNPCKHPSYrr },
+ { X86::VUNPCKLPDZ256rm , X86::VUNPCKLPDYrm },
+ { X86::VUNPCKLPDZ256rr , X86::VUNPCKLPDYrr },
+ { X86::VUNPCKLPSZ256rm , X86::VUNPCKLPSYrm },
+ { X86::VUNPCKLPSZ256rr , X86::VUNPCKLPSYrr },
+ { X86::VXORPDZ256rm , X86::VXORPDYrm },
+ { X86::VXORPDZ256rr , X86::VXORPDYrr },
+ { X86::VXORPSZ256rm , X86::VXORPSYrm },
+ { X86::VXORPSZ256rr , X86::VXORPSYrr },
+};
+
+#endif
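
The tables above only pair opcodes; a separate pass still has to locate the entry for a given EVEX instruction and rewrite it with the VEX (or VEX.256) opcode, after checking that no EVEX-only features (zmm operands, masking, embedded broadcast/rounding) are in use. The stand-alone C++ sketch below illustrates just the table-lookup step; Entry, Table and compressOpcode are invented names for this example, and this is not the code of the actual EvexToVex pass, which may organize its search differently.

    // Sketch only: the kind of lookup a compression pass performs on the
    // tables above. Entry, Table and compressOpcode are invented names.
    #include <algorithm>
    #include <cstdint>
    #include <iostream>
    #include <iterator>

    struct Entry {
      uint16_t EvexOpc; // opcode of the EVEX-encoded instruction
      uint16_t VexOpc;  // equivalent VEX / VEX.256 opcode
    };

    // Dummy opcode numbers, kept sorted by EvexOpc so std::lower_bound can
    // be used here; the real pass may organize its lookup differently.
    static const Entry Table[] = { {100, 10}, {120, 12}, {150, 15} };

    // Returns the VEX opcode for EvexOpc, or 0 if the table has no entry.
    static uint16_t compressOpcode(uint16_t EvexOpc) {
      const Entry *It = std::lower_bound(
          std::begin(Table), std::end(Table), EvexOpc,
          [](const Entry &E, uint16_t Opc) { return E.EvexOpc < Opc; });
      if (It != std::end(Table) && It->EvexOpc == EvexOpc)
        return It->VexOpc;
      return 0; // no VEX form: leave the instruction EVEX-encoded
    }

    int main() {
      std::cout << compressOpcode(120) << "\n"; // prints 12
      std::cout << compressOpcode(121) << "\n"; // prints 0
    }
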
diff --git a/contrib/llvm/lib/Target/X86/X86InstrXOP.td b/contrib/llvm/lib/Target/X86/X86InstrXOP.td
index f49917b..2b296e1 100644
--- a/contrib/llvm/lib/Target/X86/X86InstrXOP.td
+++ b/contrib/llvm/lib/Target/X86/X86InstrXOP.td
@@ -85,12 +85,12 @@ let ExeDomain = SSEPackedDouble in {
multiclass xop3op<bits<8> opc, string OpcodeStr, SDNode OpNode,
ValueType vt128> {
- def rr : IXOP<opc, MRMSrcReg, (outs VR128:$dst),
+ def rr : IXOP<opc, MRMSrcReg4VOp3, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR128:$dst,
(vt128 (OpNode (vt128 VR128:$src1), (vt128 VR128:$src2))))]>,
- XOP_4VOp3, Sched<[WriteVarVecShift]>;
+ XOP, Sched<[WriteVarVecShift]>;
def rm : IXOP<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, i128mem:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
@@ -98,13 +98,20 @@ multiclass xop3op<bits<8> opc, string OpcodeStr, SDNode OpNode,
(vt128 (OpNode (vt128 VR128:$src1),
(vt128 (bitconvert (loadv2i64 addr:$src2))))))]>,
XOP_4V, VEX_W, Sched<[WriteVarVecShift, ReadAfterLd]>;
- def mr : IXOP<opc, MRMSrcMem, (outs VR128:$dst),
+ def mr : IXOP<opc, MRMSrcMem4VOp3, (outs VR128:$dst),
(ins i128mem:$src1, VR128:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR128:$dst,
(vt128 (OpNode (vt128 (bitconvert (loadv2i64 addr:$src1))),
(vt128 VR128:$src2))))]>,
- XOP_4VOp3, Sched<[WriteVarVecShift, ReadAfterLd]>;
+ XOP, Sched<[WriteVarVecShift, ReadAfterLd]>;
+ // For disassembler
+ let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
+ def rr_REV : IXOP<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ []>,
+ XOP_4V, VEX_W, Sched<[WriteVarVecShift]>;
}
let ExeDomain = SSEPackedInt in {
@@ -146,19 +153,19 @@ let ExeDomain = SSEPackedInt in {
// Instruction where second source can be memory, but third must be register
multiclass xop4opm2<bits<8> opc, string OpcodeStr, Intrinsic Int> {
let isCommutable = 1 in
- def rr : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst),
+ def rr : IXOPi8Reg<opc, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, VR128:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set VR128:$dst,
- (Int VR128:$src1, VR128:$src2, VR128:$src3))]>, XOP_4V, VEX_I8IMM;
- def rm : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst),
+ (Int VR128:$src1, VR128:$src2, VR128:$src3))]>, XOP_4V;
+ def rm : IXOPi8Reg<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, i128mem:$src2, VR128:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set VR128:$dst,
(Int VR128:$src1, (bitconvert (loadv2i64 addr:$src2)),
- VR128:$src3))]>, XOP_4V, VEX_I8IMM;
+ VR128:$src3))]>, XOP_4V;
}
let ExeDomain = SSEPackedInt in {
@@ -224,37 +231,37 @@ let ExeDomain = SSEPackedInt in { // SSE integer instructions
multiclass xop4op<bits<8> opc, string OpcodeStr, SDNode OpNode,
ValueType vt128> {
- def rrr : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst),
+ def rrr : IXOPi8Reg<opc, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, VR128:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set VR128:$dst,
(vt128 (OpNode (vt128 VR128:$src1), (vt128 VR128:$src2),
(vt128 VR128:$src3))))]>,
- XOP_4V, VEX_I8IMM;
- def rrm : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst),
+ XOP_4V;
+ def rrm : IXOPi8Reg<opc, MRMSrcMemOp4, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, i128mem:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set VR128:$dst,
(vt128 (OpNode (vt128 VR128:$src1), (vt128 VR128:$src2),
(vt128 (bitconvert (loadv2i64 addr:$src3))))))]>,
- XOP_4V, VEX_I8IMM, VEX_W, MemOp4;
- def rmr : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst),
+ XOP_4V, VEX_W;
+ def rmr : IXOPi8Reg<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, i128mem:$src2, VR128:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set VR128:$dst,
(v16i8 (OpNode (vt128 VR128:$src1), (vt128 (bitconvert (loadv2i64 addr:$src2))),
(vt128 VR128:$src3))))]>,
- XOP_4V, VEX_I8IMM;
+ XOP_4V;
// For disassembler
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
- def rrr_REV : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst),
+ def rrr_REV : IXOPi8Reg<opc, MRMSrcRegOp4, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, VR128:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
- []>, XOP_4V, VEX_I8IMM, VEX_W, MemOp4;
+ []>, XOP_4V, VEX_W;
}
let ExeDomain = SSEPackedInt in {
@@ -265,66 +272,66 @@ let ExeDomain = SSEPackedInt in {
multiclass xop4op_int<bits<8> opc, string OpcodeStr,
Intrinsic Int128, Intrinsic Int256> {
// 128-bit Instruction
- def rrr : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst),
+ def rrr : IXOPi8Reg<opc, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, VR128:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set VR128:$dst, (Int128 VR128:$src1, VR128:$src2, VR128:$src3))]>,
- XOP_4V, VEX_I8IMM;
- def rrm : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst),
+ XOP_4V;
+ def rrm : IXOPi8Reg<opc, MRMSrcMemOp4, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, i128mem:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set VR128:$dst,
(Int128 VR128:$src1, VR128:$src2,
(bitconvert (loadv2i64 addr:$src3))))]>,
- XOP_4V, VEX_I8IMM, VEX_W, MemOp4;
- def rmr : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst),
+ XOP_4V, VEX_W;
+ def rmr : IXOPi8Reg<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, i128mem:$src2, VR128:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set VR128:$dst,
(Int128 VR128:$src1, (bitconvert (loadv2i64 addr:$src2)),
VR128:$src3))]>,
- XOP_4V, VEX_I8IMM;
+ XOP_4V;
// For disassembler
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
- def rrr_REV : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst),
+ def rrr_REV : IXOPi8Reg<opc, MRMSrcRegOp4, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, VR128:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
- []>, XOP_4V, VEX_I8IMM, VEX_W, MemOp4;
+ []>, XOP_4V, VEX_W;
// 256-bit Instruction
- def rrrY : IXOPi8<opc, MRMSrcReg, (outs VR256:$dst),
+ def rrrY : IXOPi8Reg<opc, MRMSrcReg, (outs VR256:$dst),
(ins VR256:$src1, VR256:$src2, VR256:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set VR256:$dst, (Int256 VR256:$src1, VR256:$src2, VR256:$src3))]>,
- XOP_4V, VEX_I8IMM, VEX_L;
- def rrmY : IXOPi8<opc, MRMSrcMem, (outs VR256:$dst),
+ XOP_4V, VEX_L;
+ def rrmY : IXOPi8Reg<opc, MRMSrcMemOp4, (outs VR256:$dst),
(ins VR256:$src1, VR256:$src2, i256mem:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set VR256:$dst,
(Int256 VR256:$src1, VR256:$src2,
(bitconvert (loadv4i64 addr:$src3))))]>,
- XOP_4V, VEX_I8IMM, VEX_W, MemOp4, VEX_L;
- def rmrY : IXOPi8<opc, MRMSrcMem, (outs VR256:$dst),
+ XOP_4V, VEX_W, VEX_L;
+ def rmrY : IXOPi8Reg<opc, MRMSrcMem, (outs VR256:$dst),
(ins VR256:$src1, f256mem:$src2, VR256:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set VR256:$dst,
(Int256 VR256:$src1, (bitconvert (loadv4i64 addr:$src2)),
VR256:$src3))]>,
- XOP_4V, VEX_I8IMM, VEX_L;
+ XOP_4V, VEX_L;
// For disassembler
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
- def rrrY_REV : IXOPi8<opc, MRMSrcReg, (outs VR256:$dst),
+ def rrrY_REV : IXOPi8Reg<opc, MRMSrcRegOp4, (outs VR256:$dst),
(ins VR256:$src1, VR256:$src2, VR256:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
- []>, XOP_4V, VEX_I8IMM, VEX_W, MemOp4, VEX_L;
+ []>, XOP_4V, VEX_W, VEX_L;
}
let ExeDomain = SSEPackedInt in {
@@ -353,7 +360,7 @@ multiclass xop5op<bits<8> opc, string OpcodeStr, SDNode OpNode,
[(set VR128:$dst,
(vt128 (OpNode (vt128 VR128:$src1), (vt128 VR128:$src2),
(id128 VR128:$src3), (i8 imm:$src4))))]>;
- def rm : IXOP5<opc, MRMSrcMem, (outs VR128:$dst),
+ def rm : IXOP5<opc, MRMSrcMemOp4, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, i128mem:$src3, u8imm:$src4),
!strconcat(OpcodeStr,
"\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
@@ -361,7 +368,7 @@ multiclass xop5op<bits<8> opc, string OpcodeStr, SDNode OpNode,
(vt128 (OpNode (vt128 VR128:$src1), (vt128 VR128:$src2),
(id128 (bitconvert (loadv2i64 addr:$src3))),
(i8 imm:$src4))))]>,
- VEX_W, MemOp4;
+ VEX_W;
def mr : IXOP5<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, f128mem:$src2, VR128:$src3, u8imm:$src4),
!strconcat(OpcodeStr,
@@ -372,11 +379,11 @@ multiclass xop5op<bits<8> opc, string OpcodeStr, SDNode OpNode,
(id128 VR128:$src3), (i8 imm:$src4))))]>;
// For disassembler
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
- def rr_REV : IXOP5<opc, MRMSrcReg, (outs VR128:$dst),
+ def rr_REV : IXOP5<opc, MRMSrcRegOp4, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, VR128:$src3, u8imm:$src4),
!strconcat(OpcodeStr,
"\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
- []>, VEX_W, MemOp4;
+ []>, VEX_W;
def rrY : IXOP5<opc, MRMSrcReg, (outs VR256:$dst),
(ins VR256:$src1, VR256:$src2, VR256:$src3, u8imm:$src4),
@@ -385,14 +392,14 @@ multiclass xop5op<bits<8> opc, string OpcodeStr, SDNode OpNode,
[(set VR256:$dst,
(vt256 (OpNode (vt256 VR256:$src1), (vt256 VR256:$src2),
(id256 VR256:$src3), (i8 imm:$src4))))]>, VEX_L;
- def rmY : IXOP5<opc, MRMSrcMem, (outs VR256:$dst),
+ def rmY : IXOP5<opc, MRMSrcMemOp4, (outs VR256:$dst),
(ins VR256:$src1, VR256:$src2, i256mem:$src3, u8imm:$src4),
!strconcat(OpcodeStr,
"\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
[(set VR256:$dst,
(vt256 (OpNode (vt256 VR256:$src1), (vt256 VR256:$src2),
(id256 (bitconvert (loadv4i64 addr:$src3))),
- (i8 imm:$src4))))]>, VEX_W, MemOp4, VEX_L;
+ (i8 imm:$src4))))]>, VEX_W, VEX_L;
def mrY : IXOP5<opc, MRMSrcMem, (outs VR256:$dst),
(ins VR256:$src1, f256mem:$src2, VR256:$src3, u8imm:$src4),
!strconcat(OpcodeStr,
@@ -403,11 +410,11 @@ multiclass xop5op<bits<8> opc, string OpcodeStr, SDNode OpNode,
(id256 VR256:$src3), (i8 imm:$src4))))]>, VEX_L;
// For disassembler
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
- def rrY_REV : IXOP5<opc, MRMSrcReg, (outs VR256:$dst),
+ def rrY_REV : IXOP5<opc, MRMSrcRegOp4, (outs VR256:$dst),
(ins VR256:$src1, VR256:$src2, VR256:$src3, u8imm:$src4),
!strconcat(OpcodeStr,
"\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
- []>, VEX_W, MemOp4, VEX_L;
+ []>, VEX_W, VEX_L;
}
let ExeDomain = SSEPackedDouble in
diff --git a/contrib/llvm/lib/Target/X86/X86InterleavedAccess.cpp b/contrib/llvm/lib/Target/X86/X86InterleavedAccess.cpp
new file mode 100644
index 0000000..d9edf46
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InterleavedAccess.cpp
@@ -0,0 +1,221 @@
+//===--------- X86InterleavedAccess.cpp ----------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===--------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the X86 implementation of the interleaved accesses
+/// optimization generating X86-specific instructions/intrinsics for
+/// interleaved access groups.
+///
+//===--------------------------------------------------------------------===//
+
+#include "X86ISelLowering.h"
+#include "X86TargetMachine.h"
+
+using namespace llvm;
+
+/// \brief This class holds necessary information to represent an interleaved
+/// access group and supports utilities to lower the group into
+/// X86-specific instructions/intrinsics.
+/// E.g. A group of interleaving access loads (Factor = 2; accessing every
+/// other element)
+/// %wide.vec = load <8 x i32>, <8 x i32>* %ptr
+/// %v0 = shuffle <8 x i32> %wide.vec, <8 x i32> undef, <0, 2, 4, 6>
+/// %v1 = shuffle <8 x i32> %wide.vec, <8 x i32> undef, <1, 3, 5, 7>
+
+class X86InterleavedAccessGroup {
+ /// \brief Reference to the wide-load instruction of an interleaved access
+ /// group.
+ Instruction *const Inst;
+
+ /// \brief Reference to the shuffle(s), consumer(s) of the (load) 'Inst'.
+ ArrayRef<ShuffleVectorInst *> Shuffles;
+
+ /// \brief Reference to the starting index of each user-shuffle.
+ ArrayRef<unsigned> Indices;
+
+ /// \brief Reference to the interleaving stride in terms of elements.
+ const unsigned Factor;
+
+ /// \brief Reference to the underlying target.
+ const X86Subtarget &Subtarget;
+
+ const DataLayout &DL;
+
+ IRBuilder<> &Builder;
+
+ /// \brief Breaks down a vector \p 'Inst' of N elements into \p NumSubVectors
+ /// sub vectors of type \p T. Returns true and the sub-vectors in
+ /// \p DecomposedVectors if it decomposes the Inst, returns false otherwise.
+ bool decompose(Instruction *Inst, unsigned NumSubVectors, VectorType *T,
+ SmallVectorImpl<Instruction *> &DecomposedVectors);
+
+ /// \brief Performs matrix transposition on a 4x4 matrix \p InputVectors and
+ /// returns the transposed-vectors in \p TransposedVectors.
+ /// E.g.
+ /// InputVectors:
+ /// In-V0 = p1, p2, p3, p4
+ /// In-V1 = q1, q2, q3, q4
+ /// In-V2 = r1, r2, r3, r4
+ /// In-V3 = s1, s2, s3, s4
+ /// OutputVectors:
+ /// Out-V0 = p1, q1, r1, s1
+ /// Out-V1 = p2, q2, r2, s2
+ /// Out-V2 = p3, q3, r3, s3
+ /// Out-V3 = p4, q4, r4, s4
+ void transpose_4x4(ArrayRef<Instruction *> InputVectors,
+ SmallVectorImpl<Value *> &TransposedVectors);
+
+public:
+ /// In order to form an interleaved access group, X86InterleavedAccessGroup
+ /// requires a wide-load instruction \p 'I', a group of interleaved-vectors
+ /// \p Shuffs, a reference to the starting index of each interleaved-vector
+ /// \p 'Ind', and the interleaving stride factor \p F. In order to generate
+ /// X86-specific instructions/intrinsics it also requires the underlying
+ /// target information \p STarget.
+ explicit X86InterleavedAccessGroup(Instruction *I,
+ ArrayRef<ShuffleVectorInst *> Shuffs,
+ ArrayRef<unsigned> Ind,
+ const unsigned F,
+ const X86Subtarget &STarget,
+ IRBuilder<> &B)
+ : Inst(I), Shuffles(Shuffs), Indices(Ind), Factor(F), Subtarget(STarget),
+ DL(Inst->getModule()->getDataLayout()), Builder(B) {}
+
+ /// \brief Returns true if this interleaved access group can be lowered into
+ /// x86-specific instructions/intrinsics, false otherwise.
+ bool isSupported() const;
+
+ /// \brief Lowers this interleaved access group into X86-specific
+ /// instructions/intrinsics.
+ bool lowerIntoOptimizedSequence();
+};
+
+bool X86InterleavedAccessGroup::isSupported() const {
+ VectorType *ShuffleVecTy = Shuffles[0]->getType();
+ uint64_t ShuffleVecSize = DL.getTypeSizeInBits(ShuffleVecTy);
+ Type *ShuffleEltTy = ShuffleVecTy->getVectorElementType();
+
+ if (DL.getTypeSizeInBits(Inst->getType()) < Factor * ShuffleVecSize)
+ return false;
+
+ // Currently, lowering is only supported on AVX for 256-bit vectors of
+ // 64-bit elements with Factor = 4.
+ if (!Subtarget.hasAVX() || ShuffleVecSize != 256 ||
+ DL.getTypeSizeInBits(ShuffleEltTy) != 64 || Factor != 4)
+ return false;
+
+ return true;
+}
+
+bool X86InterleavedAccessGroup::decompose(
+ Instruction *VecInst, unsigned NumSubVectors, VectorType *SubVecTy,
+ SmallVectorImpl<Instruction *> &DecomposedVectors) {
+ Type *VecTy = VecInst->getType();
+ (void)VecTy;
+ assert(VecTy->isVectorTy() &&
+ DL.getTypeSizeInBits(VecTy) >=
+ DL.getTypeSizeInBits(SubVecTy) * NumSubVectors &&
+ "Invalid Inst-size!!!");
+ assert(VecTy->getVectorElementType() == SubVecTy->getVectorElementType() &&
+ "Element type mismatched!!!");
+
+ if (!isa<LoadInst>(VecInst))
+ return false;
+
+ LoadInst *LI = cast<LoadInst>(VecInst);
+ Type *VecBasePtrTy = SubVecTy->getPointerTo(LI->getPointerAddressSpace());
+
+ Value *VecBasePtr =
+ Builder.CreateBitCast(LI->getPointerOperand(), VecBasePtrTy);
+
+ // Generate N loads of T type
+ for (unsigned i = 0; i < NumSubVectors; i++) {
+ // TODO: Support inbounds GEP
+ Value *NewBasePtr = Builder.CreateGEP(VecBasePtr, Builder.getInt32(i));
+ Instruction *NewLoad =
+ Builder.CreateAlignedLoad(NewBasePtr, LI->getAlignment());
+ DecomposedVectors.push_back(NewLoad);
+ }
+
+ return true;
+}
+
+void X86InterleavedAccessGroup::transpose_4x4(
+ ArrayRef<Instruction *> Matrix,
+ SmallVectorImpl<Value *> &TransposedMatrix) {
+ assert(Matrix.size() == 4 && "Invalid matrix size");
+ TransposedMatrix.resize(4);
+
+ // dst = src1[0,1],src2[0,1]
+ uint32_t IntMask1[] = {0, 1, 4, 5};
+ ArrayRef<uint32_t> Mask = makeArrayRef(IntMask1, 4);
+ Value *IntrVec1 = Builder.CreateShuffleVector(Matrix[0], Matrix[2], Mask);
+ Value *IntrVec2 = Builder.CreateShuffleVector(Matrix[1], Matrix[3], Mask);
+
+ // dst = src1[2,3],src2[2,3]
+ uint32_t IntMask2[] = {2, 3, 6, 7};
+ Mask = makeArrayRef(IntMask2, 4);
+ Value *IntrVec3 = Builder.CreateShuffleVector(Matrix[0], Matrix[2], Mask);
+ Value *IntrVec4 = Builder.CreateShuffleVector(Matrix[1], Matrix[3], Mask);
+
+ // dst = src1[0],src2[0],src1[2],src2[2]
+ uint32_t IntMask3[] = {0, 4, 2, 6};
+ Mask = makeArrayRef(IntMask3, 4);
+ TransposedMatrix[0] = Builder.CreateShuffleVector(IntrVec1, IntrVec2, Mask);
+ TransposedMatrix[2] = Builder.CreateShuffleVector(IntrVec3, IntrVec4, Mask);
+
+ // dst = src1[1],src2[1],src1[3],src2[3]
+ uint32_t IntMask4[] = {1, 5, 3, 7};
+ Mask = makeArrayRef(IntMask4, 4);
+ TransposedMatrix[1] = Builder.CreateShuffleVector(IntrVec1, IntrVec2, Mask);
+ TransposedMatrix[3] = Builder.CreateShuffleVector(IntrVec3, IntrVec4, Mask);
+}
+
+// Lowers this interleaved access group into X86-specific
+// instructions/intrinsics.
+bool X86InterleavedAccessGroup::lowerIntoOptimizedSequence() {
+ SmallVector<Instruction *, 4> DecomposedVectors;
+ VectorType *VecTy = Shuffles[0]->getType();
+ // Try to generate target-sized register(/instruction).
+ if (!decompose(Inst, Factor, VecTy, DecomposedVectors))
+ return false;
+
+ SmallVector<Value *, 4> TransposedVectors;
+ // Perform matrix-transposition in order to compute interleaved
+ // results by generating some sort of (optimized) target-specific
+ // instructions.
+ transpose_4x4(DecomposedVectors, TransposedVectors);
+
+ // Now replace the unoptimized-interleaved-vectors with the
+ // transposed-interleaved vectors.
+ for (unsigned i = 0; i < Shuffles.size(); i++)
+ Shuffles[i]->replaceAllUsesWith(TransposedVectors[Indices[i]]);
+
+ return true;
+}
+
+// Lower interleaved load(s) into target specific instructions/
+// intrinsics. Lowering sequence varies depending on the vector-types, factor,
+// number of shuffles and ISA.
+// Currently, lowering is supported for 4x64 bits with Factor = 4 on AVX.
+bool X86TargetLowering::lowerInterleavedLoad(
+ LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
+ ArrayRef<unsigned> Indices, unsigned Factor) const {
+ assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
+ "Invalid interleave factor");
+ assert(!Shuffles.empty() && "Empty shufflevector input");
+ assert(Shuffles.size() == Indices.size() &&
+ "Unmatched number of shufflevectors and indices");
+
+ // Create an interleaved access group.
+ IRBuilder<> Builder(LI);
+ X86InterleavedAccessGroup Grp(LI, Shuffles, Indices, Factor, Subtarget,
+ Builder);
+
+ return Grp.isSupported() && Grp.lowerIntoOptimizedSequence();
+}
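
The shuffle masks used by transpose_4x4 above are easier to follow on plain arrays. The stand-alone C++ sketch below applies the same two-stage mask pattern ({0,1,4,5} / {2,3,6,7}, then {0,4,2,6} / {1,5,3,7}) to a 4x4 integer matrix to show that the sequence really performs a transpose; it is an illustration only and not part of the change being imported.

    // Illustration of the transpose_4x4 shuffle pattern on plain arrays.
    // Stage 1 interleaves 2-element halves, stage 2 picks alternating lanes.
    #include <cstdio>

    // shuffle(dst, a, b, mask): indices 0-3 pick from a, 4-7 pick from b,
    // mirroring how IRBuilder::CreateShuffleVector indexes its two operands.
    static void shuffle(int Dst[4], const int A[4], const int B[4],
                        const int Mask[4]) {
      for (int i = 0; i < 4; ++i)
        Dst[i] = Mask[i] < 4 ? A[Mask[i]] : B[Mask[i] - 4];
    }

    int main() {
      int In[4][4] = {{11, 12, 13, 14}, {21, 22, 23, 24},
                      {31, 32, 33, 34}, {41, 42, 43, 44}};
      const int MaskLo[4]   = {0, 1, 4, 5}; // dst = src1[0,1],src2[0,1]
      const int MaskHi[4]   = {2, 3, 6, 7}; // dst = src1[2,3],src2[2,3]
      const int MaskEven[4] = {0, 4, 2, 6}; // dst = src1[0],src2[0],src1[2],src2[2]
      const int MaskOdd[4]  = {1, 5, 3, 7}; // dst = src1[1],src2[1],src1[3],src2[3]

      int T1[4], T2[4], T3[4], T4[4], Out[4][4];
      shuffle(T1, In[0], In[2], MaskLo);
      shuffle(T2, In[1], In[3], MaskLo);
      shuffle(T3, In[0], In[2], MaskHi);
      shuffle(T4, In[1], In[3], MaskHi);
      shuffle(Out[0], T1, T2, MaskEven);
      shuffle(Out[1], T1, T2, MaskOdd);
      shuffle(Out[2], T3, T4, MaskEven);
      shuffle(Out[3], T3, T4, MaskOdd);

      // Prints 11 21 31 41 / 12 22 32 42 / 13 23 33 43 / 14 24 34 44.
      for (int r = 0; r < 4; ++r) {
        for (int c = 0; c < 4; ++c)
          std::printf("%d ", Out[r][c]);
        std::printf("\n");
      }
    }

Running it prints the columns of the input matrix as rows, matching the Out-V0..Out-V3 example in the class comment of X86InterleavedAccessGroup.
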
diff --git a/contrib/llvm/lib/Target/X86/X86IntrinsicsInfo.h b/contrib/llvm/lib/Target/X86/X86IntrinsicsInfo.h
index b647d11..63a02af 100644
--- a/contrib/llvm/lib/Target/X86/X86IntrinsicsInfo.h
+++ b/contrib/llvm/lib/Target/X86/X86IntrinsicsInfo.h
@@ -21,9 +21,10 @@ namespace llvm {
enum IntrinsicType : uint16_t {
INTR_NO_TYPE,
- GATHER, SCATTER, PREFETCH, RDSEED, RDRAND, RDPMC, RDTSC, XTEST, ADX, FPCLASS, FPCLASSS,
- INTR_TYPE_1OP, INTR_TYPE_2OP, INTR_TYPE_2OP_IMM8, INTR_TYPE_3OP, INTR_TYPE_4OP,
+ GATHER, SCATTER, PREFETCH, RDSEED, RDRAND, RDPMC, RDTSC, XTEST, XGETBV, ADX, FPCLASS, FPCLASSS,
+ INTR_TYPE_1OP, INTR_TYPE_2OP, INTR_TYPE_3OP, INTR_TYPE_4OP,
CMP_MASK, CMP_MASK_CC,CMP_MASK_SCALAR_CC, VSHIFT, COMI, COMI_RM,
+ CVTPD2PS, CVTPD2PS_MASK,
INTR_TYPE_1OP_MASK, INTR_TYPE_1OP_MASK_RM,
INTR_TYPE_2OP_MASK, INTR_TYPE_2OP_MASK_RM, INTR_TYPE_2OP_IMM8_MASK,
INTR_TYPE_3OP_MASK, INTR_TYPE_3OP_MASK_RM, INTR_TYPE_3OP_IMM8_MASK,
@@ -33,7 +34,7 @@ enum IntrinsicType : uint16_t {
INTR_TYPE_SCALAR_MASK_RM, INTR_TYPE_3OP_SCALAR_MASK_RM,
COMPRESS_EXPAND_IN_REG, COMPRESS_TO_MEM, BRCST_SUBVEC_TO_VEC, BRCST32x2_TO_VEC,
TRUNCATE_TO_MEM_VI8, TRUNCATE_TO_MEM_VI16, TRUNCATE_TO_MEM_VI32,
- EXPAND_FROM_MEM, INSERT_SUBVEC,
+ EXPAND_FROM_MEM,
TERLOG_OP_MASK, TERLOG_OP_MASKZ, BROADCASTM, KUNPCK, FIXUPIMM, FIXUPIMM_MASKZ, FIXUPIMMS,
FIXUPIMMS_MASKZ, CONVERT_MASK_TO_VEC, CONVERT_TO_MASK
};
@@ -184,6 +185,79 @@ static const IntrinsicData IntrinsicsWithChain[] = {
X86ISD::VTRUNC, 0),
X86_INTRINSIC_DATA(avx512_mask_pmov_wb_mem_512, TRUNCATE_TO_MEM_VI8,
X86ISD::VTRUNC, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_db_mem_128, TRUNCATE_TO_MEM_VI8,
+ X86ISD::VTRUNCS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_db_mem_256, TRUNCATE_TO_MEM_VI8,
+ X86ISD::VTRUNCS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_db_mem_512, TRUNCATE_TO_MEM_VI8,
+ X86ISD::VTRUNCS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_dw_mem_128, TRUNCATE_TO_MEM_VI16,
+ X86ISD::VTRUNCS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_dw_mem_256, TRUNCATE_TO_MEM_VI16,
+ X86ISD::VTRUNCS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_dw_mem_512, TRUNCATE_TO_MEM_VI16,
+ X86ISD::VTRUNCS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_qb_mem_128, TRUNCATE_TO_MEM_VI8,
+ X86ISD::VTRUNCS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_qb_mem_256, TRUNCATE_TO_MEM_VI8,
+ X86ISD::VTRUNCS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_qb_mem_512, TRUNCATE_TO_MEM_VI8,
+ X86ISD::VTRUNCS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_qd_mem_128, TRUNCATE_TO_MEM_VI32,
+ X86ISD::VTRUNCS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_qd_mem_256, TRUNCATE_TO_MEM_VI32,
+ X86ISD::VTRUNCS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_qd_mem_512, TRUNCATE_TO_MEM_VI32,
+ X86ISD::VTRUNCS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_qw_mem_128, TRUNCATE_TO_MEM_VI16,
+ X86ISD::VTRUNCS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_qw_mem_256, TRUNCATE_TO_MEM_VI16,
+ X86ISD::VTRUNCS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_qw_mem_512, TRUNCATE_TO_MEM_VI16,
+ X86ISD::VTRUNCS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_wb_mem_128, TRUNCATE_TO_MEM_VI8,
+ X86ISD::VTRUNCS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_wb_mem_256, TRUNCATE_TO_MEM_VI8,
+ X86ISD::VTRUNCS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_wb_mem_512, TRUNCATE_TO_MEM_VI8,
+ X86ISD::VTRUNCS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_db_mem_128, TRUNCATE_TO_MEM_VI8,
+ X86ISD::VTRUNCUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_db_mem_256, TRUNCATE_TO_MEM_VI8,
+ X86ISD::VTRUNCUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_db_mem_512, TRUNCATE_TO_MEM_VI8,
+ X86ISD::VTRUNCUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_dw_mem_128, TRUNCATE_TO_MEM_VI16,
+ X86ISD::VTRUNCUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_dw_mem_256, TRUNCATE_TO_MEM_VI16,
+ X86ISD::VTRUNCUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_dw_mem_512, TRUNCATE_TO_MEM_VI16,
+ X86ISD::VTRUNCUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_qb_mem_128, TRUNCATE_TO_MEM_VI8,
+ X86ISD::VTRUNCUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_qb_mem_256, TRUNCATE_TO_MEM_VI8,
+ X86ISD::VTRUNCUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_qb_mem_512, TRUNCATE_TO_MEM_VI8,
+ X86ISD::VTRUNCUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_qd_mem_128, TRUNCATE_TO_MEM_VI32,
+ X86ISD::VTRUNCUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_qd_mem_256, TRUNCATE_TO_MEM_VI32,
+ X86ISD::VTRUNCUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_qd_mem_512, TRUNCATE_TO_MEM_VI32,
+ X86ISD::VTRUNCUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_qw_mem_128, TRUNCATE_TO_MEM_VI16,
+ X86ISD::VTRUNCUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_qw_mem_256, TRUNCATE_TO_MEM_VI16,
+ X86ISD::VTRUNCUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_qw_mem_512, TRUNCATE_TO_MEM_VI16,
+ X86ISD::VTRUNCUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_wb_mem_128, TRUNCATE_TO_MEM_VI8,
+ X86ISD::VTRUNCUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_wb_mem_256, TRUNCATE_TO_MEM_VI8,
+ X86ISD::VTRUNCUS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_wb_mem_512, TRUNCATE_TO_MEM_VI8,
+ X86ISD::VTRUNCUS, 0),
+
X86_INTRINSIC_DATA(avx512_scatter_dpd_512, SCATTER, X86::VSCATTERDPDZmr, 0),
X86_INTRINSIC_DATA(avx512_scatter_dpi_512, SCATTER, X86::VPSCATTERDDZmr, 0),
X86_INTRINSIC_DATA(avx512_scatter_dpq_512, SCATTER, X86::VPSCATTERDQZmr, 0),
@@ -228,6 +302,7 @@ static const IntrinsicData IntrinsicsWithChain[] = {
X86_INTRINSIC_DATA(subborrow_u32, ADX, X86ISD::SBB, 0),
X86_INTRINSIC_DATA(subborrow_u64, ADX, X86ISD::SBB, 0),
+ X86_INTRINSIC_DATA(xgetbv, XGETBV, X86::XGETBV, 0),
X86_INTRINSIC_DATA(xtest, XTEST, X86ISD::XTEST, 0),
};
@@ -250,6 +325,11 @@ static const IntrinsicData* getIntrinsicWithChain(uint16_t IntNo) {
* the alphabetical order.
*/
static const IntrinsicData IntrinsicsWithoutChain[] = {
+ X86_INTRINSIC_DATA(avx_cvt_pd2_ps_256,CVTPD2PS, ISD::FP_ROUND, 0),
+ X86_INTRINSIC_DATA(avx_cvt_pd2dq_256, INTR_TYPE_1OP, X86ISD::CVTP2SI, 0),
+ X86_INTRINSIC_DATA(avx_cvtdq2_ps_256, INTR_TYPE_1OP, ISD::SINT_TO_FP, 0),
+ X86_INTRINSIC_DATA(avx_cvtt_pd2dq_256,INTR_TYPE_1OP, ISD::FP_TO_SINT, 0),
+ X86_INTRINSIC_DATA(avx_cvtt_ps2dq_256,INTR_TYPE_1OP, ISD::FP_TO_SINT, 0),
X86_INTRINSIC_DATA(avx_hadd_pd_256, INTR_TYPE_2OP, X86ISD::FHADD, 0),
X86_INTRINSIC_DATA(avx_hadd_ps_256, INTR_TYPE_2OP, X86ISD::FHADD, 0),
X86_INTRINSIC_DATA(avx_hsub_pd_256, INTR_TYPE_2OP, X86ISD::FHSUB, 0),
@@ -288,8 +368,11 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx2_phadd_w, INTR_TYPE_2OP, X86ISD::HADD, 0),
X86_INTRINSIC_DATA(avx2_phsub_d, INTR_TYPE_2OP, X86ISD::HSUB, 0),
X86_INTRINSIC_DATA(avx2_phsub_w, INTR_TYPE_2OP, X86ISD::HSUB, 0),
+ X86_INTRINSIC_DATA(avx2_pmadd_ub_sw, INTR_TYPE_2OP, X86ISD::VPMADDUBSW, 0),
+ X86_INTRINSIC_DATA(avx2_pmadd_wd, INTR_TYPE_2OP, X86ISD::VPMADDWD, 0),
X86_INTRINSIC_DATA(avx2_pmovmskb, INTR_TYPE_1OP, X86ISD::MOVMSK, 0),
X86_INTRINSIC_DATA(avx2_pmul_dq, INTR_TYPE_2OP, X86ISD::PMULDQ, 0),
+ X86_INTRINSIC_DATA(avx2_pmul_hr_sw, INTR_TYPE_2OP, X86ISD::MULHRS, 0),
X86_INTRINSIC_DATA(avx2_pmulh_w, INTR_TYPE_2OP, ISD::MULHS, 0),
X86_INTRINSIC_DATA(avx2_pmulhu_w, INTR_TYPE_2OP, ISD::MULHU, 0),
X86_INTRINSIC_DATA(avx2_pmulu_dq, INTR_TYPE_2OP, X86ISD::PMULUDQ, 0),
@@ -353,21 +436,20 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_cvtq2mask_128, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
X86_INTRINSIC_DATA(avx512_cvtq2mask_256, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
X86_INTRINSIC_DATA(avx512_cvtq2mask_512, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
- X86_INTRINSIC_DATA(avx512_cvtsi2sd32, INTR_TYPE_3OP, X86ISD::SINT_TO_FP_RND, 0),
- X86_INTRINSIC_DATA(avx512_cvtsi2sd64, INTR_TYPE_3OP, X86ISD::SINT_TO_FP_RND, 0),
- X86_INTRINSIC_DATA(avx512_cvtsi2ss32, INTR_TYPE_3OP, X86ISD::SINT_TO_FP_RND, 0),
- X86_INTRINSIC_DATA(avx512_cvtsi2ss64, INTR_TYPE_3OP, X86ISD::SINT_TO_FP_RND, 0),
- X86_INTRINSIC_DATA(avx512_cvttsd2si, INTR_TYPE_2OP, X86ISD::FP_TO_SINT_RND, 0),
- X86_INTRINSIC_DATA(avx512_cvttsd2si64, INTR_TYPE_2OP, X86ISD::FP_TO_SINT_RND, 0),
- X86_INTRINSIC_DATA(avx512_cvttsd2usi, INTR_TYPE_2OP, X86ISD::FP_TO_UINT_RND, 0),
- X86_INTRINSIC_DATA(avx512_cvttsd2usi64, INTR_TYPE_2OP, X86ISD::FP_TO_UINT_RND, 0),
- X86_INTRINSIC_DATA(avx512_cvttss2si, INTR_TYPE_2OP, X86ISD::FP_TO_SINT_RND, 0),
- X86_INTRINSIC_DATA(avx512_cvttss2si64, INTR_TYPE_2OP, X86ISD::FP_TO_SINT_RND, 0),
- X86_INTRINSIC_DATA(avx512_cvttss2usi, INTR_TYPE_2OP, X86ISD::FP_TO_UINT_RND, 0),
- X86_INTRINSIC_DATA(avx512_cvttss2usi64, INTR_TYPE_2OP, X86ISD::FP_TO_UINT_RND, 0),
- X86_INTRINSIC_DATA(avx512_cvtusi2ss, INTR_TYPE_3OP, X86ISD::UINT_TO_FP_RND, 0),
- X86_INTRINSIC_DATA(avx512_cvtusi642sd, INTR_TYPE_3OP, X86ISD::UINT_TO_FP_RND, 0),
- X86_INTRINSIC_DATA(avx512_cvtusi642ss, INTR_TYPE_3OP, X86ISD::UINT_TO_FP_RND, 0),
+ X86_INTRINSIC_DATA(avx512_cvtsi2sd64, INTR_TYPE_3OP, X86ISD::SCALAR_SINT_TO_FP_RND, 0),
+ X86_INTRINSIC_DATA(avx512_cvtsi2ss32, INTR_TYPE_3OP, X86ISD::SCALAR_SINT_TO_FP_RND, 0),
+ X86_INTRINSIC_DATA(avx512_cvtsi2ss64, INTR_TYPE_3OP, X86ISD::SCALAR_SINT_TO_FP_RND, 0),
+ X86_INTRINSIC_DATA(avx512_cvttsd2si, INTR_TYPE_2OP, X86ISD::CVTTS2SI_RND, 0),
+ X86_INTRINSIC_DATA(avx512_cvttsd2si64, INTR_TYPE_2OP, X86ISD::CVTTS2SI_RND, 0),
+ X86_INTRINSIC_DATA(avx512_cvttsd2usi, INTR_TYPE_2OP, X86ISD::CVTTS2UI_RND, 0),
+ X86_INTRINSIC_DATA(avx512_cvttsd2usi64, INTR_TYPE_2OP, X86ISD::CVTTS2UI_RND, 0),
+ X86_INTRINSIC_DATA(avx512_cvttss2si, INTR_TYPE_2OP, X86ISD::CVTTS2SI_RND, 0),
+ X86_INTRINSIC_DATA(avx512_cvttss2si64, INTR_TYPE_2OP, X86ISD::CVTTS2SI_RND, 0),
+ X86_INTRINSIC_DATA(avx512_cvttss2usi, INTR_TYPE_2OP, X86ISD::CVTTS2UI_RND, 0),
+ X86_INTRINSIC_DATA(avx512_cvttss2usi64, INTR_TYPE_2OP, X86ISD::CVTTS2UI_RND, 0),
+ X86_INTRINSIC_DATA(avx512_cvtusi2ss, INTR_TYPE_3OP, X86ISD::SCALAR_UINT_TO_FP_RND, 0),
+ X86_INTRINSIC_DATA(avx512_cvtusi642sd, INTR_TYPE_3OP, X86ISD::SCALAR_UINT_TO_FP_RND, 0),
+ X86_INTRINSIC_DATA(avx512_cvtusi642ss, INTR_TYPE_3OP, X86ISD::SCALAR_UINT_TO_FP_RND, 0),
X86_INTRINSIC_DATA(avx512_cvtw2mask_128, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
X86_INTRINSIC_DATA(avx512_cvtw2mask_256, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
X86_INTRINSIC_DATA(avx512_cvtw2mask_512, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
@@ -377,30 +459,14 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_kunpck_dq, KUNPCK, ISD::CONCAT_VECTORS, 0),
X86_INTRINSIC_DATA(avx512_kunpck_wd, KUNPCK, ISD::CONCAT_VECTORS, 0),
- X86_INTRINSIC_DATA(avx512_mask_add_pd_128, INTR_TYPE_2OP_MASK, ISD::FADD, 0),
- X86_INTRINSIC_DATA(avx512_mask_add_pd_256, INTR_TYPE_2OP_MASK, ISD::FADD, 0),
X86_INTRINSIC_DATA(avx512_mask_add_pd_512, INTR_TYPE_2OP_MASK, ISD::FADD,
X86ISD::FADD_RND),
- X86_INTRINSIC_DATA(avx512_mask_add_ps_128, INTR_TYPE_2OP_MASK, ISD::FADD, 0),
- X86_INTRINSIC_DATA(avx512_mask_add_ps_256, INTR_TYPE_2OP_MASK, ISD::FADD, 0),
X86_INTRINSIC_DATA(avx512_mask_add_ps_512, INTR_TYPE_2OP_MASK, ISD::FADD,
X86ISD::FADD_RND),
- X86_INTRINSIC_DATA(avx512_mask_add_sd_round, INTR_TYPE_SCALAR_MASK_RM, ISD::FADD,
- X86ISD::FADD_RND),
- X86_INTRINSIC_DATA(avx512_mask_add_ss_round, INTR_TYPE_SCALAR_MASK_RM, ISD::FADD,
- X86ISD::FADD_RND),
- X86_INTRINSIC_DATA(avx512_mask_and_pd_128, INTR_TYPE_2OP_MASK, X86ISD::FAND, 0),
- X86_INTRINSIC_DATA(avx512_mask_and_pd_256, INTR_TYPE_2OP_MASK, X86ISD::FAND, 0),
- X86_INTRINSIC_DATA(avx512_mask_and_pd_512, INTR_TYPE_2OP_MASK, X86ISD::FAND, 0),
- X86_INTRINSIC_DATA(avx512_mask_and_ps_128, INTR_TYPE_2OP_MASK, X86ISD::FAND, 0),
- X86_INTRINSIC_DATA(avx512_mask_and_ps_256, INTR_TYPE_2OP_MASK, X86ISD::FAND, 0),
- X86_INTRINSIC_DATA(avx512_mask_and_ps_512, INTR_TYPE_2OP_MASK, X86ISD::FAND, 0),
- X86_INTRINSIC_DATA(avx512_mask_andn_pd_128, INTR_TYPE_2OP_MASK, X86ISD::FANDN, 0),
- X86_INTRINSIC_DATA(avx512_mask_andn_pd_256, INTR_TYPE_2OP_MASK, X86ISD::FANDN, 0),
- X86_INTRINSIC_DATA(avx512_mask_andn_pd_512, INTR_TYPE_2OP_MASK, X86ISD::FANDN, 0),
- X86_INTRINSIC_DATA(avx512_mask_andn_ps_128, INTR_TYPE_2OP_MASK, X86ISD::FANDN, 0),
- X86_INTRINSIC_DATA(avx512_mask_andn_ps_256, INTR_TYPE_2OP_MASK, X86ISD::FANDN, 0),
- X86_INTRINSIC_DATA(avx512_mask_andn_ps_512, INTR_TYPE_2OP_MASK, X86ISD::FANDN, 0),
+ X86_INTRINSIC_DATA(avx512_mask_add_sd_round, INTR_TYPE_SCALAR_MASK_RM,
+ X86ISD::FADD_RND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_add_ss_round, INTR_TYPE_SCALAR_MASK_RM,
+ X86ISD::FADD_RND, 0),
X86_INTRINSIC_DATA(avx512_mask_broadcastf32x2_256, BRCST32x2_TO_VEC,
X86ISD::VBROADCAST, 0),
X86_INTRINSIC_DATA(avx512_mask_broadcastf32x2_512, BRCST32x2_TO_VEC,
@@ -452,10 +518,10 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_mask_cmp_q_128, CMP_MASK_CC, X86ISD::CMPM, 0),
X86_INTRINSIC_DATA(avx512_mask_cmp_q_256, CMP_MASK_CC, X86ISD::CMPM, 0),
X86_INTRINSIC_DATA(avx512_mask_cmp_q_512, CMP_MASK_CC, X86ISD::CMPM, 0),
- X86_INTRINSIC_DATA(avx512_mask_cmp_sd, CMP_MASK_SCALAR_CC, X86ISD::FSETCC,
- X86ISD::FSETCC),
- X86_INTRINSIC_DATA(avx512_mask_cmp_ss, CMP_MASK_SCALAR_CC, X86ISD::FSETCC,
- X86ISD::FSETCC),
+ X86_INTRINSIC_DATA(avx512_mask_cmp_sd, CMP_MASK_SCALAR_CC,
+ X86ISD::FSETCCM, X86ISD::FSETCCM_RND),
+ X86_INTRINSIC_DATA(avx512_mask_cmp_ss, CMP_MASK_SCALAR_CC,
+ X86ISD::FSETCCM, X86ISD::FSETCCM_RND),
X86_INTRINSIC_DATA(avx512_mask_cmp_w_128, CMP_MASK_CC, X86ISD::CMPM, 0),
X86_INTRINSIC_DATA(avx512_mask_cmp_w_256, CMP_MASK_CC, X86ISD::CMPM, 0),
X86_INTRINSIC_DATA(avx512_mask_cmp_w_512, CMP_MASK_CC, X86ISD::CMPM, 0),
@@ -495,184 +561,168 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86ISD::CONFLICT, 0),
X86_INTRINSIC_DATA(avx512_mask_conflict_q_512, INTR_TYPE_1OP_MASK,
X86ISD::CONFLICT, 0),
- X86_INTRINSIC_DATA(avx512_mask_cvtdq2pd_128, INTR_TYPE_1OP_MASK,
- X86ISD::CVTDQ2PD, 0),
- X86_INTRINSIC_DATA(avx512_mask_cvtdq2pd_256, INTR_TYPE_1OP_MASK,
- ISD::SINT_TO_FP, 0),
- X86_INTRINSIC_DATA(avx512_mask_cvtdq2pd_512, INTR_TYPE_1OP_MASK,
- ISD::SINT_TO_FP, 0), // no rm
X86_INTRINSIC_DATA(avx512_mask_cvtdq2ps_128, INTR_TYPE_1OP_MASK,
ISD::SINT_TO_FP, 0),
X86_INTRINSIC_DATA(avx512_mask_cvtdq2ps_256, INTR_TYPE_1OP_MASK,
ISD::SINT_TO_FP, 0),
X86_INTRINSIC_DATA(avx512_mask_cvtdq2ps_512, INTR_TYPE_1OP_MASK,
- ISD::SINT_TO_FP, ISD::SINT_TO_FP), //er
+ ISD::SINT_TO_FP, X86ISD::SINT_TO_FP_RND), //er
X86_INTRINSIC_DATA(avx512_mask_cvtpd2dq_128, INTR_TYPE_1OP_MASK,
- X86ISD::FP_TO_SINT_RND, 0),
+ X86ISD::CVTP2SI, 0),
X86_INTRINSIC_DATA(avx512_mask_cvtpd2dq_256, INTR_TYPE_1OP_MASK,
- X86ISD::FP_TO_SINT_RND, 0),
+ X86ISD::CVTP2SI, 0),
X86_INTRINSIC_DATA(avx512_mask_cvtpd2dq_512, INTR_TYPE_1OP_MASK,
- X86ISD::FP_TO_SINT_RND, X86ISD::FP_TO_SINT_RND),
+ X86ISD::CVTP2SI, X86ISD::CVTP2SI_RND),
X86_INTRINSIC_DATA(avx512_mask_cvtpd2ps, INTR_TYPE_1OP_MASK,
X86ISD::VFPROUND, 0),
- X86_INTRINSIC_DATA(avx512_mask_cvtpd2ps_256, INTR_TYPE_1OP_MASK_RM,
+ X86_INTRINSIC_DATA(avx512_mask_cvtpd2ps_256, CVTPD2PS_MASK,
ISD::FP_ROUND, 0),
- X86_INTRINSIC_DATA(avx512_mask_cvtpd2ps_512, INTR_TYPE_1OP_MASK_RM,
- ISD::FP_ROUND, X86ISD::VFPROUND),
+ X86_INTRINSIC_DATA(avx512_mask_cvtpd2ps_512, CVTPD2PS_MASK,
+ ISD::FP_ROUND, X86ISD::VFPROUND_RND),
X86_INTRINSIC_DATA(avx512_mask_cvtpd2qq_128, INTR_TYPE_1OP_MASK,
- X86ISD::FP_TO_SINT_RND, 0),
+ X86ISD::CVTP2SI, 0),
X86_INTRINSIC_DATA(avx512_mask_cvtpd2qq_256, INTR_TYPE_1OP_MASK,
- X86ISD::FP_TO_SINT_RND, 0),
+ X86ISD::CVTP2SI, 0),
X86_INTRINSIC_DATA(avx512_mask_cvtpd2qq_512, INTR_TYPE_1OP_MASK,
- X86ISD::FP_TO_SINT_RND, X86ISD::FP_TO_SINT_RND),
+ X86ISD::CVTP2SI, X86ISD::CVTP2SI_RND),
X86_INTRINSIC_DATA(avx512_mask_cvtpd2udq_128, INTR_TYPE_1OP_MASK,
- X86ISD::FP_TO_UINT_RND, 0),
+ X86ISD::CVTP2UI, 0),
X86_INTRINSIC_DATA(avx512_mask_cvtpd2udq_256, INTR_TYPE_1OP_MASK,
- X86ISD::FP_TO_UINT_RND, 0),
+ X86ISD::CVTP2UI, 0),
X86_INTRINSIC_DATA(avx512_mask_cvtpd2udq_512, INTR_TYPE_1OP_MASK,
- X86ISD::FP_TO_UINT_RND, X86ISD::FP_TO_UINT_RND),
+ X86ISD::CVTP2UI, X86ISD::CVTP2UI_RND),
X86_INTRINSIC_DATA(avx512_mask_cvtpd2uqq_128, INTR_TYPE_1OP_MASK,
- X86ISD::FP_TO_UINT_RND, 0),
+ X86ISD::CVTP2UI, 0),
X86_INTRINSIC_DATA(avx512_mask_cvtpd2uqq_256, INTR_TYPE_1OP_MASK,
- X86ISD::FP_TO_UINT_RND, 0),
+ X86ISD::CVTP2UI, 0),
X86_INTRINSIC_DATA(avx512_mask_cvtpd2uqq_512, INTR_TYPE_1OP_MASK,
- X86ISD::FP_TO_UINT_RND, X86ISD::FP_TO_UINT_RND),
+ X86ISD::CVTP2UI, X86ISD::CVTP2UI_RND),
X86_INTRINSIC_DATA(avx512_mask_cvtps2dq_128, INTR_TYPE_1OP_MASK,
- X86ISD::FP_TO_SINT_RND, 0),
+ X86ISD::CVTP2SI, 0),
X86_INTRINSIC_DATA(avx512_mask_cvtps2dq_256, INTR_TYPE_1OP_MASK,
- X86ISD::FP_TO_SINT_RND, 0),
+ X86ISD::CVTP2SI, 0),
X86_INTRINSIC_DATA(avx512_mask_cvtps2dq_512, INTR_TYPE_1OP_MASK,
- X86ISD::FP_TO_SINT_RND, X86ISD::FP_TO_SINT_RND),
+ X86ISD::CVTP2SI, X86ISD::CVTP2SI_RND),
X86_INTRINSIC_DATA(avx512_mask_cvtps2pd_128, INTR_TYPE_1OP_MASK,
X86ISD::VFPEXT, 0),
X86_INTRINSIC_DATA(avx512_mask_cvtps2pd_256, INTR_TYPE_1OP_MASK,
ISD::FP_EXTEND, 0),
X86_INTRINSIC_DATA(avx512_mask_cvtps2pd_512, INTR_TYPE_1OP_MASK,
- ISD::FP_EXTEND, X86ISD::VFPEXT),
+ ISD::FP_EXTEND, X86ISD::VFPEXT_RND),
X86_INTRINSIC_DATA(avx512_mask_cvtps2qq_128, INTR_TYPE_1OP_MASK,
- X86ISD::FP_TO_SINT_RND, 0),
+ X86ISD::CVTP2SI, 0),
X86_INTRINSIC_DATA(avx512_mask_cvtps2qq_256, INTR_TYPE_1OP_MASK,
- X86ISD::FP_TO_SINT_RND, 0),
+ X86ISD::CVTP2SI, 0),
X86_INTRINSIC_DATA(avx512_mask_cvtps2qq_512, INTR_TYPE_1OP_MASK,
- X86ISD::FP_TO_SINT_RND, X86ISD::FP_TO_SINT_RND),
+ X86ISD::CVTP2SI, X86ISD::CVTP2SI_RND),
X86_INTRINSIC_DATA(avx512_mask_cvtps2udq_128, INTR_TYPE_1OP_MASK,
- X86ISD::FP_TO_UINT_RND, 0),
+ X86ISD::CVTP2UI, 0),
X86_INTRINSIC_DATA(avx512_mask_cvtps2udq_256, INTR_TYPE_1OP_MASK,
- X86ISD::FP_TO_UINT_RND, 0),
+ X86ISD::CVTP2UI, 0),
X86_INTRINSIC_DATA(avx512_mask_cvtps2udq_512, INTR_TYPE_1OP_MASK,
- X86ISD::FP_TO_UINT_RND, X86ISD::FP_TO_UINT_RND),
+ X86ISD::CVTP2UI, X86ISD::CVTP2UI_RND),
X86_INTRINSIC_DATA(avx512_mask_cvtps2uqq_128, INTR_TYPE_1OP_MASK,
- X86ISD::FP_TO_UINT_RND, 0),
+ X86ISD::CVTP2UI, 0),
X86_INTRINSIC_DATA(avx512_mask_cvtps2uqq_256, INTR_TYPE_1OP_MASK,
- X86ISD::FP_TO_UINT_RND, 0),
+ X86ISD::CVTP2UI, 0),
X86_INTRINSIC_DATA(avx512_mask_cvtps2uqq_512, INTR_TYPE_1OP_MASK,
- X86ISD::FP_TO_UINT_RND, X86ISD::FP_TO_UINT_RND),
+ X86ISD::CVTP2UI, X86ISD::CVTP2UI_RND),
X86_INTRINSIC_DATA(avx512_mask_cvtqq2pd_128, INTR_TYPE_1OP_MASK,
ISD::SINT_TO_FP, 0),
X86_INTRINSIC_DATA(avx512_mask_cvtqq2pd_256, INTR_TYPE_1OP_MASK,
ISD::SINT_TO_FP, 0),
X86_INTRINSIC_DATA(avx512_mask_cvtqq2pd_512, INTR_TYPE_1OP_MASK,
- ISD::SINT_TO_FP, ISD::SINT_TO_FP),
+ ISD::SINT_TO_FP, X86ISD::SINT_TO_FP_RND),
X86_INTRINSIC_DATA(avx512_mask_cvtqq2ps_128, INTR_TYPE_1OP_MASK,
- ISD::SINT_TO_FP, 0),
+ X86ISD::CVTSI2P, 0),
X86_INTRINSIC_DATA(avx512_mask_cvtqq2ps_256, INTR_TYPE_1OP_MASK,
ISD::SINT_TO_FP, 0),
X86_INTRINSIC_DATA(avx512_mask_cvtqq2ps_512, INTR_TYPE_1OP_MASK,
- ISD::SINT_TO_FP, ISD::SINT_TO_FP),
+ ISD::SINT_TO_FP, X86ISD::SINT_TO_FP_RND),
X86_INTRINSIC_DATA(avx512_mask_cvtsd2ss_round, INTR_TYPE_SCALAR_MASK_RM,
- X86ISD::VFPROUND, 0),
+ X86ISD::VFPROUNDS_RND, 0),
X86_INTRINSIC_DATA(avx512_mask_cvtss2sd_round, INTR_TYPE_SCALAR_MASK_RM,
- X86ISD::VFPEXT, 0),
+ X86ISD::VFPEXTS_RND, 0),
X86_INTRINSIC_DATA(avx512_mask_cvttpd2dq_128, INTR_TYPE_1OP_MASK,
- ISD::FP_TO_SINT, 0),
+ X86ISD::CVTTP2SI, 0),
X86_INTRINSIC_DATA(avx512_mask_cvttpd2dq_256, INTR_TYPE_1OP_MASK,
ISD::FP_TO_SINT, 0),
X86_INTRINSIC_DATA(avx512_mask_cvttpd2dq_512, INTR_TYPE_1OP_MASK,
- ISD::FP_TO_SINT, ISD::FP_TO_SINT),
+ ISD::FP_TO_SINT, X86ISD::CVTTP2SI_RND),
X86_INTRINSIC_DATA(avx512_mask_cvttpd2qq_128, INTR_TYPE_1OP_MASK,
ISD::FP_TO_SINT, 0),
X86_INTRINSIC_DATA(avx512_mask_cvttpd2qq_256, INTR_TYPE_1OP_MASK,
ISD::FP_TO_SINT, 0),
X86_INTRINSIC_DATA(avx512_mask_cvttpd2qq_512, INTR_TYPE_1OP_MASK,
- ISD::FP_TO_SINT, ISD::FP_TO_SINT),
+ ISD::FP_TO_SINT, X86ISD::CVTTP2SI_RND),
X86_INTRINSIC_DATA(avx512_mask_cvttpd2udq_128, INTR_TYPE_1OP_MASK,
- ISD::FP_TO_UINT, 0),
+ X86ISD::CVTTP2UI, 0),
X86_INTRINSIC_DATA(avx512_mask_cvttpd2udq_256, INTR_TYPE_1OP_MASK,
ISD::FP_TO_UINT, 0),
X86_INTRINSIC_DATA(avx512_mask_cvttpd2udq_512, INTR_TYPE_1OP_MASK,
- ISD::FP_TO_UINT, ISD::FP_TO_UINT),
+ ISD::FP_TO_UINT, X86ISD::CVTTP2UI_RND),
X86_INTRINSIC_DATA(avx512_mask_cvttpd2uqq_128, INTR_TYPE_1OP_MASK,
ISD::FP_TO_UINT, 0),
X86_INTRINSIC_DATA(avx512_mask_cvttpd2uqq_256, INTR_TYPE_1OP_MASK,
ISD::FP_TO_UINT, 0),
X86_INTRINSIC_DATA(avx512_mask_cvttpd2uqq_512, INTR_TYPE_1OP_MASK,
- ISD::FP_TO_UINT, ISD::FP_TO_UINT),
+ ISD::FP_TO_UINT, X86ISD::CVTTP2UI_RND),
X86_INTRINSIC_DATA(avx512_mask_cvttps2dq_128, INTR_TYPE_1OP_MASK,
ISD::FP_TO_SINT, 0),
X86_INTRINSIC_DATA(avx512_mask_cvttps2dq_256, INTR_TYPE_1OP_MASK,
ISD::FP_TO_SINT, 0),
X86_INTRINSIC_DATA(avx512_mask_cvttps2dq_512, INTR_TYPE_1OP_MASK,
- ISD::FP_TO_SINT, ISD::FP_TO_SINT),
+ ISD::FP_TO_SINT, X86ISD::CVTTP2SI_RND),
X86_INTRINSIC_DATA(avx512_mask_cvttps2qq_128, INTR_TYPE_1OP_MASK,
- ISD::FP_TO_SINT, 0),
+ X86ISD::CVTTP2SI, 0),
X86_INTRINSIC_DATA(avx512_mask_cvttps2qq_256, INTR_TYPE_1OP_MASK,
ISD::FP_TO_SINT, 0),
X86_INTRINSIC_DATA(avx512_mask_cvttps2qq_512, INTR_TYPE_1OP_MASK,
- ISD::FP_TO_SINT, ISD::FP_TO_SINT),
+ ISD::FP_TO_SINT, X86ISD::CVTTP2SI_RND),
X86_INTRINSIC_DATA(avx512_mask_cvttps2udq_128, INTR_TYPE_1OP_MASK,
ISD::FP_TO_UINT, 0),
X86_INTRINSIC_DATA(avx512_mask_cvttps2udq_256, INTR_TYPE_1OP_MASK,
ISD::FP_TO_UINT, 0),
X86_INTRINSIC_DATA(avx512_mask_cvttps2udq_512, INTR_TYPE_1OP_MASK,
- ISD::FP_TO_UINT, ISD::FP_TO_UINT),
+ ISD::FP_TO_UINT, X86ISD::CVTTP2UI_RND),
X86_INTRINSIC_DATA(avx512_mask_cvttps2uqq_128, INTR_TYPE_1OP_MASK,
- ISD::FP_TO_UINT, 0),
+ X86ISD::CVTTP2UI, 0),
X86_INTRINSIC_DATA(avx512_mask_cvttps2uqq_256, INTR_TYPE_1OP_MASK,
ISD::FP_TO_UINT, 0),
X86_INTRINSIC_DATA(avx512_mask_cvttps2uqq_512, INTR_TYPE_1OP_MASK,
- ISD::FP_TO_UINT, ISD::FP_TO_UINT),
- X86_INTRINSIC_DATA(avx512_mask_cvtudq2pd_128, INTR_TYPE_1OP_MASK,
- X86ISD::CVTUDQ2PD, 0),
- X86_INTRINSIC_DATA(avx512_mask_cvtudq2pd_256, INTR_TYPE_1OP_MASK,
- ISD::UINT_TO_FP, 0),
- X86_INTRINSIC_DATA(avx512_mask_cvtudq2pd_512, INTR_TYPE_1OP_MASK,
- ISD::UINT_TO_FP, 0), // no rm
+ ISD::FP_TO_UINT, X86ISD::CVTTP2UI_RND),
X86_INTRINSIC_DATA(avx512_mask_cvtudq2ps_128, INTR_TYPE_1OP_MASK,
ISD::UINT_TO_FP, 0),
X86_INTRINSIC_DATA(avx512_mask_cvtudq2ps_256, INTR_TYPE_1OP_MASK,
ISD::UINT_TO_FP, 0),
X86_INTRINSIC_DATA(avx512_mask_cvtudq2ps_512, INTR_TYPE_1OP_MASK,
- ISD::UINT_TO_FP, ISD::UINT_TO_FP),
+ ISD::UINT_TO_FP, X86ISD::UINT_TO_FP_RND),
X86_INTRINSIC_DATA(avx512_mask_cvtuqq2pd_128, INTR_TYPE_1OP_MASK,
ISD::UINT_TO_FP, 0),
X86_INTRINSIC_DATA(avx512_mask_cvtuqq2pd_256, INTR_TYPE_1OP_MASK,
ISD::UINT_TO_FP, 0),
X86_INTRINSIC_DATA(avx512_mask_cvtuqq2pd_512, INTR_TYPE_1OP_MASK,
- ISD::UINT_TO_FP, ISD::UINT_TO_FP),
+ ISD::UINT_TO_FP, X86ISD::UINT_TO_FP_RND),
X86_INTRINSIC_DATA(avx512_mask_cvtuqq2ps_128, INTR_TYPE_1OP_MASK,
- ISD::UINT_TO_FP, 0),
+ X86ISD::CVTUI2P, 0),
X86_INTRINSIC_DATA(avx512_mask_cvtuqq2ps_256, INTR_TYPE_1OP_MASK,
ISD::UINT_TO_FP, 0),
X86_INTRINSIC_DATA(avx512_mask_cvtuqq2ps_512, INTR_TYPE_1OP_MASK,
- ISD::UINT_TO_FP, ISD::UINT_TO_FP),
+ ISD::UINT_TO_FP, X86ISD::UINT_TO_FP_RND),
X86_INTRINSIC_DATA(avx512_mask_dbpsadbw_128, INTR_TYPE_3OP_IMM8_MASK,
X86ISD::DBPSADBW, 0),
X86_INTRINSIC_DATA(avx512_mask_dbpsadbw_256, INTR_TYPE_3OP_IMM8_MASK,
X86ISD::DBPSADBW, 0),
X86_INTRINSIC_DATA(avx512_mask_dbpsadbw_512, INTR_TYPE_3OP_IMM8_MASK,
X86ISD::DBPSADBW, 0),
- X86_INTRINSIC_DATA(avx512_mask_div_pd_128, INTR_TYPE_2OP_MASK, ISD::FDIV, 0),
- X86_INTRINSIC_DATA(avx512_mask_div_pd_256, INTR_TYPE_2OP_MASK, ISD::FDIV, 0),
X86_INTRINSIC_DATA(avx512_mask_div_pd_512, INTR_TYPE_2OP_MASK, ISD::FDIV,
X86ISD::FDIV_RND),
- X86_INTRINSIC_DATA(avx512_mask_div_ps_128, INTR_TYPE_2OP_MASK, ISD::FDIV, 0),
- X86_INTRINSIC_DATA(avx512_mask_div_ps_256, INTR_TYPE_2OP_MASK, ISD::FDIV, 0),
X86_INTRINSIC_DATA(avx512_mask_div_ps_512, INTR_TYPE_2OP_MASK, ISD::FDIV,
X86ISD::FDIV_RND),
- X86_INTRINSIC_DATA(avx512_mask_div_sd_round, INTR_TYPE_SCALAR_MASK_RM, ISD::FDIV,
- X86ISD::FDIV_RND),
- X86_INTRINSIC_DATA(avx512_mask_div_ss_round, INTR_TYPE_SCALAR_MASK_RM, ISD::FDIV,
- X86ISD::FDIV_RND),
+ X86_INTRINSIC_DATA(avx512_mask_div_sd_round, INTR_TYPE_SCALAR_MASK_RM,
+ X86ISD::FDIV_RND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_div_ss_round, INTR_TYPE_SCALAR_MASK_RM,
+ X86ISD::FDIV_RND, 0),
X86_INTRINSIC_DATA(avx512_mask_expand_d_128, COMPRESS_EXPAND_IN_REG,
X86ISD::EXPAND, 0),
X86_INTRINSIC_DATA(avx512_mask_expand_d_256, COMPRESS_EXPAND_IN_REG,
@@ -726,9 +776,9 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_mask_getexp_ps_512, INTR_TYPE_1OP_MASK_RM,
X86ISD::FGETEXP_RND, 0),
X86_INTRINSIC_DATA(avx512_mask_getexp_sd, INTR_TYPE_SCALAR_MASK_RM,
- X86ISD::FGETEXP_RND, 0),
+ X86ISD::FGETEXPS_RND, 0),
X86_INTRINSIC_DATA(avx512_mask_getexp_ss, INTR_TYPE_SCALAR_MASK_RM,
- X86ISD::FGETEXP_RND, 0),
+ X86ISD::FGETEXPS_RND, 0),
X86_INTRINSIC_DATA(avx512_mask_getmant_pd_128, INTR_TYPE_2OP_MASK_RM,
X86ISD::VGETMANT, 0),
X86_INTRINSIC_DATA(avx512_mask_getmant_pd_256, INTR_TYPE_2OP_MASK_RM,
@@ -742,33 +792,9 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_mask_getmant_ps_512, INTR_TYPE_2OP_MASK_RM,
X86ISD::VGETMANT, 0),
X86_INTRINSIC_DATA(avx512_mask_getmant_sd, INTR_TYPE_3OP_SCALAR_MASK_RM,
- X86ISD::VGETMANT, 0),
+ X86ISD::VGETMANTS, 0),
X86_INTRINSIC_DATA(avx512_mask_getmant_ss, INTR_TYPE_3OP_SCALAR_MASK_RM,
- X86ISD::VGETMANT, 0),
- X86_INTRINSIC_DATA(avx512_mask_insertf32x4_256, INSERT_SUBVEC,
- ISD::INSERT_SUBVECTOR, 0),
- X86_INTRINSIC_DATA(avx512_mask_insertf32x4_512, INSERT_SUBVEC,
- ISD::INSERT_SUBVECTOR, 0),
- X86_INTRINSIC_DATA(avx512_mask_insertf32x8_512, INSERT_SUBVEC,
- ISD::INSERT_SUBVECTOR, 0),
- X86_INTRINSIC_DATA(avx512_mask_insertf64x2_256, INSERT_SUBVEC,
- ISD::INSERT_SUBVECTOR, 0),
- X86_INTRINSIC_DATA(avx512_mask_insertf64x2_512, INSERT_SUBVEC,
- ISD::INSERT_SUBVECTOR, 0),
- X86_INTRINSIC_DATA(avx512_mask_insertf64x4_512, INSERT_SUBVEC,
- ISD::INSERT_SUBVECTOR, 0),
- X86_INTRINSIC_DATA(avx512_mask_inserti32x4_256, INSERT_SUBVEC,
- ISD::INSERT_SUBVECTOR, 0),
- X86_INTRINSIC_DATA(avx512_mask_inserti32x4_512, INSERT_SUBVEC,
- ISD::INSERT_SUBVECTOR, 0),
- X86_INTRINSIC_DATA(avx512_mask_inserti32x8_512, INSERT_SUBVEC,
- ISD::INSERT_SUBVECTOR, 0),
- X86_INTRINSIC_DATA(avx512_mask_inserti64x2_256, INSERT_SUBVEC,
- ISD::INSERT_SUBVECTOR, 0),
- X86_INTRINSIC_DATA(avx512_mask_inserti64x2_512, INSERT_SUBVEC,
- ISD::INSERT_SUBVECTOR, 0),
- X86_INTRINSIC_DATA(avx512_mask_inserti64x4_512, INSERT_SUBVEC,
- ISD::INSERT_SUBVECTOR, 0),
+ X86ISD::VGETMANTS, 0),
X86_INTRINSIC_DATA(avx512_mask_lzcnt_d_128, INTR_TYPE_1OP_MASK,
ISD::CTLZ, 0),
X86_INTRINSIC_DATA(avx512_mask_lzcnt_d_256, INTR_TYPE_1OP_MASK,
@@ -790,9 +816,9 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_mask_max_ps_512, INTR_TYPE_2OP_MASK, X86ISD::FMAX,
X86ISD::FMAX_RND),
X86_INTRINSIC_DATA(avx512_mask_max_sd_round, INTR_TYPE_SCALAR_MASK_RM,
- X86ISD::FMAX, X86ISD::FMAX_RND),
+ X86ISD::FMAX_RND, 0),
X86_INTRINSIC_DATA(avx512_mask_max_ss_round, INTR_TYPE_SCALAR_MASK_RM,
- X86ISD::FMAX, X86ISD::FMAX_RND),
+ X86ISD::FMAX_RND, 0),
X86_INTRINSIC_DATA(avx512_mask_min_pd_128, INTR_TYPE_2OP_MASK, X86ISD::FMIN, 0),
X86_INTRINSIC_DATA(avx512_mask_min_pd_256, INTR_TYPE_2OP_MASK, X86ISD::FMIN, 0),
X86_INTRINSIC_DATA(avx512_mask_min_pd_512, INTR_TYPE_2OP_MASK, X86ISD::FMIN,
@@ -802,31 +828,17 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_mask_min_ps_512, INTR_TYPE_2OP_MASK, X86ISD::FMIN,
X86ISD::FMIN_RND),
X86_INTRINSIC_DATA(avx512_mask_min_sd_round, INTR_TYPE_SCALAR_MASK_RM,
- X86ISD::FMIN, X86ISD::FMIN_RND),
+ X86ISD::FMIN_RND, 0),
X86_INTRINSIC_DATA(avx512_mask_min_ss_round, INTR_TYPE_SCALAR_MASK_RM,
- X86ISD::FMIN, X86ISD::FMIN_RND),
- X86_INTRINSIC_DATA(avx512_mask_move_sd, INTR_TYPE_SCALAR_MASK,
- X86ISD::MOVSD, 0),
- X86_INTRINSIC_DATA(avx512_mask_move_ss, INTR_TYPE_SCALAR_MASK,
- X86ISD::MOVSS, 0),
- X86_INTRINSIC_DATA(avx512_mask_mul_pd_128, INTR_TYPE_2OP_MASK, ISD::FMUL, 0),
- X86_INTRINSIC_DATA(avx512_mask_mul_pd_256, INTR_TYPE_2OP_MASK, ISD::FMUL, 0),
+ X86ISD::FMIN_RND, 0),
X86_INTRINSIC_DATA(avx512_mask_mul_pd_512, INTR_TYPE_2OP_MASK, ISD::FMUL,
X86ISD::FMUL_RND),
- X86_INTRINSIC_DATA(avx512_mask_mul_ps_128, INTR_TYPE_2OP_MASK, ISD::FMUL, 0),
- X86_INTRINSIC_DATA(avx512_mask_mul_ps_256, INTR_TYPE_2OP_MASK, ISD::FMUL, 0),
X86_INTRINSIC_DATA(avx512_mask_mul_ps_512, INTR_TYPE_2OP_MASK, ISD::FMUL,
X86ISD::FMUL_RND),
- X86_INTRINSIC_DATA(avx512_mask_mul_sd_round, INTR_TYPE_SCALAR_MASK_RM, ISD::FMUL,
- X86ISD::FMUL_RND),
- X86_INTRINSIC_DATA(avx512_mask_mul_ss_round, INTR_TYPE_SCALAR_MASK_RM, ISD::FMUL,
- X86ISD::FMUL_RND),
- X86_INTRINSIC_DATA(avx512_mask_or_pd_128, INTR_TYPE_2OP_MASK, X86ISD::FOR, 0),
- X86_INTRINSIC_DATA(avx512_mask_or_pd_256, INTR_TYPE_2OP_MASK, X86ISD::FOR, 0),
- X86_INTRINSIC_DATA(avx512_mask_or_pd_512, INTR_TYPE_2OP_MASK, X86ISD::FOR, 0),
- X86_INTRINSIC_DATA(avx512_mask_or_ps_128, INTR_TYPE_2OP_MASK, X86ISD::FOR, 0),
- X86_INTRINSIC_DATA(avx512_mask_or_ps_256, INTR_TYPE_2OP_MASK, X86ISD::FOR, 0),
- X86_INTRINSIC_DATA(avx512_mask_or_ps_512, INTR_TYPE_2OP_MASK, X86ISD::FOR, 0),
+ X86_INTRINSIC_DATA(avx512_mask_mul_sd_round, INTR_TYPE_SCALAR_MASK_RM,
+ X86ISD::FMUL_RND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_mul_ss_round, INTR_TYPE_SCALAR_MASK_RM,
+ X86ISD::FMUL_RND, 0),
X86_INTRINSIC_DATA(avx512_mask_pabs_b_128, INTR_TYPE_1OP_MASK, X86ISD::ABS, 0),
X86_INTRINSIC_DATA(avx512_mask_pabs_b_256, INTR_TYPE_1OP_MASK, X86ISD::ABS, 0),
X86_INTRINSIC_DATA(avx512_mask_pabs_b_512, INTR_TYPE_1OP_MASK, X86ISD::ABS, 0),
@@ -851,18 +863,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_mask_packuswb_128, INTR_TYPE_2OP_MASK, X86ISD::PACKUS, 0),
X86_INTRINSIC_DATA(avx512_mask_packuswb_256, INTR_TYPE_2OP_MASK, X86ISD::PACKUS, 0),
X86_INTRINSIC_DATA(avx512_mask_packuswb_512, INTR_TYPE_2OP_MASK, X86ISD::PACKUS, 0),
- X86_INTRINSIC_DATA(avx512_mask_padd_b_128, INTR_TYPE_2OP_MASK, ISD::ADD, 0),
- X86_INTRINSIC_DATA(avx512_mask_padd_b_256, INTR_TYPE_2OP_MASK, ISD::ADD, 0),
- X86_INTRINSIC_DATA(avx512_mask_padd_b_512, INTR_TYPE_2OP_MASK, ISD::ADD, 0),
- X86_INTRINSIC_DATA(avx512_mask_padd_d_128, INTR_TYPE_2OP_MASK, ISD::ADD, 0),
- X86_INTRINSIC_DATA(avx512_mask_padd_d_256, INTR_TYPE_2OP_MASK, ISD::ADD, 0),
- X86_INTRINSIC_DATA(avx512_mask_padd_d_512, INTR_TYPE_2OP_MASK, ISD::ADD, 0),
- X86_INTRINSIC_DATA(avx512_mask_padd_q_128, INTR_TYPE_2OP_MASK, ISD::ADD, 0),
- X86_INTRINSIC_DATA(avx512_mask_padd_q_256, INTR_TYPE_2OP_MASK, ISD::ADD, 0),
- X86_INTRINSIC_DATA(avx512_mask_padd_q_512, INTR_TYPE_2OP_MASK, ISD::ADD, 0),
- X86_INTRINSIC_DATA(avx512_mask_padd_w_128, INTR_TYPE_2OP_MASK, ISD::ADD, 0),
- X86_INTRINSIC_DATA(avx512_mask_padd_w_256, INTR_TYPE_2OP_MASK, ISD::ADD, 0),
- X86_INTRINSIC_DATA(avx512_mask_padd_w_512, INTR_TYPE_2OP_MASK, ISD::ADD, 0),
X86_INTRINSIC_DATA(avx512_mask_padds_b_128, INTR_TYPE_2OP_MASK, X86ISD::ADDS, 0),
X86_INTRINSIC_DATA(avx512_mask_padds_b_256, INTR_TYPE_2OP_MASK, X86ISD::ADDS, 0),
X86_INTRINSIC_DATA(avx512_mask_padds_b_512, INTR_TYPE_2OP_MASK, X86ISD::ADDS, 0),
@@ -945,54 +945,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86ISD::VPMADDWD, 0),
X86_INTRINSIC_DATA(avx512_mask_pmaddw_d_512, INTR_TYPE_2OP_MASK,
X86ISD::VPMADDWD, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmaxs_b_128, INTR_TYPE_2OP_MASK, ISD::SMAX, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmaxs_b_256, INTR_TYPE_2OP_MASK, ISD::SMAX, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmaxs_b_512, INTR_TYPE_2OP_MASK, ISD::SMAX, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmaxs_d_128, INTR_TYPE_2OP_MASK, ISD::SMAX, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmaxs_d_256, INTR_TYPE_2OP_MASK, ISD::SMAX, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmaxs_d_512, INTR_TYPE_2OP_MASK, ISD::SMAX, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmaxs_q_128, INTR_TYPE_2OP_MASK, ISD::SMAX, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmaxs_q_256, INTR_TYPE_2OP_MASK, ISD::SMAX, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmaxs_q_512, INTR_TYPE_2OP_MASK, ISD::SMAX, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmaxs_w_128, INTR_TYPE_2OP_MASK, ISD::SMAX, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmaxs_w_256, INTR_TYPE_2OP_MASK, ISD::SMAX, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmaxs_w_512, INTR_TYPE_2OP_MASK, ISD::SMAX, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmaxu_b_128, INTR_TYPE_2OP_MASK, ISD::UMAX, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmaxu_b_256, INTR_TYPE_2OP_MASK, ISD::UMAX, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmaxu_b_512, INTR_TYPE_2OP_MASK, ISD::UMAX, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmaxu_d_128, INTR_TYPE_2OP_MASK, ISD::UMAX, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmaxu_d_256, INTR_TYPE_2OP_MASK, ISD::UMAX, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmaxu_d_512, INTR_TYPE_2OP_MASK, ISD::UMAX, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmaxu_q_128, INTR_TYPE_2OP_MASK, ISD::UMAX, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmaxu_q_256, INTR_TYPE_2OP_MASK, ISD::UMAX, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmaxu_q_512, INTR_TYPE_2OP_MASK, ISD::UMAX, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmaxu_w_128, INTR_TYPE_2OP_MASK, ISD::UMAX, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmaxu_w_256, INTR_TYPE_2OP_MASK, ISD::UMAX, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmaxu_w_512, INTR_TYPE_2OP_MASK, ISD::UMAX, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmins_b_128, INTR_TYPE_2OP_MASK, ISD::SMIN, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmins_b_256, INTR_TYPE_2OP_MASK, ISD::SMIN, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmins_b_512, INTR_TYPE_2OP_MASK, ISD::SMIN, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmins_d_128, INTR_TYPE_2OP_MASK, ISD::SMIN, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmins_d_256, INTR_TYPE_2OP_MASK, ISD::SMIN, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmins_d_512, INTR_TYPE_2OP_MASK, ISD::SMIN, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmins_q_128, INTR_TYPE_2OP_MASK, ISD::SMIN, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmins_q_256, INTR_TYPE_2OP_MASK, ISD::SMIN, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmins_q_512, INTR_TYPE_2OP_MASK, ISD::SMIN, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmins_w_128, INTR_TYPE_2OP_MASK, ISD::SMIN, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmins_w_256, INTR_TYPE_2OP_MASK, ISD::SMIN, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmins_w_512, INTR_TYPE_2OP_MASK, ISD::SMIN, 0),
- X86_INTRINSIC_DATA(avx512_mask_pminu_b_128, INTR_TYPE_2OP_MASK, ISD::UMIN, 0),
- X86_INTRINSIC_DATA(avx512_mask_pminu_b_256, INTR_TYPE_2OP_MASK, ISD::UMIN, 0),
- X86_INTRINSIC_DATA(avx512_mask_pminu_b_512, INTR_TYPE_2OP_MASK, ISD::UMIN, 0),
- X86_INTRINSIC_DATA(avx512_mask_pminu_d_128, INTR_TYPE_2OP_MASK, ISD::UMIN, 0),
- X86_INTRINSIC_DATA(avx512_mask_pminu_d_256, INTR_TYPE_2OP_MASK, ISD::UMIN, 0),
- X86_INTRINSIC_DATA(avx512_mask_pminu_d_512, INTR_TYPE_2OP_MASK, ISD::UMIN, 0),
- X86_INTRINSIC_DATA(avx512_mask_pminu_q_128, INTR_TYPE_2OP_MASK, ISD::UMIN, 0),
- X86_INTRINSIC_DATA(avx512_mask_pminu_q_256, INTR_TYPE_2OP_MASK, ISD::UMIN, 0),
- X86_INTRINSIC_DATA(avx512_mask_pminu_q_512, INTR_TYPE_2OP_MASK, ISD::UMIN, 0),
- X86_INTRINSIC_DATA(avx512_mask_pminu_w_128, INTR_TYPE_2OP_MASK, ISD::UMIN, 0),
- X86_INTRINSIC_DATA(avx512_mask_pminu_w_256, INTR_TYPE_2OP_MASK, ISD::UMIN, 0),
- X86_INTRINSIC_DATA(avx512_mask_pminu_w_512, INTR_TYPE_2OP_MASK, ISD::UMIN, 0),
X86_INTRINSIC_DATA(avx512_mask_pmov_db_128, INTR_TYPE_1OP_MASK,
X86ISD::VTRUNC, 0),
X86_INTRINSIC_DATA(avx512_mask_pmov_db_256, INTR_TYPE_1OP_MASK,
@@ -1065,42 +1017,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86ISD::VTRUNCS, 0),
X86_INTRINSIC_DATA(avx512_mask_pmovs_wb_512, INTR_TYPE_1OP_MASK,
X86ISD::VTRUNCS, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmovsxb_d_128, INTR_TYPE_1OP_MASK,
- X86ISD::VSEXT, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmovsxb_d_256, INTR_TYPE_1OP_MASK,
- X86ISD::VSEXT, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmovsxb_d_512, INTR_TYPE_1OP_MASK,
- X86ISD::VSEXT, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmovsxb_q_128, INTR_TYPE_1OP_MASK,
- X86ISD::VSEXT, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmovsxb_q_256, INTR_TYPE_1OP_MASK,
- X86ISD::VSEXT, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmovsxb_q_512, INTR_TYPE_1OP_MASK,
- X86ISD::VSEXT, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmovsxb_w_128, INTR_TYPE_1OP_MASK,
- X86ISD::VSEXT, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmovsxb_w_256, INTR_TYPE_1OP_MASK,
- X86ISD::VSEXT, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmovsxb_w_512, INTR_TYPE_1OP_MASK,
- X86ISD::VSEXT, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmovsxd_q_128, INTR_TYPE_1OP_MASK,
- X86ISD::VSEXT, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmovsxd_q_256, INTR_TYPE_1OP_MASK,
- X86ISD::VSEXT, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmovsxd_q_512, INTR_TYPE_1OP_MASK,
- X86ISD::VSEXT, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmovsxw_d_128, INTR_TYPE_1OP_MASK,
- X86ISD::VSEXT, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmovsxw_d_256, INTR_TYPE_1OP_MASK,
- X86ISD::VSEXT, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmovsxw_d_512, INTR_TYPE_1OP_MASK,
- X86ISD::VSEXT, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmovsxw_q_128, INTR_TYPE_1OP_MASK,
- X86ISD::VSEXT, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmovsxw_q_256, INTR_TYPE_1OP_MASK,
- X86ISD::VSEXT, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmovsxw_q_512, INTR_TYPE_1OP_MASK,
- X86ISD::VSEXT, 0),
X86_INTRINSIC_DATA(avx512_mask_pmovus_db_128, INTR_TYPE_1OP_MASK,
X86ISD::VTRUNCUS, 0),
X86_INTRINSIC_DATA(avx512_mask_pmovus_db_256, INTR_TYPE_1OP_MASK,
@@ -1137,48 +1053,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86ISD::VTRUNCUS, 0),
X86_INTRINSIC_DATA(avx512_mask_pmovus_wb_512, INTR_TYPE_1OP_MASK,
X86ISD::VTRUNCUS, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmovzxb_d_128, INTR_TYPE_1OP_MASK,
- X86ISD::VZEXT, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmovzxb_d_256, INTR_TYPE_1OP_MASK,
- X86ISD::VZEXT, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmovzxb_d_512, INTR_TYPE_1OP_MASK,
- X86ISD::VZEXT, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmovzxb_q_128, INTR_TYPE_1OP_MASK,
- X86ISD::VZEXT, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmovzxb_q_256, INTR_TYPE_1OP_MASK,
- X86ISD::VZEXT, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmovzxb_q_512, INTR_TYPE_1OP_MASK,
- X86ISD::VZEXT, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmovzxb_w_128, INTR_TYPE_1OP_MASK,
- X86ISD::VZEXT, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmovzxb_w_256, INTR_TYPE_1OP_MASK,
- X86ISD::VZEXT, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmovzxb_w_512, INTR_TYPE_1OP_MASK,
- X86ISD::VZEXT, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmovzxd_q_128, INTR_TYPE_1OP_MASK,
- X86ISD::VZEXT, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmovzxd_q_256, INTR_TYPE_1OP_MASK,
- X86ISD::VZEXT, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmovzxd_q_512, INTR_TYPE_1OP_MASK,
- X86ISD::VZEXT, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmovzxw_d_128, INTR_TYPE_1OP_MASK,
- X86ISD::VZEXT, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmovzxw_d_256, INTR_TYPE_1OP_MASK,
- X86ISD::VZEXT, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmovzxw_d_512, INTR_TYPE_1OP_MASK,
- X86ISD::VZEXT, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmovzxw_q_128, INTR_TYPE_1OP_MASK,
- X86ISD::VZEXT, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmovzxw_q_256, INTR_TYPE_1OP_MASK,
- X86ISD::VZEXT, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmovzxw_q_512, INTR_TYPE_1OP_MASK,
- X86ISD::VZEXT, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmul_dq_128, INTR_TYPE_2OP_MASK,
- X86ISD::PMULDQ, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmul_dq_256, INTR_TYPE_2OP_MASK,
- X86ISD::PMULDQ, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmul_dq_512, INTR_TYPE_2OP_MASK,
- X86ISD::PMULDQ, 0),
X86_INTRINSIC_DATA(avx512_mask_pmul_hr_sw_128, INTR_TYPE_2OP_MASK, X86ISD::MULHRS, 0),
X86_INTRINSIC_DATA(avx512_mask_pmul_hr_sw_256, INTR_TYPE_2OP_MASK, X86ISD::MULHRS, 0),
X86_INTRINSIC_DATA(avx512_mask_pmul_hr_sw_512, INTR_TYPE_2OP_MASK, X86ISD::MULHRS, 0),
@@ -1188,27 +1062,12 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_mask_pmulhu_w_128, INTR_TYPE_2OP_MASK, ISD::MULHU, 0),
X86_INTRINSIC_DATA(avx512_mask_pmulhu_w_256, INTR_TYPE_2OP_MASK, ISD::MULHU, 0),
X86_INTRINSIC_DATA(avx512_mask_pmulhu_w_512, INTR_TYPE_2OP_MASK, ISD::MULHU, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmull_d_128, INTR_TYPE_2OP_MASK, ISD::MUL, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmull_d_256, INTR_TYPE_2OP_MASK, ISD::MUL, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmull_d_512, INTR_TYPE_2OP_MASK, ISD::MUL, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmull_q_128, INTR_TYPE_2OP_MASK, ISD::MUL, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmull_q_256, INTR_TYPE_2OP_MASK, ISD::MUL, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmull_q_512, INTR_TYPE_2OP_MASK, ISD::MUL, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmull_w_128, INTR_TYPE_2OP_MASK, ISD::MUL, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmull_w_256, INTR_TYPE_2OP_MASK, ISD::MUL, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmull_w_512, INTR_TYPE_2OP_MASK, ISD::MUL, 0),
X86_INTRINSIC_DATA(avx512_mask_pmultishift_qb_128, INTR_TYPE_2OP_MASK,
X86ISD::MULTISHIFT, 0),
X86_INTRINSIC_DATA(avx512_mask_pmultishift_qb_256, INTR_TYPE_2OP_MASK,
X86ISD::MULTISHIFT, 0),
X86_INTRINSIC_DATA(avx512_mask_pmultishift_qb_512, INTR_TYPE_2OP_MASK,
X86ISD::MULTISHIFT, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmulu_dq_128, INTR_TYPE_2OP_MASK,
- X86ISD::PMULUDQ, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmulu_dq_256, INTR_TYPE_2OP_MASK,
- X86ISD::PMULUDQ, 0),
- X86_INTRINSIC_DATA(avx512_mask_pmulu_dq_512, INTR_TYPE_2OP_MASK,
- X86ISD::PMULUDQ, 0),
X86_INTRINSIC_DATA(avx512_mask_prol_d_128, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VROTLI, 0),
X86_INTRINSIC_DATA(avx512_mask_prol_d_256, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VROTLI, 0),
X86_INTRINSIC_DATA(avx512_mask_prol_d_512, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VROTLI, 0),
@@ -1233,105 +1092,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_mask_prorv_q_128, INTR_TYPE_2OP_MASK, ISD::ROTR, 0),
X86_INTRINSIC_DATA(avx512_mask_prorv_q_256, INTR_TYPE_2OP_MASK, ISD::ROTR, 0),
X86_INTRINSIC_DATA(avx512_mask_prorv_q_512, INTR_TYPE_2OP_MASK, ISD::ROTR, 0),
- X86_INTRINSIC_DATA(avx512_mask_pshuf_b_128, INTR_TYPE_2OP_MASK,
- X86ISD::PSHUFB, 0),
- X86_INTRINSIC_DATA(avx512_mask_pshuf_b_256, INTR_TYPE_2OP_MASK,
- X86ISD::PSHUFB, 0),
- X86_INTRINSIC_DATA(avx512_mask_pshuf_b_512, INTR_TYPE_2OP_MASK,
- X86ISD::PSHUFB, 0),
- X86_INTRINSIC_DATA(avx512_mask_psll_d, INTR_TYPE_2OP_MASK, X86ISD::VSHL, 0),
- X86_INTRINSIC_DATA(avx512_mask_psll_d_128, INTR_TYPE_2OP_MASK, X86ISD::VSHL, 0),
- X86_INTRINSIC_DATA(avx512_mask_psll_d_256, INTR_TYPE_2OP_MASK, X86ISD::VSHL, 0),
- X86_INTRINSIC_DATA(avx512_mask_psll_di_128, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VSHLI, 0),
- X86_INTRINSIC_DATA(avx512_mask_psll_di_256, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VSHLI, 0),
- X86_INTRINSIC_DATA(avx512_mask_psll_di_512, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VSHLI, 0),
- X86_INTRINSIC_DATA(avx512_mask_psll_q, INTR_TYPE_2OP_MASK, X86ISD::VSHL, 0),
- X86_INTRINSIC_DATA(avx512_mask_psll_q_128, INTR_TYPE_2OP_MASK, X86ISD::VSHL, 0),
- X86_INTRINSIC_DATA(avx512_mask_psll_q_256, INTR_TYPE_2OP_MASK, X86ISD::VSHL, 0),
- X86_INTRINSIC_DATA(avx512_mask_psll_qi_128, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VSHLI, 0),
- X86_INTRINSIC_DATA(avx512_mask_psll_qi_256, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VSHLI, 0),
- X86_INTRINSIC_DATA(avx512_mask_psll_qi_512, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VSHLI, 0),
- X86_INTRINSIC_DATA(avx512_mask_psll_w_128, INTR_TYPE_2OP_MASK, X86ISD::VSHL, 0),
- X86_INTRINSIC_DATA(avx512_mask_psll_w_256, INTR_TYPE_2OP_MASK, X86ISD::VSHL, 0),
- X86_INTRINSIC_DATA(avx512_mask_psll_w_512, INTR_TYPE_2OP_MASK, X86ISD::VSHL, 0),
- X86_INTRINSIC_DATA(avx512_mask_psll_wi_128, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VSHLI, 0),
- X86_INTRINSIC_DATA(avx512_mask_psll_wi_256, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VSHLI, 0),
- X86_INTRINSIC_DATA(avx512_mask_psll_wi_512, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VSHLI, 0),
- X86_INTRINSIC_DATA(avx512_mask_psllv_d, INTR_TYPE_2OP_MASK, ISD::SHL, 0),
- X86_INTRINSIC_DATA(avx512_mask_psllv_q, INTR_TYPE_2OP_MASK, ISD::SHL, 0),
- X86_INTRINSIC_DATA(avx512_mask_psllv16_hi, INTR_TYPE_2OP_MASK, ISD::SHL, 0),
- X86_INTRINSIC_DATA(avx512_mask_psllv2_di, INTR_TYPE_2OP_MASK, ISD::SHL, 0),
- X86_INTRINSIC_DATA(avx512_mask_psllv32hi, INTR_TYPE_2OP_MASK, ISD::SHL, 0),
- X86_INTRINSIC_DATA(avx512_mask_psllv4_di, INTR_TYPE_2OP_MASK, ISD::SHL, 0),
- X86_INTRINSIC_DATA(avx512_mask_psllv4_si, INTR_TYPE_2OP_MASK, ISD::SHL, 0),
- X86_INTRINSIC_DATA(avx512_mask_psllv8_hi, INTR_TYPE_2OP_MASK, ISD::SHL, 0),
- X86_INTRINSIC_DATA(avx512_mask_psllv8_si, INTR_TYPE_2OP_MASK, ISD::SHL, 0),
- X86_INTRINSIC_DATA(avx512_mask_psra_d, INTR_TYPE_2OP_MASK, X86ISD::VSRA, 0),
- X86_INTRINSIC_DATA(avx512_mask_psra_d_128, INTR_TYPE_2OP_MASK, X86ISD::VSRA, 0),
- X86_INTRINSIC_DATA(avx512_mask_psra_d_256, INTR_TYPE_2OP_MASK, X86ISD::VSRA, 0),
- X86_INTRINSIC_DATA(avx512_mask_psra_di_128, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VSRAI, 0),
- X86_INTRINSIC_DATA(avx512_mask_psra_di_256, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VSRAI, 0),
- X86_INTRINSIC_DATA(avx512_mask_psra_di_512, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VSRAI, 0),
- X86_INTRINSIC_DATA(avx512_mask_psra_q, INTR_TYPE_2OP_MASK, X86ISD::VSRA, 0),
- X86_INTRINSIC_DATA(avx512_mask_psra_q_128, INTR_TYPE_2OP_MASK, X86ISD::VSRA, 0),
- X86_INTRINSIC_DATA(avx512_mask_psra_q_256, INTR_TYPE_2OP_MASK, X86ISD::VSRA, 0),
- X86_INTRINSIC_DATA(avx512_mask_psra_qi_128, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VSRAI, 0),
- X86_INTRINSIC_DATA(avx512_mask_psra_qi_256, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VSRAI, 0),
- X86_INTRINSIC_DATA(avx512_mask_psra_qi_512, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VSRAI, 0),
- X86_INTRINSIC_DATA(avx512_mask_psra_w_128, INTR_TYPE_2OP_MASK, X86ISD::VSRA, 0),
- X86_INTRINSIC_DATA(avx512_mask_psra_w_256, INTR_TYPE_2OP_MASK, X86ISD::VSRA, 0),
- X86_INTRINSIC_DATA(avx512_mask_psra_w_512, INTR_TYPE_2OP_MASK, X86ISD::VSRA, 0),
- X86_INTRINSIC_DATA(avx512_mask_psra_wi_128, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VSRAI, 0),
- X86_INTRINSIC_DATA(avx512_mask_psra_wi_256, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VSRAI, 0),
- X86_INTRINSIC_DATA(avx512_mask_psra_wi_512, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VSRAI, 0),
- X86_INTRINSIC_DATA(avx512_mask_psrav_d, INTR_TYPE_2OP_MASK, X86ISD::VSRAV, 0),
- X86_INTRINSIC_DATA(avx512_mask_psrav_q, INTR_TYPE_2OP_MASK, X86ISD::VSRAV, 0),
- X86_INTRINSIC_DATA(avx512_mask_psrav_q_128, INTR_TYPE_2OP_MASK, X86ISD::VSRAV, 0),
- X86_INTRINSIC_DATA(avx512_mask_psrav_q_256, INTR_TYPE_2OP_MASK, X86ISD::VSRAV, 0),
- X86_INTRINSIC_DATA(avx512_mask_psrav16_hi, INTR_TYPE_2OP_MASK, X86ISD::VSRAV, 0),
- X86_INTRINSIC_DATA(avx512_mask_psrav32_hi, INTR_TYPE_2OP_MASK, X86ISD::VSRAV, 0),
- X86_INTRINSIC_DATA(avx512_mask_psrav4_si, INTR_TYPE_2OP_MASK, X86ISD::VSRAV, 0),
- X86_INTRINSIC_DATA(avx512_mask_psrav8_hi, INTR_TYPE_2OP_MASK, X86ISD::VSRAV, 0),
- X86_INTRINSIC_DATA(avx512_mask_psrav8_si, INTR_TYPE_2OP_MASK, X86ISD::VSRAV, 0),
- X86_INTRINSIC_DATA(avx512_mask_psrl_d, INTR_TYPE_2OP_MASK, X86ISD::VSRL, 0),
- X86_INTRINSIC_DATA(avx512_mask_psrl_d_128, INTR_TYPE_2OP_MASK, X86ISD::VSRL, 0),
- X86_INTRINSIC_DATA(avx512_mask_psrl_d_256, INTR_TYPE_2OP_MASK, X86ISD::VSRL, 0),
- X86_INTRINSIC_DATA(avx512_mask_psrl_di_128, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VSRLI, 0),
- X86_INTRINSIC_DATA(avx512_mask_psrl_di_256, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VSRLI, 0),
- X86_INTRINSIC_DATA(avx512_mask_psrl_di_512, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VSRLI, 0),
- X86_INTRINSIC_DATA(avx512_mask_psrl_q, INTR_TYPE_2OP_MASK, X86ISD::VSRL, 0),
- X86_INTRINSIC_DATA(avx512_mask_psrl_q_128, INTR_TYPE_2OP_MASK, X86ISD::VSRL, 0),
- X86_INTRINSIC_DATA(avx512_mask_psrl_q_256, INTR_TYPE_2OP_MASK, X86ISD::VSRL, 0),
- X86_INTRINSIC_DATA(avx512_mask_psrl_qi_128, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VSRLI, 0),
- X86_INTRINSIC_DATA(avx512_mask_psrl_qi_256, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VSRLI, 0),
- X86_INTRINSIC_DATA(avx512_mask_psrl_qi_512, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VSRLI, 0),
- X86_INTRINSIC_DATA(avx512_mask_psrl_w_128, INTR_TYPE_2OP_MASK, X86ISD::VSRL, 0),
- X86_INTRINSIC_DATA(avx512_mask_psrl_w_256, INTR_TYPE_2OP_MASK, X86ISD::VSRL, 0),
- X86_INTRINSIC_DATA(avx512_mask_psrl_w_512, INTR_TYPE_2OP_MASK, X86ISD::VSRL, 0),
- X86_INTRINSIC_DATA(avx512_mask_psrl_wi_128, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VSRLI, 0),
- X86_INTRINSIC_DATA(avx512_mask_psrl_wi_256, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VSRLI, 0),
- X86_INTRINSIC_DATA(avx512_mask_psrl_wi_512, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VSRLI, 0),
- X86_INTRINSIC_DATA(avx512_mask_psrlv_d, INTR_TYPE_2OP_MASK, ISD::SRL, 0),
- X86_INTRINSIC_DATA(avx512_mask_psrlv_q, INTR_TYPE_2OP_MASK, ISD::SRL, 0),
- X86_INTRINSIC_DATA(avx512_mask_psrlv16_hi, INTR_TYPE_2OP_MASK, ISD::SRL, 0),
- X86_INTRINSIC_DATA(avx512_mask_psrlv2_di, INTR_TYPE_2OP_MASK, ISD::SRL, 0),
- X86_INTRINSIC_DATA(avx512_mask_psrlv32hi, INTR_TYPE_2OP_MASK, ISD::SRL, 0),
- X86_INTRINSIC_DATA(avx512_mask_psrlv4_di, INTR_TYPE_2OP_MASK, ISD::SRL, 0),
- X86_INTRINSIC_DATA(avx512_mask_psrlv4_si, INTR_TYPE_2OP_MASK, ISD::SRL, 0),
- X86_INTRINSIC_DATA(avx512_mask_psrlv8_hi, INTR_TYPE_2OP_MASK, ISD::SRL, 0),
- X86_INTRINSIC_DATA(avx512_mask_psrlv8_si, INTR_TYPE_2OP_MASK, ISD::SRL, 0),
- X86_INTRINSIC_DATA(avx512_mask_psub_b_128, INTR_TYPE_2OP_MASK, ISD::SUB, 0),
- X86_INTRINSIC_DATA(avx512_mask_psub_b_256, INTR_TYPE_2OP_MASK, ISD::SUB, 0),
- X86_INTRINSIC_DATA(avx512_mask_psub_b_512, INTR_TYPE_2OP_MASK, ISD::SUB, 0),
- X86_INTRINSIC_DATA(avx512_mask_psub_d_128, INTR_TYPE_2OP_MASK, ISD::SUB, 0),
- X86_INTRINSIC_DATA(avx512_mask_psub_d_256, INTR_TYPE_2OP_MASK, ISD::SUB, 0),
- X86_INTRINSIC_DATA(avx512_mask_psub_d_512, INTR_TYPE_2OP_MASK, ISD::SUB, 0),
- X86_INTRINSIC_DATA(avx512_mask_psub_q_128, INTR_TYPE_2OP_MASK, ISD::SUB, 0),
- X86_INTRINSIC_DATA(avx512_mask_psub_q_256, INTR_TYPE_2OP_MASK, ISD::SUB, 0),
- X86_INTRINSIC_DATA(avx512_mask_psub_q_512, INTR_TYPE_2OP_MASK, ISD::SUB, 0),
- X86_INTRINSIC_DATA(avx512_mask_psub_w_128, INTR_TYPE_2OP_MASK, ISD::SUB, 0),
- X86_INTRINSIC_DATA(avx512_mask_psub_w_256, INTR_TYPE_2OP_MASK, ISD::SUB, 0),
- X86_INTRINSIC_DATA(avx512_mask_psub_w_512, INTR_TYPE_2OP_MASK, ISD::SUB, 0),
X86_INTRINSIC_DATA(avx512_mask_psubs_b_128, INTR_TYPE_2OP_MASK, X86ISD::SUBS, 0),
X86_INTRINSIC_DATA(avx512_mask_psubs_b_256, INTR_TYPE_2OP_MASK, X86ISD::SUBS, 0),
X86_INTRINSIC_DATA(avx512_mask_psubs_b_512, INTR_TYPE_2OP_MASK, X86ISD::SUBS, 0),
@@ -1370,8 +1130,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_mask_reduce_ps_128, INTR_TYPE_2OP_MASK_RM, X86ISD::VREDUCE, 0),
X86_INTRINSIC_DATA(avx512_mask_reduce_ps_256, INTR_TYPE_2OP_MASK_RM, X86ISD::VREDUCE, 0),
X86_INTRINSIC_DATA(avx512_mask_reduce_ps_512, INTR_TYPE_2OP_MASK_RM, X86ISD::VREDUCE, 0),
- X86_INTRINSIC_DATA(avx512_mask_reduce_sd, INTR_TYPE_SCALAR_MASK_RM, X86ISD::VREDUCE, 0),
- X86_INTRINSIC_DATA(avx512_mask_reduce_ss, INTR_TYPE_SCALAR_MASK_RM, X86ISD::VREDUCE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_reduce_sd, INTR_TYPE_SCALAR_MASK_RM, X86ISD::VREDUCES, 0),
+ X86_INTRINSIC_DATA(avx512_mask_reduce_ss, INTR_TYPE_SCALAR_MASK_RM, X86ISD::VREDUCES, 0),
X86_INTRINSIC_DATA(avx512_mask_rndscale_pd_128, INTR_TYPE_2OP_MASK_RM, X86ISD::VRNDSCALE, 0),
X86_INTRINSIC_DATA(avx512_mask_rndscale_pd_256, INTR_TYPE_2OP_MASK_RM, X86ISD::VRNDSCALE, 0),
X86_INTRINSIC_DATA(avx512_mask_rndscale_pd_512, INTR_TYPE_2OP_MASK_RM, X86ISD::VRNDSCALE, 0),
@@ -1379,9 +1139,9 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_mask_rndscale_ps_256, INTR_TYPE_2OP_MASK_RM, X86ISD::VRNDSCALE, 0),
X86_INTRINSIC_DATA(avx512_mask_rndscale_ps_512, INTR_TYPE_2OP_MASK_RM, X86ISD::VRNDSCALE, 0),
X86_INTRINSIC_DATA(avx512_mask_rndscale_sd, INTR_TYPE_SCALAR_MASK_RM,
- X86ISD::VRNDSCALE, 0),
+ X86ISD::VRNDSCALES, 0),
X86_INTRINSIC_DATA(avx512_mask_rndscale_ss, INTR_TYPE_SCALAR_MASK_RM,
- X86ISD::VRNDSCALE, 0),
+ X86ISD::VRNDSCALES, 0),
X86_INTRINSIC_DATA(avx512_mask_scalef_pd_128, INTR_TYPE_2OP_MASK_RM,
X86ISD::SCALEF, 0),
X86_INTRINSIC_DATA(avx512_mask_scalef_pd_256, INTR_TYPE_2OP_MASK_RM,
@@ -1414,42 +1174,26 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86ISD::SHUF128, 0),
X86_INTRINSIC_DATA(avx512_mask_shuf_i64x2_256, INTR_TYPE_3OP_IMM8_MASK,
X86ISD::SHUF128, 0),
- X86_INTRINSIC_DATA(avx512_mask_shuf_pd_128, INTR_TYPE_3OP_IMM8_MASK,
- X86ISD::SHUFP, 0),
- X86_INTRINSIC_DATA(avx512_mask_shuf_pd_256, INTR_TYPE_3OP_IMM8_MASK,
- X86ISD::SHUFP, 0),
- X86_INTRINSIC_DATA(avx512_mask_shuf_pd_512, INTR_TYPE_3OP_IMM8_MASK,
- X86ISD::SHUFP, 0),
- X86_INTRINSIC_DATA(avx512_mask_shuf_ps_128, INTR_TYPE_3OP_IMM8_MASK,
- X86ISD::SHUFP, 0),
- X86_INTRINSIC_DATA(avx512_mask_shuf_ps_256, INTR_TYPE_3OP_IMM8_MASK,
- X86ISD::SHUFP, 0),
- X86_INTRINSIC_DATA(avx512_mask_shuf_ps_512, INTR_TYPE_3OP_IMM8_MASK,
- X86ISD::SHUFP, 0),
X86_INTRINSIC_DATA(avx512_mask_sqrt_pd_128, INTR_TYPE_1OP_MASK, ISD::FSQRT, 0),
X86_INTRINSIC_DATA(avx512_mask_sqrt_pd_256, INTR_TYPE_1OP_MASK, ISD::FSQRT, 0),
- X86_INTRINSIC_DATA(avx512_mask_sqrt_pd_512, INTR_TYPE_1OP_MASK_RM, ISD::FSQRT,
+ X86_INTRINSIC_DATA(avx512_mask_sqrt_pd_512, INTR_TYPE_1OP_MASK, ISD::FSQRT,
X86ISD::FSQRT_RND),
X86_INTRINSIC_DATA(avx512_mask_sqrt_ps_128, INTR_TYPE_1OP_MASK, ISD::FSQRT, 0),
X86_INTRINSIC_DATA(avx512_mask_sqrt_ps_256, INTR_TYPE_1OP_MASK, ISD::FSQRT, 0),
- X86_INTRINSIC_DATA(avx512_mask_sqrt_ps_512, INTR_TYPE_1OP_MASK_RM, ISD::FSQRT,
+ X86_INTRINSIC_DATA(avx512_mask_sqrt_ps_512, INTR_TYPE_1OP_MASK, ISD::FSQRT,
X86ISD::FSQRT_RND),
X86_INTRINSIC_DATA(avx512_mask_sqrt_sd, INTR_TYPE_SCALAR_MASK_RM,
- X86ISD::FSQRT_RND, 0),
+ X86ISD::FSQRTS_RND, 0),
X86_INTRINSIC_DATA(avx512_mask_sqrt_ss, INTR_TYPE_SCALAR_MASK_RM,
- X86ISD::FSQRT_RND, 0),
- X86_INTRINSIC_DATA(avx512_mask_sub_pd_128, INTR_TYPE_2OP_MASK, ISD::FSUB, 0),
- X86_INTRINSIC_DATA(avx512_mask_sub_pd_256, INTR_TYPE_2OP_MASK, ISD::FSUB, 0),
+ X86ISD::FSQRTS_RND, 0),
X86_INTRINSIC_DATA(avx512_mask_sub_pd_512, INTR_TYPE_2OP_MASK, ISD::FSUB,
X86ISD::FSUB_RND),
- X86_INTRINSIC_DATA(avx512_mask_sub_ps_128, INTR_TYPE_2OP_MASK, ISD::FSUB, 0),
- X86_INTRINSIC_DATA(avx512_mask_sub_ps_256, INTR_TYPE_2OP_MASK, ISD::FSUB, 0),
X86_INTRINSIC_DATA(avx512_mask_sub_ps_512, INTR_TYPE_2OP_MASK, ISD::FSUB,
X86ISD::FSUB_RND),
- X86_INTRINSIC_DATA(avx512_mask_sub_sd_round, INTR_TYPE_SCALAR_MASK_RM, ISD::FSUB,
- X86ISD::FSUB_RND),
- X86_INTRINSIC_DATA(avx512_mask_sub_ss_round, INTR_TYPE_SCALAR_MASK_RM, ISD::FSUB,
- X86ISD::FSUB_RND),
+ X86_INTRINSIC_DATA(avx512_mask_sub_sd_round, INTR_TYPE_SCALAR_MASK_RM,
+ X86ISD::FSUB_RND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_sub_ss_round, INTR_TYPE_SCALAR_MASK_RM,
+ X86ISD::FSUB_RND, 0),
X86_INTRINSIC_DATA(avx512_mask_ucmp_b_128, CMP_MASK_CC, X86ISD::CMPMU, 0),
X86_INTRINSIC_DATA(avx512_mask_ucmp_b_256, CMP_MASK_CC, X86ISD::CMPMU, 0),
X86_INTRINSIC_DATA(avx512_mask_ucmp_b_512, CMP_MASK_CC, X86ISD::CMPMU, 0),
@@ -1462,30 +1206,18 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_mask_ucmp_w_128, CMP_MASK_CC, X86ISD::CMPMU, 0),
X86_INTRINSIC_DATA(avx512_mask_ucmp_w_256, CMP_MASK_CC, X86ISD::CMPMU, 0),
X86_INTRINSIC_DATA(avx512_mask_ucmp_w_512, CMP_MASK_CC, X86ISD::CMPMU, 0),
- X86_INTRINSIC_DATA(avx512_mask_valign_d_128, INTR_TYPE_3OP_IMM8_MASK,
- X86ISD::VALIGN, 0),
- X86_INTRINSIC_DATA(avx512_mask_valign_d_256, INTR_TYPE_3OP_IMM8_MASK,
- X86ISD::VALIGN, 0),
- X86_INTRINSIC_DATA(avx512_mask_valign_d_512, INTR_TYPE_3OP_IMM8_MASK,
- X86ISD::VALIGN, 0),
- X86_INTRINSIC_DATA(avx512_mask_valign_q_128, INTR_TYPE_3OP_IMM8_MASK,
- X86ISD::VALIGN, 0),
- X86_INTRINSIC_DATA(avx512_mask_valign_q_256, INTR_TYPE_3OP_IMM8_MASK,
- X86ISD::VALIGN, 0),
- X86_INTRINSIC_DATA(avx512_mask_valign_q_512, INTR_TYPE_3OP_IMM8_MASK,
- X86ISD::VALIGN, 0),
X86_INTRINSIC_DATA(avx512_mask_vcvtph2ps_128, INTR_TYPE_1OP_MASK_RM,
- ISD::FP16_TO_FP, 0),
+ X86ISD::CVTPH2PS, 0),
X86_INTRINSIC_DATA(avx512_mask_vcvtph2ps_256, INTR_TYPE_1OP_MASK_RM,
- ISD::FP16_TO_FP, 0),
+ X86ISD::CVTPH2PS, 0),
X86_INTRINSIC_DATA(avx512_mask_vcvtph2ps_512, INTR_TYPE_1OP_MASK_RM,
- ISD::FP16_TO_FP, 0),
- X86_INTRINSIC_DATA(avx512_mask_vcvtps2ph_128, INTR_TYPE_2OP_MASK_RM,
- ISD::FP_TO_FP16, 0),
- X86_INTRINSIC_DATA(avx512_mask_vcvtps2ph_256, INTR_TYPE_2OP_MASK_RM,
- ISD::FP_TO_FP16, 0),
- X86_INTRINSIC_DATA(avx512_mask_vcvtps2ph_512, INTR_TYPE_2OP_MASK_RM,
- ISD::FP_TO_FP16, 0),
+ X86ISD::CVTPH2PS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vcvtps2ph_128, INTR_TYPE_2OP_MASK,
+ X86ISD::CVTPS2PH, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vcvtps2ph_256, INTR_TYPE_2OP_MASK,
+ X86ISD::CVTPS2PH, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vcvtps2ph_512, INTR_TYPE_2OP_MASK,
+ X86ISD::CVTPS2PH, 0),
X86_INTRINSIC_DATA(avx512_mask_vfmadd_pd_128, FMA_OP_MASK, X86ISD::FMADD, 0),
X86_INTRINSIC_DATA(avx512_mask_vfmadd_pd_256, FMA_OP_MASK, X86ISD::FMADD, 0),
X86_INTRINSIC_DATA(avx512_mask_vfmadd_pd_512, FMA_OP_MASK, X86ISD::FMADD,
@@ -1495,8 +1227,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_mask_vfmadd_ps_512, FMA_OP_MASK, X86ISD::FMADD,
X86ISD::FMADD_RND),
- X86_INTRINSIC_DATA(avx512_mask_vfmadd_sd, FMA_OP_SCALAR_MASK, X86ISD::FMADD_RND, 0),
- X86_INTRINSIC_DATA(avx512_mask_vfmadd_ss, FMA_OP_SCALAR_MASK, X86ISD::FMADD_RND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vfmadd_sd, FMA_OP_SCALAR_MASK, X86ISD::FMADDS1_RND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vfmadd_ss, FMA_OP_SCALAR_MASK, X86ISD::FMADDS1_RND, 0),
X86_INTRINSIC_DATA(avx512_mask_vfmaddsub_pd_128, FMA_OP_MASK, X86ISD::FMADDSUB, 0),
X86_INTRINSIC_DATA(avx512_mask_vfmaddsub_pd_256, FMA_OP_MASK, X86ISD::FMADDSUB, 0),
X86_INTRINSIC_DATA(avx512_mask_vfmaddsub_pd_512, FMA_OP_MASK, X86ISD::FMADDSUB,
@@ -1555,23 +1287,11 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_mask_vpermi2var_q_512, VPERM_3OP_MASK,
X86ISD::VPERMIV3, 0),
X86_INTRINSIC_DATA(avx512_mask_vpermi2var_qi_128, VPERM_3OP_MASK,
- X86ISD::VPERMV3, 0),
+ X86ISD::VPERMIV3, 0),
X86_INTRINSIC_DATA(avx512_mask_vpermi2var_qi_256, VPERM_3OP_MASK,
- X86ISD::VPERMV3, 0),
+ X86ISD::VPERMIV3, 0),
X86_INTRINSIC_DATA(avx512_mask_vpermi2var_qi_512, VPERM_3OP_MASK,
- X86ISD::VPERMV3, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpermilvar_pd_128, INTR_TYPE_2OP_MASK,
- X86ISD::VPERMILPV, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpermilvar_pd_256, INTR_TYPE_2OP_MASK,
- X86ISD::VPERMILPV, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpermilvar_pd_512, INTR_TYPE_2OP_MASK,
- X86ISD::VPERMILPV, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpermilvar_ps_128, INTR_TYPE_2OP_MASK,
- X86ISD::VPERMILPV, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpermilvar_ps_256, INTR_TYPE_2OP_MASK,
- X86ISD::VPERMILPV, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpermilvar_ps_512, INTR_TYPE_2OP_MASK,
- X86ISD::VPERMILPV, 0),
+ X86ISD::VPERMIV3, 0),
X86_INTRINSIC_DATA(avx512_mask_vpermt2var_d_128, VPERM_3OP_MASK,
X86ISD::VPERMV3, 0),
X86_INTRINSIC_DATA(avx512_mask_vpermt2var_d_256, VPERM_3OP_MASK,
@@ -1620,12 +1340,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86ISD::VPMADD52L, 0),
X86_INTRINSIC_DATA(avx512_mask_vpmadd52l_uq_512 , FMA_OP_MASK,
X86ISD::VPMADD52L, 0),
- X86_INTRINSIC_DATA(avx512_mask_xor_pd_128, INTR_TYPE_2OP_MASK, X86ISD::FXOR, 0),
- X86_INTRINSIC_DATA(avx512_mask_xor_pd_256, INTR_TYPE_2OP_MASK, X86ISD::FXOR, 0),
- X86_INTRINSIC_DATA(avx512_mask_xor_pd_512, INTR_TYPE_2OP_MASK, X86ISD::FXOR, 0),
- X86_INTRINSIC_DATA(avx512_mask_xor_ps_128, INTR_TYPE_2OP_MASK, X86ISD::FXOR, 0),
- X86_INTRINSIC_DATA(avx512_mask_xor_ps_256, INTR_TYPE_2OP_MASK, X86ISD::FXOR, 0),
- X86_INTRINSIC_DATA(avx512_mask_xor_ps_512, INTR_TYPE_2OP_MASK, X86ISD::FXOR, 0),
X86_INTRINSIC_DATA(avx512_mask3_vfmadd_pd_128, FMA_OP_MASK3, X86ISD::FMADD, 0),
X86_INTRINSIC_DATA(avx512_mask3_vfmadd_pd_256, FMA_OP_MASK3, X86ISD::FMADD, 0),
X86_INTRINSIC_DATA(avx512_mask3_vfmadd_pd_512, FMA_OP_MASK3, X86ISD::FMADD,
@@ -1635,8 +1349,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_mask3_vfmadd_ps_512, FMA_OP_MASK3, X86ISD::FMADD,
X86ISD::FMADD_RND),
- X86_INTRINSIC_DATA(avx512_mask3_vfmadd_sd, FMA_OP_SCALAR_MASK3, X86ISD::FMADD_RND, 0),
- X86_INTRINSIC_DATA(avx512_mask3_vfmadd_ss, FMA_OP_SCALAR_MASK3, X86ISD::FMADD_RND, 0),
+ X86_INTRINSIC_DATA(avx512_mask3_vfmadd_sd, FMA_OP_SCALAR_MASK3, X86ISD::FMADDS3_RND, 0),
+ X86_INTRINSIC_DATA(avx512_mask3_vfmadd_ss, FMA_OP_SCALAR_MASK3, X86ISD::FMADDS3_RND, 0),
X86_INTRINSIC_DATA(avx512_mask3_vfmaddsub_pd_128, FMA_OP_MASK3, X86ISD::FMADDSUB, 0),
X86_INTRINSIC_DATA(avx512_mask3_vfmaddsub_pd_256, FMA_OP_MASK3, X86ISD::FMADDSUB, 0),
X86_INTRINSIC_DATA(avx512_mask3_vfmaddsub_pd_512, FMA_OP_MASK3, X86ISD::FMADDSUB,
@@ -1654,6 +1368,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_mask3_vfmsub_ps_256, FMA_OP_MASK3, X86ISD::FMSUB, 0),
X86_INTRINSIC_DATA(avx512_mask3_vfmsub_ps_512, FMA_OP_MASK3, X86ISD::FMSUB,
X86ISD::FMSUB_RND),
+ X86_INTRINSIC_DATA(avx512_mask3_vfmsub_sd, FMA_OP_SCALAR_MASK3, X86ISD::FMSUBS3_RND, 0),
+ X86_INTRINSIC_DATA(avx512_mask3_vfmsub_ss, FMA_OP_SCALAR_MASK3, X86ISD::FMSUBS3_RND, 0),
X86_INTRINSIC_DATA(avx512_mask3_vfmsubadd_pd_128, FMA_OP_MASK3, X86ISD::FMSUBADD, 0),
X86_INTRINSIC_DATA(avx512_mask3_vfmsubadd_pd_256, FMA_OP_MASK3, X86ISD::FMSUBADD, 0),
@@ -1672,6 +1388,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_ps_256, FMA_OP_MASK3, X86ISD::FNMSUB, 0),
X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_ps_512, FMA_OP_MASK3, X86ISD::FNMSUB,
X86ISD::FNMSUB_RND),
+ X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_sd, FMA_OP_SCALAR_MASK3, X86ISD::FNMSUBS3_RND, 0),
+ X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_ss, FMA_OP_SCALAR_MASK3, X86ISD::FNMSUBS3_RND, 0),
X86_INTRINSIC_DATA(avx512_maskz_fixupimm_pd_128, FIXUPIMM_MASKZ,
X86ISD::VFIXUPIMM, 0),
X86_INTRINSIC_DATA(avx512_maskz_fixupimm_pd_256, FIXUPIMM_MASKZ,
@@ -1709,8 +1427,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_maskz_vfmadd_ps_512, FMA_OP_MASKZ, X86ISD::FMADD,
X86ISD::FMADD_RND),
- X86_INTRINSIC_DATA(avx512_maskz_vfmadd_sd, FMA_OP_SCALAR_MASKZ, X86ISD::FMADD_RND, 0),
- X86_INTRINSIC_DATA(avx512_maskz_vfmadd_ss, FMA_OP_SCALAR_MASKZ, X86ISD::FMADD_RND, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_vfmadd_sd, FMA_OP_SCALAR_MASKZ, X86ISD::FMADDS1_RND, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_vfmadd_ss, FMA_OP_SCALAR_MASKZ, X86ISD::FMADDS1_RND, 0),
X86_INTRINSIC_DATA(avx512_maskz_vfmaddsub_pd_128, FMA_OP_MASKZ, X86ISD::FMADDSUB, 0),
X86_INTRINSIC_DATA(avx512_maskz_vfmaddsub_pd_256, FMA_OP_MASKZ, X86ISD::FMADDSUB, 0),
X86_INTRINSIC_DATA(avx512_maskz_vfmaddsub_pd_512, FMA_OP_MASKZ, X86ISD::FMADDSUB,
@@ -1768,7 +1486,49 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86ISD::VPMADD52L, 0),
X86_INTRINSIC_DATA(avx512_maskz_vpmadd52l_uq_512, FMA_OP_MASKZ,
X86ISD::VPMADD52L, 0),
+ X86_INTRINSIC_DATA(avx512_pmul_dq_512, INTR_TYPE_2OP, X86ISD::PMULDQ, 0),
+ X86_INTRINSIC_DATA(avx512_pmulu_dq_512, INTR_TYPE_2OP, X86ISD::PMULUDQ, 0),
X86_INTRINSIC_DATA(avx512_psad_bw_512, INTR_TYPE_2OP, X86ISD::PSADBW, 0),
+ X86_INTRINSIC_DATA(avx512_pshuf_b_512, INTR_TYPE_2OP, X86ISD::PSHUFB, 0),
+ X86_INTRINSIC_DATA(avx512_psll_d_512, INTR_TYPE_2OP, X86ISD::VSHL, 0),
+ X86_INTRINSIC_DATA(avx512_psll_q_512, INTR_TYPE_2OP, X86ISD::VSHL, 0),
+ X86_INTRINSIC_DATA(avx512_psll_w_512, INTR_TYPE_2OP, X86ISD::VSHL, 0),
+ X86_INTRINSIC_DATA(avx512_pslli_d_512, VSHIFT, X86ISD::VSHLI, 0),
+ X86_INTRINSIC_DATA(avx512_pslli_q_512, VSHIFT, X86ISD::VSHLI, 0),
+ X86_INTRINSIC_DATA(avx512_pslli_w_512, VSHIFT, X86ISD::VSHLI, 0),
+ X86_INTRINSIC_DATA(avx512_psllv_d_512, INTR_TYPE_2OP, ISD::SHL, 0),
+ X86_INTRINSIC_DATA(avx512_psllv_q_512, INTR_TYPE_2OP, ISD::SHL, 0),
+ X86_INTRINSIC_DATA(avx512_psllv_w_128, INTR_TYPE_2OP, ISD::SHL, 0),
+ X86_INTRINSIC_DATA(avx512_psllv_w_256, INTR_TYPE_2OP, ISD::SHL, 0),
+ X86_INTRINSIC_DATA(avx512_psllv_w_512, INTR_TYPE_2OP, ISD::SHL, 0),
+ X86_INTRINSIC_DATA(avx512_psra_d_512, INTR_TYPE_2OP, X86ISD::VSRA, 0),
+ X86_INTRINSIC_DATA(avx512_psra_q_128, INTR_TYPE_2OP, X86ISD::VSRA, 0),
+ X86_INTRINSIC_DATA(avx512_psra_q_256, INTR_TYPE_2OP, X86ISD::VSRA, 0),
+ X86_INTRINSIC_DATA(avx512_psra_q_512, INTR_TYPE_2OP, X86ISD::VSRA, 0),
+ X86_INTRINSIC_DATA(avx512_psra_w_512, INTR_TYPE_2OP, X86ISD::VSRA, 0),
+ X86_INTRINSIC_DATA(avx512_psrai_d_512, VSHIFT, X86ISD::VSRAI, 0),
+ X86_INTRINSIC_DATA(avx512_psrai_q_128, VSHIFT, X86ISD::VSRAI, 0),
+ X86_INTRINSIC_DATA(avx512_psrai_q_256, VSHIFT, X86ISD::VSRAI, 0),
+ X86_INTRINSIC_DATA(avx512_psrai_q_512, VSHIFT, X86ISD::VSRAI, 0),
+ X86_INTRINSIC_DATA(avx512_psrai_w_512, VSHIFT, X86ISD::VSRAI, 0),
+ X86_INTRINSIC_DATA(avx512_psrav_d_512, INTR_TYPE_2OP, X86ISD::VSRAV, 0),
+ X86_INTRINSIC_DATA(avx512_psrav_q_128, INTR_TYPE_2OP, X86ISD::VSRAV, 0),
+ X86_INTRINSIC_DATA(avx512_psrav_q_256, INTR_TYPE_2OP, X86ISD::VSRAV, 0),
+ X86_INTRINSIC_DATA(avx512_psrav_q_512, INTR_TYPE_2OP, X86ISD::VSRAV, 0),
+ X86_INTRINSIC_DATA(avx512_psrav_w_128, INTR_TYPE_2OP, X86ISD::VSRAV, 0),
+ X86_INTRINSIC_DATA(avx512_psrav_w_256, INTR_TYPE_2OP, X86ISD::VSRAV, 0),
+ X86_INTRINSIC_DATA(avx512_psrav_w_512, INTR_TYPE_2OP, X86ISD::VSRAV, 0),
+ X86_INTRINSIC_DATA(avx512_psrl_d_512, INTR_TYPE_2OP, X86ISD::VSRL, 0),
+ X86_INTRINSIC_DATA(avx512_psrl_q_512, INTR_TYPE_2OP, X86ISD::VSRL, 0),
+ X86_INTRINSIC_DATA(avx512_psrl_w_512, INTR_TYPE_2OP, X86ISD::VSRL, 0),
+ X86_INTRINSIC_DATA(avx512_psrli_d_512, VSHIFT, X86ISD::VSRLI, 0),
+ X86_INTRINSIC_DATA(avx512_psrli_q_512, VSHIFT, X86ISD::VSRLI, 0),
+ X86_INTRINSIC_DATA(avx512_psrli_w_512, VSHIFT, X86ISD::VSRLI, 0),
+ X86_INTRINSIC_DATA(avx512_psrlv_d_512, INTR_TYPE_2OP, ISD::SRL, 0),
+ X86_INTRINSIC_DATA(avx512_psrlv_q_512, INTR_TYPE_2OP, ISD::SRL, 0),
+ X86_INTRINSIC_DATA(avx512_psrlv_w_128, INTR_TYPE_2OP, ISD::SRL, 0),
+ X86_INTRINSIC_DATA(avx512_psrlv_w_256, INTR_TYPE_2OP, ISD::SRL, 0),
+ X86_INTRINSIC_DATA(avx512_psrlv_w_512, INTR_TYPE_2OP, ISD::SRL, 0),
X86_INTRINSIC_DATA(avx512_ptestm_b_128, CMP_MASK, X86ISD::TESTM, 0),
X86_INTRINSIC_DATA(avx512_ptestm_b_256, CMP_MASK, X86ISD::TESTM, 0),
X86_INTRINSIC_DATA(avx512_ptestm_b_512, CMP_MASK, X86ISD::TESTM, 0),
@@ -1803,8 +1563,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_rcp14_ss, INTR_TYPE_SCALAR_MASK, X86ISD::FRCPS, 0),
X86_INTRINSIC_DATA(avx512_rcp28_pd, INTR_TYPE_1OP_MASK_RM, X86ISD::RCP28, 0),
X86_INTRINSIC_DATA(avx512_rcp28_ps, INTR_TYPE_1OP_MASK_RM, X86ISD::RCP28, 0),
- X86_INTRINSIC_DATA(avx512_rcp28_sd, INTR_TYPE_SCALAR_MASK_RM, X86ISD::RCP28, 0),
- X86_INTRINSIC_DATA(avx512_rcp28_ss, INTR_TYPE_SCALAR_MASK_RM, X86ISD::RCP28, 0),
+ X86_INTRINSIC_DATA(avx512_rcp28_sd, INTR_TYPE_SCALAR_MASK_RM, X86ISD::RCP28S, 0),
+ X86_INTRINSIC_DATA(avx512_rcp28_ss, INTR_TYPE_SCALAR_MASK_RM, X86ISD::RCP28S, 0),
X86_INTRINSIC_DATA(avx512_rsqrt14_pd_128, INTR_TYPE_1OP_MASK, X86ISD::FRSQRT, 0),
X86_INTRINSIC_DATA(avx512_rsqrt14_pd_256, INTR_TYPE_1OP_MASK, X86ISD::FRSQRT, 0),
X86_INTRINSIC_DATA(avx512_rsqrt14_pd_512, INTR_TYPE_1OP_MASK, X86ISD::FRSQRT, 0),
@@ -1815,26 +1575,20 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_rsqrt14_ss, INTR_TYPE_SCALAR_MASK, X86ISD::FRSQRTS, 0),
X86_INTRINSIC_DATA(avx512_rsqrt28_pd, INTR_TYPE_1OP_MASK_RM,X86ISD::RSQRT28, 0),
X86_INTRINSIC_DATA(avx512_rsqrt28_ps, INTR_TYPE_1OP_MASK_RM,X86ISD::RSQRT28, 0),
- X86_INTRINSIC_DATA(avx512_rsqrt28_sd, INTR_TYPE_SCALAR_MASK_RM,X86ISD::RSQRT28, 0),
- X86_INTRINSIC_DATA(avx512_rsqrt28_ss, INTR_TYPE_SCALAR_MASK_RM,X86ISD::RSQRT28, 0),
+ X86_INTRINSIC_DATA(avx512_rsqrt28_sd, INTR_TYPE_SCALAR_MASK_RM,X86ISD::RSQRT28S, 0),
+ X86_INTRINSIC_DATA(avx512_rsqrt28_ss, INTR_TYPE_SCALAR_MASK_RM,X86ISD::RSQRT28S, 0),
X86_INTRINSIC_DATA(avx512_vcomi_sd, COMI_RM, X86ISD::COMI, X86ISD::UCOMI),
X86_INTRINSIC_DATA(avx512_vcomi_ss, COMI_RM, X86ISD::COMI, X86ISD::UCOMI),
- X86_INTRINSIC_DATA(avx512_vcvtsd2si32, INTR_TYPE_2OP,
- X86ISD::SCALAR_FP_TO_SINT_RND, 0),
- X86_INTRINSIC_DATA(avx512_vcvtsd2si64, INTR_TYPE_2OP,
- X86ISD::SCALAR_FP_TO_SINT_RND, 0),
- X86_INTRINSIC_DATA(avx512_vcvtsd2usi32, INTR_TYPE_2OP,
- X86ISD::SCALAR_FP_TO_UINT_RND, 0),
- X86_INTRINSIC_DATA(avx512_vcvtsd2usi64, INTR_TYPE_2OP,
- X86ISD::SCALAR_FP_TO_UINT_RND, 0),
- X86_INTRINSIC_DATA(avx512_vcvtss2si32, INTR_TYPE_2OP,
- X86ISD::SCALAR_FP_TO_SINT_RND, 0),
- X86_INTRINSIC_DATA(avx512_vcvtss2si64, INTR_TYPE_2OP,
- X86ISD::SCALAR_FP_TO_SINT_RND, 0),
- X86_INTRINSIC_DATA(avx512_vcvtss2usi32, INTR_TYPE_2OP,
- X86ISD::SCALAR_FP_TO_UINT_RND, 0),
- X86_INTRINSIC_DATA(avx512_vcvtss2usi64, INTR_TYPE_2OP,
- X86ISD::SCALAR_FP_TO_UINT_RND, 0),
+ X86_INTRINSIC_DATA(avx512_vcvtsd2si32, INTR_TYPE_2OP, X86ISD::CVTS2SI_RND, 0),
+ X86_INTRINSIC_DATA(avx512_vcvtsd2si64, INTR_TYPE_2OP, X86ISD::CVTS2SI_RND, 0),
+ X86_INTRINSIC_DATA(avx512_vcvtsd2usi32, INTR_TYPE_2OP, X86ISD::CVTS2UI_RND, 0),
+ X86_INTRINSIC_DATA(avx512_vcvtsd2usi64, INTR_TYPE_2OP, X86ISD::CVTS2UI_RND, 0),
+ X86_INTRINSIC_DATA(avx512_vcvtss2si32, INTR_TYPE_2OP, X86ISD::CVTS2SI_RND, 0),
+ X86_INTRINSIC_DATA(avx512_vcvtss2si64, INTR_TYPE_2OP, X86ISD::CVTS2SI_RND, 0),
+ X86_INTRINSIC_DATA(avx512_vcvtss2usi32, INTR_TYPE_2OP, X86ISD::CVTS2UI_RND, 0),
+ X86_INTRINSIC_DATA(avx512_vcvtss2usi64, INTR_TYPE_2OP, X86ISD::CVTS2UI_RND, 0),
+ X86_INTRINSIC_DATA(avx512_vpermilvar_pd_512, INTR_TYPE_2OP, X86ISD::VPERMILPV, 0),
+ X86_INTRINSIC_DATA(avx512_vpermilvar_ps_512, INTR_TYPE_2OP, X86ISD::VPERMILPV, 0),
X86_INTRINSIC_DATA(fma_vfmadd_pd, INTR_TYPE_3OP, X86ISD::FMADD, 0),
X86_INTRINSIC_DATA(fma_vfmadd_pd_256, INTR_TYPE_3OP, X86ISD::FMADD, 0),
X86_INTRINSIC_DATA(fma_vfmadd_ps, INTR_TYPE_3OP, X86ISD::FMADD, 0),
@@ -1883,6 +1637,11 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(sse2_comile_sd, COMI, X86ISD::COMI, ISD::SETLE),
X86_INTRINSIC_DATA(sse2_comilt_sd, COMI, X86ISD::COMI, ISD::SETLT),
X86_INTRINSIC_DATA(sse2_comineq_sd, COMI, X86ISD::COMI, ISD::SETNE),
+ X86_INTRINSIC_DATA(sse2_cvtdq2ps, INTR_TYPE_1OP, ISD::SINT_TO_FP, 0),
+ X86_INTRINSIC_DATA(sse2_cvtpd2dq, INTR_TYPE_1OP, X86ISD::CVTP2SI, 0),
+ X86_INTRINSIC_DATA(sse2_cvtpd2ps, INTR_TYPE_1OP, X86ISD::VFPROUND, 0),
+ X86_INTRINSIC_DATA(sse2_cvttpd2dq, INTR_TYPE_1OP, X86ISD::CVTTP2SI, 0),
+ X86_INTRINSIC_DATA(sse2_cvttps2dq, INTR_TYPE_1OP, ISD::FP_TO_SINT, 0),
X86_INTRINSIC_DATA(sse2_max_pd, INTR_TYPE_2OP, X86ISD::FMAX, 0),
X86_INTRINSIC_DATA(sse2_min_pd, INTR_TYPE_2OP, X86ISD::FMIN, 0),
X86_INTRINSIC_DATA(sse2_movmsk_pd, INTR_TYPE_1OP, X86ISD::MOVMSK, 0),
@@ -1895,6 +1654,7 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(sse2_paddus_w, INTR_TYPE_2OP, X86ISD::ADDUS, 0),
X86_INTRINSIC_DATA(sse2_pavg_b, INTR_TYPE_2OP, X86ISD::AVG, 0),
X86_INTRINSIC_DATA(sse2_pavg_w, INTR_TYPE_2OP, X86ISD::AVG, 0),
+ X86_INTRINSIC_DATA(sse2_pmadd_wd, INTR_TYPE_2OP, X86ISD::VPMADDWD, 0),
X86_INTRINSIC_DATA(sse2_pmovmskb_128, INTR_TYPE_1OP, X86ISD::MOVMSK, 0),
X86_INTRINSIC_DATA(sse2_pmulh_w, INTR_TYPE_2OP, ISD::MULHS, 0),
X86_INTRINSIC_DATA(sse2_pmulhu_w, INTR_TYPE_2OP, ISD::MULHU, 0),
@@ -1943,6 +1703,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(ssse3_phadd_w_128, INTR_TYPE_2OP, X86ISD::HADD, 0),
X86_INTRINSIC_DATA(ssse3_phsub_d_128, INTR_TYPE_2OP, X86ISD::HSUB, 0),
X86_INTRINSIC_DATA(ssse3_phsub_w_128, INTR_TYPE_2OP, X86ISD::HSUB, 0),
+ X86_INTRINSIC_DATA(ssse3_pmadd_ub_sw_128, INTR_TYPE_2OP, X86ISD::VPMADDUBSW, 0),
+ X86_INTRINSIC_DATA(ssse3_pmul_hr_sw_128, INTR_TYPE_2OP, X86ISD::MULHRS, 0),
X86_INTRINSIC_DATA(ssse3_pshuf_b_128, INTR_TYPE_2OP, X86ISD::PSHUFB, 0),
X86_INTRINSIC_DATA(xop_vpcomb, INTR_TYPE_3OP, X86ISD::VPCOM, 0),
X86_INTRINSIC_DATA(xop_vpcomd, INTR_TYPE_3OP, X86ISD::VPCOM, 0),
diff --git a/contrib/llvm/lib/Target/X86/X86MCInstLower.cpp b/contrib/llvm/lib/Target/X86/X86MCInstLower.cpp
index 906e342..feeb2fd 100644
--- a/contrib/llvm/lib/Target/X86/X86MCInstLower.cpp
+++ b/contrib/llvm/lib/Target/X86/X86MCInstLower.cpp
@@ -16,6 +16,7 @@
#include "X86RegisterInfo.h"
#include "X86ShuffleDecodeConstantPool.h"
#include "InstPrinter/X86ATTInstPrinter.h"
+#include "InstPrinter/X86InstComments.h"
#include "MCTargetDesc/X86BaseInfo.h"
#include "Utils/X86ShuffleDecode.h"
#include "llvm/ADT/Optional.h"
@@ -41,6 +42,7 @@
#include "llvm/MC/MCSymbol.h"
#include "llvm/MC/MCSymbolELF.h"
#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCSectionMachO.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/ELF.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
@@ -68,9 +70,6 @@ public:
private:
MachineModuleInfoMachO &getMachOMMI() const;
- Mangler *getMang() const {
- return AsmPrinter.Mang;
- }
};
} // end anonymous namespace
@@ -499,18 +498,13 @@ ReSimplify:
break;
}
- // TAILJMPd, TAILJMPd64 - Lower to the correct jump instructions.
- case X86::TAILJMPr:
+ // TAILJMPd, TAILJMPd64 - Lower to the correct jump instruction.
+ { unsigned Opcode;
+ case X86::TAILJMPr: Opcode = X86::JMP32r; goto SetTailJmpOpcode;
case X86::TAILJMPd:
- case X86::TAILJMPd64: {
- unsigned Opcode;
- switch (OutMI.getOpcode()) {
- default: llvm_unreachable("Invalid opcode");
- case X86::TAILJMPr: Opcode = X86::JMP32r; break;
- case X86::TAILJMPd:
- case X86::TAILJMPd64: Opcode = X86::JMP_1; break;
- }
+ case X86::TAILJMPd64: Opcode = X86::JMP_1; goto SetTailJmpOpcode;
+ SetTailJmpOpcode:
MCOperand Saved = OutMI.getOperand(0);
OutMI = MCInst();
OutMI.setOpcode(Opcode);
@@ -979,8 +973,7 @@ void X86AsmPrinter::LowerPATCHPOINT(const MachineInstr &MI,
PatchPointOpers opers(&MI);
unsigned ScratchIdx = opers.getNextScratchIdx();
unsigned EncodedBytes = 0;
- const MachineOperand &CalleeMO =
- opers.getMetaOper(PatchPointOpers::TargetPos);
+ const MachineOperand &CalleeMO = opers.getCallTarget();
// Check for null target. If target is non-null (i.e. is non-zero or is
// symbolic) then emit a call.
@@ -1016,7 +1009,7 @@ void X86AsmPrinter::LowerPATCHPOINT(const MachineInstr &MI,
}
// Emit padding.
- unsigned NumBytes = opers.getMetaOper(PatchPointOpers::NBytesPos).getImm();
+ unsigned NumBytes = opers.getNumPatchBytes();
assert(NumBytes >= EncodedBytes &&
"Patchpoint can't request size less than the length of a call.");
@@ -1024,22 +1017,12 @@ void X86AsmPrinter::LowerPATCHPOINT(const MachineInstr &MI,
getSubtargetInfo());
}
-void X86AsmPrinter::recordSled(MCSymbol *Sled, const MachineInstr &MI,
- SledKind Kind) {
- auto Fn = MI.getParent()->getParent()->getFunction();
- auto Attr = Fn->getFnAttribute("function-instrument");
- bool AlwaysInstrument =
- Attr.isStringAttribute() && Attr.getValueAsString() == "xray-always";
- Sleds.emplace_back(
- XRayFunctionEntry{Sled, CurrentFnSym, Kind, AlwaysInstrument, Fn});
-}
-
void X86AsmPrinter::LowerPATCHABLE_FUNCTION_ENTER(const MachineInstr &MI,
X86MCInstLower &MCIL) {
// We want to emit the following pattern:
//
+ // .p2align 1, ...
// .Lxray_sled_N:
- // .palign 2, ...
// jmp .tmpN
// # 9 bytes worth of noops
// .tmpN
@@ -1051,8 +1034,8 @@ void X86AsmPrinter::LowerPATCHABLE_FUNCTION_ENTER(const MachineInstr &MI,
// call <relative offset, 32-bits> // 5 bytes
//
auto CurSled = OutContext.createTempSymbol("xray_sled_", true);
+ OutStreamer->EmitCodeAlignment(2);
OutStreamer->EmitLabel(CurSled);
- OutStreamer->EmitCodeAlignment(4);
auto Target = OutContext.createTempSymbol();
// Use a two-byte `jmp`. This version of JMP takes an 8-bit relative offset as
@@ -1074,12 +1057,14 @@ void X86AsmPrinter::LowerPATCHABLE_RET(const MachineInstr &MI,
//
// We should emit the RET followed by sleds.
//
+ // .p2align 1, ...
// .Lxray_sled_N:
// ret # or equivalent instruction
// # 10 bytes worth of noops
//
// This just makes sure that the alignment for the next instruction is 2.
auto CurSled = OutContext.createTempSymbol("xray_sled_", true);
+ OutStreamer->EmitCodeAlignment(2);
OutStreamer->EmitLabel(CurSled);
unsigned OpCode = MI.getOperand(0).getImm();
MCInst Ret;
@@ -1092,29 +1077,37 @@ void X86AsmPrinter::LowerPATCHABLE_RET(const MachineInstr &MI,
recordSled(CurSled, MI, SledKind::FUNCTION_EXIT);
}
-void X86AsmPrinter::EmitXRayTable() {
- if (Sleds.empty())
- return;
- if (Subtarget->isTargetELF()) {
- auto *Section = OutContext.getELFSection(
- "xray_instr_map", ELF::SHT_PROGBITS,
- ELF::SHF_ALLOC | ELF::SHF_GROUP | ELF::SHF_MERGE, 0,
- CurrentFnSym->getName());
- auto PrevSection = OutStreamer->getCurrentSectionOnly();
- OutStreamer->SwitchSection(Section);
- for (const auto &Sled : Sleds) {
- OutStreamer->EmitSymbolValue(Sled.Sled, 8);
- OutStreamer->EmitSymbolValue(CurrentFnSym, 8);
- auto Kind = static_cast<uint8_t>(Sled.Kind);
- OutStreamer->EmitBytes(
- StringRef(reinterpret_cast<const char *>(&Kind), 1));
- OutStreamer->EmitBytes(
- StringRef(reinterpret_cast<const char *>(&Sled.AlwaysInstrument), 1));
- OutStreamer->EmitZeros(14);
- }
- OutStreamer->SwitchSection(PrevSection);
- }
- Sleds.clear();
+void X86AsmPrinter::LowerPATCHABLE_TAIL_CALL(const MachineInstr &MI,
+                                             X86MCInstLower &MCIL) {
+ // Like PATCHABLE_RET, we have the actual instruction in the operands to this
+ // instruction so we lower that particular instruction and its operands.
+ // Unlike PATCHABLE_RET though, we put the sled before the JMP, much like how
+ // we do it for PATCHABLE_FUNCTION_ENTER. The sled should be very similar to
+ // the PATCHABLE_FUNCTION_ENTER case, followed by the lowering of the actual
+ // tail call much like how we have it in PATCHABLE_RET.
+ auto CurSled = OutContext.createTempSymbol("xray_sled_", true);
+ OutStreamer->EmitCodeAlignment(2);
+ OutStreamer->EmitLabel(CurSled);
+ auto Target = OutContext.createTempSymbol();
+
+ // Use a two-byte `jmp`. This version of JMP takes an 8-bit relative offset as
+ // an operand (computed as an offset from the jmp instruction).
+  // FIXME: Find another less hacky way to force the relative jump.
+ OutStreamer->EmitBytes("\xeb\x09");
+ EmitNops(*OutStreamer, 9, Subtarget->is64Bit(), getSubtargetInfo());
+ OutStreamer->EmitLabel(Target);
+ recordSled(CurSled, MI, SledKind::TAIL_CALL);
+
+ unsigned OpCode = MI.getOperand(0).getImm();
+ MCInst TC;
+ TC.setOpcode(OpCode);
+
+ // Before emitting the instruction, add a comment to indicate that this is
+ // indeed a tail call.
+ OutStreamer->AddComment("TAILCALL");
+ for (auto &MO : make_range(MI.operands_begin() + 1, MI.operands_end()))
+ if (auto MaybeOperand = MCIL.LowerMachineOperand(&MI, MO))
+ TC.addOperand(MaybeOperand.getValue());
+ OutStreamer->EmitInstruction(TC, getSubtargetInfo());
}
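As a rough standalone illustration of the sled layout described in the comments above (this is not part of the patch and not LLVM API), the following C++ snippet prints the 11 bytes the lowering produces: the two-byte short jump 0xEB 0x09 followed by 9 bytes of padding. Single-byte 0x90 NOPs are assumed here purely for readability; EmitNops in the patch may choose longer NOP encodings.

    // Sketch of the XRay tail-call sled: a two-byte jmp over 9 patchable bytes.
    #include <cstdint>
    #include <cstdio>

    int main() {
      const uint8_t Sled[11] = {
          0xEB, 0x09,                               // jmp .tmpN (rel8 = +9)
          0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, // 9 bytes of padding the
          0x90, 0x90};                              // XRay runtime can patch
      for (uint8_t B : Sled)
        std::printf("%02X ", B);
      std::printf("\n");
      return 0;
    }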
// Returns instruction preceding MBBI in MachineFunction.
@@ -1152,9 +1145,9 @@ static const Constant *getConstantFromPool(const MachineInstr &MI,
return C;
}
-static std::string getShuffleComment(const MachineOperand &DstOp,
- const MachineOperand &SrcOp1,
- const MachineOperand &SrcOp2,
+static std::string getShuffleComment(const MachineInstr *MI,
+ unsigned SrcOp1Idx,
+ unsigned SrcOp2Idx,
ArrayRef<int> Mask) {
std::string Comment;
@@ -1167,7 +1160,10 @@ static std::string getShuffleComment(const MachineOperand &DstOp,
return X86ATTInstPrinter::getRegisterName(RegNum);
};
- // TODO: Add support for specifying an AVX512 style mask register in the comment.
+ const MachineOperand &DstOp = MI->getOperand(0);
+ const MachineOperand &SrcOp1 = MI->getOperand(SrcOp1Idx);
+ const MachineOperand &SrcOp2 = MI->getOperand(SrcOp2Idx);
+
StringRef DstName = DstOp.isReg() ? GetRegisterName(DstOp.getReg()) : "mem";
StringRef Src1Name =
SrcOp1.isReg() ? GetRegisterName(SrcOp1.getReg()) : "mem";
@@ -1182,7 +1178,26 @@ static std::string getShuffleComment(const MachineOperand &DstOp,
ShuffleMask[i] -= e;
raw_string_ostream CS(Comment);
- CS << DstName << " = ";
+ CS << DstName;
+
+  // Handle AVX512 MASK/MASKZ write mask comments.
+ // MASK: zmmX {%kY}
+ // MASKZ: zmmX {%kY} {z}
+ if (SrcOp1Idx > 1) {
+ assert((SrcOp1Idx == 2 || SrcOp1Idx == 3) && "Unexpected writemask");
+
+ const MachineOperand &WriteMaskOp = MI->getOperand(SrcOp1Idx - 1);
+ if (WriteMaskOp.isReg()) {
+ CS << " {%" << GetRegisterName(WriteMaskOp.getReg()) << "}";
+
+ if (SrcOp1Idx == 2) {
+ CS << " {z}";
+ }
+ }
+ }
+
+ CS << " = ";
+
for (int i = 0, e = ShuffleMask.size(); i != e; ++i) {
if (i != 0)
CS << ",";
@@ -1221,6 +1236,13 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
X86MCInstLower MCInstLowering(*MF, *this);
const X86RegisterInfo *RI = MF->getSubtarget<X86Subtarget>().getRegisterInfo();
+ // Add a comment about EVEX-2-VEX compression for AVX-512 instrs that
+ // are compressed from EVEX encoding to VEX encoding.
+ if (TM.Options.MCOptions.ShowMCEncoding) {
+ if (MI->getAsmPrinterFlags() & AC_EVEX_2_VEX)
+ OutStreamer->AddComment("EVEX TO VEX Compression ", false);
+ }
+
switch (MI->getOpcode()) {
case TargetOpcode::DBG_VALUE:
llvm_unreachable("Should be handled target independently");
@@ -1259,7 +1281,6 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
case X86::TAILJMPd64:
case X86::TAILJMPr64_REX:
case X86::TAILJMPm64_REX:
- case X86::TAILJMPd64_REX:
// Lower these as normal, but add some comments.
OutStreamer->AddComment("TAILCALL");
break;
@@ -1364,6 +1385,9 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
case TargetOpcode::PATCHABLE_RET:
return LowerPATCHABLE_RET(*MI, MCInstLowering);
+ case TargetOpcode::PATCHABLE_TAIL_CALL:
+ return LowerPATCHABLE_TAIL_CALL(*MI, MCInstLowering);
+
case X86::MORESTACK_RET:
EmitAndCountInstruction(MCInstBuilder(getRetOpcode(*Subtarget)));
return;
@@ -1377,37 +1401,45 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
return;
case X86::SEH_PushReg:
+ assert(MF->hasWinCFI() && "SEH_ instruction in function without WinCFI?");
OutStreamer->EmitWinCFIPushReg(RI->getSEHRegNum(MI->getOperand(0).getImm()));
return;
case X86::SEH_SaveReg:
+ assert(MF->hasWinCFI() && "SEH_ instruction in function without WinCFI?");
OutStreamer->EmitWinCFISaveReg(RI->getSEHRegNum(MI->getOperand(0).getImm()),
MI->getOperand(1).getImm());
return;
case X86::SEH_SaveXMM:
+ assert(MF->hasWinCFI() && "SEH_ instruction in function without WinCFI?");
OutStreamer->EmitWinCFISaveXMM(RI->getSEHRegNum(MI->getOperand(0).getImm()),
MI->getOperand(1).getImm());
return;
case X86::SEH_StackAlloc:
+ assert(MF->hasWinCFI() && "SEH_ instruction in function without WinCFI?");
OutStreamer->EmitWinCFIAllocStack(MI->getOperand(0).getImm());
return;
case X86::SEH_SetFrame:
+ assert(MF->hasWinCFI() && "SEH_ instruction in function without WinCFI?");
OutStreamer->EmitWinCFISetFrame(RI->getSEHRegNum(MI->getOperand(0).getImm()),
MI->getOperand(1).getImm());
return;
case X86::SEH_PushFrame:
+ assert(MF->hasWinCFI() && "SEH_ instruction in function without WinCFI?");
OutStreamer->EmitWinCFIPushFrame(MI->getOperand(0).getImm());
return;
case X86::SEH_EndPrologue:
+ assert(MF->hasWinCFI() && "SEH_ instruction in function without WinCFI?");
OutStreamer->EmitWinCFIEndProlog();
return;
case X86::SEH_Epilogue: {
+ assert(MF->hasWinCFI() && "SEH_ instruction in function without WinCFI?");
MachineBasicBlock::const_iterator MBBI(MI);
// Check if preceded by a call and emit nop if so.
for (MBBI = PrevCrossBBInst(MBBI);
@@ -1463,59 +1495,84 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
assert(MI->getNumOperands() >= 6 &&
"We should always have at least 6 operands!");
- const MachineOperand &DstOp = MI->getOperand(0);
- const MachineOperand &SrcOp = MI->getOperand(SrcIdx);
- const MachineOperand &MaskOp = MI->getOperand(MaskIdx);
+ const MachineOperand &MaskOp = MI->getOperand(MaskIdx);
if (auto *C = getConstantFromPool(*MI, MaskOp)) {
- SmallVector<int, 16> Mask;
+ SmallVector<int, 64> Mask;
DecodePSHUFBMask(C, Mask);
if (!Mask.empty())
- OutStreamer->AddComment(getShuffleComment(DstOp, SrcOp, SrcOp, Mask));
+ OutStreamer->AddComment(getShuffleComment(MI, SrcIdx, SrcIdx, Mask));
}
break;
}
+ case X86::VPERMILPSrm:
+ case X86::VPERMILPSYrm:
+ case X86::VPERMILPSZ128rm:
+ case X86::VPERMILPSZ128rmk:
+ case X86::VPERMILPSZ128rmkz:
+ case X86::VPERMILPSZ256rm:
+ case X86::VPERMILPSZ256rmk:
+ case X86::VPERMILPSZ256rmkz:
+ case X86::VPERMILPSZrm:
+ case X86::VPERMILPSZrmk:
+ case X86::VPERMILPSZrmkz:
case X86::VPERMILPDrm:
case X86::VPERMILPDYrm:
case X86::VPERMILPDZ128rm:
+ case X86::VPERMILPDZ128rmk:
+ case X86::VPERMILPDZ128rmkz:
case X86::VPERMILPDZ256rm:
- case X86::VPERMILPDZrm: {
+ case X86::VPERMILPDZ256rmk:
+ case X86::VPERMILPDZ256rmkz:
+ case X86::VPERMILPDZrm:
+ case X86::VPERMILPDZrmk:
+ case X86::VPERMILPDZrmkz: {
if (!OutStreamer->isVerboseAsm())
break;
- assert(MI->getNumOperands() > 5 &&
- "We should always have at least 5 operands!");
- const MachineOperand &DstOp = MI->getOperand(0);
- const MachineOperand &SrcOp = MI->getOperand(1);
- const MachineOperand &MaskOp = MI->getOperand(5);
-
- if (auto *C = getConstantFromPool(*MI, MaskOp)) {
- SmallVector<int, 8> Mask;
- DecodeVPERMILPMask(C, 64, Mask);
- if (!Mask.empty())
- OutStreamer->AddComment(getShuffleComment(DstOp, SrcOp, SrcOp, Mask));
+ unsigned SrcIdx, MaskIdx;
+ unsigned ElSize;
+ switch (MI->getOpcode()) {
+ default: llvm_unreachable("Invalid opcode");
+ case X86::VPERMILPSrm:
+ case X86::VPERMILPSYrm:
+ case X86::VPERMILPSZ128rm:
+ case X86::VPERMILPSZ256rm:
+ case X86::VPERMILPSZrm:
+ SrcIdx = 1; MaskIdx = 5; ElSize = 32; break;
+ case X86::VPERMILPSZ128rmkz:
+ case X86::VPERMILPSZ256rmkz:
+ case X86::VPERMILPSZrmkz:
+ SrcIdx = 2; MaskIdx = 6; ElSize = 32; break;
+ case X86::VPERMILPSZ128rmk:
+ case X86::VPERMILPSZ256rmk:
+ case X86::VPERMILPSZrmk:
+ SrcIdx = 3; MaskIdx = 7; ElSize = 32; break;
+ case X86::VPERMILPDrm:
+ case X86::VPERMILPDYrm:
+ case X86::VPERMILPDZ128rm:
+ case X86::VPERMILPDZ256rm:
+ case X86::VPERMILPDZrm:
+ SrcIdx = 1; MaskIdx = 5; ElSize = 64; break;
+ case X86::VPERMILPDZ128rmkz:
+ case X86::VPERMILPDZ256rmkz:
+ case X86::VPERMILPDZrmkz:
+ SrcIdx = 2; MaskIdx = 6; ElSize = 64; break;
+ case X86::VPERMILPDZ128rmk:
+ case X86::VPERMILPDZ256rmk:
+ case X86::VPERMILPDZrmk:
+ SrcIdx = 3; MaskIdx = 7; ElSize = 64; break;
}
- break;
- }
- case X86::VPERMILPSrm:
- case X86::VPERMILPSYrm:
- case X86::VPERMILPSZ128rm:
- case X86::VPERMILPSZ256rm:
- case X86::VPERMILPSZrm: {
- if (!OutStreamer->isVerboseAsm())
- break;
- assert(MI->getNumOperands() > 5 &&
- "We should always have at least 5 operands!");
- const MachineOperand &DstOp = MI->getOperand(0);
- const MachineOperand &SrcOp = MI->getOperand(1);
- const MachineOperand &MaskOp = MI->getOperand(5);
+ assert(MI->getNumOperands() >= 6 &&
+ "We should always have at least 6 operands!");
+ const MachineOperand &MaskOp = MI->getOperand(MaskIdx);
if (auto *C = getConstantFromPool(*MI, MaskOp)) {
SmallVector<int, 16> Mask;
- DecodeVPERMILPMask(C, 32, Mask);
+ DecodeVPERMILPMask(C, ElSize, Mask);
if (!Mask.empty())
- OutStreamer->AddComment(getShuffleComment(DstOp, SrcOp, SrcOp, Mask));
+ OutStreamer->AddComment(getShuffleComment(MI, SrcIdx, SrcIdx, Mask));
}
break;
}
@@ -1526,14 +1583,10 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
case X86::VPERMIL2PSrmY: {
if (!OutStreamer->isVerboseAsm())
break;
- assert(MI->getNumOperands() > 7 &&
- "We should always have at least 7 operands!");
- const MachineOperand &DstOp = MI->getOperand(0);
- const MachineOperand &SrcOp1 = MI->getOperand(1);
- const MachineOperand &SrcOp2 = MI->getOperand(2);
- const MachineOperand &MaskOp = MI->getOperand(6);
- const MachineOperand &CtrlOp = MI->getOperand(MI->getNumOperands() - 1);
+ assert(MI->getNumOperands() >= 8 &&
+ "We should always have at least 8 operands!");
+ const MachineOperand &CtrlOp = MI->getOperand(MI->getNumOperands() - 1);
if (!CtrlOp.isImm())
break;
@@ -1544,11 +1597,12 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
case X86::VPERMIL2PDrm: case X86::VPERMIL2PDrmY: ElSize = 64; break;
}
+ const MachineOperand &MaskOp = MI->getOperand(6);
if (auto *C = getConstantFromPool(*MI, MaskOp)) {
SmallVector<int, 16> Mask;
DecodeVPERMIL2PMask(C, (unsigned)CtrlOp.getImm(), ElSize, Mask);
if (!Mask.empty())
- OutStreamer->AddComment(getShuffleComment(DstOp, SrcOp1, SrcOp2, Mask));
+ OutStreamer->AddComment(getShuffleComment(MI, 1, 2, Mask));
}
break;
}
@@ -1556,18 +1610,15 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
case X86::VPPERMrrm: {
if (!OutStreamer->isVerboseAsm())
break;
- assert(MI->getNumOperands() > 6 &&
- "We should always have at least 6 operands!");
- const MachineOperand &DstOp = MI->getOperand(0);
- const MachineOperand &SrcOp1 = MI->getOperand(1);
- const MachineOperand &SrcOp2 = MI->getOperand(2);
- const MachineOperand &MaskOp = MI->getOperand(6);
+ assert(MI->getNumOperands() >= 7 &&
+ "We should always have at least 7 operands!");
+ const MachineOperand &MaskOp = MI->getOperand(6);
if (auto *C = getConstantFromPool(*MI, MaskOp)) {
SmallVector<int, 16> Mask;
DecodeVPPERMMask(C, Mask);
if (!Mask.empty())
- OutStreamer->AddComment(getShuffleComment(DstOp, SrcOp1, SrcOp2, Mask));
+ OutStreamer->AddComment(getShuffleComment(MI, 1, 2, Mask));
}
break;
}
@@ -1605,7 +1656,8 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
CASE_ALL_MOV_RM()
if (!OutStreamer->isVerboseAsm())
break;
- if (MI->getNumOperands() > 4)
+ if (MI->getNumOperands() <= 4)
+ break;
if (auto *C = getConstantFromPool(*MI, MI->getOperand(4))) {
std::string Comment;
raw_string_ostream CS(Comment);
diff --git a/contrib/llvm/lib/Target/X86/X86OptimizeLEAs.cpp b/contrib/llvm/lib/Target/X86/X86OptimizeLEAs.cpp
index 4da0fdd..e144700 100644
--- a/contrib/llvm/lib/Target/X86/X86OptimizeLEAs.cpp
+++ b/contrib/llvm/lib/Target/X86/X86OptimizeLEAs.cpp
@@ -44,12 +44,6 @@ static cl::opt<bool>
STATISTIC(NumSubstLEAs, "Number of LEA instruction substitutions");
STATISTIC(NumRedundantLEAs, "Number of redundant LEA instructions removed");
-class MemOpKey;
-
-/// \brief Returns a hash table key based on memory operands of \p MI. The
-/// number of the first memory operand of \p MI is specified through \p N.
-static inline MemOpKey getMemOpKey(const MachineInstr &MI, unsigned N);
-
/// \brief Returns true if two machine operands are identical and they are not
/// physical registers.
static inline bool isIdenticalOp(const MachineOperand &MO1,
@@ -63,6 +57,7 @@ static bool isSimilarDispOp(const MachineOperand &MO1,
/// \brief Returns true if the instruction is LEA.
static inline bool isLEA(const MachineInstr &MI);
+namespace {
/// A key based on instruction's memory operands.
class MemOpKey {
public:
@@ -95,6 +90,7 @@ public:
// Address' displacement operand.
const MachineOperand *Disp;
};
+} // end anonymous namespace
/// Provide DenseMapInfo for MemOpKey.
namespace llvm {
@@ -168,6 +164,8 @@ template <> struct DenseMapInfo<MemOpKey> {
};
}
+/// \brief Returns a hash table key based on memory operands of \p MI. The
+/// number of the first memory operand of \p MI is specified through \p N.
static inline MemOpKey getMemOpKey(const MachineInstr &MI, unsigned N) {
assert((isLEA(MI) || MI.mayLoadOrStore()) &&
"The instruction must be a LEA, a load or a store");
@@ -221,7 +219,7 @@ class OptimizeLEAPass : public MachineFunctionPass {
public:
OptimizeLEAPass() : MachineFunctionPass(ID) {}
- const char *getPassName() const override { return "X86 LEA Optimize"; }
+ StringRef getPassName() const override { return "X86 LEA Optimize"; }
/// \brief Loop over all of the basic blocks, replacing address
/// calculations in load and store instructions, if it's already
@@ -237,7 +235,7 @@ private:
/// \brief Choose the best \p LEA instruction from the \p List to replace
/// address calculation in \p MI instruction. Return the address displacement
- /// and the distance between \p MI and the choosen \p BestLEA in
+ /// and the distance between \p MI and the chosen \p BestLEA in
/// \p AddrDispShift and \p Dist.
bool chooseBestLEA(const SmallVectorImpl<MachineInstr *> &List,
const MachineInstr &MI, MachineInstr *&BestLEA,
@@ -551,10 +549,10 @@ bool OptimizeLEAPass::removeRedundantLEAs(MemOpMap &LEAs) {
MachineInstr &Last = **I2;
int64_t AddrDispShift;
- // LEAs should be in occurence order in the list, so we can freely
+ // LEAs should be in occurrence order in the list, so we can freely
// replace later LEAs with earlier ones.
assert(calcInstrDist(First, Last) > 0 &&
- "LEAs must be in occurence order in the list");
+ "LEAs must be in occurrence order in the list");
// Check that the Last LEA instruction can be replaced by the First.
if (!isReplaceable(First, Last, AddrDispShift)) {
diff --git a/contrib/llvm/lib/Target/X86/X86PadShortFunction.cpp b/contrib/llvm/lib/Target/X86/X86PadShortFunction.cpp
index 62a9aaf..3069d1f 100644
--- a/contrib/llvm/lib/Target/X86/X86PadShortFunction.cpp
+++ b/contrib/llvm/lib/Target/X86/X86PadShortFunction.cpp
@@ -57,10 +57,10 @@ namespace {
MachineFunctionProperties getRequiredProperties() const override {
return MachineFunctionProperties().set(
- MachineFunctionProperties::Property::AllVRegsAllocated);
+ MachineFunctionProperties::Property::NoVRegs);
}
- const char *getPassName() const override {
+ StringRef getPassName() const override {
return "X86 Atom pad short functions";
}
diff --git a/contrib/llvm/lib/Target/X86/X86RegisterInfo.cpp b/contrib/llvm/lib/Target/X86/X86RegisterInfo.cpp
index 8675063..65f438f 100644
--- a/contrib/llvm/lib/Target/X86/X86RegisterInfo.cpp
+++ b/contrib/llvm/lib/Target/X86/X86RegisterInfo.cpp
@@ -128,21 +128,44 @@ X86RegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC,
if (RC == &X86::GR8_NOREXRegClass)
return RC;
+ const X86Subtarget &Subtarget = MF.getSubtarget<X86Subtarget>();
+
const TargetRegisterClass *Super = RC;
TargetRegisterClass::sc_iterator I = RC->getSuperClasses();
do {
switch (Super->getID()) {
+ case X86::FR32RegClassID:
+ case X86::FR64RegClassID:
+ // If AVX-512 isn't supported we should only inflate to these classes.
+ if (!Subtarget.hasAVX512() && Super->getSize() == RC->getSize())
+ return Super;
+ break;
+ case X86::VR128RegClassID:
+ case X86::VR256RegClassID:
+ // If VLX isn't supported we should only inflate to these classes.
+ if (!Subtarget.hasVLX() && Super->getSize() == RC->getSize())
+ return Super;
+ break;
+ case X86::VR128XRegClassID:
+ case X86::VR256XRegClassID:
+    // If VLX isn't supported we shouldn't inflate to these classes.
+ if (Subtarget.hasVLX() && Super->getSize() == RC->getSize())
+ return Super;
+ break;
+ case X86::FR32XRegClassID:
+ case X86::FR64XRegClassID:
+    // If AVX-512 isn't supported we shouldn't inflate to these classes.
+ if (Subtarget.hasAVX512() && Super->getSize() == RC->getSize())
+ return Super;
+ break;
case X86::GR8RegClassID:
case X86::GR16RegClassID:
case X86::GR32RegClassID:
case X86::GR64RegClassID:
- case X86::FR32RegClassID:
- case X86::FR64RegClassID:
case X86::RFP32RegClassID:
case X86::RFP64RegClassID:
case X86::RFP80RegClassID:
- case X86::VR128RegClassID:
- case X86::VR256RegClassID:
+ case X86::VR512RegClassID:
// Don't return a super-class that would shrink the spill size.
// That can happen with the vector and float classes.
if (Super->getSize() == RC->getSize())
@@ -241,13 +264,14 @@ X86RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
const MCPhysReg *
X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
+ assert(MF && "MachineFunction required");
+
const X86Subtarget &Subtarget = MF->getSubtarget<X86Subtarget>();
bool HasSSE = Subtarget.hasSSE1();
bool HasAVX = Subtarget.hasAVX();
bool HasAVX512 = Subtarget.hasAVX512();
- bool CallsEHReturn = MF->getMMI().callsEHReturn();
+ bool CallsEHReturn = MF->callsEHReturn();
- assert(MF && "MachineFunction required");
switch (MF->getFunction()->getCallingConv()) {
case CallingConv::GHC:
case CallingConv::HiPE:
@@ -282,11 +306,26 @@ X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
}
case CallingConv::HHVM:
return CSR_64_HHVM_SaveList;
+ case CallingConv::X86_RegCall:
+ if (Is64Bit) {
+ if (IsWin64) {
+ return (HasSSE ? CSR_Win64_RegCall_SaveList :
+ CSR_Win64_RegCall_NoSSE_SaveList);
+ } else {
+ return (HasSSE ? CSR_SysV64_RegCall_SaveList :
+ CSR_SysV64_RegCall_NoSSE_SaveList);
+ }
+ } else {
+ return (HasSSE ? CSR_32_RegCall_SaveList :
+ CSR_32_RegCall_NoSSE_SaveList);
+ }
case CallingConv::Cold:
if (Is64Bit)
return CSR_64_MostRegs_SaveList;
break;
case CallingConv::X86_64_Win64:
+ if (!HasSSE)
+ return CSR_Win64_NoSSE_SaveList;
return CSR_Win64_SaveList;
case CallingConv::X86_64_SysV:
if (CallsEHReturn)
@@ -313,8 +352,11 @@ X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
}
if (Is64Bit) {
- if (IsWin64)
+ if (IsWin64) {
+ if (!HasSSE)
+ return CSR_Win64_NoSSE_SaveList;
return CSR_Win64_SaveList;
+ }
if (CallsEHReturn)
return CSR_64EHRet_SaveList;
if (Subtarget.getTargetLowering()->supportSwiftError() &&
@@ -378,6 +420,19 @@ X86RegisterInfo::getCallPreservedMask(const MachineFunction &MF,
}
case CallingConv::HHVM:
return CSR_64_HHVM_RegMask;
+ case CallingConv::X86_RegCall:
+ if (Is64Bit) {
+ if (IsWin64) {
+ return (HasSSE ? CSR_Win64_RegCall_RegMask :
+ CSR_Win64_RegCall_NoSSE_RegMask);
+ } else {
+ return (HasSSE ? CSR_SysV64_RegCall_RegMask :
+ CSR_SysV64_RegCall_NoSSE_RegMask);
+ }
+ } else {
+ return (HasSSE ? CSR_32_RegCall_RegMask :
+ CSR_32_RegCall_NoSSE_RegMask);
+ }
case CallingConv::Cold:
if (Is64Bit)
return CSR_64_MostRegs_RegMask;
@@ -503,6 +558,8 @@ BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
}
}
+ assert(checkAllSuperRegsMarked(Reserved,
+ {X86::SIL, X86::DIL, X86::BPL, X86::SPL}));
return Reserved;
}
@@ -526,12 +583,12 @@ void X86RegisterInfo::adjustStackMapLiveOutMask(uint32_t *Mask) const {
// Stack Frame Processing methods
//===----------------------------------------------------------------------===//
-static bool CantUseSP(const MachineFrameInfo *MFI) {
- return MFI->hasVarSizedObjects() || MFI->hasOpaqueSPAdjustment();
+static bool CantUseSP(const MachineFrameInfo &MFI) {
+ return MFI.hasVarSizedObjects() || MFI.hasOpaqueSPAdjustment();
}
bool X86RegisterInfo::hasBasePointer(const MachineFunction &MF) const {
- const MachineFrameInfo *MFI = MF.getFrameInfo();
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
if (!EnableBasePointer)
return false;
@@ -549,7 +606,7 @@ bool X86RegisterInfo::canRealignStack(const MachineFunction &MF) const {
if (!TargetRegisterInfo::canRealignStack(MF))
return false;
- const MachineFrameInfo *MFI = MF.getFrameInfo();
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
const MachineRegisterInfo *MRI = &MF.getRegInfo();
// Stack realignment requires a frame pointer. If we already started
@@ -571,6 +628,35 @@ bool X86RegisterInfo::hasReservedSpillSlot(const MachineFunction &MF,
llvm_unreachable("Unused function on X86. Otherwise need a test case.");
}
+// tryOptimizeLEAtoMOV - helper function that tries to replace a LEA instruction
+// of the form 'lea (%esp), %ebx' --> 'mov %esp, %ebx'.
+// TODO: In this case we should be really trying first to entirely eliminate
+// this instruction which is a plain copy.
+static bool tryOptimizeLEAtoMOV(MachineBasicBlock::iterator II) {
+ MachineInstr &MI = *II;
+ unsigned Opc = II->getOpcode();
+ // Check if this is a LEA of the form 'lea (%esp), %ebx'
+ if ((Opc != X86::LEA32r && Opc != X86::LEA64r && Opc != X86::LEA64_32r) ||
+ MI.getOperand(2).getImm() != 1 ||
+ MI.getOperand(3).getReg() != X86::NoRegister ||
+ MI.getOperand(4).getImm() != 0 ||
+ MI.getOperand(5).getReg() != X86::NoRegister)
+ return false;
+ unsigned BasePtr = MI.getOperand(1).getReg();
+ // In X32 mode, ensure the base-pointer is a 32-bit operand, so the LEA will
+ // be replaced with a 32-bit operand MOV which will zero extend the upper
+ // 32-bits of the super register.
+ if (Opc == X86::LEA64_32r)
+ BasePtr = getX86SubSuperRegister(BasePtr, 32);
+ unsigned NewDestReg = MI.getOperand(0).getReg();
+ const X86InstrInfo *TII =
+ MI.getParent()->getParent()->getSubtarget<X86Subtarget>().getInstrInfo();
+ TII->copyPhysReg(*MI.getParent(), II, MI.getDebugLoc(), NewDestReg, BasePtr,
+ MI.getOperand(1).isKill());
+ MI.eraseFromParent();
+ return true;
+}
+
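The check in tryOptimizeLEAtoMOV only fires when the LEA's five-part memory reference (base, scale, index, displacement, segment in operands 1-5) degenerates to just the base register. The simplified model below restates that predicate outside of the MachineInstr API; LEAMemRef and isPlainBaseCopy are illustrative names, not LLVM code.

    // Simplified model of the "lea (%reg), %dst" pattern accepted above.
    #include <cstdio>

    struct LEAMemRef {
      unsigned BaseReg;  // operand 1
      int Scale;         // operand 2
      unsigned IndexReg; // operand 3 (0 == no register)
      int Disp;          // operand 4
      unsigned SegReg;   // operand 5 (0 == no register)
    };

    static bool isPlainBaseCopy(const LEAMemRef &M) {
      // Scale 1, no index, zero displacement, no segment: a plain copy.
      return M.Scale == 1 && M.IndexReg == 0 && M.Disp == 0 && M.SegReg == 0;
    }

    int main() {
      LEAMemRef Lea = {/*BaseReg=*/4, /*Scale=*/1, /*IndexReg=*/0,
                       /*Disp=*/0, /*SegReg=*/0};
      std::printf("replace with mov: %s\n", isPlainBaseCopy(Lea) ? "yes" : "no");
      return 0;
    }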
void
X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
int SPAdj, unsigned FIOperandNum,
@@ -611,19 +697,21 @@ X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
// For LEA64_32r when BasePtr is 32-bits (X32) we can use full-size 64-bit
// register as source operand, semantic is the same and destination is
// 32-bits. It saves one byte per lea in code since 0x67 prefix is avoided.
+ // Don't change BasePtr since it is used later for stack adjustment.
+ unsigned MachineBasePtr = BasePtr;
if (Opc == X86::LEA64_32r && X86::GR32RegClass.contains(BasePtr))
- BasePtr = getX86SubSuperRegister(BasePtr, 64);
+ MachineBasePtr = getX86SubSuperRegister(BasePtr, 64);
// This must be part of a four operand memory reference. Replace the
- // FrameIndex with base register with EBP. Add an offset to the offset.
- MI.getOperand(FIOperandNum).ChangeToRegister(BasePtr, false);
+ // FrameIndex with base register. Add an offset to the offset.
+ MI.getOperand(FIOperandNum).ChangeToRegister(MachineBasePtr, false);
// Now add the frame object offset to the offset from EBP.
int FIOffset;
if (AfterFPPop) {
// Tail call jmp happens after FP is popped.
- const MachineFrameInfo *MFI = MF.getFrameInfo();
- FIOffset = MFI->getObjectOffset(FrameIndex) - TFI->getOffsetOfLocalArea();
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+ FIOffset = MFI.getObjectOffset(FrameIndex) - TFI->getOffsetOfLocalArea();
} else
FIOffset = TFI->getFrameIndexReference(MF, FrameIndex, IgnoredFrameReg);
@@ -645,7 +733,8 @@ X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
int Offset = FIOffset + Imm;
assert((!Is64Bit || isInt<32>((long long)FIOffset + Imm)) &&
"Requesting 64-bit offset in 32-bit immediate!");
- MI.getOperand(FIOperandNum + 3).ChangeToImmediate(Offset);
+ if (Offset != 0 || !tryOptimizeLEAtoMOV(II))
+ MI.getOperand(FIOperandNum + 3).ChangeToImmediate(Offset);
} else {
// Offset is symbolic. This is extremely rare.
uint64_t Offset = FIOffset +
@@ -667,13 +756,3 @@ X86RegisterInfo::getPtrSizedFrameRegister(const MachineFunction &MF) const {
FrameReg = getX86SubSuperRegister(FrameReg, 32);
return FrameReg;
}
-
-unsigned llvm::get512BitSuperRegister(unsigned Reg) {
- if (Reg >= X86::XMM0 && Reg <= X86::XMM31)
- return X86::ZMM0 + (Reg - X86::XMM0);
- if (Reg >= X86::YMM0 && Reg <= X86::YMM31)
- return X86::ZMM0 + (Reg - X86::YMM0);
- if (Reg >= X86::ZMM0 && Reg <= X86::ZMM31)
- return Reg;
- llvm_unreachable("Unexpected SIMD register");
-}
diff --git a/contrib/llvm/lib/Target/X86/X86RegisterInfo.h b/contrib/llvm/lib/Target/X86/X86RegisterInfo.h
index 8d0094c..58fa31e 100644
--- a/contrib/llvm/lib/Target/X86/X86RegisterInfo.h
+++ b/contrib/llvm/lib/Target/X86/X86RegisterInfo.h
@@ -100,7 +100,7 @@ public:
const MCPhysReg *
getCalleeSavedRegs(const MachineFunction* MF) const override;
const MCPhysReg *
- getCalleeSavedRegsViaCopy(const MachineFunction *MF) const override;
+ getCalleeSavedRegsViaCopy(const MachineFunction *MF) const;
const uint32_t *getCallPreservedMask(const MachineFunction &MF,
CallingConv::ID) const override;
const uint32_t *getNoPreservedMask() const override;
@@ -137,9 +137,6 @@ public:
unsigned getSlotSize() const { return SlotSize; }
};
-//get512BitRegister - X86 utility - returns 512-bit super register
-unsigned get512BitSuperRegister(unsigned Reg);
-
} // End llvm namespace
#endif
diff --git a/contrib/llvm/lib/Target/X86/X86RegisterInfo.td b/contrib/llvm/lib/Target/X86/X86RegisterInfo.td
index 373f9b4..372a15a 100644
--- a/contrib/llvm/lib/Target/X86/X86RegisterInfo.td
+++ b/contrib/llvm/lib/Target/X86/X86RegisterInfo.td
@@ -345,6 +345,8 @@ def GR32 : RegisterClass<"X86", [i32], 32,
// GR64 - 64-bit GPRs. This oddly includes RIP, which isn't accurate, since
// RIP isn't really a register and it can't be used anywhere except in an
// address, but it doesn't cause trouble.
+// FIXME: it *does* cause trouble - CheckBaseRegAndIndexReg() has extra
+// tests because of the inclusion of RIP in this register class.
def GR64 : RegisterClass<"X86", [i64], 64,
(add RAX, RCX, RDX, RSI, RDI, R8, R9, R10, R11,
RBX, R14, R15, R12, R13, RBP, RSP, RIP)>;
diff --git a/contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp b/contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp
index d02859b..f031a28 100644
--- a/contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp
+++ b/contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp
@@ -31,8 +31,8 @@ bool X86SelectionDAGInfo::isBaseRegConflictPossible(
// alignment requirements. Fall back to generic code if there are any
// dynamic stack adjustments (hopefully rare) and the base pointer would
// conflict if we had to use it.
- MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
- if (!MFI->hasVarSizedObjects() && !MFI->hasOpaqueSPAdjustment())
+ MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
+ if (!MFI.hasVarSizedObjects() && !MFI.hasOpaqueSPAdjustment())
return false;
const X86RegisterInfo *TRI = static_cast<const X86RegisterInfo *>(
diff --git a/contrib/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp b/contrib/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp
index 1adc92c..1111552 100644
--- a/contrib/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp
+++ b/contrib/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp
@@ -14,6 +14,7 @@
#include "X86ShuffleDecodeConstantPool.h"
#include "Utils/X86ShuffleDecode.h"
+#include "llvm/ADT/SmallBitVector.h"
#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/IR/Constants.h"
@@ -23,10 +24,12 @@
namespace llvm {
-void DecodePSHUFBMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask) {
- Type *MaskTy = C->getType();
- // It is not an error for the PSHUFB mask to not be a vector of i8 because the
- // constant pool uniques constants by their bit representation.
+static bool extractConstantMask(const Constant *C, unsigned MaskEltSizeInBits,
+ SmallBitVector &UndefElts,
+ SmallVectorImpl<uint64_t> &RawMask) {
+ // It is not an error for shuffle masks to not be a vector of
+ // MaskEltSizeInBits because the constant pool uniques constants by their
+ // bit representation.
// e.g. the following take up the same space in the constant pool:
// i128 -170141183420855150465331762880109871104
//
@@ -34,165 +37,161 @@ void DecodePSHUFBMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask) {
//
// <4 x i32> <i32 -2147483648, i32 -2147483648,
// i32 -2147483648, i32 -2147483648>
+ Type *CstTy = C->getType();
+ if (!CstTy->isVectorTy())
+ return false;
+
+ Type *CstEltTy = CstTy->getVectorElementType();
+ if (!CstEltTy->isIntegerTy())
+ return false;
+
+ unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
+ unsigned CstEltSizeInBits = CstTy->getScalarSizeInBits();
+ unsigned NumCstElts = CstTy->getVectorNumElements();
+
+ // Extract all the undef/constant element data and pack into single bitsets.
+ APInt UndefBits(CstSizeInBits, 0);
+ APInt MaskBits(CstSizeInBits, 0);
+ for (unsigned i = 0; i != NumCstElts; ++i) {
+ Constant *COp = C->getAggregateElement(i);
+ if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
+ return false;
-#ifndef NDEBUG
- unsigned MaskTySize = MaskTy->getPrimitiveSizeInBits();
- assert(MaskTySize == 128 || MaskTySize == 256 || MaskTySize == 512);
-#endif
+ if (isa<UndefValue>(COp)) {
+ APInt EltUndef = APInt::getLowBitsSet(CstSizeInBits, CstEltSizeInBits);
+ UndefBits |= EltUndef.shl(i * CstEltSizeInBits);
+ continue;
+ }
- if (!MaskTy->isVectorTy())
- return;
- int NumElts = MaskTy->getVectorNumElements();
+ APInt EltBits = cast<ConstantInt>(COp)->getValue();
+ EltBits = EltBits.zextOrTrunc(CstSizeInBits);
+ MaskBits |= EltBits.shl(i * CstEltSizeInBits);
+ }
- Type *EltTy = MaskTy->getVectorElementType();
- if (!EltTy->isIntegerTy())
- return;
+ // Now extract the undef/constant bit data into the raw shuffle masks.
+ assert((CstSizeInBits % MaskEltSizeInBits) == 0 &&
+ "Unaligned shuffle mask size");
- // The shuffle mask requires a byte vector - decode cases with
- // wider elements as well.
- unsigned BitWidth = cast<IntegerType>(EltTy)->getBitWidth();
- if ((BitWidth % 8) != 0)
+ unsigned NumMaskElts = CstSizeInBits / MaskEltSizeInBits;
+ UndefElts = SmallBitVector(NumMaskElts, false);
+ RawMask.resize(NumMaskElts, 0);
+
+ for (unsigned i = 0; i != NumMaskElts; ++i) {
+ APInt EltUndef = UndefBits.lshr(i * MaskEltSizeInBits);
+ EltUndef = EltUndef.zextOrTrunc(MaskEltSizeInBits);
+
+ // Only treat the element as UNDEF if all bits are UNDEF, otherwise
+ // treat it as zero.
+ if (EltUndef.isAllOnesValue()) {
+ UndefElts[i] = true;
+ RawMask[i] = 0;
+ continue;
+ }
+
+ APInt EltBits = MaskBits.lshr(i * MaskEltSizeInBits);
+ EltBits = EltBits.zextOrTrunc(MaskEltSizeInBits);
+ RawMask[i] = EltBits.getZExtValue();
+ }
+
+ return true;
+}
+
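The helper above packs the constant's element bits into one wide bitset and then re-slices it into MaskEltSizeInBits chunks, because the constant pool uniques constants by bit pattern and the same mask may be stored as <2 x i64>, <4 x i32>, or <16 x i8>. The sketch below shows only that re-slicing idea for a 128-bit value held as two 64-bit words; undef tracking and APInt are omitted, and sliceTo8Bit is an illustrative name, not LLVM code.

    // Re-slice a 128-bit constant into 16 byte-sized raw mask elements.
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    static std::vector<uint8_t> sliceTo8Bit(uint64_t Lo, uint64_t Hi) {
      std::vector<uint8_t> RawMask(16);
      for (unsigned i = 0; i != 8; ++i)
        RawMask[i] = uint8_t(Lo >> (i * 8));     // bytes 0..7 from the low i64
      for (unsigned i = 0; i != 8; ++i)
        RawMask[8 + i] = uint8_t(Hi >> (i * 8)); // bytes 8..15 from the high i64
      return RawMask;
    }

    int main() {
      // <2 x i64> <i64 0x0706050403020100, i64 0x0F0E0D0C0B0A0908> viewed as
      // a <16 x i8> shuffle mask.
      for (uint8_t B : sliceTo8Bit(0x0706050403020100ULL, 0x0F0E0D0C0B0A0908ULL))
        std::printf("%u ", unsigned(B));
      std::printf("\n");
      return 0;
    }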
+void DecodePSHUFBMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask) {
+ Type *MaskTy = C->getType();
+ unsigned MaskTySize = MaskTy->getPrimitiveSizeInBits();
+ (void)MaskTySize;
+ assert((MaskTySize == 128 || MaskTySize == 256 || MaskTySize == 512) &&
+ "Unexpected vector size.");
+
+ // The shuffle mask requires a byte vector.
+ SmallBitVector UndefElts;
+ SmallVector<uint64_t, 32> RawMask;
+ if (!extractConstantMask(C, 8, UndefElts, RawMask))
return;
- int Scale = BitWidth / 8;
- int NumBytes = NumElts * Scale;
- ShuffleMask.reserve(NumBytes);
+ unsigned NumElts = RawMask.size();
+ assert((NumElts == 16 || NumElts == 32 || NumElts == 64) &&
+ "Unexpected number of vector elements.");
- for (int i = 0; i != NumElts; ++i) {
- Constant *COp = C->getAggregateElement(i);
- if (!COp) {
- ShuffleMask.clear();
- return;
- } else if (isa<UndefValue>(COp)) {
- ShuffleMask.append(Scale, SM_SentinelUndef);
+ for (unsigned i = 0; i != NumElts; ++i) {
+ if (UndefElts[i]) {
+ ShuffleMask.push_back(SM_SentinelUndef);
continue;
}
- APInt APElt = cast<ConstantInt>(COp)->getValue();
- for (int j = 0; j != Scale; ++j) {
+ uint64_t Element = RawMask[i];
+ // If the high bit (7) of the byte is set, the element is zeroed.
+ if (Element & (1 << 7))
+ ShuffleMask.push_back(SM_SentinelZero);
+ else {
// For AVX vectors with 32 bytes the base of the shuffle is the 16-byte
// lane of the vector we're inside.
- int Base = ((i * Scale) + j) & ~0xf;
-
- uint64_t Element = APElt.getLoBits(8).getZExtValue();
- APElt = APElt.lshr(8);
-
- // If the high bit (7) of the byte is set, the element is zeroed.
- if (Element & (1 << 7))
- ShuffleMask.push_back(SM_SentinelZero);
- else {
- // Only the least significant 4 bits of the byte are used.
- int Index = Base + (Element & 0xf);
- ShuffleMask.push_back(Index);
- }
+ unsigned Base = i & ~0xf;
+
+ // Only the least significant 4 bits of the byte are used.
+ int Index = Base + (Element & 0xf);
+ ShuffleMask.push_back(Index);
}
}
-
- assert(NumBytes == (int)ShuffleMask.size() && "Unexpected shuffle mask size");
}
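The per-byte rule used in the rewritten loop above is: if bit 7 of the mask byte is set the lane is zeroed, otherwise the low 4 bits select a byte within the 16-byte lane containing position i. A standalone restatement of that rule, with an illustrative decodePSHUFB helper (not LLVM's API) and -1 standing in for SM_SentinelZero:

    // Decode a raw PSHUFB byte mask into shuffle indices.
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    static std::vector<int> decodePSHUFB(const std::vector<uint8_t> &RawMask) {
      std::vector<int> ShuffleMask;
      for (unsigned i = 0; i != RawMask.size(); ++i) {
        uint8_t Element = RawMask[i];
        if (Element & 0x80)
          ShuffleMask.push_back(-1); // zeroed lane (SM_SentinelZero in the patch)
        else
          ShuffleMask.push_back(int(i & ~0xfu) + int(Element & 0xf));
      }
      return ShuffleMask;
    }

    int main() {
      std::vector<uint8_t> Raw = {0x00, 0x84, 0x02, 0x1f}; // 0x84 -> zero lane
      for (int M : decodePSHUFB(Raw))
        std::printf("%d ", M); // prints: 0 -1 2 15
      std::printf("\n");
      return 0;
    }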
void DecodeVPERMILPMask(const Constant *C, unsigned ElSize,
SmallVectorImpl<int> &ShuffleMask) {
Type *MaskTy = C->getType();
- // It is not an error for the PSHUFB mask to not be a vector of i8 because the
- // constant pool uniques constants by their bit representation.
- // e.g. the following take up the same space in the constant pool:
- // i128 -170141183420855150465331762880109871104
- //
- // <2 x i64> <i64 -9223372034707292160, i64 -9223372034707292160>
- //
- // <4 x i32> <i32 -2147483648, i32 -2147483648,
- // i32 -2147483648, i32 -2147483648>
-
- if (ElSize != 32 && ElSize != 64)
- return;
-
unsigned MaskTySize = MaskTy->getPrimitiveSizeInBits();
- if (MaskTySize != 128 && MaskTySize != 256 && MaskTySize != 512)
- return;
-
- // Only support vector types.
- if (!MaskTy->isVectorTy())
+ (void)MaskTySize;
+ assert((MaskTySize == 128 || MaskTySize == 256 || MaskTySize == 512) &&
+ "Unexpected vector size.");
+ assert((ElSize == 32 || ElSize == 64) && "Unexpected vector element size.");
+
+ // The shuffle mask requires elements the same size as the target.
+ SmallBitVector UndefElts;
+ SmallVector<uint64_t, 8> RawMask;
+ if (!extractConstantMask(C, ElSize, UndefElts, RawMask))
return;
- // Make sure its an integer type.
- Type *VecEltTy = MaskTy->getVectorElementType();
- if (!VecEltTy->isIntegerTy())
- return;
-
- // Support any element type from byte up to element size.
- // This is necessary primarily because 64-bit elements get split to 32-bit
- // in the constant pool on 32-bit target.
- unsigned EltTySize = VecEltTy->getIntegerBitWidth();
- if (EltTySize < 8 || EltTySize > ElSize)
- return;
-
- unsigned NumElements = MaskTySize / ElSize;
- assert((NumElements == 2 || NumElements == 4 || NumElements == 8 ||
- NumElements == 16) &&
+ unsigned NumElts = RawMask.size();
+ unsigned NumEltsPerLane = 128 / ElSize;
+ assert((NumElts == 2 || NumElts == 4 || NumElts == 8 || NumElts == 16) &&
"Unexpected number of vector elements.");
- ShuffleMask.reserve(NumElements);
- unsigned NumElementsPerLane = 128 / ElSize;
- unsigned Factor = ElSize / EltTySize;
- for (unsigned i = 0; i < NumElements; ++i) {
- Constant *COp = C->getAggregateElement(i * Factor);
- if (!COp) {
- ShuffleMask.clear();
- return;
- } else if (isa<UndefValue>(COp)) {
+ for (unsigned i = 0; i != NumElts; ++i) {
+ if (UndefElts[i]) {
ShuffleMask.push_back(SM_SentinelUndef);
continue;
}
- int Index = i & ~(NumElementsPerLane - 1);
- uint64_t Element = cast<ConstantInt>(COp)->getZExtValue();
+
+ int Index = i & ~(NumEltsPerLane - 1);
+ uint64_t Element = RawMask[i];
if (ElSize == 64)
Index += (Element >> 1) & 0x1;
else
Index += Element & 0x3;
+
ShuffleMask.push_back(Index);
}
-
- // TODO: Handle funny-looking vectors too.
}
void DecodeVPERMIL2PMask(const Constant *C, unsigned M2Z, unsigned ElSize,
SmallVectorImpl<int> &ShuffleMask) {
Type *MaskTy = C->getType();
-
unsigned MaskTySize = MaskTy->getPrimitiveSizeInBits();
- if (MaskTySize != 128 && MaskTySize != 256)
- return;
+ (void)MaskTySize;
+ assert((MaskTySize == 128 || MaskTySize == 256) && "Unexpected vector size.");
- // Only support vector types.
- if (!MaskTy->isVectorTy())
+ // The shuffle mask requires elements the same size as the target.
+ SmallBitVector UndefElts;
+ SmallVector<uint64_t, 8> RawMask;
+ if (!extractConstantMask(C, ElSize, UndefElts, RawMask))
return;
- // Make sure its an integer type.
- Type *VecEltTy = MaskTy->getVectorElementType();
- if (!VecEltTy->isIntegerTy())
- return;
-
- // Support any element type from byte up to element size.
- // This is necessary primarily because 64-bit elements get split to 32-bit
- // in the constant pool on 32-bit target.
- unsigned EltTySize = VecEltTy->getIntegerBitWidth();
- if (EltTySize < 8 || EltTySize > ElSize)
- return;
-
- unsigned NumElements = MaskTySize / ElSize;
- assert((NumElements == 2 || NumElements == 4 || NumElements == 8) &&
+ unsigned NumElts = RawMask.size();
+ unsigned NumEltsPerLane = 128 / ElSize;
+ assert((NumElts == 2 || NumElts == 4 || NumElts == 8) &&
"Unexpected number of vector elements.");
- ShuffleMask.reserve(NumElements);
- unsigned NumElementsPerLane = 128 / ElSize;
- unsigned Factor = ElSize / EltTySize;
- for (unsigned i = 0; i < NumElements; ++i) {
- Constant *COp = C->getAggregateElement(i * Factor);
- if (!COp) {
- ShuffleMask.clear();
- return;
- } else if (isa<UndefValue>(COp)) {
+ for (unsigned i = 0; i != NumElts; ++i) {
+ if (UndefElts[i]) {
ShuffleMask.push_back(SM_SentinelUndef);
continue;
}
@@ -201,7 +200,7 @@ void DecodeVPERMIL2PMask(const Constant *C, unsigned M2Z, unsigned ElSize,
// Bits[3] - Match Bit.
// Bits[2:1] - (Per Lane) PD Shuffle Mask.
// Bits[2:0] - (Per Lane) PS Shuffle Mask.
- uint64_t Selector = cast<ConstantInt>(COp)->getZExtValue();
+ uint64_t Selector = RawMask[i];
unsigned MatchBit = (Selector >> 3) & 0x1;
// M2Z[0:1] MatchBit
@@ -215,51 +214,34 @@ void DecodeVPERMIL2PMask(const Constant *C, unsigned M2Z, unsigned ElSize,
continue;
}
- int Index = i & ~(NumElementsPerLane - 1);
+ int Index = i & ~(NumEltsPerLane - 1);
if (ElSize == 64)
Index += (Selector >> 1) & 0x1;
else
Index += Selector & 0x3;
int Src = (Selector >> 2) & 0x1;
- Index += Src * NumElements;
+ Index += Src * NumElts;
ShuffleMask.push_back(Index);
}
-
- // TODO: Handle funny-looking vectors too.
}
void DecodeVPPERMMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask) {
- Type *MaskTy = C->getType();
- assert(MaskTy->getPrimitiveSizeInBits() == 128);
-
- // Only support vector types.
- if (!MaskTy->isVectorTy())
- return;
-
- // Make sure its an integer type.
- Type *VecEltTy = MaskTy->getVectorElementType();
- if (!VecEltTy->isIntegerTy())
- return;
+ assert(C->getType()->getPrimitiveSizeInBits() == 128 &&
+ "Unexpected vector size.");
- // The shuffle mask requires a byte vector - decode cases with
- // wider elements as well.
- unsigned BitWidth = cast<IntegerType>(VecEltTy)->getBitWidth();
- if ((BitWidth % 8) != 0)
+ // The shuffle mask requires a byte vector.
+ SmallBitVector UndefElts;
+ SmallVector<uint64_t, 32> RawMask;
+ if (!extractConstantMask(C, 8, UndefElts, RawMask))
return;
- int NumElts = MaskTy->getVectorNumElements();
- int Scale = BitWidth / 8;
- int NumBytes = NumElts * Scale;
- ShuffleMask.reserve(NumBytes);
+ unsigned NumElts = RawMask.size();
+ assert(NumElts == 16 && "Unexpected number of vector elements.");
- for (int i = 0; i != NumElts; ++i) {
- Constant *COp = C->getAggregateElement(i);
- if (!COp) {
- ShuffleMask.clear();
- return;
- } else if (isa<UndefValue>(COp)) {
- ShuffleMask.append(Scale, SM_SentinelUndef);
+ for (unsigned i = 0; i != NumElts; ++i) {
+ if (UndefElts[i]) {
+ ShuffleMask.push_back(SM_SentinelUndef);
continue;
}
@@ -275,82 +257,77 @@ void DecodeVPPERMMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask) {
// 4 - 00h (zero - fill).
// 5 - FFh (ones - fill).
// 6 - Most significant bit of source byte replicated in all bit positions.
- // 7 - Invert most significant bit of source byte and replicate in all bit positions.
- APInt MaskElt = cast<ConstantInt>(COp)->getValue();
- for (int j = 0; j != Scale; ++j) {
- APInt Index = MaskElt.getLoBits(5);
- APInt PermuteOp = MaskElt.lshr(5).getLoBits(3);
- MaskElt = MaskElt.lshr(8);
-
- if (PermuteOp == 4) {
- ShuffleMask.push_back(SM_SentinelZero);
- continue;
- }
- if (PermuteOp != 0) {
- ShuffleMask.clear();
- return;
- }
- ShuffleMask.push_back((int)Index.getZExtValue());
+ // 7 - Invert most significant bit of source byte and replicate in all bit
+ // positions.
+ uint64_t Element = RawMask[i];
+ uint64_t Index = Element & 0x1F;
+ uint64_t PermuteOp = (Element >> 5) & 0x7;
+
+ if (PermuteOp == 4) {
+ ShuffleMask.push_back(SM_SentinelZero);
+ continue;
+ }
+ if (PermuteOp != 0) {
+ ShuffleMask.clear();
+ return;
}
+ ShuffleMask.push_back((int)Index);
}
-
- assert(NumBytes == (int)ShuffleMask.size() && "Unexpected shuffle mask size");
}
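// A minimal standalone sketch (helper name and the -3 "give up" marker are
// hypothetical) of how one VPPERM selector byte decodes: bits [4:0] index one
// of the 32 bytes of the two concatenated 128-bit sources, bits [7:5] select
// the permute operation. Only op 0 (copy) and op 4 (zero-fill) can be
// represented in a shuffle mask, matching the decoder above.
static int demoVPPERMByte(uint64_t Selector) {
  uint64_t Index = Selector & 0x1F;            // source byte 0..31
  uint64_t PermuteOp = (Selector >> 5) & 0x7;  // 0..7, see the list above
  if (PermuteOp == 4)
    return -2;   // SM_SentinelZero in the real decoder
  if (PermuteOp != 0)
    return -3;   // unsupported op: the real decoder clears the whole mask
  return (int)Index;
}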
-void DecodeVPERMVMask(const Constant *C, MVT VT,
+void DecodeVPERMVMask(const Constant *C, unsigned ElSize,
SmallVectorImpl<int> &ShuffleMask) {
Type *MaskTy = C->getType();
- if (MaskTy->isVectorTy()) {
- unsigned NumElements = MaskTy->getVectorNumElements();
- if (NumElements == VT.getVectorNumElements()) {
- unsigned EltMaskSize = Log2_64(NumElements);
- for (unsigned i = 0; i < NumElements; ++i) {
- Constant *COp = C->getAggregateElement(i);
- if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp))) {
- ShuffleMask.clear();
- return;
- }
- if (isa<UndefValue>(COp))
- ShuffleMask.push_back(SM_SentinelUndef);
- else {
- APInt Element = cast<ConstantInt>(COp)->getValue();
- Element = Element.getLoBits(EltMaskSize);
- ShuffleMask.push_back(Element.getZExtValue());
- }
- }
- }
+ unsigned MaskTySize = MaskTy->getPrimitiveSizeInBits();
+ (void)MaskTySize;
+ assert((MaskTySize == 128 || MaskTySize == 256 || MaskTySize == 512) &&
+ "Unexpected vector size.");
+ assert((ElSize == 8 || ElSize == 16 || ElSize == 32 || ElSize == 64) &&
+ "Unexpected vector element size.");
+
+ // The shuffle mask requires elements the same size as the target.
+ SmallBitVector UndefElts;
+ SmallVector<uint64_t, 8> RawMask;
+ if (!extractConstantMask(C, ElSize, UndefElts, RawMask))
return;
+
+ unsigned NumElts = RawMask.size();
+
+ for (unsigned i = 0; i != NumElts; ++i) {
+ if (UndefElts[i]) {
+ ShuffleMask.push_back(SM_SentinelUndef);
+ continue;
+ }
+ int Index = RawMask[i] & (NumElts - 1);
+ ShuffleMask.push_back(Index);
}
- // Scalar value; just broadcast it
- if (!isa<ConstantInt>(C))
- return;
- uint64_t Element = cast<ConstantInt>(C)->getZExtValue();
- int NumElements = VT.getVectorNumElements();
- Element &= (1 << NumElements) - 1;
- for (int i = 0; i < NumElements; ++i)
- ShuffleMask.push_back(Element);
}
-void DecodeVPERMV3Mask(const Constant *C, MVT VT,
+void DecodeVPERMV3Mask(const Constant *C, unsigned ElSize,
SmallVectorImpl<int> &ShuffleMask) {
Type *MaskTy = C->getType();
- unsigned NumElements = MaskTy->getVectorNumElements();
- if (NumElements == VT.getVectorNumElements()) {
- unsigned EltMaskSize = Log2_64(NumElements * 2);
- for (unsigned i = 0; i < NumElements; ++i) {
- Constant *COp = C->getAggregateElement(i);
- if (!COp) {
- ShuffleMask.clear();
- return;
- }
- if (isa<UndefValue>(COp))
- ShuffleMask.push_back(SM_SentinelUndef);
- else {
- APInt Element = cast<ConstantInt>(COp)->getValue();
- Element = Element.getLoBits(EltMaskSize);
- ShuffleMask.push_back(Element.getZExtValue());
- }
+ unsigned MaskTySize = MaskTy->getPrimitiveSizeInBits();
+ (void)MaskTySize;
+ assert((MaskTySize == 128 || MaskTySize == 256 || MaskTySize == 512) &&
+ "Unexpected vector size.");
+ assert((ElSize == 8 || ElSize == 16 || ElSize == 32 || ElSize == 64) &&
+ "Unexpected vector element size.");
+
+ // The shuffle mask requires elements the same size as the target.
+ SmallBitVector UndefElts;
+ SmallVector<uint64_t, 8> RawMask;
+ if (!extractConstantMask(C, ElSize, UndefElts, RawMask))
+ return;
+
+ unsigned NumElts = RawMask.size();
+
+ for (unsigned i = 0; i != NumElts; ++i) {
+ if (UndefElts[i]) {
+ ShuffleMask.push_back(SM_SentinelUndef);
+ continue;
}
+ int Index = RawMask[i] & (NumElts * 2 - 1);
+ ShuffleMask.push_back(Index);
}
}
} // llvm namespace
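// A minimal standalone sketch (helper name hypothetical) of the VPERMV/VPERMT2
// index masking above: single-source VPERMV indices wrap modulo NumElts, while
// two-source VPERMT2 indices wrap modulo 2 * NumElts, so values in
// NumElts..2*NumElts-1 select from the second source operand.
static int demoVPERMT2Index(uint64_t RawElt, unsigned NumElts) {
  return (int)(RawElt & (NumElts * 2 - 1));  // 0..2*NumElts-1 spans both srcs
}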
diff --git a/contrib/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.h b/contrib/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.h
index d2565b8..b703cbb 100644
--- a/contrib/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.h
+++ b/contrib/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.h
@@ -40,11 +40,11 @@ void DecodeVPERMIL2PMask(const Constant *C, unsigned MatchImm, unsigned ElSize,
void DecodeVPPERMMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask);
/// Decode a VPERM W/D/Q/PS/PD mask from an IR-level vector constant.
-void DecodeVPERMVMask(const Constant *C, MVT VT,
+void DecodeVPERMVMask(const Constant *C, unsigned ElSize,
SmallVectorImpl<int> &ShuffleMask);
/// Decode a VPERMT2 W/D/Q/PS/PD mask from an IR-level vector constant.
-void DecodeVPERMV3Mask(const Constant *C, MVT VT,
+void DecodeVPERMV3Mask(const Constant *C, unsigned ElSize,
SmallVectorImpl<int> &ShuffleMask);
} // llvm namespace
diff --git a/contrib/llvm/lib/Target/X86/X86Subtarget.cpp b/contrib/llvm/lib/Target/X86/X86Subtarget.cpp
index 8f77682..586bb7b 100644
--- a/contrib/llvm/lib/Target/X86/X86Subtarget.cpp
+++ b/contrib/llvm/lib/Target/X86/X86Subtarget.cpp
@@ -92,6 +92,10 @@ unsigned char X86Subtarget::classifyGlobalReference(const GlobalValue *GV,
if (TM.getCodeModel() == CodeModel::Large)
return X86II::MO_NO_FLAG;
+ // Absolute symbols can be referenced directly.
+ if (GV && GV->isAbsoluteSymbolRef())
+ return X86II::MO_NO_FLAG;
+
if (TM.shouldAssumeDSOLocal(M, GV))
return classifyLocalReference(GV);
@@ -275,6 +279,7 @@ void X86Subtarget::initializeEnvironment() {
HasMWAITX = false;
HasMPX = false;
IsBTMemSlow = false;
+ IsPMULLDSlow = false;
IsSHLDSlow = false;
IsUAMem16Slow = false;
IsUAMem32Slow = false;
@@ -282,6 +287,9 @@ void X86Subtarget::initializeEnvironment() {
HasCmpxchg16b = false;
UseLeaForSP = false;
HasFastPartialYMMWrite = false;
+ HasFastScalarFSQRT = false;
+ HasFastVectorFSQRT = false;
+ HasFastLZCNT = false;
HasSlowDivide32 = false;
HasSlowDivide64 = false;
PadShortFunctions = false;
@@ -328,6 +336,26 @@ X86Subtarget::X86Subtarget(const Triple &TT, StringRef CPU, StringRef FS,
setPICStyle(PICStyles::GOT);
}
+const CallLowering *X86Subtarget::getCallLowering() const {
+ assert(GISel && "Access to GlobalISel APIs not set");
+ return GISel->getCallLowering();
+}
+
+const InstructionSelector *X86Subtarget::getInstructionSelector() const {
+ assert(GISel && "Access to GlobalISel APIs not set");
+ return GISel->getInstructionSelector();
+}
+
+const LegalizerInfo *X86Subtarget::getLegalizerInfo() const {
+ assert(GISel && "Access to GlobalISel APIs not set");
+ return GISel->getLegalizerInfo();
+}
+
+const RegisterBankInfo *X86Subtarget::getRegBankInfo() const {
+ assert(GISel && "Access to GlobalISel APIs not set");
+ return GISel->getRegBankInfo();
+}
+
bool X86Subtarget::enableEarlyIfConversion() const {
return hasCMov() && X86EarlyIfConv;
}
diff --git a/contrib/llvm/lib/Target/X86/X86Subtarget.h b/contrib/llvm/lib/Target/X86/X86Subtarget.h
index a274b79..d80dc4a 100644
--- a/contrib/llvm/lib/Target/X86/X86Subtarget.h
+++ b/contrib/llvm/lib/Target/X86/X86Subtarget.h
@@ -19,6 +19,7 @@
#include "X86InstrInfo.h"
#include "X86SelectionDAGInfo.h"
#include "llvm/ADT/Triple.h"
+#include "llvm/CodeGen/GlobalISel/GISelAccessor.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/Target/TargetSubtargetInfo.h"
#include <string>
@@ -177,6 +178,10 @@ protected:
/// True if SHLD instructions are slow.
bool IsSHLDSlow;
+ /// True if the PMULLD instruction is slow compared to PMULLW/PMULHW and
+ /// PMULUDQ.
+ bool IsPMULLDSlow;
+
/// True if unaligned memory accesses of 16-bytes are slow.
bool IsUAMem16Slow;
@@ -199,14 +204,25 @@ protected:
/// of a YMM register without clearing the upper part.
bool HasFastPartialYMMWrite;
+ /// True if hardware SQRTSS instruction is at least as fast (latency) as
+ /// RSQRTSS followed by a Newton-Raphson iteration.
+ bool HasFastScalarFSQRT;
+
+ /// True if hardware SQRTPS/VSQRTPS instructions are at least as fast
+ /// (throughput) as RSQRTPS/VRSQRTPS followed by a Newton-Raphson iteration.
+ bool HasFastVectorFSQRT;
+
/// True if 8-bit divisions are significantly faster than
/// 32-bit divisions and should be used when possible.
bool HasSlowDivide32;
- /// True if 16-bit divides are significantly faster than
+ /// True if 32-bit divides are significantly faster than
/// 64-bit divisions and should be used when possible.
bool HasSlowDivide64;
+ /// True if LZCNT instruction is fast.
+ bool HasFastLZCNT;
+
/// True if the short functions should be padded to prevent
/// a stall when returning too early.
bool PadShortFunctions;
@@ -287,6 +303,10 @@ protected:
/// Instruction itineraries for scheduling
InstrItineraryData InstrItins;
+ /// Access point to the GlobalISel-related APIs.
+ /// This is used to avoid ifndefs spreading around while GISel is
+ /// an optional library.
+ std::unique_ptr<GISelAccessor> GISel;
private:
/// Override the stack alignment.
@@ -315,6 +335,9 @@ public:
X86Subtarget(const Triple &TT, StringRef CPU, StringRef FS,
const X86TargetMachine &TM, unsigned StackAlignOverride);
+ /// This object will take ownership of \p GISelAccessor.
+ void setGISelAccessor(GISelAccessor &GISel) { this->GISel.reset(&GISel); }
+
const X86TargetLowering *getTargetLowering() const override {
return &TLInfo;
}
@@ -342,6 +365,11 @@ public:
/// subtarget options. Definition of function is auto generated by tblgen.
void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
+ /// Methods used by Global ISel
+ const CallLowering *getCallLowering() const override;
+ const InstructionSelector *getInstructionSelector() const override;
+ const LegalizerInfo *getLegalizerInfo() const override;
+ const RegisterBankInfo *getRegBankInfo() const override;
private:
/// Initialize the full set of dependencies so we can use an initializer
/// list for X86Subtarget.
@@ -428,12 +456,16 @@ public:
bool hasMWAITX() const { return HasMWAITX; }
bool isBTMemSlow() const { return IsBTMemSlow; }
bool isSHLDSlow() const { return IsSHLDSlow; }
+ bool isPMULLDSlow() const { return IsPMULLDSlow; }
bool isUnalignedMem16Slow() const { return IsUAMem16Slow; }
bool isUnalignedMem32Slow() const { return IsUAMem32Slow; }
bool hasSSEUnalignedMem() const { return HasSSEUnalignedMem; }
bool hasCmpxchg16b() const { return HasCmpxchg16b; }
bool useLeaForSP() const { return UseLeaForSP; }
bool hasFastPartialYMMWrite() const { return HasFastPartialYMMWrite; }
+ bool hasFastScalarFSQRT() const { return HasFastScalarFSQRT; }
+ bool hasFastVectorFSQRT() const { return HasFastVectorFSQRT; }
+ bool hasFastLZCNT() const { return HasFastLZCNT; }
bool hasSlowDivide32() const { return HasSlowDivide32; }
bool hasSlowDivide64() const { return HasSlowDivide64; }
bool padShortFunctions() const { return PadShortFunctions; }
@@ -450,6 +482,8 @@ public:
bool hasPKU() const { return HasPKU; }
bool hasMPX() const { return HasMPX; }
+ bool isXRaySupported() const override { return is64Bit(); }
+
bool isAtom() const { return X86ProcFamily == IntelAtom; }
bool isSLM() const { return X86ProcFamily == IntelSLM; }
bool useSoftFloat() const { return UseSoftFloat; }
@@ -465,7 +499,7 @@ public:
bool isTargetFreeBSD() const { return TargetTriple.isOSFreeBSD(); }
bool isTargetDragonFly() const { return TargetTriple.isOSDragonFly(); }
bool isTargetSolaris() const { return TargetTriple.isOSSolaris(); }
- bool isTargetPS4() const { return TargetTriple.isPS4(); }
+ bool isTargetPS4() const { return TargetTriple.isPS4CPU(); }
bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); }
bool isTargetCOFF() const { return TargetTriple.isOSBinFormatCOFF(); }
diff --git a/contrib/llvm/lib/Target/X86/X86TargetMachine.cpp b/contrib/llvm/lib/Target/X86/X86TargetMachine.cpp
index 50c9c25..aa5cfc6 100644
--- a/contrib/llvm/lib/Target/X86/X86TargetMachine.cpp
+++ b/contrib/llvm/lib/Target/X86/X86TargetMachine.cpp
@@ -13,8 +13,12 @@
#include "X86TargetMachine.h"
#include "X86.h"
+#include "X86CallLowering.h"
#include "X86TargetObjectFile.h"
#include "X86TargetTransformInfo.h"
+#include "llvm/CodeGen/GlobalISel/GISelAccessor.h"
+#include "llvm/CodeGen/GlobalISel/IRTranslator.h"
+#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Function.h"
@@ -35,12 +39,14 @@ void initializeWinEHStatePassPass(PassRegistry &);
extern "C" void LLVMInitializeX86Target() {
// Register the target.
- RegisterTargetMachine<X86TargetMachine> X(TheX86_32Target);
- RegisterTargetMachine<X86TargetMachine> Y(TheX86_64Target);
+ RegisterTargetMachine<X86TargetMachine> X(getTheX86_32Target());
+ RegisterTargetMachine<X86TargetMachine> Y(getTheX86_64Target());
PassRegistry &PR = *PassRegistry::getPassRegistry();
+ initializeGlobalISel(PR);
initializeWinEHStatePassPass(PR);
initializeFixupBWInstPassPass(PR);
+ initializeEvexToVexInstPassPass(PR);
}
static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
@@ -50,8 +56,12 @@ static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
return make_unique<TargetLoweringObjectFileMachO>();
}
+ if (TT.isOSFreeBSD())
+ return make_unique<X86FreeBSDTargetObjectFile>();
if (TT.isOSLinux() || TT.isOSNaCl())
return make_unique<X86LinuxNaClTargetObjectFile>();
+ if (TT.isOSFuchsia())
+ return make_unique<X86FuchsiaTargetObjectFile>();
if (TT.isOSBinFormatELF())
return make_unique<X86ELFTargetObjectFile>();
if (TT.isKnownWindowsMSVCEnvironment() || TT.isWindowsCoreCLREnvironment())
@@ -151,32 +161,47 @@ X86TargetMachine::X86TargetMachine(const Target &T, const Triple &TT,
CodeModel::Model CM, CodeGenOpt::Level OL)
: LLVMTargetMachine(T, computeDataLayout(TT), TT, CPU, FS, Options,
getEffectiveRelocModel(TT, RM), CM, OL),
- TLOF(createTLOF(getTargetTriple())),
- Subtarget(TT, CPU, FS, *this, Options.StackAlignmentOverride) {
+ TLOF(createTLOF(getTargetTriple())) {
// Windows stack unwinder gets confused when execution flow "falls through"
// after a call to 'noreturn' function.
// To prevent that, we emit a trap for 'unreachable' IR instructions.
// (which on X86, happens to be the 'ud2' instruction)
// On PS4, the "return address" of a 'noreturn' call must still be within
// the calling function, and TrapUnreachable is an easy way to get that.
- if (Subtarget.isTargetWin64() || Subtarget.isTargetPS4())
+ // The check here for 64-bit Windows is a bit icky, but as we're unlikely
+ // to ever want to mix 32- and 64-bit Windows code in a single module,
+ // this should be fine.
+ if ((TT.isOSWindows() && TT.getArch() == Triple::x86_64) || TT.isPS4())
this->Options.TrapUnreachable = true;
- // By default (and when -ffast-math is on), enable estimate codegen for
- // everything except scalar division. By default, use 1 refinement step for
- // all operations. Defaults may be overridden by using command-line options.
- // Scalar division estimates are disabled because they break too much
- // real-world code. These defaults match GCC behavior.
- this->Options.Reciprocals.setDefaults("sqrtf", true, 1);
- this->Options.Reciprocals.setDefaults("divf", false, 1);
- this->Options.Reciprocals.setDefaults("vec-sqrtf", true, 1);
- this->Options.Reciprocals.setDefaults("vec-divf", true, 1);
-
initAsmInfo();
}
X86TargetMachine::~X86TargetMachine() {}
+#ifdef LLVM_BUILD_GLOBAL_ISEL
+namespace {
+struct X86GISelActualAccessor : public GISelAccessor {
+ std::unique_ptr<CallLowering> CL;
+ X86GISelActualAccessor(CallLowering* CL): CL(CL) {}
+ const CallLowering *getCallLowering() const override {
+ return CL.get();
+ }
+ const InstructionSelector *getInstructionSelector() const override {
+ //TODO: Implement
+ return nullptr;
+ }
+ const LegalizerInfo *getLegalizerInfo() const override {
+ //TODO: Implement
+ return nullptr;
+ }
+ const RegisterBankInfo *getRegBankInfo() const override {
+ //TODO: Implement
+ return nullptr;
+ }
+};
+} // End anonymous namespace.
+#endif
const X86Subtarget *
X86TargetMachine::getSubtargetImpl(const Function &F) const {
Attribute CPUAttr = F.getFnAttribute("target-cpu");
@@ -216,6 +241,13 @@ X86TargetMachine::getSubtargetImpl(const Function &F) const {
resetTargetOptions(F);
I = llvm::make_unique<X86Subtarget>(TargetTriple, CPU, FS, *this,
Options.StackAlignmentOverride);
+#ifndef LLVM_BUILD_GLOBAL_ISEL
+ GISelAccessor *GISel = new GISelAccessor();
+#else
+ X86GISelActualAccessor *GISel = new X86GISelActualAccessor(
+ new X86CallLowering(*I->getTargetLowering()));
+#endif
+ I->setGISelAccessor(*GISel);
}
return I.get();
}
@@ -254,9 +286,22 @@ public:
return getTM<X86TargetMachine>();
}
+ ScheduleDAGInstrs *
+ createMachineScheduler(MachineSchedContext *C) const override {
+ ScheduleDAGMILive *DAG = createGenericSchedLive(C);
+ DAG->addMutation(createMacroFusionDAGMutation(DAG->TII));
+ return DAG;
+ }
+
void addIRPasses() override;
bool addInstSelector() override;
- bool addILPOpts() override;
+#ifdef LLVM_BUILD_GLOBAL_ISEL
+ bool addIRTranslator() override;
+ bool addLegalizeMachineIR() override;
+ bool addRegBankSelect() override;
+ bool addGlobalInstructionSelect() override;
+#endif
+ bool addILPOpts() override;
bool addPreISel() override;
void addPreRegAlloc() override;
void addPostRegAlloc() override;
@@ -273,6 +318,9 @@ void X86PassConfig::addIRPasses() {
addPass(createAtomicExpandPass(&getX86TargetMachine()));
TargetPassConfig::addIRPasses();
+
+ if (TM->getOptLevel() != CodeGenOpt::None)
+ addPass(createInterleavedAccessPass(TM));
}
bool X86PassConfig::addInstSelector() {
@@ -288,6 +336,28 @@ bool X86PassConfig::addInstSelector() {
return false;
}
+#ifdef LLVM_BUILD_GLOBAL_ISEL
+bool X86PassConfig::addIRTranslator() {
+ addPass(new IRTranslator());
+ return false;
+}
+
+bool X86PassConfig::addLegalizeMachineIR() {
+ //TODO: Implement
+ return false;
+}
+
+bool X86PassConfig::addRegBankSelect() {
+ //TODO: Implement
+ return false;
+}
+
+bool X86PassConfig::addGlobalInstructionSelect() {
+ //TODO: Implement
+ return false;
+}
+#endif
+
bool X86PassConfig::addILPOpts() {
addPass(&EarlyIfConverterID);
if (EnableMachineCombinerPass)
@@ -321,7 +391,7 @@ void X86PassConfig::addPreSched2() { addPass(createX86ExpandPseudoPass()); }
void X86PassConfig::addPreEmitPass() {
if (getOptLevel() != CodeGenOpt::None)
- addPass(createExecutionDependencyFixPass(&X86::VR128RegClass));
+ addPass(createExecutionDependencyFixPass(&X86::VR128XRegClass));
if (UseVZeroUpper)
addPass(createX86IssueVZeroUpperPass());
@@ -330,5 +400,6 @@ void X86PassConfig::addPreEmitPass() {
addPass(createX86FixupBWInsts());
addPass(createX86PadShortFunctions());
addPass(createX86FixupLEAs());
+ addPass(createX86EvexToVexInsts());
}
}
diff --git a/contrib/llvm/lib/Target/X86/X86TargetMachine.h b/contrib/llvm/lib/Target/X86/X86TargetMachine.h
index 4734a44..d756d07 100644
--- a/contrib/llvm/lib/Target/X86/X86TargetMachine.h
+++ b/contrib/llvm/lib/Target/X86/X86TargetMachine.h
@@ -24,8 +24,6 @@ class StringRef;
class X86TargetMachine final : public LLVMTargetMachine {
std::unique_ptr<TargetLoweringObjectFile> TLOF;
- X86Subtarget Subtarget;
-
mutable StringMap<std::unique_ptr<X86Subtarget>> SubtargetMap;
public:
diff --git a/contrib/llvm/lib/Target/X86/X86TargetObjectFile.cpp b/contrib/llvm/lib/Target/X86/X86TargetObjectFile.cpp
index d664cff..7f70829 100644
--- a/contrib/llvm/lib/Target/X86/X86TargetObjectFile.cpp
+++ b/contrib/llvm/lib/Target/X86/X86TargetObjectFile.cpp
@@ -24,14 +24,13 @@ using namespace llvm;
using namespace dwarf;
const MCExpr *X86_64MachoTargetObjectFile::getTTypeGlobalReference(
- const GlobalValue *GV, unsigned Encoding, Mangler &Mang,
- const TargetMachine &TM, MachineModuleInfo *MMI,
- MCStreamer &Streamer) const {
+ const GlobalValue *GV, unsigned Encoding, const TargetMachine &TM,
+ MachineModuleInfo *MMI, MCStreamer &Streamer) const {
// On Darwin/X86-64, we can reference dwarf symbols with foo@GOTPCREL+4, which
// is an indirect pc-relative reference.
if ((Encoding & DW_EH_PE_indirect) && (Encoding & DW_EH_PE_pcrel)) {
- const MCSymbol *Sym = TM.getSymbol(GV, Mang);
+ const MCSymbol *Sym = TM.getSymbol(GV);
const MCExpr *Res =
MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_GOTPCREL, getContext());
const MCExpr *Four = MCConstantExpr::create(4, getContext());
@@ -39,13 +38,13 @@ const MCExpr *X86_64MachoTargetObjectFile::getTTypeGlobalReference(
}
return TargetLoweringObjectFileMachO::getTTypeGlobalReference(
- GV, Encoding, Mang, TM, MMI, Streamer);
+ GV, Encoding, TM, MMI, Streamer);
}
MCSymbol *X86_64MachoTargetObjectFile::getCFIPersonalitySymbol(
- const GlobalValue *GV, Mangler &Mang, const TargetMachine &TM,
+ const GlobalValue *GV, const TargetMachine &TM,
MachineModuleInfo *MMI) const {
- return TM.getSymbol(GV, Mang);
+ return TM.getSymbol(GV);
}
const MCExpr *X86_64MachoTargetObjectFile::getIndirectSymViaGOTPCRel(
@@ -67,6 +66,20 @@ const MCExpr *X86ELFTargetObjectFile::getDebugThreadLocalSymbol(
}
void
+X86FreeBSDTargetObjectFile::Initialize(MCContext &Ctx,
+ const TargetMachine &TM) {
+ TargetLoweringObjectFileELF::Initialize(Ctx, TM);
+ InitializeELF(TM.Options.UseInitArray);
+}
+
+void
+X86FuchsiaTargetObjectFile::Initialize(MCContext &Ctx,
+ const TargetMachine &TM) {
+ TargetLoweringObjectFileELF::Initialize(Ctx, TM);
+ InitializeELF(TM.Options.UseInitArray);
+}
+
+void
X86LinuxNaClTargetObjectFile::Initialize(MCContext &Ctx,
const TargetMachine &TM) {
TargetLoweringObjectFileELF::Initialize(Ctx, TM);
@@ -74,7 +87,7 @@ X86LinuxNaClTargetObjectFile::Initialize(MCContext &Ctx,
}
const MCExpr *X86WindowsTargetObjectFile::lowerRelativeReference(
- const GlobalValue *LHS, const GlobalValue *RHS, Mangler &Mang,
+ const GlobalValue *LHS, const GlobalValue *RHS,
const TargetMachine &TM) const {
// Our symbols should exist in address space zero, cowardly no-op if
// otherwise.
@@ -95,8 +108,9 @@ const MCExpr *X86WindowsTargetObjectFile::lowerRelativeReference(
cast<GlobalVariable>(RHS)->hasInitializer() || RHS->hasSection())
return nullptr;
- return MCSymbolRefExpr::create(
- TM.getSymbol(LHS, Mang), MCSymbolRefExpr::VK_COFF_IMGREL32, getContext());
+ return MCSymbolRefExpr::create(TM.getSymbol(LHS),
+ MCSymbolRefExpr::VK_COFF_IMGREL32,
+ getContext());
}
static std::string APIntToHexString(const APInt &AI) {
diff --git a/contrib/llvm/lib/Target/X86/X86TargetObjectFile.h b/contrib/llvm/lib/Target/X86/X86TargetObjectFile.h
index 2e703f1..39d2e84 100644
--- a/contrib/llvm/lib/Target/X86/X86TargetObjectFile.h
+++ b/contrib/llvm/lib/Target/X86/X86TargetObjectFile.h
@@ -19,15 +19,15 @@ namespace llvm {
/// x86-64.
class X86_64MachoTargetObjectFile : public TargetLoweringObjectFileMachO {
public:
- const MCExpr *
- getTTypeGlobalReference(const GlobalValue *GV, unsigned Encoding,
- Mangler &Mang, const TargetMachine &TM,
- MachineModuleInfo *MMI,
- MCStreamer &Streamer) const override;
+ const MCExpr *getTTypeGlobalReference(const GlobalValue *GV,
+ unsigned Encoding,
+ const TargetMachine &TM,
+ MachineModuleInfo *MMI,
+ MCStreamer &Streamer) const override;
// getCFIPersonalitySymbol - The symbol that gets passed to
// .cfi_personality.
- MCSymbol *getCFIPersonalitySymbol(const GlobalValue *GV, Mangler &Mang,
+ MCSymbol *getCFIPersonalitySymbol(const GlobalValue *GV,
const TargetMachine &TM,
MachineModuleInfo *MMI) const override;
@@ -49,6 +49,17 @@ namespace llvm {
const MCExpr *getDebugThreadLocalSymbol(const MCSymbol *Sym) const override;
};
+ /// X86FreeBSDTargetObjectFile - This implementation is used for FreeBSD
+ /// on x86 and x86-64.
+ class X86FreeBSDTargetObjectFile : public X86ELFTargetObjectFile {
+ void Initialize(MCContext &Ctx, const TargetMachine &TM) override;
+ };
+
+ /// \brief This implementation is used for Fuchsia on x86-64.
+ class X86FuchsiaTargetObjectFile : public X86ELFTargetObjectFile {
+ void Initialize(MCContext &Ctx, const TargetMachine &TM) override;
+ };
+
/// X86LinuxNaClTargetObjectFile - This implementation is used for linux and
/// Native Client on x86 and x86-64.
class X86LinuxNaClTargetObjectFile : public X86ELFTargetObjectFile {
@@ -59,7 +70,6 @@ namespace llvm {
class X86WindowsTargetObjectFile : public TargetLoweringObjectFileCOFF {
const MCExpr *
lowerRelativeReference(const GlobalValue *LHS, const GlobalValue *RHS,
- Mangler &Mang,
const TargetMachine &TM) const override;
/// \brief Given a mergeable constant with the specified size and relocation
diff --git a/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index f44a8c6..5715d82 100644
--- a/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -13,6 +13,31 @@
/// independent and default TTI implementations handle the rest.
///
//===----------------------------------------------------------------------===//
+/// A note about the cost-model numbers used below: they correspond to some
+/// "generic" X86 CPU rather than a concrete CPU model. Usually the numbers
+/// correspond to the CPU where the feature first appeared. For example, if we
+/// check Subtarget.hasSSE42() in the lookups below, the cost is based on
+/// Nehalem, as that was the first CPU to support that feature level and thus
+/// most likely has the worst-case cost.
+/// Some examples of other technologies/CPUs:
+/// SSE 3 - Pentium4 / Athlon64
+/// SSE 4.1 - Penryn
+/// SSE 4.2 - Nehalem
+/// AVX - Sandy Bridge
+/// AVX2 - Haswell
+/// AVX-512 - Xeon Phi / Skylake
+/// Some examples of target-dependent instruction costs (latency):
+/// divss sqrtss rsqrtss
+/// AMD K7 11-16 19 3
+/// Piledriver 9-24 13-15 5
+/// Jaguar 14 16 2
+/// Pentium II,III 18 30 2
+/// Nehalem 7-14 7-18 3
+/// Haswell 10-13 11 5
+/// TODO: Develop and implement a target-dependent cost model and specialize
+/// the cost numbers for different cost-model targets such as throughput,
+/// code size, latency and uop count.
+//===----------------------------------------------------------------------===//
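// A minimal usage sketch (helper name hypothetical): every cost table below
// is consumed the same way -- the type-legalization split factor LT.first is
// multiplied by the per-operation table entry. For example, with plain AVX an
// FDIV on v8f64 legalizes to two v4f64 halves (LT.first == 2) and the AVX1
// table prices v4f64 FDIV at 44, so the reported cost is 2 * 44 == 88.
static int demoCombineCost(int SplitFactor, int EntryCost) {
  return SplitFactor * EntryCost;  // what the CostTableLookup callers return
}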
#include "X86TargetTransformInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
@@ -55,9 +80,12 @@ unsigned X86TTIImpl::getNumberOfRegisters(bool Vector) {
unsigned X86TTIImpl::getRegisterBitWidth(bool Vector) {
if (Vector) {
- if (ST->hasAVX512()) return 512;
- if (ST->hasAVX()) return 256;
- if (ST->hasSSE1()) return 128;
+ if (ST->hasAVX512())
+ return 512;
+ if (ST->hasAVX())
+ return 256;
+ if (ST->hasSSE1())
+ return 128;
return 0;
}
@@ -86,15 +114,62 @@ unsigned X86TTIImpl::getMaxInterleaveFactor(unsigned VF) {
}
int X86TTIImpl::getArithmeticInstrCost(
- unsigned Opcode, Type *Ty, TTI::OperandValueKind Op1Info,
- TTI::OperandValueKind Op2Info, TTI::OperandValueProperties Opd1PropInfo,
- TTI::OperandValueProperties Opd2PropInfo) {
+ unsigned Opcode, Type *Ty,
+ TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info,
+ TTI::OperandValueProperties Opd1PropInfo,
+ TTI::OperandValueProperties Opd2PropInfo,
+ ArrayRef<const Value *> Args) {
// Legalize the type.
std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
int ISD = TLI->InstructionOpcodeToISD(Opcode);
assert(ISD && "Invalid opcode");
+ static const CostTblEntry SLMCostTable[] = {
+ { ISD::MUL, MVT::v4i32, 11 }, // pmulld
+ { ISD::MUL, MVT::v8i16, 2 }, // pmullw
+ { ISD::MUL, MVT::v16i8, 14 }, // extend/pmullw/trunc sequence.
+ { ISD::FMUL, MVT::f64, 2 }, // mulsd
+ { ISD::FMUL, MVT::v2f64, 4 }, // mulpd
+ { ISD::FMUL, MVT::v4f32, 2 }, // mulps
+ { ISD::FDIV, MVT::f32, 17 }, // divss
+ { ISD::FDIV, MVT::v4f32, 39 }, // divps
+ { ISD::FDIV, MVT::f64, 32 }, // divsd
+ { ISD::FDIV, MVT::v2f64, 69 }, // divpd
+ { ISD::FADD, MVT::v2f64, 2 }, // addpd
+ { ISD::FSUB, MVT::v2f64, 2 }, // subpd
+ // v2i64/v4i64 mul is custom lowered as a series of long
+ // multiplies(3), shifts(3) and adds(2).
+ // SLM pmuludq throughput is 2.
+ { ISD::MUL, MVT::v2i64, 11 },
+ };
+
+ if (ST->isSLM()) {
+ if (Args.size() == 2 && ISD == ISD::MUL && LT.second == MVT::v4i32) {
+ // Check if the operands can be shrunk into a smaller datatype.
+ bool Op1Signed = false;
+ unsigned Op1MinSize = BaseT::minRequiredElementSize(Args[0], Op1Signed);
+ bool Op2Signed = false;
+ unsigned Op2MinSize = BaseT::minRequiredElementSize(Args[1], Op2Signed);
+
+ bool signedMode = Op1Signed | Op2Signed;
+ unsigned OpMinSize = std::max(Op1MinSize, Op2MinSize);
+
+ if (OpMinSize <= 7)
+ return LT.first * 3; // pmullw/sext
+ if (!signedMode && OpMinSize <= 8)
+ return LT.first * 3; // pmullw/zext
+ if (OpMinSize <= 15)
+ return LT.first * 5; // pmullw/pmulhw/pshuf
+ if (!signedMode && OpMinSize <= 16)
+ return LT.first * 5; // pmullw/pmulhw/pshuf
+ }
+ if (const auto *Entry = CostTableLookup(SLMCostTable, ISD,
+ LT.second)) {
+ return LT.first * Entry->Cost;
+ }
+ }
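// A minimal standalone sketch (helper name hypothetical, values taken from
// the SLM block above) of the operand-shrinking heuristic for v4i32
// multiplies: when both operands are known to fit in 16 bits, the cheaper
// pmullw/pmulhw lowering is assumed instead of the 11-cycle pmulld entry.
static int demoSLMv4i32MulCost(unsigned OpMinSize, bool SignedMode,
                               int SplitFactor) {
  if (OpMinSize <= 7 || (!SignedMode && OpMinSize <= 8))
    return SplitFactor * 3;   // pmullw plus a sign/zero extend
  if (OpMinSize <= 15 || (!SignedMode && OpMinSize <= 16))
    return SplitFactor * 5;   // pmullw/pmulhw/pshuf
  return SplitFactor * 11;    // falls through to the pmulld table entry
}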
+
if (ISD == ISD::SDIV &&
Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) {
@@ -115,7 +190,39 @@ int X86TTIImpl::getArithmeticInstrCost(
return Cost;
}
+ static const CostTblEntry AVX512BWUniformConstCostTable[] = {
+ { ISD::SHL, MVT::v64i8, 2 }, // psllw + pand.
+ { ISD::SRL, MVT::v64i8, 2 }, // psrlw + pand.
+ { ISD::SRA, MVT::v64i8, 4 }, // psrlw, pand, pxor, psubb.
+
+ { ISD::SDIV, MVT::v32i16, 6 }, // vpmulhw sequence
+ { ISD::UDIV, MVT::v32i16, 6 }, // vpmulhuw sequence
+ };
+
+ if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
+ ST->hasBWI()) {
+ if (const auto *Entry = CostTableLookup(AVX512BWUniformConstCostTable, ISD,
+ LT.second))
+ return LT.first * Entry->Cost;
+ }
+
+ static const CostTblEntry AVX512UniformConstCostTable[] = {
+ { ISD::SDIV, MVT::v16i32, 15 }, // vpmuldq sequence
+ { ISD::UDIV, MVT::v16i32, 15 }, // vpmuludq sequence
+ };
+
+ if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
+ ST->hasAVX512()) {
+ if (const auto *Entry = CostTableLookup(AVX512UniformConstCostTable, ISD,
+ LT.second))
+ return LT.first * Entry->Cost;
+ }
+
static const CostTblEntry AVX2UniformConstCostTable[] = {
+ { ISD::SHL, MVT::v32i8, 2 }, // psllw + pand.
+ { ISD::SRL, MVT::v32i8, 2 }, // psrlw + pand.
+ { ISD::SRA, MVT::v32i8, 4 }, // psrlw, pand, pxor, psubb.
+
{ ISD::SRA, MVT::v4i64, 4 }, // 2 x psrad + shuffle.
{ ISD::SDIV, MVT::v16i16, 6 }, // vpmulhw sequence
@@ -131,21 +238,136 @@ int X86TTIImpl::getArithmeticInstrCost(
return LT.first * Entry->Cost;
}
+ static const CostTblEntry SSE2UniformConstCostTable[] = {
+ { ISD::SHL, MVT::v16i8, 2 }, // psllw + pand.
+ { ISD::SRL, MVT::v16i8, 2 }, // psrlw + pand.
+ { ISD::SRA, MVT::v16i8, 4 }, // psrlw, pand, pxor, psubb.
+
+ { ISD::SHL, MVT::v32i8, 4 }, // 2*(psllw + pand).
+ { ISD::SRL, MVT::v32i8, 4 }, // 2*(psrlw + pand).
+ { ISD::SRA, MVT::v32i8, 8 }, // 2*(psrlw, pand, pxor, psubb).
+
+ { ISD::SDIV, MVT::v16i16, 12 }, // pmulhw sequence
+ { ISD::SDIV, MVT::v8i16, 6 }, // pmulhw sequence
+ { ISD::UDIV, MVT::v16i16, 12 }, // pmulhuw sequence
+ { ISD::UDIV, MVT::v8i16, 6 }, // pmulhuw sequence
+ { ISD::SDIV, MVT::v8i32, 38 }, // pmuludq sequence
+ { ISD::SDIV, MVT::v4i32, 19 }, // pmuludq sequence
+ { ISD::UDIV, MVT::v8i32, 30 }, // pmuludq sequence
+ { ISD::UDIV, MVT::v4i32, 15 }, // pmuludq sequence
+ };
+
+ if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
+ ST->hasSSE2()) {
+ // pmuldq sequence.
+ if (ISD == ISD::SDIV && LT.second == MVT::v8i32 && ST->hasAVX())
+ return LT.first * 30;
+ if (ISD == ISD::SDIV && LT.second == MVT::v4i32 && ST->hasSSE41())
+ return LT.first * 15;
+
+ if (const auto *Entry = CostTableLookup(SSE2UniformConstCostTable, ISD,
+ LT.second))
+ return LT.first * Entry->Cost;
+ }
+
+ static const CostTblEntry AVX2UniformCostTable[] = {
+ // Uniform splats are cheaper for the following instructions.
+ { ISD::SHL, MVT::v16i16, 1 }, // psllw.
+ { ISD::SRL, MVT::v16i16, 1 }, // psrlw.
+ { ISD::SRA, MVT::v16i16, 1 }, // psraw.
+ };
+
+ if (ST->hasAVX2() &&
+ ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
+ (Op2Info == TargetTransformInfo::OK_UniformValue))) {
+ if (const auto *Entry =
+ CostTableLookup(AVX2UniformCostTable, ISD, LT.second))
+ return LT.first * Entry->Cost;
+ }
+
+ static const CostTblEntry SSE2UniformCostTable[] = {
+ // Uniform splats are cheaper for the following instructions.
+ { ISD::SHL, MVT::v8i16, 1 }, // psllw.
+ { ISD::SHL, MVT::v4i32, 1 }, // pslld
+ { ISD::SHL, MVT::v2i64, 1 }, // psllq.
+
+ { ISD::SRL, MVT::v8i16, 1 }, // psrlw.
+ { ISD::SRL, MVT::v4i32, 1 }, // psrld.
+ { ISD::SRL, MVT::v2i64, 1 }, // psrlq.
+
+ { ISD::SRA, MVT::v8i16, 1 }, // psraw.
+ { ISD::SRA, MVT::v4i32, 1 }, // psrad.
+ };
+
+ if (ST->hasSSE2() &&
+ ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
+ (Op2Info == TargetTransformInfo::OK_UniformValue))) {
+ if (const auto *Entry =
+ CostTableLookup(SSE2UniformCostTable, ISD, LT.second))
+ return LT.first * Entry->Cost;
+ }
+
+ static const CostTblEntry AVX512DQCostTable[] = {
+ { ISD::MUL, MVT::v2i64, 1 },
+ { ISD::MUL, MVT::v4i64, 1 },
+ { ISD::MUL, MVT::v8i64, 1 }
+ };
+
+ // Look for AVX512DQ lowering tricks for custom cases.
+ if (ST->hasDQI())
+ if (const auto *Entry = CostTableLookup(AVX512DQCostTable, ISD, LT.second))
+ return LT.first * Entry->Cost;
+
+ static const CostTblEntry AVX512BWCostTable[] = {
+ { ISD::SHL, MVT::v32i16, 1 }, // vpsllvw
+ { ISD::SRL, MVT::v32i16, 1 }, // vpsrlvw
+ { ISD::SRA, MVT::v32i16, 1 }, // vpsravw
+
+ { ISD::SHL, MVT::v64i8, 11 }, // vpblendvb sequence.
+ { ISD::SRL, MVT::v64i8, 11 }, // vpblendvb sequence.
+ { ISD::SRA, MVT::v64i8, 24 }, // vpblendvb sequence.
+
+ { ISD::MUL, MVT::v64i8, 11 }, // extend/pmullw/trunc sequence.
+ { ISD::MUL, MVT::v32i8, 4 }, // extend/pmullw/trunc sequence.
+ { ISD::MUL, MVT::v16i8, 4 }, // extend/pmullw/trunc sequence.
+
+ // Vectorizing division is a bad idea. See the SSE2 table for more comments.
+ { ISD::SDIV, MVT::v64i8, 64*20 },
+ { ISD::SDIV, MVT::v32i16, 32*20 },
+ { ISD::UDIV, MVT::v64i8, 64*20 },
+ { ISD::UDIV, MVT::v32i16, 32*20 }
+ };
+
+ // Look for AVX512BW lowering tricks for custom cases.
+ if (ST->hasBWI())
+ if (const auto *Entry = CostTableLookup(AVX512BWCostTable, ISD, LT.second))
+ return LT.first * Entry->Cost;
+
static const CostTblEntry AVX512CostTable[] = {
- { ISD::SHL, MVT::v16i32, 1 },
- { ISD::SRL, MVT::v16i32, 1 },
- { ISD::SRA, MVT::v16i32, 1 },
- { ISD::SHL, MVT::v8i64, 1 },
- { ISD::SRL, MVT::v8i64, 1 },
- { ISD::SRA, MVT::v8i64, 1 },
+ { ISD::SHL, MVT::v16i32, 1 },
+ { ISD::SRL, MVT::v16i32, 1 },
+ { ISD::SRA, MVT::v16i32, 1 },
+ { ISD::SHL, MVT::v8i64, 1 },
+ { ISD::SRL, MVT::v8i64, 1 },
+ { ISD::SRA, MVT::v8i64, 1 },
+
+ { ISD::MUL, MVT::v32i8, 13 }, // extend/pmullw/trunc sequence.
+ { ISD::MUL, MVT::v16i8, 5 }, // extend/pmullw/trunc sequence.
+ { ISD::MUL, MVT::v16i32, 1 }, // pmulld
+ { ISD::MUL, MVT::v8i64, 8 }, // 3*pmuludq/3*shift/2*add
+
+ // Vectorizing division is a bad idea. See the SSE2 table for more comments.
+ { ISD::SDIV, MVT::v16i32, 16*20 },
+ { ISD::SDIV, MVT::v8i64, 8*20 },
+ { ISD::UDIV, MVT::v16i32, 16*20 },
+ { ISD::UDIV, MVT::v8i64, 8*20 }
};
- if (ST->hasAVX512()) {
+ if (ST->hasAVX512())
if (const auto *Entry = CostTableLookup(AVX512CostTable, ISD, LT.second))
return LT.first * Entry->Cost;
- }
- static const CostTblEntry AVX2CostTable[] = {
+ static const CostTblEntry AVX2ShiftCostTable[] = {
// Shifts on v4i64/v8i32 on AVX2 is legal even though we declare to
// customize them to detect the cases where shift amount is a scalar one.
{ ISD::SHL, MVT::v4i32, 1 },
@@ -169,11 +391,11 @@ int X86TTIImpl::getArithmeticInstrCost(
// is lowered into a vector multiply (vpmullw).
return LT.first;
- if (const auto *Entry = CostTableLookup(AVX2CostTable, ISD, LT.second))
+ if (const auto *Entry = CostTableLookup(AVX2ShiftCostTable, ISD, LT.second))
return LT.first * Entry->Cost;
}
- static const CostTblEntry XOPCostTable[] = {
+ static const CostTblEntry XOPShiftCostTable[] = {
// 128bit shifts take 1cy, but right shifts require negation beforehand.
{ ISD::SHL, MVT::v16i8, 1 },
{ ISD::SRL, MVT::v16i8, 2 },
@@ -203,87 +425,31 @@ int X86TTIImpl::getArithmeticInstrCost(
};
// Look for XOP lowering tricks.
- if (ST->hasXOP()) {
- if (const auto *Entry = CostTableLookup(XOPCostTable, ISD, LT.second))
- return LT.first * Entry->Cost;
- }
-
- static const CostTblEntry AVX2CustomCostTable[] = {
- { ISD::SHL, MVT::v32i8, 11 }, // vpblendvb sequence.
- { ISD::SHL, MVT::v16i16, 10 }, // extend/vpsrlvd/pack sequence.
-
- { ISD::SRL, MVT::v32i8, 11 }, // vpblendvb sequence.
- { ISD::SRL, MVT::v16i16, 10 }, // extend/vpsrlvd/pack sequence.
-
- { ISD::SRA, MVT::v32i8, 24 }, // vpblendvb sequence.
- { ISD::SRA, MVT::v16i16, 10 }, // extend/vpsravd/pack sequence.
- { ISD::SRA, MVT::v2i64, 4 }, // srl/xor/sub sequence.
- { ISD::SRA, MVT::v4i64, 4 }, // srl/xor/sub sequence.
-
- // Vectorizing division is a bad idea. See the SSE2 table for more comments.
- { ISD::SDIV, MVT::v32i8, 32*20 },
- { ISD::SDIV, MVT::v16i16, 16*20 },
- { ISD::SDIV, MVT::v8i32, 8*20 },
- { ISD::SDIV, MVT::v4i64, 4*20 },
- { ISD::UDIV, MVT::v32i8, 32*20 },
- { ISD::UDIV, MVT::v16i16, 16*20 },
- { ISD::UDIV, MVT::v8i32, 8*20 },
- { ISD::UDIV, MVT::v4i64, 4*20 },
- };
-
- // Look for AVX2 lowering tricks for custom cases.
- if (ST->hasAVX2()) {
- if (const auto *Entry = CostTableLookup(AVX2CustomCostTable, ISD,
- LT.second))
+ if (ST->hasXOP())
+ if (const auto *Entry = CostTableLookup(XOPShiftCostTable, ISD, LT.second))
return LT.first * Entry->Cost;
- }
- static const CostTblEntry
- SSE2UniformConstCostTable[] = {
- // We don't correctly identify costs of casts because they are marked as
- // custom.
- // Constant splats are cheaper for the following instructions.
- { ISD::SHL, MVT::v16i8, 1 }, // psllw.
- { ISD::SHL, MVT::v32i8, 2 }, // psllw.
- { ISD::SHL, MVT::v8i16, 1 }, // psllw.
+ static const CostTblEntry SSE2UniformShiftCostTable[] = {
+ // Uniform splats are cheaper for the following instructions.
{ ISD::SHL, MVT::v16i16, 2 }, // psllw.
- { ISD::SHL, MVT::v4i32, 1 }, // pslld
{ ISD::SHL, MVT::v8i32, 2 }, // pslld
- { ISD::SHL, MVT::v2i64, 1 }, // psllq.
{ ISD::SHL, MVT::v4i64, 2 }, // psllq.
- { ISD::SRL, MVT::v16i8, 1 }, // psrlw.
- { ISD::SRL, MVT::v32i8, 2 }, // psrlw.
- { ISD::SRL, MVT::v8i16, 1 }, // psrlw.
{ ISD::SRL, MVT::v16i16, 2 }, // psrlw.
- { ISD::SRL, MVT::v4i32, 1 }, // psrld.
{ ISD::SRL, MVT::v8i32, 2 }, // psrld.
- { ISD::SRL, MVT::v2i64, 1 }, // psrlq.
{ ISD::SRL, MVT::v4i64, 2 }, // psrlq.
- { ISD::SRA, MVT::v16i8, 4 }, // psrlw, pand, pxor, psubb.
- { ISD::SRA, MVT::v32i8, 8 }, // psrlw, pand, pxor, psubb.
- { ISD::SRA, MVT::v8i16, 1 }, // psraw.
{ ISD::SRA, MVT::v16i16, 2 }, // psraw.
- { ISD::SRA, MVT::v4i32, 1 }, // psrad.
{ ISD::SRA, MVT::v8i32, 2 }, // psrad.
{ ISD::SRA, MVT::v2i64, 4 }, // 2 x psrad + shuffle.
{ ISD::SRA, MVT::v4i64, 8 }, // 2 x psrad + shuffle.
-
- { ISD::SDIV, MVT::v8i16, 6 }, // pmulhw sequence
- { ISD::UDIV, MVT::v8i16, 6 }, // pmulhuw sequence
- { ISD::SDIV, MVT::v4i32, 19 }, // pmuludq sequence
- { ISD::UDIV, MVT::v4i32, 15 }, // pmuludq sequence
};
- if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
- ST->hasSSE2()) {
- // pmuldq sequence.
- if (ISD == ISD::SDIV && LT.second == MVT::v4i32 && ST->hasSSE41())
- return LT.first * 15;
-
- if (const auto *Entry = CostTableLookup(SSE2UniformConstCostTable, ISD,
- LT.second))
+ if (ST->hasSSE2() &&
+ ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
+ (Op2Info == TargetTransformInfo::OK_UniformValue))) {
+ if (const auto *Entry =
+ CostTableLookup(SSE2UniformShiftCostTable, ISD, LT.second))
return LT.first * Entry->Cost;
}
@@ -291,60 +457,170 @@ int X86TTIImpl::getArithmeticInstrCost(
Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) {
MVT VT = LT.second;
// Vector shift left by non uniform constant can be lowered
- // into vector multiply (pmullw/pmulld).
- if ((VT == MVT::v8i16 && ST->hasSSE2()) ||
- (VT == MVT::v4i32 && ST->hasSSE41()))
- return LT.first;
-
- // v16i16 and v8i32 shifts by non-uniform constants are lowered into a
- // sequence of extract + two vector multiply + insert.
- if ((VT == MVT::v8i32 || VT == MVT::v16i16) &&
- (ST->hasAVX() && !ST->hasAVX2()))
- ISD = ISD::MUL;
-
- // A vector shift left by non uniform constant is converted
- // into a vector multiply; the new multiply is eventually
- // lowered into a sequence of shuffles and 2 x pmuludq.
- if (VT == MVT::v4i32 && ST->hasSSE2())
+ // into vector multiply.
+ if (((VT == MVT::v8i16 || VT == MVT::v4i32) && ST->hasSSE2()) ||
+ ((VT == MVT::v16i16 || VT == MVT::v8i32) && ST->hasAVX()))
ISD = ISD::MUL;
}
+ static const CostTblEntry AVX2CostTable[] = {
+ { ISD::SHL, MVT::v32i8, 11 }, // vpblendvb sequence.
+ { ISD::SHL, MVT::v16i16, 10 }, // extend/vpsrlvd/pack sequence.
+
+ { ISD::SRL, MVT::v32i8, 11 }, // vpblendvb sequence.
+ { ISD::SRL, MVT::v16i16, 10 }, // extend/vpsrlvd/pack sequence.
+
+ { ISD::SRA, MVT::v32i8, 24 }, // vpblendvb sequence.
+ { ISD::SRA, MVT::v16i16, 10 }, // extend/vpsravd/pack sequence.
+ { ISD::SRA, MVT::v2i64, 4 }, // srl/xor/sub sequence.
+ { ISD::SRA, MVT::v4i64, 4 }, // srl/xor/sub sequence.
+
+ { ISD::SUB, MVT::v32i8, 1 }, // psubb
+ { ISD::ADD, MVT::v32i8, 1 }, // paddb
+ { ISD::SUB, MVT::v16i16, 1 }, // psubw
+ { ISD::ADD, MVT::v16i16, 1 }, // paddw
+ { ISD::SUB, MVT::v8i32, 1 }, // psubd
+ { ISD::ADD, MVT::v8i32, 1 }, // paddd
+ { ISD::SUB, MVT::v4i64, 1 }, // psubq
+ { ISD::ADD, MVT::v4i64, 1 }, // paddq
+
+ { ISD::MUL, MVT::v32i8, 17 }, // extend/pmullw/trunc sequence.
+ { ISD::MUL, MVT::v16i8, 7 }, // extend/pmullw/trunc sequence.
+ { ISD::MUL, MVT::v16i16, 1 }, // pmullw
+ { ISD::MUL, MVT::v8i32, 1 }, // pmulld
+ { ISD::MUL, MVT::v4i64, 8 }, // 3*pmuludq/3*shift/2*add
+
+ { ISD::FDIV, MVT::f32, 7 }, // Haswell from http://www.agner.org/
+ { ISD::FDIV, MVT::v4f32, 7 }, // Haswell from http://www.agner.org/
+ { ISD::FDIV, MVT::v8f32, 14 }, // Haswell from http://www.agner.org/
+ { ISD::FDIV, MVT::f64, 14 }, // Haswell from http://www.agner.org/
+ { ISD::FDIV, MVT::v2f64, 14 }, // Haswell from http://www.agner.org/
+ { ISD::FDIV, MVT::v4f64, 28 }, // Haswell from http://www.agner.org/
+ };
+
+ // Look for AVX2 lowering tricks for custom cases.
+ if (ST->hasAVX2())
+ if (const auto *Entry = CostTableLookup(AVX2CostTable, ISD, LT.second))
+ return LT.first * Entry->Cost;
+
+ static const CostTblEntry AVX1CostTable[] = {
+ // We don't have to scalarize unsupported ops. We can issue two half-sized
+ // operations and we only need to extract the upper YMM half.
+ // Two ops + 1 extract + 1 insert = 4.
+ { ISD::MUL, MVT::v16i16, 4 },
+ { ISD::MUL, MVT::v8i32, 4 },
+ { ISD::SUB, MVT::v32i8, 4 },
+ { ISD::ADD, MVT::v32i8, 4 },
+ { ISD::SUB, MVT::v16i16, 4 },
+ { ISD::ADD, MVT::v16i16, 4 },
+ { ISD::SUB, MVT::v8i32, 4 },
+ { ISD::ADD, MVT::v8i32, 4 },
+ { ISD::SUB, MVT::v4i64, 4 },
+ { ISD::ADD, MVT::v4i64, 4 },
+
+ // A v4i64 multiply is custom lowered as two split v2i64 vectors that then
+ // are lowered as a series of long multiplies(3), shifts(3) and adds(2).
+ // Because we believe v4i64 to be a legal type, we must also include the
+ // extract+insert in the cost table. Therefore, the cost here is 18
+ // instead of 8.
+ { ISD::MUL, MVT::v4i64, 18 },
+
+ { ISD::MUL, MVT::v32i8, 26 }, // extend/pmullw/trunc sequence.
+
+ { ISD::FDIV, MVT::f32, 14 }, // SNB from http://www.agner.org/
+ { ISD::FDIV, MVT::v4f32, 14 }, // SNB from http://www.agner.org/
+ { ISD::FDIV, MVT::v8f32, 28 }, // SNB from http://www.agner.org/
+ { ISD::FDIV, MVT::f64, 22 }, // SNB from http://www.agner.org/
+ { ISD::FDIV, MVT::v2f64, 22 }, // SNB from http://www.agner.org/
+ { ISD::FDIV, MVT::v4f64, 44 }, // SNB from http://www.agner.org/
+
+ // Vectorizing division is a bad idea. See the SSE2 table for more comments.
+ { ISD::SDIV, MVT::v32i8, 32*20 },
+ { ISD::SDIV, MVT::v16i16, 16*20 },
+ { ISD::SDIV, MVT::v8i32, 8*20 },
+ { ISD::SDIV, MVT::v4i64, 4*20 },
+ { ISD::UDIV, MVT::v32i8, 32*20 },
+ { ISD::UDIV, MVT::v16i16, 16*20 },
+ { ISD::UDIV, MVT::v8i32, 8*20 },
+ { ISD::UDIV, MVT::v4i64, 4*20 },
+ };
+
+ if (ST->hasAVX())
+ if (const auto *Entry = CostTableLookup(AVX1CostTable, ISD, LT.second))
+ return LT.first * Entry->Cost;
+
+ static const CostTblEntry SSE42CostTable[] = {
+ { ISD::FDIV, MVT::f32, 14 }, // Nehalem from http://www.agner.org/
+ { ISD::FDIV, MVT::v4f32, 14 }, // Nehalem from http://www.agner.org/
+ { ISD::FDIV, MVT::f64, 22 }, // Nehalem from http://www.agner.org/
+ { ISD::FDIV, MVT::v2f64, 22 }, // Nehalem from http://www.agner.org/
+ };
+
+ if (ST->hasSSE42())
+ if (const auto *Entry = CostTableLookup(SSE42CostTable, ISD, LT.second))
+ return LT.first * Entry->Cost;
+
+ static const CostTblEntry SSE41CostTable[] = {
+ { ISD::SHL, MVT::v16i8, 11 }, // pblendvb sequence.
+ { ISD::SHL, MVT::v32i8, 2*11 }, // pblendvb sequence.
+ { ISD::SHL, MVT::v8i16, 14 }, // pblendvb sequence.
+ { ISD::SHL, MVT::v16i16, 2*14 }, // pblendvb sequence.
+ { ISD::SHL, MVT::v4i32, 4 }, // pslld/paddd/cvttps2dq/pmulld
+ { ISD::SHL, MVT::v8i32, 2*4 }, // pslld/paddd/cvttps2dq/pmulld
+
+ { ISD::SRL, MVT::v16i8, 12 }, // pblendvb sequence.
+ { ISD::SRL, MVT::v32i8, 2*12 }, // pblendvb sequence.
+ { ISD::SRL, MVT::v8i16, 14 }, // pblendvb sequence.
+ { ISD::SRL, MVT::v16i16, 2*14 }, // pblendvb sequence.
+ { ISD::SRL, MVT::v4i32, 11 }, // Shift each lane + blend.
+ { ISD::SRL, MVT::v8i32, 2*11 }, // Shift each lane + blend.
+
+ { ISD::SRA, MVT::v16i8, 24 }, // pblendvb sequence.
+ { ISD::SRA, MVT::v32i8, 2*24 }, // pblendvb sequence.
+ { ISD::SRA, MVT::v8i16, 14 }, // pblendvb sequence.
+ { ISD::SRA, MVT::v16i16, 2*14 }, // pblendvb sequence.
+ { ISD::SRA, MVT::v4i32, 12 }, // Shift each lane + blend.
+ { ISD::SRA, MVT::v8i32, 2*12 }, // Shift each lane + blend.
+
+ { ISD::MUL, MVT::v4i32, 1 } // pmulld
+ };
+
+ if (ST->hasSSE41())
+ if (const auto *Entry = CostTableLookup(SSE41CostTable, ISD, LT.second))
+ return LT.first * Entry->Cost;
+
static const CostTblEntry SSE2CostTable[] = {
// We don't correctly identify costs of casts because they are marked as
// custom.
- // For some cases, where the shift amount is a scalar we would be able
- // to generate better code. Unfortunately, when this is the case the value
- // (the splat) will get hoisted out of the loop, thereby making it invisible
- // to ISel. The cost model must return worst case assumptions because it is
- // used for vectorization and we don't want to make vectorized code worse
- // than scalar code.
{ ISD::SHL, MVT::v16i8, 26 }, // cmpgtb sequence.
- { ISD::SHL, MVT::v32i8, 2*26 }, // cmpgtb sequence.
{ ISD::SHL, MVT::v8i16, 32 }, // cmpgtb sequence.
- { ISD::SHL, MVT::v16i16, 2*32 }, // cmpgtb sequence.
{ ISD::SHL, MVT::v4i32, 2*5 }, // We optimized this using mul.
{ ISD::SHL, MVT::v8i32, 2*2*5 }, // We optimized this using mul.
{ ISD::SHL, MVT::v2i64, 4 }, // splat+shuffle sequence.
{ ISD::SHL, MVT::v4i64, 2*4 }, // splat+shuffle sequence.
{ ISD::SRL, MVT::v16i8, 26 }, // cmpgtb sequence.
- { ISD::SRL, MVT::v32i8, 2*26 }, // cmpgtb sequence.
{ ISD::SRL, MVT::v8i16, 32 }, // cmpgtb sequence.
- { ISD::SRL, MVT::v16i16, 2*32 }, // cmpgtb sequence.
{ ISD::SRL, MVT::v4i32, 16 }, // Shift each lane + blend.
- { ISD::SRL, MVT::v8i32, 2*16 }, // Shift each lane + blend.
{ ISD::SRL, MVT::v2i64, 4 }, // splat+shuffle sequence.
{ ISD::SRL, MVT::v4i64, 2*4 }, // splat+shuffle sequence.
{ ISD::SRA, MVT::v16i8, 54 }, // unpacked cmpgtb sequence.
- { ISD::SRA, MVT::v32i8, 2*54 }, // unpacked cmpgtb sequence.
{ ISD::SRA, MVT::v8i16, 32 }, // cmpgtb sequence.
- { ISD::SRA, MVT::v16i16, 2*32 }, // cmpgtb sequence.
{ ISD::SRA, MVT::v4i32, 16 }, // Shift each lane + blend.
- { ISD::SRA, MVT::v8i32, 2*16 }, // Shift each lane + blend.
{ ISD::SRA, MVT::v2i64, 12 }, // srl/xor/sub sequence.
{ ISD::SRA, MVT::v4i64, 2*12 }, // srl/xor/sub sequence.
+ { ISD::MUL, MVT::v16i8, 12 }, // extend/pmullw/trunc sequence.
+ { ISD::MUL, MVT::v8i16, 1 }, // pmullw
+ { ISD::MUL, MVT::v4i32, 6 }, // 3*pmuludq/4*shuffle
+ { ISD::MUL, MVT::v2i64, 8 }, // 3*pmuludq/3*shift/2*add
+
+ { ISD::FDIV, MVT::f32, 23 }, // Pentium IV from http://www.agner.org/
+ { ISD::FDIV, MVT::v4f32, 39 }, // Pentium IV from http://www.agner.org/
+ { ISD::FDIV, MVT::f64, 38 }, // Pentium IV from http://www.agner.org/
+ { ISD::FDIV, MVT::v2f64, 69 }, // Pentium IV from http://www.agner.org/
+
// It is not a good idea to vectorize division. We have to scalarize it and
// in the process we will often end up having to spilling regular
// registers. The overhead of division is going to dominate most kernels
@@ -352,61 +628,27 @@ int X86TTIImpl::getArithmeticInstrCost(
// generally a bad idea. Assume somewhat arbitrarily that we have to be able
// to hide "20 cycles" for each lane.
{ ISD::SDIV, MVT::v16i8, 16*20 },
- { ISD::SDIV, MVT::v8i16, 8*20 },
- { ISD::SDIV, MVT::v4i32, 4*20 },
- { ISD::SDIV, MVT::v2i64, 2*20 },
+ { ISD::SDIV, MVT::v8i16, 8*20 },
+ { ISD::SDIV, MVT::v4i32, 4*20 },
+ { ISD::SDIV, MVT::v2i64, 2*20 },
{ ISD::UDIV, MVT::v16i8, 16*20 },
- { ISD::UDIV, MVT::v8i16, 8*20 },
- { ISD::UDIV, MVT::v4i32, 4*20 },
- { ISD::UDIV, MVT::v2i64, 2*20 },
+ { ISD::UDIV, MVT::v8i16, 8*20 },
+ { ISD::UDIV, MVT::v4i32, 4*20 },
+ { ISD::UDIV, MVT::v2i64, 2*20 },
};
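// A minimal worked example (constant name hypothetical) of the "20 cycles per
// lane" rule above: a v4i32 SDIV is entered as 4 * 20 == 80, so even after
// the legalization multiplier the vectorizer still sees scalarized division
// as prohibitively expensive.
static constexpr int demoSSE2SDivV4I32Cost = 4 * 20;  // matches the entry above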
- if (ST->hasSSE2()) {
+ if (ST->hasSSE2())
if (const auto *Entry = CostTableLookup(SSE2CostTable, ISD, LT.second))
return LT.first * Entry->Cost;
- }
- static const CostTblEntry AVX1CostTable[] = {
- // We don't have to scalarize unsupported ops. We can issue two half-sized
- // operations and we only need to extract the upper YMM half.
- // Two ops + 1 extract + 1 insert = 4.
- { ISD::MUL, MVT::v16i16, 4 },
- { ISD::MUL, MVT::v8i32, 4 },
- { ISD::SUB, MVT::v8i32, 4 },
- { ISD::ADD, MVT::v8i32, 4 },
- { ISD::SUB, MVT::v4i64, 4 },
- { ISD::ADD, MVT::v4i64, 4 },
- // A v4i64 multiply is custom lowered as two split v2i64 vectors that then
- // are lowered as a series of long multiplies(3), shifts(4) and adds(2)
- // Because we believe v4i64 to be a legal type, we must also include the
- // split factor of two in the cost table. Therefore, the cost here is 18
- // instead of 9.
- { ISD::MUL, MVT::v4i64, 18 },
+ static const CostTblEntry SSE1CostTable[] = {
+ { ISD::FDIV, MVT::f32, 17 }, // Pentium III from http://www.agner.org/
+ { ISD::FDIV, MVT::v4f32, 34 }, // Pentium III from http://www.agner.org/
};
- // Look for AVX1 lowering tricks.
- if (ST->hasAVX() && !ST->hasAVX2()) {
- MVT VT = LT.second;
-
- if (const auto *Entry = CostTableLookup(AVX1CostTable, ISD, VT))
+ if (ST->hasSSE1())
+ if (const auto *Entry = CostTableLookup(SSE1CostTable, ISD, LT.second))
return LT.first * Entry->Cost;
- }
-
- // Custom lowering of vectors.
- static const CostTblEntry CustomLowered[] = {
- // A v2i64/v4i64 and multiply is custom lowered as a series of long
- // multiplies(3), shifts(4) and adds(2).
- { ISD::MUL, MVT::v2i64, 9 },
- { ISD::MUL, MVT::v4i64, 9 },
- };
- if (const auto *Entry = CostTableLookup(CustomLowered, ISD, LT.second))
- return LT.first * Entry->Cost;
-
- // Special lowering of v4i32 mul on sse2, sse3: Lower v4i32 mul as 2x shuffle,
- // 2x pmuludq, 2x shuffle.
- if (ISD == ISD::MUL && LT.second == MVT::v4i32 && ST->hasSSE2() &&
- !ST->hasSSE41())
- return LT.first * 6;
// Fallback to the default implementation.
return BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info);
@@ -414,112 +656,252 @@ int X86TTIImpl::getArithmeticInstrCost(
int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
Type *SubTp) {
- // We only estimate the cost of reverse and alternate shuffles.
- if (Kind != TTI::SK_Reverse && Kind != TTI::SK_Alternate)
- return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
+ // 64-bit packed float vectors (v2f32) are widened to type v4f32.
+ // 64-bit packed integer vectors (v2i32) are promoted to type v2i64.
+ std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
+
+ // For broadcasts we are splatting the first element from the first input
+ // register, so we only need to reference that input; all the output
+ // registers are the same.
+ if (Kind == TTI::SK_Broadcast)
+ LT.first = 1;
+
+ // We are going to permute multiple sources and the result will be in
+ // multiple destinations. We provide an accurate cost only for splits where
+ // the element type remains the same.
+ if (Kind == TTI::SK_PermuteSingleSrc && LT.first != 1) {
+ MVT LegalVT = LT.second;
+ if (LegalVT.getVectorElementType().getSizeInBits() ==
+ Tp->getVectorElementType()->getPrimitiveSizeInBits() &&
+ LegalVT.getVectorNumElements() < Tp->getVectorNumElements()) {
+
+ unsigned VecTySize = DL.getTypeStoreSize(Tp);
+ unsigned LegalVTSize = LegalVT.getStoreSize();
+ // Number of source vectors after legalization:
+ unsigned NumOfSrcs = (VecTySize + LegalVTSize - 1) / LegalVTSize;
+ // Number of destination vectors after legalization:
+ unsigned NumOfDests = LT.first;
+
+ Type *SingleOpTy = VectorType::get(Tp->getVectorElementType(),
+ LegalVT.getVectorNumElements());
+
+ unsigned NumOfShuffles = (NumOfSrcs - 1) * NumOfDests;
+ return NumOfShuffles *
+ getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy, 0, nullptr);
+ }
- if (Kind == TTI::SK_Reverse) {
- std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
- int Cost = 1;
- if (LT.second.getSizeInBits() > 128)
- Cost = 3; // Extract + insert + copy.
+ return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
+ }
- // Multiple by the number of parts.
- return Cost * LT.first;
+ // For 2-input shuffles, we must account for splitting the 2 inputs into many.
+ if (Kind == TTI::SK_PermuteTwoSrc && LT.first != 1) {
+ // We assume that source and destination have the same vector type.
+ int NumOfDests = LT.first;
+ int NumOfShufflesPerDest = LT.first * 2 - 1;
+ LT.first = NumOfDests * NumOfShufflesPerDest;
}
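// A minimal worked example (helper name and type choice hypothetical) of the
// split accounting above: a v16i32 two-source shuffle on a 256-bit AVX2
// subtarget legalizes with LT.first == 2, so NumOfDests == 2 and each
// destination needs 2 * LT.first - 1 == 3 two-source shuffles, giving a
// factor of 6 before the per-type table cost is applied.
static int demoTwoSrcSplitFactor(int LegalizationSplit) {
  int NumOfDests = LegalizationSplit;
  int NumOfShufflesPerDest = LegalizationSplit * 2 - 1;
  return NumOfDests * NumOfShufflesPerDest;  // becomes the new LT.first
}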
- if (Kind == TTI::SK_Alternate) {
- // 64-bit packed float vectors (v2f32) are widened to type v4f32.
- // 64-bit packed integer vectors (v2i32) are promoted to type v2i64.
- std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
+ static const CostTblEntry AVX512VBMIShuffleTbl[] = {
+ { TTI::SK_Reverse, MVT::v64i8, 1 }, // vpermb
+ { TTI::SK_Reverse, MVT::v32i8, 1 }, // vpermb
- // The backend knows how to generate a single VEX.256 version of
- // instruction VPBLENDW if the target supports AVX2.
- if (ST->hasAVX2() && LT.second == MVT::v16i16)
- return LT.first;
+ { TTI::SK_PermuteSingleSrc, MVT::v64i8, 1 }, // vpermb
+ { TTI::SK_PermuteSingleSrc, MVT::v32i8, 1 }, // vpermb
- static const CostTblEntry AVXAltShuffleTbl[] = {
- {ISD::VECTOR_SHUFFLE, MVT::v4i64, 1}, // vblendpd
- {ISD::VECTOR_SHUFFLE, MVT::v4f64, 1}, // vblendpd
+ { TTI::SK_PermuteTwoSrc, MVT::v64i8, 1 }, // vpermt2b
+ { TTI::SK_PermuteTwoSrc, MVT::v32i8, 1 }, // vpermt2b
+ { TTI::SK_PermuteTwoSrc, MVT::v16i8, 1 } // vpermt2b
+ };
+
+ if (ST->hasVBMI())
+ if (const auto *Entry =
+ CostTableLookup(AVX512VBMIShuffleTbl, Kind, LT.second))
+ return LT.first * Entry->Cost;
- {ISD::VECTOR_SHUFFLE, MVT::v8i32, 1}, // vblendps
- {ISD::VECTOR_SHUFFLE, MVT::v8f32, 1}, // vblendps
+ static const CostTblEntry AVX512BWShuffleTbl[] = {
+ { TTI::SK_Broadcast, MVT::v32i16, 1 }, // vpbroadcastw
+ { TTI::SK_Broadcast, MVT::v64i8, 1 }, // vpbroadcastb
+
+ { TTI::SK_Reverse, MVT::v32i16, 1 }, // vpermw
+ { TTI::SK_Reverse, MVT::v16i16, 1 }, // vpermw
+ { TTI::SK_Reverse, MVT::v64i8, 2 }, // pshufb + vshufi64x2
+
+ { TTI::SK_PermuteSingleSrc, MVT::v32i16, 1 }, // vpermw
+ { TTI::SK_PermuteSingleSrc, MVT::v16i16, 1 }, // vpermw
+ { TTI::SK_PermuteSingleSrc, MVT::v8i16, 1 }, // vpermw
+ { TTI::SK_PermuteSingleSrc, MVT::v64i8, 8 }, // extend to v32i16
+ { TTI::SK_PermuteSingleSrc, MVT::v32i8, 3 }, // vpermw + zext/trunc
+
+ { TTI::SK_PermuteTwoSrc, MVT::v32i16, 1 }, // vpermt2w
+ { TTI::SK_PermuteTwoSrc, MVT::v16i16, 1 }, // vpermt2w
+ { TTI::SK_PermuteTwoSrc, MVT::v8i16, 1 }, // vpermt2w
+ { TTI::SK_PermuteTwoSrc, MVT::v32i8, 3 }, // zext + vpermt2w + trunc
+ { TTI::SK_PermuteTwoSrc, MVT::v64i8, 19 }, // 6 * v32i8 + 1
+ { TTI::SK_PermuteTwoSrc, MVT::v16i8, 3 } // zext + vpermt2w + trunc
+ };
- // This shuffle is custom lowered into a sequence of:
- // 2x vextractf128 , 2x vpblendw , 1x vinsertf128
- {ISD::VECTOR_SHUFFLE, MVT::v16i16, 5},
+ if (ST->hasBWI())
+ if (const auto *Entry =
+ CostTableLookup(AVX512BWShuffleTbl, Kind, LT.second))
+ return LT.first * Entry->Cost;
- // This shuffle is custom lowered into a long sequence of:
- // 2x vextractf128 , 4x vpshufb , 2x vpor , 1x vinsertf128
- {ISD::VECTOR_SHUFFLE, MVT::v32i8, 9}
- };
+ static const CostTblEntry AVX512ShuffleTbl[] = {
+ { TTI::SK_Broadcast, MVT::v8f64, 1 }, // vbroadcastpd
+ { TTI::SK_Broadcast, MVT::v16f32, 1 }, // vbroadcastps
+ { TTI::SK_Broadcast, MVT::v8i64, 1 }, // vpbroadcastq
+ { TTI::SK_Broadcast, MVT::v16i32, 1 }, // vpbroadcastd
+
+ { TTI::SK_Reverse, MVT::v8f64, 1 }, // vpermpd
+ { TTI::SK_Reverse, MVT::v16f32, 1 }, // vpermps
+ { TTI::SK_Reverse, MVT::v8i64, 1 }, // vpermq
+ { TTI::SK_Reverse, MVT::v16i32, 1 }, // vpermd
+
+ { TTI::SK_PermuteSingleSrc, MVT::v8f64, 1 }, // vpermpd
+ { TTI::SK_PermuteSingleSrc, MVT::v4f64, 1 }, // vpermpd
+ { TTI::SK_PermuteSingleSrc, MVT::v2f64, 1 }, // vpermpd
+ { TTI::SK_PermuteSingleSrc, MVT::v16f32, 1 }, // vpermps
+ { TTI::SK_PermuteSingleSrc, MVT::v8f32, 1 }, // vpermps
+ { TTI::SK_PermuteSingleSrc, MVT::v4f32, 1 }, // vpermps
+ { TTI::SK_PermuteSingleSrc, MVT::v8i64, 1 }, // vpermq
+ { TTI::SK_PermuteSingleSrc, MVT::v4i64, 1 }, // vpermq
+ { TTI::SK_PermuteSingleSrc, MVT::v2i64, 1 }, // vpermq
+ { TTI::SK_PermuteSingleSrc, MVT::v16i32, 1 }, // vpermd
+ { TTI::SK_PermuteSingleSrc, MVT::v8i32, 1 }, // vpermd
+ { TTI::SK_PermuteSingleSrc, MVT::v4i32, 1 }, // vpermd
+ { TTI::SK_PermuteSingleSrc, MVT::v16i8, 1 }, // pshufb
+
+ { TTI::SK_PermuteTwoSrc, MVT::v8f64, 1 }, // vpermt2pd
+ { TTI::SK_PermuteTwoSrc, MVT::v16f32, 1 }, // vpermt2ps
+ { TTI::SK_PermuteTwoSrc, MVT::v8i64, 1 }, // vpermt2q
+ { TTI::SK_PermuteTwoSrc, MVT::v16i32, 1 }, // vpermt2d
+ { TTI::SK_PermuteTwoSrc, MVT::v4f64, 1 }, // vpermt2pd
+ { TTI::SK_PermuteTwoSrc, MVT::v8f32, 1 }, // vpermt2ps
+ { TTI::SK_PermuteTwoSrc, MVT::v4i64, 1 }, // vpermt2q
+ { TTI::SK_PermuteTwoSrc, MVT::v8i32, 1 }, // vpermt2d
+ { TTI::SK_PermuteTwoSrc, MVT::v2f64, 1 }, // vpermt2pd
+ { TTI::SK_PermuteTwoSrc, MVT::v4f32, 1 }, // vpermt2ps
+ { TTI::SK_PermuteTwoSrc, MVT::v2i64, 1 }, // vpermt2q
+ { TTI::SK_PermuteTwoSrc, MVT::v4i32, 1 } // vpermt2d
+ };
- if (ST->hasAVX())
- if (const auto *Entry = CostTableLookup(AVXAltShuffleTbl,
- ISD::VECTOR_SHUFFLE, LT.second))
- return LT.first * Entry->Cost;
+ if (ST->hasAVX512())
+ if (const auto *Entry = CostTableLookup(AVX512ShuffleTbl, Kind, LT.second))
+ return LT.first * Entry->Cost;
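+ // For instance, reversing a v16f64 vector on AVX-512 legalizes to two
+ // v8f64 halves (LT.first == 2), so the modeled cost is 2 (LT.first) * 1
+ // (vpermpd) -- a sketch of how the table lookups scale with LT.first, not
+ // a hardware claim.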
- static const CostTblEntry SSE41AltShuffleTbl[] = {
- // These are lowered into movsd.
- {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
- {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
+ static const CostTblEntry AVX2ShuffleTbl[] = {
+ { TTI::SK_Broadcast, MVT::v4f64, 1 }, // vbroadcastpd
+ { TTI::SK_Broadcast, MVT::v8f32, 1 }, // vbroadcastps
+ { TTI::SK_Broadcast, MVT::v4i64, 1 }, // vpbroadcastq
+ { TTI::SK_Broadcast, MVT::v8i32, 1 }, // vpbroadcastd
+ { TTI::SK_Broadcast, MVT::v16i16, 1 }, // vpbroadcastw
+ { TTI::SK_Broadcast, MVT::v32i8, 1 }, // vpbroadcastb
+
+ { TTI::SK_Reverse, MVT::v4f64, 1 }, // vpermpd
+ { TTI::SK_Reverse, MVT::v8f32, 1 }, // vpermps
+ { TTI::SK_Reverse, MVT::v4i64, 1 }, // vpermq
+ { TTI::SK_Reverse, MVT::v8i32, 1 }, // vpermd
+ { TTI::SK_Reverse, MVT::v16i16, 2 }, // vperm2i128 + pshufb
+ { TTI::SK_Reverse, MVT::v32i8, 2 }, // vperm2i128 + pshufb
+
+ { TTI::SK_Alternate, MVT::v16i16, 1 }, // vpblendw
+ { TTI::SK_Alternate, MVT::v32i8, 1 } // vpblendvb
+ };
- // packed float vectors with four elements are lowered into BLENDI dag
- // nodes. A v4i32/v4f32 BLENDI generates a single 'blendps'/'blendpd'.
- {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1},
- {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},
+ if (ST->hasAVX2())
+ if (const auto *Entry = CostTableLookup(AVX2ShuffleTbl, Kind, LT.second))
+ return LT.first * Entry->Cost;
- // This shuffle generates a single pshufw.
- {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},
+ static const CostTblEntry AVX1ShuffleTbl[] = {
+ { TTI::SK_Broadcast, MVT::v4f64, 2 }, // vperm2f128 + vpermilpd
+ { TTI::SK_Broadcast, MVT::v8f32, 2 }, // vperm2f128 + vpermilps
+ { TTI::SK_Broadcast, MVT::v4i64, 2 }, // vperm2f128 + vpermilpd
+ { TTI::SK_Broadcast, MVT::v8i32, 2 }, // vperm2f128 + vpermilps
+ { TTI::SK_Broadcast, MVT::v16i16, 3 }, // vpshuflw + vpshufd + vinsertf128
+ { TTI::SK_Broadcast, MVT::v32i8, 2 }, // vpshufb + vinsertf128
+
+ { TTI::SK_Reverse, MVT::v4f64, 2 }, // vperm2f128 + vpermilpd
+ { TTI::SK_Reverse, MVT::v8f32, 2 }, // vperm2f128 + vpermilps
+ { TTI::SK_Reverse, MVT::v4i64, 2 }, // vperm2f128 + vpermilpd
+ { TTI::SK_Reverse, MVT::v8i32, 2 }, // vperm2f128 + vpermilps
+ { TTI::SK_Reverse, MVT::v16i16, 4 }, // vextractf128 + 2*pshufb
+ // + vinsertf128
+ { TTI::SK_Reverse, MVT::v32i8, 4 }, // vextractf128 + 2*pshufb
+ // + vinsertf128
+
+ { TTI::SK_Alternate, MVT::v4i64, 1 }, // vblendpd
+ { TTI::SK_Alternate, MVT::v4f64, 1 }, // vblendpd
+ { TTI::SK_Alternate, MVT::v8i32, 1 }, // vblendps
+ { TTI::SK_Alternate, MVT::v8f32, 1 }, // vblendps
+ { TTI::SK_Alternate, MVT::v16i16, 3 }, // vpand + vpandn + vpor
+ { TTI::SK_Alternate, MVT::v32i8, 3 } // vpand + vpandn + vpor
+ };
- // There is no instruction that matches a v16i8 alternate shuffle.
- // The backend will expand it into the sequence 'pshufb + pshufb + or'.
- {ISD::VECTOR_SHUFFLE, MVT::v16i8, 3}
- };
+ if (ST->hasAVX())
+ if (const auto *Entry = CostTableLookup(AVX1ShuffleTbl, Kind, LT.second))
+ return LT.first * Entry->Cost;
- if (ST->hasSSE41())
- if (const auto *Entry = CostTableLookup(SSE41AltShuffleTbl, ISD::VECTOR_SHUFFLE,
- LT.second))
- return LT.first * Entry->Cost;
+ static const CostTblEntry SSE41ShuffleTbl[] = {
+ { TTI::SK_Alternate, MVT::v2i64, 1 }, // pblendw
+ { TTI::SK_Alternate, MVT::v2f64, 1 }, // movsd
+ { TTI::SK_Alternate, MVT::v4i32, 1 }, // pblendw
+ { TTI::SK_Alternate, MVT::v4f32, 1 }, // blendps
+ { TTI::SK_Alternate, MVT::v8i16, 1 }, // pblendw
+ { TTI::SK_Alternate, MVT::v16i8, 1 } // pblendvb
+ };
- static const CostTblEntry SSSE3AltShuffleTbl[] = {
- {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1}, // movsd
- {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1}, // movsd
+ if (ST->hasSSE41())
+ if (const auto *Entry = CostTableLookup(SSE41ShuffleTbl, Kind, LT.second))
+ return LT.first * Entry->Cost;
- // SSE3 doesn't have 'blendps'. The following shuffles are expanded into
- // the sequence 'shufps + pshufd'
- {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
- {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
+ static const CostTblEntry SSSE3ShuffleTbl[] = {
+ { TTI::SK_Broadcast, MVT::v8i16, 1 }, // pshufb
+ { TTI::SK_Broadcast, MVT::v16i8, 1 }, // pshufb
- {ISD::VECTOR_SHUFFLE, MVT::v8i16, 3}, // pshufb + pshufb + or
- {ISD::VECTOR_SHUFFLE, MVT::v16i8, 3} // pshufb + pshufb + or
- };
+ { TTI::SK_Reverse, MVT::v8i16, 1 }, // pshufb
+ { TTI::SK_Reverse, MVT::v16i8, 1 }, // pshufb
- if (ST->hasSSSE3())
- if (const auto *Entry = CostTableLookup(SSSE3AltShuffleTbl,
- ISD::VECTOR_SHUFFLE, LT.second))
- return LT.first * Entry->Cost;
+ { TTI::SK_Alternate, MVT::v8i16, 3 }, // pshufb + pshufb + por
+ { TTI::SK_Alternate, MVT::v16i8, 3 } // pshufb + pshufb + por
+ };
- static const CostTblEntry SSEAltShuffleTbl[] = {
- {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1}, // movsd
- {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1}, // movsd
+ if (ST->hasSSSE3())
+ if (const auto *Entry = CostTableLookup(SSSE3ShuffleTbl, Kind, LT.second))
+ return LT.first * Entry->Cost;
- {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2}, // shufps + pshufd
- {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2}, // shufps + pshufd
+ static const CostTblEntry SSE2ShuffleTbl[] = {
+ { TTI::SK_Broadcast, MVT::v2f64, 1 }, // shufpd
+ { TTI::SK_Broadcast, MVT::v2i64, 1 }, // pshufd
+ { TTI::SK_Broadcast, MVT::v4i32, 1 }, // pshufd
+ { TTI::SK_Broadcast, MVT::v8i16, 2 }, // pshuflw + pshufd
+ { TTI::SK_Broadcast, MVT::v16i8, 3 }, // unpck + pshuflw + pshufd
+
+ { TTI::SK_Reverse, MVT::v2f64, 1 }, // shufpd
+ { TTI::SK_Reverse, MVT::v2i64, 1 }, // pshufd
+ { TTI::SK_Reverse, MVT::v4i32, 1 }, // pshufd
+ { TTI::SK_Reverse, MVT::v8i16, 3 }, // pshuflw + pshufhw + pshufd
+ { TTI::SK_Reverse, MVT::v16i8, 9 }, // 2*pshuflw + 2*pshufhw
+ // + 2*pshufd + 2*unpck + packus
+
+ { TTI::SK_Alternate, MVT::v2i64, 1 }, // movsd
+ { TTI::SK_Alternate, MVT::v2f64, 1 }, // movsd
+ { TTI::SK_Alternate, MVT::v4i32, 2 }, // 2*shufps
+ { TTI::SK_Alternate, MVT::v8i16, 3 }, // pand + pandn + por
+ { TTI::SK_Alternate, MVT::v16i8, 3 } // pand + pandn + por
+ };
- // This is expanded into a long sequence of four extract + four insert.
- {ISD::VECTOR_SHUFFLE, MVT::v8i16, 8}, // 4 x pextrw + 4 pinsrw.
+ if (ST->hasSSE2())
+ if (const auto *Entry = CostTableLookup(SSE2ShuffleTbl, Kind, LT.second))
+ return LT.first * Entry->Cost;
- // 8 x (pinsrw + pextrw + and + movb + movzb + or)
- {ISD::VECTOR_SHUFFLE, MVT::v16i8, 48}
- };
+ static const CostTblEntry SSE1ShuffleTbl[] = {
+ { TTI::SK_Broadcast, MVT::v4f32, 1 }, // shufps
+ { TTI::SK_Reverse, MVT::v4f32, 1 }, // shufps
+ { TTI::SK_Alternate, MVT::v4f32, 2 } // 2*shufps
+ };
- // Fall-back (SSE3 and SSE2).
- if (const auto *Entry = CostTableLookup(SSEAltShuffleTbl,
- ISD::VECTOR_SHUFFLE, LT.second))
+ if (ST->hasSSE1())
+ if (const auto *Entry = CostTableLookup(SSE1ShuffleTbl, Kind, LT.second))
return LT.first * Entry->Cost;
- return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
- }
return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
}
@@ -532,6 +914,13 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) {
// potential massive combinations (elem_num x src_type x dst_type).
static const TypeConversionCostTblEntry AVX512DQConversionTbl[] = {
+ { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, 1 },
+ { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
+ { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i64, 1 },
+ { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i64, 1 },
+ { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i64, 1 },
+ { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i64, 1 },
+
{ ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 1 },
{ ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
{ ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, 1 },
@@ -539,12 +928,19 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) {
{ ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, 1 },
{ ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 1 },
- { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 1 },
- { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f32, 1 },
- { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f32, 1 },
- { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 },
- { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f64, 1 },
- { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f64, 1 },
+ { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 1 },
+ { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f32, 1 },
+ { ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f32, 1 },
+ { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 },
+ { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f64, 1 },
+ { ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f64, 1 },
+
+ { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 1 },
+ { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f32, 1 },
+ { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f32, 1 },
+ { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 },
+ { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f64, 1 },
+ { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f64, 1 },
};
// TODO: For AVX512DQ + AVX512VL, we also have cheap casts for 128-bit and
@@ -779,6 +1175,8 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) {
{ ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 2*10 },
{ ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 },
+ { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 3 },
+
{ ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i8, 1 },
{ ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i8, 6 },
{ ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
@@ -945,6 +1343,12 @@ int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) {
int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
ArrayRef<Type *> Tys, FastMathFlags FMF) {
+ // Costs should match the codegen from:
+ // BITREVERSE: llvm\test\CodeGen\X86\vector-bitreverse.ll
+ // BSWAP: llvm\test\CodeGen\X86\bswap-vector.ll
+ // CTLZ: llvm\test\CodeGen\X86\vector-lzcnt-*.ll
+ // CTPOP: llvm\test\CodeGen\X86\vector-popcnt-*.ll
+ // CTTZ: llvm\test\CodeGen\X86\vector-tzcnt-*.ll
static const CostTblEntry XOPCostTbl[] = {
{ ISD::BITREVERSE, MVT::v4i64, 4 },
{ ISD::BITREVERSE, MVT::v8i32, 4 },
@@ -966,7 +1370,25 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
{ ISD::BITREVERSE, MVT::v32i8, 5 },
{ ISD::BSWAP, MVT::v4i64, 1 },
{ ISD::BSWAP, MVT::v8i32, 1 },
- { ISD::BSWAP, MVT::v16i16, 1 }
+ { ISD::BSWAP, MVT::v16i16, 1 },
+ { ISD::CTLZ, MVT::v4i64, 23 },
+ { ISD::CTLZ, MVT::v8i32, 18 },
+ { ISD::CTLZ, MVT::v16i16, 14 },
+ { ISD::CTLZ, MVT::v32i8, 9 },
+ { ISD::CTPOP, MVT::v4i64, 7 },
+ { ISD::CTPOP, MVT::v8i32, 11 },
+ { ISD::CTPOP, MVT::v16i16, 9 },
+ { ISD::CTPOP, MVT::v32i8, 6 },
+ { ISD::CTTZ, MVT::v4i64, 10 },
+ { ISD::CTTZ, MVT::v8i32, 14 },
+ { ISD::CTTZ, MVT::v16i16, 12 },
+ { ISD::CTTZ, MVT::v32i8, 9 },
+ { ISD::FSQRT, MVT::f32, 7 }, // Haswell from http://www.agner.org/
+ { ISD::FSQRT, MVT::v4f32, 7 }, // Haswell from http://www.agner.org/
+ { ISD::FSQRT, MVT::v8f32, 14 }, // Haswell from http://www.agner.org/
+ { ISD::FSQRT, MVT::f64, 14 }, // Haswell from http://www.agner.org/
+ { ISD::FSQRT, MVT::v2f64, 14 }, // Haswell from http://www.agner.org/
+ { ISD::FSQRT, MVT::v4f64, 28 }, // Haswell from http://www.agner.org/
};
static const CostTblEntry AVX1CostTbl[] = {
{ ISD::BITREVERSE, MVT::v4i64, 10 },
@@ -975,7 +1397,29 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
{ ISD::BITREVERSE, MVT::v32i8, 10 },
{ ISD::BSWAP, MVT::v4i64, 4 },
{ ISD::BSWAP, MVT::v8i32, 4 },
- { ISD::BSWAP, MVT::v16i16, 4 }
+ { ISD::BSWAP, MVT::v16i16, 4 },
+ { ISD::CTLZ, MVT::v4i64, 46 },
+ { ISD::CTLZ, MVT::v8i32, 36 },
+ { ISD::CTLZ, MVT::v16i16, 28 },
+ { ISD::CTLZ, MVT::v32i8, 18 },
+ { ISD::CTPOP, MVT::v4i64, 14 },
+ { ISD::CTPOP, MVT::v8i32, 22 },
+ { ISD::CTPOP, MVT::v16i16, 18 },
+ { ISD::CTPOP, MVT::v32i8, 12 },
+ { ISD::CTTZ, MVT::v4i64, 20 },
+ { ISD::CTTZ, MVT::v8i32, 28 },
+ { ISD::CTTZ, MVT::v16i16, 24 },
+ { ISD::CTTZ, MVT::v32i8, 18 },
+ { ISD::FSQRT, MVT::f32, 14 }, // SNB from http://www.agner.org/
+ { ISD::FSQRT, MVT::v4f32, 14 }, // SNB from http://www.agner.org/
+ { ISD::FSQRT, MVT::v8f32, 28 }, // SNB from http://www.agner.org/
+ { ISD::FSQRT, MVT::f64, 21 }, // SNB from http://www.agner.org/
+ { ISD::FSQRT, MVT::v2f64, 21 }, // SNB from http://www.agner.org/
+ { ISD::FSQRT, MVT::v4f64, 43 }, // SNB from http://www.agner.org/
+ };
+ static const CostTblEntry SSE42CostTbl[] = {
+ { ISD::FSQRT, MVT::f32, 18 }, // Nehalem from http://www.agner.org/
+ { ISD::FSQRT, MVT::v4f32, 18 }, // Nehalem from http://www.agner.org/
};
static const CostTblEntry SSSE3CostTbl[] = {
{ ISD::BITREVERSE, MVT::v2i64, 5 },
@@ -984,12 +1428,42 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
{ ISD::BITREVERSE, MVT::v16i8, 5 },
{ ISD::BSWAP, MVT::v2i64, 1 },
{ ISD::BSWAP, MVT::v4i32, 1 },
- { ISD::BSWAP, MVT::v8i16, 1 }
+ { ISD::BSWAP, MVT::v8i16, 1 },
+ { ISD::CTLZ, MVT::v2i64, 23 },
+ { ISD::CTLZ, MVT::v4i32, 18 },
+ { ISD::CTLZ, MVT::v8i16, 14 },
+ { ISD::CTLZ, MVT::v16i8, 9 },
+ { ISD::CTPOP, MVT::v2i64, 7 },
+ { ISD::CTPOP, MVT::v4i32, 11 },
+ { ISD::CTPOP, MVT::v8i16, 9 },
+ { ISD::CTPOP, MVT::v16i8, 6 },
+ { ISD::CTTZ, MVT::v2i64, 10 },
+ { ISD::CTTZ, MVT::v4i32, 14 },
+ { ISD::CTTZ, MVT::v8i16, 12 },
+ { ISD::CTTZ, MVT::v16i8, 9 }
};
static const CostTblEntry SSE2CostTbl[] = {
{ ISD::BSWAP, MVT::v2i64, 7 },
{ ISD::BSWAP, MVT::v4i32, 7 },
- { ISD::BSWAP, MVT::v8i16, 7 }
+ { ISD::BSWAP, MVT::v8i16, 7 },
+ { ISD::CTLZ, MVT::v2i64, 25 },
+ { ISD::CTLZ, MVT::v4i32, 26 },
+ { ISD::CTLZ, MVT::v8i16, 20 },
+ { ISD::CTLZ, MVT::v16i8, 17 },
+ { ISD::CTPOP, MVT::v2i64, 12 },
+ { ISD::CTPOP, MVT::v4i32, 15 },
+ { ISD::CTPOP, MVT::v8i16, 13 },
+ { ISD::CTPOP, MVT::v16i8, 10 },
+ { ISD::CTTZ, MVT::v2i64, 14 },
+ { ISD::CTTZ, MVT::v4i32, 18 },
+ { ISD::CTTZ, MVT::v8i16, 16 },
+ { ISD::CTTZ, MVT::v16i8, 13 },
+ { ISD::FSQRT, MVT::f64, 32 }, // Nehalem from http://www.agner.org/
+ { ISD::FSQRT, MVT::v2f64, 32 }, // Nehalem from http://www.agner.org/
+ };
+ static const CostTblEntry SSE1CostTbl[] = {
+ { ISD::FSQRT, MVT::f32, 28 }, // Pentium III from http://www.agner.org/
+ { ISD::FSQRT, MVT::v4f32, 56 }, // Pentium III from http://www.agner.org/
};
unsigned ISD = ISD::DELETED_NODE;
@@ -1002,6 +1476,18 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
case Intrinsic::bswap:
ISD = ISD::BSWAP;
break;
+ case Intrinsic::ctlz:
+ ISD = ISD::CTLZ;
+ break;
+ case Intrinsic::ctpop:
+ ISD = ISD::CTPOP;
+ break;
+ case Intrinsic::cttz:
+ ISD = ISD::CTTZ;
+ break;
+ case Intrinsic::sqrt:
+ ISD = ISD::FSQRT;
+ break;
}
// Legalize the type.
@@ -1021,6 +1507,10 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
return LT.first * Entry->Cost;
+ if (ST->hasSSE42())
+ if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
+ return LT.first * Entry->Cost;
+
if (ST->hasSSSE3())
if (const auto *Entry = CostTableLookup(SSSE3CostTbl, ISD, MTy))
return LT.first * Entry->Cost;
@@ -1029,6 +1519,10 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
return LT.first * Entry->Cost;
+ if (ST->hasSSE1())
+ if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
+ return LT.first * Entry->Cost;
+
return BaseT::getIntrinsicInstrCost(IID, RetTy, Tys, FMF);
}
@@ -1177,17 +1671,29 @@ int X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy,
return Cost+LT.first;
}
-int X86TTIImpl::getAddressComputationCost(Type *Ty, bool IsComplex) {
+int X86TTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
+ const SCEV *Ptr) {
// Address computations in vectorized code with non-consecutive addresses will
// likely result in more instructions compared to scalar code where the
// computation can more often be merged into the index mode. The resulting
// extra micro-ops can significantly decrease throughput.
unsigned NumVectorInstToHideOverhead = 10;
- if (Ty->isVectorTy() && IsComplex)
- return NumVectorInstToHideOverhead;
+ // The cost of a strided access computation is hidden by the addressing
+ // modes of X86, regardless of the stride value. We don't believe there is
+ // a difference between a constant strided access in general and one whose
+ // constant stride is less than or equal to 64.
+ // Even in the case of a (loop-invariant) stride whose value is not known
+ // at compile time, the address computation will not incur more than one
+ // extra ADD instruction.
+ if (Ty->isVectorTy() && SE) {
+ if (!BaseT::isStridedAccess(Ptr))
+ return NumVectorInstToHideOverhead;
+ if (!BaseT::getConstantStrideStep(SE, Ptr))
+ return 1;
+ }
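+ // To summarize the heuristic above: a non-strided vector address
+ // computation is charged NumVectorInstToHideOverhead (10), a strided
+ // access whose stride is only known at run time is charged 1 extra ADD,
+ // and constant strides fall through to the target-independent cost below.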
- return BaseT::getAddressComputationCost(Ty, IsComplex);
+ return BaseT::getAddressComputationCost(Ty, SE, Ptr);
}
int X86TTIImpl::getReductionCost(unsigned Opcode, Type *ValTy,
@@ -1352,7 +1858,7 @@ int X86TTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
// immediates here as the normal path expects bit 31 to be sign extended.
if (Idx == 1 && Imm.getBitWidth() == 64 && isUInt<32>(Imm.getZExtValue()))
return TTI::TCC_Free;
- // Fallthrough
+ LLVM_FALLTHROUGH;
case Instruction::Add:
case Instruction::Sub:
case Instruction::Mul:
@@ -1556,13 +2062,14 @@ int X86TTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *SrcVTy,
// Vector-4 of gather/scatter instruction does not exist on KNL.
// We can extend it to 8 elements, but zeroing upper bits of
// the mask vector will add more instructions. Right now we give the scalar
- // cost of vector-4 for KNL. TODO: Check, maybe the gather/scatter instruction is
- // better in the VariableMask case.
+ // cost of vector-4 for KNL. TODO: Check, maybe the gather/scatter instruction
+ // is better in the VariableMask case.
if (VF == 2 || (VF == 4 && !ST->hasVLX()))
Scalarize = true;
if (Scalarize)
- return getGSScalarCost(Opcode, SrcVTy, VariableMask, Alignment, AddressSpace);
+ return getGSScalarCost(Opcode, SrcVTy, VariableMask, Alignment,
+ AddressSpace);
return getGSVectorCost(Opcode, SrcVTy, Ptr, Alignment, AddressSpace);
}
@@ -1572,8 +2079,8 @@ bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy) {
int DataWidth = isa<PointerType>(ScalarTy) ?
DL.getPointerSizeInBits() : ScalarTy->getPrimitiveSizeInBits();
- return (DataWidth >= 32 && ST->hasAVX()) ||
- (DataWidth >= 8 && ST->hasBWI());
+ return ((DataWidth == 32 || DataWidth == 64) && ST->hasAVX()) ||
+ ((DataWidth == 8 || DataWidth == 16) && ST->hasBWI());
}
bool X86TTIImpl::isLegalMaskedStore(Type *DataType) {
@@ -1598,7 +2105,7 @@ bool X86TTIImpl::isLegalMaskedGather(Type *DataTy) {
DL.getPointerSizeInBits() : ScalarTy->getPrimitiveSizeInBits();
// AVX-512 allows gather and scatter
- return DataWidth >= 32 && ST->hasAVX512();
+ return (DataWidth == 32 || DataWidth == 64) && ST->hasAVX512();
}
bool X86TTIImpl::isLegalMaskedScatter(Type *DataType) {
@@ -1620,3 +2127,122 @@ bool X86TTIImpl::areInlineCompatible(const Function *Caller,
// correct.
return (CallerBits & CalleeBits) == CalleeBits;
}
+
+bool X86TTIImpl::enableInterleavedAccessVectorization() {
+ // TODO: We expect this to be beneficial regardless of arch,
+ // but there are currently some unexplained performance artifacts on Atom.
+ // As a temporary solution, disable on Atom.
+ return !(ST->isAtom() || ST->isSLM());
+}
+
+// Get a cost estimate for interleaved load/store operations and strided loads.
+// \p Indices contains the indices for a strided load.
+// \p Factor - the interleaving factor.
+// AVX-512 provides 3-src shuffles that significantly reduce the cost.
+int X86TTIImpl::getInterleavedMemoryOpCostAVX512(unsigned Opcode, Type *VecTy,
+ unsigned Factor,
+ ArrayRef<unsigned> Indices,
+ unsigned Alignment,
+ unsigned AddressSpace) {
+
+ // VecTy for interleave memop is <VF*Factor x Elt>.
+ // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
+ // VecTy = <12 x i32>.
+
+ // Calculate the number of memory operations (NumOfMemOps) required to
+ // load/store the VecTy.
+ MVT LegalVT = getTLI()->getTypeLegalizationCost(DL, VecTy).second;
+ unsigned VecTySize = DL.getTypeStoreSize(VecTy);
+ unsigned LegalVTSize = LegalVT.getStoreSize();
+ unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize;
+
+ // Get the cost of one memory operation.
+ Type *SingleMemOpTy = VectorType::get(VecTy->getVectorElementType(),
+ LegalVT.getVectorNumElements());
+ unsigned MemOpCost =
+ getMemoryOpCost(Opcode, SingleMemOpTy, Alignment, AddressSpace);
+
+ if (Opcode == Instruction::Load) {
+ // The kind of shuffle depends on the number of loaded values.
+ // If we load the entire data in one register, we can use a 1-src shuffle.
+ // Otherwise, we'll merge 2 sources in each operation.
+ TTI::ShuffleKind ShuffleKind =
+ (NumOfMemOps > 1) ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc;
+
+ unsigned ShuffleCost =
+ getShuffleCost(ShuffleKind, SingleMemOpTy, 0, nullptr);
+
+ unsigned NumOfLoadsInInterleaveGrp =
+ Indices.size() ? Indices.size() : Factor;
+ Type *ResultTy = VectorType::get(VecTy->getVectorElementType(),
+ VecTy->getVectorNumElements() / Factor);
+ unsigned NumOfResults =
+ getTLI()->getTypeLegalizationCost(DL, ResultTy).first *
+ NumOfLoadsInInterleaveGrp;
+
+ // About half of the loads may be folded into shuffles when we have only
+ // one result. If we have more than one result, we do not fold loads at all.
+ unsigned NumOfUnfoldedLoads =
+ NumOfResults > 1 ? NumOfMemOps : NumOfMemOps / 2;
+
+ // Get a number of shuffle operations per result.
+ unsigned NumOfShufflesPerResult =
+ std::max((unsigned)1, (unsigned)(NumOfMemOps - 1));
+
+ // The SK_PermuteTwoSrc shuffle clobbers one of its source operands.
+ // When we have more than one destination, we need additional instructions
+ // to preserve the sources.
+ unsigned NumOfMoves = 0;
+ if (NumOfResults > 1 && ShuffleKind == TTI::SK_PermuteTwoSrc)
+ NumOfMoves = NumOfResults * NumOfShufflesPerResult / 2;
+
+ int Cost = NumOfResults * NumOfShufflesPerResult * ShuffleCost +
+ NumOfUnfoldedLoads * MemOpCost + NumOfMoves;
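+
+ // Worked example (illustrative): for the <12 x i32> group described above
+ // on AVX-512, LegalVT is v16i32, so NumOfMemOps = 1, NumOfResults = 3,
+ // NumOfShufflesPerResult = 1 and NumOfMoves = 0, giving
+ // Cost = 3 * ShuffleCost + MemOpCost.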
+
+ return Cost;
+ }
+
+ // Store.
+ assert(Opcode == Instruction::Store &&
+ "Expected Store Instruction at this point");
+
+ // There are no strided stores at the moment, and a store can't be folded
+ // into a shuffle.
+ unsigned NumOfSources = Factor; // The number of values to be merged.
+ unsigned ShuffleCost =
+ getShuffleCost(TTI::SK_PermuteTwoSrc, SingleMemOpTy, 0, nullptr);
+ unsigned NumOfShufflesPerStore = NumOfSources - 1;
+
+ // The SK_PermuteTwoSrc shuffle clobbers one of its source operands.
+ // We need additional instructions to preserve the sources.
+ unsigned NumOfMoves = NumOfMemOps * NumOfShufflesPerStore / 2;
+ int Cost = NumOfMemOps * (MemOpCost + NumOfShufflesPerStore * ShuffleCost) +
+ NumOfMoves;
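+ // Worked example (illustrative): storing the <12 x i32> group above with
+ // Factor = 3 gives NumOfSources = 3, NumOfShufflesPerStore = 2,
+ // NumOfMemOps = 1 and NumOfMoves = 1, i.e.
+ // Cost = MemOpCost + 2 * ShuffleCost + 1.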
+ return Cost;
+}
+
+int X86TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
+ unsigned Factor,
+ ArrayRef<unsigned> Indices,
+ unsigned Alignment,
+ unsigned AddressSpace) {
+ auto isSupportedOnAVX512 = [](Type *VecTy, bool &RequiresBW) {
+ RequiresBW = false;
+ Type *EltTy = VecTy->getVectorElementType();
+ if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(64) ||
+ EltTy->isIntegerTy(32) || EltTy->isPointerTy())
+ return true;
+ if (EltTy->isIntegerTy(16) || EltTy->isIntegerTy(8)) {
+ RequiresBW = true;
+ return true;
+ }
+ return false;
+ };
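+ // For example, an interleaved group of i16 or i8 elements only takes the
+ // AVX-512 path below when the subtarget also has BWI; otherwise it falls
+ // back to the target-independent cost model.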
+ bool RequiresBW;
+ bool HasAVX512Solution = isSupportedOnAVX512(VecTy, RequiresBW);
+ if (ST->hasAVX512() && HasAVX512Solution && (!RequiresBW || ST->hasBWI()))
+ return getInterleavedMemoryOpCostAVX512(Opcode, VecTy, Factor, Indices,
+ Alignment, AddressSpace);
+ return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
+ Alignment, AddressSpace);
+}
diff --git a/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.h b/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.h
index ab8046b..ecaaf95 100644
--- a/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -43,13 +43,6 @@ public:
: BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)),
TLI(ST->getTargetLowering()) {}
- // Provide value semantics. MSVC requires that we spell all of these out.
- X86TTIImpl(const X86TTIImpl &Arg)
- : BaseT(static_cast<const BaseT &>(Arg)), ST(Arg.ST), TLI(Arg.TLI) {}
- X86TTIImpl(X86TTIImpl &&Arg)
- : BaseT(std::move(static_cast<BaseT &>(Arg))), ST(std::move(Arg.ST)),
- TLI(std::move(Arg.TLI)) {}
-
/// \name Scalar TTI Implementations
/// @{
TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth);
@@ -67,7 +60,8 @@ public:
TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
- TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None);
+ TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None,
+ ArrayRef<const Value *> Args = ArrayRef<const Value *>());
int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp);
int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src);
int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy);
@@ -78,7 +72,8 @@ public:
unsigned AddressSpace);
int getGatherScatterOpCost(unsigned Opcode, Type *DataTy, Value *Ptr,
bool VariableMask, unsigned Alignment);
- int getAddressComputationCost(Type *PtrTy, bool IsComplex);
+ int getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE,
+ const SCEV *Ptr);
int getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
ArrayRef<Type *> Tys, FastMathFlags FMF);
@@ -87,6 +82,13 @@ public:
int getReductionCost(unsigned Opcode, Type *Ty, bool IsPairwiseForm);
+ int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
+ unsigned Factor, ArrayRef<unsigned> Indices,
+ unsigned Alignment, unsigned AddressSpace);
+ int getInterleavedMemoryOpCostAVX512(unsigned Opcode, Type *VecTy,
+ unsigned Factor, ArrayRef<unsigned> Indices,
+ unsigned Alignment, unsigned AddressSpace);
+
int getIntImmCost(int64_t);
int getIntImmCost(const APInt &Imm, Type *Ty);
@@ -100,6 +102,8 @@ public:
bool isLegalMaskedScatter(Type *DataType);
bool areInlineCompatible(const Function *Caller,
const Function *Callee) const;
+
+ bool enableInterleavedAccessVectorization();
private:
int getGSScalarCost(unsigned Opcode, Type *DataTy, bool VariableMask,
unsigned Alignment, unsigned AddressSpace);
diff --git a/contrib/llvm/lib/Target/X86/X86VZeroUpper.cpp b/contrib/llvm/lib/Target/X86/X86VZeroUpper.cpp
index 9320e1e..9766b84 100644
--- a/contrib/llvm/lib/Target/X86/X86VZeroUpper.cpp
+++ b/contrib/llvm/lib/Target/X86/X86VZeroUpper.cpp
@@ -40,9 +40,9 @@ namespace {
bool runOnMachineFunction(MachineFunction &MF) override;
MachineFunctionProperties getRequiredProperties() const override {
return MachineFunctionProperties().set(
- MachineFunctionProperties::Property::AllVRegsAllocated);
+ MachineFunctionProperties::Property::NoVRegs);
}
- const char *getPassName() const override {return "X86 vzeroupper inserter";}
+ StringRef getPassName() const override { return "X86 vzeroupper inserter"; }
private:
diff --git a/contrib/llvm/lib/Target/X86/X86WinAllocaExpander.cpp b/contrib/llvm/lib/Target/X86/X86WinAllocaExpander.cpp
index cc82074..fc08f15 100644
--- a/contrib/llvm/lib/Target/X86/X86WinAllocaExpander.cpp
+++ b/contrib/llvm/lib/Target/X86/X86WinAllocaExpander.cpp
@@ -63,7 +63,7 @@ private:
unsigned SlotSize;
int64_t StackProbeSize;
- const char *getPassName() const override { return "X86 WinAlloca Expander"; }
+ StringRef getPassName() const override { return "X86 WinAlloca Expander"; }
static char ID;
};
@@ -225,6 +225,7 @@ void X86WinAllocaExpander::lower(MachineInstr* MI, Lowering L) {
break;
// Fall through to make any remaining adjustment.
+ LLVM_FALLTHROUGH;
case Sub:
assert(Amount > 0);
if (Amount == SlotSize) {
diff --git a/contrib/llvm/lib/Target/X86/X86WinEHState.cpp b/contrib/llvm/lib/Target/X86/X86WinEHState.cpp
index 99387ed..bc14630 100644
--- a/contrib/llvm/lib/Target/X86/X86WinEHState.cpp
+++ b/contrib/llvm/lib/Target/X86/X86WinEHState.cpp
@@ -57,7 +57,7 @@ public:
void getAnalysisUsage(AnalysisUsage &AU) const override;
- const char *getPassName() const override {
+ StringRef getPassName() const override {
return "Windows 32-bit x86 EH state insertion";
}